Module:headword: Difference between revisions

No edit summary
No edit summary
No edit summary
Line 20: Line 20:


local concat = table.concat
local concat = table.concat
local dump = mw.dumpObject
local insert = table.insert
local insert = table.insert
local ipairs = ipairs
local ipairs = ipairs
Line 36: Line 37:
--[==[
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
local function encode_entities(...)
local function encode_entities(...)
encode_entities = require(string_utilities_module).encode_entities
encode_entities = require(string_utilities_module).encode_entities
return encode_entities(...)
return encode_entities(...)
end
local function extend(...)
extend = require(table_module).extend
return extend(...)
end
end


Line 169: Line 174:
-- If set to true, categories always appear, even in non-mainspace pages
-- If set to true, categories always appear, even in non-mainspace pages
local test_force_categories = false
local test_force_categories = false


local function text_in_script(text, script_code)
local function text_in_script(text, script_code)
Line 218: Line 222:
end
end
if type(list) ~= "table" then
if type(list) ~= "table" then
error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, mw.dumpObject(list)))
error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, dump(list)))
end
end
return list[1]
return list[1]
Line 313: Line 317:
local unique_head_parts = {}
local unique_head_parts = {}


local has_multiple_heads = #data.heads > 1
local has_multiple_heads = not not data.heads[2]


for j, head in ipairs(data.heads) do
for j, head in ipairs(data.heads) do
Line 389: Line 393:


if transliteration_page and transliteration_page.exists then
if transliteration_page and transliteration_page.exists then
if data.lang:hasType("conlang") then
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
else
translits_formatted = " [[Wiktionary:" .. langname .. " transliteration|•]]" .. translits_formatted
end
saw_translit_page = true
saw_translit_page = true
end
end
Line 403: Line 403:


if transliteration_page and transliteration_page.exists then
if transliteration_page and transliteration_page.exists then
translits_formatted = " [[Wiktionary:" .. langname .. " transliteration|•]]" .. translits_formatted
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
end
end
end
end
Line 425: Line 425:
local function format_headword_genders(data)
local function format_headword_genders(data)
local retval = ""
local retval = ""
if data.genders and #data.genders > 0 then
if data.genders and data.genders[1] then
if data.gloss then
if data.gloss then
retval = ","
retval = ","
Line 437: Line 437:
end
end
local text, cats = format_genders(data.genders, data.lang, pos_for_cat)
local text, cats = format_genders(data.genders, data.lang, pos_for_cat)
for _, cat in ipairs(cats) do
if cats then
insert(data.categories, cat)
extend(data.categories, cats)
end
end
retval = retval .. " " .. text
retval = retval .. " " .. text
Line 445: Line 445:
end
end


-- Forward reference
local format_inflections


local function format_inflection_parts(data, parts)
local function format_inflection_parts(data, parts)
local any_part_translit = false
for j, part in ipairs(parts) do
for j, part in ipairs(parts) do
if type(part) ~= "table" then
if type(part) ~= "table" then
Line 467: Line 467:
-- right into the 'data' table to disable inflection links of the entire headword
-- right into the 'data' table to disable inflection links of the entire headword
-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin
-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin
local nolinkinfl = part.face == "hypothetical" or part.nolinkinfl or data.nolinkinfl
local nolinkinfl = part.face == "hypothetical" or part.nolinkinfl or data.nolinkinfl


local formatted
local formatted
Line 481: Line 481:
-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
-- to be enabled in languages with more complex scripts (e.g. Arabic).
-- to be enabled in languages with more complex scripts (e.g. Arabic).
local tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil)
--
if tr ~= "-" then
-- FIXME: With nested inflections, should we also respect `enable_auto_translit` at the top level of the
any_part_translit = true
-- nested inflections structure?
local tr = part.tr or not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil
-- FIXME: Temporary errors added 2025-10-03. Remove after a month or so.
if part.translit then
error("Internal error: Use field `tr` not `translit` for specifying an inflection part translit")
end
if part.transcription then
error("Internal error: Use field `ts` not `transcription` for specifying an inflection part transcription")
end
local postprocess_annotations
if part.inflections then
postprocess_annotations = function(infldata)
insert(infldata.annotations, format_inflections(data, part.inflections))
end
end
end
formatted = full_link(
formatted = full_link(
{
{
Line 497: Line 511:
genders = part.genders,
genders = part.genders,
tr = tr,
tr = tr,
ts = part.transcription,
ts = part.ts,
accel = partaccel or parts.accel,
accel = partaccel or parts.accel,
postprocess_annotations = postprocess_annotations,
},
},
face
face
Line 510: Line 525:
local parts_output
local parts_output


if #parts > 0 then
if parts[1] then
parts_output = (parts.label and " " or "") .. concat(parts)
parts_output = (parts.label and " " or "") .. concat(parts)
elseif parts.request then
elseif parts.request then
Line 520: Line 535:


local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
return parts_label .. parts_output, any_part_translit
return format_term_with_qualifiers_and_refs(data.lang, parts, parts_label .. parts_output, 1)
end
end




-- Format the inflections following the headword.
-- Format the inflections following the headword or nested after a given inflection.
local function format_inflections(data)
format_inflections = function(data, inflections)
local any_part_translit = false
if inflections and inflections[1] then
if data.inflections and #data.inflections > 0 then
-- Format each inflection individually.
-- Format each inflection individually.
for key, infl in ipairs(data.inflections) do
for key, infl in ipairs(inflections) do
local this_any_part_translit
inflections[key] = format_inflection_parts(data, infl)
data.inflections[key], this_any_part_translit = format_inflection_parts(data, infl)
if this_any_part_translit then
any_part_translit = true
end
end
end


local concat_result = concat(data.inflections, ", ")
return concat(inflections, ", ")
return " (" .. concat_result .. ")"
else
else
return ""
return ""
end
end
end
end
-- Format the top-level inflections following the headword. Currently this just adds parens around the
-- formatted comma-separated inflections in `data.inflections`.
local function format_top_level_inflections(data)
local result = format_inflections(data, data.inflections)
if result ~= "" then
return " (" .. result .. ")"
else
return result
end
end


--[==[
--[==[
Line 567: Line 589:
]==]
]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
function export.pos_lemma_or_nonlemma(plpos, best_guess)
local isLemma = (m_data or get_data()).lemmas
local m_headword_data = m_data or get_data()
local isLemma = m_headword_data.lemmas
-- Is it a lemma category?
-- Is it a lemma category?
if isLemma[plpos] then
if isLemma[plpos] then
Line 577: Line 600:
end
end
-- Is it a nonlemma category?
-- Is it a nonlemma category?
local isNonLemma = (m_data or get_data()).nonlemmas
local isNonLemma = m_headword_data.nonlemmas
if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
return "non-lemma form"
return "non-lemma form"
Line 605: Line 628:
error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
end
end
local data = m_data or get_data()
local m_headword_data = m_data or get_data()
if data.pos_aliases[pos] then
if m_headword_data.pos_aliases[pos] then
pos = data.pos_aliases[pos]
pos = m_headword_data.pos_aliases[pos]
elseif pos:sub(-1) == "f" then
elseif pos:sub(-1) == "f" then
pos = pos:sub(1, -2)
pos = pos:sub(1, -2)
pos = (data.pos_aliases[pos] or pos) .. " forms"
pos = (m_headword_data.pos_aliases[pos] or pos) .. " forms"
end
end
return export.pluralize_pos(pos)
return export.pluralize_pos(pos)
Line 626: Line 649:
local typ = type(data[element])
local typ = type(data[element])
if typ ~= "table" then
if typ ~= "table" then
error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
error(("Internal error: In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
end
end
for k, v in pairs(data[element]) do
for k, v in pairs(data[element]) do
if k ~= "maxindex" then
if k ~= "maxindex" then
if type(k) ~= "number" then
if type(k) ~= "number" then
error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
error(("Internal error: Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
end
end
if k > maxind then
if k > maxind then
Line 638: Line 661:
if v then
if v then
if type(v) ~= "string" then
if type(v) ~= "string" then
error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
error(("Internal error: For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
end
end
if not allow_blank_string and v == "" then
if not allow_blank_string and v == "" then
error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
error(("Internal error: For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
end
end
end
end
Line 670: Line 693:
-- that.
-- that.
if tbl == true then
if tbl == true then
if page.raw_defaultsort ~= sortkey then
insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
end
return
return
end
end
Line 679: Line 705:
different = true
different = true
end
end
end
if redundant then
insert(lang_cats, lang:getFullName() .. " terms with redundant sortkeys")
end
if different then
insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
end
end
return sortkey
return sortkey
Line 684: Line 716:


function export.maintenance_cats(page, lang, lang_cats, page_cats)
function export.maintenance_cats(page, lang, lang_cats, page_cats)
for _, cat in ipairs(page.cats) do
extend(page_cats, page.cats)
insert(page_cats, cat)
end
lang = lang:getFull() -- since we are just generating categories
lang = lang:getFull() -- since we are just generating categories
local canonical = lang:getCanonicalName()
local canonical = lang:getCanonicalName()
local tbl, sortkey = page.wikitext_topic_cat[lang:getCode()]
local tbl, sortkey = page.wikitext_topic_cat[lang:getCode()]
if tbl then
sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
insert(lang_cats, canonical .. " entries with topic categories using raw markup")
end
tbl = page.wikitext_langname_cat[canonical]
tbl = page.wikitext_langname_cat[canonical]
if tbl then
handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
insert(lang_cats, canonical .. " entries with language name categories using raw markup")
end
if get_current_L2() ~= canonical then
insert(lang_cats, canonical .. " entries with incorrect language header")
end
end
end
end
end
Line 707: Line 748:


if data.getCanonicalName then
if data.getCanonicalName then
error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
end
end


if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
end
end


if data.id and type(data.id) ~= "string" then
if data.id and type(data.id) ~= "string" then
error("The id in the data table should be a string.")
error("Internal error: The id in the data table should be a string.")
end
end


Line 725: Line 766:
local full_langname = data.lang:getFullName()
local full_langname = data.lang:getFullName()


local raw_pagename, page = data.pagename
local raw_pagename = data.pagename
if raw_pagename and raw_pagename ~= (m_data or get_data()).pagename then -- for testing, doc pages, etc.
local page
page = process_page(raw_pagename)
local m_headword_data = m_data or get_data()
if raw_pagename and raw_pagename ~= m_headword_data.pagename then -- for testing, doc pages, etc.
-- data.pagename is often set on documentation and test pages through the pagename= parameter of various
-- templates, to emulate running on that page. Having a large number of such test templates on a single
-- page often leads to timeouts, because we fetch and parse the contents of each page in turn. However,
-- we don't really need to do that and can function fine without fetching and parsing the contents of a
-- given page, so turn off content fetching/parsing (and also setting the DEFAULTSORT key through a parser
-- function, which is *slooooow*) in certain namespaces where test and documentation templates are likely to
-- be found and where actual content does not live (User, Template, Module).
local actual_namespace = m_headword_data.page.namespace
local no_fetch_content = actual_namespace == "User" or actual_namespace == "Template" or
actual_namespace == "Module"
page = process_page(raw_pagename, no_fetch_content)
else
else
page = (m_data or get_data()).page
page = m_headword_data.page
end
end


-- Check the namespace against the language type.
local namespace = page.namespace
local namespace = page.namespace
if namespace == "" then
if data.lang:hasType("reconstructed") then
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
elseif data.lang:hasType("appendix-constructed") then
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
end
elseif namespace == "Citations" or namespace == "Thesaurus" then
error("Headword templates should not be used in the " .. namespace .. ": namespace.")
end


------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
Line 749: Line 792:
-- new-style
-- new-style
if data.translits or data.transcriptions then
if data.translits or data.transcriptions then
error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
error("Internal error: In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
end
end
else
else
Line 782: Line 825:
init_and_find_maximum_index(data, "whole_page_categories")
init_and_find_maximum_index(data, "whole_page_categories")
local pos_category_already_present = false
local pos_category_already_present = false
if #data.categories > 0 then
if data.categories[1] then
local escaped_langname = pattern_escape(full_langname)
local escaped_langname = pattern_escape(full_langname)
local matches_lang_pattern = "^" .. escaped_langname .. " "
local matches_lang_pattern = "^" .. escaped_langname .. " "
Line 796: Line 839:


if not data.pos_category then
if not data.pos_category then
error("`data.pos_category` not specified and could not be inferred from the categories given in "
error("Internal error: `data.pos_category` not specified and could not be inferred from the categories given in "
.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
Line 815: Line 858:
-- add an appropriate category.
-- add an appropriate category.
local postype = export.pos_lemma_or_nonlemma(data.pos_category)
local postype = export.pos_lemma_or_nonlemma(data.pos_category)
if not postype then
if not data.noposcat then
elseif not data.noposcat then
if postype == "lemma" then postype = data.lang:getMainCategoryName() end
insert(data.categories, 1, full_langname .. " " .. postype .. "s")
insert(data.categories, 1, full_langname .. " " .. postype .. "s")
end
end
Line 827: Line 868:


------------ 5. Create a default headword, and add links to multiword page names. ------------
------------ 5. Create a default headword, and add links to multiword page names. ------------
-- Determine if this is an "anti-asterisk" term, i.e. an attested term in a language that must normally be
-- reconstructed.
local is_anti_asterisk = data.heads[1].term and data.heads[1].term:find("^!!")
local lang_reconstructed = data.lang:hasType("reconstructed")
if is_anti_asterisk then
if not lang_reconstructed then
error("Anti-asterisk feature (head= beginning with !!) can only be used with reconstructed languages")
end
lang_reconstructed = false
end


-- Determine if term is reconstructed
-- Determine if term is reconstructed
local is_reconstructed = namespace == "Reconstruction" or data.lang:hasType("reconstructed")
local is_reconstructed = namespace == "Reconstruction" or lang_reconstructed


-- Create a default headword based on the pagename, which is determined in
-- Create a default headword based on the pagename, which is determined in
Line 837: Line 890:
-- Add links to multi-word page names when appropriate
-- Add links to multi-word page names when appropriate
if not (is_reconstructed or data.nolinkhead) then
if not (is_reconstructed or data.nolinkhead) then
local no_links = (m_data or get_data()).no_multiword_links
local no_links = m_headword_data.no_multiword_links
if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then
if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then
default_head = export.add_multiword_links(default_head, true)
default_head = export.add_multiword_links(default_head, true)
Line 847: Line 900:
end
end


------------ 6. Fill in missing values in `data.heads`. ------------
------------ 6. Check the namespace against the language type. ------------
 
if namespace == "" then
if lang_reconstructed then
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
elseif data.lang:hasType("appendix-constructed") then
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
end
elseif namespace == "Citations" or namespace == "Thesaurus" then
error("Headword templates should not be used in the " .. namespace .. ": namespace.")
end
 
------------ 7. Fill in missing values in `data.heads`. ------------


-- True if any script among the headword scripts has spaces in it.
-- True if any script among the headword scripts has spaces in it.
Line 856: Line 921:
for _, head in ipairs(data.heads) do
for _, head in ipairs(data.heads) do


------ 6a. If missing head, replace with default head.
------ 7a. If missing head, replace with default head.
if not head.term then
if not head.term then
head.term = default_head
head.term = default_head
elseif head.term == default_head then
elseif head.term == default_head then
has_redundant_head_param = true
has_redundant_head_param = true
elseif is_anti_asterisk and head.term == "!!" then
-- If explicit head=!! is given, it's an anti-asterisk term and we fill in the default head.
head.term = "!!" .. default_head
elseif head.term:find("^[!?]$") then
elseif head.term:find("^[!?]$") then
-- If explicit head= just consists of ! or ?, add it to the end of the default head.
-- If explicit head= just consists of ! or ?, add it to the end of the default head.
head.term = default_head .. head.term
head.term = default_head .. head.term
end
end
head.term_no_initial_bang_bang = is_anti_asterisk and head.term:sub(3) or head.term


if is_reconstructed then
if is_reconstructed then
Line 876: Line 945:
end
end


------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------ 7b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------    otherwise fall back to the overall script if given. If neither given, autodetect the script.
------    otherwise fall back to the overall script if given. If neither given, autodetect the script.


Line 904: Line 973:
any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()
any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()


------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given
------ 7c. Create automatic transliterations for any non-Latin headwords without manual translit given
------    (provided automatic translit is available, e.g. not in Persian or Hebrew).
------    (provided automatic translit is available, e.g. not in Persian or Hebrew).


Line 915: Line 984:
head.tr = nil
head.tr = nil
else
else
local notranslit = (m_data or get_data()).notranslit
local notranslit = m_headword_data.notranslit
if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then
if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then
head.tr_manual = not not head.tr
head.tr_manual = not not head.tr


local text = head.term
local text = head.term_no_initial_bang_bang
if not data.lang:link_tr(head.sc) then
if not data.lang:link_tr(head.sc) then
text = remove_links(text)
text = remove_links(text)
Line 929: Line 998:
if automated_tr or head.tr_fail then
if automated_tr or head.tr_fail then
local manual_tr = head.tr
local manual_tr = head.tr
if manual_tr then
if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then
insert(data.categories, full_langname .. " terms with redundant transliterations")
elseif not head.tr_fail then
insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
end
end


if not manual_tr then
if not manual_tr then
head.tr = automated_tr
head.tr = automated_tr
for _, category in ipairs(tr_categories) do
extend(data.categories, tr_categories)
insert(data.categories, category)
end
end
end
end
end
Line 963: Line 1,038:
end
end


------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------
------------ 8. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------


-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
Line 979: Line 1,054:
if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
elseif page_non_ascii and (m_data or get_data()).toBeTagged[dt_script_code]
elseif page_non_ascii and m_headword_data.toBeTagged[dt_script_code]
or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
Line 1,018: Line 1,093:
end
end


------------ 8. Insert additional categories. ------------
------------ 9. Insert additional categories. ------------
 
if has_redundant_head_param then
if not data.no_redundant_head_cat then
insert(data.categories, full_langname .. " terms with redundant head parameter")
end
end


-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then
local no_multiword_cat = (m_data or get_data()).no_multiword_cat
local no_multiword_cat = m_headword_data.no_multiword_cat
if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then
if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
-- Use the pagename, not the head= value, because the latter may have extra
-- Use the pagename, not the head= value, because the latter may have extra
-- junk in it, e.g. superscripted text that throws off the algorithm.
-- junk in it, e.g. superscripted text that throws off the algorithm.
local no_hyphen = (m_data or get_data()).hyphen_not_multiword_sep
local no_hyphen = m_headword_data.hyphen_not_multiword_sep
-- Exclude hyphens if the data module states that they should for this language.
-- Exclude hyphens if the data module states that they should for this language.
local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."
local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."
if umatch(page.pagename, checkpattern) and not non_categorizable(page.full_raw_pagename) then
local is_multiword = umatch(page.pagename, checkpattern)
 
if is_multiword and not non_categorizable(page.full_raw_pagename) then
insert(data.categories, full_langname .. " multiword terms")
insert(data.categories, full_langname .. " multiword terms")
elseif not is_multiword then
local long_word_threshold = m_headword_data.long_word_thresholds[langcode]
if long_word_threshold and ulen(page.pagename) >= long_word_threshold then
insert(data.categories, "Long " .. full_langname .. " words")
end
end
end
end
end
Line 1,219: Line 1,307:
insert(data.categories, full_langname .. " terms spelled with " .. character)
insert(data.categories, full_langname .. " terms spelled with " .. character)
end
end
end
end
if data.heads[1].sc:isSystem("alphabet") then
local pagename, i = page.pagename:ulower(), 2
while umatch(pagename, "(%a)" .. ("%1"):rep(i)) do
i = i + 1
insert(data.categories, full_langname .. " terms with " .. i .. " consecutive instances of the same letter")
end
end
end
end
Line 1,229: Line 1,325:
insert(data.categories, full_langname .. " palindromes")
insert(data.categories, full_langname .. " palindromes")
end
end
 
if data.affix then
for _, aff in ipairs(data.affix) do
if mw.ustring.match(aff, "^%-[^-]*%-$") then
table.insert(data.categories, data.lang:getCanonicalName() .. " words interfixed with " .. aff)
elseif mw.ustring.match(aff, "%-%s%-") then
table.insert(data.categories, data.lang:getCanonicalName() .. " words circumfixed with " .. aff)
elseif mw.ustring.match(aff, "%-$") then
table.insert(data.categories, data.lang:getCanonicalName() .. " words prefixed with " .. aff)
elseif mw.ustring.match(aff, "^%-") then
table.insert(data.categories, data.lang:getCanonicalName() .. " words suffixed with " .. aff)
end
end
end
-- Add to various maintenance categories.
-- Add to various maintenance categories.
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)


------------ 9. Format and return headwords, genders, inflections and categories. ------------
------------ 10. Format and return headwords, genders, inflections and categories. ------------


-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
Line 1,254: Line 1,336:
format_headword(data) ..
format_headword(data) ..
format_headword_genders(data) ..
format_headword_genders(data) ..
format_inflections(data) .. '</span>'
format_top_level_inflections(data) .. '</span>'


-- Language-specific categories.
-- Language-specific categories.