Line 20:
local concat = table.concat
local dump = mw.dumpObject
local insert = table.insert
local ipairs = ipairs
Line 36 → Line 37:
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
local function encode_entities(...)
encode_entities = require(string_utilities_module).encode_entities
return encode_entities(...)
end
local function extend(...)
extend = require(table_module).extend
return extend(...)
end
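-- Editorial sketch (not part of the module; example arguments are made up): with the loader pattern
-- above, only the first call pays the cost of require(); e.g.:
--     encode_entities("x & y")  -- first call: loads the string-utilities module, then delegates
--     encode_entities("a < b")  -- later calls: go straight to the loaded function, no require() overhead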
Line 169 → Line 174:
-- If set to true, categories always appear, even in non-mainspace pages
local test_force_categories = false
local function text_in_script(text, script_code)
Line 218 → Line 222:
end
if type(list) ~= "table" then
error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, dump(list)))
end
return list[1]
Line 313 → Line 317:
local unique_head_parts = {}
local has_multiple_heads = not not data.heads[2]
for j, head in ipairs(data.heads) do
Line 389 → Line 393:
if transliteration_page and transliteration_page.exists then
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
saw_translit_page = true
end
Line 403:
if transliteration_page and transliteration_page.exists then
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
end
end
Line 425:
local function format_headword_genders(data)
local retval = ""
if data.genders and data.genders[1] then
if data.gloss then
retval = ","
Line 437:
end
local text, cats = format_genders(data.genders, data.lang, pos_for_cat)
if cats then
extend(data.categories, cats)
end
retval = retval .. " " .. text
Line 445:
end
-- Forward reference
local format_inflections
local function format_inflection_parts(data, parts)
for j, part in ipairs(parts) do
if type(part) ~= "table" then
Line 467:
-- right into the 'data' table to disable inflection links of the entire headword
-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin
local nolinkinfl = part.face == "hypothetical" or part.nolinkinfl or data.nolinkinfl
local formatted
Line 481:
-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
-- to be enabled in languages with more complex scripts (e.g. Arabic).
--
-- FIXME: With nested inflections, should we also respect `enable_auto_translit` at the top level of the
-- nested inflections structure?
local tr = part.tr or not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil
-- FIXME: Temporary errors added 2025-10-03. Remove after a month or so.
if part.translit then
error("Internal error: Use field `tr` not `translit` for specifying an inflection part translit")
end
if part.transcription then
error("Internal error: Use field `ts` not `transcription` for specifying an inflection part transcription")
end
local postprocess_annotations
if part.inflections then
postprocess_annotations = function(infldata)
insert(infldata.annotations, format_inflections(data, part.inflections))
end
end
formatted = full_link(
{
Line 497 → Line 511:
genders = part.genders,
tr = tr,
ts = part.ts,
accel = partaccel or parts.accel,
postprocess_annotations = postprocess_annotations,
},
face
Line 510 → Line 525:
local parts_output
if parts[1] then
parts_output = (parts.label and " " or "") .. concat(parts)
elseif parts.request then
Line 520 → Line 535:
local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
return format_term_with_qualifiers_and_refs(data.lang, parts, parts_label .. parts_output, 1)
end
-- Format the inflections following the headword or nested after a given inflection.
format_inflections = function(data, inflections)
if inflections and inflections[1] then
-- Format each inflection individually.
for key, infl in ipairs(inflections) do
inflections[key] = format_inflection_parts(data, infl)
end
return concat(inflections, ", ")
else
return ""
end
end
-- Format the top-level inflections following the headword. Currently this just adds parens around the
-- formatted comma-separated inflections in `data.inflections`.
local function format_top_level_inflections(data)
local result = format_inflections(data, data.inflections)
if result ~= "" then
return " (" .. result .. ")"
else
return result
end
end
--[==[
Line 567 → Line 589:
]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
local m_headword_data = m_data or get_data()
local isLemma = m_headword_data.lemmas
-- Is it a lemma category?
if isLemma[plpos] then
Line 577 → Line 600:
end
-- Is it a nonlemma category?
local isNonLemma = m_headword_data.nonlemmas
if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
return "non-lemma form"
Line 605 → Line 628:
error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
end
local m_headword_data = m_data or get_data()
if m_headword_data.pos_aliases[pos] then
pos = m_headword_data.pos_aliases[pos]
elseif pos:sub(-1) == "f" then
pos = pos:sub(1, -2)
pos = (m_headword_data.pos_aliases[pos] or pos) .. " forms"
end
return export.pluralize_pos(pos)
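-- Editorial note (the alias "n" -> "noun" is assumed for illustration): a POS ending in "f" is treated
-- as a non-lemma form, so e.g. "nf" would be expanded to "noun forms" here.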
Line 626 → Line 649:
local typ = type(data[element])
if typ ~= "table" then
error(("Internal error: In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
end
for k, v in pairs(data[element]) do
if k ~= "maxindex" then
if type(k) ~= "number" then
error(("Internal error: Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
end
if k > maxind then
Line 638 → Line 661:
if v then
if type(v) ~= "string" then
error(("Internal error: For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
end
if not allow_blank_string and v == "" then
error(("Internal error: For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
end
end
Line 670 → Line 693:
-- that.
if tbl == true then
if page.raw_defaultsort ~= sortkey then
insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
end
return
end
Line 679 → Line 705:
different = true
end
end
if redundant then
insert(lang_cats, lang:getFullName() .. " terms with redundant sortkeys")
end
if different then
insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
end
return sortkey
Line 684 → Line 716:
function export.maintenance_cats(page, lang, lang_cats, page_cats)
extend(page_cats, page.cats)
lang = lang:getFull() -- since we are just generating categories
local canonical = lang:getCanonicalName()
local tbl, sortkey = page.wikitext_topic_cat[lang:getCode()]
if tbl then
sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
insert(lang_cats, canonical .. " entries with topic categories using raw markup")
end
tbl = page.wikitext_langname_cat[canonical]
if tbl then
handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
insert(lang_cats, canonical .. " entries with language name categories using raw markup")
end
if get_current_L2() ~= canonical then
insert(lang_cats, canonical .. " entries with incorrect language header")
end
end
end
Line 707 → Line 748:
if data.getCanonicalName then
error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
end
if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
end
if data.id and type(data.id) ~= "string" then
error("Internal error: The id in the data table should be a string.")
end
Line 725 → Line 766:
local full_langname = data.lang:getFullName()
local raw_pagename = data.pagename
local page
local m_headword_data = m_data or get_data()
if raw_pagename and raw_pagename ~= m_headword_data.pagename then -- for testing, doc pages, etc.
-- data.pagename is often set on documentation and test pages through the pagename= parameter of various
-- templates, to emulate running on that page. Having a large number of such test templates on a single
-- page often leads to timeouts, because we fetch and parse the contents of each page in turn. However,
-- we don't really need to do that and can function fine without fetching and parsing the contents of a
-- given page, so turn off content fetching/parsing (and also setting the DEFAULTSORT key through a parser
-- function, which is *slooooow*) in certain namespaces where test and documentation templates are likely to
-- be found and where actual content does not live (User, Template, Module).
local actual_namespace = m_headword_data.page.namespace
local no_fetch_content = actual_namespace == "User" or actual_namespace == "Template" or
actual_namespace == "Module"
page = process_page(raw_pagename, no_fetch_content)
else
page = m_headword_data.page
end
local namespace = page.namespace
------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
Line 749 → Line 792:
-- new-style
if data.translits or data.transcriptions then
error("Internal error: In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
end
else
Line 782 → Line 825:
init_and_find_maximum_index(data, "whole_page_categories")
local pos_category_already_present = false
if data.categories[1] then
local escaped_langname = pattern_escape(full_langname)
local matches_lang_pattern = "^" .. escaped_langname .. " "
Line 796 → Line 839:
if not data.pos_category then
error("Internal error: `data.pos_category` not specified and could not be inferred from the categories given in "
.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
Line 815 → Line 858:
-- add an appropriate category.
local postype = export.pos_lemma_or_nonlemma(data.pos_category)
if not data.noposcat then
insert(data.categories, 1, full_langname .. " " .. postype .. "s")
end
Line 827 → Line 868:
------------ 5. Create a default headword, and add links to multiword page names. ------------
-- Determine if this is an "anti-asterisk" term, i.e. an attested term in a language that must normally be
-- reconstructed.
local is_anti_asterisk = data.heads[1].term and data.heads[1].term:find("^!!")
local lang_reconstructed = data.lang:hasType("reconstructed")
if is_anti_asterisk then
if not lang_reconstructed then
error("Anti-asterisk feature (head= beginning with !!) can only be used with reconstructed languages")
end
lang_reconstructed = false
end
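-- Editorial note (inferred from this block and the head handling further down): an attested term in a
-- language that is normally reconstructed can be supplied with a head= value starting with "!!" (or just
-- head=!!, which prepends "!!" to the default head); lang_reconstructed is then cleared so the term is
-- treated as attested rather than as a reconstruction.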
-- Determine if term is reconstructed
local is_reconstructed = namespace == "Reconstruction" or lang_reconstructed
-- Create a default headword based on the pagename, which is determined in
Line 837 → Line 890:
-- Add links to multi-word page names when appropriate
if not (is_reconstructed or data.nolinkhead) then
local no_links = m_headword_data.no_multiword_links
if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then
default_head = export.add_multiword_links(default_head, true)
Line 847 → Line 900:
end
------------ 6. Check the namespace against the language type. ------------
if namespace == "" then
if lang_reconstructed then
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
elseif data.lang:hasType("appendix-constructed") then
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
end
elseif namespace == "Citations" or namespace == "Thesaurus" then
error("Headword templates should not be used in the " .. namespace .. ": namespace.")
end
------------ 7. Fill in missing values in `data.heads`. ------------
-- True if any script among the headword scripts has spaces in it.
Line 856 → Line 921:
for _, head in ipairs(data.heads) do
------ 7a. If missing head, replace with default head.
if not head.term then
head.term = default_head
elseif head.term == default_head then
has_redundant_head_param = true
elseif is_anti_asterisk and head.term == "!!" then
-- If explicit head=!! is given, it's an anti-asterisk term and we fill in the default head.
head.term = "!!" .. default_head
elseif head.term:find("^[!?]$") then
-- If explicit head= just consists of ! or ?, add it to the end of the default head.
head.term = default_head .. head.term
end
head.term_no_initial_bang_bang = is_anti_asterisk and head.term:sub(3) or head.term
if is_reconstructed then
Line 876 → Line 945:
end
------ 7b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------ otherwise fall back to the overall script if given. If neither given, autodetect the script.
Line 904 → Line 973:
any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()
------ 7c. Create automatic transliterations for any non-Latin headwords without manual translit given
------ (provided automatic translit is available, e.g. not in Persian or Hebrew).
Line 915 → Line 984:
head.tr = nil
else
local notranslit = m_headword_data.notranslit
if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then
head.tr_manual = not not head.tr
local text = head.term_no_initial_bang_bang
if not data.lang:link_tr(head.sc) then
text = remove_links(text)
Line 929 → Line 998:
if automated_tr or head.tr_fail then
local manual_tr = head.tr
if manual_tr then
if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then
insert(data.categories, full_langname .. " terms with redundant transliterations")
elseif not head.tr_fail then
insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
end
end
if not manual_tr then
head.tr = automated_tr
extend(data.categories, tr_categories)
end
end
Line 963 → Line 1,038:
end
------------ 8. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------
-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
Line 979 → Line 1,054:
if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
elseif page_non_ascii and m_headword_data.toBeTagged[dt_script_code]
or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
Line 1,018 → Line 1,093:
end
------------ 9. Insert additional categories. ------------
if has_redundant_head_param then
if not data.no_redundant_head_cat then
insert(data.categories, full_langname .. " terms with redundant head parameter")
end
end
-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then
local no_multiword_cat = m_headword_data.no_multiword_cat
if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
-- Use the pagename, not the head= value, because the latter may have extra
-- junk in it, e.g. superscripted text that throws off the algorithm.
local no_hyphen = m_headword_data.hyphen_not_multiword_sep
-- Exclude hyphens if the data module states that they should be excluded for this language.
local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."
local is_multiword = umatch(page.pagename, checkpattern)
if is_multiword and not non_categorizable(page.full_raw_pagename) then
insert(data.categories, full_langname .. " multiword terms")
elseif not is_multiword then
local long_word_threshold = m_headword_data.long_word_thresholds[langcode]
if long_word_threshold and ulen(page.pagename) >= long_word_threshold then
insert(data.categories, "Long " .. full_langname .. " words")
end
end
end
Line 1,219 → Line 1,307:
insert(data.categories, full_langname .. " terms spelled with " .. character)
end
end
end
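-- Editorial note (describing the loop below; "zzz" is a made-up example): for alphabetic scripts the
-- pattern "(%a)%1%1" (i = 2) matches three identical letters in a row, "(%a)%1%1%1" matches four, and so
-- on, so a pagename like "zzz" would be categorized under "... terms with 3 consecutive instances of the
-- same letter".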
if data.heads[1].sc:isSystem("alphabet") then
local pagename, i = page.pagename:ulower(), 2
while umatch(pagename, "(%a)" .. ("%1"):rep(i)) do
i = i + 1
insert(data.categories, full_langname .. " terms with " .. i .. " consecutive instances of the same letter")
end
end
Line 1,229 → Line 1,325:
insert(data.categories, full_langname .. " palindromes")
end
-- Add to various maintenance categories.
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)
------------ 10. Format and return headwords, genders, inflections and categories. ------------
-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
Line 1,254 → Line 1,336:
format_headword(data) ..
format_headword_genders(data) ..
format_top_level_inflections(data) .. '</span>'
-- Language-specific categories.