Module:headword
local export = {}
-- Named constants for all modules used, to make it easier to swap out sandbox versions.
local en_utilities_module = "Module:en-utilities"
local gender_and_number_module = "Module:getn"
local headword_data_module = "Module:headword/data"
local headword_page_module = "Module:headword/page"
local links_module = "Module:links"
local load_module = "Module:load"
local pages_module = "Module:pages"
local palindromes_module = "Module:palindromes"
local pron_qualifier_module = "Module:pron qualifier"
local scripts_module = "Module:scripts"
local scripts_data_module = "Module:scripts/data"
local script_utilities_module = "Module:script utilities"
local script_utilities_data_module = "Module:script utilities/data"
local string_utilities_module = "Module:string utilities"
local table_module = "Module:table"
local utilities_module = "Module:utilities"

local concat = table.concat
local dump = mw.dumpObject
local insert = table.insert
local ipairs = ipairs
local max = math.max
local new_title = mw.title.new
local pairs = pairs
local require = require
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local type = type
local ufind = mw.ustring.find
local ugmatch = mw.ustring.gmatch
local ugsub = mw.ustring.gsub
local umatch = mw.ustring.match
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
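-- For instance, the first call to encode_entities() below loads [[Module:string utilities]]
-- and rebinds `encode_entities` to the target function, so all subsequent calls go
-- directly to that function with no further require() overhead.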
local function encode_entities(...)
	encode_entities = require(string_utilities_module).encode_entities
	return encode_entities(...)
end

local function extend(...)
	extend = require(table_module).extend
	return extend(...)
end

local function find_best_script_without_lang(...)
	find_best_script_without_lang = require(scripts_module).findBestScriptWithoutLang
	return find_best_script_without_lang(...)
end

local function format_categories(...)
	format_categories = require(utilities_module).format_categories
	return format_categories(...)
end

local function format_genders(...)
	format_genders = require(gender_and_number_module).format_genders
	return format_genders(...)
end

local function format_pron_qualifiers(...)
	format_pron_qualifiers = require(pron_qualifier_module).format_qualifiers
	return format_pron_qualifiers(...)
end

local function full_link(...)
	full_link = require(links_module).full_link
	return full_link(...)
end

local function get_current_L2(...)
	get_current_L2 = require(pages_module).get_current_L2
	return get_current_L2(...)
end

local function get_link_page(...)
	get_link_page = require(links_module).get_link_page
	return get_link_page(...)
end

local function get_script(...)
	get_script = require(scripts_module).getByCode
	return get_script(...)
end

local function is_palindrome(...)
	is_palindrome = require(palindromes_module).is_palindrome
	return is_palindrome(...)
end

local function language_link(...)
	language_link = require(links_module).language_link
	return language_link(...)
end

local function load_data(...)
	load_data = require(load_module).load_data
	return load_data(...)
end

local function pattern_escape(...)
	pattern_escape = require(string_utilities_module).pattern_escape
	return pattern_escape(...)
end

local function pluralize(...)
	pluralize = require(en_utilities_module).pluralize
	return pluralize(...)
end

local function process_page(...)
	process_page = require(headword_page_module).process_page
	return process_page(...)
end

local function remove_links(...)
	remove_links = require(links_module).remove_links
	return remove_links(...)
end
local function shallow_copy(...)
	shallow_copy = require(table_module).shallowCopy
	return shallow_copy(...)
end

local function tag_text(...)
	tag_text = require(script_utilities_module).tag_text
	return tag_text(...)
end

local function tag_transcription(...)
	tag_transcription = require(script_utilities_module).tag_transcription
	return tag_transcription(...)
end

local function tag_translit(...)
	tag_translit = require(script_utilities_module).tag_translit
	return tag_translit(...)
end

local function trim(...)
	trim = require(string_utilities_module).trim
	return trim(...)
end

local function ulen(...)
	ulen = require(string_utilities_module).len
	return ulen(...)
end
--[==[
Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]
local m_data
local function get_data()
	m_data = load_data(headword_data_module)
	return m_data
end

local script_data
local function get_script_data()
	script_data = load_data(scripts_data_module)
	return script_data
end

local script_utilities_data
local function get_script_utilities_data()
	script_utilities_data = load_data(script_utilities_data_module)
	return script_utilities_data
end
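-- Usage pattern for these loaders (see e.g. full_headword() below): an expression like
-- `(m_data or get_data()).pagename` loads [[Module:headword/data]] only the first time
-- it is evaluated; after that, the cached `m_data` upvalue is used directly.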
-- If set to true, categories always appear, even in non-mainspace pages
local test_force_categories = false

local function text_in_script(text, script_code)
	local sc = get_script(script_code)
	if not sc then
		error("Internal error: Bad script code " .. script_code)
	-- ...
	local out
	if characters then
		text = ugsub(text, "%W", "")
		out = ufind(text, "[" .. characters .. "]")
	end
	-- ...
--[[ List of punctuation or spacing characters that are found inside of words.
	Used to exclude characters from the regex above. ]]
local wordPunc = "-#%%&@־׳״'.·*’་•:᠊"
local notWordPunc = "[^" .. wordPunc .. "]+"
-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, labels, references
-- or customized separator: `part` is the object specifying the term (and `lang` the language of the term), which should
-- optionally contain:
-- * left qualifiers in `q`, an array of strings;
-- * right qualifiers in `qq`, an array of strings;
-- * left labels in `l`, an array of strings;
-- * right labels in `ll`, an array of strings;
-- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text`
--   (formatted reference text) and optionally `name` and/or `group`;
-- * a separator in `separator`, defaulting to " <i>or</i> " if this is not the first term (j > 1), otherwise "".
-- `formatted` is the formatted version of the term itself, and `j` is the index of the term.
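-- An illustrative `part` object (all values hypothetical):
--   { term = "foo", q = {"obsolete"}, qq = {"rare"},
--     refs = {{text = "<formatted ref>", name = "r1", group = "g"}}, separator = " / " }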
local function format_term_with_qualifiers_and_refs(lang, part, formatted, j)
	local function part_non_empty(field)
		local list = part[field]
		if not list then
			return nil
		end
		if type(list) ~= "table" then
			error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, dump(list)))
		end
		return list[1]
	end

	if part_non_empty("q") or part_non_empty("qq") or part_non_empty("l") or
		part_non_empty("ll") or part_non_empty("refs") then
		formatted = format_pron_qualifiers {
			lang = lang,
			text = formatted,
			q = part.q,
			qq = part.qq,
			l = part.l,
			ll = part.ll,
			refs = part.refs,
		}
	end

	local separator = part.separator or j > 1 and " <i>or</i> " -- use "" to request no separator
	if separator then
		formatted = separator .. formatted
	-- ...
--[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==]
function export.head_is_multiword(head)
	for possibleWordBreak in ugmatch(head, spacingPunctuation) do
		if umatch(possibleWordBreak, notWordPunc) then
			return true
		end
		-- ...
end
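-- For example, "kick the bucket" counts as multiword (the spaces match notWordPunc),
-- whereas "mother-in-law" does not, since "-" is listed in wordPunc above.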
do
	local function workaround_to_exclude_chars(s)
		return (ugsub(s, notWordPunc, "\2%1\1"))
	end

	--[==[Add links to a multiword head.]==]
	function export.add_multiword_links(head, default)
		head = "\1" .. ugsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
		if default then
			head = head
				:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
				:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
		end
		-- Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
		head = encode_entities(head, "[]", true, true)
		--[=[
		use this when workaround is no longer needed:
		head = "[[" .. ugsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"

		Remove any empty links, which could have been created above
		at the beginning or end of the string.
		]=]
		return (head
			:gsub("\1\2", "")
			:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
	end
end
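-- Illustrative call (assuming spaces are the only word breaks in the input):
--   export.add_multiword_links("kick the bucket") --> "[[kick]] [[the]] [[bucket]]"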
local function non_categorizable(full_raw_pagename)
	return full_raw_pagename:find("^Appendix:Gestures/") or
		-- Unsupported titles with descriptive names.
		(full_raw_pagename:find("^Unsupported titles/") and not full_raw_pagename:find("`"))
end

local function tag_text_and_add_quals_and_refs(data, head, formatted, j)
	-- Add language and script wrapper.
	formatted = tag_text(formatted, data.lang, head.sc, "head", nil, j == 1 and data.id or nil)

	-- Add qualifiers, labels, references and separator.
	return format_term_with_qualifiers_and_refs(data.lang, head, formatted, j)
end
-- Format a headword with transliterations.
local function format_headword(data)
	-- Are there non-empty transliterations?
	local has_translits = false
	-- ...
	local unique_head_parts = {}
	local has_multiple_heads = not not data.heads[2]

	for j, head in ipairs(data.heads) do
		-- ...
		-- Apply processing to the headword, for formatting links and such.
		if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Image" then
			formatted = language_link{term = head.term, lang = data.lang}
		else
			formatted = data.lang:makeDisplayText(head.term, head.sc, true)
		end
		local head_part = tag_text_and_add_quals_and_refs(data, head, formatted, j)
		insert(head_parts, head_part)
		-- ...
			unique_head_part = head_part
		else
			unique_head_part = tag_text_and_add_quals_and_refs(data, head, formatted, 1)
		end
		unique_head_parts[unique_head_part] = true
		-- ...
		local this_parts = {}
		if head.tr then
			insert(this_parts, tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))
			if head.ts then
				insert(this_parts, " ")
			-- ...
		end
		if head.ts then
			insert(this_parts, "/" .. tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")
		end
		insert(translit_parts, concat(this_parts))
		-- ...
	local langname = data.lang:getCanonicalName()
	local transliteration_page = new_title(langname .. " transliteration")
	local saw_translit_page = false

	if transliteration_page and transliteration_page.exists then
		translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
		saw_translit_page = true
	end
	-- ...
	if not saw_translit_page and data.lang:hasType("etymology-only") then
		langname = data.lang:getFullName()
		transliteration_page = new_title(langname .. " transliteration")
		if transliteration_page and transliteration_page.exists then
			translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
		end
	end
	-- ...
local function format_headword_genders(data)
	local retval = ""
	if data.genders and data.genders[1] then
		if data.gloss then
			retval = ","
		end
		local pos_for_cat
		if not data.nogendercat then
			local no_gender_cat = (m_data or get_data()).no_gender_cat
			if not (no_gender_cat[data.lang:getCode()] or no_gender_cat[data.lang:getFullCode()]) then
				pos_for_cat = (m_data or get_data()).pos_for_gender_number_cat[data.pos_category:gsub("^reconstructed ", "")]
			end
		end
		local text, cats = format_genders(data.genders, data.lang, pos_for_cat)
		if cats then
			extend(data.categories, cats)
		end
		retval = retval .. " " .. text
		-- ...
end
-- Forward reference
local format_inflections

local function format_inflection_parts(data, parts)
	for j, part in ipairs(parts) do
		if type(part) ~= "table" then
			-- ...
			if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then
				error("The face `" .. face .. "` " .. (
					(script_utilities_data or get_script_utilities_data()).faces[face] and
					"should not be used for non-headword terms on the headword line." or
					"is invalid."
				-- ...
			-- right into the 'data' table to disable inflection links of the entire headword
			-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin
			local nolinkinfl = part.face == "hypothetical" or part.nolinkinfl or data.nolinkinfl

			local formatted
			-- ...
			-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
			-- to be enabled in languages with more complex scripts (e.g. Arabic).
			--
			-- FIXME: With nested inflections, should we also respect `enable_auto_translit` at the top level of the
			-- nested inflections structure?
			local tr = part.tr or not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil
			-- FIXME: Temporary errors added 2025-10-03. Remove after a month or so.
			if part.translit then
				error("Internal error: Use field `tr` not `translit` for specifying an inflection part translit")
			end
			if part.transcription then
				error("Internal error: Use field `ts` not `transcription` for specifying an inflection part transcription")
			end
			local postprocess_annotations
			if part.inflections then
				postprocess_annotations = function(infldata)
					insert(infldata.annotations, format_inflections(data, part.inflections))
				end
			end

			formatted = full_link(
				{
					term = not nolinkinfl and part.term or nil,
					-- ...
					lang = part.lang or data.lang,
					sc = part.sc or parts.sc or nil,
					gloss = part.gloss,
					pos = part.pos,
					lit = part.lit,
					id = part.id,
					genders = part.genders,
					tr = tr,
					ts = part.ts,
					accel = partaccel or parts.accel,
					postprocess_annotations = postprocess_annotations,
				},
				face
				-- ...
		end

		parts[j] = format_term_with_qualifiers_and_refs(part.lang or data.lang, part,
			formatted, j)
	end

	local parts_output
	if parts[1] then
		parts_output = (parts.label and " " or "") .. concat(parts)
	elseif parts.request then
		-- ...
	local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
	return format_term_with_qualifiers_and_refs(data.lang, parts, parts_label .. parts_output, 1)
end
-- Format the inflections following the headword or nested after a given inflection.
format_inflections = function(data, inflections)
	if inflections and inflections[1] then
		-- Format each inflection individually.
		for key, infl in ipairs(inflections) do
			inflections[key] = format_inflection_parts(data, infl)
		end
		return concat(inflections, ", ")
	else
		return ""
	end
end
-- Format the top-level inflections following the headword. Currently this just adds parens around the
-- formatted comma-separated inflections in `data.inflections`.
local function format_top_level_inflections(data)
	local result = format_inflections(data, data.inflections)
	if result ~= "" then
		return " (" .. result .. ")"
	else
		return result
	end
end
--[==[
Returns the plural form of `pos`, a raw part of speech input, which could be singular or
plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to
"kanji").
]==]
function export.pluralize_pos(pos)
	-- Make the plural form of the part of speech
	return (m_data or get_data()).irregular_plurals[pos] or
		pos:sub(-1) == "s" and pos or
		pluralize(pos)
end
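-- e.g. export.pluralize_pos("noun") --> "nouns", but export.pluralize_pos("kanji") -->
-- "kanji", assuming "kanji" is listed in the data module's irregular_plurals table.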
--[==[
Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil
if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
If you have a POS in its singular form, call {export.pluralize_pos()} above to pluralize it
in a smart fashion that knows when to add "-s" and when to add "-es", and also takes
into account any irregular plurals.
If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
based on whether it ends in " forms"; otherwise, return nil.
]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
	local m_headword_data = m_data or get_data()
	local isLemma = m_headword_data.lemmas
	-- Is it a lemma category?
	if isLemma[plpos] then
		-- ...
	end
	-- Is it a nonlemma category?
	local isNonLemma = m_headword_data.nonlemmas
	if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
		return "non-lemma form"
	-- ...
end
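-- e.g. export.pos_lemma_or_nonlemma("nouns") --> "lemma" and
-- export.pos_lemma_or_nonlemma("noun forms") --> "non-lemma form" (assuming both are
-- listed in the data module); with `best_guess`, an unlisted POS is classified by
-- whether it ends in " forms".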
--[==[
Canonicalize a part of speech as specified in 2= in {{tl|head}}. This checks for POS aliases and non-lemma form
aliases ending in 'f', and then pluralizes if the POS term does not have an invariable plural.
]==]
function export.canonicalize_pos(pos)
	-- FIXME: Temporary code to throw an error for alias 'pre' (= preposition) that will go away.
	if pos == "pre" then
		-- Don't throw error on 'pref' as it's an alias for "prefix".
		error("POS 'pre' for 'preposition' no longer allowed as it's too ambiguous; use 'prep'")
	end
	-- Likewise for pro = pronoun.
	if pos == "pro" or pos == "prof" then
		error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
	end
	local m_headword_data = m_data or get_data()
	if m_headword_data.pos_aliases[pos] then
		pos = m_headword_data.pos_aliases[pos]
	elseif pos:sub(-1) == "f" then
		pos = pos:sub(1, -2)
		pos = (m_headword_data.pos_aliases[pos] or pos) .. " forms"
	end
	return export.pluralize_pos(pos)
end
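-- e.g. assuming "adj" is listed as an alias of "adjective" in the data module,
-- export.canonicalize_pos("adj") --> "adjectives" and
-- export.canonicalize_pos("adjf") --> "adjective forms".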
-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a
-- ...
	local typ = type(data[element])
	if typ ~= "table" then
		error(("Internal error: In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
	end
	for k, v in pairs(data[element]) do
		if k ~= "maxindex" then
			if type(k) ~= "number" then
				error(("Internal error: Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
			end
			if k > maxind then
				-- ...
			if v then
				if type(v) ~= "string" then
					error(("Internal error: For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
				end
				if not allow_blank_string and v == "" then
					error(("Internal error: For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
				end
			end
			-- ...
	-- that.
	if tbl == true then
		return
	end
	-- ...
			different = true
		end
	end
	return sortkey
end
function export.maintenance_cats(page, lang, lang_cats, page_cats)
	extend(page_cats, page.cats)
	lang = lang:getFull() -- since we are just generating categories
	local canonical = lang:getCanonicalName()
	-- ...
	if tbl then
		sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
	end
	tbl = page.wikitext_langname_cat[canonical]
	if tbl then
		handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
	end
end
-- ...
]==]
function export.full_headword(data)
	-- Prevent data from being destructively modified.
	local data = shallow_copy(data)

	------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------
	if data.getCanonicalName then
		error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
	end
	if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
		error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
	end
	if data.id and type(data.id) ~= "string" then
		error("Internal error: The id in the data table should be a string.")
	end
	-- ...
	local langcode = data.lang:getCode()
	local full_langcode = data.lang:getFullCode()
	local langname = data.lang:getCanonicalName()
	local full_langname = data.lang:getFullName()

	local raw_pagename = data.pagename
	local page
	local m_headword_data = m_data or get_data()
	if raw_pagename and raw_pagename ~= m_headword_data.pagename then -- for testing, doc pages, etc.
		-- data.pagename is often set on documentation and test pages through the pagename= parameter of various
		-- templates, to emulate running on that page. Having a large number of such test templates on a single
		-- page often leads to timeouts, because we fetch and parse the contents of each page in turn. However,
		-- we don't really need to do that and can function fine without fetching and parsing the contents of a
		-- given page, so turn off content fetching/parsing (and also setting the DEFAULTSORT key through a parser
		-- function, which is *slooooow*) in certain namespaces where test and documentation templates are likely to
		-- be found and where actual content does not live (User, Template, Module).
		local actual_namespace = m_headword_data.page.namespace
		local no_fetch_content = actual_namespace == "User" or actual_namespace == "Template" or
			actual_namespace == "Module"
		page = process_page(raw_pagename, no_fetch_content)
	else
		page = m_headword_data.page
	end
	local namespace = page.namespace

	------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
	-- ...
		-- new-style
		if data.translits or data.transcriptions then
			error("Internal error: In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
		end
	else
		-- convert old-style `heads`, `translits` and `transcriptions` to new-style
		local maxind = max(
			init_and_find_maximum_index(data, "heads"),
			init_and_find_maximum_index(data, "translits", true),
			init_and_find_maximum_index(data, "transcriptions", true)
		-- ...
	------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------

	-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]
	if data.altform then
		data.noposcat = true
	end

	init_and_find_maximum_index(data, "categories")
	init_and_find_maximum_index(data, "whole_page_categories")

	local pos_category_already_present = false
	if data.categories[1] then
		local escaped_langname = pattern_escape(full_langname)
		local matches_lang_pattern = "^" .. escaped_langname .. " "
		-- ...
	if not data.pos_category then
		error("Internal error: `data.pos_category` not specified and could not be inferred from the categories given in "
			.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
			.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
			-- ...
	local postype = export.pos_lemma_or_nonlemma(data.pos_category)
	if not data.noposcat then
		if postype == "lemma" then
			postype = data.lang:getMainCategoryName()
		end
		insert(data.categories, 1, full_langname .. " " .. postype .. "s")
	end
	insert(data.categories, 1, "Contionary")

	-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]
	if data.altform then
		insert(data.categories, 1, full_langname .. " alternative forms")
	end
	------------ 5. Create a default headword, and add links to multiword page names. ------------

	-- Determine if this is an "anti-asterisk" term, i.e. an attested term in a language that must normally be
	-- reconstructed.
	local is_anti_asterisk = data.heads[1].term and data.heads[1].term:find("^!!")
	local lang_reconstructed = data.lang:hasType("reconstructed")
	if is_anti_asterisk then
		if not lang_reconstructed then
			error("Anti-asterisk feature (head= beginning with !!) can only be used with reconstructed languages")
		end
		lang_reconstructed = false
	end

	-- Determine if term is reconstructed
	local is_reconstructed = namespace == "Reconstruction" or lang_reconstructed

	-- Create a default headword based on the pagename, which is determined in
	-- ...

	-- Add links to multi-word page names when appropriate
	if not (is_reconstructed or data.nolinkhead) then
		local no_links = m_headword_data.no_multiword_links
		if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then
			default_head = export.add_multiword_links(default_head, true)
		end
	end
	-- ...
	end

	------------ 6. Check the namespace against the language type. ------------

	if namespace == "" then
		if lang_reconstructed then
			error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
		elseif data.lang:hasType("appendix-constructed") then
			error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
		end
	elseif namespace == "Citations" or namespace == "Thesaurus" then
		error("Headword templates should not be used in the " .. namespace .. ": namespace.")
	end

	------------ 7. Fill in missing values in `data.heads`. ------------
	-- True if any script among the headword scripts has spaces in it.
	-- ...
	for _, head in ipairs(data.heads) do

		------ 7a. If missing head, replace with default head.
		if not head.term then
			head.term = default_head
		elseif head.term == default_head then
			has_redundant_head_param = true
		elseif is_anti_asterisk and head.term == "!!" then
			-- If explicit head=!! is given, it's an anti-asterisk term and we fill in the default head.
			head.term = "!!" .. default_head
		elseif head.term:find("^[!?]$") then
			-- If explicit head= just consists of ! or ?, add it to the end of the default head.
			head.term = default_head .. head.term
		end
		head.term_no_initial_bang_bang = is_anti_asterisk and head.term:sub(3) or head.term

		if is_reconstructed then
			local head_term = head.term
			if head_term:find("%[%[") then
				head_term = remove_links(head_term)
			end
			if head_term:sub(1, 1) ~= "*" then
				-- ...
		end
		------ 7b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
		------ otherwise fall back to the overall script if given. If neither given, autodetect the script.
		local auto_sc = data.lang:findBestScript(head.term)
		if (
			auto_sc:getCode() == "None" and
			find_best_script_without_lang(head.term):getCode() ~= "None"
		) then
			insert(data.categories, full_langname .. " terms in nonstandard scripts")
			-- ...
			if not head.sc then -- Overall script code given.
				head.sc = data.sc
			end
		end
		-- ...
		any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()

		------ 7c. Create automatic transliterations for any non-Latin headwords without manual translit given
		------ (provided automatic translit is available, e.g. not in Persian or Hebrew).

		-- Make transliterations
		head.tr_manual = nil

		-- Try to generate a transliteration if necessary
		if head.tr == "-" then
			head.tr = nil
		else
			local notranslit = m_headword_data.notranslit
			if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then
				head.tr_manual = not not head.tr

				local text = head.term_no_initial_bang_bang
				if not data.lang:link_tr(head.sc) then
					text = remove_links(text)
				end

				local automated_tr, tr_categories
				automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)

				if automated_tr or head.tr_fail then
					local manual_tr = head.tr
					if not manual_tr then
						head.tr = automated_tr
						extend(data.categories, tr_categories)
					end
				end

				-- There is still no transliteration?
				-- Add the entry to a cleanup category.
				if not head.tr then
					head.tr = "<small>transliteration needed</small>"
					-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.
					-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].
					insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")
				else
					-- Otherwise, trim it.
					head.tr = trim(head.tr)
				end
			end
		end
		-- ...
		-- Link to the transliteration entry for languages that require this.
		if head.tr and data.lang:link_tr(head.sc) then
			head.tr = full_link{
				term = head.tr,
				lang = data.lang,
				sc = get_script("Latn"),
				tr = "-"
			}
		-- ...
	end
	------------ 8. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------

	-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
	-- ...
	local dt_script = data.heads[1].sc
	local dt_script_code = dt_script:getCode()
	local page_non_ascii = namespace == "" and not page.pagename:find("^[%z\1-\127]+$")
	local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^Unsupported titles/", "")
	if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
		display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
	elseif page_non_ascii and m_headword_data.toBeTagged[dt_script_code]
		or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
		or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
		-- ...
	elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then
		display_title = '<span class="Hani">' .. page.full_raw_pagename .. '</span>'
	elseif namespace == "Reconstruction" then
		local matched
		display_title, matched = ugsub(
			page.full_raw_pagename,
			"^(Reconstruction:[^/]+/)(.+)$",
			function(before, term)
				return before .. tag_text(term, data.lang, dt_script)
			end
		)
		-- ...
			display_title = nil
		end
	end
	-- FIXME: Generalize this.
	-- If the current language uses ur-Arab (for Urdu, etc.), ku-Arab (Central Kurdish) or pa-Arab
	-- (Shahmukhi, for Punjabi) and there's more than one language on the page, don't set the display title
	-- because these three scripts display in Nastaliq and we don't want this for terms that also exist in other
	-- languages that don't display in Nastaliq (e.g. Arabic or Persian) to display in Nastaliq. Because the word
	-- "Urdu" occurs near the end of the alphabet, Urdu fonts tend to override the fonts of other languages.
	-- FIXME: This is checking for more than one language on the page but instead needs to check if there are any
	-- languages using scripts other than the ones just mentioned.
	if (dt_script_code == "ur-Arab" or dt_script_code == "ku-Arab" or dt_script_code == "pa-Arab") and page.L2_list.n > 1 then
		display_title = nil
	end
	-- ...
	end
	------------ 9. Insert additional categories. ------------

	-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
	if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then
		local no_multiword_cat = m_headword_data.no_multiword_cat
		if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then
			-- Check for spaces or hyphens, but exclude prefixes and suffixes.
			-- Use the pagename, not the head= value, because the latter may have extra
			-- junk in it, e.g. superscripted text that throws off the algorithm.
			local no_hyphen = m_headword_data.hyphen_not_multiword_sep
			-- Exclude hyphens if the data module states that they should for this language.
			local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."
			local is_multiword = umatch(page.pagename, checkpattern)
			if is_multiword and not non_categorizable(page.full_raw_pagename) then
				insert(data.categories, full_langname .. " multiword terms")
			elseif not is_multiword then
				local long_word_threshold = m_headword_data.long_word_thresholds[langcode]
				if long_word_threshold and ulen(page.pagename) >= long_word_threshold then
					insert(data.categories, "Long " .. full_langname .. " words")
				end
			end
		end
	end
	-- ...
	-- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated.
	if namespace ~= "Reconstruction" then
		-- Map from languages to a string containing the characters to ignore when considering whether a term has
		-- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic
		-- ...
		local ch_to_ignore = characters_to_ignore[full_langcode]
		if ch_to_ignore then
			canon_pagename = ugsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")
		end

		while true do
			if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then
				-- ...
			-- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]]
			num_loops = num_loops + 1
			local pagename_script = find_best_script_without_lang(canon_pagename, "None only as last resort")
			local script_chars = pagename_script.characters
			if not script_chars then
				-- ...
			local script_code = pagename_script:getCode()
			local replaced
			canon_pagename, replaced = ugsub(canon_pagename, "[" .. script_chars .. "]", "")
			if (
				replaced and
				script_code ~= "Zmth" and
				(script_data or get_script_data())[script_code] and
				script_data[script_code].character_category ~= false
			) then
				script_code = script_code:gsub("^.-%-", "")
				if not seen_scripts[script_code] then
					-- ...
		end
	end

	-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats: single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character + diacritic(s) + character). Each can have any number of diacritics.
	local standard = data.lang:getStandardCharacters()
	-- ...
			return ""
		end
		local sc_standard = ugsub(sc_standard, page.comb_chars.combined_double, explode)
		sc_standard = ugsub(sc_standard, page.comb_chars.combined_single, explode)
			:gsub(".[\128-\191]*", explode)
		local num_cat_inserted
		-- ...
				num_cat_inserted = true
			end
		elseif ufind(char, page.emoji_pattern) then
			insert(data.categories, full_langname .. " terms spelled with emoji")
		else
			-- ...
		-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.
		sc_standard = toNFD(sc_standard)
		for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do
			if not umatch(sc_standard, diacritic) then
				insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic)
			end
		end
		for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do
			if not umatch(sc_standard, diacritic) then
				insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic .. "◌")
			end
			-- ...
	-- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them).
	elseif ulen(page.pagename) ~= 1 then
		for character in ugmatch(page.pagename, "([^" .. standard .. "])") do
			local upper = char_category(character)
			if not umatch(upper, "[" .. standard .. "]") then
				character = upper
			end
			-- ...
		end
	end

	if data.heads[1].sc:isSystem("alphabet") then
		local pagename, i = page.pagename:ulower(), 2
		while umatch(pagename, "(%a)" .. ("%1"):rep(i)) do
			i = i + 1
			insert(data.categories, full_langname .. " terms with " .. i .. " consecutive instances of the same letter")
		end
	end
	-- Categorise for palindromes
	if not data.nopalindromecat and namespace ~= "Reconstruction" and ulen(page.pagename) > 2
		-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of
		-- multiple scripts?
		and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
		insert(data.categories, full_langname .. " palindromes")
	end
	-- ...
	export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)
	------------ 10. Format and return headwords, genders, inflections and categories. ------------

	-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
	-- ...
	local text = '<span class="headword-line">' ..
		format_headword(data) ..
		format_headword_genders(data) ..
		format_top_level_inflections(data) .. '</span>'

	-- Language-specific categories.
	-- ...
return remove_links(...)
end
local function shallow_copy(...)
shallow_copy = require(table_module).shallowCopy
return shallow_copy(...)
end
local function tag_text(...)
tag_text = require(script_utilities_module).tag_text
return tag_text(...)
end
local function tag_transcription(...)
tag_transcription = require(script_utilities_module).tag_transcription
return tag_transcription(...)
end
local function tag_translit(...)
tag_translit = require(script_utilities_module).tag_translit
return tag_translit(...)
end
local function trim(...)
trim = require(string_utilities_module).trim
return trim(...)
end
local function ulen(...)
ulen = require(string_utilities_module).len
return ulen(...)
end
--[==[
Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]
local m_data
local function get_data()
m_data = load_data(headword_data_module)
return m_data
end
local script_data
local function get_script_data()
script_data = load_data(scripts_data_module)
return script_data
end
local script_utilities_data
local function get_script_utilities_data()
script_utilities_data = load_data(script_utilities_data_module)
return script_utilities_data
end
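-- Call sites then use the idiom `(m_data or get_data())` (see e.g. format_headword_genders()
-- below): the first evaluation loads the data module and sets `m_data`; subsequent evaluations
-- short-circuit on the already-populated upvalue.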
-- If set to true, categories always appear, even in non-mainspace pages
local test_force_categories = false
local function text_in_script(text, script_code)
local sc = get_script(script_code)
if not sc then
error("Internal error: Bad script code " .. script_code)
end
local characters = sc.characters
local out
if characters then
text = ugsub(text, "%W", "")
out = ufind(text, "[" .. characters .. "]")
end
if out then
return true
else
return false
end
end
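-- Usage sketch for text_in_script() (illustrative only; "Hira" is the hiragana script code,
-- also used for Japanese further below):
--   text_in_script("かな", "Hira") --> true
--   text_in_script("abc", "Hira") --> false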
local spacingPunctuation = "[%s%p]+"
--[[ Punctuation or spacing characters that are found inside words.
Used to exclude characters matched by the `spacingPunctuation` regex above. ]]
local wordPunc = "-#%%&@־׳״'.·*’་•:᠊"
local notWordPunc = "[^" .. wordPunc .. "]+"
-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, labels, references
-- or customized separator: `part` is the object specifying the term (and `lang` the language of the term), which should
-- optionally contain:
-- * left qualifiers in `q`, an array of strings;
-- * right qualifiers in `qq`, an array of strings;
-- * left labels in `l`, an array of strings;
-- * right labels in `ll`, an array of strings;
-- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text`
-- (formatted reference text) and optionally `name` and/or `group`;
-- * a separator in `separator`, defaulting to " <i>or</i> " if this is not the first term (j > 1), otherwise "".
-- `formatted` is the formatted version of the term itself, and `j` is the index of the term.
local function format_term_with_qualifiers_and_refs(lang, part, formatted, j)
local function part_non_empty(field)
local list = part[field]
if not list then
return nil
end
if type(list) ~= "table" then
error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, dump(list)))
end
return list[1]
end
if part_non_empty("q") or part_non_empty("qq") or part_non_empty("l") or
part_non_empty("ll") or part_non_empty("refs") then
formatted = format_pron_qualifiers {
lang = lang,
text = formatted,
q = part.q,
qq = part.qq,
l = part.l,
ll = part.ll,
refs = part.refs,
}
end
local separator = part.separator or j > 1 and " <i>or</i> " -- use "" to request no separator
if separator then
formatted = separator .. formatted
end
return formatted
end
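--[==[
Illustrative `part` object for format_term_with_qualifiers_and_refs() (all values hypothetical):
	{
		q = {"informal"}, -- left qualifiers
		qq = {"rare"}, -- right qualifiers
		refs = {{text = "formatted reference text", name = "ref1"}},
		separator = " <i>or</i> ",
	}
If any of `q`, `qq`, `l`, `ll` or `refs` is non-empty, the formatted term is run through
[[Module:pron qualifier]]; the separator (if any) is then prepended.]==]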
--[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==]
function export.head_is_multiword(head)
for possibleWordBreak in ugmatch(head, spacingPunctuation) do
if umatch(possibleWordBreak, notWordPunc) then
return true
end
end
return false
end
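-- Usage sketch (hypothetical heads):
--   export.head_is_multiword("give up") --> true (the space is not word-internal punctuation)
--   export.head_is_multiword("-ness") --> false (the hyphen is listed in `wordPunc`)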
do
local function workaround_to_exclude_chars(s)
return (ugsub(s, notWordPunc, "\2%1\1"))
end
--[==[Add links to a multiword head.]==]
function export.add_multiword_links(head, default)
head = "\1" .. ugsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
if default then
head = head
:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
end
-- Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
head = encode_entities(head, "[]", true, true)
--[=[ Use this when the workaround is no longer needed:
head = "[[" .. ugsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"
]=]
-- Remove any empty links, which could have been created above
-- at the beginning or end of the string.
return (head
:gsub("\1\2", "")
:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
end
end
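-- Usage sketch (hypothetical head):
--   export.add_multiword_links("give up") --> "[[give]] [[up]]"
-- When `default` is set, any ':' or '#' inside the generated links is additionally
-- backslash-escaped, so a default head derived from the pagename still links correctly.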
local function non_categorizable(full_raw_pagename)
return full_raw_pagename:find("^Appendix:Gestures/") or
-- Unsupported titles with descriptive names.
(full_raw_pagename:find("^Unsupported titles/") and not full_raw_pagename:find("`"))
end
local function tag_text_and_add_quals_and_refs(data, head, formatted, j)
-- Add language and script wrapper.
formatted = tag_text(formatted, data.lang, head.sc, "head", nil, j == 1 and data.id or nil)
-- Add qualifiers, labels, references and separator.
return format_term_with_qualifiers_and_refs(data.lang, head, formatted, j)
end
-- Format a headword with transliterations.
local function format_headword(data)
-- Are there non-empty transliterations?
local has_translits = false
local has_manual_translits = false
------ Format the headwords. ------
local head_parts = {}
local unique_head_parts = {}
local has_multiple_heads = not not data.heads[2]
for j, head in ipairs(data.heads) do
if head.tr or head.ts then
has_translits = true
end
if head.tr and head.tr_manual or head.ts then
has_manual_translits = true
end
local formatted
-- Apply processing to the headword, for formatting links and such.
if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Image" then
formatted = language_link{term = head.term, lang = data.lang}
else
formatted = data.lang:makeDisplayText(head.term, head.sc, true)
end
local head_part = tag_text_and_add_quals_and_refs(data, head, formatted, j)
insert(head_parts, head_part)
-- If multiple heads, try to determine whether all heads display the same. To do this we need to effectively
-- rerun the text tagging and addition of qualifiers and references, using 1 for all indices.
if has_multiple_heads then
local unique_head_part
if j == 1 then
unique_head_part = head_part
else
unique_head_part = tag_text_and_add_quals_and_refs(data, head, formatted, 1)
end
unique_head_parts[unique_head_part] = true
end
end
local set_size = 0
if has_multiple_heads then
for _ in pairs(unique_head_parts) do
set_size = set_size + 1
end
end
if set_size == 1 then
head_parts = head_parts[1]
else
head_parts = concat(head_parts)
end
------ Format the transliterations and transcriptions. ------
local translits_formatted
if has_translits then
local translit_parts = {}
for _, head in ipairs(data.heads) do
if head.tr or head.ts then
local this_parts = {}
if head.tr then
insert(this_parts, tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))
if head.ts then
insert(this_parts, " ")
end
end
if head.ts then
insert(this_parts, "/" .. tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")
end
insert(translit_parts, concat(this_parts))
end
end
translits_formatted = " (" .. concat(translit_parts, " <i>or</i> ") .. ")"
local langname = data.lang:getCanonicalName()
local transliteration_page = new_title(langname .. " transliteration")
local saw_translit_page = false
if transliteration_page and transliteration_page.exists then
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
saw_translit_page = true
end
-- If data.lang is an etymology-only language and we didn't find a transliteration page for it, fall back
-- to the full parent language.
if not saw_translit_page and data.lang:hasType("etymology-only") then
langname = data.lang:getFullName()
transliteration_page = new_title(langname .. " transliteration")
if transliteration_page and transliteration_page.exists then
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
end
end
else
translits_formatted = ""
end
------ Paste heads and transliterations/transcriptions. ------
local lemma_gloss
if data.gloss then
lemma_gloss = ' <span class="ib-content qualifier-content">' .. data.gloss .. '</span>'
else
lemma_gloss = ""
end
return head_parts .. translits_formatted .. lemma_gloss
end
local function format_headword_genders(data)
local retval = ""
if data.genders and data.genders[1] then
if data.gloss then
retval = ","
end
local pos_for_cat
if not data.nogendercat then
local no_gender_cat = (m_data or get_data()).no_gender_cat
if not (no_gender_cat[data.lang:getCode()] or no_gender_cat[data.lang:getFullCode()]) then
pos_for_cat = (m_data or get_data()).pos_for_gender_number_cat[data.pos_category:gsub("^reconstructed ", "")]
end
end
local text, cats = format_genders(data.genders, data.lang, pos_for_cat)
if cats then
extend(data.categories, cats)
end
retval = retval .. " " .. text
end
return retval
end
-- Forward reference
local format_inflections
local function format_inflection_parts(data, parts)
for j, part in ipairs(parts) do
if type(part) ~= "table" then
part = {term = part}
end
local partaccel = part.accel
local face = part.face or "bold"
if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then
error("The face `" .. face .. "` " .. (
(script_utilities_data or get_script_utilities_data()).faces[face] and
"should not be used for non-headword terms on the headword line." or
"is invalid."
))
end
-- The final `or data.nolinkinfl` allows `nolinkinfl=true` to be set directly in the
-- `data` table, disabling inflection links for the entire headword when the inflected
-- forms aren't entry-worthy, e.g. in Vulgar Latin.
local nolinkinfl = part.face == "hypothetical" or part.nolinkinfl or data.nolinkinfl
local formatted
if part.label then
-- FIXME: There should be a better way of italicizing a label. As is, this isn't customizable.
formatted = "<i>" .. part.label .. "</i>"
else
-- Convert the term into a full link. Don't show a transliteration here unless enable_auto_translit is
-- requested, either at the `parts` level (i.e. per inflection) or at the `data.inflections` level (i.e.
-- specified for all inflections). This is controllable in {{head}} using autotrinfl=1 for all inflections,
-- or fNautotr=1 for an individual inflection (remember that a single inflection may be associated with
-- multiple terms). The reason for doing this is to avoid clutter in headword lines by default in languages
-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
-- to be enabled in languages with more complex scripts (e.g. Arabic).
--
-- FIXME: With nested inflections, should we also respect `enable_auto_translit` at the top level of the
-- nested inflections structure?
local tr = part.tr or not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil
-- FIXME: Temporary errors added 2025-10-03. Remove after a month or so.
if part.translit then
error("Internal error: Use field `tr` not `translit` for specifying an inflection part translit")
end
if part.transcription then
error("Internal error: Use field `ts` not `transcription` for specifying an inflection part transcription")
end
local postprocess_annotations
if part.inflections then
postprocess_annotations = function(infldata)
insert(infldata.annotations, format_inflections(data, part.inflections))
end
end
formatted = full_link(
{
term = not nolinkinfl and part.term or nil,
alt = part.alt or (nolinkinfl and part.term or nil),
lang = part.lang or data.lang,
sc = part.sc or parts.sc or nil,
gloss = part.gloss,
pos = part.pos,
lit = part.lit,
id = part.id,
genders = part.genders,
tr = tr,
ts = part.ts,
accel = partaccel or parts.accel,
postprocess_annotations = postprocess_annotations,
},
face
)
end
parts[j] = format_term_with_qualifiers_and_refs(part.lang or data.lang, part,
formatted, j)
end
local parts_output
if parts[1] then
parts_output = (parts.label and " " or "") .. concat(parts)
elseif parts.request then
parts_output = " <small>[please provide]</small>"
insert(data.categories, "Requests for inflections in " .. data.lang:getFullName() .. " entries")
else
parts_output = ""
end
local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
return format_term_with_qualifiers_and_refs(data.lang, parts, parts_label .. parts_output, 1)
end
-- Format the inflections following the headword or nested after a given inflection.
format_inflections = function(data, inflections)
if inflections and inflections[1] then
-- Format each inflection individually.
for key, infl in ipairs(inflections) do
inflections[key] = format_inflection_parts(data, infl)
end
return concat(inflections, ", ")
else
return ""
end
end
-- Format the top-level inflections following the headword. Currently this just adds parens around the
-- formatted comma-separated inflections in `data.inflections`.
local function format_top_level_inflections(data)
local result = format_inflections(data, data.inflections)
if result ~= "" then
return " (" .. result .. ")"
else
return result
end
end
--[==[
Returns the plural form of `pos`, a raw part of speech input, which could be singular or
plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to
"kanji").
]==]
function export.pluralize_pos(pos)
-- Make the plural form of the part of speech
return (m_data or get_data()).irregular_plurals[pos] or
pos:sub(-1) == "s" and pos or
pluralize(pos)
end
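-- Usage sketch:
--   export.pluralize_pos("noun") --> "nouns"
--   export.pluralize_pos("nouns") --> "nouns" (already ends in "-s", so returned as-is)
--   export.pluralize_pos("kanji") --> "kanji" (irregular plural, per the comment above)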
--[==[
Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil
if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
If you have a POS in its singular form, call {export.pluralize_pos()} above to pluralize it
in a smart fashion that knows when to add "-s" and when to add "-es", and also takes
into account any irregular plurals.
If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
based on whether it ends in " forms"; otherwise, return nil.
]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
local m_headword_data = m_data or get_data()
local isLemma = m_headword_data.lemmas
-- Is it a lemma category?
if isLemma[plpos] then
return "lemma"
end
local plpos_no_recon = plpos:gsub("^reconstructed ", "")
if isLemma[plpos_no_recon] then
return "lemma"
end
-- Is it a nonlemma category?
local isNonLemma = m_headword_data.nonlemmas
if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
return "non-lemma form"
end
local plpos_no_mut = plpos:gsub("^mutated ", "")
if isLemma[plpos_no_mut] or isNonLemma[plpos_no_mut] then
return "non-lemma form"
elseif best_guess then
return plpos:find(" forms$") and "non-lemma form" or "lemma"
else
return nil
end
end
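-- Usage sketch (membership in the lemma/non-lemma lists comes from [[Module:headword/data]];
-- "nouns" and "noun forms" are assumed to be listed there):
--   export.pos_lemma_or_nonlemma("nouns") --> "lemma"
--   export.pos_lemma_or_nonlemma("noun forms") --> "non-lemma form"
--   export.pos_lemma_or_nonlemma("widgets", true) --> "lemma" (best guess: no " forms" suffix)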
--[==[
Canonicalize a part of speech as specified in 2= in {{tl|head}}. This checks for POS aliases and non-lemma form
aliases ending in 'f', and then pluralizes if the POS term does not have an invariable plural.
]==]
function export.canonicalize_pos(pos)
-- FIXME: Temporary code to throw an error for alias 'pre' (= preposition) that will go away.
if pos == "pre" then
-- Don't throw error on 'pref' as it's an alias for "prefix".
error("POS 'pre' for 'preposition' no longer allowed as it's too ambiguous; use 'prep'")
end
-- Likewise for pro = pronoun.
if pos == "pro" or pos == "prof" then
error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
end
local m_headword_data = m_data or get_data()
if m_headword_data.pos_aliases[pos] then
pos = m_headword_data.pos_aliases[pos]
elseif pos:sub(-1) == "f" then
pos = pos:sub(1, -2)
pos = (m_headword_data.pos_aliases[pos] or pos) .. " forms"
end
return export.pluralize_pos(pos)
end
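-- Usage sketch (alias resolution comes from [[Module:headword/data]]):
--   export.canonicalize_pos("pref") --> "prefixes" ('pref' aliases "prefix", per the comment above)
--   export.canonicalize_pos("nounf") --> "noun forms" (the trailing 'f' marks a non-lemma form)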
-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a
-- zero-length array if unspecified. Check to make sure all keys are numeric (other than "maxindex", which is set by
-- [[Module:parameters]] for list parameters), all values are strings, and unless `allow_blank_string` is given,
-- no blank (zero-length) strings are present.
local function init_and_find_maximum_index(data, element, allow_blank_string)
local maxind = 0
if not data[element] then
data[element] = {}
end
local typ = type(data[element])
if typ ~= "table" then
error(("Internal error: In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
end
for k, v in pairs(data[element]) do
if k ~= "maxindex" then
if type(k) ~= "number" then
error(("Internal error: Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
end
if k > maxind then
maxind = k
end
if v then
if type(v) ~= "string" then
error(("Internal error: For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
end
if not allow_blank_string and v == "" then
error(("Internal error: For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
end
end
end
end
return maxind
end
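-- Usage sketch (hypothetical sparse list, as [[Module:parameters]] produces for list parameters):
--   local data = {translits = {[1] = "tr1", [3] = "tr3", maxindex = 3}}
--   init_and_find_maximum_index(data, "translits", true) --> 3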
--[==[
-- Add the page to various maintenance categories for the language and the
-- whole page. These are placed in the headword somewhat arbitrarily, but
-- mainly because headword templates are mandatory for entries (meaning that
-- in theory it provides full coverage).
--
-- This is provided as an external entry point so that modules which transclude
-- information from other entries (such as {{tl|ja-see}}) can take advantage
-- of this feature as well, because they are used in place of a conventional
-- headword template.]==]
do
-- Handle any manual sortkeys that have been specified in raw categories
-- by tracking if they are the same or different from the automatically-
-- generated sortkey, so that we can track them in maintenance
-- categories.
local function handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
sortkey = sortkey or lang:makeSortKey(page.pagename)
-- If there are raw categories with no sortkey, then they will be
-- sorted based on the default MediaWiki sortkey, so we check against
-- that.
if tbl == true then
return
end
-- Flag whether the raw sortkey matches (redundant) or differs from (different)
-- the automatic one. (Currently these flags are computed but not used further.)
local redundant, different
for k in pairs(tbl) do
if k == sortkey then
redundant = true
else
different = true
end
end
return sortkey
end
function export.maintenance_cats(page, lang, lang_cats, page_cats)
extend(page_cats, page.cats)
lang = lang:getFull() -- since we are just generating categories
local canonical = lang:getCanonicalName()
local tbl, sortkey = page.wikitext_topic_cat[lang:getCode()]
if tbl then
sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
end
tbl = page.wikitext_langname_cat[canonical]
if tbl then
handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
end
end
end
--[==[This is the primary external entry point.
{{lua|full_headword(data)}}
This is used by {{temp|head}} and various language-specific headword templates (e.g. {{temp|ru-adj}} for Russian adjectives, {{temp|de-noun}} for German nouns, etc.) to display an entire headword line.
See [[#Further explanations for full_headword()]]
]==]
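--[==[
Illustrative sketch of a minimal `data` table (values are hypothetical; the language object would
come from the wiki's language-data modules, and many more fields are supported):
	{
		lang = lang_object, -- language object, as required by the check below
		pos_category = "nouns",
		categories = {},
		inflections = {},
		heads = {{term = "example"}},
	}
]==]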
function export.full_headword(data)
-- Prevent data from being destructively modified.
local data = shallow_copy(data)
------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------
if data.getCanonicalName then
error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
end
if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
end
if data.id and type(data.id) ~= "string" then
error("Internal error: The id in the data table should be a string.")
end
------------ 2. Initialize pagename etc. ------------
local langcode = data.lang:getCode()
local full_langcode = data.lang:getFullCode()
local langname = data.lang:getCanonicalName()
local full_langname = data.lang:getFullName()
local raw_pagename = data.pagename
local page
local m_headword_data = m_data or get_data()
if raw_pagename and raw_pagename ~= m_headword_data.pagename then -- for testing, doc pages, etc.
-- data.pagename is often set on documentation and test pages through the pagename= parameter of various
-- templates, to emulate running on that page. Having a large number of such test templates on a single
-- page often leads to timeouts, because we fetch and parse the contents of each page in turn. However,
-- we don't really need to do that and can function fine without fetching and parsing the contents of a
-- given page, so turn off content fetching/parsing (and also setting the DEFAULTSORT key through a parser
-- function, which is *slooooow*) in certain namespaces where test and documentation templates are likely to
-- be found and where actual content does not live (User, Template, Module).
local actual_namespace = m_headword_data.page.namespace
local no_fetch_content = actual_namespace == "User" or actual_namespace == "Template" or
actual_namespace == "Module"
page = process_page(raw_pagename, no_fetch_content)
else
page = m_headword_data.page
end
local namespace = page.namespace
------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
if type(data.heads) == "table" and type(data.heads[1]) == "table" then
-- new-style
if data.translits or data.transcriptions then
error("Internal error: In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
end
else
-- convert old-style `heads`, `translits` and `transcriptions` to new-style
local maxind = max(
init_and_find_maximum_index(data, "heads"),
init_and_find_maximum_index(data, "translits", true),
init_and_find_maximum_index(data, "transcriptions", true)
)
for i = 1, maxind do
data.heads[i] = {
term = data.heads[i],
tr = data.translits[i],
ts = data.transcriptions[i],
}
end
end
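-- E.g. old-style `heads = {"foo"}, translits = {"bar"}` (values illustrative) is converted
-- to new-style `heads = {{term = "foo", tr = "bar"}}`.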
-- Make sure there's at least one head.
if not data.heads[1] then
data.heads[1] = {}
end
------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------
-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]
if data.altform then
data.noposcat = true
end
init_and_find_maximum_index(data, "categories")
init_and_find_maximum_index(data, "whole_page_categories")
local pos_category_already_present = false
if data.categories[1] then
local escaped_langname = pattern_escape(full_langname)
local matches_lang_pattern = "^" .. escaped_langname .. " "
-- If `pos_category` not given, try to infer it from the first specified category. If this doesn't work, we
-- throw an error below.
if not data.pos_category and data.categories[1]:find(matches_lang_pattern) then
data.pos_category = data.categories[1]:gsub(matches_lang_pattern, "")
-- Optimization to avoid inserting category already present.
pos_category_already_present = true
end
end
if not data.pos_category then
error("Internal error: `data.pos_category` not specified and could not be inferred from the categories given in "
.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
.. "language's canonical name plus the plural part of speech (e.g. \"Norwegian Bokmål proper nouns\")."
)
end
-- Insert a category at the beginning for the part of speech unless it's already present or `data.noposcat` given.
if not pos_category_already_present and not data.noposcat then
local pos_category = full_langname .. " " .. data.pos_category
-- FIXME: [[User:Theknightwho]] Why is this special case here? Please add an explanatory comment.
if pos_category ~= "Translingual Han characters" then
insert(data.categories, 1, pos_category)
end
end
-- Try to determine whether the part of speech refers to a lemma or a non-lemma form; if we can figure this out,
-- add an appropriate category.
local postype = export.pos_lemma_or_nonlemma(data.pos_category)
if not data.noposcat then
if postype == "lemma" then
postype = data.lang:getMainCategoryName()
end
insert(data.categories, 1, full_langname .. " " .. postype .. "s")
end
insert(data.categories, 1, "Contionary")
-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]
if data.altform then
insert(data.categories, 1, full_langname .. " alternative forms")
end
------------ 5. Create a default headword, and add links to multiword page names. ------------
-- Determine if this is an "anti-asterisk" term, i.e. an attested term in a language that must normally be
-- reconstructed.
local is_anti_asterisk = data.heads[1].term and data.heads[1].term:find("^!!")
local lang_reconstructed = data.lang:hasType("reconstructed")
if is_anti_asterisk then
if not lang_reconstructed then
error("Anti-asterisk feature (head= beginning with !!) can only be used with reconstructed languages")
end
lang_reconstructed = false
end
-- Determine if term is reconstructed
local is_reconstructed = namespace == "Reconstruction" or lang_reconstructed
-- Create a default headword based on the pagename, which is determined in
-- advance by the data module so that it only needs to be done once.
local default_head = page.pagename
-- Add links to multi-word page names when appropriate
if not (is_reconstructed or data.nolinkhead) then
local no_links = m_headword_data.no_multiword_links
if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then
default_head = export.add_multiword_links(default_head, true)
end
end
if is_reconstructed then
default_head = "*" .. default_head
end
------------ 6. Check the namespace against the language type. ------------
if namespace == "" then
if lang_reconstructed then
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
elseif data.lang:hasType("appendix-constructed") then
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
end
elseif namespace == "Citations" or namespace == "Thesaurus" then
error("Headword templates should not be used in the " .. namespace .. ": namespace.")
end
------------ 7. Fill in missing values in `data.heads`. ------------
-- True if any script among the headword scripts has spaces in it.
local any_script_has_spaces = false
-- True if any term has a redundant head= param.
local has_redundant_head_param = false
for _, head in ipairs(data.heads) do
------ 7a. If missing head, replace with default head.
if not head.term then
head.term = default_head
elseif head.term == default_head then
has_redundant_head_param = true
elseif is_anti_asterisk and head.term == "!!" then
-- If explicit head=!! is given, it's an anti-asterisk term and we fill in the default head.
head.term = "!!" .. default_head
elseif head.term:find("^[!?]$") then
-- If explicit head= just consists of ! or ?, add it to the end of the default head.
head.term = default_head .. head.term
end
head.term_no_initial_bang_bang = is_anti_asterisk and head.term:sub(3) or head.term
if is_reconstructed then
local head_term = head.term
if head_term:find("%[%[") then
head_term = remove_links(head_term)
end
if head_term:sub(1, 1) ~= "*" then
error("The headword '" .. head_term .. "' must begin with '*' to indicate that it is reconstructed.")
end
end
------ 7b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------ otherwise fall back to the overall script if given. If neither given, autodetect the script.
local auto_sc = data.lang:findBestScript(head.term)
if (
auto_sc:getCode() == "None" and
find_best_script_without_lang(head.term):getCode() ~= "None"
) then
insert(data.categories, full_langname .. " terms in nonstandard scripts")
end
if not (head.sc or data.sc) then -- No script code given, so use autodetected script.
head.sc = auto_sc
else
if not head.sc then -- Overall script code given.
head.sc = data.sc
end
end
-- If using a discouraged character sequence, add to maintenance category.
if head.sc:hasNormalizationFixes() == true then
local composed_head = toNFC(head.term)
if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then
insert(data.whole_page_categories, "Pages using discouraged character sequences")
end
end
any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()
------ 7c. Create automatic transliterations for any non-Latin headwords without manual translit given
------ (provided automatic translit is available, e.g. not in Persian or Hebrew).
-- Make transliterations
head.tr_manual = nil
-- Try to generate a transliteration if necessary
if head.tr == "-" then
head.tr = nil
else
local notranslit = m_headword_data.notranslit
if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then
head.tr_manual = not not head.tr
local text = head.term_no_initial_bang_bang
if not data.lang:link_tr(head.sc) then
text = remove_links(text)
end
local automated_tr, tr_categories
automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)
if automated_tr or head.tr_fail then
local manual_tr = head.tr
if not manual_tr then
head.tr = automated_tr
extend(data.categories, tr_categories)
end
end
-- There is still no transliteration?
-- Add the entry to a cleanup category.
if not head.tr then
head.tr = "<small>transliteration needed</small>"
-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.
-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].
insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")
else
-- Otherwise, trim it.
head.tr = trim(head.tr)
end
end
end
-- Link to the transliteration entry for languages that require this.
if head.tr and data.lang:link_tr(head.sc) then
head.tr = full_link{
term = head.tr,
lang = data.lang,
sc = get_script("Latn"),
tr = "-"
}
end
end
------------ 8. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------
-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
-- (FIXME: Don't make assumptions like this, and if you need to do so, throw an error if the assumption is violated.)
-- Avoid tagging ASCII as Hani even when it is tagged as Hani in the headword, as in [[check]]. The check for ASCII
-- might need to be expanded to a check for any Latin characters and whitespace or punctuation.
local display_title
-- Where there are multiple headwords, use the script for the first. This assumes the first headword is similar to
-- the pagename, and that headwords that are in different scripts from the pagename aren't first. This seems to be
-- about the best we can do (alternatively we could potentially do script detection on the pagename).
local dt_script = data.heads[1].sc
local dt_script_code = dt_script:getCode()
local page_non_ascii = namespace == "" and not page.pagename:find("^[%z\1-\127]+$")
local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^Unsupported titles/", "")
if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
elseif page_non_ascii and m_headword_data.toBeTagged[dt_script_code]
or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
display_title = '<span class="' .. dt_script_code .. '">' .. page.full_raw_pagename .. '</span>'
-- Keep Han entries region-neutral in the display title.
elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then
display_title = '<span class="Hani">' .. page.full_raw_pagename .. '</span>'
elseif namespace == "Reconstruction" then
local matched
display_title, matched = ugsub(
page.full_raw_pagename,
"^(Reconstruction:[^/]+/)(.+)$",
function(before, term)
return before .. tag_text(term, data.lang, dt_script)
end
)
if matched == 0 then
display_title = nil
end
end
-- FIXME: Generalize this.
-- If the current language uses ur-Arab (for Urdu, etc.), ku-Arab (Central Kurdish) or pa-Arab
-- (Shahmukhi, for Punjabi) and there's more than one language on the page, don't set the display title:
-- these three scripts display in Nastaliq, and we don't want terms that also exist in other languages
-- whose scripts don't display in Nastaliq (e.g. Arabic or Persian) to have their titles displayed in
-- Nastaliq. Because the word "Urdu" occurs near the end of the alphabet, Urdu fonts tend to override the
-- fonts of other languages.
-- FIXME: This is checking for more than one language on the page but instead needs to check if there are any
-- languages using scripts other than the ones just mentioned.
if (dt_script_code == "ur-Arab" or dt_script_code == "ku-Arab" or dt_script_code == "pa-Arab") and page.L2_list.n > 1 then
display_title = nil
end
if display_title then
mw.getCurrentFrame():callParserFunction(
"DISPLAYTITLE",
display_title
)
end
------------ 9. Insert additional categories. ------------
-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then
local no_multiword_cat = m_headword_data.no_multiword_cat
if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
-- Use the pagename, not the head= value, because the latter may have extra
-- junk in it, e.g. superscripted text that throws off the algorithm.
local no_hyphen = m_headword_data.hyphen_not_multiword_sep
-- Exclude hyphens if the data module states that they should for this language.
local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."
local is_multiword = umatch(page.pagename, checkpattern)
if is_multiword and not non_categorizable(page.full_raw_pagename) then
insert(data.categories, full_langname .. " multiword terms")
elseif not is_multiword then
local long_word_threshold = m_headword_data.long_word_thresholds[langcode]
if long_word_threshold and ulen(page.pagename) >= long_word_threshold then
insert(data.categories, "Long " .. full_langname .. " words")
end
end
end
end
if data.sccat then
for _, head in ipairs(data.heads) do
insert(data.categories, full_langname .. " " .. data.pos_category .. " in " ..
head.sc:getDisplayForm())
end
end
-- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated.
if namespace ~= "Reconstruction" then
-- Map from languages to a string containing the characters to ignore when considering whether a term has
-- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic
-- values.
local characters_to_ignore = {
["aaq"] = "α", -- Penobscot
["acy"] = "δθ", -- Cypriot Arabic
["anc"] = "γ", -- Ngas
["aou"] = "χ", -- A'ou
["awg"] = "β", -- Anguthimri
["bhp"] = "β", -- Bima
["byk"] = "θ", -- Biao
["cdy"] = "θ", -- Chadong
["clm"] = "χ", -- Klallam
["col"] = "χ", -- Colombia-Wenatchi
["coo"] = "χ", -- Comox; FIXME: others? E.g. Greek theta (θ)?
["ets"] = "θ", -- Yekhee
["gmw-gts"] = "χ", -- Gottscheerish
["hur"] = "θ", -- Halkomelem
["izh"] = "ь", -- Ingrian
["kic"] = "θ", -- Kickapoo
["lil"] = "χ", -- Lillooet
["mhz"] = "β", -- Mor (Austronesian)
["neg"]= "ӡ", -- Negidal (normally in Cyrillic)
["oui"] = "γβ", -- Old Uyghur: FIXME: others? E.g. Greek delta (δ)?
["pox"] = "χ", -- Polabian
["rom"] = "Θθ", -- Romani: International Standard; two different thetas???
["sah"] = "ь", -- Yakut (1929 - 1939 Latin spelling)
["sjw"] = "θ", -- Shawnee
["squ"] = "χ", -- Squamish
["str"] = "χθ", -- Saanich; uses two Greek letters
["twa"] = "χ", -- Twana
["yha"] = "θ", -- Baha
["za"] = "зч", -- Zhuang; 1957-1982 alphabet used two Cyrillic letters (as well as some others like
-- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin)
["zlw-slv"] = "χђћ", -- Slovincian; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure
-- the currect characters are being chosen in the entry names
["zng"] = "θ", -- Mang
}
-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude
-- scripts whose `character_category` is false as well as Zmth (mathematical notation symbols), which has a
-- category of "Mathematical notation symbols". When counting scripts, we need to elide language-specific
-- variants because e.g. Beng and as-Beng have slightly different characters but we don't want to consider them
-- two different scripts (e.g. [[এৰ]] has two characters which are detected respectively as Beng and as-Beng).
local seen_scripts = {}
local num_seen_scripts = 0
local num_loops = 0
local canon_pagename = page.pagename
local ch_to_ignore = characters_to_ignore[full_langcode]
if ch_to_ignore then
canon_pagename = ugsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")
end
while true do
if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then
break
end
-- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]]
num_loops = num_loops + 1
local pagename_script = find_best_script_without_lang(canon_pagename, "None only as last resort")
local script_chars = pagename_script.characters
if not script_chars then
-- we are stuck; this happens with None
break
end
local script_code = pagename_script:getCode()
local replaced
canon_pagename, replaced = ugsub(canon_pagename, "[" .. script_chars .. "]", "")
if (
replaced and
script_code ~= "Zmth" and
(script_data or get_script_data())[script_code] and
script_data[script_code].character_category ~= false
) then
script_code = script_code:gsub("^.-%-", "")
if not seen_scripts[script_code] then
seen_scripts[script_code] = true
num_seen_scripts = num_seen_scripts + 1
end
end
end
if num_seen_scripts > 1 then
insert(data.categories, full_langname .. " terms written in multiple scripts")
end
end
-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise
-- for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two
-- formats: single combining characters (i.e. character + diacritic(s)) or double combining characters
-- (i.e. character + diacritic(s) + character). Each can have any number of diacritics.
local standard = data.lang:getStandardCharacters()
if standard and not non_categorizable(page.full_raw_pagename) then
local function char_category(char)
local specials = {
["#"] = "number sign",
["("] = "parentheses",
[")"] = "parentheses",
["<"] = "angle brackets",
[">"] = "angle brackets",
["["] = "square brackets",
["]"] = "square brackets",
["_"] = "underscore",
["{"] = "braces",
["|"] = "vertical line",
["}"] = "braces",
["ß"] = "ẞ",
["\205\133"] = "", -- this is UTF-8 for U+0345 ( ͅ)
["\239\191\189"] = "replacement character",
}
char = toNFD(char)
:gsub(".[\128-\191]*", function(m)
local new_m = specials[m]
new_m = new_m or m:uupper()
return new_m
end)
return toNFC(char)
end
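-- E.g. char_category("é") decomposes to "e" plus combining U+0301, uppercases each character
-- to "E" plus U+0301, and recomposes to "É"; char_category("ß") maps directly to "ẞ" via `specials`.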
if full_langcode ~= "hi" and full_langcode ~= "lo" then
local standard_chars_scripts = {}
for _, head in ipairs(data.heads) do
standard_chars_scripts[head.sc:getCode()] = true
end
-- Iterate over the scripts, in case there is more than one (as they can have different sets of standard characters).
for code in pairs(standard_chars_scripts) do
local sc_standard = data.lang:getStandardCharacters(code)
if sc_standard then
if page.pagename_len > 1 then
local explode_standard = {}
local function explode(char)
explode_standard[char] = true
return ""
end
local sc_standard = ugsub(sc_standard, page.comb_chars.combined_double, explode)
sc_standard = ugsub(sc_standard, page.comb_chars.combined_single, explode)
:gsub(".[\128-\191]*", explode)
local num_cat_inserted
for char in pairs(page.explode_pagename) do
if not explode_standard[char] then
if char:find("[0-9]") then
if not num_cat_inserted then
insert(data.categories, full_langname .. " terms spelled with numbers")
num_cat_inserted = true
end
elseif ufind(char, page.emoji_pattern) then
insert(data.categories, full_langname .. " terms spelled with emoji")
else
local upper = char_category(char)
if not explode_standard[upper] then
char = upper
end
insert(data.categories, full_langname .. " terms spelled with " .. char)
end
end
end
end
-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.
sc_standard = toNFD(sc_standard)
for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do
if not umatch(sc_standard, diacritic) then
insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic)
end
end
for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do
if not umatch(sc_standard, diacritic) then
insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic .. "◌")
end
end
end
end
-- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them).
elseif ulen(page.pagename) ~= 1 then
for character in ugmatch(page.pagename, "([^" .. standard .. "])") do
local upper = char_category(character)
if not umatch(upper, "[" .. standard .. "]") then
character = upper
end
insert(data.categories, full_langname .. " terms spelled with " .. character)
end
end
end
if data.heads[1].sc:isSystem("alphabet") then
local pagename, i = page.pagename:ulower(), 2
while umatch(pagename, "(%a)" .. ("%1"):rep(i)) do
i = i + 1
insert(data.categories, full_langname .. " terms with " .. i .. " consecutive instances of the same letter")
end
end
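-- E.g. a lowercased pagename containing "lll" matches with i = 2 (three identical letters in a row),
-- so the entry lands in the "terms with 3 consecutive instances of the same letter" category.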
-- Categorise for palindromes
if not data.nopalindromecat and namespace ~= "Reconstruction" and ulen(page.pagename) > 2
-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of
-- multiple scripts?
and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
insert(data.categories, full_langname .. " palindromes")
end
-- Add to various maintenance categories.
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)
------------ 10. Format and return headwords, genders, inflections and categories. ------------
-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
-- so make sure we do it before evaluating `data.categories`.
local text = '<span class="headword-line">' ..
format_headword(data) ..
format_headword_genders(data) ..
format_top_level_inflections(data) .. '</span>'
-- Language-specific categories.
local cats = format_categories(
data.categories, data.lang, data.sort_key, page.encoded_pagename,
data.force_cat_output or test_force_categories, data.heads[1].sc
)
-- Language-agnostic categories.
local whole_page_cats = format_categories(
data.whole_page_categories, nil, "-"
)
return text .. cats .. whole_page_cats
end
return export