Module:headword: Difference between revisions

No edit summary
No edit summary
 
(10 intermediate revisions by the same user not shown)
Line 20: Line 20:


local concat = table.concat
local concat = table.concat
local dump = mw.dumpObject
local insert = table.insert
local insert = table.insert
local ipairs = ipairs
local ipairs = ipairs
Line 36: Line 37:
--[==[
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
local function encode_entities(...)
local function encode_entities(...)
encode_entities = require(string_utilities_module).encode_entities
encode_entities = require(string_utilities_module).encode_entities
return encode_entities(...)
return encode_entities(...)
end
-- Lazy loader for the extend() function from the table module; on first call
-- it overwrites itself with the real function (see the loader note above), so
-- later calls go straight to the target with no extra overhead.
local function extend(...)
extend = require(table_module).extend
return extend(...)
end
end


Line 218: Line 223:
end
end
if type(list) ~= "table" then
if type(list) ~= "table" then
error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, mw.dumpObject(list)))
error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, dump(list)))
end
end
return list[1]
return list[1]
Line 313: Line 318:
local unique_head_parts = {}
local unique_head_parts = {}


local has_multiple_heads = #data.heads > 1
local has_multiple_heads = not not data.heads[2]


for j, head in ipairs(data.heads) do
for j, head in ipairs(data.heads) do
Line 388: Line 393:
local saw_translit_page = false
local saw_translit_page = false


if transliteration_page and transliteration_page.exists then
if transliteration_page and transliteration_page:getContent() then
if data.lang:hasType("conlang") then
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
else
translits_formatted = " [[Wiktionary:" .. langname .. " transliteration|•]]" .. translits_formatted
end
saw_translit_page = true
saw_translit_page = true
end
end
Line 402: Line 403:
transliteration_page = new_title(langname .. " transliteration")
transliteration_page = new_title(langname .. " transliteration")


if transliteration_page and transliteration_page.exists then
if transliteration_page and transliteration_page:getContent() then
translits_formatted = " [[Wiktionary:" .. langname .. " transliteration|•]]" .. translits_formatted
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
end
end
end
end
Line 425: Line 426:
local function format_headword_genders(data)
local function format_headword_genders(data)
local retval = ""
local retval = ""
if data.genders and #data.genders > 0 then
if data.genders and data.genders[1] then
if data.gloss then
if data.gloss then
retval = ","
retval = ","
Line 437: Line 438:
end
end
local text, cats = format_genders(data.genders, data.lang, pos_for_cat)
local text, cats = format_genders(data.genders, data.lang, pos_for_cat)
for _, cat in ipairs(cats) do
if cats then
insert(data.categories, cat)
extend(data.categories, cats)
end
end
retval = retval .. " " .. text
retval = retval .. " " .. text
Line 445: Line 446:
end
end


-- Forward reference
local format_inflections


local function format_inflection_parts(data, parts)
local function format_inflection_parts(data, parts)
local any_part_translit = false
for j, part in ipairs(parts) do
for j, part in ipairs(parts) do
if type(part) ~= "table" then
if type(part) ~= "table" then
Line 467: Line 468:
-- right into the 'data' table to disable inflection links of the entire headword
-- right into the 'data' table to disable inflection links of the entire headword
-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin
-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin
local nolinkinfl = part.face == "hypothetical" or part.nolinkinfl or data.nolinkinfl
local nolinkinfl = part.face == "hypothetical" or part.nolinkinfl or data.nolinkinfl


local formatted
local formatted
Line 481: Line 482:
-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
-- to be enabled in languages with more complex scripts (e.g. Arabic).
-- to be enabled in languages with more complex scripts (e.g. Arabic).
local tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil)
--
if tr ~= "-" then
-- FIXME: With nested inflections, should we also respect `enable_auto_translit` at the top level of the
any_part_translit = true
-- nested inflections structure?
local tr = part.tr or not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil
-- FIXME: Temporary errors added 2025-10-03. Remove after a month or so.
if part.translit then
error("Internal error: Use field `tr` not `translit` for specifying an inflection part translit")
end
if part.transcription then
error("Internal error: Use field `ts` not `transcription` for specifying an inflection part transcription")
end
local postprocess_annotations
if part.inflections then
postprocess_annotations = function(infldata)
insert(infldata.annotations, format_inflections(data, part.inflections))
end
end
end
formatted = full_link(
formatted = full_link(
{
{
Line 497: Line 512:
genders = part.genders,
genders = part.genders,
tr = tr,
tr = tr,
ts = part.transcription,
ts = part.ts,
accel = partaccel or parts.accel,
accel = partaccel or parts.accel,
postprocess_annotations = postprocess_annotations,
},
},
face
face
Line 510: Line 526:
local parts_output
local parts_output


if #parts > 0 then
if parts[1] then
parts_output = (parts.label and " " or "") .. concat(parts)
parts_output = (parts.label and " " or "") .. concat(parts)
elseif parts.request then
elseif parts.request then
Line 520: Line 536:


local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
return parts_label .. parts_output, any_part_translit
return format_term_with_qualifiers_and_refs(data.lang, parts, parts_label .. parts_output, 1)
end
end




-- Format the inflections following the headword.
-- Format the inflections following the headword or nested after a given inflection. Declared local above.
local function format_inflections(data)
function format_inflections(data, inflections)
local any_part_translit = false
if inflections and inflections[1] then
if data.inflections and #data.inflections > 0 then
-- Format each inflection individually.
-- Format each inflection individually.
for key, infl in ipairs(data.inflections) do
for key, infl in ipairs(inflections) do
local this_any_part_translit
inflections[key] = format_inflection_parts(data, infl)
data.inflections[key], this_any_part_translit = format_inflection_parts(data, infl)
if this_any_part_translit then
any_part_translit = true
end
end
end


local concat_result = concat(data.inflections, ", ")
return concat(inflections, ", ")
return " (" .. concat_result .. ")"
else
else
return ""
return ""
end
end
end
end
-- Format the top-level inflections in `data.inflections`, which follow the
-- headword proper. The only top-level-specific behavior is wrapping the
-- comma-separated formatted inflections in parentheses when non-empty.
local function format_top_level_inflections(data)
	local formatted = format_inflections(data, data.inflections)
	if formatted == "" then
		return formatted
	end
	return " (" .. formatted .. ")"
end
-- Forward reference
local check_red_link_inflections
-- Check a single inflection -- a label plus zero or more terms, each of which
-- may itself carry nested inflections -- for red links. On the first red link
-- found, insert a red-link category based on `plpos` (the plural part of
-- speech to insert in the category) into `data.categories`, stop further
-- processing and return true. If no red links are found, return false.
local function check_red_link_inflection_parts(data, parts, plpos)
	for _, raw_part in ipairs(parts) do
		local part = raw_part
		if type(part) ~= "table" then
			part = {term = part}
		end
		local term = part.term
		-- Only unlinked terms can be red links; skip any term that already
		-- contains an explicit [[...]] link.
		if term and not term:find("%[%[") then
			local link_page = get_link_page(term, data.lang, part.sc or parts.sc or nil)
			if link_page then
				local title = mw.title.new(link_page)
				if title and not title:getContent() then
					insert(data.categories, data.lang:getFullName() .. " " .. plpos .. " with red links in their headword lines")
					return true
				end
			end
		end
		-- Recurse into any nested inflections attached to this part.
		if part.inflections and check_red_link_inflections(data, part.inflections, plpos) then
			return true
		end
	end
	return false
end
-- Check a set of inflections (each of which describes a single inflection of the term, such as feminine or
-- plural, and consists of a label and zero or more terms, each possibly with nested inflections) for red
-- links. If any are found, insert a red-link category based on `plpos` (the plural part of speech to insert
-- in the category) into `data.categories`, stop further processing, and return true. If no red links are
-- found, return false. Declared `local` above (forward reference), since it is mutually recursive with
-- check_red_link_inflection_parts.
function check_red_link_inflections(data, inflections, plpos)
	if inflections and inflections[1] then
		-- Check each inflection individually; the index is unused, so use `_`.
		for _, infl in ipairs(inflections) do
			if check_red_link_inflection_parts(data, infl, plpos) then
				return true
			end
		end
	end
	return false
end
-- Check the top-level inflections in `data.inflections`, along with any nested inflections, for red links.
-- If any are found, a red-link category based on `plpos` (the plural part of speech to insert in the
-- category) is added and true is returned; otherwise false is returned. Thin wrapper around
-- check_red_link_inflections().
local function check_red_link_inflections_top_level(data, plpos)
return check_red_link_inflections(data, data.inflections, plpos)
end


--[==[
--[==[
Line 567: Line 649:
]==]
]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
function export.pos_lemma_or_nonlemma(plpos, best_guess)
local isLemma = (m_data or get_data()).lemmas
local m_headword_data = m_data or get_data()
local isLemma = m_headword_data.lemmas
-- Is it a lemma category?
-- Is it a lemma category?
if isLemma[plpos] then
if isLemma[plpos] then
Line 577: Line 660:
end
end
-- Is it a nonlemma category?
-- Is it a nonlemma category?
local isNonLemma = (m_data or get_data()).nonlemmas
local isNonLemma = m_headword_data.nonlemmas
if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
return "non-lemma form"
return "non-lemma form"
Line 605: Line 688:
error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
end
end
local data = m_data or get_data()
local m_headword_data = m_data or get_data()
if data.pos_aliases[pos] then
if m_headword_data.pos_aliases[pos] then
pos = data.pos_aliases[pos]
pos = m_headword_data.pos_aliases[pos]
elseif pos:sub(-1) == "f" then
elseif pos:sub(-1) == "f" then
pos = pos:sub(1, -2)
pos = pos:sub(1, -2)
pos = (data.pos_aliases[pos] or pos) .. " forms"
pos = (m_headword_data.pos_aliases[pos] or pos) .. " forms"
end
end
return export.pluralize_pos(pos)
return export.pluralize_pos(pos)
Line 626: Line 709:
local typ = type(data[element])
local typ = type(data[element])
if typ ~= "table" then
if typ ~= "table" then
error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
error(("Internal error: In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
end
end
for k, v in pairs(data[element]) do
for k, v in pairs(data[element]) do
if k ~= "maxindex" then
if k ~= "maxindex" then
if type(k) ~= "number" then
if type(k) ~= "number" then
error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
error(("Internal error: Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
end
end
if k > maxind then
if k > maxind then
Line 638: Line 721:
if v then
if v then
if type(v) ~= "string" then
if type(v) ~= "string" then
error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
error(("Internal error: For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
end
end
if not allow_blank_string and v == "" then
if not allow_blank_string and v == "" then
error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
error(("Internal error: For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
end
end
end
end
Line 693: Line 776:


function export.maintenance_cats(page, lang, lang_cats, page_cats)
function export.maintenance_cats(page, lang, lang_cats, page_cats)
for _, cat in ipairs(page.cats) do
extend(page_cats, page.cats)
insert(page_cats, cat)
end
lang = lang:getFull() -- since we are just generating categories
lang = lang:getFull() -- since we are just generating categories
local canonical = lang:getCanonicalName()
local canonical = lang:getCanonicalName()
Line 727: Line 808:


if data.getCanonicalName then
if data.getCanonicalName then
error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
end
end


if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
end
end


if data.id and type(data.id) ~= "string" then
if data.id and type(data.id) ~= "string" then
error("The id in the data table should be a string.")
error("Internal error: The id in the data table should be a string.")
end
end


Line 745: Line 826:
local full_langname = data.lang:getFullName()
local full_langname = data.lang:getFullName()


local raw_pagename, page = data.pagename
local raw_pagename = data.pagename
if raw_pagename and raw_pagename ~= (m_data or get_data()).pagename then -- for testing, doc pages, etc.
local page
page = process_page(raw_pagename)
local m_headword_data = m_data or get_data()
if raw_pagename and raw_pagename ~= m_headword_data.pagename then -- for testing, doc pages, etc.
-- data.pagename is often set on documentation and test pages through the pagename= parameter of various
-- templates, to emulate running on that page. Having a large number of such test templates on a single
-- page often leads to timeouts, because we fetch and parse the contents of each page in turn. However,
-- we don't really need to do that and can function fine without fetching and parsing the contents of a
-- given page, so turn off content fetching/parsing (and also setting the DEFAULTSORT key through a parser
-- function, which is *slooooow*) in certain namespaces where test and documentation templates are likely to
-- be found and where actual content does not live (User, Template, Module).
local actual_namespace = m_headword_data.page.namespace
local no_fetch_content = actual_namespace == "User" or actual_namespace == "Template" or
actual_namespace == "Module"
page = process_page(raw_pagename, no_fetch_content)
else
else
page = (m_data or get_data()).page
page = m_headword_data.page
end
end


-- Check the namespace against the language type.
local namespace = page.namespace
local namespace = page.namespace
if namespace == "" then
if data.lang:hasType("reconstructed") then
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
elseif data.lang:hasType("appendix-constructed") then
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
end
elseif namespace == "Citations" or namespace == "Thesaurus" then
error("Headword templates should not be used in the " .. namespace .. ": namespace.")
end


------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
Line 769: Line 852:
-- new-style
-- new-style
if data.translits or data.transcriptions then
if data.translits or data.transcriptions then
error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
error("Internal error: In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
end
end
else
else
Line 802: Line 885:
init_and_find_maximum_index(data, "whole_page_categories")
init_and_find_maximum_index(data, "whole_page_categories")
local pos_category_already_present = false
local pos_category_already_present = false
if #data.categories > 0 then
if data.categories[1] then
local escaped_langname = pattern_escape(full_langname)
local escaped_langname = pattern_escape(full_langname)
local matches_lang_pattern = "^" .. escaped_langname .. " "
local matches_lang_pattern = "^" .. escaped_langname .. " "
Line 816: Line 899:


if not data.pos_category then
if not data.pos_category then
error("`data.pos_category` not specified and could not be inferred from the categories given in "
error("Internal error: `data.pos_category` not specified and could not be inferred from the categories given in "
.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
Line 846: Line 929:


------------ 5. Create a default headword, and add links to multiword page names. ------------
------------ 5. Create a default headword, and add links to multiword page names. ------------
-- Determine if this is an "anti-asterisk" term, i.e. an attested term in a language that must normally be
-- reconstructed.
local is_anti_asterisk = data.heads[1].term and data.heads[1].term:find("^!!")
local lang_reconstructed = data.lang:hasType("reconstructed")
if is_anti_asterisk then
if not lang_reconstructed then
error("Anti-asterisk feature (head= beginning with !!) can only be used with reconstructed languages")
end
lang_reconstructed = false
end


-- Determine if term is reconstructed
-- Determine if term is reconstructed
local is_reconstructed = namespace == "Reconstruction" or data.lang:hasType("reconstructed")
local is_reconstructed = namespace == "Reconstruction" or lang_reconstructed


-- Create a default headword based on the pagename, which is determined in
-- Create a default headword based on the pagename, which is determined in
Line 856: Line 951:
-- Add links to multi-word page names when appropriate
-- Add links to multi-word page names when appropriate
if not (is_reconstructed or data.nolinkhead) then
if not (is_reconstructed or data.nolinkhead) then
local no_links = (m_data or get_data()).no_multiword_links
local no_links = m_headword_data.no_multiword_links
if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then
if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then
default_head = export.add_multiword_links(default_head, true)
default_head = export.add_multiword_links(default_head, true)
Line 866: Line 961:
end
end


------------ 6. Fill in missing values in `data.heads`. ------------
------------ 6. Check the namespace against the language type. ------------
 
if namespace == "" then
if lang_reconstructed then
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
elseif data.lang:hasType("appendix-constructed") then
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
end
elseif namespace == "Citations" or namespace == "Thesaurus" then
error("Headword templates should not be used in the " .. namespace .. ": namespace.")
end
 
------------ 7. Fill in missing values in `data.heads`. ------------


-- True if any script among the headword scripts has spaces in it.
-- True if any script among the headword scripts has spaces in it.
Line 875: Line 982:
for _, head in ipairs(data.heads) do
for _, head in ipairs(data.heads) do


------ 6a. If missing head, replace with default head.
------ 7a. If missing head, replace with default head.
if not head.term then
if not head.term then
head.term = default_head
head.term = default_head
elseif head.term == default_head then
elseif head.term == default_head then
has_redundant_head_param = true
has_redundant_head_param = true
elseif is_anti_asterisk and head.term == "!!" then
-- If explicit head=!! is given, it's an anti-asterisk term and we fill in the default head.
head.term = "!!" .. default_head
elseif head.term:find("^[!?]$") then
elseif head.term:find("^[!?]$") then
-- If explicit head= just consists of ! or ?, add it to the end of the default head.
-- If explicit head= just consists of ! or ?, add it to the end of the default head.
head.term = default_head .. head.term
head.term = default_head .. head.term
end
end
head.term_no_initial_bang_bang = is_anti_asterisk and head.term:sub(3) or head.term


if is_reconstructed then
if is_reconstructed then
Line 895: Line 1,006:
end
end


------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------ 7b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------    otherwise fall back to the overall script if given. If neither given, autodetect the script.
------    otherwise fall back to the overall script if given. If neither given, autodetect the script.


Line 913: Line 1,024:
-- Track uses of sc parameter.
-- Track uses of sc parameter.
if head.sc:getCode() == auto_sc:getCode() then
if head.sc:getCode() == auto_sc:getCode() then
insert(data.categories, full_langname .. " terms with redundant script codes")
if not data.no_script_code_cat then
insert(data.categories, full_langname .. " terms with redundant script codes")
end
else
else
insert(data.categories, full_langname .. " terms with non-redundant manual script codes")
if not data.no_script_code_cat then
insert(data.categories, full_langname .. " terms with non-redundant manual script codes")
end
end
end
end
end
Line 929: Line 1,044:
any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()
any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()


------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given
------ 7c. Create automatic transliterations for any non-Latin headwords without manual translit given
------    (provided automatic translit is available, e.g. not in Persian or Hebrew).
------    (provided automatic translit is available, e.g. not in Persian or Hebrew).


Line 940: Line 1,055:
head.tr = nil
head.tr = nil
else
else
local notranslit = (m_data or get_data()).notranslit
local notranslit = m_headword_data.notranslit
if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then
if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then
head.tr_manual = not not head.tr
head.tr_manual = not not head.tr


local text = head.term
local text = head.term_no_initial_bang_bang
if not data.lang:link_tr(head.sc) then
if not data.lang:link_tr(head.sc) then
text = remove_links(text)
text = remove_links(text)
end
end


local automated_tr, tr_categories
local automated_tr = data.lang:transliterate(text, head.sc)
automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)


if automated_tr or head.tr_fail then
if automated_tr then
local manual_tr = head.tr
local manual_tr = head.tr


if manual_tr then
if manual_tr then
if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then
if remove_links(manual_tr) == remove_links(automated_tr) then
insert(data.categories, full_langname .. " terms with redundant transliterations")
insert(data.categories, full_langname .. " terms with redundant transliterations")
elseif not head.tr_fail then
else
insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
end
end
Line 965: Line 1,079:
if not manual_tr then
if not manual_tr then
head.tr = automated_tr
head.tr = automated_tr
for _, category in ipairs(tr_categories) do
insert(data.categories, category)
end
end
end
end
end
Line 996: Line 1,107:
end
end


------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------
------------ 8. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------


-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
Line 1,012: Line 1,123:
if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
elseif page_non_ascii and (m_data or get_data()).toBeTagged[dt_script_code]
elseif page_non_ascii and m_headword_data.toBeTagged[dt_script_code]
or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
Line 1,051: Line 1,162:
end
end


------------ 8. Insert additional categories. ------------
------------ 9. Insert additional categories. ------------


if has_redundant_head_param then
if has_redundant_head_param then
if not data.no_redundant_head_cat then
if not data.no_redundant_head_cat then
insert(data.categories, full_langname .. " terms with redundant head parameter")
-- This is not the right way to go about this; too many exceptions and problems due to language-specific headword
-- handling customization. If we want this, it should be opt-in by a given language passing in the default headword.
-- insert(data.categories, full_langname .. " terms with redundant head parameter")
end
end
end
end
Line 1,061: Line 1,174:
-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then
local no_multiword_cat = (m_data or get_data()).no_multiword_cat
local no_multiword_cat = m_headword_data.no_multiword_cat
if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then
if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
-- Use the pagename, not the head= value, because the latter may have extra
-- Use the pagename, not the head= value, because the latter may have extra
-- junk in it, e.g. superscripted text that throws off the algorithm.
-- junk in it, e.g. superscripted text that throws off the algorithm.
local no_hyphen = (m_data or get_data()).hyphen_not_multiword_sep
local no_hyphen = m_headword_data.hyphen_not_multiword_sep
-- Exclude hyphens if the data module states that they should for this language.
-- Exclude hyphens if the data module states that they should for this language.
local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."
local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."
if umatch(page.pagename, checkpattern) and not non_categorizable(page.full_raw_pagename) then
local is_multiword = umatch(page.pagename, checkpattern)
 
if is_multiword and not non_categorizable(page.full_raw_pagename) then
insert(data.categories, full_langname .. " multiword terms")
insert(data.categories, full_langname .. " multiword terms")
elseif not is_multiword then
local long_word_threshold = m_headword_data.long_word_thresholds[langcode] or
m_headword_data.long_word_thresholds[full_langcode]
if long_word_threshold and ulen(page.pagename) >= long_word_threshold then
insert(data.categories, "Long " .. full_langname .. " words")
end
end
end
end
end
end
end


if data.sccat then
local default_sccat = m_headword_data.default_sccat
if data.sccat or data.sccat == nil and (default_sccat[langcode] or default_sccat[full_langcode]) then
for _, head in ipairs(data.heads) do
for _, head in ipairs(data.heads) do
insert(data.categories, full_langname .. " " .. data.pos_category .. " in " ..
insert(data.categories, full_langname .. " " .. data.pos_category .. " in " ..
Line 1,088: Line 1,210:
-- values.
-- values.
local characters_to_ignore = {
local characters_to_ignore = {
["aaq"] = "α", -- Penobscot
["aaq"] = "αάὰ", -- Penobscot (Algonquian)
["acy"] = "δθ", -- Cypriot Arabic
["acy"] = "δθ", -- Cypriot Arabic
["anc"] = "γ", -- Ngas
["aez"] = "β", -- Aeka (Trans-New Guinea)
["aou"] = "χ", -- A'ou
["anc"] = "γ", -- Ngas (Chadic/Afroasiatic)
["awg"] = "β", -- Anguthimri
["aou"] = "χ", -- A'ou (Kra-Dai)
["bhp"] = "β", -- Bima
["art-blk"] = "ч", -- Bolak (conlang)
["byk"] = "θ", -- Biao
["awg"] = "β", -- Anguthimri (Pama-Nyungan)
["cdy"] = "θ", -- Chadong
["az"] = "ь", -- Azerbaijani (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["clm"] = "χ", -- Klallam
["ba"] = "ь", -- Bashkir (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["col"] = "χ", -- Colombia-Wenatchi
["bhp"] = "β", -- Bima (Austronesian)
["coo"] = "χ", -- Comox; FIXME: others? E.g. Greek theta (θ)?
["bjz"] = "β", -- Baruga (Trans-New Guinea)
["ets"] = "θ", -- Yekhee
["byk"] = "θ", -- Biao (Kra-Dai)
["gmw-gts"] = "χ", -- Gottscheerish
["cdy"] = "θ", -- Chadong (Kra-Dai)
["hur"] = "θ", -- Halkomelem
["chp"] = "θ", -- Chipewyan (Athabaskan)
["izh"] = "ь", -- Ingrian
["cjh"] = "χ", -- Upper Chehalis (Salishan)
["kic"] = "θ", -- Kickapoo
["clm"] = "χ", -- Klallam (Salishan)
["lil"] = "χ", -- Lillooet
["col"] = "χ", -- Colombia-Wenatchi (Salishan)
["coo"] = "χθ", -- Comox (Salishan)
["crx"] = "θ", -- Carrier (Athabaskan)
["ets"] = "θ", -- Yekhee (Edoid/Niger-Congo)
["ett"] = "χ", -- Etruscan (isolate; in romanizations)
["fla"] = "χ", -- Montana Salish (Salishan)
["grt"] = "་", -- Garo (South Asian Sino-Tibetan)
["gmw-gts"] = "χ", -- Gottscheerish (Bavarian variant spoken in Slovenia)
["hur"] = "χθ", -- Halkomelem (Salishan)
["itc-psa"] = "f", -- Pre-Samnite (Italic; normally written in Greek)
["izh"] = "ь", -- Ingrian (Finnic)
["kic"] = "θ", -- Kickapoo (Algonquian)
["kk"] = "ь", -- Kazakh (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["ky"] = "ь", -- Kyrgyz (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["lil"] = "χ", -- Lillooet (Salishan)
["lsi"] = "ꓹ", -- Lashi (Lolo-Burmese/Sino-Tibetan; represents a glottal stop)
["mhz"] = "β", -- Mor (Austronesian)
["mhz"] = "β", -- Mor (Austronesian)
["neg"]=  "ӡ", -- Negidal (normally in Cyrillic)
["mqn"] = "β", -- Moronene (Austronesian)
["oui"] = "γβ", -- Old Uyghur: FIXME: others? E.g. Greek delta (δ)?
["neg"]=  "ӡā", -- Negidal (Tungusic; normally in Cyrillic)
["pox"] = "χ", -- Polabian
["oka"] = "χ", -- Okanagan (Salishan)
["rom"] = "Θθ", -- Romani: International Standard; two different thetas???
["ole"] = "θ", -- Olekha (Sino-Tibetan)
["sah"] = "ь", -- Yakut (1929 - 1939 Latin spelling)
["oui"] = "γβ", -- Old Uyghur (Turkic; FIXME: others? E.g. Greek delta (δ)?)
["sjw"] = "θ", -- Shawnee
["pox"] = "χ", -- Polabian (West Slavic)
["squ"] = "χ", -- Squamish
["rif"] = "ε", -- Tarifit (Berber)
["str"] = "χθ", -- Saanich; uses two Greek letters
["rom"] = "Θθ", -- Romani (Indic: International Standard; two different thetas???)
["twa"] = "χ", -- Twana
["rpn"] = "β", -- Repanbitip (Austronesian)
["yha"] = "θ", -- Baha
["sah"] = "ь", -- Yakut (Turkic; 1929 - 1939 Latin spelling)
["za"] = "зч", -- Zhuang; 1957-1982 alphabet used two Cyrillic letters (as well as some others like
["sit-jap"] = "χ", -- Japhug (Sino-Tibetan)
["sjw"] = "θ", -- Shawnee (Algonquian)
["squ"] = "χ", -- Squamish (Salishan)
["str"] = "χθ", -- Saanich (Salishan)
["teh"] = "χ", -- Tehuelche (Chonan; spoken in Argentina)
["tep"] = "η", -- Tepecano (Uto-Aztecan)
["thp"] = "χ", -- Thompson (Salishan)
["tk"] = "ь", -- Turkmen (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["tt"] = "ь", -- Kazakh (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["twa"] = "χ", -- Twana (Salishan)
["wbl"] = "ы", -- Wakhi (Iranian)
["xbc"] = "ϸ", -- Bactrian (Iranian; represents š; normally written in Greek)
["yha"] = "θ", -- Baha (Kra-Dai)
["za"] = "зч", -- Zhuang (Tai/Kra-Dai); 1957-1982 alphabet used two Cyrillic letters (as well as some others like
  -- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin)
  -- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin)
["zlw-slv"] = "χђћ", -- Slovincian; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure
["zlw-slv"] = "χђћ", -- Slovincian (West Slavic; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure
-- the correct characters are being chosen in the entry names
-- the correct characters are being chosen in the entry names)
["zng"] = "θ", -- Mang
["zng"] = "θ", -- Mang (Mon-Khmer)
["ztp"] = "θ", -- Loxicha Zapotec (Zapotecan)
}
}
-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude
-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude
Line 1,275: Line 1,426:
and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
insert(data.categories, full_langname .. " palindromes")
insert(data.categories, full_langname .. " palindromes")
end
-- Add red link category if called for and we're not a "large" page, where such checks are disabled.
if data.checkredlinks and not m_headword_data.large_pages[m_headword_data.pagename] then
local plposcat = type(data.checkredlinks) == "string" and data.checkredlinks or data.pos_category
check_red_link_inflections_top_level(data, plposcat)
end
end


Line 1,280: Line 1,437:
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)


------------ 9. Format and return headwords, genders, inflections and categories. ------------
------------ 10. Format and return headwords, genders, inflections and categories. ------------


-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
Line 1,287: Line 1,444:
format_headword(data) ..
format_headword(data) ..
format_headword_genders(data) ..
format_headword_genders(data) ..
format_inflections(data) .. '</span>'
format_top_level_inflections(data) .. '</span>'


-- Language-specific categories.
-- Language-specific categories.