Module:headword: Difference between revisions

(47 intermediate revisions by 3 users not shown)

Line 2:

-- Named constants for all modules used, to make it easier to swap out sandbox versions.

local gender_and_number_module = "Module:~~getn~~"

local debug_track_module = "Module:debug/track"

local en_utilities_module = "Module:en-utilities"

local gender_and_number_module = "Module:gender and number"

local headword_data_module = "Module:headword/data"

local headword_page_module = "Module:headword/page"

local links_module = "Module:links"

local load_module = "Module:load"

local pages_module = "Module:pages"

local palindromes_module = "Module:palindromes"

local ~~qualifier_module~~ = "Module:qualifier"

local pron_qualifier_module = "Module:pron qualifier"

local scripts_module = "Module:scripts"

local scripts_data_module = "Module:scripts/data"

Line 15:

Line 19:

local table_module = "Module:table"

local utilities_module = "Module:utilities"

~~local m_str_utils = require(string_utilities_module)~~

local concat = table.concat

local ~~encode_entities~~ = ~~m_str_utils~~.~~encode_entities~~

local dump = mw.dumpObject

local insert = table.insert

local ipairs = ipairs

local max = math.max

local new_title = mw.title.new

local pairs = pairs

local ~~pattern_escape~~ = ~~m_str_utils.pattern_escape~~

local require = require

~~local rgmatch = mw.ustring.gmatch~~

~~local rsubn = mw.ustring.gsub~~

~~local rfind = mw.ustring.find~~

~~local ulen = m_str_utils.len~~

~~local rmatch = mw.ustring.match~~

local toNFC = mw.ustring.toNFC

local toNFD = mw.ustring.toNFD

local type = type

local ufind = mw.ustring.find

local ugmatch = mw.ustring.gmatch

local ugsub = mw.ustring.gsub

local umatch = mw.ustring.match

--[==[

Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]

-- Lazy loader. The first call requires the module named by `debug_track_module`, rebinds the
-- local `debug_track` to what it returns, and forwards the arguments to it.
local function debug_track(...)
	local target = require(debug_track_module)
	debug_track = target
	return target(...)
end

-- Lazy loader. The first call fetches `encode_entities` from the module named by
-- `string_utilities_module`, rebinds this local to it, and forwards the arguments.
local function encode_entities(...)
	local target = require(string_utilities_module).encode_entities
	encode_entities = target
	return target(...)
end

-- Lazy loader. The first call fetches `extend` from the module named by `table_module`, rebinds
-- this local to it, and forwards the arguments.
local function extend(...)
	local target = require(table_module).extend
	extend = target
	return target(...)
end

-- Lazy loader. The first call fetches `findBestScriptWithoutLang` from the module named by
-- `scripts_module`, rebinds this local to it, and forwards the arguments.
local function find_best_script_without_lang(...)
	local target = require(scripts_module).findBestScriptWithoutLang
	find_best_script_without_lang = target
	return target(...)
end

-- Lazy loader. The first call fetches `format_categories` from the module named by
-- `utilities_module`, rebinds this local to it, and forwards the arguments.
local function format_categories(...)
	local target = require(utilities_module).format_categories
	format_categories = target
	return target(...)
end

-- Lazy loader. The first call fetches `format_genders` from the module named by
-- `gender_and_number_module`, rebinds this local to it, and forwards the arguments.
local function format_genders(...)
	local target = require(gender_and_number_module).format_genders
	format_genders = target
	return target(...)
end

-- Lazy loader. The first call fetches `format_qualifiers` from the module named by
-- `pron_qualifier_module`, rebinds this local to it, and forwards the arguments.
local function format_pron_qualifiers(...)
	local target = require(pron_qualifier_module).format_qualifiers
	format_pron_qualifiers = target
	return target(...)
end

-- Lazy loader. The first call fetches `full_link` from the module named by `links_module`,
-- rebinds this local to it, and forwards the arguments.
local function full_link(...)
	local target = require(links_module).full_link
	full_link = target
	return target(...)
end

-- Lazy loader. The first call fetches `get_current_L2` from the module named by `pages_module`,
-- rebinds this local to it, and forwards the arguments.
local function get_current_L2(...)
	local target = require(pages_module).get_current_L2
	get_current_L2 = target
	return target(...)
end

-- Lazy loader. The first call fetches `get_link_page` from the module named by `links_module`,
-- rebinds this local to it, and forwards the arguments.
local function get_link_page(...)
	local target = require(links_module).get_link_page
	get_link_page = target
	return target(...)
end

-- Lazy loader. The first call fetches `getByCode` from the module named by `scripts_module`,
-- rebinds the local `get_script` to it, and forwards the arguments.
local function get_script(...)
	local target = require(scripts_module).getByCode
	get_script = target
	return target(...)
end

-- Lazy loader. The first call fetches `is_palindrome` from the module named by
-- `palindromes_module`, rebinds this local to it, and forwards the arguments.
local function is_palindrome(...)
	local target = require(palindromes_module).is_palindrome
	is_palindrome = target
	return target(...)
end

-- Lazy loader. The first call fetches `language_link` from the module named by `links_module`,
-- rebinds this local to it, and forwards the arguments.
local function language_link(...)
	local target = require(links_module).language_link
	language_link = target
	return target(...)
end

-- Lazy loader. The first call fetches `load_data` from the module named by `load_module`,
-- rebinds this local to it, and forwards the arguments.
local function load_data(...)
	local target = require(load_module).load_data
	load_data = target
	return target(...)
end

-- Lazy loader. The first call fetches `pattern_escape` from the module named by
-- `string_utilities_module`, rebinds this local to it, and forwards the arguments.
local function pattern_escape(...)
	local target = require(string_utilities_module).pattern_escape
	pattern_escape = target
	return target(...)
end

-- Lazy loader. The first call fetches `pluralize` from the module named by
-- `en_utilities_module`, rebinds this local to it, and forwards the arguments.
local function pluralize(...)
	local target = require(en_utilities_module).pluralize
	pluralize = target
	return target(...)
end

-- Lazy loader. The first call fetches `process_page` from the module named by
-- `headword_page_module`, rebinds this local to it, and forwards the arguments.
local function process_page(...)
	local target = require(headword_page_module).process_page
	process_page = target
	return target(...)
end

-- Lazy loader. The first call fetches `remove_links` from the module named by `links_module`,
-- rebinds this local to it, and forwards the arguments.
local function remove_links(...)
	local target = require(links_module).remove_links
	remove_links = target
	return target(...)
end

-- Lazy loader. The first call fetches `shallowCopy` from the module named by `table_module`,
-- rebinds the local `shallow_copy` to it, and forwards the arguments.
local function shallow_copy(...)
	local target = require(table_module).shallowCopy
	shallow_copy = target
	return target(...)
end

-- Lazy loader. The first call fetches `tag_text` from the module named by
-- `script_utilities_module`, rebinds this local to it, and forwards the arguments.
local function tag_text(...)
	local target = require(script_utilities_module).tag_text
	tag_text = target
	return target(...)
end

-- Lazy loader. The first call fetches `tag_transcription` from the module named by
-- `script_utilities_module`, rebinds this local to it, and forwards the arguments.
local function tag_transcription(...)
	local target = require(script_utilities_module).tag_transcription
	tag_transcription = target
	return target(...)
end

-- Lazy loader. The first call fetches `tag_translit` from the module named by
-- `script_utilities_module`, rebinds this local to it, and forwards the arguments.
local function tag_translit(...)
	local target = require(script_utilities_module).tag_translit
	tag_translit = target
	return target(...)
end

-- Lazy loader. The first call fetches `trim` from the module named by
-- `string_utilities_module`, rebinds this local to it, and forwards the arguments.
local function trim(...)
	local target = require(string_utilities_module).trim
	trim = target
	return target(...)
end

-- Lazy loader. The first call fetches `len` from the module named by `string_utilities_module`,
-- rebinds the local `ulen` to it, and forwards the arguments.
local function ulen(...)
	local target = require(string_utilities_module).len
	ulen = target
	return target(...)
end

local m_data = ~~mw.loadData~~(headword_data_module)

--[==[

Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]

local m_data

-- Loads the data module named by `headword_data_module` through `load_data`, stores the result
-- in the file-level `m_data`, and returns it. Meant to be used as `m_data or get_data()`.
local function get_data()
	local loaded = load_data(headword_data_module)
	m_data = loaded
	return loaded
end

local ~~isLemma = m_data.lemmas~~

local script_data

local ~~isNonLemma~~ = ~~m_data.nonlemmas~~

local function get_script_data()

local ~~notranslit = m_data.notranslit~~

script_data = load_data(scripts_data_module)

local ~~toBeTagged~~ = ~~m_data.toBeTagged~~

return script_data

end

local script_utilities_data

-- Loads the data module named by `script_utilities_data_module` through `load_data`, stores the
-- result in the file-level `script_utilities_data`, and returns it. Meant to be used as
-- `script_utilities_data or get_script_utilities_data()`.
local function get_script_utilities_data()
	local loaded = load_data(script_utilities_data_module)
	script_utilities_data = loaded
	return loaded
end

-- If set to true, categories always appear, even in non-mainspace pages

local test_force_categories = false

-- ~~Version of rsubn~~() ~~that discards~~ all ~~but~~ the ~~first return value~~.

-- Add a tracking category to track entries with certain (unusually undesirable) properties. `track_id` is an identifier

local function ~~rsub~~(~~term, foo~~, ~~bar~~)

-- for the particular property being tracked and goes into the tracking page. Specifically, this adds a link in the

~~return~~ (~~rsubn~~(~~term~~, ~~foo~~, ~~bar~~))

-- page text to [[Wiktionary:Tracking/headword/TRACK_ID]], meaning you can find all entries with the `track_id` property

-- by visiting [[Special:WhatLinksHere/Wiktionary:Tracking/headword/TRACK_ID]].

--

-- If `lang` (a language object) is given, an additional tracking page [[Wiktionary:Tracking/headword/TRACK_ID/CODE]] is

-- linked to where CODE is the language code of `lang`, and you can find all entries in the combination of `track_id`

-- and `lang` by visiting [[Special:WhatLinksHere/Wiktionary:Tracking/headword/TRACK_ID/CODE]]. This makes it possible to

-- isolate only the entries with a specific tracking property that are in a given language. Note that if `lang`

-- references an etymology-only language, both that language's code and its full parent's code are tracked.

-- Report tracking pages for `track_id` via debug_track(). Without `lang`, only "headword/TRACK_ID"
-- is passed (as a plain string). With `lang`, a list is passed containing that page plus
-- "headword/TRACK_ID/CODE" for lang:getCode(), and additionally the lang:getFullCode() variant
-- when `lang` has type "etymology-only". Always returns true, so the call can be embedded in an
-- `and`/`or` expression.
local function track(track_id, lang)
	local base = "headword/" .. track_id
	if not lang then
		debug_track(base)
		return true
	end
	-- Query the type first, then the codes, matching the order in which the language object's
	-- methods have always been called here.
	local etym_only = lang:hasType("etymology-only")
	local pages = {base, base .. "/" .. lang:getCode()}
	if etym_only then
		pages[3] = base .. "/" .. lang:getFullCode()
	end
	debug_track(pages)
	return true
end

local function text_in_script(text, script_code)

local sc = ~~require(scripts_module).getByCode~~(script_code)

local sc = get_script(script_code)

if not sc then

error("Internal error: Bad script code " .. script_code)

Line 56:

Line 214:

local out

if characters then

text = ~~rsub~~(text, "%W", "")

text = ugsub(text, "%W", "")

out = ~~rfind~~(text, "[" .. characters .. "]")

out = ufind(text, "[" .. characters .. "]")

end

Line 71:

Line 229:

--[[ List of punctuation or spacing characters that are found inside of words.

Used to exclude characters from the regex above. ]]

local wordPunc = "-־׳״'.·*’་•:᠊"

local wordPunc = "-#%%&@־׳״'.·*’་•:᠊"

local notWordPunc = "[^" .. wordPunc .. "]+"

-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, references or

-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, labels, references

-- customized separator: `part` is the object specifying the term, which should optionally contain:

-- or customized separator: `part` is the object specifying the term (and `lang` the language of the term), which should

-- * left qualifiers in `q`, an array of strings ~~(or `qualifiers` for compatibility purposes)~~;

-- optionally contain:

-- * left qualifiers in `q`, an array of strings;

-- * right qualifiers in `qq`, an array of strings;

-- * left labels in `l`, an array of strings;

-- * right labels in `ll`, an array of strings;

-- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text`

-- (formatted reference text) and optionally `name` and/or `group`;

-- * a separator in `separator`, defaulting to " or " if this is not the first term (j > 1), otherwise "".

-- `formatted` is the formatted version of the term itself, and `j` is the index of the term.

~~local function format_term_with_qualifiers_and_refs(part, formatted, j)~~

~~local left_qualifiers, right_qualifiers~~

~~local reftext~~

~~left_qualifiers~~ = part~~.q and #part.q > 0 and part.q~~

local function format_term_with_qualifiers_and_refs(lang, part, formatted, j)

if ~~left_qualifiers~~ then

local function part_non_empty(field)

~~left_qualifiers~~ = ~~require~~(~~qualifier_module).format_qualifier~~(~~left_qualifiers) .~~. " "

local list = part[field]

if not list then

return nil

end

if type(list) ~= "table" then

error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, dump(list)))

end

return list[1]

end

~~right_qualifiers = part.qq and #part.qq > 0 and part.qq~~

if part_non_empty("q") or part_non_empty("qq") or part_non_empty("l") or

if ~~right_qualifiers then~~

part_non_empty("ll") or part_non_empty("refs") then

~~right_qualifiers =~~ " " ~~.. require~~(~~qualifier_module~~)~~.format_qualifier~~(~~right_qualifiers~~)

formatted = format_pron_qualifiers {

~~end~~

lang = lang,

~~if part.refs and #part.refs > 0 then~~

text = formatted,

~~local refs = {}~~

q = part.q,

~~for _, ref in ipairs~~(~~part.refs~~) do

qq = part.qq,

~~if type~~(~~ref) ~=~~ "~~table~~" then

l = part.l,

~~ref~~ = {~~text = ref}~~

ll = part.ll,

~~end~~

refs = part.refs,

~~local refargs~~

}

~~if ref~~.~~name or ref.group then~~

~~refargs~~ = ~~{name = ref~~.~~name~~, ~~group~~ = ~~ref~~.~~group}~~

~~end~~

~~insert(~~refs~~, mw~~.~~getCurrentFrame():extensionTag("ref", ref.text~~, ~~refargs))~~

~~end~~

~~reftext = concat(refs)~~

end

local separator = part.separator or j > 1 and " or " -- use "" to request no separator

~~if left_qualifiers then~~

~~formatted = left_qualifiers .. formatted~~

~~end~~

~~if reftext then~~

~~formatted = formatted .. reftext~~

~~end~~

~~if right_qualifiers then~~

~~formatted = formatted .. right_qualifiers~~

~~end~~

if separator then

formatted = separator .. formatted

Line 132:

Line 282:

--[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==]

function export.head_is_multiword(head)

for possibleWordBreak in ~~rgmatch~~(head, spacingPunctuation) do

for possibleWordBreak in ugmatch(head, spacingPunctuation) do

if ~~rmatch~~(possibleWordBreak, notWordPunc) then

if umatch(possibleWordBreak, notWordPunc) then

return true

end

Line 141:

Line 291:

end

do

~~--[==[Add links to a multiword head.]==]~~

~~function export.add_multiword_links(head, default)~~

local function workaround_to_exclude_chars(s)

return ~~rsub~~(s, notWordPunc, "\2%1\1")

return (ugsub(s, notWordPunc, "\2%1\1"))

end

head = "\1" .. ~~rsub~~(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"

--[==[Add links to a multiword head.]==]

if default then

function export.add_multiword_links(head, default)

head = head

head = "\1" .. ugsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"

:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")

if default then

:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")

head = head

end

:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")

:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")

end

--Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").

head = encode_entities(head, "[]", true, true)

--[=[

use this when workaround is no longer needed:

head = "[[" .. ~~rsub~~(head, WORDBREAKCHARS, "]]%1[[") .. "]]"

head = "[[" .. ugsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"

Remove any empty links, which could have been created above

at the beginning or end of the string.

]=]

return (head

:gsub("\1\2", "")

:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))

end

local function non_categorizable(full_raw_pagename)

return full_raw_pagename:find("^Appendix:Gestures/")

return full_raw_pagename:find("^Appendix:Gestures/") or

-- Unsupported titles with descriptive names.

(full_raw_pagename:find("^Unsupported titles/") and not full_raw_pagename:find("`"))

end

-- Wrap the already-formatted head `formatted` with tag_text() (face "head", language `data.lang`,
-- script `head.sc`), then hand it to format_term_with_qualifiers_and_refs() together with `head`
-- and the index `j`. `data.id` is passed to tag_text() only for the first head (j == 1).
local function tag_text_and_add_quals_and_refs(data, head, formatted, j)
	-- Only the first head carries the id; a missing/false id is passed as nil.
	local first_head_id
	if j == 1 and data.id then
		first_head_id = data.id
	end
	-- Language and script wrapper.
	local tagged = tag_text(formatted, data.lang, head.sc, "head", nil, first_head_id)
	-- Qualifiers, labels, references and separator.
	return format_term_with_qualifiers_and_refs(data.lang, head, tagged, j)
end

-- Format a headword with transliterations.

local function format_headword(data)

~~local m_scriptutils = require(script_utilities_module)~~

-- Are there non-empty transliterations?

local has_translits = false

Line 190:

Line 347:

local unique_head_parts = {}

local has_multiple_heads = #data.heads ~~> 1~~

local has_multiple_heads = not not data.heads[2]

for j, head in ipairs(data.heads) do

Line 204:

Line 361:

-- Apply processing to the headword, for formatting links and such.

if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Image" then

formatted = ~~require(links_module).~~language_link{term = head.term, lang = data.lang}

formatted = language_link{term = head.term, lang = data.lang}

else

formatted = data.lang:makeDisplayText(head.term, head.sc, true)

end

local ~~function~~ tag_text_and_add_quals_and_refs(~~head, formatted, j)~~

local head_part = tag_text_and_add_quals_and_refs(data, head, formatted, j)

~~-- Add language and script wrapper.~~

~~formatted = m_scriptutils.tag_text(formatted,~~ data~~.lang~~, ~~head.sc, "head", nil, j == 1 and data.id or nil)~~

~~-- Add qualifiers, references and separator.~~

~~return format_term_with_qualifiers_and_refs(head, formatted, j)~~

~~end~~

~~local head_part = tag_text_and_add_quals_and_refs(~~head, formatted, j)

insert(head_parts, head_part)

Line 227:

Line 376:

unique_head_part = head_part

else

unique_head_part = tag_text_and_add_quals_and_refs(head, formatted, 1)

unique_head_part = tag_text_and_add_quals_and_refs(data, head, formatted, 1)

end

unique_head_parts[unique_head_part] = true

Line 243:

Line 392:

else

head_parts = concat(head_parts)

end

if has_manual_translits then

-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/manual-tr]]

-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/manual-tr/LANGCODE]]

track("manual-tr", data.lang)

end

Line 255:

Line 410:

local this_parts = {}

if head.tr then

insert(this_parts, ~~m_scriptutils.~~tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))

insert(this_parts, tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))

if head.ts then

insert(this_parts, " ")

Line 261:

Line 416:

end

if head.ts then

insert(this_parts, "/" .~~. m_scriptutils~~.tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")

insert(this_parts, "/" .. tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")

end

insert(translit_parts, concat(this_parts))

Line 270:

Line 425:

local langname = data.lang:getCanonicalName()

local transliteration_page = ~~mw.title.new~~(langname .. " transliteration~~", "Wiktionary~~")

local transliteration_page = new_title(langname .. " transliteration")

local saw_translit_page = false

if transliteration_page and transliteration_page~~.exists~~ then

if transliteration_page and transliteration_page:getContent() then

translits_formatted = " [[~~Wiktionary:~~" .. langname .. " transliteration|•]]" .. translits_formatted

translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted

saw_translit_page = true

end

Line 281:

Line 436:

if not saw_translit_page and data.lang:hasType("etymology-only") then

langname = data.lang:getFullName()

transliteration_page = ~~mw.title.new~~(langname .. " transliteration", "Wiktionary")

transliteration_page = new_title(langname .. " transliteration", "Wiktionary")

if transliteration_page and transliteration_page~~.exists~~ then

if transliteration_page and transliteration_page:getContent() then

translits_formatted = " [[Wiktionary:" .. langname .. " transliteration|•]]" .. translits_formatted

end

Line 304:

Line 459:

local function ~~format_genders~~(data)

local function format_headword_genders(data)

local retval = ""

if data.genders and #data.genders ~~> 0~~ then

if data.genders and data.genders[1] then

if data.gloss then

retval = ","

end

local pos_for_cat

if not data.nogendercat ~~and not~~ m_data.no_gender_cat[data.lang:getCode()] ~~and~~

if not data.nogendercat then

~~not m_data.~~no_gender_cat[data.lang:getFullCode()] then

local no_gender_cat = (m_data or get_data()).no_gender_cat

~~local pos_category~~ = data.pos_category:gsub("^reconstructed ", "")

if not (no_gender_cat[data.lang:getCode()] or no_gender_cat[data.lang:getFullCode()]) then

~~pos_for_cat = m_data.pos_for_gender_number_cat[pos_category]~~

pos_for_cat = (m_data or get_data()).pos_for_gender_number_cat[data.pos_category:gsub("^reconstructed ", "")]

end

local text, cats = ~~require(gender_and_number_module).~~format_genders(data.genders, data.lang, pos_for_cat)

local text, cats = format_genders(data.genders, data.lang, pos_for_cat)

~~for _, cat in ipairs(~~cats~~) do~~

if cats then

~~insert~~(data.categories, ~~cat~~)

extend(data.categories, cats)

end

retval = retval .. " " .. text

Line 325:

Line 481:

end

-- Forward reference

local format_inflections

local function format_inflection_parts(data, parts)

~~local any_part_translit = false~~

for j, part in ipairs(parts) do

if type(part) ~= "table" then

Line 338:

Line 494:

if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then

error("The face `" .. face .. "` " .. (

~~mw.loadData~~(~~script_utilities_data_module~~).faces[face] and

(script_utilities_data or get_script_utilities_data()).faces[face] and

"should not be used for non-headword terms on the headword line." or

"is invalid."

Line 347:

Line 503:

-- right into the 'data' table to disable inflection links of the entire headword

-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin

local nolinkinfl = data.nolinkinfl

local nolinkinfl = part.face == "hypothetical" or (part.nolink and track("nolink") or part.nolinkinfl) or (

data.nolink and track("nolink") or data.nolinkinfl)

local formatted

Line 361:

Line 518:

-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it

-- to be enabled in languages with more complex scripts (e.g. Arabic).

local tr = part.~~translit~~ or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil)

--

if ~~tr ~=~~ "-" then

-- FIXME: With nested inflections, should we also respect `enable_auto_translit` at the top level of the

~~any_part_translit~~ = ~~true~~

-- nested inflections structure?

local tr = part.tr or not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil

-- FIXME: Temporary errors added 2025-10-03. Remove after a month or so.

if part.translit then

error("Internal error: Use field `tr` not `translit` for specifying an inflection part translit")

end

if part.transcription then

error("Internal error: Use field `ts` not `transcription` for specifying an inflection part transcription")

end

local postprocess_annotations

if part.inflections then

postprocess_annotations = function(infldata)

insert(infldata.annotations, format_inflections(data, part.inflections))

end

formatted = ~~require(links_module).~~full_link(

formatted = full_link(

{

term = not nolinkinfl and part.term or nil,

Line 371:

Line 542:

lang = part.lang or data.lang,

sc = part.sc or parts.sc or nil,

gloss = part.gloss,

pos = part.pos,

lit = part.lit,

id = part.id,

genders = part.genders,

tr = tr,

ts = part.~~transcription~~,

ts = part.ts,

accel = partaccel or parts.accel,

postprocess_annotations = postprocess_annotations,

},

face

Line 381:

Line 556:

end

parts[j] = format_term_with_qualifiers_and_refs(part, formatted, j)

parts[j] = format_term_with_qualifiers_and_refs(part.lang or data.lang, part,

formatted, j)

end

local parts_output

if #parts ~~> 0~~ then

if parts[1] then

parts_output = (parts.label and " " or "") .. concat(parts)

elseif parts.request then

Line 396:

Line 572:

local parts_label = parts.label and ("" .. parts.label .. "") or ""

return parts_label .. parts_output, ~~any_part_translit~~

return format_term_with_qualifiers_and_refs(data.lang, parts, parts_label .. parts_output, 1)

end

-- Format the inflections following the headword.

-- Format the inflections following the headword or nested after a given inflection. Declared local above.

~~local~~ function format_inflections(data)

function format_inflections(data, inflections)

~~local any_part_translit = false~~

if inflections and inflections[1] then

if ~~data.~~inflections and ~~#data.~~inflections ~~> 0~~ then

-- Format each inflection individually.

for key, infl in ipairs(~~data.~~inflections) do

for key, infl in ipairs(inflections) do

~~local this_any_part_translit~~

inflections[key] = format_inflection_parts(data, infl)

~~data.~~inflections[key]~~, this_any_part_translit~~ = format_inflection_parts(data, infl)

~~if this_any_part_translit then~~

~~any_part_translit = true~~

~~end~~

end

~~local concat_result =~~ concat(~~data.~~inflections, ", ")

return concat(inflections, ", ")

~~return " (" .. concat_result .. ")"~~

else

return ""

end

-- Format the top-level inflections following the headword. This calls format_inflections() on
-- `data.inflections` and, when the result is non-empty, wraps it as " (" .. result .. ")".
-- An empty result is returned unchanged (the empty string).
local function format_top_level_inflections(data)
	local result = format_inflections(data, data.inflections)
	if result == "" then
		return result
	end
	return " (" .. result .. ")"
end

-- Forward reference

local check_red_link_inflections

-- Check a single inflection (which consists of a label and zero or more terms, each possibly with nested inflections)

-- for red links. If so, insert a red-link category based on `plpos` (the plural part of speech to insert in the

-- category), stop further processing, and return true. If no red links found, return false.

local function check_red_link_inflection_parts(data, parts, plpos)

for _, part in ipairs(parts) do

if type(part) ~= "table" then

part = {term = part}

end

local term = part.term

if term and not term:find("%[%[") then

local stripped_physical_term = get_link_page(term, data.lang, part.sc or parts.sc or nil)

if stripped_physical_term then

local title = mw.title.new(stripped_physical_term)

if title and not title:getContent() then

return true

end

if part.inflections then

if check_red_link_inflections(data, part.inflections, plpos) then

return true

end

return false

end

-- Check a set of inflections (each of which describes a single inflection of the term, such as feminine or plural, and
-- consists of a label and zero or more terms, each possibly with nested inflections) for red links. Each inflection is
-- passed to check_red_link_inflection_parts() along with `data` and `plpos` (the plural part of speech); processing
-- stops at the first one that reports a red link and true is returned. Returns false if `inflections` is nil or empty
-- or if no red link is reported. Declared local above (forward reference).
-- NOTE(review): earlier documentation says a red-link category based on `plpos` is inserted; no insertion is visible
-- in this function itself -- confirm where that happens.
function check_red_link_inflections(data, inflections, plpos)
	if inflections and inflections[1] then
		-- Check each inflection individually.
		for _, infl in ipairs(inflections) do
			if check_red_link_inflection_parts(data, infl, plpos) then
				return true
			end
		end
	end
	return false
end

-- Check the top-level inflections in `data.inflections`, along with any nested inflections, for red links by
-- delegating to check_red_link_inflections(); `plpos` (the plural part of speech) is passed through unchanged.
-- Returns whatever check_red_link_inflections() returns (true if a red link was found, false otherwise).
-- NOTE(review): earlier documentation says a red-link category based on `plpos` is inserted; that is not done in
-- this wrapper -- confirm it happens further down the call chain.
local function check_red_link_inflections_top_level(data, plpos)
	local top_level = data.inflections
	return check_red_link_inflections(data, top_level, plpos)
end

--[==[

-- Returns the plural form of `pos`, a raw part of speech input, which could be singular or

Returns the plural form of `pos`, a raw part of speech input, which could be singular or

-- plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to

plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to

-- "kanji").]==]

"kanji").

]==]

function export.pluralize_pos(pos)

return m_data.irregular_plurals[pos] or

-- Make the plural form of the part of speech

return (m_data or get_data()).irregular_plurals[pos] or

pos:sub(-1) == "s" and pos or

~~-- Make the plural form of the part of speech~~

pluralize(pos)

~~require("Module:string utilities").~~pluralize(pos)

end

--[==[

-- Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil

Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil

-- if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).

if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).

-- If you have a POS in its singular form, call export.pluralize_pos() above to pluralize it

If you have a POS in its singular form, call {export.pluralize_pos()} above to pluralize it

-- in a smart fashion that knows when to add "-s" and when to add "-es", and also takes

in a smart fashion that knows when to add "-s" and when to add "-es", and also takes

-- into account any irregular plurals.~~]==]~~

into account any irregular plurals.

--

-- If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess

If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess

-- based on whether it ends in " forms"; otherwise, return nil.]==]

based on whether it ends in " forms"; otherwise, return nil.

]==]

function export.pos_lemma_or_nonlemma(plpos, best_guess)

local m_headword_data = m_data or get_data()

local isLemma = m_headword_data.lemmas

-- Is it a lemma category?

if isLemma[plpos] then

Line 450:

Line 695:

end

-- Is it a nonlemma category?

local isNonLemma = m_headword_data.nonlemmas

if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then

return "non-lemma form"

Line 463:

Line 709:

end

--[==[
Canonicalize a part of speech as specified in 2= in {{tl|head}}. If `pos` is a key of `pos_aliases` in the headword
data, the alias target is used. Otherwise, if `pos` ends in 'f', the 'f' is removed, the remainder is resolved through
`pos_aliases` (falling back to the remainder itself), and " forms" is appended. The result is then passed to
{export.pluralize_pos()}. The retired aliases 'pre', 'pro' and 'prof' throw an error.
]==]
function export.canonicalize_pos(pos)
	-- FIXME: Temporary code to throw an error for alias 'pre' (= preposition) that will go away.
	-- 'pref' is deliberately not rejected as it's an alias for "prefix".
	if pos == "pre" then
		error("POS 'pre' for 'preposition' no longer allowed as it's too ambiguous; use 'prep'")
	end
	-- Likewise for pro = pronoun.
	if pos == "pro" or pos == "prof" then
		error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
	end
	local aliases = (m_data or get_data()).pos_aliases
	local canonical = aliases[pos]
	if not canonical then
		if pos:sub(-1) == "f" then
			-- Non-lemma form alias: resolve the part before the trailing 'f'.
			local stem = pos:sub(1, -2)
			canonical = (aliases[stem] or stem) .. " forms"
		else
			canonical = pos
		end
	end
	return export.pluralize_pos(canonical)
end

-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a

Line 475:

Line 744:

local typ = type(data[element])

if typ ~= "table" then

error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))

error(("Internal error: In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))

end

for k, v in pairs(data[element]) do

if k ~= "maxindex" then

if type(k) ~= "number" then

error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))

error(("Internal error: Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))

end

if k > maxind then

Line 487:

Line 756:

if v then

if type(v) ~= "string" then

error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))

error(("Internal error: For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))

end

if not allow_blank_string and v == "" then

error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))

error(("Internal error: For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))

end

Line 519:

Line 788:

-- that.

if tbl == true then

~~if page.raw_defaultsort ~= sortkey then~~

~~insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")~~

~~end~~

return

end

Line 531:

Line 797:

different = true

end

~~end~~

~~if redundant then~~

~~insert(lang_cats, lang:getFullName() .. " terms with redundant sortkeys")~~

~~end~~

~~if different then~~

~~insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")~~

end

return sortkey

end

function export.maintenance_cats(page, lang, lang_cats, page_cats)

~~for _~~, ~~cat in ipairs(~~page.cats) do

extend(page_cats, page.cats)

~~insert(page_cats, cat)~~

~~end~~

lang = lang:getFull() -- since we are just generating categories

local canonical = lang:getCanonicalName()

Line 550:

Line 808:

if tbl then

sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)

~~insert(lang_cats, canonical .. " entries with topic categories using raw markup")~~

end

tbl = page.wikitext_langname_cat[canonical]

if tbl then

handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)

~~insert(lang_cats, canonical .. " entries with language name categories using raw markup")~~

~~end~~

~~if require(utilities_module).get_current_L2() ~= canonical then~~

~~insert(lang_cats, canonical .. " entries with incorrect language header")~~

end

Line 570:

Line 823:

]==]

function export.full_headword(data)

~~local remove_links = require(links_module).remove_links~~

~~local format_categories = require(utilities_module).format_categories~~

-- Prevent data from being destructively modified.

local data = ~~require(table_module).shallowcopy~~(data)

local data = shallow_copy(data)

------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------

if data.getCanonicalName then

error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")

error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")

end

if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then

error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")

error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")

end

if data.id and type(data.id) ~= "string" then

error("The id in the data table should be a string.")

error("Internal error: The id in the data table should be a string.")

end

Line 593:

Line 843:

local langcode = data.lang:getCode()

local full_langcode = ~~langcode~~

local full_langcode = data.lang:getFullCode()

local langname = data.lang:getCanonicalName()

local full_langname = ~~langname~~

local full_langname = data.lang:getFullName()

local raw_pagename~~, page~~ = data.pagename

local raw_pagename = data.pagename

if raw_pagename and raw_pagename ~= ~~m_data~~.pagename then -- for testing, doc pages, etc.

local page

page ~~= require~~(~~headword_page_module~~).process_page(raw_pagename)

local m_headword_data = m_data or get_data()

if raw_pagename and raw_pagename ~= m_headword_data.pagename then -- for testing, doc pages, etc.

-- data.pagename is often set on documentation and test pages through the pagename= parameter of various

-- templates, to emulate running on that page. Having a large number of such test templates on a single

-- page often leads to timeouts, because we fetch and parse the contents of each page in turn. However,

-- we don't really need to do that and can function fine without fetching and parsing the contents of a

-- given page, so turn off content fetching/parsing (and also setting the DEFAULTSORT key through a parser

-- function, which is *slooooow*) in certain namespaces where test and documentation templates are likely to

-- be found and where actual content does not live (User, Template, Module).

local actual_namespace = m_headword_data.page.namespace

local no_fetch_content = actual_namespace == "User" or actual_namespace == "Template" or

actual_namespace == "Module"

page = process_page(raw_pagename, no_fetch_content)

else

page = ~~m_data~~.page

page = m_headword_data.page

end

~~-- Check the~~ namespace ~~against the language type.~~

local namespace = page.namespace

if page.namespace ~~== "" then~~

~~if data.lang:hasType("reconstructed") then~~

~~error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")~~

~~elseif data.lang:hasType("appendix-constructed") then~~

~~error("Entries in " .. langname .. " must be placed in the Appendix: namespace")~~

~~end~~

------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------

Line 618:

Line 873:

-- new-style

if data.translits or data.transcriptions then

error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")

error("Internal error: In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")

end

else

-- convert old-style `heads`, `translits` and `transcriptions` to new-style

local maxind = ~~math.~~max(

local maxind = max(

init_and_find_maximum_index(data, "heads"~~, true~~),

init_and_find_maximum_index(data, "heads"),

init_and_find_maximum_index(data, "translits", true),

init_and_find_maximum_index(data, "transcriptions", true)

Line 643:

Line 898:

------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------

init_and_find_maximum_index(data, "categories"~~, true~~)

-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]

init_and_find_maximum_index(data, "whole_page_categories"~~, true~~)

if data.altform then

data.noposcat = true

end

init_and_find_maximum_index(data, "categories")

init_and_find_maximum_index(data, "whole_page_categories")

local pos_category_already_present = false

if #data.categories ~~> 0~~ then

if data.categories[1] then

local escaped_langname = pattern_escape(full_langname)

local matches_lang_pattern = "^" .. escaped_langname .. " "

for _, cat in ipairs(data.categories) do

-- Does the category begin with the language name? If not, tag it with a tracking category.

if not cat:find(matches_lang_pattern) then

-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/no lang category]]

-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/no lang category/LANGCODE]]

track("no lang category", data.lang)

end

-- If `pos_category` not given, try to infer it from the first specified category. If this doesn't work, we

Line 660:

Line 928:

if not data.pos_category then

error("`data.pos_category` not specified and could not be inferred from the categories given in "

error("Internal error: `data.pos_category` not specified and could not be inferred from the categories given in "

.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "

.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "

Line 679:

Line 947:

-- add an appropriate category.

local postype = export.pos_lemma_or_nonlemma(data.pos_category)

if not data.noposcat then

local main_cat = data.lang:getMainCategoryName()

insert(data.categories, 1, full_langname .. " " .. postype .. "s")

if not postype then

elseif not data.noposcat then

if postype:match("^lemma") and main_cat ~= "lemma" then

postype = main_cat

end

insert(data.categories, 1, full_langname .. " " .. postype .. "s")

end

insert(data.categories, 1, "Contionary")

-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]

if data.altform then

insert(data.categories, 1, full_langname .. " alternative forms")

end

------------ 5. Create a default headword, and add links to multiword page names. ------------

-- Determine if this is an "anti-asterisk" term, i.e. an attested term in a language that must normally be

-- reconstructed.

local is_anti_asterisk = data.heads[1].term and data.heads[1].term:find("^!!")

local lang_reconstructed = data.lang:hasType("reconstructed")

if is_anti_asterisk then

if not lang_reconstructed then

error("Anti-asterisk feature (head= beginning with !!) can only be used with reconstructed languages")

end

lang_reconstructed = false

end

-- Determine if term is reconstructed

local is_reconstructed = ~~page.~~namespace == "Reconstruction" or ~~data.lang:hasType("reconstructed")~~

local is_reconstructed = namespace == "Reconstruction" or lang_reconstructed

-- Create a default headword based on the pagename, which is determined in

Line 693:

Line 985:

-- Add links to multi-word page names when appropriate

if not data.nolinkhead ~~and not m_data~~.no_multiword_links[langcode] ~~and not m_data.no_multiword_links~~[full_langcode]

if not (is_reconstructed or data.nolinkhead) then

~~and not is_reconstructed~~ and export.head_is_multiword(default_head) then

local no_links = m_headword_data.no_multiword_links

default_head = export.add_multiword_links(default_head, true)

if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then

default_head = export.add_multiword_links(default_head, true)

end

if is_reconstructed then

if is_reconstructed and not data.lang:hasType("conlang") then

default_head = "*" .. default_head

end

------------ 6. Fill in missing values in `data.heads`. ------------

------------ 6. Check the namespace against the language type. ------------

if namespace == "" then

if lang_reconstructed then

error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")

elseif data.lang:hasType("appendix-constructed") then

error("Entries in " .. langname .. " must be placed in the Appendix: namespace")

end

elseif namespace == "Citations" or namespace == "Thesaurus" then

error("Headword templates should not be used in the " .. namespace .. ": namespace.")

end

------------ 7. Fill in missing values in `data.heads`. ------------

-- True if any script among the headword scripts has spaces in it.

Line 711:

Line 1,017:

for _, head in ipairs(data.heads) do

------ 6a. If missing head, replace with default head.

------ 7a. If missing head, replace with default head.

if not head.term then

head.term = default_head

elseif head.term == default_head then

has_redundant_head_param = true

elseif is_anti_asterisk and head.term == "!!" then

-- If explicit head=!! is given, it's an anti-asterisk term and we fill in the default head.

head.term = "!!" .. default_head

elseif head.term:find("^[!?]$") then

-- If explicit head= just consists of ! or ?, add it to the end of the default head.

head.term = default_head .. head.term

end

head.term_no_initial_bang_bang = is_anti_asterisk and head.term:sub(3) or head.term

if is_reconstructed then

local head_term = head.term

if head_term:find("%[%[") then

head_term = ~~require(links_module).~~remove_links(head_term)

head_term = remove_links(head_term)

end

if head_term:sub(1, 1) ~= "*" then

error("The headword '" .. head_term .. "' must begin with '*' to indicate that it is reconstructed.")

end

------ 7b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,

------ otherwise fall back to the overall script if given. If neither given, autodetect the script.

local auto_sc = data.lang:findBestScript(head.term)

if not (head.sc or data.sc) then -- No script code given, so use autodetected script.

head.sc = auto_sc

else

if not head.sc then -- Overall script code given.

head.sc = data.sc

end

-- If using a discouraged character sequence, add to maintenance category.

-- if head.sc:hasNormalizationFixes() == true then

if head.sc:hasNormalizationFixes() == true then

-- local composed_head = toNFC(head.term)

local composed_head = toNFC(head.term)

-- if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then

if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then

-- insert(data.whole_page_categories, "Pages using discouraged character sequences")

insert(data.whole_page_categories, "Pages using discouraged character sequences")

-- end

end

-- end

end

any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()

------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given

------ 7c. Create automatic transliterations for any non-Latin headwords without manual translit given

------ (provided automatic translit is available, e.g. not in Persian or Hebrew).

-- Make transliterations

head.tr_manual = nil

-- Try to generate a transliteration if necessary

if head.tr == "-" then

head.tr = nil

~~elseif~~ not notranslit[langcode] ~~and not~~ notranslit[full_langcode] and head.sc:isTransliterated() then

else

head.tr_manual = not not head.tr

local notranslit = m_headword_data.notranslit

if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then

head.tr_manual = not not head.tr

local text = head.~~term~~

local text = head.term_no_initial_bang_bang

if not data.lang:link_tr(head.sc) then

text = remove_links(text)

end

local automated_tr~~, tr_categories~~

local automated_tr = data.lang:transliterate(text, head.sc)

~~automated_tr, head.tr_fail, tr_categories~~ = data.lang:transliterate(text, head.sc)

if automated_tr ~~or head.tr_fail~~ then

if automated_tr then

local manual_tr = head.tr

~~if manual_tr then~~

if not manual_tr then

if ~~(remove_links(~~manual_tr~~) == remove_links(automated_tr)) and (not head.tr_fail)~~ then

head.tr = automated_tr

~~insert(data.categories, full_langname .. " terms with redundant transliterations")~~

~~elseif not~~ head.~~tr_fail then~~

~~insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")~~

end

if not ~~manual_tr~~ then

-- There is still no transliteration?

head.tr = ~~automated_tr~~

-- Add the entry to a cleanup category.

for _, category ~~in ipairs(tr_categories) do~~

if not head.tr then

insert(data.categories, ~~category~~)

head.tr = "transliteration needed"

~~end~~

-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.

-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].

insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")

else

-- Otherwise, trim it.

head.tr = trim(head.tr)

end

~~end~~

~~-- There is still no transliteration?~~

~~-- Add the entry to a cleanup category.~~

~~if not head.tr then~~

~~head.tr = "transliteration needed"~~

~~-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.~~

~~-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].~~

~~insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")~~

~~else~~

~~-- Otherwise, trim it.~~

~~head.tr = mw.text.trim(head.tr)~~

end

Line 792:

Line 1,109:

-- Link to the transliteration entry for languages that require this.

if head.tr and data.lang:link_tr(head.sc) then

head.tr = ~~require(links_module).~~full_link {

head.tr = full_link{

term = head.tr,

lang = data.lang,

sc = ~~require(scripts_module).getByCode~~("Latn"),

sc = get_script("Latn"),

tr = "-"

}

Line 801:

Line 1,118:

end

------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------

------------ 8. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------

-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.

Line 813:

Line 1,130:

local dt_script = data.heads[1].sc

local dt_script_code = dt_script:getCode()

local page_non_ascii = ~~page.~~namespace == "" and not page.pagename:find("^[%z\1-\127]+$")

local page_non_ascii = namespace == "" and not page.pagename:find("^[%z\1-\127]+$")

local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^Unsupported titles/", "")

if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then

display_title = 'Unsupported titles/' .. page.unsupported_titles[unsupported_pagename] .. ''

elseif page_non_ascii and toBeTagged[dt_script_code]

elseif page_non_ascii and m_headword_data.toBeTagged[dt_script_code]

or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))

or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then

Line 824:

Line 1,141:

elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then

display_title = '' .. page.full_raw_pagename .. ''

elseif ~~page.~~namespace == "Reconstruction" then

elseif namespace == "Reconstruction" then

local matched

display_title, matched = ~~rsubn~~(

display_title, matched = ugsub(

page.full_raw_pagename,

"^(Reconstruction:[^/]+/)(.+)$",

function(before, term)

return before ..

return before .. tag_text(term, data.lang, dt_script)

~~require(script_utilities_module)~~.tag_text(

term,

data.lang,

dt_script

)

end

)

Line 841:

Line 1,153:

display_title = nil

end

-- FIXME: Generalize this.

-- If the current language uses ur-Arab (for Urdu, etc.), ku-Arab (Central Kurdish) or pa-Arab

-- (Shahmukhi, for Punjabi) and there's more than one language on the page, don't set the display title

-- because these three scripts display in Nastaliq and we don't want terms that also exist in other

-- languages that don't display in Nastaliq (e.g. Arabic or Persian) to display in Nastaliq. Because the word

-- "Urdu" occurs near the end of the alphabet, Urdu fonts tend to override the fonts of other languages.

-- FIXME: This is checking for more than one language on the page but instead needs to check if there are any

-- languages using scripts other than the ones just mentioned.

if (dt_script_code == "ur-Arab" or dt_script_code == "ku-Arab" or dt_script_code == "pa-Arab") and page.L2_list.n > 1 then

display_title = nil

end

Line 850:

Line 1,173:

end

------------ 8. Insert additional categories. ------------

------------ 9. Insert additional categories. ------------

if data.force_cat_output then

-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/force cat output]]

track("force cat output")

end

if has_redundant_head_param then

if not data.no_redundant_head_cat then

insert(data.categories, full_langname .. " terms with redundant head parameter")

-- This is not the right way to go about this; too many exceptions and problems due to language-specific headword

-- handling customization. If we want this, it should be opt-in by a given language passing in the default headword.

-- insert(data.categories, full_langname .. " terms with redundant head parameter")

end

-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".

if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" ~~and~~

if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then

not ~~m_data.~~no_multiword_cat[langcode] ~~and not m_data.~~no_multiword_cat[full_langcode] then

local no_multiword_cat = m_headword_data.no_multiword_cat

-- Check for spaces or hyphens, but exclude prefixes and suffixes.

if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then

-- Use the pagename, not the head= value, because the latter may have extra

-- Check for spaces or hyphens, but exclude prefixes and suffixes.

-- junk in it, e.g. superscripted text that throws off the algorithm.

-- Use the pagename, not the head= value, because the latter may have extra

local ~~checkpattern~~ = ~~".[%s%-፡]."~~

-- junk in it, e.g. superscripted text that throws off the algorithm.

~~if m_data~~.hyphen_not_multiword_sep~~[langcode] or m_data.hyphen_not_multiword_sep[full_langcode] then~~

local no_hyphen = m_headword_data.hyphen_not_multiword_sep

-- Exclude hyphens if the data module states that they should for this language

-- Exclude hyphens if the data module states that they should for this language.

checkpattern = ".[%s፡]."

local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."

~~end~~

local is_multiword = umatch(page.pagename, checkpattern)

~~if rmatch~~(page.pagename, checkpattern) and not non_categorizable(page.full_raw_pagename) then

insert(data.categories, full_langname .. " multiword terms")

if is_multiword and not non_categorizable(page.full_raw_pagename) then

insert(data.categories, full_langname .. " multiword terms")

elseif not is_multiword then

local long_word_threshold = m_headword_data.long_word_thresholds[langcode] or

m_headword_data.long_word_thresholds[full_langcode]

if long_word_threshold and ulen(page.pagename) >= long_word_threshold then

insert(data.categories, "Long " .. full_langname .. " words")

end

if data.sccat then

local default_sccat = m_headword_data.default_sccat

if data.sccat or data.sccat == nil and (default_sccat[langcode] or default_sccat[full_langcode]) then

for _, head in ipairs(data.heads) do

insert(data.categories, full_langname .. " " .. data.pos_category .. " in " ..

Line 882:

Line 1,221:

-- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated.

if ~~page.~~namespace ~= "Reconstruction" then

if namespace ~= "Reconstruction" then

-- Map from languages to a string containing the characters to ignore when considering whether a term has

-- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic

-- values.

local characters_to_ignore = {

["aaq"] = "α", -- Penobscot

["aaq"] = "αάὰ", -- Penobscot (Algonquian)

["acy"] = "δθ", -- Cypriot Arabic

["anc"] = "γ", -- Ngas

["aez"] = "β", -- Aeka (Trans-New Guinea)

["aou"] = "χ", -- A'ou

["anc"] = "γ", -- Ngas (Chadic/Afroasiatic)

["awg"] = "β", -- Anguthimri

["aou"] = "χ", -- A'ou (Kra-Dai)

["bhp"] = "β", -- Bima

["art-blk"] = "ч", -- Bolak (conlang)

["byk"] = "θ", -- Biao

["awg"] = "β", -- Anguthimri (Pama-Nyungan)

["cdy"] = "θ", -- Chadong

["az"] = "ь", -- Azerbaijani (Turkic; Yañalif Latin spelling, c. 1928 - 1938)

["clm"] = "χ", -- Klallam

["ba"] = "ь", -- Bashkir (Turkic; Yañalif Latin spelling, c. 1928 - 1938)

["col"] = "χ", -- Colombia-Wenatchi

["bhp"] = "β", -- Bima (Austronesian)

["coo"] = "χ", -- Comox~~; FIXME: others? E.g. Greek theta~~ (θ)?

["bjz"] = "β", -- Baruga (Trans-New Guinea)

["ets"] = "θ", -- Yekhee

["byk"] = "θ", -- Biao (Kra-Dai)

["gmw-gts"] = "χ", -- Gottscheerish

["cdy"] = "θ", -- Chadong (Kra-Dai)

["hur"] = "θ", -- Halkomelem

["chp"] = "θ", -- Chipewyan (Athabaskan)

["izh"] = "ь", -- Ingrian

["cjh"] = "χ", -- Upper Chehalis (Salishan)

["kic"] = "θ", -- Kickapoo

["clm"] = "χ", -- Klallam (Salishan)

["lil"] = "χ", -- Lillooet

["col"] = "χ", -- Colombia-Wenatchi (Salishan)

["coo"] = "χθ", -- Comox (Salishan)

["crx"] = "θ", -- Carrier (Athabaskan)

["ets"] = "θ", -- Yekhee (Edoid/Niger-Congo)

["ett"] = "χ", -- Etruscan (isolate; in romanizations)

["fla"] = "χ", -- Montana Salish (Salishan)

["grt"] = "་", -- Garo (South Asian Sino-Tibetan)

["gmw-gts"] = "χ", -- Gottscheerish (Bavarian variant spoken in Slovenia)

["hur"] = "χθ", -- Halkomelem (Salishan)

["itc-psa"] = "f", -- Pre-Samnite (Italic; normally written in Greek)

["izh"] = "ь", -- Ingrian (Finnic)

["kic"] = "θ", -- Kickapoo (Algonquian)

["kk"] = "ь", -- Kazakh (Turkic; Yañalif Latin spelling, c. 1928 - 1938)

["ky"] = "ь", -- Kyrgyz (Turkic; Yañalif Latin spelling, c. 1928 - 1938)

["lil"] = "χ", -- Lillooet (Salishan)

["lsi"] = "ꓹ", -- Lashi (Lolo-Burmese/Sino-Tibetan; represents a glottal stop)

["mhz"] = "β", -- Mor (Austronesian)

["neg"]= "ӡ", -- Negidal (normally in Cyrillic)

["mqn"] = "β", -- Moronene (Austronesian)

["oui"] = "γβ", -- Old Uyghur: FIXME: others? E.g. Greek delta (δ)?

["neg"]= "ӡā", -- Negidal (Tungusic; normally in Cyrillic)

["pox"] = "χ", -- Polabian

["oka"] = "χ", -- Okanagan (Salishan)

["rom"] = "Θθ", -- Romani: International Standard; two different thetas???

["ole"] = "θ", -- Olekha (Sino-Tibetan)

["sah"] = "ь", -- Yakut (1929 - 1939 Latin spelling)

["oui"] = "γβ", -- Old Uyghur (Turkic; FIXME: others? E.g. Greek delta (δ)?)

["sjw"] = "θ", -- Shawnee

["pox"] = "χ", -- Polabian (West Slavic)

["squ"] = "χ", -- Squamish

["rif"] = "ε", -- Tarifit (Berber)

["str"] = "χθ", -- Saanich; ~~uses two Greek letters~~

["rom"] = "Θθ", -- Romani (Indic: International Standard; two different thetas???)

["twa"] = "χ", -- Twana

["rpn"] = "β", -- Repanbitip (Austronesian)

["yha"] = "θ", -- Baha

["sah"] = "ь", -- Yakut (Turkic; 1929 - 1939 Latin spelling)

["za"] = "зч", -- Zhuang; 1957-1982 alphabet used two Cyrillic letters (as well as some others like

["sit-jap"] = "χ", -- Japhug (Sino-Tibetan)

["sjw"] = "θ", -- Shawnee (Algonquian)

["squ"] = "χ", -- Squamish (Salishan)

["str"] = "χθ", -- Saanich (Salishan)

["teh"] = "χ", -- Tehuelche (Chonan; spoken in Argentina)

["tep"] = "η", -- Tepecano (Uto-Aztecan)

["thp"] = "χ", -- Thompson (Salishan)

["tk"] = "ь", -- Turkmen (Turkic; Yañalif Latin spelling, c. 1928 - 1938)

["tt"] = "ь", -- Tatar (Turkic; Yañalif Latin spelling, c. 1928 - 1938)

["twa"] = "χ", -- Twana (Salishan)

["wbl"] = "ы", -- Wakhi (Iranian)

["xbc"] = "ϸ", -- Bactrian (Iranian; represents š; normally written in Greek)

["yha"] = "θ", -- Baha (Kra-Dai)

["za"] = "зч", -- Zhuang (Tai/Kra-Dai); 1957-1982 alphabet used two Cyrillic letters (as well as some others like

-- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin)

["zlw-slv"] = "χђћ", -- Slovincian; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure

["zlw-slv"] = "χђћ", -- Slovincian (West Slavic; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure

-- the correct characters are being chosen in the entry names

-- the correct characters are being chosen in the entry names)

["zng"] = "θ", -- Mang

["zng"] = "θ", -- Mang (Mon-Khmer)

["ztp"] = "θ", -- Loxicha Zapotec (Zapotecan)

}

-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude

Line 932:

Line 1,300:

local ch_to_ignore = characters_to_ignore[full_langcode]

if ch_to_ignore then

canon_pagename = ~~rsub~~(canon_pagename, "[" .. ch_to_ignore .. "]", "")

canon_pagename = ugsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")

end

~~local script_data = mw.loadData(scripts_data_module)~~

while true do

if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then

Line 941:

Line 1,308:

-- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]]

num_loops = num_loops + 1

local pagename_script = ~~require(scripts_module).findBestScriptWithoutLang~~(canon_pagename, "None only as last resort")

local pagename_script = find_best_script_without_lang(canon_pagename, "None only as last resort")

local script_chars = pagename_script.characters

if not script_chars then

Line 949:

Line 1,316:

local script_code = pagename_script:getCode()

local replaced

canon_pagename, replaced = ~~rsubn~~(canon_pagename, "[" .. script_chars .. "]", "")

canon_pagename, replaced = ugsub(canon_pagename, "[" .. script_chars .. "]", "")

if replaced and script_code ~= "Zmth" and script_data[script_code] and

if (

script_data[script_code].character_category ~= false then

replaced and

script_code ~= "Zmth" and

(script_data or get_script_data())[script_code] and

script_data[script_code].character_category ~= false

) then

script_code = script_code:gsub("^.-%-", "")

if not seen_scripts[script_code] then

Line 964:

Line 1,335:

end

-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats: single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character + diacritic(s) + character). Each can have any number of diacritics.

local standard = data.lang:getStandardCharacters()

Line 1,008:

Line 1,379:

return ""

end

local sc_standard = ~~rsub~~(sc_standard, page.comb_chars.combined_double, explode)

local sc_standard = ugsub(sc_standard, page.comb_chars.combined_double, explode)

sc_standard = ~~rsub~~(sc_standard,page.comb_chars.combined_single, explode)

sc_standard = ugsub(sc_standard,page.comb_chars.combined_single, explode)

:gsub(".[\128-\191]*", explode)

local num_cat_inserted

Line 1,019:

Line 1,390:

num_cat_inserted = true

end

elseif ~~rfind~~(char, page.emoji_pattern) then

elseif ufind(char, page.emoji_pattern) then

insert(data.categories, full_langname .. " terms spelled with emoji")

else

Line 1,033:

Line 1,404:

-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.

sc_standard = toNFD(sc_standard)

for diacritic in ~~rgmatch~~(page.decompose_pagename, page.comb_chars.diacritics_single) do

for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do

if not ~~rmatch~~(sc_standard, diacritic) then

if not umatch(sc_standard, diacritic) then

insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic)

end

for diacritic in ~~rgmatch~~(page.decompose_pagename, page.comb_chars.diacritics_double) do

for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do

if not ~~rmatch~~(sc_standard, diacritic) then

if not umatch(sc_standard, diacritic) then

insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic .. "◌")

end

Line 1,047:

Line 1,418:

-- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them).

elseif ulen(page.pagename) ~= 1 then

for character in ~~rgmatch~~(page.pagename, "([^" .. standard .. "])") do

for character in ugmatch(page.pagename, "([^" .. standard .. "])") do

local upper = char_category(character)

if not ~~rmatch~~(upper, "[" .. standard .. "]") then

if not umatch(upper, "[" .. standard .. "]") then

character = upper

end

Line 1,056:

Line 1,427:

end

~~if data.heads[1].sc:isSystem("alphabet") then~~

~~local pagename, i = page.pagename:ulower(), 2~~

~~while rmatch(pagename, "(%a)" .. ("%1"):rep(i)) do~~

~~i = i + 1~~

~~insert(data.categories, full_langname .. " terms with " .. i .. " consecutive instances of the same letter")~~

~~end~~

-- Categorise for palindromes

if not data.nopalindromecat and ~~page.~~namespace ~= "Reconstruction" and ulen(page.pagename) > 2

if not data.nopalindromecat and namespace ~= "Reconstruction" and ulen(page.pagename) > 2

-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of

-- multiple scripts?

and ~~require(palindromes_module).~~is_palindrome(page.pagename, data.lang, data.heads[1].sc) then

and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then

insert(data.categories, full_langname .. " palindromes")

end

if ~~page.~~namespace == "" and not data.lang:~~hasType~~("~~reconstructed~~") then

if namespace == "" and not lang_reconstructed then

local ~~m_links~~ = ~~require~~(~~links_module~~)

for _, head in ipairs(data.heads) do

if page.full_raw_pagename ~= get_link_page(remove_links(head.term), data.lang, head.sc) then

-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch]]

-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch/LANGCODE]]

track("pagename spelling mismatch", data.lang)

break

end

-- Add red link category if called for and we're not a "large" page, where such checks are disabled.

if data.checkredlinks and not m_headword_data.large_pages[m_headword_data.pagename] then

local plposcat = type(data.checkredlinks) == "string" and data.checkredlinks or data.pos_category

check_red_link_inflections_top_level(data, plposcat)

end

Line 1,080:

Line 1,456:

export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)

------------ 9. Format and return headwords, genders, inflections and categories. ------------

------------ 10. Format and return headwords, genders, inflections and categories. ------------

-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),

Line 1,086:

Line 1,462:

local text = '' ..

format_headword(data) ..

~~format_genders~~(data) ..

format_headword_genders(data) ..

~~format_inflections~~(data) .. ''

format_top_level_inflections(data) .. ''

-- Language-specific categories.

@@ Line 2: / Line 2: @@
 -- Named constants for all modules used, to make it easier to swap out sandbox versions.
-local gender_and_number_module = "Module:getn"
+local debug_track_module = "Module:debug/track"
+local en_utilities_module = "Module:en-utilities"
+local gender_and_number_module = "Module:gender and number"
 local headword_data_module = "Module:headword/data"
 local headword_page_module = "Module:headword/page"
 local links_module = "Module:links"
+local load_module = "Module:load"
+local pages_module = "Module:pages"
 local palindromes_module = "Module:palindromes"
-local qualifier_module = "Module:qualifier"
+local pron_qualifier_module = "Module:pron qualifier"
 local scripts_module = "Module:scripts"
 local scripts_data_module = "Module:scripts/data"
@@ Line 15: / Line 19: @@
 local table_module = "Module:table"
 local utilities_module = "Module:utilities"
-local m_str_utils = require(string_utilities_module)
 local concat = table.concat
-local encode_entities = m_str_utils.encode_entities
+local dump = mw.dumpObject
 local insert = table.insert
 local ipairs = ipairs
+local max = math.max
+local new_title = mw.title.new
 local pairs = pairs
-local pattern_escape = m_str_utils.pattern_escape
+local require = require
-local rgmatch = mw.ustring.gmatch
-local rsubn = mw.ustring.gsub
-local rfind = mw.ustring.find
-local ulen = m_str_utils.len
-local rmatch = mw.ustring.match
 local toNFC = mw.ustring.toNFC
 local toNFD = mw.ustring.toNFD
+local type = type
+local ufind = mw.ustring.find
+local ugmatch = mw.ustring.gmatch
+local ugsub = mw.ustring.gsub
+local umatch = mw.ustring.match
+--[==[
+Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
+	local function debug_track(...)
+		debug_track = require(debug_track_module)
+		return debug_track(...)
+	end
+	local function encode_entities(...)
+		encode_entities = require(string_utilities_module).encode_entities
+		return encode_entities(...)
+	end
+	local function extend(...)
+		extend = require(table_module).extend
+		return extend(...)
+	end
+	local function find_best_script_without_lang(...)
+		find_best_script_without_lang = require(scripts_module).findBestScriptWithoutLang
+		return find_best_script_without_lang(...)
+	end
+	local function format_categories(...)
+		format_categories = require(utilities_module).format_categories
+		return format_categories(...)
+	end
+	local function format_genders(...)
+		format_genders = require(gender_and_number_module).format_genders
+		return format_genders(...)
+	end
+	local function format_pron_qualifiers(...)
+		format_pron_qualifiers = require(pron_qualifier_module).format_qualifiers
+		return format_pron_qualifiers(...)
+	end
+	local function full_link(...)
+		full_link = require(links_module).full_link
+		return full_link(...)
+	end
+	local function get_current_L2(...)
+		get_current_L2 = require(pages_module).get_current_L2
+		return get_current_L2(...)
+	end
+	local function get_link_page(...)
+		get_link_page = require(links_module).get_link_page
+		return get_link_page(...)
+	end
+	local function get_script(...)
+		get_script = require(scripts_module).getByCode
+		return get_script(...)
+	end
+	local function is_palindrome(...)
+		is_palindrome = require(palindromes_module).is_palindrome
+		return is_palindrome(...)
+	end
+	local function language_link(...)
+		language_link = require(links_module).language_link
+		return language_link(...)
+	end
+	local function load_data(...)
+		load_data = require(load_module).load_data
+		return load_data(...)
+	end
+	local function pattern_escape(...)
+		pattern_escape = require(string_utilities_module).pattern_escape
+		return pattern_escape(...)
+	end
+	local function pluralize(...)
+		pluralize = require(en_utilities_module).pluralize
+		return pluralize(...)
+	end
+	local function process_page(...)
+		process_page = require(headword_page_module).process_page
+		return process_page(...)
+	end
+	local function remove_links(...)
+		remove_links = require(links_module).remove_links
+		return remove_links(...)
+	end
+	local function shallow_copy(...)
+		shallow_copy = require(table_module).shallowCopy
+		return shallow_copy(...)
+	end
+	local function tag_text(...)
+		tag_text = require(script_utilities_module).tag_text
+		return tag_text(...)
+	end
+	local function tag_transcription(...)
+		tag_transcription = require(script_utilities_module).tag_transcription
+		return tag_transcription(...)
+	end
+	local function tag_translit(...)
+		tag_translit = require(script_utilities_module).tag_translit
+		return tag_translit(...)
+	end
+	local function trim(...)
+		trim = require(string_utilities_module).trim
+		return trim(...)
+	end
+	local function ulen(...)
+		ulen = require(string_utilities_module).len
+		return ulen(...)
+	end
-local m_data = mw.loadData(headword_data_module)
+--[==[
+Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]
+	local m_data
+	local function get_data()
+		m_data = load_data(headword_data_module)
+		return m_data
+	end
-local isLemma = m_data.lemmas
+	local script_data
-local isNonLemma = m_data.nonlemmas
+	local function get_script_data()
-local notranslit = m_data.notranslit
+		script_data = load_data(scripts_data_module)
-local toBeTagged = m_data.toBeTagged
+		return script_data
+	end
+	local script_utilities_data
+	local function get_script_utilities_data()
+		script_utilities_data = load_data(script_utilities_data_module)
+		return script_utilities_data
+	end
 -- If set to true, categories always appear, even in non-mainspace pages
 local test_force_categories = false
--- Version of rsubn() that discards all but the first return value.
+-- Add a tracking category to track entries with certain (unusually undesirable) properties. `track_id` is an identifier
-local function rsub(term, foo, bar)
+-- for the particular property being tracked and goes into the tracking page. Specifically, this adds a link in the
-	return (rsubn(term, foo, bar))
+-- page text to [[Wiktionary:Tracking/headword/TRACK_ID]], meaning you can find all entries with the `track_id` property
+-- by visiting [[Special:WhatLinksHere/Wiktionary:Tracking/headword/TRACK_ID]].
+--
+-- If `lang` (a language object) is given, an additional tracking page [[Wiktionary:Tracking/headword/TRACK_ID/CODE]] is
+-- linked to where CODE is the language code of `lang`, and you can find all entries in the combination of `track_id`
+-- and `lang` by visiting [[Special:WhatLinksHere/Wiktionary:Tracking/headword/TRACK_ID/CODE]]. This makes it possible to
+-- isolate only the entries with a specific tracking property that are in a given language. Note that if `lang`
+-- references an etymology-only language, both that language's code and its full parent's code are tracked.
+local function track(track_id, lang)
+	-- Every tracked page lives under the "headword/" prefix.
+	local tracking_page = "headword/" .. track_id
+	if not lang then
+		-- No language given: only the bare tracking page is linked.
+		debug_track(tracking_page)
+		return true
+	end
+	local etymology_only = lang:hasType("etymology-only")
+	-- With a language, also track the per-language subpage keyed by the language code.
+	local pages = {tracking_page, tracking_page .. "/" .. lang:getCode()}
+	if etymology_only then
+		-- Etymology-only languages additionally get a subpage keyed by their full code.
+		pages[3] = tracking_page .. "/" .. lang:getFullCode()
+	end
+	debug_track(pages)
+	-- Always returns true so the call can be chained inside `and`/`or` expressions.
+	return true
 end
 local function text_in_script(text, script_code)
-	local sc = require(scripts_module).getByCode(script_code)
+	local sc = get_script(script_code)
 	if not sc then
 		error("Internal error: Bad script code " .. script_code)
@@ Line 56: / Line 214: @@
 	local out
 	if characters then
-		text = rsub(text, "%W", "")
+		text = ugsub(text, "%W", "")
-		out = rfind(text, "[" .. characters .. "]")
+		out = ufind(text, "[" .. characters .. "]")
 	end
@@ Line 71: / Line 229: @@
 --[[ List of punctuation or spacing characters that are found inside of words.
 	 Used to exclude characters from the regex above. ]]
-local wordPunc = "-־׳״'.·*’་•:᠊"
+local wordPunc = "-#%%&@־׳״'.·*’་•:᠊"
 local notWordPunc = "[^" .. wordPunc .. "]+"
--- Format a term (either a head term or an inflection term) along with any left or right qualifiers, references or
+-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, labels, references
--- customized separator: `part` is the object specifying the term, which should optionally contain:
+-- or customized separator: `part` is the object specifying the term (and `lang` the language of the term), which should
--- * left qualifiers in `q`, an array of strings (or `qualifiers` for compatibility purposes);
+-- optionally contain:
+-- * left qualifiers in `q`, an array of strings;
 -- * right qualifiers in `qq`, an array of strings;
+-- * left labels in `l`, an array of strings;
+-- * right labels in `ll`, an array of strings;
 -- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text`
 --   (formatted reference text) and optionally `name` and/or `group`;
 -- * a separator in `separator`, defaulting to " <i>or</i> " if this is not the first term (j > 1), otherwise "".
 -- `formatted` is the formatted version of the term itself, and `j` is the index of the term.
-local function format_term_with_qualifiers_and_refs(part, formatted, j)
-	local left_qualifiers, right_qualifiers
-	local reftext
-	left_qualifiers = part.q and #part.q > 0 and part.q
+local function format_term_with_qualifiers_and_refs(lang, part, formatted, j)
-	if left_qualifiers then
+	local function part_non_empty(field)
-		left_qualifiers = require(qualifier_module).format_qualifier(left_qualifiers) .. " "
+		local list = part[field]
+		if not list then
+			return nil
+		end
+		if type(list) ~= "table" then
+			error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, dump(list)))
+		end
+		return list[1]
 	end
-	right_qualifiers = part.qq and #part.qq > 0 and part.qq
+	if part_non_empty("q") or part_non_empty("qq") or part_non_empty("l") or
-	if right_qualifiers then
+		part_non_empty("ll") or part_non_empty("refs") then
-		right_qualifiers = " " .. require(qualifier_module).format_qualifier(right_qualifiers)
+		formatted = format_pron_qualifiers {
-	end
+			lang = lang,
-	if part.refs and #part.refs > 0 then
+			text = formatted,
-		local refs = {}
+			q = part.q,
-		for _, ref in ipairs(part.refs) do
+			qq = part.qq,
-			if type(ref) ~= "table" then
+			l = part.l,
-				ref = {text = ref}
+			ll = part.ll,
-			end
+			refs = part.refs,
-			local refargs
+		}
-			if ref.name or ref.group then
-				refargs = {name = ref.name, group = ref.group}
-			end
-			insert(refs, mw.getCurrentFrame():extensionTag("ref", ref.text, refargs))
-		end
-		reftext = concat(refs)
 	end
 	local separator = part.separator or j > 1 and " <i>or</i> " -- use "" to request no separator
-	if left_qualifiers then
-		formatted = left_qualifiers .. formatted
-	end
-	if reftext then
-		formatted = formatted .. reftext
-	end
-	if right_qualifiers then
-		formatted = formatted .. right_qualifiers
-	end
 	if separator then
 		formatted = separator .. formatted
@@ Line 132: / Line 282: @@
 --[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==]
 function export.head_is_multiword(head)
-	for possibleWordBreak in rgmatch(head, spacingPunctuation) do
+	for possibleWordBreak in ugmatch(head, spacingPunctuation) do
-		if rmatch(possibleWordBreak, notWordPunc) then
+		if umatch(possibleWordBreak, notWordPunc) then
 			return true
 		end
@@ Line 141: / Line 291: @@
 end
+do
---[==[Add links to a multiword head.]==]
-function export.add_multiword_links(head, default)
 	local function workaround_to_exclude_chars(s)
-		return rsub(s, notWordPunc, "\2%1\1")
+		return (ugsub(s, notWordPunc, "\2%1\1"))
 	end
-	head = "\1" .. rsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
+	--[==[Add links to a multiword head.]==]
-	if default then
+	function export.add_multiword_links(head, default)
-		head = head
+		head = "\1" .. ugsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
-			:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
+		if default then
-			:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
+			head = head
-	end
+				:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
+				:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
+		end
-	--Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
+		--Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
-	head = encode_entities(head, "[]", true, true)
+		head = encode_entities(head, "[]", true, true)
-	--[=[
+		--[=[
-	use this when workaround is no longer needed:
+		use this when workaround is no longer needed:
-	head = "[[" .. rsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"
+		head = "[[" .. ugsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"
-	Remove any empty links, which could have been created above
+		Remove any empty links, which could have been created above
-	at the beginning or end of the string.
+		at the beginning or end of the string.
-	]=]
+		]=]
-	return (head
+		return (head
-		:gsub("\1\2", "")
+			:gsub("\1\2", "")
-		:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
+			:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
+	end
 end
 local function non_categorizable(full_raw_pagename)
-	return full_raw_pagename:find("^Appendix:Gestures/")
+	return full_raw_pagename:find("^Appendix:Gestures/") or
+		-- Unsupported titles with descriptive names.
+		(full_raw_pagename:find("^Unsupported titles/") and not full_raw_pagename:find("`"))
 end
+-- Wrap the already-formatted head `formatted` in language/script tagging (face "head"), then attach the qualifiers,
+-- labels, references and separator found on `head`. `j` is the index of the head; `data.id` is only passed to
+-- tag_text() for the first head (j == 1).
+local function tag_text_and_add_quals_and_refs(data, head, formatted, j)
+	local lang = data.lang
+	local id = j == 1 and data.id or nil
+	local tagged = tag_text(formatted, lang, head.sc, "head", nil, id)
+	return format_term_with_qualifiers_and_refs(lang, head, tagged, j)
+end
 -- Format a headword with transliterations.
 local function format_headword(data)
-	local m_scriptutils = require(script_utilities_module)
 	-- Are there non-empty transliterations?
 	local has_translits = false
@@ Line 190: / Line 347: @@
 	local unique_head_parts = {}
-	local has_multiple_heads = #data.heads > 1
+	local has_multiple_heads = not not data.heads[2]
 	for j, head in ipairs(data.heads) do
@@ Line 204: / Line 361: @@
 		-- Apply processing to the headword, for formatting links and such.
 		if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Image" then
-			formatted = require(links_module).language_link{term = head.term, lang = data.lang}
+			formatted = language_link{term = head.term, lang = data.lang}
 		else
 			formatted = data.lang:makeDisplayText(head.term, head.sc, true)
 		end
-		local function tag_text_and_add_quals_and_refs(head, formatted, j)
+		local head_part = tag_text_and_add_quals_and_refs(data, head, formatted, j)
-			-- Add language and script wrapper.
-			formatted = m_scriptutils.tag_text(formatted, data.lang, head.sc, "head", nil, j == 1 and data.id or nil)
-			-- Add qualifiers, references and separator.
-			return format_term_with_qualifiers_and_refs(head, formatted, j)
-		end
-		local head_part = tag_text_and_add_quals_and_refs(head, formatted, j)
 		insert(head_parts, head_part)
@@ Line 227: / Line 376: @@
 				unique_head_part = head_part
 			else
-				unique_head_part = tag_text_and_add_quals_and_refs(head, formatted, 1)
+				unique_head_part = tag_text_and_add_quals_and_refs(data, head, formatted, 1)
 			end
 			unique_head_parts[unique_head_part] = true
@@ Line 243: / Line 392: @@
 	else
 		head_parts = concat(head_parts)
+	end
+	if has_manual_translits then
+		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/manual-tr]]
+		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/manual-tr/LANGCODE]]
+		track("manual-tr", data.lang)
 	end
@@ Line 255: / Line 410: @@
 				local this_parts = {}
 				if head.tr then
-					insert(this_parts, m_scriptutils.tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))
+					insert(this_parts, tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))
 					if head.ts then
 						insert(this_parts, " ")
@@ Line 261: / Line 416: @@
 				end
 				if head.ts then
-					insert(this_parts, "/" .. m_scriptutils.tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")
+					insert(this_parts, "/" .. tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")
 				end
 				insert(translit_parts, concat(this_parts))
@@ Line 270: / Line 425: @@
 		local langname = data.lang:getCanonicalName()
-		local transliteration_page = mw.title.new(langname .. " transliteration", "Wiktionary")
+		local transliteration_page = new_title(langname .. " transliteration")
 		local saw_translit_page = false
-		if transliteration_page and transliteration_page.exists then
+		if transliteration_page and transliteration_page:getContent() then
-			translits_formatted = " [[Wiktionary:" .. langname .. " transliteration|•]]" .. translits_formatted
+			translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
 			saw_translit_page = true
 		end
@@ Line 281: / Line 436: @@
 		if not saw_translit_page and data.lang:hasType("etymology-only") then
 			langname = data.lang:getFullName()
-			transliteration_page = mw.title.new(langname .. " transliteration", "Wiktionary")
+			transliteration_page = new_title(langname .. " transliteration", "Wiktionary")
-			if transliteration_page and transliteration_page.exists then
+			if transliteration_page and transliteration_page:getContent() then
 				translits_formatted = " [[Wiktionary:" .. langname .. " transliteration|•]]" .. translits_formatted
 			end
@@ Line 304: / Line 459: @@
-local function format_genders(data)
+local function format_headword_genders(data)
 	local retval = ""
-	if data.genders and #data.genders > 0 then
+	if data.genders and data.genders[1] then
 		if data.gloss then
 			retval = ","
 		end
 		local pos_for_cat
-		if not data.nogendercat and not m_data.no_gender_cat[data.lang:getCode()] and
+		if not data.nogendercat then
-			not m_data.no_gender_cat[data.lang:getFullCode()] then
+			local no_gender_cat = (m_data or get_data()).no_gender_cat
-			local pos_category = data.pos_category:gsub("^reconstructed ", "")
+			if not (no_gender_cat[data.lang:getCode()] or no_gender_cat[data.lang:getFullCode()]) then
-			pos_for_cat = m_data.pos_for_gender_number_cat[pos_category]
+				pos_for_cat = (m_data or get_data()).pos_for_gender_number_cat[data.pos_category:gsub("^reconstructed ", "")]
+			end
 		end
-		local text, cats = require(gender_and_number_module).format_genders(data.genders, data.lang, pos_for_cat)
+		local text, cats = format_genders(data.genders, data.lang, pos_for_cat)
-		for _, cat in ipairs(cats) do
+		if cats then
-			insert(data.categories, cat)
+			extend(data.categories, cats)
 		end
 		retval = retval .. "&nbsp;" .. text
@@ Line 325: / Line 481: @@
 end
+-- Forward reference
+local format_inflections
 local function format_inflection_parts(data, parts)
-	local any_part_translit = false
 	for j, part in ipairs(parts) do
 		if type(part) ~= "table" then
@@ Line 338: / Line 494: @@
 		if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then
 			error("The face `" .. face .. "` " .. (
-				mw.loadData(script_utilities_data_module).faces[face] and
+				(script_utilities_data or get_script_utilities_data()).faces[face] and
 				"should not be used for non-headword terms on the headword line." or
 				"is invalid."
@@ Line 347: / Line 503: @@
 		-- right into the 'data' table to disable inflection links of the entire headword
 		-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin
-		local nolinkinfl = data.nolinkinfl
+		local nolinkinfl = part.face == "hypothetical" or (part.nolink and track("nolink") or part.nolinkinfl) or (
+			data.nolink and track("nolink") or data.nolinkinfl)
 		local formatted
@@ Line 361: / Line 518: @@
 			-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
 			-- to be enabled in languages with more complex scripts (e.g. Arabic).
-			local tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil)
+			--
-			if tr ~= "-" then
+			-- FIXME: With nested inflections, should we also respect `enable_auto_translit` at the top level of the
-				any_part_translit = true
+			-- nested inflections structure?
+			local tr = part.tr or not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil
+			-- FIXME: Temporary errors added 2025-10-03. Remove after a month or so.
+			if part.translit then
+				error("Internal error: Use field `tr` not `translit` for specifying an inflection part translit")
+			end
+			if part.transcription then
+				error("Internal error: Use field `ts` not `transcription` for specifying an inflection part transcription")
+			end
+			local postprocess_annotations
+			if part.inflections then
+				postprocess_annotations = function(infldata)
+					insert(infldata.annotations, format_inflections(data, part.inflections))
+				end
 			end
-			formatted = require(links_module).full_link(
+			formatted = full_link(
 				{
 					term = not nolinkinfl and part.term or nil,
@@ Line 371: / Line 542: @@
 					lang = part.lang or data.lang,
 					sc = part.sc or parts.sc or nil,
+					gloss = part.gloss,
+					pos = part.pos,
+					lit = part.lit,
 					id = part.id,
 					genders = part.genders,
 					tr = tr,
-					ts = part.transcription,
+					ts = part.ts,
 					accel = partaccel or parts.accel,
+					postprocess_annotations = postprocess_annotations,
 				},
 				face
@@ Line 381: / Line 556: @@
 		end
-		parts[j] = format_term_with_qualifiers_and_refs(part, formatted, j)
+		parts[j] = format_term_with_qualifiers_and_refs(part.lang or data.lang, part,
+			formatted, j)
 	end
 	local parts_output
-	if #parts > 0 then
+	if parts[1] then
 		parts_output = (parts.label and " " or "") .. concat(parts)
 	elseif parts.request then
@@ Line 396: / Line 572: @@
 	local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
-	return parts_label .. parts_output, any_part_translit
+	return format_term_with_qualifiers_and_refs(data.lang, parts, parts_label .. parts_output, 1)
 end
--- Format the inflections following the headword.
+-- Format the inflections following the headword or nested after a given inflection. Declared local above.
-local function format_inflections(data)
+function format_inflections(data, inflections)
-	local any_part_translit = false
+	if inflections and inflections[1] then
-	if data.inflections and #data.inflections > 0 then
 		-- Format each inflection individually.
-		for key, infl in ipairs(data.inflections) do
+		for key, infl in ipairs(inflections) do
-			local this_any_part_translit
+			inflections[key] = format_inflection_parts(data, infl)
-			data.inflections[key], this_any_part_translit = format_inflection_parts(data, infl)
-			if this_any_part_translit then
-				any_part_translit = true
-			end
 		end
-		local concat_result = concat(data.inflections, ", ")
+		return concat(inflections, ", ")
-		return " (" .. concat_result .. ")"
 	else
 		return ""
 	end
 end
+-- Format the top-level inflections in `data.inflections` for display after the headword: the output of
+-- format_inflections() preceded by a space and wrapped in parentheses, or the empty string when that output is empty.
+local function format_top_level_inflections(data)
+	local formatted = format_inflections(data, data.inflections)
+	if formatted == "" then
+		return formatted
+	end
+	return " (" .. formatted .. ")"
+end
+-- Forward reference
+local check_red_link_inflections
+-- Check a single inflection (`parts`: a label plus zero or more terms, each possibly carrying nested inflections in
+-- `part.inflections`) for red links. A term is treated as a red link when it contains no embedded wikilink ("[["),
+-- get_link_page() yields a page name for it, and the title object for that page returns no content. On the first red
+-- link found, insert the category "LANG PLPOS with red links in their headword lines" into `data.categories` (LANG is
+-- the full language name, PLPOS is `plpos`, the plural part of speech), stop further processing, and return true.
+-- If no red links are found, return false.
+local function check_red_link_inflection_parts(data, parts, plpos)
+	for _, part in ipairs(parts) do
+		if type(part) ~= "table" then
+			-- A bare value is shorthand for a part consisting only of a term.
+			part = {term = part}
+		end
+		local term = part.term
+		if term and not term:find("%[%[") then
+			local stripped_physical_term = get_link_page(term, data.lang, part.sc or parts.sc or nil)
+			if stripped_physical_term then
+				local title = new_title(stripped_physical_term)
+				if title and not title:getContent() then
+					-- Record the red link here: the callers only propagate the boolean result and never add the
+					-- category themselves, so without this insertion the whole check has no visible effect.
+					-- NOTE(review): category name follows the upstream module's convention -- confirm for this wiki.
+					insert(data.categories, data.lang:getFullName() .. " " .. plpos ..
+						" with red links in their headword lines")
+					return true
+				end
+			end
+		end
+		-- Recurse into inflections nested under this part.
+		if part.inflections and check_red_link_inflections(data, part.inflections, plpos) then
+			return true
+		end
+	end
+	return false
+end
+-- Walk a list of inflections (each one describes a single inflection of the term, such as feminine or plural, and
+-- holds a label plus zero or more terms, which may themselves carry nested inflections) looking for red links.
+-- Each inflection is handed to check_red_link_inflection_parts() together with `plpos` (the plural part of speech);
+-- the walk stops at the first inflection for which that call reports a red link, and true is returned. Returns false
+-- when `inflections` is missing or empty, or when no red link is reported. Declared local above.
+function check_red_link_inflections(data, inflections, plpos)
+	if not (inflections and inflections[1]) then
+		return false
+	end
+	for _, inflection in ipairs(inflections) do
+		if check_red_link_inflection_parts(data, inflection, plpos) then
+			return true
+		end
+	end
+	return false
+end
+-- Entry point for red-link checking: runs check_red_link_inflections() over the top-level inflections in
+-- `data.inflections` (nested inflections are reached through that function), passing along `plpos`, the plural part
+-- of speech. Returns that function's result: true if a red link was reported, false otherwise.
+local function check_red_link_inflections_top_level(data, plpos)
+	local top_level_inflections = data.inflections
+	return check_red_link_inflections(data, top_level_inflections, plpos)
+end
 --[==[
--- Returns the plural form of `pos`, a raw part of speech input, which could be singular or
+Returns the plural form of `pos`, a raw part of speech input, which could be singular or
--- plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to
+plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to
--- "kanji").]==]
+"kanji").
+]==]
 function export.pluralize_pos(pos)
-	return m_data.irregular_plurals[pos] or
+	-- Make the plural form of the part of speech
+	return (m_data or get_data()).irregular_plurals[pos] or
 		pos:sub(-1) == "s" and pos or
-		-- Make the plural form of the part of speech
+		pluralize(pos)
-		require("Module:string utilities").pluralize(pos)
 end
 --[==[
--- Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil
+Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil
--- if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
+if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
--- If you have a POS in its singular form, call export.pluralize_pos() above to pluralize it
+If you have a POS in its singular form, call {export.pluralize_pos()} above to pluralize it
--- in a smart fashion that knows when to add "-s" and when to add "-es", and also takes
+in a smart fashion that knows when to add "-s" and when to add "-es", and also takes
--- into account any irregular plurals.]==]
+into account any irregular plurals.
---
--- If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
+If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
--- based on whether it ends in " forms"; otherwise, return nil.]==]
+based on whether it ends in " forms"; otherwise, return nil.
+]==]
 function export.pos_lemma_or_nonlemma(plpos, best_guess)
+	local m_headword_data = m_data or get_data()
+	local isLemma = m_headword_data.lemmas
 	-- Is it a lemma category?
 	if isLemma[plpos] then
@@ Line 450: / Line 695: @@
 	end
 	-- Is it a nonlemma category?
+	local isNonLemma = m_headword_data.nonlemmas
 	if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
 		return "non-lemma form"
@@ Line 463: / Line 709: @@
 end
+--[==[
+Canonicalize a part of speech as specified in 2= in {{tl|head}}: resolve POS aliases, expand non-lemma form aliases
+ending in 'f' to "... forms", and finally hand the result to {export.pluralize_pos()}. The aliases 'pre', 'pro' and
+'prof' are rejected with an error because they are too ambiguous.
+]==]
+function export.canonicalize_pos(pos)
+	-- FIXME: Temporary code to throw an error for alias 'pre' (= preposition) that will go away.
+	-- 'pref' is deliberately not rejected, as it's an alias for "prefix".
+	if pos == "pre" then
+		error("POS 'pre' for 'preposition' no longer allowed as it's too ambiguous; use 'prep'")
+	elseif pos == "pro" or pos == "prof" then
+		-- Likewise for pro = pronoun.
+		error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
+	end
+	local aliases = (m_data or get_data()).pos_aliases
+	local canonical = aliases[pos]
+	if not canonical and pos:sub(-1) == "f" then
+		-- Trailing 'f' marks a non-lemma form: strip it, resolve the remainder as an alias if possible, and
+		-- append " forms".
+		local base = pos:sub(1, -2)
+		canonical = (aliases[base] or base) .. " forms"
+	end
+	return export.pluralize_pos(canonical or pos)
+end
 -- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a
@@ Line 475: / Line 744: @@
 	local typ = type(data[element])
 	if typ ~= "table" then
-		error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
+		error(("Internal error: In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
 	end
 	for k, v in pairs(data[element]) do
 		if k ~= "maxindex" then
 			if type(k) ~= "number" then
-				error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
+				error(("Internal error: Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
 			end
 			if k > maxind then
@@ Line 487: / Line 756: @@
 			if v then
 				if type(v) ~= "string" then
-					error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
+					error(("Internal error: For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
 				end
 				if not allow_blank_string and v == "" then
-					error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
+					error(("Internal error: For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
 				end
 			end
@@ Line 519: / Line 788: @@
 		-- that.
 		if tbl == true then
-			if page.raw_defaultsort ~= sortkey then
-				insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
-			end
 			return
 		end
@@ Line 531: / Line 797: @@
 				different = true
 			end
-		end
-		if redundant then
-			insert(lang_cats, lang:getFullName() .. " terms with redundant sortkeys")
-		end
-		if different then
-			insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
 		end
 		return sortkey
 	end
 	function export.maintenance_cats(page, lang, lang_cats, page_cats)
-		for _, cat in ipairs(page.cats) do
+		extend(page_cats, page.cats)
-			insert(page_cats, cat)
-		end
 		lang = lang:getFull() -- since we are just generating categories
 		local canonical = lang:getCanonicalName()
@@ Line 550: / Line 808: @@
 		if tbl then
 			sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
-			insert(lang_cats, canonical .. " entries with topic categories using raw markup")
 		end
 		tbl = page.wikitext_langname_cat[canonical]
 		if tbl then
 			handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
-			insert(lang_cats, canonical .. " entries with language name categories using raw markup")
-		end
-		if require(utilities_module).get_current_L2() ~= canonical then
-			insert(lang_cats, canonical .. " entries with incorrect language header")
 		end
 	end
@@ Line 570: / Line 823: @@
 ]==]
 function export.full_headword(data)
-	local remove_links = require(links_module).remove_links
-	local format_categories = require(utilities_module).format_categories
 	-- Prevent data from being destructively modified.
-	local data = require(table_module).shallowcopy(data)
+	local data = shallow_copy(data)
 	------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------
 	if data.getCanonicalName then
-		error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
+		error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
 	end
 	if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
-		error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
+		error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
 	end
 	if data.id and type(data.id) ~= "string" then
-		error("The id in the data table should be a string.")
+		error("Internal error: The id in the data table should be a string.")
 	end
@@ Line 593: / Line 843: @@
 	local langcode = data.lang:getCode()
-	local full_langcode = langcode
+	local full_langcode = data.lang:getFullCode()
 	local langname = data.lang:getCanonicalName()
-	local full_langname = langname
+	local full_langname = data.lang:getFullName()
-	local raw_pagename, page = data.pagename
+	local raw_pagename = data.pagename
-	if raw_pagename and raw_pagename ~= m_data.pagename then -- for testing, doc pages, etc.
+	local page
-		page = require(headword_page_module).process_page(raw_pagename)
+	local m_headword_data = m_data or get_data()
+	if raw_pagename and raw_pagename ~= m_headword_data.pagename then -- for testing, doc pages, etc.
+		-- data.pagename is often set on documentation and test pages through the pagename= parameter of various
+		-- templates, to emulate running on that page. Having a large number of such test templates on a single
+		-- page often leads to timeouts, because we fetch and parse the contents of each page in turn. However,
+		-- we don't really need to do that and can function fine without fetching and parsing the contents of a
+		-- given page, so turn off content fetching/parsing (and also setting the DEFAULTSORT key through a parser
+		-- function, which is *slooooow*) in certain namespaces where test and documentation templates are likely to
+		-- be found and where actual content does not live (User, Template, Module).
+		local actual_namespace = m_headword_data.page.namespace
+		local no_fetch_content = actual_namespace == "User" or actual_namespace == "Template" or
+			actual_namespace == "Module"
+		page = process_page(raw_pagename, no_fetch_content)
 	else
-		page = m_data.page
+		page = m_headword_data.page
 	end
-	-- Check the namespace against the language type.
+	local namespace = page.namespace
-	if page.namespace == "" then
-		if data.lang:hasType("reconstructed") then
-			error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
-		elseif data.lang:hasType("appendix-constructed") then
-			error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
-		end
-	end
 	------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
@@ Line 618: / Line 873: @@
 		-- new-style
 		if data.translits or data.transcriptions then
-			error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
+			error("Internal error: In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
 		end
 	else
 		-- convert old-style `heads`, `translits` and `transcriptions` to new-style
-		local maxind = math.max(
+		local maxind = max(
-			init_and_find_maximum_index(data, "heads", true),
+			init_and_find_maximum_index(data, "heads"),
 			init_and_find_maximum_index(data, "translits", true),
 			init_and_find_maximum_index(data, "transcriptions", true)
@@ Line 643: / Line 898: @@
 	------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------
-	init_and_find_maximum_index(data, "categories", true)
+	-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]
-	init_and_find_maximum_index(data, "whole_page_categories", true)
+	if data.altform then
+		data.noposcat = true
+	end
+	init_and_find_maximum_index(data, "categories")
+	init_and_find_maximum_index(data, "whole_page_categories")
 	local pos_category_already_present = false
-	if #data.categories > 0 then
+	if data.categories[1] then
 		local escaped_langname = pattern_escape(full_langname)
 		local matches_lang_pattern = "^" .. escaped_langname .. " "
+		for _, cat in ipairs(data.categories) do
+			-- Does the category begin with the language name? If not, tag it with a tracking category.
+			if not cat:find(matches_lang_pattern) then
+				-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/no lang category]]
+				-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/no lang category/LANGCODE]]
+				track("no lang category", data.lang)
+			end
+		end
 		-- If `pos_category` not given, try to infer it from the first specified category. If this doesn't work, we
@@ Line 660: / Line 928: @@
 	if not data.pos_category then
-		error("`data.pos_category` not specified and could not be inferred from the categories given in "
+		error("Internal error: `data.pos_category` not specified and could not be inferred from the categories given in "
 			.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
 			.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
@@ Line 679: / Line 947: @@
 	-- add an appropriate category.
 	local postype = export.pos_lemma_or_nonlemma(data.pos_category)
-	if not data.noposcat then
+    local main_cat = data.lang:getMainCategoryName()
-		insert(data.categories, 1, full_langname .. " " .. postype .. "s")
+    if not postype then
+    elseif not data.noposcat then
+        if postype:match("^lemma") and main_cat ~= "lemma" then
+            postype = main_cat
+        end
+        insert(data.categories, 1, full_langname .. " " .. postype .. "s")
+    end
+    insert(data.categories, 1, "Contionary")
+	-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]
+	if data.altform then
+		insert(data.categories, 1, full_langname .. " alternative forms")
 	end
 	------------ 5. Create a default headword, and add links to multiword page names. ------------
+	-- Determine if this is an "anti-asterisk" term, i.e. an attested term in a language that must normally be
+	-- reconstructed.
+	local is_anti_asterisk = data.heads[1].term and data.heads[1].term:find("^!!")
+	local lang_reconstructed = data.lang:hasType("reconstructed")
+	if is_anti_asterisk then
+		if not lang_reconstructed then
+			error("Anti-asterisk feature (head= beginning with !!) can only be used with reconstructed languages")
+		end
+		lang_reconstructed = false
+	end
 	-- Determine if term is reconstructed
-	local is_reconstructed = page.namespace == "Reconstruction" or data.lang:hasType("reconstructed")
+	local is_reconstructed = namespace == "Reconstruction" or lang_reconstructed
 	-- Create a default headword based on the pagename, which is determined in
@@ Line 693: / Line 985: @@
 	-- Add links to multi-word page names when appropriate
-	if not data.nolinkhead and not m_data.no_multiword_links[langcode] and not m_data.no_multiword_links[full_langcode]
+	if not (is_reconstructed or data.nolinkhead) then
-		and	not is_reconstructed and export.head_is_multiword(default_head) then
+		local no_links = m_headword_data.no_multiword_links
-		default_head = export.add_multiword_links(default_head, true)
+		if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then
+			default_head = export.add_multiword_links(default_head, true)
+		end
 	end
-	if is_reconstructed then
+	if is_reconstructed and not data.lang:hasType("conlang") then
 		default_head = "*" .. default_head
 	end
-	------------ 6. Fill in missing values in `data.heads`. ------------
+	------------ 6. Check the namespace against the language type. ------------
+	if namespace == "" then
+		if lang_reconstructed then
+			error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
+		elseif data.lang:hasType("appendix-constructed") then
+			error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
+		end
+	elseif namespace == "Citations" or namespace == "Thesaurus" then
+		error("Headword templates should not be used in the " .. namespace .. ": namespace.")
+	end
+	------------ 7. Fill in missing values in `data.heads`. ------------
 	-- True if any script among the headword scripts has spaces in it.
@@ Line 711: / Line 1,017: @@
 	for _, head in ipairs(data.heads) do
-		------ 6a. If missing head, replace with default head.
+		------ 7a. If missing head, replace with default head.
 		if not head.term then
 			head.term = default_head
 		elseif head.term == default_head then
 			has_redundant_head_param = true
+		elseif is_anti_asterisk and head.term == "!!" then
+			-- If explicit head=!! is given, it's an anti-asterisk term and we fill in the default head.
+			head.term = "!!" .. default_head
+		elseif head.term:find("^[!?]$") then
+			-- If explicit head= just consists of ! or ?, add it to the end of the default head.
+			head.term = default_head .. head.term
 		end
+		head.term_no_initial_bang_bang = is_anti_asterisk and head.term:sub(3) or head.term
 		if is_reconstructed then
 			local head_term = head.term
 			if head_term:find("%[%[") then
-				head_term = require(links_module).remove_links(head_term)
+				head_term = remove_links(head_term)
 			end
 			if head_term:sub(1, 1) ~= "*" then
 				error("The headword '"  .. head_term .. "' must begin with '*' to indicate that it is reconstructed.")
+			end
+		end
+		------ 7b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
+		------     otherwise fall back to the overall script if given. If neither given, autodetect the script.
+		local auto_sc = data.lang:findBestScript(head.term)
+		if not (head.sc or data.sc) then -- No script code given, so use autodetected script.
+			head.sc = auto_sc
+		else
+			if not head.sc then -- Overall script code given.
+				head.sc = data.sc
 			end
 		end
 		-- If using a discouraged character sequence, add to maintenance category.
-		-- if head.sc:hasNormalizationFixes() == true then
+		if head.sc:hasNormalizationFixes() == true then
-		--	local composed_head = toNFC(head.term)
+			local composed_head = toNFC(head.term)
-		--	if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then
+			if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then
-		--		insert(data.whole_page_categories, "Pages using discouraged character sequences")
+				insert(data.whole_page_categories, "Pages using discouraged character sequences")
-		--	end
+			end
-		-- end
+		end
 		any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()
-		------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given
+		------ 7c. Create automatic transliterations for any non-Latin headwords without manual translit given
 		------     (provided automatic translit is available, e.g. not in Persian or Hebrew).
 		-- Make transliterations
 		head.tr_manual = nil
 		-- Try to generate a transliteration if necessary
 		if head.tr == "-" then
 			head.tr = nil
-		elseif not notranslit[langcode] and not notranslit[full_langcode] and head.sc:isTransliterated() then
+		else
-			head.tr_manual = not not head.tr
+			local notranslit = m_headword_data.notranslit
+			if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then
+				head.tr_manual = not not head.tr
-			local text = head.term
+				local text = head.term_no_initial_bang_bang
-			if not data.lang:link_tr(head.sc) then
+				if not data.lang:link_tr(head.sc) then
-				text = remove_links(text)
+					text = remove_links(text)
-			end
+				end
-			local automated_tr, tr_categories
+				local automated_tr = data.lang:transliterate(text, head.sc)
-			automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)
-			if automated_tr or head.tr_fail then
+				if automated_tr then
-				local manual_tr = head.tr
+					local manual_tr = head.tr
-				if manual_tr then
+					if not manual_tr then
-					if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then
+						head.tr = automated_tr
-						insert(data.categories, full_langname .. " terms with redundant transliterations")
-					elseif not head.tr_fail then
-						insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
 					end
 				end
-				if not manual_tr then
+				-- There is still no transliteration?
-					head.tr = automated_tr
+				-- Add the entry to a cleanup category.
-					for _, category in ipairs(tr_categories) do
+				if not head.tr then
-						insert(data.categories, category)
+					head.tr = "<small>transliteration needed</small>"
-					end
+					-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.
+					-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].
+					insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")
+				else
+					-- Otherwise, trim it.
+					head.tr = trim(head.tr)
 				end
-			end
-			-- There is still no transliteration?
-			-- Add the entry to a cleanup category.
-			if not head.tr then
-				head.tr = "<small>transliteration needed</small>"
-				-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.
-				-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].
-				insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")
-			else
-				-- Otherwise, trim it.
-				head.tr = mw.text.trim(head.tr)
 			end
 		end
@@ Line 792: / Line 1,109: @@
 		-- Link to the transliteration entry for languages that require this.
 		if head.tr and data.lang:link_tr(head.sc) then
-			head.tr = require(links_module).full_link {
+			head.tr = full_link{
 				term = head.tr,
 				lang = data.lang,
-				sc = require(scripts_module).getByCode("Latn"),
+				sc = get_script("Latn"),
 				tr = "-"
 			}
@@ Line 801: / Line 1,118: @@
 	end
-	------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------
+	------------ 8. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------
 	-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
@@ Line 813: / Line 1,130: @@
 	local dt_script = data.heads[1].sc
 	local dt_script_code = dt_script:getCode()
-	local page_non_ascii = page.namespace == "" and not page.pagename:find("^[%z\1-\127]+$")
+	local page_non_ascii = namespace == "" and not page.pagename:find("^[%z\1-\127]+$")
 	local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^Unsupported titles/", "")
 	if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
 		display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
-	elseif page_non_ascii and toBeTagged[dt_script_code]
+	elseif page_non_ascii and m_headword_data.toBeTagged[dt_script_code]
 		or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
 		or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
@@ Line 824: / Line 1,141: @@
 	elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then
 		display_title = '<span class="Hani">' .. page.full_raw_pagename .. '</span>'
-	elseif page.namespace == "Reconstruction" then
+	elseif namespace == "Reconstruction" then
 		local matched
-		display_title, matched = rsubn(
+		display_title, matched = ugsub(
 			page.full_raw_pagename,
 			"^(Reconstruction:[^/]+/)(.+)$",
 			function(before, term)
-				return before ..
+				return before .. tag_text(term, data.lang, dt_script)
-					require(script_utilities_module).tag_text(
-						term,
-						data.lang,
-						dt_script
-					)
 			end
 		)
@@ Line 841: / Line 1,153: @@
 			display_title = nil
 		end
+	end
+	-- FIXME: Generalize this.
+	-- If the current language uses ur-Arab (for Urdu, etc.), ku-Arab (Central Kurdish) or pa-Arab
+	-- (Shahmukhi, for Punjabi) and there's more than one language on the page, don't set the display title
+	-- because these three scripts display in Nastaliq and we don't want terms that also exist in other
+	-- languages that don't display in Nastaliq (e.g. Arabic or Persian) to display in Nastaliq. Because the word
+	-- "Urdu" occurs near the end of the alphabet, Urdu fonts tend to override the fonts of other languages.
+	-- FIXME: This is checking for more than one language on the page but instead needs to check if there are any
+	-- languages using scripts other than the ones just mentioned.
+	if (dt_script_code == "ur-Arab" or dt_script_code == "ku-Arab" or dt_script_code == "pa-Arab") and page.L2_list.n > 1 then
+		display_title = nil
 	end
@@ Line 850: / Line 1,173: @@
 	end
-	------------ 8. Insert additional categories. ------------
+	------------ 9. Insert additional categories. ------------
+	if data.force_cat_output then
+		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/force cat output]]
+		track("force cat output")
+	end
 	if has_redundant_head_param then
 		if not data.no_redundant_head_cat then
-			insert(data.categories, full_langname .. " terms with redundant head parameter")
+			-- This is not the right way to go about this; too many exceptions and problems due to language-specific headword
+			-- handling customization. If we want this, it should be opt-in by a given language passing in the default headword.
+			-- insert(data.categories, full_langname .. " terms with redundant head parameter")
 		end
 	end
 	-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
-	if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" and
+	if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then
-		not m_data.no_multiword_cat[langcode] and not m_data.no_multiword_cat[full_langcode] then
+		local no_multiword_cat = m_headword_data.no_multiword_cat
-		-- Check for spaces or hyphens, but exclude prefixes and suffixes.
+		if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then
-		-- Use the pagename, not the head= value, because the latter may have extra
+			-- Check for spaces or hyphens, but exclude prefixes and suffixes.
-		-- junk in it, e.g. superscripted text that throws off the algorithm.
+			-- Use the pagename, not the head= value, because the latter may have extra
-		local checkpattern = ".[%s%-፡]."
+			-- junk in it, e.g. superscripted text that throws off the algorithm.
-		if m_data.hyphen_not_multiword_sep[langcode] or m_data.hyphen_not_multiword_sep[full_langcode] then
+			local no_hyphen = m_headword_data.hyphen_not_multiword_sep
-			-- Exclude hyphens if the data module states that they should for this language
+			-- Exclude hyphens if the data module states that they should for this language.
-			checkpattern = ".[%s፡]."
+			local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."
-		end
+			local is_multiword = umatch(page.pagename, checkpattern)
-		if rmatch(page.pagename, checkpattern) and not non_categorizable(page.full_raw_pagename) then
-			insert(data.categories, full_langname .. " multiword terms")
+			if is_multiword and not non_categorizable(page.full_raw_pagename) then
+				insert(data.categories, full_langname .. " multiword terms")
+			elseif not is_multiword then
+				local long_word_threshold = m_headword_data.long_word_thresholds[langcode] or
+					m_headword_data.long_word_thresholds[full_langcode]
+				if long_word_threshold and ulen(page.pagename) >= long_word_threshold then
+					insert(data.categories, "Long " .. full_langname .. " words")
+				end
+			end
 		end
 	end
-	if data.sccat then
+	local default_sccat = m_headword_data.default_sccat
+	if data.sccat or data.sccat == nil and (default_sccat[langcode] or default_sccat[full_langcode]) then
 		for _, head in ipairs(data.heads) do
 			insert(data.categories, full_langname .. " " .. data.pos_category .. " in " ..
@@ Line 882: / Line 1,221: @@
 	-- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated.
-	if page.namespace ~= "Reconstruction" then
+	if namespace ~= "Reconstruction" then
 		-- Map from languages to a string containing the characters to ignore when considering whether a term has
 		-- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic
 		-- values.
 		local characters_to_ignore = {
-			["aaq"] = "α", -- Penobscot
+			["aaq"] = "αάὰ", -- Penobscot (Algonquian)
 			["acy"] = "δθ", -- Cypriot Arabic
-			["anc"] = "γ", -- Ngas
+			["aez"] = "β", -- Aeka (Trans-New Guinea)
-			["aou"] = "χ", -- A'ou
+			["anc"] = "γ", -- Ngas (Chadic/Afroasiatic)
-			["awg"] = "β", -- Anguthimri
+			["aou"] = "χ", -- A'ou (Kra-Dai)
-			["bhp"] = "β", -- Bima
+			["art-blk"] = "ч", -- Bolak (conlang)
-			["byk"] = "θ", -- Biao
+			["awg"] = "β", -- Anguthimri (Pama-Nyungan)
-			["cdy"] = "θ", -- Chadong
+			["az"] = "ь", -- Azerbaijani (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
-			["clm"] = "χ", -- Klallam
+			["ba"] = "ь", -- Bashkir (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
-			["col"] = "χ", -- Colombia-Wenatchi
+			["bhp"] = "β", -- Bima (Austronesian)
-			["coo"] = "χ", -- Comox; FIXME: others? E.g. Greek theta (θ)?
+			["bjz"] = "β", -- Baruga (Trans-New Guinea)
-			["ets"] = "θ", -- Yekhee
+			["byk"] = "θ", -- Biao (Kra-Dai)
-			["gmw-gts"] = "χ", -- Gottscheerish
+			["cdy"] = "θ", -- Chadong (Kra-Dai)
-			["hur"] = "θ", -- Halkomelem
+			["chp"] = "θ", -- Chipewyan (Athabaskan)
-			["izh"] = "ь", -- Ingrian
+			["cjh"] = "χ", -- Upper Chehalis (Salishan)
-			["kic"] = "θ", -- Kickapoo
+			["clm"] = "χ", -- Klallam (Salishan)
-			["lil"] = "χ", -- Lillooet
+			["col"] = "χ", -- Colombia-Wenatchi (Salishan)
+			["coo"] = "χθ", -- Comox (Salishan)
+			["crx"] = "θ", -- Carrier (Athabaskan)
+			["ets"] = "θ", -- Yekhee (Edoid/Niger-Congo)
+			["ett"] = "χ", -- Etruscan (isolate; in romanizations)
+			["fla"] = "χ", -- Montana Salish (Salishan)
+			["grt"] = "་", -- Garo (South Asian Sino-Tibetan)
+			["gmw-gts"] = "χ", -- Gottscheerish (Bavarian variant spoken in Slovenia)
+			["hur"] = "χθ", -- Halkomelem (Salishan)
+			["itc-psa"] = "f", -- Pre-Samnite (Italic; normally written in Greek)
+			["izh"] = "ь", -- Ingrian (Finnic)
+			["kic"] = "θ", -- Kickapoo (Algonquian)
+			["kk"] = "ь", -- Kazakh (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
+			["ky"] = "ь", -- Kyrgyz (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
+			["lil"] = "χ", -- Lillooet (Salishan)
+			["lsi"] = "ꓹ", -- Lashi (Lolo-Burmese/Sino-Tibetan; represents a glottal stop)
 			["mhz"] = "β", -- Mor (Austronesian)
-			["neg"]=  "ӡ", -- Negidal (normally in Cyrillic)
+			["mqn"] = "β", -- Moronene (Austronesian)
-			["oui"] = "γβ", -- Old Uyghur: FIXME: others? E.g. Greek delta (δ)?
+			["neg"]=  "ӡā", -- Negidal (Tungusic; normally in Cyrillic)
-			["pox"] = "χ", -- Polabian
+			["oka"] = "χ", -- Okanagan (Salishan)
-			["rom"] = "Θθ", -- Romani: International Standard; two different thetas???
+			["ole"] = "θ", -- Olekha (Sino-Tibetan)
-			["sah"] = "ь", -- Yakut (1929 - 1939 Latin spelling)
+			["oui"] = "γβ", -- Old Uyghur (Turkic; FIXME: others? E.g. Greek delta (δ)?)
-			["sjw"] = "θ", -- Shawnee
+			["pox"] = "χ", -- Polabian (West Slavic)
-			["squ"] = "χ", -- Squamish
+			["rif"] = "ε", -- Tarifit (Berber)
-			["str"] = "χθ", -- Saanich; uses two Greek letters
+			["rom"] = "Θθ", -- Romani (Indic: International Standard; two different thetas???)
-			["twa"] = "χ", -- Twana
+			["rpn"] = "β", -- Repanbitip (Austronesian)
-			["yha"] = "θ", -- Baha
+			["sah"] = "ь", -- Yakut (Turkic; 1929 - 1939 Latin spelling)
-			["za"] = "зч", -- Zhuang; 1957-1982 alphabet used two Cyrillic letters (as well as some others like
+			["sit-jap"] = "χ", -- Japhug (Sino-Tibetan)
+			["sjw"] = "θ", -- Shawnee (Algonquian)
+			["squ"] = "χ", -- Squamish (Salishan)
+			["str"] = "χθ", -- Saanich (Salishan)
+			["teh"] = "χ", -- Tehuelche (Chonan; spoken in Argentina)
+			["tep"] = "η", -- Tepecano (Uto-Aztecan)
+			["thp"] = "χ", -- Thompson (Salishan)
+			["tk"] = "ь", -- Turkmen (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
+			["tt"] = "ь", -- Tatar (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
+			["twa"] = "χ", -- Twana (Salishan)
+			["wbl"] = "ы", -- Wakhi (Iranian)
+			["xbc"] = "ϸ", -- Bactrian (Iranian; represents š; normally written in Greek)
+			["yha"] = "θ", -- Baha (Kra-Dai)
+			["za"] = "зч", -- Zhuang (Tai/Kra-Dai); 1957-1982 alphabet used two Cyrillic letters (as well as some others like
 						   -- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin)
-			["zlw-slv"] = "χђћ", -- Slovincian; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure
+			["zlw-slv"] = "χђћ", -- Slovincian (West Slavic; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure
-								 -- the currect characters are being chosen in the entry names
+								 -- the correct characters are being chosen in the entry names)
-			["zng"] = "θ", -- Mang
+			["zng"] = "θ", -- Mang (Mon-Khmer)
+			["ztp"] = "θ", -- Loxicha Zapotec (Zapotecan)
 		}
 		-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude
@@ Line 932: / Line 1,300: @@
 		local ch_to_ignore = characters_to_ignore[full_langcode]
 		if ch_to_ignore then
-			canon_pagename = rsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")
+			canon_pagename = ugsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")
 		end
-		local script_data = mw.loadData(scripts_data_module)
 		while true do
 			if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then
@@ Line 941: / Line 1,308: @@
 			-- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]]
 			num_loops = num_loops + 1
-			local pagename_script = require(scripts_module).findBestScriptWithoutLang(canon_pagename, "None only as last resort")
+			local pagename_script = find_best_script_without_lang(canon_pagename, "None only as last resort")
 			local script_chars = pagename_script.characters
 			if not script_chars then
@@ Line 949: / Line 1,316: @@
 			local script_code = pagename_script:getCode()
 			local replaced
-			canon_pagename, replaced = rsubn(canon_pagename, "[" .. script_chars .. "]", "")
+			canon_pagename, replaced = ugsub(canon_pagename, "[" .. script_chars .. "]", "")
-			if replaced and script_code ~= "Zmth" and script_data[script_code] and
+			if (
-				script_data[script_code].character_category ~= false then
+				replaced and
+				script_code ~= "Zmth" and
+				(script_data or get_script_data())[script_code] and
+				script_data[script_code].character_category ~= false
+			) then
 				script_code = script_code:gsub("^.-%-", "")
 				if not seen_scripts[script_code] then
@@ Line 964: / Line 1,335: @@
 		end
 	end
 	-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats: single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character + diacritic(s) + character). Each can have any number of diacritics.
 	local standard = data.lang:getStandardCharacters()
@@ Line 1,008: / Line 1,379: @@
 							return ""
 						end
-						local sc_standard = rsub(sc_standard, page.comb_chars.combined_double, explode)
+						local sc_standard = ugsub(sc_standard, page.comb_chars.combined_double, explode)
-						sc_standard = rsub(sc_standard,page.comb_chars.combined_single, explode)
+						sc_standard = ugsub(sc_standard,page.comb_chars.combined_single, explode)
 							:gsub(".[\128-\191]*", explode)
 						local num_cat_inserted
@@ Line 1,019: / Line 1,390: @@
 										num_cat_inserted = true
 									end
-								elseif rfind(char, page.emoji_pattern) then
+								elseif ufind(char, page.emoji_pattern) then
 									insert(data.categories, full_langname .. " terms spelled with emoji")
 								else
@@ Line 1,033: / Line 1,404: @@
 					-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.
 					sc_standard = toNFD(sc_standard)
-					for diacritic in rgmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do
+					for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do
-						if not rmatch(sc_standard, diacritic) then
+						if not umatch(sc_standard, diacritic) then
 							insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic)
 						end
 					end
-					for diacritic in rgmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do
+					for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do
-						if not rmatch(sc_standard, diacritic) then
+						if not umatch(sc_standard, diacritic) then
 							insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic .. "◌")
 						end
@@ Line 1,047: / Line 1,418: @@
 		-- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them).
 		elseif ulen(page.pagename) ~= 1 then
-			for character in rgmatch(page.pagename, "([^" .. standard .. "])") do
+			for character in ugmatch(page.pagename, "([^" .. standard .. "])") do
 				local upper = char_category(character)
-				if not rmatch(upper, "[" .. standard .. "]") then
+				if not umatch(upper, "[" .. standard .. "]") then
 					character = upper
 				end
@@ Line 1,056: / Line 1,427: @@
 		end
 	end
-	if data.heads[1].sc:isSystem("alphabet") then
-		local pagename, i = page.pagename:ulower(), 2
-		while rmatch(pagename, "(%a)" .. ("%1"):rep(i)) do
-			i = i + 1
-			insert(data.categories, full_langname .. " terms with " .. i .. " consecutive instances of the same letter")
-		end
-	end
 	-- Categorise for palindromes
-	if not data.nopalindromecat and page.namespace ~= "Reconstruction" and ulen(page.pagename) > 2
+	if not data.nopalindromecat and namespace ~= "Reconstruction" and ulen(page.pagename) > 2
 		-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of
 		-- multiple scripts?
-		and require(palindromes_module).is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
+		and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
 		insert(data.categories, full_langname .. " palindromes")
 	end
-	if page.namespace == "" and not data.lang:hasType("reconstructed") then
+	if namespace == "" and not lang_reconstructed then
-		local m_links = require(links_module)
+		for _, head in ipairs(data.heads) do
+			if page.full_raw_pagename ~= get_link_page(remove_links(head.term), data.lang, head.sc) then
+				-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch]]
+				-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch/LANGCODE]]
+				track("pagename spelling mismatch", data.lang)
+				break
+			end
+		end
+	end
+	-- Add red link category if called for and we're not a "large" page, where such checks are disabled.
+	if data.checkredlinks and not m_headword_data.large_pages[m_headword_data.pagename] then
+		local plposcat = type(data.checkredlinks) == "string" and data.checkredlinks or data.pos_category
+		check_red_link_inflections_top_level(data, plposcat)
 	end
@@ Line 1,080: / Line 1,456: @@
 	export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)
-	------------ 9. Format and return headwords, genders, inflections and categories. ------------
+	------------ 10. Format and return headwords, genders, inflections and categories. ------------
 	-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
@@ Line 1,086: / Line 1,462: @@
 	local text = '<span class="headword-line">' ..
 		format_headword(data) ..
-		format_genders(data) ..
+		format_headword_genders(data) ..
-		format_inflections(data) .. '</span>'
+		format_top_level_inflections(data) .. '</span>'
 	-- Language-specific categories.