Module:headword: Difference between revisions

No edit summary
No edit summary
 
(10 intermediate revisions by the same user not shown)
Line 20: Line 20:


local concat = table.concat
local concat = table.concat
local dump = mw.dumpObject
local insert = table.insert
local insert = table.insert
local ipairs = ipairs
local ipairs = ipairs
Line 36: Line 37:
--[==[
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
local function encode_entities(...)
local function encode_entities(...)
encode_entities = require(string_utilities_module).encode_entities
encode_entities = require(string_utilities_module).encode_entities
return encode_entities(...)
return encode_entities(...)
end
-- Lazy loader for the extend() function from the table module; on first call
-- it overwrites itself with the real function (see the loader note above), so
-- later calls go straight to the target with no extra overhead.
local function extend(...)
extend = require(table_module).extend
return extend(...)
end
end


Line 218: Line 223:
end
end
if type(list) ~= "table" then
if type(list) ~= "table" then
error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, mw.dumpObject(list)))
error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, dump(list)))
end
end
return list[1]
return list[1]
Line 313: Line 318:
local unique_head_parts = {}
local unique_head_parts = {}


local has_multiple_heads = #data.heads > 1
local has_multiple_heads = not not data.heads[2]


for j, head in ipairs(data.heads) do
for j, head in ipairs(data.heads) do
Line 388: Line 393:
local saw_translit_page = false
local saw_translit_page = false


if transliteration_page and transliteration_page.exists then
if transliteration_page and transliteration_page:getContent() then
if data.lang:hasType("conlang") then
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
else
translits_formatted = " [[Wiktionary:" .. langname .. " transliteration|•]]" .. translits_formatted
end
saw_translit_page = true
saw_translit_page = true
end
end
Line 402: Line 403:
transliteration_page = new_title(langname .. " transliteration")
transliteration_page = new_title(langname .. " transliteration")


if transliteration_page and transliteration_page.exists then
if transliteration_page and transliteration_page:getContent() then
translits_formatted = " [[Wiktionary:" .. langname .. " transliteration|•]]" .. translits_formatted
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
end
end
end
end
Line 425: Line 426:
local function format_headword_genders(data)
local function format_headword_genders(data)
local retval = ""
local retval = ""
if data.genders and #data.genders > 0 then
if data.genders and data.genders[1] then
if data.gloss then
if data.gloss then
retval = ","
retval = ","
Line 437: Line 438:
end
end
local text, cats = format_genders(data.genders, data.lang, pos_for_cat)
local text, cats = format_genders(data.genders, data.lang, pos_for_cat)
for _, cat in ipairs(cats) do
if cats then
insert(data.categories, cat)
extend(data.categories, cats)
end
end
retval = retval .. " " .. text
retval = retval .. " " .. text
Line 445: Line 446:
end
end


-- Forward reference
local format_inflections


local function format_inflection_parts(data, parts)
local function format_inflection_parts(data, parts)
local any_part_translit = false
for j, part in ipairs(parts) do
for j, part in ipairs(parts) do
if type(part) ~= "table" then
if type(part) ~= "table" then
Line 467: Line 468:
-- right into the 'data' table to disable inflection links of the entire headword
-- right into the 'data' table to disable inflection links of the entire headword
-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin
-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin
local nolinkinfl = part.face == "hypothetical" or part.nolinkinfl or data.nolinkinfl
local nolinkinfl = part.face == "hypothetical" or part.nolinkinfl or data.nolinkinfl


local formatted
local formatted
Line 481: Line 482:
-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
-- to be enabled in languages with more complex scripts (e.g. Arabic).
-- to be enabled in languages with more complex scripts (e.g. Arabic).
local tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil)
--
if tr ~= "-" then
-- FIXME: With nested inflections, should we also respect `enable_auto_translit` at the top level of the
any_part_translit = true
-- nested inflections structure?
local tr = part.tr or not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil
-- FIXME: Temporary errors added 2025-10-03. Remove after a month or so.
if part.translit then
error("Internal error: Use field `tr` not `translit` for specifying an inflection part translit")
end
if part.transcription then
error("Internal error: Use field `ts` not `transcription` for specifying an inflection part transcription")
end
local postprocess_annotations
if part.inflections then
postprocess_annotations = function(infldata)
insert(infldata.annotations, format_inflections(data, part.inflections))
end
end
end
formatted = full_link(
formatted = full_link(
{
{
Line 497: Line 512:
genders = part.genders,
genders = part.genders,
tr = tr,
tr = tr,
ts = part.transcription,
ts = part.ts,
accel = partaccel or parts.accel,
accel = partaccel or parts.accel,
postprocess_annotations = postprocess_annotations,
},
},
face
face
Line 510: Line 526:
local parts_output
local parts_output


if #parts > 0 then
if parts[1] then
parts_output = (parts.label and " " or "") .. concat(parts)
parts_output = (parts.label and " " or "") .. concat(parts)
elseif parts.request then
elseif parts.request then
Line 520: Line 536:


local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
return parts_label .. parts_output, any_part_translit
return format_term_with_qualifiers_and_refs(data.lang, parts, parts_label .. parts_output, 1)
end
end




-- Format the inflections following the headword.
-- Format the inflections following the headword or nested after a given inflection. Declared local above.
local function format_inflections(data)
function format_inflections(data, inflections)
local any_part_translit = false
if inflections and inflections[1] then
if data.inflections and #data.inflections > 0 then
-- Format each inflection individually.
-- Format each inflection individually.
for key, infl in ipairs(data.inflections) do
for key, infl in ipairs(inflections) do
local this_any_part_translit
inflections[key] = format_inflection_parts(data, infl)
data.inflections[key], this_any_part_translit = format_inflection_parts(data, infl)
if this_any_part_translit then
any_part_translit = true
end
end
end


local concat_result = concat(data.inflections, ", ")
return concat(inflections, ", ")
return " (" .. concat_result .. ")"
else
else
return ""
return ""
end
end
end
end
-- Format the top-level inflections in `data.inflections`, which follow the
-- headword proper. The only top-level-specific behavior is wrapping the
-- comma-separated formatted inflections in parentheses when non-empty.
local function format_top_level_inflections(data)
	local formatted = format_inflections(data, data.inflections)
	if formatted == "" then
		return formatted
	end
	return " (" .. formatted .. ")"
end
-- Forward reference
local check_red_link_inflections
-- Check a single inflection -- a label plus zero or more terms, each of which
-- may itself carry nested inflections -- for red links. On the first red link
-- found, insert a red-link category based on `plpos` (the plural part of
-- speech to insert in the category) into `data.categories`, stop further
-- processing and return true. If no red links are found, return false.
local function check_red_link_inflection_parts(data, parts, plpos)
	for _, raw_part in ipairs(parts) do
		local part = raw_part
		if type(part) ~= "table" then
			part = {term = part}
		end
		local term = part.term
		-- Only unlinked terms can be red links; skip any term that already
		-- contains an explicit [[...]] link.
		if term and not term:find("%[%[") then
			local link_page = get_link_page(term, data.lang, part.sc or parts.sc or nil)
			if link_page then
				local title = mw.title.new(link_page)
				if title and not title:getContent() then
					insert(data.categories, data.lang:getFullName() .. " " .. plpos .. " with red links in their headword lines")
					return true
				end
			end
		end
		-- Recurse into any nested inflections attached to this part.
		if part.inflections and check_red_link_inflections(data, part.inflections, plpos) then
			return true
		end
	end
	return false
end
-- Check a set of inflections (each of which describes a single inflection of the term, such as feminine or
-- plural, and consists of a label and zero or more terms, each possibly with nested inflections) for red
-- links. If any are found, insert a red-link category based on `plpos` (the plural part of speech to insert
-- in the category) into `data.categories`, stop further processing, and return true. If no red links are
-- found, return false. Declared `local` above (forward reference), since it is mutually recursive with
-- check_red_link_inflection_parts.
function check_red_link_inflections(data, inflections, plpos)
	if inflections and inflections[1] then
		-- Check each inflection individually; the index is unused, so use `_`.
		for _, infl in ipairs(inflections) do
			if check_red_link_inflection_parts(data, infl, plpos) then
				return true
			end
		end
	end
	return false
end
-- Check the top-level inflections in `data.inflections`, along with any nested inflections, for red links.
-- If any are found, a red-link category based on `plpos` (the plural part of speech to insert in the
-- category) is added and true is returned; otherwise false is returned. Thin wrapper around
-- check_red_link_inflections().
local function check_red_link_inflections_top_level(data, plpos)
return check_red_link_inflections(data, data.inflections, plpos)
end


--[==[
--[==[
Line 567: Line 649:
]==]
]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
function export.pos_lemma_or_nonlemma(plpos, best_guess)
local isLemma = (m_data or get_data()).lemmas
local m_headword_data = m_data or get_data()
local isLemma = m_headword_data.lemmas
-- Is it a lemma category?
-- Is it a lemma category?
if isLemma[plpos] then
if isLemma[plpos] then
Line 577: Line 660:
end
end
-- Is it a nonlemma category?
-- Is it a nonlemma category?
local isNonLemma = (m_data or get_data()).nonlemmas
local isNonLemma = m_headword_data.nonlemmas
if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
return "non-lemma form"
return "non-lemma form"
Line 605: Line 688:
error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
end
end
local data = m_data or get_data()
local m_headword_data = m_data or get_data()
if data.pos_aliases[pos] then
if m_headword_data.pos_aliases[pos] then
pos = data.pos_aliases[pos]
pos = m_headword_data.pos_aliases[pos]
elseif pos:sub(-1) == "f" then
elseif pos:sub(-1) == "f" then
pos = pos:sub(1, -2)
pos = pos:sub(1, -2)
pos = (data.pos_aliases[pos] or pos) .. " forms"
pos = (m_headword_data.pos_aliases[pos] or pos) .. " forms"
end
end
return export.pluralize_pos(pos)
return export.pluralize_pos(pos)
Line 626: Line 709:
local typ = type(data[element])
local typ = type(data[element])
if typ ~= "table" then
if typ ~= "table" then
error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
error(("Internal error: In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
end
end
for k, v in pairs(data[element]) do
for k, v in pairs(data[element]) do
if k ~= "maxindex" then
if k ~= "maxindex" then
if type(k) ~= "number" then
if type(k) ~= "number" then
error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
error(("Internal error: Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
end
end
if k > maxind then
if k > maxind then
Line 638: Line 721:
if v then
if v then
if type(v) ~= "string" then
if type(v) ~= "string" then
error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
error(("Internal error: For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
end
end
if not allow_blank_string and v == "" then
if not allow_blank_string and v == "" then
error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
error(("Internal error: For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
end
end
end
end
Line 693: Line 776:


function export.maintenance_cats(page, lang, lang_cats, page_cats)
function export.maintenance_cats(page, lang, lang_cats, page_cats)
for _, cat in ipairs(page.cats) do
extend(page_cats, page.cats)
insert(page_cats, cat)
end
lang = lang:getFull() -- since we are just generating categories
lang = lang:getFull() -- since we are just generating categories
local canonical = lang:getCanonicalName()
local canonical = lang:getCanonicalName()
Line 727: Line 808:


if data.getCanonicalName then
if data.getCanonicalName then
error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
end
end


if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
end
end


if data.id and type(data.id) ~= "string" then
if data.id and type(data.id) ~= "string" then
error("The id in the data table should be a string.")
error("Internal error: The id in the data table should be a string.")
end
end


Line 745: Line 826:
local full_langname = data.lang:getFullName()
local full_langname = data.lang:getFullName()


local raw_pagename, page = data.pagename
local raw_pagename = data.pagename
if raw_pagename and raw_pagename ~= (m_data or get_data()).pagename then -- for testing, doc pages, etc.
local page
page = process_page(raw_pagename)
local m_headword_data = m_data or get_data()
if raw_pagename and raw_pagename ~= m_headword_data.pagename then -- for testing, doc pages, etc.
-- data.pagename is often set on documentation and test pages through the pagename= parameter of various
-- templates, to emulate running on that page. Having a large number of such test templates on a single
-- page often leads to timeouts, because we fetch and parse the contents of each page in turn. However,
-- we don't really need to do that and can function fine without fetching and parsing the contents of a
-- given page, so turn off content fetching/parsing (and also setting the DEFAULTSORT key through a parser
-- function, which is *slooooow*) in certain namespaces where test and documentation templates are likely to
-- be found and where actual content does not live (User, Template, Module).
local actual_namespace = m_headword_data.page.namespace
local no_fetch_content = actual_namespace == "User" or actual_namespace == "Template" or
actual_namespace == "Module"
page = process_page(raw_pagename, no_fetch_content)
else
else
page = (m_data or get_data()).page
page = m_headword_data.page
end
end


-- Check the namespace against the language type.
local namespace = page.namespace
local namespace = page.namespace
if namespace == "" then
if data.lang:hasType("reconstructed") then
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
elseif data.lang:hasType("appendix-constructed") then
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
end
elseif namespace == "Citations" or namespace == "Thesaurus" then
error("Headword templates should not be used in the " .. namespace .. ": namespace.")
end


------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
Line 769: Line 852:
-- new-style
-- new-style
if data.translits or data.transcriptions then
if data.translits or data.transcriptions then
error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
error("Internal error: In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
end
end
else
else
Line 802: Line 885:
init_and_find_maximum_index(data, "whole_page_categories")
init_and_find_maximum_index(data, "whole_page_categories")
local pos_category_already_present = false
local pos_category_already_present = false
if #data.categories > 0 then
if data.categories[1] then
local escaped_langname = pattern_escape(full_langname)
local escaped_langname = pattern_escape(full_langname)
local matches_lang_pattern = "^" .. escaped_langname .. " "
local matches_lang_pattern = "^" .. escaped_langname .. " "
Line 816: Line 899:


if not data.pos_category then
if not data.pos_category then
error("`data.pos_category` not specified and could not be inferred from the categories given in "
error("Internal error: `data.pos_category` not specified and could not be inferred from the categories given in "
.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
Line 846: Line 929:


------------ 5. Create a default headword, and add links to multiword page names. ------------
------------ 5. Create a default headword, and add links to multiword page names. ------------
-- Determine if this is an "anti-asterisk" term, i.e. an attested term in a language that must normally be
-- reconstructed.
local is_anti_asterisk = data.heads[1].term and data.heads[1].term:find("^!!")
local lang_reconstructed = data.lang:hasType("reconstructed")
if is_anti_asterisk then
if not lang_reconstructed then
error("Anti-asterisk feature (head= beginning with !!) can only be used with reconstructed languages")
end
lang_reconstructed = false
end


-- Determine if term is reconstructed
-- Determine if term is reconstructed
local is_reconstructed = namespace == "Reconstruction" or data.lang:hasType("reconstructed")
local is_reconstructed = namespace == "Reconstruction" or lang_reconstructed


-- Create a default headword based on the pagename, which is determined in
-- Create a default headword based on the pagename, which is determined in
Line 856: Line 951:
-- Add links to multi-word page names when appropriate
-- Add links to multi-word page names when appropriate
if not (is_reconstructed or data.nolinkhead) then
if not (is_reconstructed or data.nolinkhead) then
local no_links = (m_data or get_data()).no_multiword_links
local no_links = m_headword_data.no_multiword_links
if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then
if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then
default_head = export.add_multiword_links(default_head, true)
default_head = export.add_multiword_links(default_head, true)
Line 866: Line 961:
end
end


------------ 6. Fill in missing values in `data.heads`. ------------
------------ 6. Check the namespace against the language type. ------------
 
if namespace == "" then
if lang_reconstructed then
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
elseif data.lang:hasType("appendix-constructed") then
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
end
elseif namespace == "Citations" or namespace == "Thesaurus" then
error("Headword templates should not be used in the " .. namespace .. ": namespace.")
end
 
------------ 7. Fill in missing values in `data.heads`. ------------


-- True if any script among the headword scripts has spaces in it.
-- True if any script among the headword scripts has spaces in it.
Line 875: Line 982:
for _, head in ipairs(data.heads) do
for _, head in ipairs(data.heads) do


------ 6a. If missing head, replace with default head.
------ 7a. If missing head, replace with default head.
if not head.term then
if not head.term then
head.term = default_head
head.term = default_head
elseif head.term == default_head then
elseif head.term == default_head then
has_redundant_head_param = true
has_redundant_head_param = true
elseif is_anti_asterisk and head.term == "!!" then
-- If explicit head=!! is given, it's an anti-asterisk term and we fill in the default head.
head.term = "!!" .. default_head
elseif head.term:find("^[!?]$") then
elseif head.term:find("^[!?]$") then
-- If explicit head= just consists of ! or ?, add it to the end of the default head.
-- If explicit head= just consists of ! or ?, add it to the end of the default head.
head.term = default_head .. head.term
head.term = default_head .. head.term
end
end
head.term_no_initial_bang_bang = is_anti_asterisk and head.term:sub(3) or head.term


if is_reconstructed then
if is_reconstructed then
Line 895: Line 1,006:
end
end


------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------ 7b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------    otherwise fall back to the overall script if given. If neither given, autodetect the script.
------    otherwise fall back to the overall script if given. If neither given, autodetect the script.


Line 913: Line 1,024:
-- Track uses of sc parameter.
-- Track uses of sc parameter.
if head.sc:getCode() == auto_sc:getCode() then
if head.sc:getCode() == auto_sc:getCode() then
insert(data.categories, full_langname .. " terms with redundant script codes")
if not data.no_script_code_cat then
insert(data.categories, full_langname .. " terms with redundant script codes")
end
else
else
insert(data.categories, full_langname .. " terms with non-redundant manual script codes")
if not data.no_script_code_cat then
insert(data.categories, full_langname .. " terms with non-redundant manual script codes")
end
end
end
end
end
Line 929: Line 1,044:
any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()
any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()


------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given
------ 7c. Create automatic transliterations for any non-Latin headwords without manual translit given
------    (provided automatic translit is available, e.g. not in Persian or Hebrew).
------    (provided automatic translit is available, e.g. not in Persian or Hebrew).


Line 940: Line 1,055:
head.tr = nil
head.tr = nil
else
else
local notranslit = (m_data or get_data()).notranslit
local notranslit = m_headword_data.notranslit
if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then
if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then
head.tr_manual = not not head.tr
head.tr_manual = not not head.tr


local text = head.term
local text = head.term_no_initial_bang_bang
if not data.lang:link_tr(head.sc) then
if not data.lang:link_tr(head.sc) then
text = remove_links(text)
text = remove_links(text)
end
end


local automated_tr, tr_categories
local automated_tr = data.lang:transliterate(text, head.sc)
automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)


if automated_tr or head.tr_fail then
if automated_tr then
local manual_tr = head.tr
local manual_tr = head.tr


if manual_tr then
if manual_tr then
if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then
if remove_links(manual_tr) == remove_links(automated_tr) then
insert(data.categories, full_langname .. " terms with redundant transliterations")
insert(data.categories, full_langname .. " terms with redundant transliterations")
elseif not head.tr_fail then
else
insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
end
end
Line 965: Line 1,079:
if not manual_tr then
if not manual_tr then
head.tr = automated_tr
head.tr = automated_tr
for _, category in ipairs(tr_categories) do
insert(data.categories, category)
end
end
end
end
end
Line 996: Line 1,107:
end
end


------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------
------------ 8. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------


-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
Line 1,012: Line 1,123:
if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
elseif page_non_ascii and (m_data or get_data()).toBeTagged[dt_script_code]
elseif page_non_ascii and m_headword_data.toBeTagged[dt_script_code]
or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
Line 1,051: Line 1,162:
end
end


------------ 8. Insert additional categories. ------------
------------ 9. Insert additional categories. ------------


if has_redundant_head_param then
if has_redundant_head_param then
if not data.no_redundant_head_cat then
if not data.no_redundant_head_cat then
insert(data.categories, full_langname .. " terms with redundant head parameter")
-- This is not the right way to go about this; too many exceptions and problems due to language-specific headword
-- handling customization. If we want this, it should be opt-in by a given language passing in the default headword.
-- insert(data.categories, full_langname .. " terms with redundant head parameter")
end
end
end
end
Line 1,061: Line 1,174:
-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then
local no_multiword_cat = (m_data or get_data()).no_multiword_cat
local no_multiword_cat = m_headword_data.no_multiword_cat
if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then
if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
-- Use the pagename, not the head= value, because the latter may have extra
-- Use the pagename, not the head= value, because the latter may have extra
-- junk in it, e.g. superscripted text that throws off the algorithm.
-- junk in it, e.g. superscripted text that throws off the algorithm.
local no_hyphen = (m_data or get_data()).hyphen_not_multiword_sep
local no_hyphen = m_headword_data.hyphen_not_multiword_sep
-- Exclude hyphens if the data module states that they should for this language.
-- Exclude hyphens if the data module states that they should for this language.
local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."
local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."
if umatch(page.pagename, checkpattern) and not non_categorizable(page.full_raw_pagename) then
local is_multiword = umatch(page.pagename, checkpattern)
 
if is_multiword and not non_categorizable(page.full_raw_pagename) then
insert(data.categories, full_langname .. " multiword terms")
insert(data.categories, full_langname .. " multiword terms")
elseif not is_multiword then
local long_word_threshold = m_headword_data.long_word_thresholds[langcode] or
m_headword_data.long_word_thresholds[full_langcode]
if long_word_threshold and ulen(page.pagename) >= long_word_threshold then
insert(data.categories, "Long " .. full_langname .. " words")
end
end
end
end
end
end
end


if data.sccat then
local default_sccat = m_headword_data.default_sccat
if data.sccat or data.sccat == nil and (default_sccat[langcode] or default_sccat[full_langcode]) then
for _, head in ipairs(data.heads) do
for _, head in ipairs(data.heads) do
insert(data.categories, full_langname .. " " .. data.pos_category .. " in " ..
insert(data.categories, full_langname .. " " .. data.pos_category .. " in " ..
Line 1,088: Line 1,210:
-- values.
-- values.
local characters_to_ignore = {
local characters_to_ignore = {
["aaq"] = "α", -- Penobscot
["aaq"] = "αάὰ", -- Penobscot (Algonquian)
["acy"] = "δθ", -- Cypriot Arabic
["acy"] = "δθ", -- Cypriot Arabic
["anc"] = "γ", -- Ngas
["aez"] = "β", -- Aeka (Trans-New Guinea)
["aou"] = "χ", -- A'ou
["anc"] = "γ", -- Ngas (Chadic/Afroasiatic)
["awg"] = "β", -- Anguthimri
["aou"] = "χ", -- A'ou (Kra-Dai)
["bhp"] = "β", -- Bima
["art-blk"] = "ч", -- Bolak (conlang)
["byk"] = "θ", -- Biao
["awg"] = "β", -- Anguthimri (Pama-Nyungan)
["cdy"] = "θ", -- Chadong
["az"] = "ь", -- Azerbaijani (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["clm"] = "χ", -- Klallam
["ba"] = "ь", -- Bashkir (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["col"] = "χ", -- Colombia-Wenatchi
["bhp"] = "β", -- Bima (Austronesian)
["coo"] = "χ", -- Comox; FIXME: others? E.g. Greek theta (θ)?
["bjz"] = "β", -- Baruga (Trans-New Guinea)
["ets"] = "θ", -- Yekhee
["byk"] = "θ", -- Biao (Kra-Dai)
["gmw-gts"] = "χ", -- Gottscheerish
["cdy"] = "θ", -- Chadong (Kra-Dai)
["hur"] = "θ", -- Halkomelem
["chp"] = "θ", -- Chipewyan (Athabaskan)
["izh"] = "ь", -- Ingrian
["cjh"] = "χ", -- Upper Chehalis (Salishan)
["kic"] = "θ", -- Kickapoo
["clm"] = "χ", -- Klallam (Salishan)
["lil"] = "χ", -- Lillooet
["col"] = "χ", -- Colombia-Wenatchi (Salishan)
["coo"] = "χθ", -- Comox (Salishan)
["crx"] = "θ", -- Carrier (Athabaskan)
["ets"] = "θ", -- Yekhee (Edoid/Niger-Congo)
["ett"] = "χ", -- Etruscan (isolate; in romanizations)
["fla"] = "χ", -- Montana Salish (Salishan)
["grt"] = "་", -- Garo (South Asian Sino-Tibetan)
["gmw-gts"] = "χ", -- Gottscheerish (Bavarian variant spoken in Slovenia)
["hur"] = "χθ", -- Halkomelem (Salishan)
["itc-psa"] = "f", -- Pre-Samnite (Italic; normally written in Greek)
["izh"] = "ь", -- Ingrian (Finnic)
["kic"] = "θ", -- Kickapoo (Algonquian)
["kk"] = "ь", -- Kazakh (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["ky"] = "ь", -- Kyrgyz (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["lil"] = "χ", -- Lillooet (Salishan)
["lsi"] = "ꓹ", -- Lashi (Lolo-Burmese/Sino-Tibetan; represents a glottal stop)
["mhz"] = "β", -- Mor (Austronesian)
["mhz"] = "β", -- Mor (Austronesian)
["neg"]=  "ӡ", -- Negidal (normally in Cyrillic)
["mqn"] = "β", -- Moronene (Austronesian)
["oui"] = "γβ", -- Old Uyghur: FIXME: others? E.g. Greek delta (δ)?
["neg"]=  "ӡā", -- Negidal (Tungusic; normally in Cyrillic)
["pox"] = "χ", -- Polabian
["oka"] = "χ", -- Okanagan (Salishan)
["rom"] = "Θθ", -- Romani: International Standard; two different thetas???
["ole"] = "θ", -- Olekha (Sino-Tibetan)
["sah"] = "ь", -- Yakut (1929 - 1939 Latin spelling)
["oui"] = "γβ", -- Old Uyghur (Turkic; FIXME: others? E.g. Greek delta (δ)?)
["sjw"] = "θ", -- Shawnee
["pox"] = "χ", -- Polabian (West Slavic)
["squ"] = "χ", -- Squamish
["rif"] = "ε", -- Tarifit (Berber)
["str"] = "χθ", -- Saanich; uses two Greek letters
["rom"] = "Θθ", -- Romani (Indic: International Standard; two different thetas???)
["twa"] = "χ", -- Twana
["rpn"] = "β", -- Repanbitip (Austronesian)
["yha"] = "θ", -- Baha
["sah"] = "ь", -- Yakut (Turkic; 1929 - 1939 Latin spelling)
["za"] = "зч", -- Zhuang; 1957-1982 alphabet used two Cyrillic letters (as well as some others like
["sit-jap"] = "χ", -- Japhug (Sino-Tibetan)
["sjw"] = "θ", -- Shawnee (Algonquian)
["squ"] = "χ", -- Squamish (Salishan)
["str"] = "χθ", -- Saanich (Salishan)
["teh"] = "χ", -- Tehuelche (Chonan; spoken in Argentina)
["tep"] = "η", -- Tepecano (Uto-Aztecan)
["thp"] = "χ", -- Thompson (Salishan)
["tk"] = "ь", -- Turkmen (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["tt"] = "ь", -- Kazakh (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["twa"] = "χ", -- Twana (Salishan)
["wbl"] = "ы", -- Wakhi (Iranian)
["xbc"] = "ϸ", -- Bactrian (Iranian; represents š; normally written in Greek)
["yha"] = "θ", -- Baha (Kra-Dai)
["za"] = "зч", -- Zhuang (Tai/Kra-Dai); 1957-1982 alphabet used two Cyrillic letters (as well as some others like
  -- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin)
  -- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin)
["zlw-slv"] = "χђћ", -- Slovincian; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure
["zlw-slv"] = "χђћ", -- Slovincian (West Slavic; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure
-- the correct characters are being chosen in the entry names
-- the correct characters are being chosen in the entry names)
["zng"] = "θ", -- Mang
["zng"] = "θ", -- Mang (Mon-Khmer)
["ztp"] = "θ", -- Loxicha Zapotec (Zapotecan)
}
}
-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude
-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude
Line 1,275: Line 1,426:
and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
insert(data.categories, full_langname .. " palindromes")
insert(data.categories, full_langname .. " palindromes")
end
-- Add red link category if called for and we're not a "large" page, where such checks are disabled.
if data.checkredlinks and not m_headword_data.large_pages[m_headword_data.pagename] then
local plposcat = type(data.checkredlinks) == "string" and data.checkredlinks or data.pos_category
check_red_link_inflections_top_level(data, plposcat)
end
end


Line 1,280: Line 1,437:
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)


------------ 9. Format and return headwords, genders, inflections and categories. ------------
------------ 10. Format and return headwords, genders, inflections and categories. ------------


-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
Line 1,287: Line 1,444:
format_headword(data) ..
format_headword(data) ..
format_headword_genders(data) ..
format_headword_genders(data) ..
format_inflections(data) .. '</span>'
format_top_level_inflections(data) .. '</span>'


-- Language-specific categories.
-- Language-specific categories.