Module:headword: Difference between revisions

Jump to navigation Jump to search
no edit summary
No edit summary
No edit summary
Line 2: Line 2:


-- Named constants for all modules used, to make it easier to swap out sandbox versions.
-- Named constants for all modules used, to make it easier to swap out sandbox versions.
local en_utilities_module = "Module:en-utilities"
local gender_and_number_module = "Module:getn"
local gender_and_number_module = "Module:getn"
local headword_data_module = "Module:headword/data"
local headword_data_module = "Module:headword/data"
local headword_page_module = "Module:headword/page"
local headword_page_module = "Module:headword/page"
local links_module = "Module:links"
local links_module = "Module:links"
local load_module = "Module:load"
local pages_module = "Module:pages"
local palindromes_module = "Module:palindromes"
local palindromes_module = "Module:palindromes"
local qualifier_module = "Module:qualifier"
local pron_qualifier_module = "Module:pron qualifier"
local scripts_module = "Module:scripts"
local scripts_module = "Module:scripts"
local scripts_data_module = "Module:scripts/data"
local scripts_data_module = "Module:scripts/data"
Line 15: Line 18:
local table_module = "Module:table"
local table_module = "Module:table"
local utilities_module = "Module:utilities"
local utilities_module = "Module:utilities"
local m_str_utils = require(string_utilities_module)


local concat = table.concat
local concat = table.concat
local encode_entities = m_str_utils.encode_entities
local insert = table.insert
local insert = table.insert
local ipairs = ipairs
local ipairs = ipairs
local max = math.max
local new_title = mw.title.new
local pairs = pairs
local pairs = pairs
local pattern_escape = m_str_utils.pattern_escape
local require = require
local rgmatch = mw.ustring.gmatch
local rsubn = mw.ustring.gsub
local rfind = mw.ustring.find
local ulen = m_str_utils.len
local rmatch = mw.ustring.match
local toNFC = mw.ustring.toNFC
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local toNFD = mw.ustring.toNFD
local type = type
local ufind = mw.ustring.find
local ugmatch = mw.ustring.gmatch
local ugsub = mw.ustring.gsub
local umatch = mw.ustring.match
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
-- Lazy loader for [[Module:string utilities]].encode_entities: the first
-- call requires the target module, replaces this stub with the real
-- function, and forwards the arguments, so later calls carry no overhead.
local function encode_entities(...)
	local fn = require(string_utilities_module).encode_entities
	encode_entities = fn
	return fn(...)
end
-- Lazy loader: on first call, overwrites itself with
-- [[Module:scripts]].findBestScriptWithoutLang and forwards the call;
-- subsequent calls go straight to the target function.
local function find_best_script_without_lang(...)
find_best_script_without_lang = require(scripts_module).findBestScriptWithoutLang
return find_best_script_without_lang(...)
end
-- Lazy loader for [[Module:utilities]].format_categories; swaps itself out
-- for the real function on first use.
local function format_categories(...)
	local fn = require(utilities_module).format_categories
	format_categories = fn
	return fn(...)
end
-- Lazy loader: on first call, overwrites itself with the gender/number
-- formatter from the module named by `gender_and_number_module` and
-- forwards the call.
local function format_genders(...)
format_genders = require(gender_and_number_module).format_genders
return format_genders(...)
end
-- Lazy loader for [[Module:pron qualifier]].format_qualifiers; the stub is
-- replaced by the target function the first time it is invoked.
local function format_pron_qualifiers(...)
	local fn = require(pron_qualifier_module).format_qualifiers
	format_pron_qualifiers = fn
	return fn(...)
end
-- Lazy loader: on first call, overwrites itself with
-- [[Module:links]].full_link and forwards the call.
local function full_link(...)
full_link = require(links_module).full_link
return full_link(...)
end
-- Lazy loader for [[Module:pages]].get_current_L2; loads the module only
-- when the function is actually needed, then calls the target directly.
local function get_current_L2(...)
	local fn = require(pages_module).get_current_L2
	get_current_L2 = fn
	return fn(...)
end


local m_data = mw.loadData(headword_data_module)
-- Lazy loader: on first call, overwrites itself with
-- [[Module:links]].get_link_page and forwards the call.
local function get_link_page(...)
get_link_page = require(links_module).get_link_page
return get_link_page(...)
end


local isLemma = m_data.lemmas
local function get_script(...)
local isNonLemma = m_data.nonlemmas
get_script = require(scripts_module).getByCode
local notranslit = m_data.notranslit
return get_script(...)
local toBeTagged = m_data.toBeTagged
end
 
-- Lazy loader for [[Module:palindromes]].is_palindrome; replaces itself
-- with the real function on first use.
local function is_palindrome(...)
	local fn = require(palindromes_module).is_palindrome
	is_palindrome = fn
	return fn(...)
end
 
-- Lazy loader: on first call, overwrites itself with
-- [[Module:links]].language_link and forwards the call.
local function language_link(...)
language_link = require(links_module).language_link
return language_link(...)
end
 
-- Lazy loader for [[Module:load]].load_data; the stub requires the module
-- once and thereafter calls the target function directly.
local function load_data(...)
	local fn = require(load_module).load_data
	load_data = fn
	return fn(...)
end
 
-- Lazy loader: on first call, overwrites itself with
-- [[Module:string utilities]].pattern_escape and forwards the call.
local function pattern_escape(...)
pattern_escape = require(string_utilities_module).pattern_escape
return pattern_escape(...)
end
 
-- Lazy loader for [[Module:en-utilities]].pluralize; swaps itself for the
-- real function the first time it is called.
local function pluralize(...)
	local fn = require(en_utilities_module).pluralize
	pluralize = fn
	return fn(...)
end
 
-- Lazy loader: on first call, overwrites itself with
-- [[Module:headword/page]].process_page and forwards the call.
local function process_page(...)
process_page = require(headword_page_module).process_page
return process_page(...)
end
 
-- Lazy loader for [[Module:links]].remove_links; after the first call the
-- local name refers to the target function itself.
local function remove_links(...)
	local fn = require(links_module).remove_links
	remove_links = fn
	return fn(...)
end
 
-- Lazy loader: on first call, overwrites itself with
-- [[Module:table]].shallowCopy and forwards the call.
local function shallow_copy(...)
shallow_copy = require(table_module).shallowCopy
return shallow_copy(...)
end
 
-- Lazy loader: on first call, overwrites itself with the tag_text function
-- from the script-utilities module and forwards the call.
-- NOTE(review): `script_utilities_module` is not declared in the constants
-- visible in this chunk — presumably defined alongside the other module
-- name constants; confirm.
local function tag_text(...)
tag_text = require(script_utilities_module).tag_text
return tag_text(...)
end
 
-- Lazy loader for tag_transcription from the script-utilities module;
-- replaces itself with the target function on first use.
local function tag_transcription(...)
	local fn = require(script_utilities_module).tag_transcription
	tag_transcription = fn
	return fn(...)
end
 
-- Lazy loader: on first call, overwrites itself with the tag_translit
-- function from the script-utilities module and forwards the call.
local function tag_translit(...)
tag_translit = require(script_utilities_module).tag_translit
return tag_translit(...)
end
 
-- Lazy loader for [[Module:string utilities]].trim; the stub is replaced
-- by the target function after the first invocation.
local function trim(...)
	local fn = require(string_utilities_module).trim
	trim = fn
	return fn(...)
end
 
-- Lazy loader: on first call, overwrites itself with
-- [[Module:string utilities]].len (Unicode-aware string length) and
-- forwards the call.
local function ulen(...)
ulen = require(string_utilities_module).len
return ulen(...)
end
 
--[==[
Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]
-- Cached data table from [[Module:headword/data]]; populated on demand.
local m_data
-- Load and memoize the headword data. Callers use the idiom
-- `m_data or get_data()`, so the loader runs at most once.
local function get_data()
	local data = load_data(headword_data_module)
	m_data = data
	return data
end
 
-- Cached data table from [[Module:scripts/data]]; populated on demand via
-- the `script_data or get_script_data()` idiom.
local script_data
local function get_script_data()
	local data = load_data(scripts_data_module)
	script_data = data
	return data
end
 
-- Cached script-utilities data table; populated on demand via the
-- `script_utilities_data or get_script_utilities_data()` idiom.
-- NOTE(review): `script_utilities_data_module` is not declared in the
-- constants visible in this chunk — presumably defined with the other
-- module name constants; confirm.
local script_utilities_data
local function get_script_utilities_data()
script_utilities_data = load_data(script_utilities_data_module)
return script_utilities_data
end


-- If set to true, categories always appear, even in non-mainspace pages
-- If set to true, categories always appear, even in non-mainspace pages
local test_force_categories = false
local test_force_categories = false
-- Version of rsubn() that discards all but the first return value.
-- Wrapper around rsubn (mw.ustring.gsub) that returns only the substituted
-- string, dropping the replacement count that gsub also returns.
local function rsub(term, foo, bar)
	local result = rsubn(term, foo, bar)
	return result
end


local function text_in_script(text, script_code)
local function text_in_script(text, script_code)
local sc = require(scripts_module).getByCode(script_code)
local sc = get_script(script_code)
if not sc then
if not sc then
error("Internal error: Bad script code " .. script_code)
error("Internal error: Bad script code " .. script_code)
Line 56: Line 179:
local out
local out
if characters then
if characters then
text = rsub(text, "%W", "")
text = ugsub(text, "%W", "")
out = rfind(text, "[" .. characters .. "]")
out = ufind(text, "[" .. characters .. "]")
end
end


Line 71: Line 194:
--[[ List of punctuation or spacing characters that are found inside of words.
--[[ List of punctuation or spacing characters that are found inside of words.
Used to exclude characters from the regex above. ]]
Used to exclude characters from the regex above. ]]
local wordPunc = "-־׳״'.·*’་•:᠊"
local wordPunc = "-#%%&@־׳״'.·*’་•:᠊"
local notWordPunc = "[^" .. wordPunc .. "]+"
local notWordPunc = "[^" .. wordPunc .. "]+"




-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, references or
-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, labels, references
-- customized separator: `part` is the object specifying the term, which should optionally contain:
-- or customized separator: `part` is the object specifying the term (and `lang` the language of the term), which should
-- * left qualifiers in `q`, an array of strings (or `qualifiers` for compatibility purposes);
-- optionally contain:
-- * left qualifiers in `q`, an array of strings;
-- * right qualifiers in `qq`, an array of strings;
-- * right qualifiers in `qq`, an array of strings;
-- * left labels in `l`, an array of strings;
-- * right labels in `ll`, an array of strings;
-- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text`
-- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text`
--  (formatted reference text) and optionally `name` and/or `group`;
--  (formatted reference text) and optionally `name` and/or `group`;
-- * a separator in `separator`, defaulting to " <i>or</i> " if this is not the first term (j > 1), otherwise "".
-- * a separator in `separator`, defaulting to " <i>or</i> " if this is not the first term (j > 1), otherwise "".
-- `formatted` is the formatted version of the term itself, and `j` is the index of the term.
-- `formatted` is the formatted version of the term itself, and `j` is the index of the term.
local function format_term_with_qualifiers_and_refs(part, formatted, j)
local left_qualifiers, right_qualifiers
local reftext


left_qualifiers = part.q and #part.q > 0 and part.q
local function format_term_with_qualifiers_and_refs(lang, part, formatted, j)
if left_qualifiers then
local function part_non_empty(field)
left_qualifiers = require(qualifier_module).format_qualifier(left_qualifiers) .. " "
local list = part[field]
if not list then
return nil
end
if type(list) ~= "table" then
error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, mw.dumpObject(list)))
end
return list[1]
end
end


right_qualifiers = part.qq and #part.qq > 0 and part.qq
if part_non_empty("q") or part_non_empty("qq") or part_non_empty("l") or
if right_qualifiers then
part_non_empty("ll") or part_non_empty("refs") then
right_qualifiers = " " .. require(qualifier_module).format_qualifier(right_qualifiers)
formatted = format_pron_qualifiers {
end
lang = lang,
if part.refs and #part.refs > 0 then
text = formatted,
local refs = {}
q = part.q,
for _, ref in ipairs(part.refs) do
qq = part.qq,
if type(ref) ~= "table" then
l = part.l,
ref = {text = ref}
ll = part.ll,
end
refs = part.refs,
local refargs
}
if ref.name or ref.group then
refargs = {name = ref.name, group = ref.group}
end
insert(refs, mw.getCurrentFrame():extensionTag("ref", ref.text, refargs))
end
reftext = concat(refs)
end
end


local separator = part.separator or j > 1 and " <i>or</i> " -- use "" to request no separator
local separator = part.separator or j > 1 and " <i>or</i> " -- use "" to request no separator


if left_qualifiers then
formatted = left_qualifiers .. formatted
end
if reftext then
formatted = formatted .. reftext
end
if right_qualifiers then
formatted = formatted .. right_qualifiers
end
if separator then
if separator then
formatted = separator .. formatted
formatted = separator .. formatted
Line 132: Line 247:
--[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==]
--[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==]
function export.head_is_multiword(head)
function export.head_is_multiword(head)
for possibleWordBreak in rgmatch(head, spacingPunctuation) do
for possibleWordBreak in ugmatch(head, spacingPunctuation) do
if rmatch(possibleWordBreak, notWordPunc) then
if umatch(possibleWordBreak, notWordPunc) then
return true
return true
end
end
Line 141: Line 256:
end
end


 
do
--[==[Add links to a multiword head.]==]
function export.add_multiword_links(head, default)
local function workaround_to_exclude_chars(s)
local function workaround_to_exclude_chars(s)
return rsub(s, notWordPunc, "\2%1\1")
return (ugsub(s, notWordPunc, "\2%1\1"))
end
end


head = "\1" .. rsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
--[==[Add links to a multiword head.]==]
if default then
function export.add_multiword_links(head, default)
head = head
head = "\1" .. ugsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
if default then
:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
head = head
end
:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
end


--Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
--Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
head = encode_entities(head, "[]", true, true)
head = encode_entities(head, "[]", true, true)


--[=[
--[=[
use this when workaround is no longer needed:
use this when workaround is no longer needed:


head = "[[" .. rsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"
head = "[[" .. ugsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"


Remove any empty links, which could have been created above
Remove any empty links, which could have been created above
at the beginning or end of the string.
at the beginning or end of the string.
]=]
]=]
return (head
return (head
:gsub("\1\2", "")
:gsub("\1\2", "")
:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
end
end
end




local function non_categorizable(full_raw_pagename)
local function non_categorizable(full_raw_pagename)
return full_raw_pagename:find("^Appendix:Gestures/")
return full_raw_pagename:find("^Appendix:Gestures/") or
-- Unsupported titles with descriptive names.
(full_raw_pagename:find("^Unsupported titles/") and not full_raw_pagename:find("`"))
end
end


-- Wrap an already-formatted head term (`formatted`, the `j`-th head) in the
-- language/script tagging for `data.lang`/`head.sc`, attaching `data.id` as
-- an anchor only on the first head, then append the head's qualifiers,
-- labels, references and separator via format_term_with_qualifiers_and_refs.
local function tag_text_and_add_quals_and_refs(data, head, formatted, j)
-- Add language and script wrapper.
formatted = tag_text(formatted, data.lang, head.sc, "head", nil, j == 1 and data.id or nil)
-- Add qualifiers, labels, references and separator.
return format_term_with_qualifiers_and_refs(data.lang, head, formatted, j)
end


-- Format a headword with transliterations.
-- Format a headword with transliterations.
local function format_headword(data)
local function format_headword(data)
local m_scriptutils = require(script_utilities_module)
-- Are there non-empty transliterations?
-- Are there non-empty transliterations?
local has_translits = false
local has_translits = false
Line 204: Line 326:
-- Apply processing to the headword, for formatting links and such.
-- Apply processing to the headword, for formatting links and such.
if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Image" then
if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Image" then
formatted = require(links_module).language_link{term = head.term, lang = data.lang}
formatted = language_link{term = head.term, lang = data.lang}
else
else
formatted = data.lang:makeDisplayText(head.term, head.sc, true)
formatted = data.lang:makeDisplayText(head.term, head.sc, true)
end
end


local function tag_text_and_add_quals_and_refs(head, formatted, j)
local head_part = tag_text_and_add_quals_and_refs(data, head, formatted, j)
-- Add language and script wrapper.
formatted = m_scriptutils.tag_text(formatted, data.lang, head.sc, "head", nil, j == 1 and data.id or nil)
 
-- Add qualifiers, references and separator.
return format_term_with_qualifiers_and_refs(head, formatted, j)
end
 
local head_part = tag_text_and_add_quals_and_refs(head, formatted, j)
insert(head_parts, head_part)
insert(head_parts, head_part)


Line 227: Line 341:
unique_head_part = head_part
unique_head_part = head_part
else
else
unique_head_part = tag_text_and_add_quals_and_refs(head, formatted, 1)
unique_head_part = tag_text_and_add_quals_and_refs(data, head, formatted, 1)
end
end
unique_head_parts[unique_head_part] = true
unique_head_parts[unique_head_part] = true
Line 255: Line 369:
local this_parts = {}
local this_parts = {}
if head.tr then
if head.tr then
insert(this_parts, m_scriptutils.tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))
insert(this_parts, tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))
if head.ts then
if head.ts then
insert(this_parts, " ")
insert(this_parts, " ")
Line 261: Line 375:
end
end
if head.ts then
if head.ts then
insert(this_parts, "/" .. m_scriptutils.tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")
insert(this_parts, "/" .. tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")
end
end
insert(translit_parts, concat(this_parts))
insert(translit_parts, concat(this_parts))
Line 270: Line 384:


local langname = data.lang:getCanonicalName()
local langname = data.lang:getCanonicalName()
local transliteration_page = mw.title.new(langname .. " transliteration", "")
local transliteration_page = langname .. " transliteration"
local saw_translit_page = false
local saw_translit_page = false


Line 281: Line 395:
if not saw_translit_page and data.lang:hasType("etymology-only") then
if not saw_translit_page and data.lang:hasType("etymology-only") then
langname = data.lang:getFullName()
langname = data.lang:getFullName()
transliteration_page = mw.title.new(langname .. " transliteration", "")
transliteration_page = langname .. " transliteration"
 
if transliteration_page and transliteration_page.exists then
if transliteration_page and transliteration_page.exists then
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
Line 304: Line 418:




local function format_genders(data)
local function format_headword_genders(data)
local retval = ""
local retval = ""
if data.genders and #data.genders > 0 then
if data.genders and #data.genders > 0 then
Line 311: Line 425:
end
end
local pos_for_cat
local pos_for_cat
if not data.nogendercat and not m_data.no_gender_cat[data.lang:getCode()] and
if not data.nogendercat then
not m_data.no_gender_cat[data.lang:getFullCode()] then
local no_gender_cat = (m_data or get_data()).no_gender_cat
local pos_category = data.pos_category:gsub("^reconstructed ", "")
if not (no_gender_cat[data.lang:getCode()] or no_gender_cat[data.lang:getFullCode()]) then
pos_for_cat = m_data.pos_for_gender_number_cat[pos_category]
pos_for_cat = (m_data or get_data()).pos_for_gender_number_cat[data.pos_category:gsub("^reconstructed ", "")]
end
end
end
local text, cats = require(gender_and_number_module).format_genders(data.genders, data.lang, pos_for_cat)
local text, cats = format_genders(data.genders, data.lang, pos_for_cat)
for _, cat in ipairs(cats) do
for _, cat in ipairs(cats) do
insert(data.categories, cat)
insert(data.categories, cat)
Line 338: Line 453:
if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then
if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then
error("The face `" .. face .. "` " .. (
error("The face `" .. face .. "` " .. (
mw.loadData(script_utilities_data_module).faces[face] and
(script_utilities_data or get_script_utilities_data()).faces[face] and
"should not be used for non-headword terms on the headword line." or
"should not be used for non-headword terms on the headword line." or
"is invalid."
"is invalid."
Line 365: Line 480:
any_part_translit = true
any_part_translit = true
end
end
formatted = require(links_module).full_link(
formatted = full_link(
{
{
term = not nolinkinfl and part.term or nil,
term = not nolinkinfl and part.term or nil,
Line 371: Line 486:
lang = part.lang or data.lang,
lang = part.lang or data.lang,
sc = part.sc or parts.sc or nil,
sc = part.sc or parts.sc or nil,
gloss = part.gloss,
pos = part.pos,
lit = part.lit,
id = part.id,
id = part.id,
genders = part.genders,
genders = part.genders,
Line 381: Line 499:
end
end


parts[j] = format_term_with_qualifiers_and_refs(part, formatted, j)
parts[j] = format_term_with_qualifiers_and_refs(part.lang or data.lang, part,
formatted, j)
end
end


Line 421: Line 540:


--[==[
--[==[
-- Returns the plural form of `pos`, a raw part of speech input, which could be singular or
Returns the plural form of `pos`, a raw part of speech input, which could be singular or
-- plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to
plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to
-- "kanji").]==]
"kanji").
]==]
function export.pluralize_pos(pos)
function export.pluralize_pos(pos)
return m_data.irregular_plurals[pos] or
-- Make the plural form of the part of speech
return (m_data or get_data()).irregular_plurals[pos] or
pos:sub(-1) == "s" and pos or
pos:sub(-1) == "s" and pos or
-- Make the plural form of the part of speech
pluralize(pos)
require("Module:string utilities").pluralize(pos)
end
end


--[==[
--[==[
-- Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil
Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil
-- if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
-- If you have a POS in its singular form, call export.pluralize_pos() above to pluralize it
If you have a POS in its singular form, call {export.pluralize_pos()} above to pluralize it
-- in a smart fashion that knows when to add "-s" and when to add "-es", and also takes
in a smart fashion that knows when to add "-s" and when to add "-es", and also takes
-- into account any irregular plurals.]==]
into account any irregular plurals.
--
 
-- If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
-- based on whether it ends in " forms"; otherwise, return nil.]==]
based on whether it ends in " forms"; otherwise, return nil.
]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
function export.pos_lemma_or_nonlemma(plpos, best_guess)
local isLemma = (m_data or get_data()).lemmas
-- Is it a lemma category?
-- Is it a lemma category?
if isLemma[plpos] then
if isLemma[plpos] then
Line 450: Line 572:
end
end
-- Is it a nonlemma category?
-- Is it a nonlemma category?
local isNonLemma = (m_data or get_data()).nonlemmas
if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
return "non-lemma form"
return "non-lemma form"
Line 463: Line 586:
end
end


--[==[
Canonicalize a part of speech as specified in 2= in {{tl|head}}. This checks for POS aliases and non-lemma form
aliases ending in 'f', and then pluralizes if the POS term does not have an invariable plural.
]==]
-- Canonicalize a raw part-of-speech value: reject the ambiguous aliases
-- 'pre' and 'pro'/'prof', resolve POS aliases from the headword data,
-- expand a trailing 'f' into "<pos> forms", and return the pluralized POS.
function export.canonicalize_pos(pos)
-- FIXME: Temporary code to throw an error for alias 'pre' (= preposition) that will go away.
if pos == "pre" then
-- Don't throw error on 'pref' as it's an alias for "prefix".
error("POS 'pre' for 'preposition' no longer allowed as it's too ambiguous; use 'prep'")
end
-- Likewise for pro = pronoun.
if pos == "pro" or pos == "prof" then
error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
end
local data = m_data or get_data()
if data.pos_aliases[pos] then
pos = data.pos_aliases[pos]
elseif pos:sub(-1) == "f" then
-- Trailing 'f' marks a non-lemma form: strip it, resolve any alias on
-- the remainder, and append " forms".
pos = pos:sub(1, -2)
pos = (data.pos_aliases[pos] or pos) .. " forms"
end
return export.pluralize_pos(pos)
end


-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a
-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a
Line 518: Line 664:
-- sorted based on the default MediaWiki sortkey, so we check against
-- sorted based on the default MediaWiki sortkey, so we check against
-- that.
-- that.
if tbl == true then
if page.raw_defaultsort ~= sortkey then
insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
end
return
end
local redundant, different
local redundant, different
for k in pairs(tbl) do
for k in pairs(tbl) do
Line 525: Line 677:
different = true
different = true
end
end
end
if redundant then
insert(lang_cats, lang:getFullName() .. " terms with redundant sortkeys")
end
if different then
insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
end
end
return sortkey
return sortkey
end
end
 
function export.maintenance_cats(page, lang, lang_cats, page_cats)
function export.maintenance_cats(page, lang, lang_cats, page_cats)
for _, cat in ipairs(page.cats) do
for _, cat in ipairs(page.cats) do
Line 538: Line 696:
if tbl then
if tbl then
sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
insert(lang_cats, canonical .. " entries with topic categories using raw markup")
end
end
tbl = page.wikitext_langname_cat[canonical]
tbl = page.wikitext_langname_cat[canonical]
if tbl then
if tbl then
handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
insert(lang_cats, canonical .. " entries with language name categories using raw markup")
end
end
if require(utilities_module).get_current_L2() ~= canonical then
if get_current_L2() ~= canonical then
insert(lang_cats, canonical .. " entries with incorrect language header")
end
end
end
end
Line 555: Line 716:
]==]
]==]
function export.full_headword(data)
function export.full_headword(data)
local remove_links = require(links_module).remove_links
local format_categories = require(utilities_module).format_categories
-- Prevent data from being destructively modified.
-- Prevent data from being destructively modified.
local data = require(table_module).shallowcopy(data)
local data = shallow_copy(data)


------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------
------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------
Line 578: Line 736:


local langcode = data.lang:getCode()
local langcode = data.lang:getCode()
local full_langcode = langcode
local full_langcode = data.lang:getFullCode()
local langname = data.lang:getCanonicalName()
local langname = data.lang:getCanonicalName()
local full_langname = langname
local full_langname = data.lang:getFullName()


local raw_pagename, page = data.pagename
local raw_pagename, page = data.pagename
if raw_pagename and raw_pagename ~= m_data.pagename then -- for testing, doc pages, etc.
if raw_pagename and raw_pagename ~= (m_data or get_data()).pagename then -- for testing, doc pages, etc.
page = require(headword_page_module).process_page(raw_pagename)
page = process_page(raw_pagename)
else
else
page = m_data.page
page = (m_data or get_data()).page
end
end


-- Check the namespace against the language type.
-- Check the namespace against the language type.
if page.namespace == "" then
local namespace = page.namespace
if namespace == "" then
if data.lang:hasType("reconstructed") then
if data.lang:hasType("reconstructed") then
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
Line 596: Line 755:
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
end
end
elseif namespace == "Citations" or namespace == "Thesaurus" then
error("Headword templates should not be used in the " .. namespace .. ": namespace.")
end
end


Line 607: Line 768:
else
else
-- convert old-style `heads`, `translits` and `transcriptions` to new-style
-- convert old-style `heads`, `translits` and `transcriptions` to new-style
local maxind = math.max(
local maxind = max(
init_and_find_maximum_index(data, "heads"),
init_and_find_maximum_index(data, "heads"),
init_and_find_maximum_index(data, "translits", true),
init_and_find_maximum_index(data, "translits", true),
Line 627: Line 788:


------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------
------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------
-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]
if data.altform then
data.noposcat = true
end


init_and_find_maximum_index(data, "categories")
init_and_find_maximum_index(data, "categories")
Line 664: Line 830:
-- add an appropriate category.
-- add an appropriate category.
local postype = export.pos_lemma_or_nonlemma(data.pos_category)
local postype = export.pos_lemma_or_nonlemma(data.pos_category)
    if not data.noposcat then
if not data.noposcat then
    if postype == "lemma" then
    postype = data.lang:getMainCategoryName()
    end
insert(data.categories, 1, full_langname .. " " .. postype .. "s")
insert(data.categories, 1, full_langname .. " " .. postype .. "s")
end
-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]
if data.altform then
insert(data.categories, 1, full_langname .. " alternative forms")
end
end


Line 675: Line 842:


-- Determine if term is reconstructed
-- Determine if term is reconstructed
local is_reconstructed = page.namespace == "Reconstruction" or data.lang:hasType("reconstructed")
local is_reconstructed = namespace == "Reconstruction" or data.lang:hasType("reconstructed")


-- Create a default headword based on the pagename, which is determined in
-- Create a default headword based on the pagename, which is determined in
Line 682: Line 849:


-- Add links to multi-word page names when appropriate
-- Add links to multi-word page names when appropriate
if not data.nolinkhead and not m_data.no_multiword_links[langcode] and not m_data.no_multiword_links[full_langcode]
if not (is_reconstructed or data.nolinkhead) then
and not is_reconstructed and export.head_is_multiword(default_head) then
local no_links = (m_data or get_data()).no_multiword_links
default_head = export.add_multiword_links(default_head, true)
if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then
default_head = export.add_multiword_links(default_head, true)
end
end
end


if is_reconstructed then
if is_reconstructed then
default_head = "*" .. default_head
default_head = "*" .. default_head
default_head = default_head:gsub("%*%*", "*")
end
end


Line 706: Line 874:
elseif head.term == default_head then
elseif head.term == default_head then
has_redundant_head_param = true
has_redundant_head_param = true
elseif head.term:find("^[!?]$") then
-- If explicit head= just consists of ! or ?, add it to the end of the default head.
head.term = default_head .. head.term
end
end
 
if is_reconstructed then
if is_reconstructed then
local head_term = head.term
local head_term = head.term
if head_term:find("%[%[") then
if head_term:find("%[%[") then
head_term = require(links_module).remove_links(head_term)
head_term = remove_links(head_term)
end
end
if head_term:sub(1, 1) ~= "*" then
if head_term:sub(1, 1) ~= "*" then
Line 720: Line 891:
------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------    otherwise fall back to the overall script if given. If neither given, autodetect the script.
------    otherwise fall back to the overall script if given. If neither given, autodetect the script.
 
local auto_sc = data.lang:findBestScript(head.term)
local auto_sc = data.lang:findBestScript(head.term)
 
if (
auto_sc:getCode() == "None" and
find_best_script_without_lang(head.term):getCode() ~= "None"
) then
insert(data.categories, full_langname .. " terms in nonstandard scripts")
end
if not (head.sc or data.sc) then -- No script code given, so use autodetected script.
if not (head.sc or data.sc) then -- No script code given, so use autodetected script.
head.sc = auto_sc
head.sc = auto_sc
Line 728: Line 904:
if not head.sc then -- Overall script code given.
if not head.sc then -- Overall script code given.
head.sc = data.sc
head.sc = data.sc
end
-- Track uses of sc parameter.
if head.sc:getCode() == auto_sc:getCode() then
insert(data.categories, full_langname .. " terms with redundant script codes")
else
insert(data.categories, full_langname .. " terms with non-redundant manual script codes")
end
end
end
end
Line 734: Line 916:
if head.sc:hasNormalizationFixes() == true then
if head.sc:hasNormalizationFixes() == true then
local composed_head = toNFC(head.term)
local composed_head = toNFC(head.term)
if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then
insert(data.whole_page_categories, "Pages using discouraged character sequences")
end
end
end


Line 743: Line 928:
-- Make transliterations
-- Make transliterations
head.tr_manual = nil
head.tr_manual = nil


-- Try to generate a transliteration if necessary
-- Try to generate a transliteration if necessary
if head.tr == "-" then
if head.tr == "-" then
head.tr = nil
head.tr = nil
elseif not notranslit[langcode] and not notranslit[full_langcode] and head.sc:isTransliterated() then
else
head.tr_manual = not not head.tr
local notranslit = (m_data or get_data()).notranslit
if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then
head.tr_manual = not not head.tr


local text = head.term
local text = head.term
if not data.lang:link_tr(head.sc) then
if not data.lang:link_tr(head.sc) then
text = remove_links(text)
text = remove_links(text)
end
end


local automated_tr, tr_categories
local automated_tr, tr_categories
automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)
automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)


if automated_tr or head.tr_fail then
if automated_tr or head.tr_fail then
local manual_tr = head.tr
local manual_tr = head.tr


if not manual_tr then
if manual_tr then
head.tr = automated_tr
if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then
for _, category in ipairs(tr_categories) do
insert(data.categories, full_langname .. " terms with redundant transliterations")
insert(data.categories, category)
elseif not head.tr_fail then
insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
end
end
 
if not manual_tr then
head.tr = automated_tr
for _, category in ipairs(tr_categories) do
insert(data.categories, category)
end
end
end
end
end
end


-- There is still no transliteration?
-- There is still no transliteration?
-- Add the entry to a cleanup category.
-- Add the entry to a cleanup category.
if not head.tr then
if not head.tr then
head.tr = "<small>transliteration needed</small>"
head.tr = "<small>transliteration needed</small>"
-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.
-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.
-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].
-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].
insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")
insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")
else
else
-- Otherwise, trim it.
-- Otherwise, trim it.
head.tr = mw.text.trim(head.tr)
head.tr = trim(head.tr)
end
end
end
end
end
Line 784: Line 981:
-- Link to the transliteration entry for languages that require this.
-- Link to the transliteration entry for languages that require this.
if head.tr and data.lang:link_tr(head.sc) then
if head.tr and data.lang:link_tr(head.sc) then
head.tr = require(links_module).full_link {
head.tr = full_link{
term = head.tr,
term = head.tr,
lang = data.lang,
lang = data.lang,
sc = require(scripts_module).getByCode("Latn"),
sc = get_script("Latn"),
tr = "-"
tr = "-"
}
}
Line 803: Line 1,000:
-- the pagename, and that headwords that are in different scripts from the pagename aren't first. This seems to be
-- the pagename, and that headwords that are in different scripts from the pagename aren't first. This seems to be
-- about the best we can do (alternatively we could potentially do script detection on the pagename).
-- about the best we can do (alternatively we could potentially do script detection on the pagename).
local dt_script = data.lang:findBestScript(data.heads[1].term)
local dt_script = data.heads[1].sc
local dt_script_code = dt_script:getCode()
local dt_script_code = dt_script:getCode()
local page_non_ascii = page.namespace == "" and not page.pagename:find("^[%z\1-\127]+$")
local page_non_ascii = namespace == "" and not page.pagename:find("^[%z\1-\127]+$")
local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^Unsupported titles/", "")
local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^Unsupported titles/", "")
if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
elseif page_non_ascii and toBeTagged[dt_script_code]
elseif page_non_ascii and (m_data or get_data()).toBeTagged[dt_script_code]
or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
Line 816: Line 1,013:
elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then
elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then
display_title = '<span class="Hani">' .. page.full_raw_pagename .. '</span>'
display_title = '<span class="Hani">' .. page.full_raw_pagename .. '</span>'
elseif page.namespace == "Reconstruction" then
elseif namespace == "Reconstruction" then
local matched
local matched
display_title, matched = rsubn(
display_title, matched = ugsub(
page.full_raw_pagename,
page.full_raw_pagename,
"^(Reconstruction:[^/]+/)(.+)$",
"^(Reconstruction:[^/]+/)(.+)$",
function(before, term)
function(before, term)
return before ..
return before .. tag_text(term, data.lang, dt_script)
require(script_utilities_module).tag_text(
term,
data.lang,
dt_script
)
end
end
)
)
Line 833: Line 1,025:
display_title = nil
display_title = nil
end
end
end
-- FIXME: Generalize this.
-- If the current language uses ur-Arab (for Urdu, etc.), ku-Arab (Central Kurdish) or pa-Arab
-- (Shahmukhi, for Punjabi) and there's more than one language on the page, don't set the display title
-- because these three scripts display in Nastaliq and we don't want this for terms that also exist in other
-- languages that don't display in Nastaliq (e.g. Arabic or Persian) to display in Nastaliq. Because the word
-- "Urdu" occurs near the end of the alphabet, Urdu fonts tend to override the fonts of other languages.
-- FIXME: This is checking for more than one language on the page but instead needs to check if there are any
-- languages using scripts other than the ones just mentioned.
if (dt_script_code == "ur-Arab" or dt_script_code == "ku-Arab" or dt_script_code == "pa-Arab") and page.L2_list.n > 1 then
display_title = nil
end
end


Line 843: Line 1,046:


------------ 8. Insert additional categories. ------------
------------ 8. Insert additional categories. ------------
if has_redundant_head_param then
if not data.no_redundant_head_cat then
insert(data.categories, full_langname .. " terms with redundant head parameter")
end
end


-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" and
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then
not m_data.no_multiword_cat[langcode] and not m_data.no_multiword_cat[full_langcode] then
local no_multiword_cat = (m_data or get_data()).no_multiword_cat
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then
-- Use the pagename, not the head= value, because the latter may have extra
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
-- junk in it, e.g. superscripted text that throws off the algorithm.
-- Use the pagename, not the head= value, because the latter may have extra
local checkpattern = ".[%s%-፡]."
-- junk in it, e.g. superscripted text that throws off the algorithm.
if m_data.hyphen_not_multiword_sep[langcode] or m_data.hyphen_not_multiword_sep[full_langcode] then
local no_hyphen = (m_data or get_data()).hyphen_not_multiword_sep
-- Exclude hyphens if the data module states that they should for this language
-- Exclude hyphens if the data module states that they should for this language.
checkpattern = ".[%s፡]."
local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."
end
if umatch(page.pagename, checkpattern) and not non_categorizable(page.full_raw_pagename) then
if rmatch(page.pagename, checkpattern) and not non_categorizable(page.full_raw_pagename) then
insert(data.categories, full_langname .. " multiword terms")
insert(data.categories, full_langname .. " multiword terms")
end
end
end
end
end
Line 868: Line 1,077:


-- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated.
-- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated.
if page.namespace ~= "Reconstruction" then
if namespace ~= "Reconstruction" then
-- Map from languages to a string containing the characters to ignore when considering whether a term has
-- Map from languages to a string containing the characters to ignore when considering whether a term has
-- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic
-- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic
Line 918: Line 1,127:
local ch_to_ignore = characters_to_ignore[full_langcode]
local ch_to_ignore = characters_to_ignore[full_langcode]
if ch_to_ignore then
if ch_to_ignore then
canon_pagename = rsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")
canon_pagename = ugsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")
end
end
local script_data = mw.loadData(scripts_data_module)
while true do
while true do
if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then
if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then
Line 927: Line 1,135:
-- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]]
-- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]]
num_loops = num_loops + 1
num_loops = num_loops + 1
local pagename_script = require(scripts_module).findBestScriptWithoutLang(canon_pagename, "None only as last resort")
local pagename_script = find_best_script_without_lang(canon_pagename, "None only as last resort")
local script_chars = pagename_script.characters
local script_chars = pagename_script.characters
if not script_chars then
if not script_chars then
Line 935: Line 1,143:
local script_code = pagename_script:getCode()
local script_code = pagename_script:getCode()
local replaced
local replaced
canon_pagename, replaced = rsubn(canon_pagename, "[" .. script_chars .. "]", "")
canon_pagename, replaced = ugsub(canon_pagename, "[" .. script_chars .. "]", "")
if replaced and script_code ~= "Zmth" and script_data[script_code] and
if (
script_data[script_code].character_category ~= false then
replaced and
script_code ~= "Zmth" and
(script_data or get_script_data())[script_code] and
script_data[script_code].character_category ~= false
) then
script_code = script_code:gsub("^.-%-", "")
script_code = script_code:gsub("^.-%-", "")
if not seen_scripts[script_code] then
if not seen_scripts[script_code] then
Line 950: Line 1,162:
end
end
end
end
 
-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats: single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character + diacritic(s) + character). Each can have any number of diacritics.
-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats: single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character + diacritic(s) + character). Each can have any number of diacritics.
local standard = data.lang:getStandardCharacters()
local standard = data.lang:getStandardCharacters()
Line 994: Line 1,206:
return ""
return ""
end
end
local sc_standard = rsub(sc_standard, page.comb_chars.combined_double, explode)
local sc_standard = ugsub(sc_standard, page.comb_chars.combined_double, explode)
sc_standard = rsub(sc_standard,page.comb_chars.combined_single, explode)
sc_standard = ugsub(sc_standard,page.comb_chars.combined_single, explode)
:gsub(".[\128-\191]*", explode)
:gsub(".[\128-\191]*", explode)
local num_cat_inserted
local num_cat_inserted
Line 1,005: Line 1,217:
num_cat_inserted = true
num_cat_inserted = true
end
end
elseif rfind(char, page.emoji_pattern) then
elseif ufind(char, page.emoji_pattern) then
insert(data.categories, full_langname .. " terms spelled with emoji")
insert(data.categories, full_langname .. " terms spelled with emoji")
else
else
Line 1,012: Line 1,224:
char = upper
char = upper
end
end
insert(data.categories, full_langname .. " terms spelled with " .. char)
end
end
end
end
Line 1,018: Line 1,231:
-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.
-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.
sc_standard = toNFD(sc_standard)
sc_standard = toNFD(sc_standard)
for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do
if not umatch(sc_standard, diacritic) then
insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic)
end
end
for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do
if not umatch(sc_standard, diacritic) then
insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic .. "◌")
end
end
end
end
end
end
-- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them).
-- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them).
elseif ulen(page.pagename) ~= 1 then
elseif ulen(page.pagename) ~= 1 then
for character in rgmatch(page.pagename, "([^" .. standard .. "])") do
for character in ugmatch(page.pagename, "([^" .. standard .. "])") do
local upper = char_category(character)
local upper = char_category(character)
if not rmatch(upper, "[" .. standard .. "]") then
if not umatch(upper, "[" .. standard .. "]") then
character = upper
character = upper
end
end
insert(data.categories, full_langname .. " terms spelled with " .. character)
end
end
end
end
end
end
 
if data.heads[1].sc:isSystem("alphabet") then
if data.heads[1].sc:isSystem("alphabet") then
local pagename, i = page.pagename:ulower(), 2
local pagename, i = page.pagename:ulower(), 2
while rmatch(pagename, "(%a)" .. ("%1"):rep(i)) do
while umatch(pagename, "(%a)" .. ("%1"):rep(i)) do
i = i + 1
i = i + 1
insert(data.categories, full_langname .. " terms with " .. i .. " consecutive instances of the same letter")
end
end
end
end
 
-- Categorise for palindromes
-- Categorise for palindromes
if not data.nopalindromecat and page.namespace ~= "Reconstruction" and ulen(page.pagename) > 2
if not data.nopalindromecat and namespace ~= "Reconstruction" and ulen(page.pagename) > 2
-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of
-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of
-- multiple scripts?
-- multiple scripts?
and require(palindromes_module).is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
insert(data.categories, full_langname .. " palindromes")
insert(data.categories, full_langname .. " palindromes")
end
end
Line 1,060: Line 1,285:
end
end
if page.namespace == "" and not data.lang:hasType("reconstructed") then
local m_links = require(links_module)
end
-- Add to various maintenance categories.
-- Add to various maintenance categories.
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)
Line 1,073: Line 1,294:
local text = '<span class="headword-line">' ..
local text = '<span class="headword-line">' ..
format_headword(data) ..
format_headword(data) ..
format_genders(data) ..
format_headword_genders(data) ..
format_inflections(data) .. '</span>'
format_inflections(data) .. '</span>'


Navigation menu