47,726
edits
![]() | We're back! Sorry, bad combo of sickness, funeral and a month-long trip abroad. The site is back now. ![]() |
No edit summary |
No edit summary |
||
Line 2: | Line 2: | ||
-- Named constants for all modules used, to make it easier to swap out sandbox versions. | -- Named constants for all modules used, to make it easier to swap out sandbox versions. | ||
local en_utilities_module = "Module:en-utilities" | |||
local gender_and_number_module = "Module:getn" | local gender_and_number_module = "Module:getn" | ||
local headword_data_module = "Module:headword/data" | local headword_data_module = "Module:headword/data" | ||
local headword_page_module = "Module:headword/page" | local headword_page_module = "Module:headword/page" | ||
local links_module = "Module:links" | local links_module = "Module:links" | ||
local load_module = "Module:load" | |||
local pages_module = "Module:pages" | |||
local palindromes_module = "Module:palindromes" | local palindromes_module = "Module:palindromes" | ||
local | local pron_qualifier_module = "Module:pron qualifier" | ||
local scripts_module = "Module:scripts" | local scripts_module = "Module:scripts" | ||
local scripts_data_module = "Module:scripts/data" | local scripts_data_module = "Module:scripts/data" | ||
Line 15: | Line 18: | ||
local table_module = "Module:table" | local table_module = "Module:table" | ||
local utilities_module = "Module:utilities" | local utilities_module = "Module:utilities" | ||
local concat = table.concat | local concat = table.concat | ||
local insert = table.insert | local insert = table.insert | ||
local ipairs = ipairs | local ipairs = ipairs | ||
local max = math.max | |||
local new_title = mw.title.new | |||
local pairs = pairs | local pairs = pairs | ||
local | local require = require | ||
local toNFC = mw.ustring.toNFC | local toNFC = mw.ustring.toNFC | ||
local toNFD = mw.ustring.toNFD | local toNFD = mw.ustring.toNFD | ||
local type = type | |||
local ufind = mw.ustring.find | |||
local ugmatch = mw.ustring.gmatch | |||
local ugsub = mw.ustring.gsub | |||
local umatch = mw.ustring.match | |||
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
-- Each loader below follows the same shape: on first call it requires the
-- named module, rebinds its own local name to the module's function, and then
-- forwards the original arguments to it.
-- NOTE(review): `string_utilities_module` and `script_utilities_module` are
-- used here but their declarations are not visible in this view of the file;
-- presumably they sit with the other module-name constants -- confirm.

local function encode_entities(...)
	encode_entities = require(string_utilities_module).encode_entities
	return encode_entities(...)
end

local function find_best_script_without_lang(...)
	find_best_script_without_lang = require(scripts_module).findBestScriptWithoutLang
	return find_best_script_without_lang(...)
end

local function format_categories(...)
	format_categories = require(utilities_module).format_categories
	return format_categories(...)
end

local function format_genders(...)
	format_genders = require(gender_and_number_module).format_genders
	return format_genders(...)
end

local function format_pron_qualifiers(...)
	format_pron_qualifiers = require(pron_qualifier_module).format_qualifiers
	return format_pron_qualifiers(...)
end

local function full_link(...)
	full_link = require(links_module).full_link
	return full_link(...)
end

local function get_current_L2(...)
	get_current_L2 = require(pages_module).get_current_L2
	return get_current_L2(...)
end

local function get_link_page(...)
	get_link_page = require(links_module).get_link_page
	return get_link_page(...)
end

local function get_script(...)
	get_script = require(scripts_module).getByCode
	return get_script(...)
end

local function is_palindrome(...)
	is_palindrome = require(palindromes_module).is_palindrome
	return is_palindrome(...)
end

local function language_link(...)
	language_link = require(links_module).language_link
	return language_link(...)
end

local function load_data(...)
	load_data = require(load_module).load_data
	return load_data(...)
end

local function pattern_escape(...)
	pattern_escape = require(string_utilities_module).pattern_escape
	return pattern_escape(...)
end

local function pluralize(...)
	pluralize = require(en_utilities_module).pluralize
	return pluralize(...)
end

local function process_page(...)
	process_page = require(headword_page_module).process_page
	return process_page(...)
end

local function remove_links(...)
	remove_links = require(links_module).remove_links
	return remove_links(...)
end

local function shallow_copy(...)
	shallow_copy = require(table_module).shallowCopy
	return shallow_copy(...)
end

local function tag_text(...)
	tag_text = require(script_utilities_module).tag_text
	return tag_text(...)
end

local function tag_transcription(...)
	tag_transcription = require(script_utilities_module).tag_transcription
	return tag_transcription(...)
end

local function tag_translit(...)
	tag_translit = require(script_utilities_module).tag_translit
	return tag_translit(...)
end

local function trim(...)
	trim = require(string_utilities_module).trim
	return trim(...)
end

local function ulen(...)
	ulen = require(string_utilities_module).len
	return ulen(...)
end
--[==[
Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]

-- Headword data, loaded from `headword_data_module` on first use.
local m_data
local function get_data()
	m_data = load_data(headword_data_module)
	return m_data
end

-- Script data, loaded from `scripts_data_module` on first use.
local script_data
local function get_script_data()
	script_data = load_data(scripts_data_module)
	return script_data
end

-- Script-utilities data, loaded on first use.
-- NOTE(review): `script_utilities_data_module` is not declared in the part of
-- the file visible here; presumably it sits with the other module-name
-- constants -- confirm.
local script_utilities_data
local function get_script_utilities_data()
	script_utilities_data = load_data(script_utilities_data_module)
	return script_utilities_data
end
-- If set to true, categories always appear, even in non-mainspace pages | -- If set to true, categories always appear, even in non-mainspace pages | ||
local test_force_categories = false | local test_force_categories = false | ||
local function text_in_script(text, script_code) | local function text_in_script(text, script_code) | ||
local sc = | local sc = get_script(script_code) | ||
if not sc then | if not sc then | ||
error("Internal error: Bad script code " .. script_code) | error("Internal error: Bad script code " .. script_code) | ||
Line 56: | Line 179: | ||
local out | local out | ||
if characters then | if characters then | ||
text = | text = ugsub(text, "%W", "") | ||
out = | out = ufind(text, "[" .. characters .. "]") | ||
end | end | ||
Line 71: | Line 194: | ||
--[[ List of punctuation or spacing characters that are found inside of words. | --[[ List of punctuation or spacing characters that are found inside of words. | ||
Used to exclude characters from the regex above. ]] | Used to exclude characters from the regex above. ]] | ||
local wordPunc = "-־׳״'.·*’་•:᠊" | local wordPunc = "-#%%&@־׳״'.·*’་•:᠊" | ||
local notWordPunc = "[^" .. wordPunc .. "]+" | local notWordPunc = "[^" .. wordPunc .. "]+" | ||
-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, references | -- Format a term (either a head term or an inflection term) along with any left or right qualifiers, labels, references | ||
-- customized separator: `part` is the object specifying the term, which should optionally contain: | -- or customized separator: `part` is the object specifying the term (and `lang` the language of the term), which should | ||
-- * left qualifiers in `q`, an array of strings | -- optionally contain: | ||
-- * left qualifiers in `q`, an array of strings; | |||
-- * right qualifiers in `qq`, an array of strings; | -- * right qualifiers in `qq`, an array of strings; | ||
-- * left labels in `l`, an array of strings; | |||
-- * right labels in `ll`, an array of strings; | |||
-- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text` | -- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text` | ||
-- (formatted reference text) and optionally `name` and/or `group`; | -- (formatted reference text) and optionally `name` and/or `group`; | ||
-- * a separator in `separator`, defaulting to " <i>or</i> " if this is not the first term (j > 1), otherwise "". | -- * a separator in `separator`, defaulting to " <i>or</i> " if this is not the first term (j > 1), otherwise "". | ||
-- `formatted` is the formatted version of the term itself, and `j` is the index of the term. | -- `formatted` is the formatted version of the term itself, and `j` is the index of the term. | ||
local function format_term_with_qualifiers_and_refs(lang, part, formatted, j) | |||
local function part_non_empty(field) | |||
local list = part[field] | |||
if not list then | |||
return nil | |||
end | |||
if type(list) ~= "table" then | |||
error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, mw.dumpObject(list))) | |||
end | |||
return list[1] | |||
end | end | ||
if part_non_empty("q") or part_non_empty("qq") or part_non_empty("l") or | |||
if | part_non_empty("ll") or part_non_empty("refs") then | ||
formatted = format_pron_qualifiers { | |||
lang = lang, | |||
text = formatted, | |||
q = part.q, | |||
qq = part.qq, | |||
l = part.l, | |||
ll = part.ll, | |||
refs = part.refs, | |||
} | |||
end | end | ||
local separator = part.separator or j > 1 and " <i>or</i> " -- use "" to request no separator | local separator = part.separator or j > 1 and " <i>or</i> " -- use "" to request no separator | ||
if separator then | if separator then | ||
formatted = separator .. formatted | formatted = separator .. formatted | ||
Line 132: | Line 247: | ||
--[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==] | --[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==] | ||
function export.head_is_multiword(head) | function export.head_is_multiword(head) | ||
for possibleWordBreak in | for possibleWordBreak in ugmatch(head, spacingPunctuation) do | ||
if | if umatch(possibleWordBreak, notWordPunc) then | ||
return true | return true | ||
end | end | ||
Line 141: | Line 256: | ||
end | end | ||
do
	-- Within a string `s`, wrap every run matched by `notWordPunc` as
	-- "\2" .. run .. "\1". The outer parentheses drop gsub's second return
	-- value (the substitution count).
	local function workaround_to_exclude_chars(s)
		return (ugsub(s, notWordPunc, "\2%1\1"))
	end

	--[==[Add links to a multiword head.]==]
	-- `head` is the headword text. "\1" and "\2" are used as temporary
	-- placeholders for "[[" and "]]" and are converted in the final return.
	-- If `default` is truthy, a ":" or "#" inside a placeholder-delimited
	-- segment gets a backslash inserted before it; a backslash that already
	-- precedes it is doubled first.
	function export.add_multiword_links(head, default)
		head = "\1" .. ugsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
		if default then
			head = head
				:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
				:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
		end

		--Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
		head = encode_entities(head, "[]", true, true)

		--[=[
		use this when workaround is no longer needed:
		head = "[[" .. ugsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"
		Remove any empty links, which could have been created above
		at the beginning or end of the string.
		]=]
		-- Drop empty placeholder pairs, then turn the remaining placeholders
		-- into real link brackets. The parentheses keep only the string result.
		return (head
			:gsub("\1\2", "")
			:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
	end
end
-- Return a truthy value if the page named `full_raw_pagename` should not be
-- categorized: either it is under "Appendix:Gestures/", or it is under
-- "Unsupported titles/" and its name contains no backtick.
-- The result is whatever the `or`/`and` chain yields (a match index, `true`,
-- `false` or `nil`), so callers should only test it for truthiness.
local function non_categorizable(full_raw_pagename)
	return full_raw_pagename:find("^Appendix:Gestures/") or
		-- Unsupported titles with descriptive names.
		(full_raw_pagename:find("^Unsupported titles/") and not full_raw_pagename:find("`"))
end
-- Wrap the already-formatted head text `formatted` in a language/script tag,
-- then add qualifiers, labels, references and the separator.
-- `data` supplies `lang` and `id`; `head` supplies `sc` and is also passed on
-- as the term object; `j` is the index of the head. `data.id` is only passed
-- to `tag_text` for the first head (j == 1).
local function tag_text_and_add_quals_and_refs(data, head, formatted, j)
	-- Add language and script wrapper.
	formatted = tag_text(formatted, data.lang, head.sc, "head", nil, j == 1 and data.id or nil)

	-- Add qualifiers, labels, references and separator.
	return format_term_with_qualifiers_and_refs(data.lang, head, formatted, j)
end
-- Format a headword with transliterations. | -- Format a headword with transliterations. | ||
local function format_headword(data) | local function format_headword(data) | ||
-- Are there non-empty transliterations? | -- Are there non-empty transliterations? | ||
local has_translits = false | local has_translits = false | ||
Line 204: | Line 326: | ||
-- Apply processing to the headword, for formatting links and such. | -- Apply processing to the headword, for formatting links and such. | ||
if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Image" then | if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Image" then | ||
formatted = | formatted = language_link{term = head.term, lang = data.lang} | ||
else | else | ||
formatted = data.lang:makeDisplayText(head.term, head.sc, true) | formatted = data.lang:makeDisplayText(head.term, head.sc, true) | ||
end | end | ||
local | local head_part = tag_text_and_add_quals_and_refs(data, head, formatted, j) | ||
insert(head_parts, head_part) | insert(head_parts, head_part) | ||
Line 227: | Line 341: | ||
unique_head_part = head_part | unique_head_part = head_part | ||
else | else | ||
unique_head_part = tag_text_and_add_quals_and_refs(head, formatted, 1) | unique_head_part = tag_text_and_add_quals_and_refs(data, head, formatted, 1) | ||
end | end | ||
unique_head_parts[unique_head_part] = true | unique_head_parts[unique_head_part] = true | ||
Line 255: | Line 369: | ||
local this_parts = {} | local this_parts = {} | ||
if head.tr then | if head.tr then | ||
insert(this_parts, | insert(this_parts, tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual)) | ||
if head.ts then | if head.ts then | ||
insert(this_parts, " ") | insert(this_parts, " ") | ||
Line 261: | Line 375: | ||
end | end | ||
if head.ts then | if head.ts then | ||
insert(this_parts, "/" . | insert(this_parts, "/" .. tag_transcription(head.ts, data.lang:getCode(), "head") .. "/") | ||
end | end | ||
insert(translit_parts, concat(this_parts)) | insert(translit_parts, concat(this_parts)) | ||
Line 270: | Line 384: | ||
local langname = data.lang:getCanonicalName() | local langname = data.lang:getCanonicalName() | ||
local transliteration_page = | local transliteration_page = langname .. " transliteration" | ||
local saw_translit_page = false | local saw_translit_page = false | ||
Line 281: | Line 395: | ||
if not saw_translit_page and data.lang:hasType("etymology-only") then | if not saw_translit_page and data.lang:hasType("etymology-only") then | ||
langname = data.lang:getFullName() | langname = data.lang:getFullName() | ||
transliteration_page = | transliteration_page = langname .. " transliteration" | ||
if transliteration_page and transliteration_page.exists then | if transliteration_page and transliteration_page.exists then | ||
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted | translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted | ||
Line 304: | Line 418: | ||
local function | local function format_headword_genders(data) | ||
local retval = "" | local retval = "" | ||
if data.genders and #data.genders > 0 then | if data.genders and #data.genders > 0 then | ||
Line 311: | Line 425: | ||
end | end | ||
local pos_for_cat | local pos_for_cat | ||
if not data.nogendercat | if not data.nogendercat then | ||
local no_gender_cat = (m_data or get_data()).no_gender_cat | |||
if not (no_gender_cat[data.lang:getCode()] or no_gender_cat[data.lang:getFullCode()]) then | |||
pos_for_cat = (m_data or get_data()).pos_for_gender_number_cat[data.pos_category:gsub("^reconstructed ", "")] | |||
end | |||
end | end | ||
local text, cats = | local text, cats = format_genders(data.genders, data.lang, pos_for_cat) | ||
for _, cat in ipairs(cats) do | for _, cat in ipairs(cats) do | ||
insert(data.categories, cat) | insert(data.categories, cat) | ||
Line 338: | Line 453: | ||
if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then | if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then | ||
error("The face `" .. face .. "` " .. ( | error("The face `" .. face .. "` " .. ( | ||
(script_utilities_data or get_script_utilities_data()).faces[face] and | |||
"should not be used for non-headword terms on the headword line." or | "should not be used for non-headword terms on the headword line." or | ||
"is invalid." | "is invalid." | ||
Line 365: | Line 480: | ||
any_part_translit = true | any_part_translit = true | ||
end | end | ||
formatted = | formatted = full_link( | ||
{ | { | ||
term = not nolinkinfl and part.term or nil, | term = not nolinkinfl and part.term or nil, | ||
Line 371: | Line 486: | ||
lang = part.lang or data.lang, | lang = part.lang or data.lang, | ||
sc = part.sc or parts.sc or nil, | sc = part.sc or parts.sc or nil, | ||
gloss = part.gloss, | |||
pos = part.pos, | |||
lit = part.lit, | |||
id = part.id, | id = part.id, | ||
genders = part.genders, | genders = part.genders, | ||
Line 381: | Line 499: | ||
end | end | ||
parts[j] = format_term_with_qualifiers_and_refs(part, formatted, j) | parts[j] = format_term_with_qualifiers_and_refs(part.lang or data.lang, part, | ||
formatted, j) | |||
end | end | ||
Line 421: | Line 540: | ||
--[==[
Returns the plural form of `pos`, a raw part of speech input, which could be singular or
plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to
"kanji").
]==]
function export.pluralize_pos(pos)
	-- Make the plural form of the part of speech.
	-- Order of preference: an entry in the irregular-plurals table; `pos`
	-- itself if it already ends in "s"; otherwise the result of pluralize().
	return (m_data or get_data()).irregular_plurals[pos] or
		pos:sub(-1) == "s" and pos or
		pluralize(pos)
end
--[==[ | --[==[ | ||
Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil | |||
if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.). | |||
If you have a POS in its singular form, call {export.pluralize_pos()} above to pluralize it | |||
in a smart fashion that knows when to add "-s" and when to add "-es", and also takes | |||
into account any irregular plurals. | |||
If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess | |||
based on whether it ends in " forms"; otherwise, return nil. | |||
]==] | |||
function export.pos_lemma_or_nonlemma(plpos, best_guess) | function export.pos_lemma_or_nonlemma(plpos, best_guess) | ||
local isLemma = (m_data or get_data()).lemmas | |||
-- Is it a lemma category? | -- Is it a lemma category? | ||
if isLemma[plpos] then | if isLemma[plpos] then | ||
Line 450: | Line 572: | ||
end | end | ||
-- Is it a nonlemma category? | -- Is it a nonlemma category? | ||
local isNonLemma = (m_data or get_data()).nonlemmas | |||
if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then | if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then | ||
return "non-lemma form" | return "non-lemma form" | ||
Line 463: | Line 586: | ||
end | end | ||
--[==[
Canonicalize a part of speech as specified in 2= in {{tl|head}}. This checks for POS aliases and non-lemma form
aliases ending in 'f', and then pluralizes if the POS term does not have an invariable plural.
]==]
function export.canonicalize_pos(pos)
	-- FIXME: Temporary code to throw an error for alias 'pre' (= preposition) that will go away.
	if pos == "pre" then
		-- Don't throw error on 'pref' as it's an alias for "prefix".
		error("POS 'pre' for 'preposition' no longer allowed as it's too ambiguous; use 'prep'")
	end
	-- Likewise for pro = pronoun.
	if pos == "pro" or pos == "prof" then
		error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
	end
	local data = m_data or get_data()
	if data.pos_aliases[pos] then
		-- Direct alias.
		pos = data.pos_aliases[pos]
	elseif pos:sub(-1) == "f" then
		-- Trailing 'f' marks a non-lemma form: strip it, resolve any alias
		-- for the remainder, and append " forms".
		pos = pos:sub(1, -2)
		pos = (data.pos_aliases[pos] or pos) .. " forms"
	end
	return export.pluralize_pos(pos)
end
-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a | -- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a | ||
Line 518: | Line 664: | ||
-- sorted based on the default MediaWiki sortkey, so we check against | -- sorted based on the default MediaWiki sortkey, so we check against | ||
-- that. | -- that. | ||
if tbl == true then | |||
if page.raw_defaultsort ~= sortkey then | |||
insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys") | |||
end | |||
return | |||
end | |||
local redundant, different | local redundant, different | ||
for k in pairs(tbl) do | for k in pairs(tbl) do | ||
Line 525: | Line 677: | ||
different = true | different = true | ||
end | end | ||
end | |||
if redundant then | |||
insert(lang_cats, lang:getFullName() .. " terms with redundant sortkeys") | |||
end | |||
if different then | |||
insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys") | |||
end | end | ||
return sortkey | return sortkey | ||
end | end | ||
function export.maintenance_cats(page, lang, lang_cats, page_cats) | function export.maintenance_cats(page, lang, lang_cats, page_cats) | ||
for _, cat in ipairs(page.cats) do | for _, cat in ipairs(page.cats) do | ||
Line 538: | Line 696: | ||
if tbl then | if tbl then | ||
sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats) | sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats) | ||
insert(lang_cats, canonical .. " entries with topic categories using raw markup") | |||
end | end | ||
tbl = page.wikitext_langname_cat[canonical] | tbl = page.wikitext_langname_cat[canonical] | ||
if tbl then | if tbl then | ||
handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats) | handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats) | ||
insert(lang_cats, canonical .. " entries with language name categories using raw markup") | |||
end | end | ||
if | if get_current_L2() ~= canonical then | ||
insert(lang_cats, canonical .. " entries with incorrect language header") | |||
end | end | ||
end | end | ||
Line 555: | Line 716: | ||
]==] | ]==] | ||
function export.full_headword(data) | function export.full_headword(data) | ||
-- Prevent data from being destructively modified. | -- Prevent data from being destructively modified. | ||
local data = | local data = shallow_copy(data) | ||
------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------ | ------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------ | ||
Line 578: | Line 736: | ||
local langcode = data.lang:getCode() | local langcode = data.lang:getCode() | ||
local full_langcode = | local full_langcode = data.lang:getFullCode() | ||
local langname = data.lang:getCanonicalName() | local langname = data.lang:getCanonicalName() | ||
local full_langname = | local full_langname = data.lang:getFullName() | ||
local raw_pagename, page = data.pagename | local raw_pagename, page = data.pagename | ||
if raw_pagename and raw_pagename ~= m_data.pagename then -- for testing, doc pages, etc. | if raw_pagename and raw_pagename ~= (m_data or get_data()).pagename then -- for testing, doc pages, etc. | ||
page = | page = process_page(raw_pagename) | ||
else | else | ||
page = m_data.page | page = (m_data or get_data()).page | ||
end | end | ||
-- Check the namespace against the language type. | -- Check the namespace against the language type. | ||
local namespace = page.namespace | |||
if namespace == "" then | |||
if data.lang:hasType("reconstructed") then | if data.lang:hasType("reconstructed") then | ||
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace") | error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace") | ||
Line 596: | Line 755: | ||
error("Entries in " .. langname .. " must be placed in the Appendix: namespace") | error("Entries in " .. langname .. " must be placed in the Appendix: namespace") | ||
end | end | ||
elseif namespace == "Citations" or namespace == "Thesaurus" then | |||
error("Headword templates should not be used in the " .. namespace .. ": namespace.") | |||
end | end | ||
Line 607: | Line 768: | ||
else | else | ||
-- convert old-style `heads`, `translits` and `transcriptions` to new-style | -- convert old-style `heads`, `translits` and `transcriptions` to new-style | ||
local maxind = | local maxind = max( | ||
init_and_find_maximum_index(data, "heads"), | init_and_find_maximum_index(data, "heads"), | ||
init_and_find_maximum_index(data, "translits", true), | init_and_find_maximum_index(data, "translits", true), | ||
Line 627: | Line 788: | ||
------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------ | ------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------ | ||
-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]] | |||
if data.altform then | |||
data.noposcat = true | |||
end | |||
init_and_find_maximum_index(data, "categories") | init_and_find_maximum_index(data, "categories") | ||
Line 664: | Line 830: | ||
-- add an appropriate category. | -- add an appropriate category. | ||
local postype = export.pos_lemma_or_nonlemma(data.pos_category) | local postype = export.pos_lemma_or_nonlemma(data.pos_category) | ||
if not data.noposcat then | |||
insert(data.categories, 1, full_langname .. " " .. postype .. "s") | insert(data.categories, 1, full_langname .. " " .. postype .. "s") | ||
end | |||
-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]] | |||
if data.altform then | |||
insert(data.categories, 1, full_langname .. " alternative forms") | |||
end | end | ||
Line 675: | Line 842: | ||
-- Determine if term is reconstructed | -- Determine if term is reconstructed | ||
local is_reconstructed = | local is_reconstructed = namespace == "Reconstruction" or data.lang:hasType("reconstructed") | ||
-- Create a default headword based on the pagename, which is determined in | -- Create a default headword based on the pagename, which is determined in | ||
Line 682: | Line 849: | ||
-- Add links to multi-word page names when appropriate | -- Add links to multi-word page names when appropriate | ||
if not data.nolinkhead | if not (is_reconstructed or data.nolinkhead) then | ||
local no_links = (m_data or get_data()).no_multiword_links | |||
if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then | |||
default_head = export.add_multiword_links(default_head, true) | |||
end | |||
end | end | ||
if is_reconstructed then | if is_reconstructed then | ||
default_head = "*" .. default_head | default_head = "*" .. default_head | ||
end | end | ||
Line 706: | Line 874: | ||
elseif head.term == default_head then | elseif head.term == default_head then | ||
has_redundant_head_param = true | has_redundant_head_param = true | ||
elseif head.term:find("^[!?]$") then | |||
-- If explicit head= just consists of ! or ?, add it to the end of the default head. | |||
head.term = default_head .. head.term | |||
end | end | ||
if is_reconstructed then | if is_reconstructed then | ||
local head_term = head.term | local head_term = head.term | ||
if head_term:find("%[%[") then | if head_term:find("%[%[") then | ||
head_term = | head_term = remove_links(head_term) | ||
end | end | ||
if head_term:sub(1, 1) ~= "*" then | if head_term:sub(1, 1) ~= "*" then | ||
Line 720: | Line 891: | ||
------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence, | ------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence, | ||
------ otherwise fall back to the overall script if given. If neither given, autodetect the script. | ------ otherwise fall back to the overall script if given. If neither given, autodetect the script. | ||
local auto_sc = data.lang:findBestScript(head.term) | local auto_sc = data.lang:findBestScript(head.term) | ||
if ( | |||
auto_sc:getCode() == "None" and | |||
find_best_script_without_lang(head.term):getCode() ~= "None" | |||
) then | |||
insert(data.categories, full_langname .. " terms in nonstandard scripts") | |||
end | |||
if not (head.sc or data.sc) then -- No script code given, so use autodetected script. | if not (head.sc or data.sc) then -- No script code given, so use autodetected script. | ||
head.sc = auto_sc | head.sc = auto_sc | ||
Line 728: | Line 904: | ||
if not head.sc then -- Overall script code given. | if not head.sc then -- Overall script code given. | ||
head.sc = data.sc | head.sc = data.sc | ||
end | |||
-- Track uses of sc parameter. | |||
if head.sc:getCode() == auto_sc:getCode() then | |||
insert(data.categories, full_langname .. " terms with redundant script codes") | |||
else | |||
insert(data.categories, full_langname .. " terms with non-redundant manual script codes") | |||
end | end | ||
end | end | ||
Line 734: | Line 916: | ||
if head.sc:hasNormalizationFixes() == true then | if head.sc:hasNormalizationFixes() == true then | ||
local composed_head = toNFC(head.term) | local composed_head = toNFC(head.term) | ||
if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then | |||
insert(data.whole_page_categories, "Pages using discouraged character sequences") | |||
end | |||
end | end | ||
Line 743: | Line 928: | ||
-- Make transliterations | -- Make transliterations | ||
head.tr_manual = nil | head.tr_manual = nil | ||
-- Try to generate a transliteration if necessary | -- Try to generate a transliteration if necessary | ||
if head.tr == "-" then | if head.tr == "-" then | ||
head.tr = nil | head.tr = nil | ||
else | |||
local notranslit = (m_data or get_data()).notranslit | |||
if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then | |||
head.tr_manual = not not head.tr | |||
local text = head.term | |||
if not data.lang:link_tr(head.sc) then | |||
text = remove_links(text) | |||
end | |||
local automated_tr, tr_categories | |||
automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc) | |||
if automated_tr or head.tr_fail then | |||
local manual_tr = head.tr | |||
if manual_tr then | |||
if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then | |||
insert(data.categories, full_langname .. " terms with redundant transliterations") | |||
elseif not head.tr_fail then | |||
insert(data.categories, full_langname .. " terms with non-redundant manual transliterations") | |||
end | |||
end | |||
if not manual_tr then | |||
head.tr = automated_tr | |||
for _, category in ipairs(tr_categories) do | |||
insert(data.categories, category) | |||
end | |||
end | end | ||
end | end | ||
-- There is still no transliteration? | |||
-- Add the entry to a cleanup category. | |||
if not head.tr then | |||
head.tr = "<small>transliteration needed</small>" | |||
-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar. | |||
-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]]. | |||
insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms") | |||
else | |||
-- Otherwise, trim it. | |||
head.tr = trim(head.tr) | |||
end | |||
end | end | ||
end | end | ||
Line 784: | Line 981: | ||
-- Link to the transliteration entry for languages that require this. | -- Link to the transliteration entry for languages that require this. | ||
if head.tr and data.lang:link_tr(head.sc) then | if head.tr and data.lang:link_tr(head.sc) then | ||
head.tr = | head.tr = full_link{ | ||
term = head.tr, | term = head.tr, | ||
lang = data.lang, | lang = data.lang, | ||
sc = | sc = get_script("Latn"), | ||
tr = "-" | tr = "-" | ||
} | } | ||
Line 803: | Line 1,000: | ||
-- the pagename, and that headwords that are in different scripts from the pagename aren't first. This seems to be | -- the pagename, and that headwords that are in different scripts from the pagename aren't first. This seems to be | ||
-- about the best we can do (alternatively we could potentially do script detection on the pagename). | -- about the best we can do (alternatively we could potentially do script detection on the pagename). | ||
local dt_script = | local dt_script = data.heads[1].sc | ||
local dt_script_code = dt_script:getCode() | local dt_script_code = dt_script:getCode() | ||
local page_non_ascii = | local page_non_ascii = namespace == "" and not page.pagename:find("^[%z\1-\127]+$") | ||
local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^Unsupported titles/", "") | local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^Unsupported titles/", "") | ||
if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then | if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then | ||
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>' | display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>' | ||
elseif page_non_ascii and toBeTagged[dt_script_code] | elseif page_non_ascii and (m_data or get_data()).toBeTagged[dt_script_code] | ||
or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana"))) | or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana"))) | ||
or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then | or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then | ||
Line 816: | Line 1,013: | ||
elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then | elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then | ||
display_title = '<span class="Hani">' .. page.full_raw_pagename .. '</span>' | display_title = '<span class="Hani">' .. page.full_raw_pagename .. '</span>' | ||
elseif | elseif namespace == "Reconstruction" then | ||
local matched | local matched | ||
display_title, matched = | display_title, matched = ugsub( | ||
page.full_raw_pagename, | page.full_raw_pagename, | ||
"^(Reconstruction:[^/]+/)(.+)$", | "^(Reconstruction:[^/]+/)(.+)$", | ||
function(before, term) | function(before, term) | ||
return before . | return before .. tag_text(term, data.lang, dt_script) | ||
end | end | ||
) | ) | ||
Line 833: | Line 1,025: | ||
display_title = nil | display_title = nil | ||
end | end | ||
end | |||
-- FIXME: Generalize this. | |||
-- If the current language uses ur-Arab (for Urdu, etc.), ku-Arab (Central Kurdish) or pa-Arab | |||
-- (Shahmukhi, for Punjabi) and there's more than one language on the page, don't set the display title | |||
-- because these three scripts display in Nastaliq, and we don't want terms that also exist in other | |||
-- languages that don't display in Nastaliq (e.g. Arabic or Persian) to display in Nastaliq. Because the word | |||
-- "Urdu" occurs near the end of the alphabet, Urdu fonts tend to override the fonts of other languages. | |||
-- FIXME: This is checking for more than one language on the page but instead needs to check if there are any | |||
-- languages using scripts other than the ones just mentioned. | |||
if (dt_script_code == "ur-Arab" or dt_script_code == "ku-Arab" or dt_script_code == "pa-Arab") and page.L2_list.n > 1 then | |||
display_title = nil | |||
end | end | ||
Line 843: | Line 1,046: | ||
------------ 8. Insert additional categories. ------------ | ------------ 8. Insert additional categories. ------------ | ||
if has_redundant_head_param then | |||
if not data.no_redundant_head_cat then | |||
insert(data.categories, full_langname .. " terms with redundant head parameter") | |||
end | |||
end | |||
-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms". | -- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms". | ||
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" | if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then | ||
local no_multiword_cat = (m_data or get_data()).no_multiword_cat | |||
if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then | |||
-- Check for spaces or hyphens, but exclude prefixes and suffixes. | |||
-- Use the pagename, not the head= value, because the latter may have extra | |||
-- junk in it, e.g. superscripted text that throws off the algorithm. | |||
local no_hyphen = (m_data or get_data()).hyphen_not_multiword_sep | |||
-- Exclude hyphens if the data module states that they should for this language | -- Exclude hyphens if the data module states that they should for this language. | ||
checkpattern = ".[%s፡]." | local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]." | ||
if umatch(page.pagename, checkpattern) and not non_categorizable(page.full_raw_pagename) then | |||
insert(data.categories, full_langname .. " multiword terms") | |||
end | |||
end | end | ||
end | end | ||
Line 868: | Line 1,077: | ||
-- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated. | -- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated. | ||
if | if namespace ~= "Reconstruction" then | ||
-- Map from languages to a string containing the characters to ignore when considering whether a term has | -- Map from languages to a string containing the characters to ignore when considering whether a term has | ||
-- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic | -- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic | ||
Line 918: | Line 1,127: | ||
local ch_to_ignore = characters_to_ignore[full_langcode] | local ch_to_ignore = characters_to_ignore[full_langcode] | ||
if ch_to_ignore then | if ch_to_ignore then | ||
canon_pagename = | canon_pagename = ugsub(canon_pagename, "[" .. ch_to_ignore .. "]", "") | ||
end | end | ||
while true do | while true do | ||
if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then | if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then | ||
Line 927: | Line 1,135: | ||
-- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]] | -- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]] | ||
num_loops = num_loops + 1 | num_loops = num_loops + 1 | ||
local pagename_script = | local pagename_script = find_best_script_without_lang(canon_pagename, "None only as last resort") | ||
local script_chars = pagename_script.characters | local script_chars = pagename_script.characters | ||
if not script_chars then | if not script_chars then | ||
Line 935: | Line 1,143: | ||
local script_code = pagename_script:getCode() | local script_code = pagename_script:getCode() | ||
local replaced | local replaced | ||
canon_pagename, replaced = | canon_pagename, replaced = ugsub(canon_pagename, "[" .. script_chars .. "]", "") | ||
if replaced and script_code ~= "Zmth" and script_data[script_code] and | if ( | ||
script_data[script_code].character_category ~= false then | replaced and | ||
script_code ~= "Zmth" and | |||
(script_data or get_script_data())[script_code] and | |||
script_data[script_code].character_category ~= false | |||
) then | |||
script_code = script_code:gsub("^.-%-", "") | script_code = script_code:gsub("^.-%-", "") | ||
if not seen_scripts[script_code] then | if not seen_scripts[script_code] then | ||
Line 950: | Line 1,162: | ||
end | end | ||
end | end | ||
-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats: single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character + diacritic(s) + character). Each can have any number of diacritics. | -- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats: single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character + diacritic(s) + character). Each can have any number of diacritics. | ||
local standard = data.lang:getStandardCharacters() | local standard = data.lang:getStandardCharacters() | ||
Line 994: | Line 1,206: | ||
return "" | return "" | ||
end | end | ||
local sc_standard = | local sc_standard = ugsub(sc_standard, page.comb_chars.combined_double, explode) | ||
sc_standard = | sc_standard = ugsub(sc_standard,page.comb_chars.combined_single, explode) | ||
:gsub(".[\128-\191]*", explode) | :gsub(".[\128-\191]*", explode) | ||
local num_cat_inserted | local num_cat_inserted | ||
Line 1,005: | Line 1,217: | ||
num_cat_inserted = true | num_cat_inserted = true | ||
end | end | ||
elseif | elseif ufind(char, page.emoji_pattern) then | ||
insert(data.categories, full_langname .. " terms spelled with emoji") | insert(data.categories, full_langname .. " terms spelled with emoji") | ||
else | else | ||
Line 1,012: | Line 1,224: | ||
char = upper | char = upper | ||
end | end | ||
insert(data.categories, full_langname .. " terms spelled with " .. char) | |||
end | end | ||
end | end | ||
Line 1,018: | Line 1,231: | ||
-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally. | -- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally. | ||
sc_standard = toNFD(sc_standard) | sc_standard = toNFD(sc_standard) | ||
for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do | |||
if not umatch(sc_standard, diacritic) then | |||
insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic) | |||
end | |||
end | |||
for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do | |||
if not umatch(sc_standard, diacritic) then | |||
insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic .. "◌") | |||
end | |||
end | |||
end | end | ||
end | end | ||
-- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them). | -- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them). | ||
elseif ulen(page.pagename) ~= 1 then | elseif ulen(page.pagename) ~= 1 then | ||
for character in | for character in ugmatch(page.pagename, "([^" .. standard .. "])") do | ||
local upper = char_category(character) | local upper = char_category(character) | ||
if not | if not umatch(upper, "[" .. standard .. "]") then | ||
character = upper | character = upper | ||
end | end | ||
insert(data.categories, full_langname .. " terms spelled with " .. character) | |||
end | end | ||
end | end | ||
end | end | ||
if data.heads[1].sc:isSystem("alphabet") then | if data.heads[1].sc:isSystem("alphabet") then | ||
local pagename, i = page.pagename:ulower(), 2 | local pagename, i = page.pagename:ulower(), 2 | ||
while | while umatch(pagename, "(%a)" .. ("%1"):rep(i)) do | ||
i = i + 1 | i = i + 1 | ||
insert(data.categories, full_langname .. " terms with " .. i .. " consecutive instances of the same letter") | |||
end | end | ||
end | end | ||
-- Categorise for palindromes | -- Categorise for palindromes | ||
if not data.nopalindromecat and | if not data.nopalindromecat and namespace ~= "Reconstruction" and ulen(page.pagename) > 2 | ||
-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of | -- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of | ||
-- multiple scripts? | -- multiple scripts? | ||
and | and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then | ||
insert(data.categories, full_langname .. " palindromes") | insert(data.categories, full_langname .. " palindromes") | ||
end | end | ||
Line 1,060: | Line 1,285: | ||
end | end | ||
-- Add to various maintenance categories. | -- Add to various maintenance categories. | ||
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories) | export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories) | ||
Line 1,073: | Line 1,294: | ||
local text = '<span class="headword-line">' .. | local text = '<span class="headword-line">' .. | ||
format_headword(data) .. | format_headword(data) .. | ||
format_headword_genders(data) .. | |||
format_inflections(data) .. '</span>' | format_inflections(data) .. '</span>' | ||