Module:headword: Difference between revisions
No edit summary |
No edit summary |
||
| (15 intermediate revisions by 2 users not shown) | |||
| Line 2: | Line 2: | ||
-- Named constants for all modules used, to make it easier to swap out sandbox versions. | -- Named constants for all modules used, to make it easier to swap out sandbox versions. | ||
local debug_track_module = "Module:debug/track" | |||
local en_utilities_module = "Module:en-utilities" | local en_utilities_module = "Module:en-utilities" | ||
local gender_and_number_module = "Module: | local gender_and_number_module = "Module:gender and number" | ||
local headword_data_module = "Module:headword/data" | local headword_data_module = "Module:headword/data" | ||
local headword_page_module = "Module:headword/page" | local headword_page_module = "Module:headword/page" | ||
| Line 37: | Line 38: | ||
--[==[ | --[==[ | ||
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==] | Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==] | ||
-- Lazy loader for the tracking function. The first call replaces the local `debug_track` with whatever | |||
-- require(debug_track_module) returns and forwards the arguments to it; subsequent calls therefore invoke the | |||
-- loaded value directly. | |||
local function debug_track(...) | |||
debug_track = require(debug_track_module) | |||
return debug_track(...) | |||
end | |||
local function encode_entities(...) | local function encode_entities(...) | ||
encode_entities = require(string_utilities_module).encode_entities | encode_entities = require(string_utilities_module).encode_entities | ||
| Line 174: | Line 180: | ||
-- If set to true, categories always appear, even in non-mainspace pages | -- If set to true, categories always appear, even in non-mainspace pages | ||
local test_force_categories = false | local test_force_categories = false | ||
-- Track entries with certain (usually undesirable) properties. `track_id` is an identifier for the particular | |||
-- property being tracked and goes into the tracking page name. Specifically, this adds a link in the | |||
-- page text to [[Wiktionary:Tracking/headword/TRACK_ID]], meaning you can find all entries with the `track_id` property | |||
-- by visiting [[Special:WhatLinksHere/Wiktionary:Tracking/headword/TRACK_ID]]. | |||
-- | |||
-- If `lang` (a language object) is given, an additional tracking page [[Wiktionary:Tracking/headword/TRACK_ID/CODE]] is | |||
-- linked to where CODE is the language code of `lang`, and you can find all entries in the combination of `track_id` | |||
-- and `lang` by visiting [[Special:WhatLinksHere/Wiktionary:Tracking/headword/TRACK_ID/CODE]]. This makes it possible to | |||
-- isolate only the entries with a specific tracking property that are in a given language. Note that if `lang` | |||
-- references an etymology-only language, both that language's code and its full parent's code are tracked. | |||
-- | |||
-- Always returns true, so a call can be embedded in an `and`/`or` chain such as `x and track("id") or y` without | |||
-- changing the truth value of the expression. | |||
local function track(track_id, lang) | |||
-- Base tracking page name shared by every variant below. | |||
local tracking_page = "headword/" .. track_id | |||
if lang and lang:hasType("etymology-only") then | |||
-- Etymology-only language: pass the base page plus one page per code (own code and full code). | |||
debug_track{tracking_page, tracking_page .. "/" .. lang:getCode(), | |||
tracking_page .. "/" .. lang:getFullCode()} | |||
elseif lang then | |||
-- Any other language: pass the base page plus the language-specific page. | |||
debug_track{tracking_page, tracking_page .. "/" .. lang:getCode()} | |||
else | |||
-- No language given: only the base page is tracked, passed as a plain string rather than a table. | |||
debug_track(tracking_page) | |||
end | |||
return true | |||
end | |||
local function text_in_script(text, script_code) | local function text_in_script(text, script_code) | ||
| Line 362: | Line 392: | ||
else | else | ||
head_parts = concat(head_parts) | head_parts = concat(head_parts) | ||
end | |||
if has_manual_translits then | |||
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/manual-tr]] | |||
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/manual-tr/LANGCODE]] | |||
track("manual-tr", data.lang) | |||
end | end | ||
| Line 392: | Line 428: | ||
local saw_translit_page = false | local saw_translit_page = false | ||
if transliteration_page and transliteration_page | if transliteration_page and transliteration_page:getContent() then | ||
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted | translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted | ||
saw_translit_page = true | saw_translit_page = true | ||
| Line 400: | Line 436: | ||
if not saw_translit_page and data.lang:hasType("etymology-only") then | if not saw_translit_page and data.lang:hasType("etymology-only") then | ||
langname = data.lang:getFullName() | langname = data.lang:getFullName() | ||
transliteration_page = new_title(langname .. " transliteration") | transliteration_page = new_title(langname .. " transliteration", "Wiktionary") | ||
if transliteration_page and transliteration_page | if transliteration_page and transliteration_page:getContent() then | ||
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted | translits_formatted = " [[Wiktionary:" .. langname .. " transliteration|•]]" .. translits_formatted | ||
end | end | ||
end | end | ||
| Line 467: | Line 503: | ||
-- right into the 'data' table to disable inflection links of the entire headword | -- right into the 'data' table to disable inflection links of the entire headword | ||
-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin | -- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin | ||
local nolinkinfl = part.face == "hypothetical" or part.nolinkinfl or data.nolinkinfl | local nolinkinfl = part.face == "hypothetical" or (part.nolink and track("nolink") or part.nolinkinfl) or ( | ||
data.nolink and track("nolink") or data.nolinkinfl) | |||
local formatted | local formatted | ||
| Line 539: | Line 576: | ||
-- Format the inflections following the headword or nested after a given inflection. | -- Format the inflections following the headword or nested after a given inflection. Declared local above. | ||
format_inflections | function format_inflections(data, inflections) | ||
if inflections and inflections[1] then | if inflections and inflections[1] then | ||
-- Format each inflection individually. | -- Format each inflection individually. | ||
| Line 563: | Line 600: | ||
return result | return result | ||
end | end | ||
end | |||
-- Forward reference | |||
local check_red_link_inflections | |||
-- Check a single inflection (which consists of a label and zero or more terms, each possibly with nested inflections) | |||
-- for red links, i.e. terms whose target page has no content. Returns true as soon as one red link is found (stopping | |||
-- further processing) and false if none is found. | |||
-- NOTE(review): `plpos` (the plural part of speech) is only forwarded to the nested check; nothing in this function | |||
-- inserts a red-link category, although the earlier comment here said it did. Confirm whether that is intentional. | |||
local function check_red_link_inflection_parts(data, parts, plpos) | |||
for _, part in ipairs(parts) do | |||
-- A part may be given as a bare term; normalize it to table form. | |||
if type(part) ~= "table" then | |||
part = {term = part} | |||
end | |||
local term = part.term | |||
-- Terms that already contain "[[" (explicit wikilinks) are not checked. | |||
if term and not term:find("%[%[") then | |||
-- Script comes from the part, falling back to the inflection as a whole. | |||
-- Presumably `get_link_page` resolves the term to the page name it would link to; verify against its definition. | |||
local stripped_physical_term = get_link_page(term, data.lang, part.sc or parts.sc or nil) | |||
if stripped_physical_term then | |||
local title = mw.title.new(stripped_physical_term) | |||
-- A valid title with no content is treated as a red link. | |||
if title and not title:getContent() then | |||
return true | |||
end | |||
end | |||
end | |||
-- Recurse into inflections nested under this part. | |||
if part.inflections then | |||
if check_red_link_inflections(data, part.inflections, plpos) then | |||
return true | |||
end | |||
end | |||
end | |||
return false | |||
end | |||
-- Check a set of inflections (each of which describes a single inflection of the term, such as feminine or plural, and | |||
-- consists of a label and zero or more terms, each possibly with nested inflections) for red links. Returns true as | |||
-- soon as any inflection is found to contain a red link (stopping further processing); returns false if none does, or | |||
-- if `inflections` is nil or has no first element. | |||
-- NOTE(review): `plpos` (the plural part of speech) is only forwarded to the per-inflection check; no red-link | |||
-- category is inserted here, although the earlier comment said one was. Confirm whether that is intentional. | |||
function check_red_link_inflections(data, inflections, plpos) | |||
if inflections and inflections[1] then | |||
-- Check each inflection individually. The loop index `key` is not used. | |||
for key, infl in ipairs(inflections) do | |||
if check_red_link_inflection_parts(data, infl, plpos) then | |||
return true | |||
end | |||
end | |||
end | |||
return false | |||
end | |||
-- Check the top-level inflections in `data.inflections`, along with any nested inflections, for red links. Returns | |||
-- true if a red link is found and false otherwise; `plpos` (the plural part of speech) is forwarded unchanged. | |||
-- NOTE(review): no red-link category is inserted anywhere along this call chain, and the visible caller discards the | |||
-- return value, so as written this check appears to have no effect on the categories. Confirm whether intended. | |||
local function check_red_link_inflections_top_level(data, plpos) | |||
return check_red_link_inflections(data, data.inflections, plpos) | |||
end | end | ||
| Line 814: | Line 909: | ||
local escaped_langname = pattern_escape(full_langname) | local escaped_langname = pattern_escape(full_langname) | ||
local matches_lang_pattern = "^" .. escaped_langname .. " " | local matches_lang_pattern = "^" .. escaped_langname .. " " | ||
for _, cat in ipairs(data.categories) do | |||
-- Does the category begin with the language name? If not, tag it with a tracking category. | |||
if not cat:find(matches_lang_pattern) then | |||
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/no lang category]] | |||
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/no lang category/LANGCODE]] | |||
track("no lang category", data.lang) | |||
end | |||
end | |||
-- If `pos_category` not given, try to infer it from the first specified category. If this doesn't work, we | -- If `pos_category` not given, try to infer it from the first specified category. If this doesn't work, we | ||
| Line 844: | Line 947: | ||
-- add an appropriate category. | -- add an appropriate category. | ||
local postype = export.pos_lemma_or_nonlemma(data.pos_category) | local postype = export.pos_lemma_or_nonlemma(data.pos_category) | ||
local main_cat = data.lang:getMainCategoryName() | |||
if not postype then | |||
elseif not data.noposcat then | |||
if postype:match("^lemma") and main_cat ~= "lemma" then | |||
postype = main_cat | |||
end | |||
insert(data.categories, 1, full_langname .. " " .. postype .. "s") | |||
end | |||
insert(data.categories, 1, "Contionary") | |||
-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]] | -- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]] | ||
| Line 882: | Line 992: | ||
end | end | ||
if is_reconstructed then | if is_reconstructed and not data.lang:hasType("conlang") then | ||
default_head = "*" .. default_head | default_head = "*" .. default_head | ||
end | end | ||
| Line 935: | Line 1,045: | ||
local auto_sc = data.lang:findBestScript(head.term) | local auto_sc = data.lang:findBestScript(head.term) | ||
if not (head.sc or data.sc) then -- No script code given, so use autodetected script. | if not (head.sc or data.sc) then -- No script code given, so use autodetected script. | ||
head.sc = auto_sc | head.sc = auto_sc | ||
| Line 979: | Line 1,083: | ||
end | end | ||
local automated_tr | local automated_tr = data.lang:transliterate(text, head.sc) | ||
if automated_tr | if automated_tr then | ||
local manual_tr = head.tr | local manual_tr = head.tr | ||
if not manual_tr then | if not manual_tr then | ||
head.tr = automated_tr | head.tr = automated_tr | ||
end | end | ||
end | end | ||
| Line 1,073: | Line 1,175: | ||
------------ 9. Insert additional categories. ------------ | ------------ 9. Insert additional categories. ------------ | ||
if data.force_cat_output then | |||
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/force cat output]] | |||
track("force cat output") | |||
end | |||
if has_redundant_head_param then | |||
if not data.no_redundant_head_cat then | |||
-- This is not the right way to go about this; too many exceptions and problems due to language-specific headword | |||
-- handling customization. If we want this, it should be opt-in by a given language passing in the default headword. | |||
-- insert(data.categories, full_langname .. " terms with redundant head parameter") | |||
end | |||
end | |||
-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms". | |||
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then | if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then | ||
local no_multiword_cat = m_headword_data.no_multiword_cat | local no_multiword_cat = m_headword_data.no_multiword_cat | ||
| Line 1,088: | Line 1,203: | ||
insert(data.categories, full_langname .. " multiword terms") | insert(data.categories, full_langname .. " multiword terms") | ||
elseif not is_multiword then | elseif not is_multiword then | ||
local long_word_threshold = m_headword_data.long_word_thresholds[langcode] | local long_word_threshold = m_headword_data.long_word_thresholds[langcode] or | ||
m_headword_data.long_word_thresholds[full_langcode] | |||
if long_word_threshold and ulen(page.pagename) >= long_word_threshold then | if long_word_threshold and ulen(page.pagename) >= long_word_threshold then | ||
insert(data.categories, "Long " .. full_langname .. " words") | insert(data.categories, "Long " .. full_langname .. " words") | ||
| Line 1,096: | Line 1,212: | ||
end | end | ||
if data.sccat then | local default_sccat = m_headword_data.default_sccat | ||
if data.sccat or data.sccat == nil and (default_sccat[langcode] or default_sccat[full_langcode]) then | |||
for _, head in ipairs(data.heads) do | for _, head in ipairs(data.heads) do | ||
insert(data.categories, full_langname .. " " .. data.pos_category .. " in " .. | insert(data.categories, full_langname .. " " .. data.pos_category .. " in " .. | ||
| Line 1,109: | Line 1,226: | ||
-- values. | -- values. | ||
local characters_to_ignore = { | local characters_to_ignore = { | ||
["aaq"] = " | ["aaq"] = "αάὰ", -- Penobscot (Algonquian) | ||
["acy"] = "δθ", -- Cypriot Arabic | ["acy"] = "δθ", -- Cypriot Arabic | ||
["anc"] = "γ", -- Ngas | ["aez"] = "β", -- Aeka (Trans-New Guinea) | ||
["aou"] = "χ", -- A'ou | ["anc"] = "γ", -- Ngas (Chadic/Afroasiatic) | ||
["awg"] = "β", -- Anguthimri | ["aou"] = "χ", -- A'ou (Kra-Dai) | ||
["bhp"] = "β", -- Bima | ["art-blk"] = "ч", -- Bolak (conlang) | ||
["byk"] = "θ", -- Biao | ["awg"] = "β", -- Anguthimri (Pama-Nyungan) | ||
["cdy"] = "θ", -- Chadong | ["az"] = "ь", -- Azerbaijani (Turkic; Yañalif Latin spelling, c. 1928 - 1938) | ||
["clm"] = "χ", -- Klallam | ["ba"] = "ь", -- Bashkir (Turkic; Yañalif Latin spelling, c. 1928 - 1938) | ||
["col"] = "χ", -- Colombia-Wenatchi | ["bhp"] = "β", -- Bima (Austronesian) | ||
["coo"] = " | ["bjz"] = "β", -- Baruga (Trans-New Guinea) | ||
["ets"] = "θ", -- Yekhee | ["byk"] = "θ", -- Biao (Kra-Dai) | ||
["gmw-gts"] = "χ", -- Gottscheerish | ["cdy"] = "θ", -- Chadong (Kra-Dai) | ||
["hur"] = " | ["chp"] = "θ", -- Chipewyan (Athabaskan) | ||
["izh"] = "ь", -- Ingrian | ["cjh"] = "χ", -- Upper Chehalis (Salishan) | ||
["kic"] = "θ", -- Kickapoo | ["clm"] = "χ", -- Klallam (Salishan) | ||
["lil"] = "χ", -- Lillooet | ["col"] = "χ", -- Colombia-Wenatchi (Salishan) | ||
["coo"] = "χθ", -- Comox (Salishan) | |||
["crx"] = "θ", -- Carrier (Athabaskan) | |||
["ets"] = "θ", -- Yekhee (Edoid/Niger-Congo) | |||
["ett"] = "χ", -- Etruscan (isolate; in romanizations) | |||
["fla"] = "χ", -- Montana Salish (Salishan) | |||
["grt"] = "་", -- Garo (South Asian Sino-Tibetan) | |||
["gmw-gts"] = "χ", -- Gottscheerish (Bavarian variant spoken in Slovenia) | |||
["hur"] = "χθ", -- Halkomelem (Salishan) | |||
["itc-psa"] = "f", -- Pre-Samnite (Italic; normally written in Greek) | |||
["izh"] = "ь", -- Ingrian (Finnic) | |||
["kic"] = "θ", -- Kickapoo (Algonquian) | |||
["kk"] = "ь", -- Kazakh (Turkic; Yañalif Latin spelling, c. 1928 - 1938) | |||
["ky"] = "ь", -- Kyrgyz (Turkic; Yañalif Latin spelling, c. 1928 - 1938) | |||
["lil"] = "χ", -- Lillooet (Salishan) | |||
["lsi"] = "ꓹ", -- Lashi (Lolo-Burmese/Sino-Tibetan; represents a glottal stop) | |||
["mhz"] = "β", -- Mor (Austronesian) | ["mhz"] = "β", -- Mor (Austronesian) | ||
["neg"]= " | ["mqn"] = "β", -- Moronene (Austronesian) | ||
["oui"] = "γβ", -- Old Uyghur | ["neg"]= "ӡā", -- Negidal (Tungusic; normally in Cyrillic) | ||
["pox"] = "χ", -- Polabian | ["oka"] = "χ", -- Okanagan (Salishan) | ||
["rom"] = "Θθ", -- Romani: International Standard; two different thetas??? | ["ole"] = "θ", -- Olekha (Sino-Tibetan) | ||
["sah"] = "ь", -- Yakut (1929 - 1939 Latin spelling) | ["oui"] = "γβ", -- Old Uyghur (Turkic; FIXME: others? E.g. Greek delta (δ)?) | ||
["sjw"] = "θ", -- Shawnee | ["pox"] = "χ", -- Polabian (West Slavic) | ||
["squ"] = "χ", -- Squamish | ["rif"] = "ε", -- Tarifit (Berber) | ||
["str"] = "χθ", -- Saanich; | ["rom"] = "Θθ", -- Romani (Indic: International Standard; two different thetas???) | ||
["twa"] = "χ", -- Twana | ["rpn"] = "β", -- Repanbitip (Austronesian) | ||
["yha"] = "θ", -- Baha | ["sah"] = "ь", -- Yakut (Turkic; 1929 - 1939 Latin spelling) | ||
["za"] = "зч", -- Zhuang; 1957-1982 alphabet used two Cyrillic letters (as well as some others like | ["sit-jap"] = "χ", -- Japhug (Sino-Tibetan) | ||
["sjw"] = "θ", -- Shawnee (Algonquian) | |||
["squ"] = "χ", -- Squamish (Salishan) | |||
["str"] = "χθ", -- Saanich (Salishan) | |||
["teh"] = "χ", -- Tehuelche (Chonan; spoken in Argentina) | |||
["tep"] = "η", -- Tepecano (Uto-Aztecan) | |||
["thp"] = "χ", -- Thompson (Salishan) | |||
["tk"] = "ь", -- Turkmen (Turkic; Yañalif Latin spelling, c. 1928 - 1938) | |||
["tt"] = "ь", -- Tatar (Turkic; Yañalif Latin spelling, c. 1928 - 1938) | |||
["twa"] = "χ", -- Twana (Salishan) | |||
["wbl"] = "ы", -- Wakhi (Iranian) | |||
["xbc"] = "ϸ", -- Bactrian (Iranian; represents š; normally written in Greek) | |||
["yha"] = "θ", -- Baha (Kra-Dai) | |||
["za"] = "зч", -- Zhuang (Tai/Kra-Dai); 1957-1982 alphabet used two Cyrillic letters (as well as some others like | |||
-- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin) | -- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin) | ||
["zlw-slv"] = "χђћ", -- Slovincian; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure | ["zlw-slv"] = "χђћ", -- Slovincian (West Slavic; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure | ||
-- the currect characters are being chosen in the entry names | -- the currect characters are being chosen in the entry names) | ||
["zng"] = "θ", -- Mang | ["zng"] = "θ", -- Mang (Mon-Khmer) | ||
["ztp"] = "θ", -- Loxicha Zapotec (Zapotecan) | |||
} | } | ||
-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude | -- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude | ||
| Line 1,279: | Line 1,425: | ||
insert(data.categories, full_langname .. " terms spelled with " .. character) | insert(data.categories, full_langname .. " terms spelled with " .. character) | ||
end | end | ||
end | end | ||
end | end | ||
| Line 1,296: | Line 1,434: | ||
and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then | and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then | ||
insert(data.categories, full_langname .. " palindromes") | insert(data.categories, full_langname .. " palindromes") | ||
end | |||
if namespace == "" and not lang_reconstructed then | |||
for _, head in ipairs(data.heads) do | |||
if page.full_raw_pagename ~= get_link_page(remove_links(head.term), data.lang, head.sc) then | |||
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch]] | |||
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch/LANGCODE]] | |||
track("pagename spelling mismatch", data.lang) | |||
break | |||
end | |||
end | |||
end | |||
-- Add red link category if called for and we're not a "large" page, where such checks are disabled. | |||
if data.checkredlinks and not m_headword_data.large_pages[m_headword_data.pagename] then | |||
local plposcat = type(data.checkredlinks) == "string" and data.checkredlinks or data.pos_category | |||
check_red_link_inflections_top_level(data, plposcat) | |||
end | end | ||