Module:headword: Difference between revisions

No edit summary
No edit summary
 
(5 intermediate revisions by the same user not shown)
Line 174: Line 174:
-- If set to true, categories always appear, even in non-mainspace pages
-- If set to true, categories always appear, even in non-mainspace pages
local test_force_categories = false
local test_force_categories = false


local function text_in_script(text, script_code)
local function text_in_script(text, script_code)
Line 392: Line 393:
local saw_translit_page = false
local saw_translit_page = false


if transliteration_page and transliteration_page.exists then
if transliteration_page and transliteration_page:getContent() then
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
saw_translit_page = true
saw_translit_page = true
Line 402: Line 403:
transliteration_page = new_title(langname .. " transliteration")
transliteration_page = new_title(langname .. " transliteration")


if transliteration_page and transliteration_page.exists then
if transliteration_page and transliteration_page:getContent() then
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
end
end
Line 539: Line 540:




-- Format the inflections following the headword or nested after a given inflection.
-- Format the inflections following the headword or nested after a given inflection. Declared local above.
format_inflections = function(data, inflections)
function format_inflections(data, inflections)
if inflections and inflections[1] then
if inflections and inflections[1] then
-- Format each inflection individually.
-- Format each inflection individually.
Line 563: Line 564:
return result
return result
end
end
end
-- Forward declaration: `check_red_link_inflection_parts` and `check_red_link_inflections` call each other, so the
-- latter must already exist as a local when the former is compiled.
local check_red_link_inflections

-- Look for red links in a single inflection (a label plus zero or more terms, any of which may carry nested
-- inflections). Each entry of `parts` is either a bare term or a table with a `term` field and optional `sc` and
-- `inflections` fields. A term counts as a red link when a title object can be built for its link page but
-- `getContent()` on that title returns nothing. On the first red link found, a category built from `plpos` (the plural
-- part of speech) is appended to `data.categories`, scanning stops, and true is returned. Returns false when no red
-- link is found.
local function check_red_link_inflection_parts(data, parts, plpos)
	for _, entry in ipairs(parts) do
		-- Normalize a bare term into table form so both shapes are handled alike.
		local spec = type(entry) == "table" and entry or {term = entry}
		local term = spec.term
		-- Terms that already contain "[[" are not checked.
		if term and not term:find("%[%[") then
			local page = get_link_page(term, data.lang, spec.sc or parts.sc or nil)
			local title = page and mw.title.new(page)
			if title and not title:getContent() then
				insert(data.categories, data.lang:getFullName() .. " " .. plpos .. " with red links in their headword lines")
				return true
			end
		end
		-- Descend into inflections nested under this term.
		if spec.inflections and check_red_link_inflections(data, spec.inflections, plpos) then
			return true
		end
	end
	return false
end
-- Look for red links across a list of inflections. Each element describes one inflection of the term (such as
-- feminine or plural) and consists of a label and zero or more terms, each possibly with nested inflections. The
-- elements are examined in order; as soon as one of them yields a red link (in which case the red-link category based
-- on `plpos`, the plural part of speech, has been inserted by the per-inflection check), processing stops and true is
-- returned. Returns false if `inflections` is missing or empty, or if no red links are found.
function check_red_link_inflections(data, inflections, plpos)
	-- Nothing to check when there is no list or it has no first element.
	if not (inflections and inflections[1]) then
		return false
	end
	for _, inflection in ipairs(inflections) do
		if check_red_link_inflection_parts(data, inflection, plpos) then
			return true
		end
	end
	return false
end
-- Entry point for red-link checking: examines the top-level inflections stored in `data.inflections`, together with
-- any inflections nested beneath them. If a red link is found, a red-link category based on `plpos` (the plural part
-- of speech to insert in the category) is inserted, further processing stops, and true is returned; otherwise false
-- is returned.
local function check_red_link_inflections_top_level(data, plpos)
	local top_level_inflections = data.inflections
	return check_red_link_inflections(data, top_level_inflections, plpos)
end
end


Line 858: Line 918:
-- add an appropriate category.
-- add an appropriate category.
local postype = export.pos_lemma_or_nonlemma(data.pos_category)
local postype = export.pos_lemma_or_nonlemma(data.pos_category)
if not data.noposcat then
if not postype then
elseif not data.noposcat then
insert(data.categories, 1, full_langname .. " " .. postype .. "s")
insert(data.categories, 1, full_langname .. " " .. postype .. "s")
end
end
Line 960: Line 1,021:
if not head.sc then -- Overall script code given.
if not head.sc then -- Overall script code given.
head.sc = data.sc
head.sc = data.sc
end
-- Track uses of sc parameter.
if head.sc:getCode() == auto_sc:getCode() then
if not data.no_script_code_cat then
insert(data.categories, full_langname .. " terms with redundant script codes")
end
else
if not data.no_script_code_cat then
insert(data.categories, full_langname .. " terms with non-redundant manual script codes")
end
end
end
end
end
Line 993: Line 1,064:
end
end


local automated_tr, tr_categories
local automated_tr = data.lang:transliterate(text, head.sc)
automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)


if automated_tr or head.tr_fail then
if automated_tr then
local manual_tr = head.tr
local manual_tr = head.tr


if manual_tr then
if manual_tr then
if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then
if remove_links(manual_tr) == remove_links(automated_tr) then
insert(data.categories, full_langname .. " terms with redundant transliterations")
insert(data.categories, full_langname .. " terms with redundant transliterations")
elseif not head.tr_fail then
else
insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
end
end
Line 1,009: Line 1,079:
if not manual_tr then
if not manual_tr then
head.tr = automated_tr
head.tr = automated_tr
extend(data.categories, tr_categories)
end
end
end
end
Line 1,097: Line 1,166:
if has_redundant_head_param then
if has_redundant_head_param then
if not data.no_redundant_head_cat then
if not data.no_redundant_head_cat then
insert(data.categories, full_langname .. " terms with redundant head parameter")
-- This is not the right way to go about this; too many exceptions and problems due to language-specific headword
-- handling customization. If we want this, it should be opt-in by a given language passing in the default headword.
-- insert(data.categories, full_langname .. " terms with redundant head parameter")
end
end
end
end
Line 1,116: Line 1,187:
insert(data.categories, full_langname .. " multiword terms")
insert(data.categories, full_langname .. " multiword terms")
elseif not is_multiword then
elseif not is_multiword then
local long_word_threshold = m_headword_data.long_word_thresholds[langcode]
local long_word_threshold = m_headword_data.long_word_thresholds[langcode] or
m_headword_data.long_word_thresholds[full_langcode]
if long_word_threshold and ulen(page.pagename) >= long_word_threshold then
if long_word_threshold and ulen(page.pagename) >= long_word_threshold then
insert(data.categories, "Long " .. full_langname .. " words")
insert(data.categories, "Long " .. full_langname .. " words")
Line 1,124: Line 1,196:
end
end


if data.sccat then
local default_sccat = m_headword_data.default_sccat
if data.sccat or data.sccat == nil and (default_sccat[langcode] or default_sccat[full_langcode]) then
for _, head in ipairs(data.heads) do
for _, head in ipairs(data.heads) do
insert(data.categories, full_langname .. " " .. data.pos_category .. " in " ..
insert(data.categories, full_langname .. " " .. data.pos_category .. " in " ..
Line 1,137: Line 1,210:
-- values.
-- values.
local characters_to_ignore = {
local characters_to_ignore = {
["aaq"] = "α", -- Penobscot
["aaq"] = "αάὰ", -- Penobscot (Algonquian)
["acy"] = "δθ", -- Cypriot Arabic
["acy"] = "δθ", -- Cypriot Arabic
["anc"] = "γ", -- Ngas
["aez"] = "β", -- Aeka (Trans-New Guinea)
["aou"] = "χ", -- A'ou
["anc"] = "γ", -- Ngas (Chadic/Afroasiatic)
["awg"] = "β", -- Anguthimri
["aou"] = "χ", -- A'ou (Kra-Dai)
["bhp"] = "β", -- Bima
["art-blk"] = "ч", -- Bolak (conlang)
["byk"] = "θ", -- Biao
["awg"] = "β", -- Anguthimri (Pama-Nyungan)
["cdy"] = "θ", -- Chadong
["az"] = "ь", -- Azerbaijani (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["clm"] = "χ", -- Klallam
["ba"] = "ь", -- Bashkir (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["col"] = "χ", -- Colombia-Wenatchi
["bhp"] = "β", -- Bima (Austronesian)
["coo"] = "χ", -- Comox; FIXME: others? E.g. Greek theta (θ)?
["bjz"] = "β", -- Baruga (Trans-New Guinea)
["ets"] = "θ", -- Yekhee
["byk"] = "θ", -- Biao (Kra-Dai)
["gmw-gts"] = "χ", -- Gottscheerish
["cdy"] = "θ", -- Chadong (Kra-Dai)
["hur"] = "θ", -- Halkomelem
["chp"] = "θ", -- Chipewyan (Athabaskan)
["izh"] = "ь", -- Ingrian
["cjh"] = "χ", -- Upper Chehalis (Salishan)
["kic"] = "θ", -- Kickapoo
["clm"] = "χ", -- Klallam (Salishan)
["lil"] = "χ", -- Lillooet
["col"] = "χ", -- Colombia-Wenatchi (Salishan)
["coo"] = "χθ", -- Comox (Salishan)
["crx"] = "θ", -- Carrier (Athabaskan)
["ets"] = "θ", -- Yekhee (Edoid/Niger-Congo)
["ett"] = "χ", -- Etruscan (isolate; in romanizations)
["fla"] = "χ", -- Montana Salish (Salishan)
["grt"] = "་", -- Garo (South Asian Sino-Tibetan)
["gmw-gts"] = "χ", -- Gottscheerish (Bavarian variant spoken in Slovenia)
["hur"] = "χθ", -- Halkomelem (Salishan)
["itc-psa"] = "f", -- Pre-Samnite (Italic; normally written in Greek)
["izh"] = "ь", -- Ingrian (Finnic)
["kic"] = "θ", -- Kickapoo (Algonquian)
["kk"] = "ь", -- Kazakh (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["ky"] = "ь", -- Kyrgyz (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["lil"] = "χ", -- Lillooet (Salishan)
["lsi"] = "ꓹ", -- Lashi (Lolo-Burmese/Sino-Tibetan; represents a glottal stop)
["mhz"] = "β", -- Mor (Austronesian)
["mhz"] = "β", -- Mor (Austronesian)
["neg"]=  "ӡ", -- Negidal (normally in Cyrillic)
["mqn"] = "β", -- Moronene (Austronesian)
["oui"] = "γβ", -- Old Uyghur: FIXME: others? E.g. Greek delta (δ)?
["neg"]=  "ӡā", -- Negidal (Tungusic; normally in Cyrillic)
["pox"] = "χ", -- Polabian
["oka"] = "χ", -- Okanagan (Salishan)
["rom"] = "Θθ", -- Romani: International Standard; two different thetas???
["ole"] = "θ", -- Olekha (Sino-Tibetan)
["sah"] = "ь", -- Yakut (1929 - 1939 Latin spelling)
["oui"] = "γβ", -- Old Uyghur (Turkic; FIXME: others? E.g. Greek delta (δ)?)
["sjw"] = "θ", -- Shawnee
["pox"] = "χ", -- Polabian (West Slavic)
["squ"] = "χ", -- Squamish
["rif"] = "ε", -- Tarifit (Berber)
["str"] = "χθ", -- Saanich; uses two Greek letters
["rom"] = "Θθ", -- Romani (Indic: International Standard; two different thetas???)
["twa"] = "χ", -- Twana
["rpn"] = "β", -- Repanbitip (Austronesian)
["yha"] = "θ", -- Baha
["sah"] = "ь", -- Yakut (Turkic; 1929 - 1939 Latin spelling)
["za"] = "зч", -- Zhuang; 1957-1982 alphabet used two Cyrillic letters (as well as some others like
["sit-jap"] = "χ", -- Japhug (Sino-Tibetan)
["sjw"] = "θ", -- Shawnee (Algonquian)
["squ"] = "χ", -- Squamish (Salishan)
["str"] = "χθ", -- Saanich (Salishan)
["teh"] = "χ", -- Tehuelche (Chonan; spoken in Argentina)
["tep"] = "η", -- Tepecano (Uto-Aztecan)
["thp"] = "χ", -- Thompson (Salishan)
["tk"] = "ь", -- Turkmen (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["tt"] = "ь", -- Tatar (Turkic; Yañalif Latin spelling, c. 1928 - 1938)
["twa"] = "χ", -- Twana (Salishan)
["wbl"] = "ы", -- Wakhi (Iranian)
["xbc"] = "ϸ", -- Bactrian (Iranian; represents š; normally written in Greek)
["yha"] = "θ", -- Baha (Kra-Dai)
["za"] = "зч", -- Zhuang (Tai/Kra-Dai); 1957-1982 alphabet used two Cyrillic letters (as well as some others like
  -- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin)
  -- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin)
["zlw-slv"] = "χђћ", -- Slovincian; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure
["zlw-slv"] = "χђћ", -- Slovincian (West Slavic; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure
-- the correct characters are being chosen in the entry names
-- the correct characters are being chosen in the entry names)
["zng"] = "θ", -- Mang
["zng"] = "θ", -- Mang (Mon-Khmer)
["ztp"] = "θ", -- Loxicha Zapotec (Zapotecan)
}
}
-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude
-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude
Line 1,324: Line 1,426:
and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
insert(data.categories, full_langname .. " palindromes")
insert(data.categories, full_langname .. " palindromes")
end
-- Add red link category if called for and we're not a "large" page, where such checks are disabled.
if data.checkredlinks and not m_headword_data.large_pages[m_headword_data.pagename] then
local plposcat = type(data.checkredlinks) == "string" and data.checkredlinks or data.pos_category
check_red_link_inflections_top_level(data, plposcat)
end
end