Module:siwa-headword: Difference between revisions

no edit summary
No edit summary
No edit summary
Line 1: Line 1:
local export = {}
local export = {}
-- Pattern for a run of one or more whitespace or punctuation characters;
-- such runs are treated as candidate word breaks.
local spacingPunctuation = "[%s%p]+"
--[[ Punctuation or spacing characters that may occur inside a word.
	They are listed here so they can be excluded from the candidate word
	breaks matched by the pattern above. ]]
local wordPunc = "-־׳״'.·*’་"
-- Pattern for a run of characters that are NOT word-internal punctuation.
local notWordPunc = ("[^%s]+"):format(wordPunc)
-- Set of part-of-speech categories (plural form) that count as lemmas.
-- This is a set keyed by category name, not an array, because callers test
-- membership with `isLemma[name]`; an array of strings would make every such
-- lookup return nil.
local isLemma = {
	["abbreviations"] = true,
	["acronyms"] = true,
	["adjectives"] = true,
	["adnominals"] = true,
	["adpositions"] = true,
	["adverbs"] = true,
	["affixes"] = true,
	["ambipositions"] = true,
	["articles"] = true,
	["circumfixes"] = true,
	["circumpositions"] = true,
	["classifiers"] = true,
	["cmavo"] = true,
	["cmavo clusters"] = true,
	["cmene"] = true,
	["combining forms"] = true,
	["conjunctions"] = true,
	["counters"] = true,
	["determiners"] = true,
	["diacritical marks"] = true,
	["equative adjectives"] = true,
	["fu'ivla"] = true,
	["gismu"] = true,
	["Han characters"] = true,
	["Han tu"] = true,
	["hanzi"] = true,
	["hanja"] = true,
	["ideophones"] = true,
	["idioms"] = true,
	["infixes"] = true,
	["interfixes"] = true,
	["initialisms"] = true,
	["interjections"] = true,
	["kanji"] = true,
	["letters"] = true,
	["ligatures"] = true,
	["lujvo"] = true,
	["morphemes"] = true,
	["non-constituents"] = true,
	["nouns"] = true,
	["numbers"] = true,
	["numeral symbols"] = true,
	["numerals"] = true,
	["particles"] = true,
	["phrases"] = true,
	["postpositions"] = true,
	["postpositional phrases"] = true,
	["predicatives"] = true,
	["prefixes"] = true,
	["prepositions"] = true,
	["prepositional phrases"] = true,
	["preverbs"] = true,
	["pronominal adverbs"] = true,
	["pronouns"] = true,
	["proverbs"] = true,
	["proper nouns"] = true,
	["punctuation marks"] = true,
	["relatives"] = true,
	["roots"] = true,
	["stems"] = true,
	["suffixes"] = true,
	["syllables"] = true,
	["symbols"] = true,
	["verbs"] = true,
}
-- Set of part-of-speech categories (plural form) that count as non-lemma
-- forms. This is a set keyed by category name, not an array, because callers
-- test membership with `isNonLemma[name]`; an array of strings would make
-- every such lookup return nil.
-- NOTE(review): "equative adjectives" is also listed in isLemma — confirm
-- which classification is intended.
local isNonLemma = {
	["active participles"] = true,
	["adjectival participles"] = true,
	["adjective forms"] = true,
	["adjective feminine forms"] = true,
	["adjective plural forms"] = true,
	["adverb forms"] = true,
	["adverbial participles"] = true,
	["agent participles"] = true,
	["article forms"] = true,
	["circumfix forms"] = true,
	["combined forms"] = true,
	["comparative adjective forms"] = true,
	["comparative adjectives"] = true,
	["comparative adverb forms"] = true,
	["comparative adverbs"] = true,
	["contractions"] = true,
	["converbs"] = true,
	["determiner comparative forms"] = true,
	["determiner forms"] = true,
	["determiner superlative forms"] = true,
	["diminutive nouns"] = true,
	["equative adjective forms"] = true,
	["equative adjectives"] = true,
	["future participles"] = true,
	["gerunds"] = true,
	["infinitive forms"] = true,
	["infinitives"] = true,
	["interjection forms"] = true,
	["jyutping"] = true,
	["kanji readings"] = true,
	["misspellings"] = true,
	["negative participles"] = true,
	["nominal participles"] = true,
	["noun case forms"] = true,
	["noun dual forms"] = true,
	["noun forms"] = true,
	["noun plural forms"] = true,
	["noun possessive forms"] = true,
	["noun singulative forms"] = true,
	["numeral forms"] = true,
	["participles"] = true,
	["participle forms"] = true,
	["particle forms"] = true,
	["passive participles"] = true,
	["past active participles"] = true,
	["past participles"] = true,
	["past participle forms"] = true,
	["past passive participles"] = true,
	["perfect active participles"] = true,
	["perfect participles"] = true,
	["perfect passive participles"] = true,
	["pinyin"] = true,
	["plurals"] = true,
	["postposition forms"] = true,
	["prefix forms"] = true,
	["preposition contractions"] = true,
	["preposition forms"] = true,
	["prepositional pronouns"] = true,
	["present active participles"] = true,
	["present participles"] = true,
	["present passive participles"] = true,
	["pronoun forms"] = true,
	["pronoun possessive forms"] = true,
	["proper noun forms"] = true,
	["proper noun plural forms"] = true,
	["rafsi"] = true,
	["romanizations"] = true,
	["root forms"] = true,
	["singulatives"] = true,
	["suffix forms"] = true,
	["superlative adjective forms"] = true,
	["superlative adjectives"] = true,
	["superlative adverb forms"] = true,
	["superlative adverbs"] = true,
	["verb forms"] = true,
	["verbal nouns"] = true,
}


-- The main entry point.
-- The main entry point.
Line 16: Line 169:
local postype = args["type"]; if postype == "" then postype = nil end
local postype = args["type"]; if postype == "" then postype = nil end
local data = {lang = lang, pos_category = (postype and postype .. " " or "") .. poscat, categories = {}, heads = {head}, genders = {}, inflections = {}}
local data = {pos_category = (postype and postype .. " " or "") .. poscat, categories = {}, heads = {head}, genders = {}, inflections = {}}
if poscat == "adjectives" then
if poscat == "adjectives" then
Line 114: Line 267:
end
end


function export.full_headword(data)
 
local tracking_categories = {}
 
 
-- Format a headword with transliterations
local function format_headword(data)
for i, head in ipairs(data.heads) do
-- Apply processing to the headword, for formatting links and such
if head:find("[[", nil, true) then
head = {term = head, lang = data.lang}
end
data.heads[i] = head
end
-- Script-tags the topmost header.
return table.concat(data.heads, " <i>or</i> ")
local pagename = title.text
end
local fullPagename = title.fullText
 
local namespace = title.nsText
-- Add links to a multiword head.
function export.add_multiword_links(head)
local function workaround_to_exclude_chars(s)
return mw.ustring.gsub(s, notWordPunc, "]]%1[[Contionary:")
end
if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
head = "[[Contionary:"
error("In data, the first argument to full_headword, data.lang should be a language object.")
.. mw.ustring.gsub(
end
head,
spacingPunctuation,
workaround_to_exclude_chars
)
.. "]]"
head = mw.ustring.gsub(head, "%[%[%]%]", "")
return head
end
 
-- Return true if the given head is multiword according to the algorithm used
-- in full_headword().
function export.head_is_multiword(head)
if not data.sc then
for possibleWordBreak in mw.ustring.gmatch(head, spacingPunctuation) do
data.sc = require("Module:scripts").findBestScript(data.heads and data.heads[1] ~= "" and data.heads[1] or pagename, data.lang)
if mw.ustring.find(possibleWordBreak, notWordPunc) then
else
return true
-- Track uses of sc parameter
local best = require("Module:scripts").findBestScript(pagename, data.lang)
require("Module:debug").track("headword/sc")
if data.sc:getCode() == best:getCode() then
require("Module:debug").track("headword/sc/redundant")
require("Module:debug").track("headword/sc/redundant/" .. data.sc:getCode())
else
require("Module:debug").track("headword/sc/needed")
require("Module:debug").track("headword/sc/needed/" .. data.sc:getCode())
end
end
end
return false
end
local function preprocess(data, postype)
if type(data.heads) ~= "table" then
data.heads = { data.heads }
end
end
local displayTitle
if not data.heads or #data.heads == 0 then
-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
data.heads = {""}
-- Avoid tagging ASCII as Hani even when it is tagged as Hani in the
-- headword, as in [[check]]. The check for ASCII might need to be expanded
-- to a check for any Latin characters and whitespace or punctuation.
if (namespace == "" and data.sc and toBeTagged[data.sc:getCode()]
and not pagename:find "^[%z\1-\127]+$")
or (data.sc:getCode() == "Jpan" and (test_script(pagename, "Hira") or test_script(pagename, "Kana"))) then
displayTitle = '<span class="' .. data.sc:getCode() .. '">' .. pagename .. '</span>'
elseif namespace == "Reconstruction" then
displayTitle, matched = mw.ustring.gsub(
fullPagename,
"^(Reconstruction:[^/]+/)(.+)$",
function(before, term)
return before ..
require("Module:script utilities").tag_text(
term,
data.lang,
data.sc
)
end
)
if matched == 0 then
displayTitle = nil
end
end
end
if displayTitle then
local default_head = mw.title.getCurrentTitle().text
local frame = mw.getCurrentFrame()
local unmodified_default_head = default_head
frame:callParserFunction(
 
"DISPLAYTITLE",
-- Add links to multi-word page names when appropriate
displayTitle
if export.head_is_multiword(default_head) then
)
default_head = export.add_multiword_links(default_head)
end
end
if data.force_cat_output then
-- If a head is the empty string "", then replace it with the default
--[=[
for i, head in ipairs(data.heads) do
[[Special:WhatLinksHere/Template:tracking/headword/force cat output]]
if head == "" then
]=]
head = default_head
require("Module:debug").track("headword/force cat output")
end
data.heads[i] = head
end
end
 
-- Classify a part of speech as a lemma or a non-lemma form.
--
-- `plpos` must be the plural form of the POS ("nouns", "prefixes", etc.). If
-- you only have the singular, call pluralize() in [[Module:string utilities]]
-- first; it knows when to add '-s' and when to add '-es'.
--
-- Returns "lemma" when `plpos` (or `plpos` with a leading "reconstructed "
-- removed) is in isLemma, and "non-lemma form" when `plpos` is in isNonLemma.
-- When it is in neither: if `best_guess` is truthy, the result is
-- "non-lemma form" for names ending in " forms" and "lemma" otherwise; if
-- `best_guess` is not given, the result is nil.
--
-- NOTE(review): this is declared without `local`, so it is a global function;
-- confirm whether that is intentional.
function pos_lemma_or_nonlemma(plpos, best_guess)
	-- Direct hit in the lemma categories.
	if isLemma[plpos] then
		return "lemma"
	end
	-- Retry without the "reconstructed " prefix. Assigning to a single local
	-- keeps only the first gsub result (the string) and drops the count.
	local unprefixed = plpos:gsub("^reconstructed ", "")
	if isLemma[unprefixed] then
		return "lemma"
	end
	-- Non-lemma categories are only consulted after the lemma checks fail.
	if isNonLemma[plpos] then
		return "non-lemma form"
	end
	-- Unknown POS: only guess when explicitly asked to.
	if not best_guess then
		return nil
	end
	if plpos:find(" forms$") then
		return "non-lemma form"
	end
	return "lemma"
end
 
local function show_headword_line(data)
local namespace = mw.title.getCurrentTitle().nsText
 
if not data.noposcat then
local pos_category = "[sS]iwa " .. data.pos_category
end
end
if data.getCanonicalName then
-- Is it a lemma category?
error('The "data" variable supplied to "full_headword" should not be a language object.')
local postype = pos_lemma_or_nonlemma(data.pos_category)
if not data.noposcat then
table.insert(data.categories, 1, "[sS]iwa " .. postype .. "s")
end
end
 
-- Preprocess
preprocess(data, postype)
-- Format and return all the gathered information
return
format_headword(data) ..
format_genders(data) ..
format_inflections(data) ..
require("Module:utilities").format_categories(
tracking_categories, data.lang, data.sort_key, nil,
data.force_cat_output or test_force_categories, data.sc
)
end
 
function full_headword(data)
local tracking_categories = {}
-- Were any categories specified?
-- Were any categories specified?
if data.categories and #data.categories > 0 then
if data.categories and #data.categories > 0 then
local lang_name = require("Module:string").pattern_escape(data.lang:getCanonicalName())
for _, cat in ipairs(data.categories) do
-- Does the category begin with the language name? If not, tag it with a tracking category.
if not mw.ustring.find(cat, "^" .. lang_name) then
mw.log(cat, data.lang:getCanonicalName())
table.insert(tracking_categories, "head tracking/no lang category")
--[=[
[[Special:WhatLinksHere/Template:tracking/head tracking/no lang category]]
]=]
require("Module:debug").track{
"headword/no lang category",
"headword/no lang category/lang/" .. data.lang:getCode()
}
end
end
if not data.pos_category
if not data.pos_category
and mw.ustring.find(data.categories[1], "^" .. data.lang:getCanonicalName())
and mw.ustring.find(data.categories[1], "^[sS]iwa")
then
then
data.pos_category = mw.ustring.gsub(data.categories[1], "^" .. data.lang:getCanonicalName() .. " ", "")
data.pos_category = mw.ustring.gsub(data.categories[1], "^[sS]iwa ", "")
table.remove(data.categories, 1)
table.remove(data.categories, 1)
end
end
Line 225: Line 412:
end
end
-- Categorise for unusual characters
local standard = data.lang:getStandardCharacters()
if standard then
if mw.ustring.len(title.subpageText) ~= 1 and not non_categorizable() then
for character in mw.ustring.gmatch(title.subpageText, "([^" .. standard .. "])") do
local upper = mw.ustring.upper(character)
if not mw.ustring.find(upper, "[" .. standard .. "]") then
character = upper
end
table.insert(
data.categories,
data.lang:getCanonicalName() .. " terms spelled with " .. character
)
end
end
end
-- Categorise for palindromes
if title.nsText ~= "Reconstruction" and mw.ustring.len(title.subpageText)>2
and require('Module:palindromes').is_palindrome(
title.subpageText, data.lang, data.sc
) then
table.insert(data.categories, data.lang:getCanonicalName() .. " palindromes")
end
-- This may add more categories (e.g. gender categories), so make sure it gets
-- This may add more categories (e.g. gender categories), so make sure it gets
-- evaluated first.
-- evaluated first.
Line 257: Line 418:
text ..
text ..
require("Module:utilities").format_categories(
require("Module:utilities").format_categories(
data.categories, data.lang, data.sort_key, nil,
data.categories, nil,
data.force_cat_output or test_force_categories, data.sc
data.force_cat_output
) ..
) ..
require("Module:utilities").format_categories(
require("Module:utilities").format_categories(
tracking_categories, data.lang, data.sort_key, nil,
tracking_categories, nil,
data.force_cat_output or test_force_categories, data.sc
data.force_cat_output
)
)
end
end


return export
return export