Module:siwa-headword: Difference between revisions
Jump to navigation
Jump to search
No edit summary |
No edit summary |
||
Line 1: | Line 1: | ||
local export = {} | local export = {} | ||
local spacingPunctuation = "[%s%p]+" | |||
--[[ List of punctuation or spacing characters that are found inside of words. | |||
Used to exclude characters from the regex above. ]] | |||
local wordPunc = "-־׳״'.·*’་" | |||
local notWordPunc = "[^" .. wordPunc .. "]+" | |||
local isLemma = { | |||
"abbreviations", | |||
"acronyms", | |||
"adjectives", | |||
"adnominals", | |||
"adpositions", | |||
"adverbs", | |||
"affixes", | |||
"ambipositions", | |||
"articles", | |||
"circumfixes", | |||
"circumpositions", | |||
"classifiers", | |||
"cmavo", | |||
"cmavo clusters", | |||
"cmene", | |||
"combining forms", | |||
"conjunctions", | |||
"counters", | |||
"determiners", | |||
"diacritical marks", | |||
"equative adjectives", | |||
"fu'ivla", | |||
"gismu", | |||
"Han characters", | |||
"Han tu", | |||
"hanzi", | |||
"hanja", | |||
"ideophones", | |||
"idioms", | |||
"infixes", | |||
"interfixes", | |||
"initialisms", | |||
"interjections", | |||
"kanji", | |||
"letters", | |||
"ligatures", | |||
"lujvo", | |||
"morphemes", | |||
"non-constituents", | |||
"nouns", | |||
"numbers", | |||
"numeral symbols", | |||
"numerals", | |||
"particles", | |||
"phrases", | |||
"postpositions", | |||
"postpositional phrases", | |||
"predicatives", | |||
"prefixes", | |||
"prepositions", | |||
"prepositional phrases", | |||
"preverbs", | |||
"pronominal adverbs", | |||
"pronouns", | |||
"proverbs", | |||
"proper nouns", | |||
"punctuation marks", | |||
"relatives", | |||
"roots", | |||
"stems", | |||
"suffixes", | |||
"syllables", | |||
"symbols", | |||
"verbs", | |||
} | |||
local isNonLemma = { | |||
"active participles", | |||
"adjectival participles", | |||
"adjective forms", | |||
"adjective feminine forms", | |||
"adjective plural forms", | |||
"adverb forms", | |||
"adverbial participles", | |||
"agent participles", | |||
"article forms", | |||
"circumfix forms", | |||
"combined forms", | |||
"comparative adjective forms", | |||
"comparative adjectives", | |||
"comparative adverb forms", | |||
"comparative adverbs", | |||
"contractions", | |||
"converbs", | |||
"determiner comparative forms", | |||
"determiner forms", | |||
"determiner superlative forms", | |||
"diminutive nouns", | |||
"equative adjective forms", | |||
"equative adjectives", | |||
"future participles", | |||
"gerunds", | |||
"infinitive forms", | |||
"infinitives", | |||
"interjection forms", | |||
"jyutping", | |||
"kanji readings", | |||
"misspellings", | |||
"negative participles", | |||
"nominal participles", | |||
"noun case forms", | |||
"noun dual forms", | |||
"noun forms", | |||
"noun plural forms", | |||
"noun possessive forms", | |||
"noun singulative forms", | |||
"numeral forms", | |||
"participles", | |||
"participle forms", | |||
"particle forms", | |||
"passive participles", | |||
"past active participles", | |||
"past participles", | |||
"past participle forms", | |||
"past passive participles", | |||
"perfect active participles", | |||
"perfect participles", | |||
"perfect passive participles", | |||
"pinyin", | |||
"plurals", | |||
"postposition forms", | |||
"prefix forms", | |||
"preposition contractions", | |||
"preposition forms", | |||
"prepositional pronouns", | |||
"present active participles", | |||
"present participles", | |||
"present passive participles", | |||
"pronoun forms", | |||
"pronoun possessive forms", | |||
"proper noun forms", | |||
"proper noun plural forms", | |||
"rafsi", | |||
"romanizations", | |||
"root forms", | |||
"singulatives", | |||
"suffix forms", | |||
"superlative adjective forms", | |||
"superlative adjectives", | |||
"superlative adverb forms", | |||
"superlative adverbs", | |||
"verb forms", | |||
"verbal nouns", | |||
} | |||
-- The main entry point. | -- The main entry point. | ||
Line 16: | Line 169: | ||
local postype = args["type"]; if postype == "" then postype = nil end | local postype = args["type"]; if postype == "" then postype = nil end | ||
local data = { | local data = {pos_category = (postype and postype .. " " or "") .. poscat, categories = {}, heads = {head}, genders = {}, inflections = {}} | ||
if poscat == "adjectives" then | if poscat == "adjectives" then | ||
Line 114: | Line 267: | ||
end | end | ||
function | |||
-- Format a headword with transliterations | |||
local function format_headword(data) | |||
for i, head in ipairs(data.heads) do | |||
-- Apply processing to the headword, for formatting links and such | |||
if head:find("[[", nil, true) then | |||
head = {term = head, lang = data.lang} | |||
end | |||
data.heads[i] = head | |||
end | |||
-- | return table.concat(data.heads, " <i>or</i> ") | ||
end | |||
local | |||
-- Add links to a multiword head. | |||
function export.add_multiword_links(head) | |||
local function workaround_to_exclude_chars(s) | |||
return mw.ustring.gsub(s, notWordPunc, "]]%1[[Contionary:") | |||
end | |||
head = "[[Contionary:" | |||
.. mw.ustring.gsub( | |||
head, | |||
spacingPunctuation, | |||
workaround_to_exclude_chars | |||
) | |||
.. "]]" | |||
head = mw.ustring.gsub(head, "%[%[%]%]", "") | |||
return head | |||
end | |||
-- Return true if the given head is multiword according to the algorithm used | |||
-- in full_headword(). | |||
function export.head_is_multiword(head) | |||
for possibleWordBreak in mw.ustring.gmatch(head, spacingPunctuation) do | |||
if mw.ustring.find(possibleWordBreak, notWordPunc) then | |||
return true | |||
end | end | ||
end | |||
return false | |||
end | |||
local function preprocess(data, postype) | |||
if type(data.heads) ~= "table" then | |||
data.heads = { data.heads } | |||
end | end | ||
if not data.heads or #data.heads == 0 then | |||
data.heads = {""} | |||
if | |||
end | end | ||
local default_head = mw.title.getCurrentTitle().text | |||
local unmodified_default_head = default_head | |||
-- Add links to multi-word page names when appropriate | |||
if export.head_is_multiword(default_head) then | |||
) | default_head = export.add_multiword_links(default_head) | ||
end | end | ||
-- If a head is the empty string "", then replace it with the default | |||
--[ | for i, head in ipairs(data.heads) do | ||
if head == "" then | |||
head = default_head | |||
end | |||
data.heads[i] = head | |||
end | |||
end | |||
-- Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil | |||
-- if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.). | |||
-- If you have a POS in its singular form, call pluralize() in [[Module:string utilities]] to | |||
-- pluralize it in a smart fashion that knows when to add '-s' and when to add '-es'. | |||
-- | |||
-- If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess | |||
-- based on whether it ends in " forms"; otherwise, return nil. | |||
function pos_lemma_or_nonlemma(plpos, best_guess) | |||
-- Is it a lemma category? | |||
if isLemma[plpos] or isLemma[plpos:gsub("^reconstructed ", "")] then | |||
return "lemma" | |||
-- Is it a nonlemma category? | |||
elseif isNonLemma[plpos] then | |||
return "non-lemma form" | |||
elseif best_guess then | |||
return plpos:find(" forms$") and "non-lemma form" or "lemma" | |||
else | |||
return nil | |||
end | |||
end | |||
local function show_headword_line(data) | |||
local namespace = mw.title.getCurrentTitle().nsText | |||
if not data.noposcat then | |||
local pos_category = "[sS]iwa " .. data.pos_category | |||
end | end | ||
if data. | -- Is it a lemma category? | ||
local postype = pos_lemma_or_nonlemma(data.pos_category) | |||
if not data.noposcat then | |||
table.insert(data.categories, 1, "[sS]iwa " .. postype .. "s") | |||
end | end | ||
-- Preprocess | |||
preprocess(data, postype) | |||
-- Format and return all the gathered information | |||
return | |||
format_headword(data) .. | |||
format_genders(data) .. | |||
format_inflections(data) .. | |||
require("Module:utilities").format_categories( | |||
tracking_categories, data.lang, data.sort_key, nil, | |||
data.force_cat_output or test_force_categories, data.sc | |||
) | |||
end | |||
function full_headword(data) | |||
local tracking_categories = {} | |||
-- Were any categories specified? | -- Were any categories specified? | ||
if data.categories and #data.categories > 0 then | if data.categories and #data.categories > 0 then | ||
if not data.pos_category | if not data.pos_category | ||
and mw.ustring.find(data.categories[1], "^" | and mw.ustring.find(data.categories[1], "^[sS]iwa") | ||
then | then | ||
data.pos_category = mw.ustring.gsub(data.categories[1], "^ | data.pos_category = mw.ustring.gsub(data.categories[1], "^[sS]iwa ", "") | ||
table.remove(data.categories, 1) | table.remove(data.categories, 1) | ||
end | end | ||
Line 225: | Line 412: | ||
end | end | ||
-- This may add more categories (e.g. gender categories), so make sure it gets | -- This may add more categories (e.g. gender categories), so make sure it gets | ||
-- evaluated first. | -- evaluated first. | ||
Line 257: | Line 418: | ||
text .. | text .. | ||
require("Module:utilities").format_categories( | require("Module:utilities").format_categories( | ||
data.categories | data.categories, nil, | ||
data.force_cat_output | data.force_cat_output | ||
) .. | ) .. | ||
require("Module:utilities").format_categories( | require("Module:utilities").format_categories( | ||
tracking_categories | tracking_categories, nil, | ||
data.force_cat_output | data.force_cat_output | ||
) | ) | ||
end | end | ||
return export | return export |
Revision as of 15:00, 29 January 2021
- The following documentation is located at Module:siwa-headword/doc.[edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
-- Table of functions exported by this module; returned at the end of the file.
local export = {}
-- Lua pattern matching a run of whitespace and/or punctuation characters.
-- Used to find candidate word breaks in a headword.
local spacingPunctuation = "[%s%p]+"
--[[ List of punctuation or spacing characters that are found inside of words.
Used to exclude characters from the Lua pattern above (these are Lua
patterns, not regular expressions). ]]
local wordPunc = "-־׳״'.·*’་"
-- Lua pattern matching a run of characters that are NOT in wordPunc.
local notWordPunc = "[^" .. wordPunc .. "]+"
-- Plural part-of-speech names that count as lemmas.
-- This is a plain list (sequence): the names are stored as values under
-- integer keys, not as string keys.
local isLemma = {
"abbreviations",
"acronyms",
"adjectives",
"adnominals",
"adpositions",
"adverbs",
"affixes",
"ambipositions",
"articles",
"circumfixes",
"circumpositions",
"classifiers",
"cmavo",
"cmavo clusters",
"cmene",
"combining forms",
"conjunctions",
"counters",
"determiners",
"diacritical marks",
"equative adjectives",
"fu'ivla",
"gismu",
"Han characters",
"Han tu",
"hanzi",
"hanja",
"ideophones",
"idioms",
"infixes",
"interfixes",
"initialisms",
"interjections",
"kanji",
"letters",
"ligatures",
"lujvo",
"morphemes",
"non-constituents",
"nouns",
"numbers",
"numeral symbols",
"numerals",
"particles",
"phrases",
"postpositions",
"postpositional phrases",
"predicatives",
"prefixes",
"prepositions",
"prepositional phrases",
"preverbs",
"pronominal adverbs",
"pronouns",
"proverbs",
"proper nouns",
"punctuation marks",
"relatives",
"roots",
"stems",
"suffixes",
"syllables",
"symbols",
"verbs",
}
-- Plural part-of-speech names that count as non-lemma forms.
-- This is a plain list (sequence): the names are stored as values under
-- integer keys, not as string keys.
local isNonLemma = {
"active participles",
"adjectival participles",
"adjective forms",
"adjective feminine forms",
"adjective plural forms",
"adverb forms",
"adverbial participles",
"agent participles",
"article forms",
"circumfix forms",
"combined forms",
"comparative adjective forms",
"comparative adjectives",
"comparative adverb forms",
"comparative adverbs",
"contractions",
"converbs",
"determiner comparative forms",
"determiner forms",
"determiner superlative forms",
"diminutive nouns",
"equative adjective forms",
"equative adjectives",
"future participles",
"gerunds",
"infinitive forms",
"infinitives",
"interjection forms",
"jyutping",
"kanji readings",
"misspellings",
"negative participles",
"nominal participles",
"noun case forms",
"noun dual forms",
"noun forms",
"noun plural forms",
"noun possessive forms",
"noun singulative forms",
"numeral forms",
"participles",
"participle forms",
"particle forms",
"passive participles",
"past active participles",
"past participles",
"past participle forms",
"past passive participles",
"perfect active participles",
"perfect participles",
"perfect passive participles",
"pinyin",
"plurals",
"postposition forms",
"prefix forms",
"preposition contractions",
"preposition forms",
"prepositional pronouns",
"present active participles",
"present participles",
"present passive participles",
"pronoun forms",
"pronoun possessive forms",
"proper noun forms",
"proper noun plural forms",
"rafsi",
"romanizations",
"root forms",
"singulatives",
"suffix forms",
"superlative adjective forms",
"superlative adjectives",
"superlative adverb forms",
"superlative adverbs",
"verb forms",
"verbal nouns",
}
-- Template entry point (the only function meant to be reached via #invoke).
--
-- Reads the part of speech from the first argument of the module invocation
-- and the remaining parameters ("head", "type", positional and named
-- inflection parameters) from the parent frame, i.e. the calling template.
-- Builds the headword data table, lets the part-of-speech handler fill in
-- genders or inflections, and returns the output of full_headword().
function export.show(frame)
	local args = frame:getParent().args
	-- Note: PAGENAME is assigned as a global, exactly as before.
	PAGENAME = mw.title.getCurrentTitle().subpageText

	local head = args["head"]
	if head == "" then
		head = nil
	end

	-- The part of speech. This is also the name of the category that entries
	-- go in; the two are kept separate because sometimes a term should behave
	-- like (say) an adjective without being categorised as one.
	local poscat = frame.args[1] or error("Part of speech has not been specified. Please pass parameter 1 to the module invocation.")

	local postype = args["type"]
	if postype == "" then
		postype = nil
	end

	local type_prefix = ""
	if postype then
		type_prefix = postype .. " "
	end

	local data = {
		pos_category = type_prefix .. poscat,
		categories = {},
		heads = {head},
		genders = {},
		inflections = {},
	}

	-- Parts of speech whose page names, when they start with a hyphen, are
	-- recategorised as suffixes of the corresponding kind.
	local suffix_categories = {
		["adjectives"] = "Siwa adjective-forming suffixes",
		["adverbs"] = "Siwa adverb-forming suffixes",
		["nouns"] = "Siwa noun-forming suffixes",
		["verbs"] = "Siwa verb-forming suffixes",
	}
	local suffix_category = suffix_categories[poscat]
	if suffix_category and PAGENAME:find("^-") then
		data.pos_category = "suffixes"
		data.categories = {suffix_category}
	end

	-- Part-of-speech specific handling of the remaining parameters.
	if poscat == "adjectives" or poscat == "determiners" then
		adjective(args, data)
	elseif poscat == "adverbs" then
		adverb(args, data)
	elseif poscat == "nouns" or poscat == "proper nouns" then
		noun_gender(args, data)
	end

	return full_headword(data)
end
-- Collect the gender codes of a noun into data.genders.
-- Kept as its own function so that proper nouns can share it.
--
-- The first gender comes from positional parameter 1, further ones from
-- g2, g3, ... up to (not including) the first missing or empty one. Any
-- value that is not one of "in", "an", "in-p", "an-p" (including a missing
-- first gender) is recorded as "?".
function noun_gender(args, data)
	local recognised = {
		["in"] = true,
		["an"] = true,
		["in-p"] = true,
		["an-p"] = true,
	}

	local gender = args[1] or ""
	if gender == "" then
		gender = "?"
	end

	-- `gender` is never empty on entry, so at least one gender is recorded.
	local next_index = 2
	repeat
		if recognised[gender] then
			table.insert(data.genders, gender)
		else
			table.insert(data.genders, "?")
		end
		gender = args["g" .. next_index] or ""
		next_index = next_index + 1
	until gender == ""
end
-- Append the inflections of an adjective to data.inflections.
--
-- Reads the "adv" parameter (labelled "adverb"), positional parameter 1
-- ("comparative") and positional parameter 2 ("superlative"), in that order.
-- Missing or empty parameters are skipped. Each entry has the shape
-- {label = <label>, <form>}.
function adjective(args, data)
	local slots = {
		{"adverb", args["adv"]},
		{"comparative", args[1]},
		{"superlative", args[2]},
	}
	for _, slot in ipairs(slots) do
		local label, form = slot[1], slot[2]
		if form and form ~= "" then
			table.insert(data.inflections, {label = label, form})
		end
	end
end
-- Append the inflections of an adverb to data.inflections.
--
-- Reads the "adj" parameter (labelled "adjective"), positional parameter 1
-- ("comparative") and positional parameter 2 ("superlative"), in that order.
-- Missing or empty parameters are skipped. Each entry has the shape
-- {label = <label>, <form>}.
function adverb(args, data)
	local function add(label, form)
		if form == "" then
			form = nil
		end
		if form then
			table.insert(data.inflections, {label = label, form})
		end
	end

	add("adjective", args["adj"])
	add("comparative", args[1])
	add("superlative", args[2])
end
-- Format the headword(s) of an entry.
--
-- data.heads is a list of head strings; they are joined with " <i>or</i> "
-- and returned as a single string (the empty string when the list is empty).
--
-- Heads that already contain wikilinks ("[[...]]") are emitted unchanged.
-- Earlier code replaced such heads with a {term = ..., lang = ...} table and
-- then handed the list to table.concat, which raises an error for any element
-- that is not a string or a number.
local function format_headword(data)
	return table.concat(data.heads, " <i>or</i> ")
end
-- Turn a multiword head into a sequence of links, one per word.
--
-- The head is wrapped in "[[Contionary:" ... "]]". Inside every run of
-- spacing/punctuation, each stretch of characters that are not word-internal
-- punctuation is rewritten to close the current link, keep the separator
-- text, and open the next link. Any empty "[[]]" left behind is removed.
function export.add_multiword_links(head)
	-- Callback for one spacing/punctuation run. Patterns cannot express
	-- "punctuation except wordPunc" directly, hence the nested substitution.
	local function relink_separator(separator)
		return mw.ustring.gsub(separator, notWordPunc, "]]%1[[Contionary:")
	end

	local linked = mw.ustring.gsub(head, spacingPunctuation, relink_separator)
	linked = "[[Contionary:" .. linked .. "]]"

	local cleaned = mw.ustring.gsub(linked, "%[%[%]%]", "")
	return cleaned
end
-- Report whether a head counts as multiword under the algorithm used in
-- full_headword(): true as soon as some run of spacing/punctuation contains
-- at least one character that is not word-internal punctuation, false
-- otherwise.
function export.head_is_multiword(head)
	local find = mw.ustring.find
	for separator in mw.ustring.gmatch(head, spacingPunctuation) do
		local is_word_break = find(separator, notWordPunc)
		if is_word_break then
			return true
		end
	end
	return false
end
-- Normalise data.heads in place.
--
-- A non-table value is wrapped in a list, and an empty list becomes {""}.
-- Every head equal to "" is then replaced by the default head: the current
-- page title, with per-word links added when the title is multiword.
-- `postype` is accepted but not used.
local function preprocess(data, postype)
	local heads = data.heads
	if type(heads) ~= "table" then
		heads = { heads }
	end
	if #heads == 0 then
		heads = {""}
	end
	data.heads = heads

	-- Add links to multi-word page names when appropriate
	local page_title = mw.title.getCurrentTitle().text
	local default_head = page_title
	if export.head_is_multiword(page_title) then
		default_head = export.add_multiword_links(page_title)
	end

	-- An empty head means "use the default"
	for index, head in ipairs(heads) do
		if head == "" then
			heads[index] = default_head
		end
	end
end
-- Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil
-- if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
-- If you have a POS in its singular form, call pluralize() in [[Module:string utilities]] to
-- pluralize it in a smart fashion that knows when to add '-s' and when to add '-es'.
--
-- A leading "reconstructed " is ignored when checking the lemma list. A POS present in
-- both lists is reported as "lemma", because the lemma list is consulted first.
--
-- If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
-- based on whether it ends in " forms"; otherwise, return nil.
function pos_lemma_or_nonlemma(plpos, best_guess)
	-- isLemma and isNonLemma are written as plain lists, so indexing them by
	-- POS name (list[plpos]) always yields nil. Search the values instead; the
	-- keyed lookup is kept as a fast path in case a list is also used as a set.
	local function listed(list, pos)
		if list[pos] then
			return true
		end
		for _, entry in ipairs(list) do
			if entry == pos then
				return true
			end
		end
		return false
	end

	-- gsub returns (string, count); the assignment keeps only the string.
	local unreconstructed = plpos:gsub("^reconstructed ", "")

	-- Is it a lemma category?
	if listed(isLemma, plpos) or listed(isLemma, unreconstructed) then
		return "lemma"
	-- Is it a nonlemma category?
	elseif listed(isNonLemma, plpos) then
		return "non-lemma form"
	elseif best_guess then
		return plpos:find(" forms$") and "non-lemma form" or "lemma"
	else
		return nil
	end
end
-- Build the visible headword line and register the part-of-speech categories.
--
-- Unless data.noposcat is set, "Siwa <pos_category>" is inserted at the front
-- of data.categories, followed (in front of it) by "Siwa lemmas" or
-- "Siwa non-lemma forms" when the part of speech is recognised. The heads are
-- then normalised with preprocess() and the formatted headword, genders and
-- inflections are concatenated and returned.
--
-- Changes from the earlier code:
--   * the category prefix is the literal "Siwa ", not the Lua pattern text
--     "[sS]iwa ", which is not a usable category name;
--   * the part-of-speech category is actually inserted (it used to be built
--     into a local and discarded);
--   * an unrecognised part of speech no longer raises "attempt to concatenate
--     a nil value"; the lemma/non-lemma category is simply omitted;
--   * the unused lookup of the current namespace is gone.
local function show_headword_line(data)
	if not data.noposcat then
		table.insert(data.categories, 1, "Siwa " .. data.pos_category)
	end

	-- Is it a lemma category? (nil when the part of speech is unknown)
	local postype = pos_lemma_or_nonlemma(data.pos_category)
	if postype and not data.noposcat then
		table.insert(data.categories, 1, "Siwa " .. postype .. "s")
	end

	-- Preprocess
	preprocess(data, postype)

	-- Format and return all the gathered information
	-- NOTE(review): format_genders and format_inflections are not defined
	-- anywhere in this module; confirm where they are supposed to come from.
	local line =
		format_headword(data) ..
		format_genders(data) ..
		format_inflections(data)

	-- NOTE(review): no variable named tracking_categories is visible at this
	-- scope in this file (full_headword keeps its own local list and formats
	-- it itself), so this lookup resolves to a global. Only format it when it
	-- actually exists, rather than handing nil to format_categories.
	if tracking_categories then
		line = line .. require("Module:utilities").format_categories(
			tracking_categories, data.lang, data.sort_key, nil,
			data.force_cat_output or test_force_categories, data.sc
		)
	end

	return line
end
-- Produce the complete headword output for `data`: the headword line
-- followed by the formatted categories and tracking categories.
--
-- If data.pos_category is not set and the first entry of data.categories
-- starts with "Siwa"/"siwa", that entry (minus the "Siwa " prefix) becomes
-- the part-of-speech category and is removed from the list. Raises an error
-- when no part-of-speech category can be determined.
function full_headword(data)
	local tracking_categories = {}

	-- Were any categories specified?
	local categories = data.categories
	if categories and #categories > 0
		and not data.pos_category
		and mw.ustring.find(categories[1], "^[sS]iwa")
	then
		-- gsub returns (string, count); keep only the string.
		local stripped = mw.ustring.gsub(categories[1], "^[sS]iwa ", "")
		data.pos_category = stripped
		table.remove(categories, 1)
	end

	if not data.pos_category then
		error(
			'No valid part-of-speech categories were found in the list '
			.. 'of categories passed to the function "full_headword". '
			.. 'The part-of-speech category should consist of a language\'s '
			.. 'canonical name plus a part of speech.'
		)
	end

	-- This may add more categories (e.g. gender categories), so make sure it gets
	-- evaluated first.
	local text = show_headword_line(data)

	local entry_categories = require("Module:utilities").format_categories(
		data.categories, nil,
		data.force_cat_output
	)
	local tracking = require("Module:utilities").format_categories(
		tracking_categories, nil,
		data.force_cat_output
	)
	return text .. entry_categories .. tracking
end
return export