-- Module:siwa-headword
--
-- From Linguifex
-- Revision as of 15:04, 29 January 2021 by Sware (talk | contribs)
-- Jump to navigation Jump to search


local export = {}

-- Lua pattern matching a run of one or more whitespace or punctuation
-- characters. Each match is treated as a candidate word break in a headword.
local spacingPunctuation = "[%s%p]+"
--[[ List of punctuation or spacing characters that are found inside of words.
Used to exclude these characters from the pattern above: a candidate break
only counts as a real word break if it contains a character not listed here. ]]
local wordPunc = "-־׳״'.·*’་"
-- Lua pattern matching a run of one or more characters that are NOT in wordPunc.
local notWordPunc = "[^" .. wordPunc .. "]+"

-- Plural part-of-speech names that count as lemmas.
-- The names are written as a list for readability; the loop below additionally
-- registers every name as a key (isLemma["nouns"] == true) because the table
-- is queried by name, and a plain list would make every such lookup nil.
local isLemma = {
	"abbreviations",
	"acronyms",
	"adjectives",
	"adnominals",
	"adpositions",
	"adverbs",
	"affixes",
	"ambipositions",
	"articles",
	"circumfixes",
	"circumpositions",
	"classifiers",
	"cmavo",
	"cmavo clusters",
	"cmene",
	"combining forms",
	"conjunctions",
	"counters",
	"determiners",
	"diacritical marks",
	"equative adjectives",
	"fu'ivla",
	"gismu",
	"Han characters",
	"Han tu",
	"hanzi",
	"hanja",
	"ideophones",
	"idioms",
	"infixes",
	"interfixes",
	"initialisms",
	"interjections",
	"kanji",
	"letters",
	"ligatures",
	"lujvo",
	"morphemes",
	"non-constituents",
	"nouns",
	"numbers",
	"numeral symbols",
	"numerals",
	"particles",
	"phrases",
	"postpositions",
	"postpositional phrases",
	"predicatives",
	"prefixes",
	"prepositions",
	"prepositional phrases",
	"preverbs",
	"pronominal adverbs",
	"pronouns",
	"proverbs",
	"proper nouns",
	"punctuation marks",
	"relatives",
	"roots",
	"stems",
	"suffixes",
	"syllables",
	"symbols",
	"verbs",
}

-- Turn the list into a set as well, so that isLemma[name] is truthy for
-- every listed name. ipairs only walks the integer keys, so adding string
-- keys during the loop does not disturb the traversal.
for _, name in ipairs(isLemma) do
	isLemma[name] = true
end

-- Plural part-of-speech names that count as non-lemma forms.
-- The names are written as a list for readability; the loop below additionally
-- registers every name as a key (isNonLemma["noun forms"] == true) because the
-- table is queried by name, and a plain list would make every such lookup nil.
local isNonLemma = {
	"active participles",
	"adjectival participles",
	"adjective forms",
	"adjective feminine forms",
	"adjective plural forms",
	"adverb forms",
	"adverbial participles",
	"agent participles",
	"article forms",
	"circumfix forms",
	"combined forms",
	"comparative adjective forms",
	"comparative adjectives",
	"comparative adverb forms",
	"comparative adverbs",
	"contractions",
	"converbs",
	"determiner comparative forms",
	"determiner forms",
	"determiner superlative forms",
	"diminutive nouns",
	"equative adjective forms",
	"equative adjectives",
	"future participles",
	"gerunds",
	"infinitive forms",
	"infinitives",
	"interjection forms",
	"jyutping",
	"kanji readings",
	"misspellings",
	"negative participles",
	"nominal participles",
	"noun case forms",
	"noun dual forms",
	"noun forms",
	"noun plural forms",
	"noun possessive forms",
	"noun singulative forms",
	"numeral forms",
	"participles",
	"participle forms",
	"particle forms",
	"passive participles",
	"past active participles",
	"past participles",
	"past participle forms",
	"past passive participles",
	"perfect active participles",
	"perfect participles",
	"perfect passive participles",
	"pinyin",
	"plurals",
	"postposition forms",
	"prefix forms",
	"preposition contractions",
	"preposition forms",
	"prepositional pronouns",
	"present active participles",
	"present participles",
	"present passive participles",
	"pronoun forms",
	"pronoun possessive forms",
	"proper noun forms",
	"proper noun plural forms",
	"rafsi",
	"romanizations",
	"root forms",
	"singulatives",
	"suffix forms",
	"superlative adjective forms",
	"superlative adjectives",
	"superlative adverb forms",
	"superlative adverbs",
	"verb forms",
	"verbal nouns",
}

-- Turn the list into a set as well, so that isNonLemma[name] is truthy for
-- every listed name. ipairs only walks the integer keys, so adding string
-- keys during the loop does not disturb the traversal.
for _, name in ipairs(isNonLemma) do
	isNonLemma[name] = true
end


-- The main entry point.
-- This is the only function that can be invoked from a template.
--
-- Reads the part of speech from the first argument of the module invocation
-- (frame.args[1]) and the template's own arguments from the parent frame,
-- builds the headword data table and returns the result of full_headword().
-- Raises an error when no part of speech was passed to the invocation.
function export.show(frame)
	local args = frame:getParent().args
	-- Kept local on purpose: nothing else in this module reads the page name,
	-- so there is no reason to write it into the global environment.
	local pagename = mw.title.getCurrentTitle().subpageText

	local head = args["head"]
	if head == "" then
		head = nil
	end

	-- The part of speech. This is also the name of the category that
	-- entries go in. However, the two are separate (the "cat" parameter)
	-- because you sometimes want something to behave as an adjective without
	-- putting it in the adjectives category.
	local poscat = frame.args[1] or error("Part of speech has not been specified. Please pass parameter 1 to the module invocation.")

	local data = {pos_category = poscat, categories = {}, heads = {head}, genders = {}, inflections = {}}

	-- Parts of speech whose entries are recategorised as suffixes when the
	-- page name starts with a hyphen, mapped to the category they then get.
	local suffix_categories = {
		["adjectives"] = "Siwa adjective-forming suffixes",
		["adverbs"] = "Siwa adverb-forming suffixes",
		["nouns"] = "Siwa noun-forming suffixes",
		["verbs"] = "Siwa verb-forming suffixes",
	}

	local suffix_category = suffix_categories[poscat]
	if suffix_category and pagename:sub(1, 1) == "-" then
		data.pos_category = "suffixes"
		data.categories = {suffix_category}
	end

	-- Part-of-speech specific processing of the template arguments.
	-- Determiners take the same arguments as adjectives, and proper nouns
	-- the same gender arguments as nouns.
	if poscat == "adjectives" or poscat == "determiners" then
		adjective(args, data)
	elseif poscat == "adverbs" then
		adverb(args, data)
	elseif poscat == "nouns" or poscat == "proper nouns" then
		noun_gender(args, data)
	end

	return full_headword(data)
end

-- Collect a noun's genders into data.genders.
-- Kept as its own function so that proper nouns can share it.
--
-- The first gender comes from positional argument 1; further ones come from
-- g2, g3, ... and collection stops at the first missing or empty gN.
-- A missing first gender, or any value that is not one of the recognised
-- codes, is recorded as "?".
function noun_gender(args, data)
	local recognised = {
		["in"] = true,
		["an"] = true,
		["in-p"] = true,
		["an-p"] = true,
	}

	local gender = args[1] or ""
	if gender == "" then
		gender = "?"
	end

	-- The first value is never empty at this point, so at least one gender
	-- is always recorded.
	local n = 2
	repeat
		data.genders[#data.genders + 1] = recognised[gender] and gender or "?"
		gender = args["g" .. n] or ""
		n = n + 1
	until gender == ""
end

-- Append an adjective's inflections to data.inflections.
-- Reads the "adv" argument (adverb) and positional arguments 1 (comparative)
-- and 2 (superlative); missing or empty values are skipped. Each entry added
-- has the form {label = <label>, <form>}, in the order adverb, comparative,
-- superlative.
function adjective(args, data)
	local candidates = {
		{"adverb", args["adv"]},
		{"comparative", args[1]},
		{"superlative", args[2]},
	}

	for _, candidate in ipairs(candidates) do
		local label, form = candidate[1], candidate[2]
		if form and form ~= "" then
			table.insert(data.inflections, {label = label, form})
		end
	end
end

-- Append an adverb's inflections to data.inflections.
-- Reads the "adj" argument (adjective) and positional arguments 1
-- (comparative) and 2 (superlative); missing or empty values are skipped.
-- Each entry added has the form {label = <label>, <form>}, in the order
-- adjective, comparative, superlative.
function adverb(args, data)
	local candidates = {
		{"adjective", args["adj"]},
		{"comparative", args[1]},
		{"superlative", args[2]},
	}

	for _, candidate in ipairs(candidates) do
		local label, form = candidate[1], candidate[2]
		if form and form ~= "" then
			table.insert(data.inflections, {label = label, form})
		end
	end
end




-- Format the headword(s) for display.
-- Joins every entry of data.heads with an italic "or" between alternatives
-- and returns the resulting wikitext string.
--
-- Heads are kept as plain strings, including those that already contain
-- wikilinks ("[[...]]"). Previously such heads were replaced by a
-- {term = ..., lang = ...} table, which table.concat cannot join and which
-- therefore raised an error for every linked head.
local function format_headword(data)
	return table.concat(data.heads, " <i>or</i> ")
end

-- Add links to a multiword head.
-- Every word of `head` is wrapped in a link into the Contionary namespace;
-- runs of spacing/punctuation that separate words are left outside the links,
-- while characters listed in wordPunc stay inside them.
-- Returns the linked head as a single string.
function export.add_multiword_links(head)
	-- Called for each run of spacing/punctuation: closes the current link
	-- before, and opens a new one after, every stretch of that run that is
	-- not word-internal punctuation.
	local function workaround_to_exclude_chars(s)
		return mw.ustring.gsub(s, notWordPunc, "]]%1[[Contionary:")
	end

	head = "[[Contionary:"
		.. mw.ustring.gsub(
			head,
			spacingPunctuation,
			workaround_to_exclude_chars
			)
		.. "]]"
	-- Drop the empty links that appear when the head starts or ends with a
	-- word break, or has two breaks in a row. Such links carry the namespace
	-- prefix, so the pattern has to include it; matching a bare "[[]]" never
	-- removed them.
	head = mw.ustring.gsub(head, "%[%[Contionary:%]%]", "")
	return head
end

-- Tell whether `head` consists of more than one word.
-- A head is multiword when at least one run of spacing/punctuation in it
-- contains a character that is not word-internal punctuation (wordPunc).
-- Returns true or false.
function export.head_is_multiword(head)
	local gmatch, find = mw.ustring.gmatch, mw.ustring.find

	for candidate_break in gmatch(head, spacingPunctuation) do
		if find(candidate_break, notWordPunc) ~= nil then
			return true
		end
	end

	return false
end

-- Normalise data.heads in place.
-- A non-table value is wrapped in a table, an empty list becomes {""}, and
-- every empty-string head is then replaced by the default head: the current
-- page title, with links added when the title is multiword.
-- `postype` is accepted but not used.
local function preprocess(data, postype)
	local heads = data.heads
	if type(heads) ~= "table" then
		heads = { heads }
	end
	if #heads == 0 then
		heads = {""}
	end
	data.heads = heads

	-- Add links to multi-word page names when appropriate
	local default_head = mw.title.getCurrentTitle().text
	if export.head_is_multiword(default_head) then
		default_head = export.add_multiword_links(default_head)
	end

	-- An empty string stands for "use the default head"
	for i, head in ipairs(heads) do
		if head == "" then
			heads[i] = default_head
		end
	end
end

-- Classify a part of speech as "lemma" or "non-lemma form".
-- `plpos` must be the plural name of the part of speech ("nouns",
-- "prefixes", etc.); a singular name has to be pluralized by the caller first.
--
-- The name is looked up as a key in isLemma (also with a leading
-- "reconstructed " stripped) and then in isNonLemma. When it is in neither:
-- with `best_guess` set, names ending in " forms" are taken to be non-lemma
-- forms and everything else lemmas; without it, nil is returned.
function pos_lemma_or_nonlemma(plpos, best_guess)
	local without_prefix = plpos:gsub("^reconstructed ", "")

	if isLemma[plpos] or isLemma[without_prefix] then
		return "lemma"
	end

	if isNonLemma[plpos] then
		return "non-lemma form"
	end

	if not best_guess then
		return nil
	end

	if plpos:find(" forms$") then
		return "non-lemma form"
	end
	return "lemma"
end

-- Build the headword line for `data`.
-- Adds the part-of-speech category and, when the part of speech is
-- recognised, the matching "lemmas" / "non-lemma forms" category to the front
-- of data.categories (both skipped when data.noposcat is set), normalises the
-- heads, and returns the formatted headword followed by the gender and
-- inflection output and the formatted tracking categories.
local function show_headword_line(data)
	-- Tracking categories are local to this function. Nothing in this module
	-- adds any yet, but the list must exist: an undefined name here would
	-- hand nil to format_categories.
	local tracking_categories = {}

	-- Callers may leave categories unset; table.insert needs a table.
	data.categories = data.categories or {}

	-- Is it a lemma category? nil when the part of speech is in neither list.
	local postype = pos_lemma_or_nonlemma(data.pos_category)

	if not data.noposcat then
		-- Category names use the literal language name. "[sS]iwa" is only
		-- meaningful as a pattern and must not end up in a category name.
		table.insert(data.categories, 1, "Siwa " .. data.pos_category)

		-- Only add the lemma / non-lemma category when the part of speech is
		-- known; concatenating a nil postype would raise an error.
		if postype then
			table.insert(data.categories, 1, "Siwa " .. postype .. "s")
		end
	end

	-- Preprocess
	preprocess(data, postype)

	-- NOTE(review): format_genders and format_inflections are not defined
	-- anywhere in this module as shown, so calling them unconditionally fails
	-- on every page. Until they are provided, a missing formatter contributes
	-- nothing instead of aborting the whole headword line.
	local function optional_part(formatter)
		if type(formatter) == "function" then
			return formatter(data)
		end
		return ""
	end

	-- Format and return all the gathered information
	return
		format_headword(data) ..
		optional_part(format_genders) ..
		optional_part(format_inflections) ..
		require("Module:utilities").format_categories(
			tracking_categories, data.lang, data.sort_key, nil,
			data.force_cat_output or test_force_categories, data.sc
			)
end

-- Produce the complete headword output for `data`: the headword line
-- followed by the formatted categories and tracking categories.
--
-- When data.pos_category is not set and the first entry of data.categories
-- starts with the language name, that entry is removed from the list and used
-- (minus the language name) as the part-of-speech category. Raises an error
-- when no part-of-speech category can be determined.
function full_headword(data)
	local tracking_categories = {}
	local categories = data.categories

	-- Fall back to the first category for the part of speech, if needed.
	if categories and #categories > 0
		and not data.pos_category
		and mw.ustring.find(categories[1], "^[sS]iwa")
	then
		data.pos_category = mw.ustring.gsub(categories[1], "^[sS]iwa ", "")
		table.remove(categories, 1)
	end

	if not data.pos_category then
		local message = 'No valid part-of-speech categories were found in the list '
			.. 'of categories passed to the function "full_headword". '
			.. 'The part-of-speech category should consist of a language\'s '
			.. 'canonical name plus a part of speech.'
		error(message)
	end

	-- The headword line may add more categories (e.g. gender categories), so
	-- it has to be built before the categories are formatted.
	local text = show_headword_line(data)

	local formatted_categories = require("Module:utilities").format_categories(
		data.categories, nil,
		data.force_cat_output
		)
	local formatted_tracking = require("Module:utilities").format_categories(
		tracking_categories, nil,
		data.force_cat_output
		)

	return text .. formatted_categories .. formatted_tracking
end

return export