Module:form of: Difference between revisions

Latest revision as of 18:12, 12 August 2024

The following documentation is located at Module:form of/doc.^[edit]
Useful links: subpage list • links • transclusions • testcases • sandbox
local export = {}

export.force_cat = false -- for testing; set to true to display categories even on non-mainspace pages

local m_links = require("Module:links")
local m_string_utils = require("Module:string utilities")
local m_table = require("Module:table")
local parse_utilities_module = "Module:parse utilities"
local labels_module = "Module:labels"
local utilities_module = "Module:utilities"
export.form_of_pos_module = "Module:form of/pos"
export.form_of_functions_module = "Module:form of/functions"
export.form_of_cats_module = "Module:form of/cats"
export.form_of_lang_data_module_prefix = "Module:form of/lang-data/"
export.form_of_data_module = "Module:form of/data"
export.form_of_data2_module = "Module:form of/data2"

local ulen = m_string_utils.len
local rsubn = m_string_utils.gsub
local rmatch = m_string_utils.match
local rsplit = m_string_utils.split

export.TAG_TYPE = 1
export.GLOSSARY = 2
export.SHORTCUTS = 3
export.WIKIDATA = 4

export.APPENDIX = true
export.WP = false
export.WIKT = 0

--[==[
Set listing the languages with lang-specific tags. If a language isn't listed here, the tags for that language won't be
recognized.
]==]
export.langs_with_lang_specific_tags = {
	["en"] = true,
	["got"] = true,
	["lt"] = true,
	["lv"] = true,
	["nl"] = true,
	["pi"] = true,
	["sw"] = true,
	["ttj"] = true,
}

--[==[ intro:

This module implements the underlying processing of {{tl|form of}}, {{tl|inflection of}} and specific variants such as
{{tl|past participle of}} and {{tl|alternative spelling of}}. Most of the logic in this file is to handle tags in
{{tl|inflection of}}. Other related files:

* [[Module:form of/templates]] contains the majority of the logic that implements the templates themselves.
* [[Module:form of/data]] is a data-only file containing information on the more common inflection tags, listing the
  tags, their shortcuts, the category they belong to (tense-aspect, case, gender, voice-valence, etc.), the appropriate
  glossary link and the wikidata ID.
* [[Module:form of/data2]] is a data-only file containing information on the less common inflection tags, in the same
  format as [[Module:form of/data]].
* [[Module:form of/lang-data/LANGCODE]] is a data-only file containing information on the language-specific inflection
  tags for the language with code LANGCODE, in the same format as [[Module:form of/data]]. Language-specific tags
  override general tags.
* [[Module:form of/cats]] is a data-only file listing the language-specific categories that are added when the
  appropriate combinations of tags are seen for a given language.
* [[Module:form of/pos]] is a data-only file listing the recognized parts of speech and their abbreviations, used for
  categorization. FIXME: This should be unified with the parts of speech listed in [[Module:links]].
* [[Module:form of/functions]] contains functions for use with [[Module:form of/data]] and [[Module:form of/cats]].
  They are contained in this module because data-only modules can't contain code. The functions in this file are of two
  types:
*# Display handlers allow for customization of the display of multipart tags (see below). Currently there is only
   one such handler, for handling multipart person tags such as `1//2//3`.
*# Cat functions allow for more complex categorization logic, and are referred to by name in [[Module:form of/cats]].
   Currently no such functions exist.

The following terminology is used in conjunction with {{tl|inflection of}}:

* A ''tag'' is a single grammatical item, as specified in a single numbered parameter of {{tl|inflection of}}. Examples
  are `masculine`, `nominative`, or `first-person`. Tags may be abbreviated, e.g. `m` for `masculine`, `nom` for
  `nominative`, or `1` for `first-person`. Such abbreviations are called ''aliases'', and some tags have multiple
  equivalent aliases (e.g. `p` or `pl` for `plural`). The full, non-abbreviated form of a tag is called its
  ''canonical form''.
* The ''display form'' of a tag is the way it's displayed to the user. Usually the displayed text of the tag is the same
  as its canonical form, and it normally functions as a link to a glossary entry explaining the tag. Usually the link is
  to an entry in [[Appendix:Glossary]], but sometimes the tag is linked to an individual dictionary entry or to a
  Wikipedia entry. Occasionally, the display text differs from the canonical form of the tag. An example is the tag
  `comparative case`, which has the display text read as simply `comparative`. Normally, tags referring to cases don't
  have the word "case" in them, but in this case the tag `comparative` was already used as an alias for the tag
  `comparative degree`, so the tag was named `comparative case` to avoid clashing. A similar situation occurs with
  `adverbial case` vs. the grammar tag `adverbial` (as in `adverbial participle`).
* A ''tag set'' is an ordered list of tags, which together express a single inflection, for example, `1|s|pres|ind`,
  which can be expanded to canonical-form tags as `first-person|singular|present|indicative`.
* A ''conjoined tag set'' is a tag set that consists of multiple individual tag sets separated by a semicolon, e.g.
  `1|s|pres|ind|;|2|s|imp`, which specifies two tag sets, `1|s|pres|ind` as above and `2|s|imp` (in canonical form,
  `second-person|singular|imperative`). Multiple tag sets specified in a single call to {{tl|inflection of}} are
  specified in this fashion. Conjoined tag sets can also occur in list-tag shortcuts.
* A ''multipart tag'' is a tag that embeds multiple tags within it, such as `f//n` or `nom//acc//voc`. These are used in
  the case of [[syncretism]], when the same form applies to multiple inflections. Examples are the Spanish present
  subjunctive, where the first-person and third-person singular have the same form (e.g. {{m|es|siga}} from
  {{m|es|seguir|t=to follow}}), or Latin third-declension adjectives, where the dative and ablative plural of all
  genders have the same form (e.g. {{m|la|omnibus}} from {{m|la|omnis|t=all}}). These would be expressed respectively as
  `1//3|s|pres|sub` and `dat//abl|m//f//n|p`, where the use of the multipart tag compactly encodes the syncretism and
  avoids the need to individually list out all of the inflections. Multipart tags currently display as a list separated
  by a slash, e.g.  ''dative/ablative'' or ''masculine/feminine/neuter'' where each individual word is linked
  appropriately. As a special case, multipart tags involving persons display specially; for example, the multipart tag
  `1//2//3` displays as ''first-, second- and third-person'', with the word "person" occurring only once.
* A ''two-level multipart tag'' is a special type of multipart tag that joins two or more tag sets instead of joining
  individual tags. The tags within the tag set are joined by a colon, e.g. `1:s//3:p`, which is displayed as
  ''first-person singular and third-person plural'', e.g. for use with the form {{m|grc|μέλλον}} of the verb
  {{m|grc|μέλλω|t=to intend}}, which uses the tag set `1:s//3:p|impf|actv|indc|unaugmented` to express the syncretism
  between the first singular and third plural forms of the imperfect active indicative unaugmented conjugation.
  Two-level multipart tags should be used sparingly; if in doubt, list out the inflections separately. [FIXME: Make
  two-level multipart tags obsolete.]
* A ''shortcut'' is a tag that expands to any type of tag described above, or to any type of tag set described above.
  Aliases are a particular type of shortcut whose expansion is a single non-multipart tag.
* A ''multipart shortcut'' is a shortcut that expands into a multipart tag, for example `123`, which expands to the
  multipart tag `1//2//3`. Only the most common such combinations exist as shortcuts.
* A ''list shortcut'' is a special type of shortcut that expands to a list of tags instead of a single tag. For example,
  the shortcut `1s` expands to `1|s` (first-person singular). Only the most common such combinations exist as shortcuts.
* A ''conjoined shortcut'' is a special type of list shortcut that consists of a conjoined tag set (multiple logical tag
  sets). For example, the English language-specific shortcut `ed-form` expands to `spast|;|past|part`, expressing the
  common syncretism between simple past and past participle in English (and in this case, `spast` is itself a list
  shortcut that expands to `simple|past`).
]==]

-- version of rsubn() that discards all but the first return value
local function rsub(term, foo, bar)
	local retval = rsubn(term, foo, bar)
	return retval
end


local function normalize_index(list, index)
	if index < 0 then
		return #list + index + 1
	end
	return index
end

-- FIXME, consider moving to [[Module:table]]
-- Return true if the list `tags1`, treated as a set, is a subset of the list `tags2`, also treated as a set.
local function is_subset(tags1, tags2)
	tags1 = m_table.listToSet(tags1)
	tags2 = m_table.listToSet(tags2)
	for tag, _ in pairs(tags1) do
		if not tags2[tag] then
			return false
		end
	end
	return true
end


-- FIXME, move to [[Module:table]]
local function slice(list, i, j)
	--checkType("slice", 1, list, "table")
	--checkType("slice", 2, i, "number", true)
	--checkType("slice", 3, j, "number", true)
	if i == nil then
		i = 1
	else
		i = normalize_index(list, i)
	end
	j = normalize_index(list, j or -1)

	local retval = {}
	local k = 0
	for index = i, j do
		k = k + 1
		retval[k] = list[index]
	end
	return retval
end


local function wrap_in_span(text, classes)
	return ("<span class='%s'>%s</span>"):format(classes, text)
end


--[==[
Lowest-level implementation of form-of templates, including the general {{tl|form of}} as well as those that deal with
inflection tags, such as the general {{tl|inflection of}}, semi-specific variants such as {{tl|participle of}}, and
specific variants such as {{tl|past participle of}}. `data` contains all the information controlling the display, with
the following fields:

* `.text`: Text to insert before the lemmas. Wrapped in the value of `.text_classes`, or its default; see below.
* `.lemmas`: List of objects describing the lemma(s) of which the term in question is a non-lemma form. These are passed
   directly to {full_link()} in [[Module:links]]. Each object should have at minimum a `.lang` field containing the
   language of the lemma and a `.term` field containing the lemma itself. Each object is formatted using {full_link()}
   and then if there are more than one, they are joined using {serialCommaJoin()} in [[Module:table]]. Alternatively,
   `.lemmas` can be a string, which is displayed directly, or omitted, to show no lemma links and omit the connecting
   text.
* `.lemma_face`: "Face" to use when displaying the lemma objects. Usually should be set to {"term"}.
* `.enclitics`: List of enclitics to display after the lemmas, in parens.
* `.base_lemmas`: List of base lemmas to display after the lemmas, in the case where the lemmas in `.lemmas` are
   themselves forms of another lemma (the base lemma), e.g. a comparative, superlative or participle. Each object is of
   the form { { paramobj = PARAM_OBJ, lemmas = {LEMMA_OBJ, LEMMA_OBJ, ...} }} where PARAM_OBJ describes the properties
   of the base lemma parameter (i.e. the relationship between the intermediate and base lemmas) and LEMMA_OBJ is an
   object suitable to be passed to {full_link()} in [[Module:links]]. PARAM_OBJ is of the format
   { { param = "PARAM", tags = {"TAG", "TAG", ...} } where PARAM is the name of the parameter to {{tl|inflection of}}
   etc. that holds the base lemma(s) of the specified relationship and the tags describe the relationship, such as
   { {"comd"}} or { {"past", "part"}}.
* `.text_classes`: CSS classes used to wrap the tag text and lemma links. Default is {"form-of-definition use-with-mention"}
   for the tag text and lemma links, and additionally {"form-of-definition-link"} specifically for the lemma links.
   (FIXME: Should separate out the lemma links into their own field.)
* `.posttext`: Additional text to display after the lemma links.
]==]
function export.format_form_of(data)
	if type(data) ~= "table" then
		error("Internal error: First argument must now be a table of arguments")
	end
	local text_classes = data.text_classes or "form-of-definition use-with-mention"
	local lemma_classes = data.text_classes or "form-of-definition-link"
	local parts = {}
	local function ins(text)
		table.insert(parts, text)
	end
	ins("<span class='" .. text_classes .. "'>")
	ins(data.text)
	if data.text ~= "" and data.lemmas then
		ins(" ")
	end
	if data.lemmas then
		if type(data.lemmas) == "string" then
			ins(wrap_in_span(data.lemmas, lemma_classes))
		else
			local formatted_terms = {}
			for _, lemma in ipairs(data.lemmas) do
				table.insert(formatted_terms, wrap_in_span(
					m_links.full_link(lemma, data.lemma_face), lemma_classes
				))
			end
			ins(m_table.serialCommaJoin(formatted_terms))
		end
	end
	if data.enclitics and #data.enclitics > 0 then
		-- The outer parens need to be outside of the text_classes span so they show in upright instead of italic, or
		-- they will clash with upright parens generated by link annotations such as transliterations and pos=.
		ins("</span>")
		local formatted_terms = {}
		for _, enclitic in ipairs(data.enclitics) do
			-- FIXME, should we have separate clitic face and/or classes?
			table.insert(formatted_terms, wrap_in_span(
				m_links.full_link(enclitic, data.lemma_face, nil, "show qualifiers"), lemma_classes
			))
		end
		ins(" (")
		ins(wrap_in_span("with enclitic" .. (#data.enclitics > 1 and "s" or "") .. " ", text_classes))
		ins(m_table.serialCommaJoin(formatted_terms))
		ins(")")
		ins("<span class='" .. text_classes .. "'>")
	end
	if data.base_lemmas and #data.base_lemmas > 0 then
		for _, base_lemma in ipairs(data.base_lemmas) do
			ins(", the </span>")
			ins(export.tagged_inflections {
				lang = base_lemma.lemmas[1].lang,
				tags = base_lemma.paramobj.tags,
				lemmas = base_lemma.lemmas,
				lemma_face = data.lemma_face,
				no_format_categories = true,
				nocat = true,
				text_classes = data.text_classes,
			})
			ins("<span class='" .. text_classes .. "'>")
		end
	end
	-- FIXME, should posttext go before enclitics? If so we need to have separate handling for the
	-- final colon when there are multiple tag sets in tagged_inflections().
	if data.posttext then
		ins(data.posttext)
	end
	ins("</span>")
	return table.concat(parts)
end


--[==[
Return true if `tag` contains an internal link or HTML.
]==]
function export.is_link_or_html(tag)
	return tag:find("[[", nil, true) or tag:find("|", nil, true) or tag:find("<", nil, true)
end


--[==[
Look up a tag (either a shortcut of any sort of a canonical long-form tag) and return its expansion. The expansion
will be a string unless the shortcut is a list-tag shortcut such as `1s`; in that case, the expansion will be a
list. The caller must handle both cases. Only one level of expansion happens; hence, `acc` expands to {"accusative"},
`1s` expands to { {"1", "s"}} (not to { {"first", "singular"}}) and `123` expands to {"1//2//3"}. The expansion will be
the same as the passed-in tag in the following circumstances:

# The tag is `;` (this is special-cased, and no lookup is done).
# The tag is a multipart tag such as `nom//acc` (this is special-cased, and no lookup is done).
# The tag contains a raw link (this is special-cased, and no lookup is done).
# The tag contains HTML (this is special-cased, and no lookup is done).
# The tag is already a canonical long-form tag.
# The tag is unrecognized.

This function first looks up in the lang-specific data module [[Module:form of/lang-data/LANGCODE]], then in
[[Module:form of/data]] (which includes more common non-lang-specific tags) and finally (only if the tag is not
recognized as a shortcut or canonical tag, and is not of types 1-4 above) in [[Module:form of/data2]].

If the expansion is a string and is different from the tag, track it if `do_track` is true.
]==]
function export.lookup_shortcut(tag, lang, do_track)
	-- If there is HTML or a link in the tag, return it directly; don't try
	-- to look it up, which will fail.
	if tag == ";" or tag:find("//", nil, true) or export.is_link_or_html(tag) then
		return tag
	end
	local expansion
	local langcode = lang and lang:getCode()
	if langcode and export.langs_with_lang_specific_tags[langcode] then
		local langdata = mw.loadData(export.form_of_lang_data_module_prefix .. langcode)
		-- If this is a canonical long-form tag, just return it, and don't check for shortcuts. This is an
		-- optimization; see below.
		if langdata.tags[tag] then
			return tag
		end
		expansion = langdata.shortcuts[tag]
	end
	if not expansion and lang then
		-- If the lang we're dealing with is an etym-only lang, try again with the corresponding full language.
		local full_langcode = lang:getFullCode()
		if full_langcode ~= langcode and export.langs_with_lang_specific_tags[full_langcode] then
			local langdata = mw.loadData(export.form_of_lang_data_module_prefix .. full_langcode)
			-- If this is a canonical long-form tag, just return it, and don't check for shortcuts. This is an
			-- optimization; see below.
			if langdata.tags[tag] then
				return tag
			end
			expansion = langdata.shortcuts[tag]
		end
	end
	if not expansion then
		local m_data = mw.loadData(export.form_of_data_module)
		-- If this is a canonical long-form tag, just return it, and don't check for shortcuts (which will cause
		-- [[Module:form of/data2]] to be loaded, because there won't be a shortcut entry in [[Module:form of/data]] --
		-- or, for that matter, in [[Module:form of/data2]]). This is an optimization; the code will still work without
		-- it, but will use up more memory.
		if m_data.tags[tag] then
			return tag
		end
		expansion = m_data.shortcuts[tag]
	end
	if not expansion then
		local m_data2 = mw.loadData(export.form_of_data2_module)
		expansion = m_data2.shortcuts[tag]
	end
	if not expansion then
		return tag
	end
	
	return expansion
end


--[==[
Look up a normalized/canonicalized tag and return the data object associated with it. If the tag isn't found, return
nil. This first looks up in the lang-specific data module [[Module:form of/lang-data/LANGCODE]], then in
[[Module:form of/data]] (which includes more common non-lang-specific tags) and then finally in
[[Module:form of/data2]].
]==]
function export.lookup_tag(tag, lang)
	local langcode = lang and lang:getCode()
	if langcode and export.langs_with_lang_specific_tags[langcode] then
		local langdata = mw.loadData(export.form_of_lang_data_module_prefix .. langcode)
		if langdata.tags[tag] then
			return langdata.tags[tag]
		end
	end
	local full_langcode = lang and lang:getFullCode()
	if full_langcode and full_langcode ~= langcode and export.langs_with_lang_specific_tags[full_langcode] then
		-- If the lang we're dealing with is an etym-only lang, try again with the corresponding full language.
		local langdata = mw.loadData(export.form_of_lang_data_module_prefix .. full_langcode)
		if langdata.tags[tag] then
			return langdata.tags[tag]
		end
	end
	local m_data = mw.loadData(export.form_of_data_module)
	local tagobj = m_data.tags[tag]
	if tagobj then
		return tagobj
	end
	local m_data2 = mw.loadData(export.form_of_data2_module)
	local tagobj2 = m_data2.tags[tag]
	if tagobj2 then
		return tagobj2
	end
	return nil
end


-- Normalize a single tag, which may be a shortcut but should not be a multipart tag, a multipart shortcut or a list
-- shortcut.
local function normalize_single_tag(tag, lang, do_track)
	local expansion = export.lookup_shortcut(tag, lang, do_track)
	if type(expansion) ~= "string" then
		error("Tag '" .. tag .. "' is a list shortcut, which is not allowed here")
	end
	tag = expansion
	return tag
end


--[=[
Normalize a component of a multipart tag. This should not have any // in it, but may join multiple individual tags with
a colon, and may be a single list-tag shortcut, which is treated as if colon-separated. The return value may be a list
of tags.
]=]
local function normalize_multipart_component(tag, lang, do_track)
	-- If there is HTML or a link in the tag, don't try to split on colon. A colon may legitimately occur in either one,
	-- and we don't want these things parsed. Note that we don't do this check before splitting on //, which we don't
	-- expect to occur in links or HTML; see comment in normalize_tag().
	if export.is_link_or_html(tag) then
		return tag
	end
	local components = rsplit(tag, ":", true)
	if #components == 1 then
		-- We allow list-tag shortcuts inside of multipart tags, e.g.
		-- '1s//3p'. Check for this now.
		tag = export.lookup_shortcut(tag, lang, do_track)
		if type(tag) == "table" then
			-- We found a list-tag shortcut; treat as if colon-separated.
			components = tag
		else
			return normalize_single_tag(tag, lang, do_track)
		end
	end
	local normtags = {}
	for _, component in ipairs(components) do
		table.insert(normtags, normalize_single_tag(component, lang, do_track))
	end

	return normtags
end


--[=[
Normalize a single tag. The return value may be a list (in the case of multipart tags), which will contain nested lists
in the case of two-level multipart tags.
]=]
local function normalize_tag(tag, lang, do_track)
	-- We don't check for links or HTML before splitting on //, which we don't expect to occur in links or HTML. Doing
	-- it this way allows for a tag like '{{lb|grc|Epic}}//{{lb|grc|Ionic}}' to function correctly (the template calls
	-- will be expanded before we process the tag, and will contain links and HTML). The only check we do is for a URL,
	-- which shouldn't normally occur, but might if the user tries to put an external link into the tag. URL's with //
	-- normally have the sequence ://, which should never normally occur when // and : are used in their normal ways.
	if tag:find("://", nil, true) then
		return tag
	end
	local split_tags = rsplit(tag, "//", true)
	if #split_tags == 1 then
		local retval = normalize_multipart_component(tag, lang, do_track)
		if type(retval) == "table" then
			-- The user gave a tag like '1:s', i.e. with colon but without //. Allow this, but we need to return a
			-- nested list.
			return {retval}
		end
		return retval
	end
	local normtags = {}
	for _, single_tag in ipairs(split_tags) do
		table.insert(normtags, normalize_multipart_component(single_tag, lang, do_track))
	end
	return normtags
end


--[==[
Normalize a tag set (a list of tags) into its canonical-form tags. The return value is a list of normalized tag sets
(a list because of there may be conjoined shortcuts among the input tags). A normalized tag set is a list of tag
elements, where each element is either a string (the canonical form of a tag), a list of such strings (in the case of
multipart tags) or a list of lists of such strings (in the case of two-level multipart tags). For example, the multipart
tag `nom//acc//voc` will be represented in canonical form as { {"nominative", "accusative", "vocative"}}, and the
two-level multipart tag `1:s//3:p` will be represented as { {{"first-person", "singular"}, {"third-person", "plural"}}}.

Example 1:

{normalize_tag_set({"nom//acc//voc", "n", "p"})} = { {{{"nominative", "accusative", "vocative"}, "masculine", "plural"}}}

Example 2:

{normalize_tag_set({"ed-form"}, ENGLISH)} = { {{"simple", "past"}, {"past", "participle"}}}

Example 3:

{normalize_tag_set({"archaic", "ed-form"}, ENGLISH)} = { {{"archaic", "simple", "past"}, {"archaic", "past", "participle"}}}
]==]
function export.normalize_tag_set(tag_set, lang, do_track)
	
	local output_tag_set = {}
	local saw_semicolon = false

	for _, tag in ipairs(tag_set) do
		-- Expand the tag, which may generate a new tag (either a fully canonicalized tag, a multipart tag, or a list
		-- of tags).
		tag = export.lookup_shortcut(tag, lang, do_track)
		if type(tag) == "table" then
			saw_semicolon = m_table.contains(tag, ";")
			if saw_semicolon then
				-- If we saw a conjoined shortcut, we need to use a more general algorithm that can expand a single
				-- tag set into multiple.
				break
			end

			for _, t in ipairs(tag) do
				table.insert(output_tag_set, normalize_tag(t, lang, do_track))
			end
		else
			table.insert(output_tag_set, normalize_tag(tag, lang, do_track))
		end
	end

	if not saw_semicolon then
		return {output_tag_set}
	end

	-- Use a more general algorithm that handles conjoined shortcuts.
	local output_tag_set = {}
	for i, tag in ipairs(tag_set) do
		-- Expand the tag, which may generate a new tag (either a fully canonicalized tag, a multipart tag, or a list
		-- of tags).
		tag = export.lookup_shortcut(tag, lang, do_track)
		if type(tag) == "table" then
			local output_tag_sets = {}
			local shortcut_tag_sets = export.split_tag_set(tag)
			local normalized_shortcut_tag_sets = {}
			for _, shortcut_tag_set in ipairs(shortcut_tag_sets) do
				m_table.extendList(normalized_shortcut_tag_sets,
					export.normalize_tag_set(shortcut_tag_set, lang, do_track))
			end
			local after_tags = slice(tag_set, i + 1)
			local normalized_after_tags_sets = export.normalize_tag_set(after_tags, lang, do_track)
			for _, normalized_shortcut_tag_set in ipairs(normalized_shortcut_tag_sets) do
				for _, normalized_after_tags_set in ipairs(normalized_after_tags_sets) do
					table.insert(output_tag_sets, m_table.append(output_tag_set, normalized_shortcut_tag_set,
						normalized_after_tags_set))
				end
			end
			return output_tag_sets
		else
			table.insert(output_tag_set, normalize_tag(tag, lang, do_track))
		end
	end

	error("Internal error: Should not get here")
end


function export.combine_multipart_tags(tag_set)
	for i, tag in ipairs(tag_set) do
		if type(tag) == "table" then
			for j, subtag in ipairs(tag) do
				if type(subtag) == "table" then
					tag[j] = table.concat(subtag, ":")
				end
			end
			tag_set[i] = table.concat(tag, "//")
		end
	end

	return tag_set
end


function export.normalize_tags(tags, lang, recombine_multitags, do_track)
	local tag_sets = export.normalize_tag_set(tags, lang, do_track)
	if recombine_multitags then
		for i, tag_set in ipairs(tag_sets) do
			tag_sets[i] = export.combine_multipart_tags(tag_set)
		end
		return export.combine_tag_sets(tag_sets)
	end
	return tag_sets
end


--[==[
Split a tag set containing two-level multipart tags into one or more tag sets not containing such tags.
Single-level multipart tags are left alone. (If we need to, a slight modification of the following code
will also split single-level multipart tags.) This assumes that multipart tags are represented as lists
and two-level multipart tags are represented as lists of lists, as is output by {normalize_tag_set()}.
NOTE: We have to be careful to properly handle imbalanced two-level multipart tags such as
`def:s//p` (or the reverse, `s//def:p`).
]==]
function export.split_two_level_multipart_tag_set(tag_set)
	for i, tag in ipairs(tag_set) do
		if type(tag) == "table" then
			-- We saw a multipart tag. Check if any of the parts are two-level.
			local saw_two_level_tag = false
			for _, first_level_tag in ipairs(tag) do
				if type(first_level_tag) == "table" then
					saw_two_level_tag = true
					break
				end
			end
			if saw_two_level_tag then
				-- We found a two-level multipart tag.
				-- (1) Extract the preceding tags.
				local pre_tags = slice(tag_set, 1, i - 1)
				-- (2) Extract the following tags.
				local post_tags = slice(tag_set, i + 1)
				-- (3) Loop over each tag set alternant in the two-level multipart tag.
				-- For each alternant, form the tag set consisting of pre_tags + alternant + post_tags,
				-- and recursively split that tag set.
				local resulting_tag_sets = {}
				for _, first_level_tag_set in ipairs(tag) do
					local expanded_tag_set = {}
					m_table.extendList(expanded_tag_set, pre_tags)
					-- The second level may have a string or a list.
					if type(first_level_tag_set) == "table" then
						m_table.extendList(expanded_tag_set, first_level_tag_set)
					else
						table.insert(expanded_tag_set, first_level_tag_set)
					end
					m_table.extendList(expanded_tag_set, post_tags)
					m_table.extendList(resulting_tag_sets, export.split_two_level_multipart_tag_set(expanded_tag_set))
				end
				return resulting_tag_sets
			end
		end
	end

	return {tag_set}
end


--[==[
Split a tag set that may consist of multiple semicolon-separated tag sets into the component tag sets.
]==]
function export.split_tag_set(tag_set)
	local split_tag_sets = {}
	local cur_tag_set = {}
	for _, tag in ipairs(tag_set) do
		if tag == ";" then
			if #cur_tag_set > 0 then
				table.insert(split_tag_sets, cur_tag_set)
			end
			cur_tag_set = {}
		else
			table.insert(cur_tag_set, tag)
		end
	end
	if #cur_tag_set > 0 then
		table.insert(split_tag_sets, cur_tag_set)
	end
	return split_tag_sets
end

export.split_tags_into_tag_sets = export.split_tag_set


--[==[
Combine multiple tag sets in a tag set group into a simple tag set, with logical tag sets separated by semicolons.
This is the opposite of {split_tag_set()}.
]==]
function export.combine_tag_sets(tag_sets)
	if #tag_sets == 1 then
		return tag_sets[1]
	end
	local combined_tag_set = {}
	for _, tag_set in ipairs(tag_sets) do
		if #combined_tag_set > 0 then
			table.insert(combined_tag_set, ";")
		end
		m_table.extendList(combined_tag_set, tag_set)
	end
	return tags
end


local tag_set_param_mods = {
	lb = {
		item_dest = "labels",
		convert = function(arg, parse_err)
			return rsplit(arg, "//", true)
		end,
	}
}


--[==[
Parse tag set properties from a tag set (list of tags). Currently no per-tag properties are recognized, and the only
per-tag-set property recognized is `<lb:...>` for specifing label(s) for the tag set. Per-tag-set properties must be
attached to the last tag.
]==]
function export.parse_tag_set_properties(tag_set)
	local function generate_tag_set_obj(last_tag)
		tag_set[#tag_set] = last_tag
		return {tags = tag_set}
	end
	local last_tag = tag_set[#tag_set]
	-- Check for inline modifier, e.g. מרים<tr:Miryem>. But exclude HTML entry with <span ...>, <i ...>, <br/> or
	-- similar in it, caused by wrapping an argument in {{l|...}}, {{af|...}} or similar. Basically, all tags of
	-- the sort we parse here should consist of a less-than sign, plus letters, plus a colon, e.g. <lb:...>, so if
	-- we see a tag on the outer level that isn't in this format, we don't try to parse it. The restriction to the
	-- outer level is to allow generated HTML inside of e.g. qualifier tags, such as foo<q:similar to {{m|fr|bar}}>.
	if last_tag:find("<") and not last_tag:find("^[^<]*<[a-z]*[^a-z:]") then
		return require(parse_utilities_module).parse_inline_modifiers(last_tag, {
			param_mods = tag_set_param_mods,
			generate_obj = generate_tag_set_obj,
		})
	else
		return generate_tag_set_obj(last_tag)
	end
end


function export.normalize_pos(pos)
	if not pos then
		return nil
	end
	return mw.loadData(export.form_of_pos_module)[pos] or pos
end


-- Return the display form of a single canonical-form tag. The value
-- passed in must be a string (i.e. it cannot be a list describing a
-- multipart tag). To handle multipart tags, use get_tag_display_form().
local function get_single_tag_display_form(normtag, lang)
	local data = export.lookup_tag(normtag, lang)
	local display = normtag

	-- If the tag has a special display form, use it
	if data and data.display then
		display = data.display
	end

	-- If there is a nonempty glossary index, then show a link to it
	local glossary = data and data[export.GLOSSARY]
	if glossary ~= nil then
		if glossary == export.WIKT then
			display = "[[" .. normtag .. "|" .. display .. "]]"
		elseif glossary == export.WP then
			display = "[[w:" .. normtag .. "|" .. display .. "]]"
		elseif glossary == export.APPENDIX then
			display = "[[Appendix:Glossary#" .. mw.uri.anchorEncode(normtag) .. "|" .. display .. "]]"
		elseif type(glossary) ~= "string" then
			error(("Internal error: Wrong type %s for glossary value %s for tag %s"):format(
				type(glossary), mw.dumpObject(glossary), normtag))
		else
			local link = rmatch(glossary, "^wikt:(.*)")
			if link then
				display = "[[" .. link .. "|" .. display .. "]]"
			end
			if not link then
				link = rmatch(glossary, "^w:(.*)")
				if link then
					display = "[[w:" .. link .. "|" .. display .. "]]"
				end
			end
			if not link then
				display = "[[Appendix:Glossary#" .. mw.uri.anchorEncode(glossary) .. "|" .. display .. "]]"
			end
		end
	end
	return display
end


--[==[
Turn a canonicalized tag spec (which describes a single, possibly multipart tag) into the displayed form. The tag spec
may be a string (a canonical-form tag); a list of canonical-form tags (in the case of a simple multipart tag); or a
list of mixed canonical-form tags and lists of such tags (in the case of a two-level multipart tag). `joiner` indicates
how to join the parts of a multipart tag, and can be either {"and"} ("foo and bar", or "foo, bar and baz" for 3 or
more), {"slash"} ("foo/bar"), {"en-dash"} ("foo–bar") or {nil}, which uses the global default found in
{multipart_join_strategy()} in [[Module:form of/functions]]. (NOTE: The global default is {"slash"} and this seems
unlikely to change.)
]==]
function export.get_tag_display_form(tagspec, lang, joiner)
	if type(tagspec) == "string" then
		return get_single_tag_display_form(tagspec, lang)
	end
	-- We have a multipart tag. See if there's a display handler to display them specially.
	for _, handler in ipairs(require(export.form_of_functions_module).display_handlers) do
		local displayval = handler(tagspec, joiner)
		if displayval then
			return displayval
		end
	end
	-- No display handler.
	local displayed_tags = {}
	for _, first_level_tag in ipairs(tagspec) do
		if type(first_level_tag) == "string" then
			table.insert(displayed_tags, get_single_tag_display_form(first_level_tag, lang))
		else
			-- A first-level element of a two-level multipart tag. Currently we just separate the individual components
			-- with spaces, but other ways are possible, e.g. using an underscore, colon, parens or braces.
			local components = {}
			for _, component in ipairs(first_level_tag) do
				table.insert(components, get_single_tag_display_form(component, lang))
			end
			table.insert(displayed_tags, table.concat(components, " "))
		end
	end
	return require(export.form_of_functions_module).join_multiparts(displayed_tags, joiner)
end


--[==[
Given a normalized tag set (i.e. as output by {normalize_tag_set()}; all tags are in canonical form, multipart tags are
represented as lists, and two-level multipart tags as lists of lists), convert to displayed form (a string). See
{get_tag_display_form()} for the meaning of `joiner`.
]==]
function export.get_tag_set_display_form(normalized_tag_set, lang, joiner)
	local parts = {}

	for _, tagspec in ipairs(normalized_tag_set) do
		local to_insert = export.get_tag_display_form(tagspec, lang, joiner)
		-- Maybe insert a space before inserting the display form of the tag. We insert a space if
		-- (a) we're not the first tag; and
		-- (b) the tag we're about to insert doesn't have the "no_space_on_left" property; and
		-- (c) the preceding tag doesn't have the "no_space_on_right" property.
		-- NOTE: We depend here on the fact that
		-- (1) all tags with either of the above properties set have the same display form as canonical form, and
		-- (2) all tags with either of the above properties set are single-character tags.
		-- The second property is an optimization to avoid looking up display forms resulting from multipart tags,
		-- which won't be found and which will trigger loading of [[Module:form of/data2]]. If multichar punctuation is
		-- added in the future, it's ok to change the == 1 below to <= 2 or <= 3.
		--
		-- If the first property above fails to hold in the future, we need to track the canonical form of each tag
		-- (including the previous one) as well as the display form. This would also avoid the need for the == 1 check.
		if #parts > 0 then
			local most_recent_tagobj = ulen(parts[#parts]) == 1 and export.lookup_tag(parts[#parts], lang)
			local to_insert_tagobj = ulen(to_insert) == 1 and export.lookup_tag(to_insert, lang)
			if (
				(not most_recent_tagobj or not most_recent_tagobj.no_space_on_right) and
				(not to_insert_tagobj or not to_insert_tagobj.no_space_on_left)
			) then
				table.insert(parts, " ")
			end
		end
		table.insert(parts, to_insert)
	end

	return table.concat(parts)
end


--[==[
Given a normalized tag set (i.e. as output by {normalize_tag_set()}; all tags are in canonical form, multipart tags are
represented as lists, and two-level multipart tags as lists of lists), fetch the associated categories and labels.
Return two values, a list of categories and a list of labels. `lang` is the language of term represented by the tag set,
and `POS` is the user-provided part of speech (which may be {nil}).
]==]
function export.fetch_categories_and_labels(normalized_tag_set, lang, POS, pagename, lemmas)
	local m_cats = mw.loadData(export.form_of_cats_module)
	local categories = {}
	local labels = {}

	POS = export.normalize_pos(POS)
	-- First split any two-level multipart tags into multiple sets, to make our life easier.
	for _, tag_set in ipairs(export.split_two_level_multipart_tag_set(normalized_tag_set)) do
		-- Call a named function, either from the lang-specific data in
		-- [[Module:form of/lang-specific/LANGCODE/functions]] or in [[Module:form of/functions]].
		local function call_named_function(name, funtype)
			local data = {
				pagename = pagename or mw.title.getCurrentTitle().subpageText,
				lemmas = lemmas,
				tag_set = normalized_tag_set,
				lang = lang,
				POS = POS
			}
			local modules_tried = {}
			local function try_lang_specific_module(langcode)
				if export.langs_with_lang_specific_tags[langcode] then
					local lang_specific_module = export.form_of_lang_data_module_prefix .. langcode .. "/functions"
					local langdata = require(utilities_module).safe_require(lang_specific_module)
					if langdata then
						table.insert(modules_tried, lang_specific_module)
						if langdata.cat_functions then
							local fn = langdata.cat_functions[name]
							if fn then
								return fn(data), true
							end
						end
					end
				end
				return nil, false
			end
			-- First try lang-specific.
			local langcode = lang and lang:getCode()
			if langcode then
				local retval, found_it = try_lang_specific_module(langcode)
				if found_it then
					return retval
				end
			end
			-- If the lang we're dealing with is an etym-only lang, try again with the corresponding full language.
			local full_langcode = lang and lang:getFullCode()
			if full_langcode and full_langcode ~= langcode then
				local retval, found_it = try_lang_specific_module(full_langcode)
				if found_it then
					return retval
				end
			end
			-- Try lang-independent.
			table.insert(modules_tried, export.form_of_functions_module)
			local fn = require(export.form_of_functions_module).cat_functions[name]
			if fn then
				return fn(data)
			end
			for i, modname in ipairs(modules_tried) do
				modules_tried[i] = "[[" .. modname .. "]]"
			end
			error(("No %s function named '%s' in %s"):format(funtype, name, lang_specific_part,
				m_table.serialCommaJoin(modules_tried, {conj = "or", dontTag = true})))
		end

		-- Given a tag from the current tag set (which may be a list in case of a multipart tag),
		-- and a tag from a categorization spec, check that the two match.
		-- (1) If both are strings, we just check for equality.
		-- (2) If the spec tag is a string and the tag set tag is a list (i.e. it originates from a
		-- multipart tag), we check that the spec tag is in the list. This is because we want to treat
		-- multipart tags in user-specified tag sets as if the user had specified multiple tag sets.
		-- For example, if the user said "1//3|s|pres|ind" and the categorization spec says {"has", "1"},
		-- we want this to match, because "1//3|s|pres|ind" should be treated equivalently to two tag
		-- sets "1|s|pres|ind" and "3|s|pres|ind", and the former matches the categorization spec.
		-- (3) If the spec tag is a list (i.e. it originates from a multipart tag), we check that the
		-- tag set tag is also a list and is a superset of the spec tag. For example, if the categorization
		-- spec says {"has", "1//3"}, then the tag set tag must be a multipart tag that has both "1" and "3"
		-- in it. "1//3" works, as does "1//2//3".
		local function tag_set_tag_matches_spec_tag(tag_set_tag, spec_tag)
			if type(spec_tag) == "table" then
				if type(tag_set_tag) == "table" and is_subset(spec_tag, tag_set_tag) then
					return true
				end
			elseif type(tag_set_tag) == "table" then
				if m_table.contains(tag_set_tag, spec_tag) then
					return true
				end
			elseif tag_set_tag == spec_tag then
				return true
			end
			return false
		end

		-- Check that the current tag set matches the given spec tag. This means that any of the tags
		-- in the current tag set match, according to tag_set_tag_matches_spec_tag(); see above. If the
		-- current tag set contains only string tags (i.e. no multipart tags), and the spec tag is a
		-- string (i.e. not a multipart tag), this boils down to list containment, but it gets more
		-- complex when multipart tags are present.
		local function tag_set_matches_spec_tag(spec_tag)
			spec_tag = normalize_tag(spec_tag, lang)
			for _, tag_set_tag in ipairs(tag_set) do
				if tag_set_tag_matches_spec_tag(tag_set_tag, spec_tag) then
					return true
				end
			end
			return false
		end

		-- Check whether the given spec matches the current tag set. Two values are returned:
		-- (1) whether the spec matches the tag set; (2) the index of the category to add if
		-- the spec matches.
		local function check_condition(spec)
			if type(spec) == "boolean" then
				return spec
			elseif type(spec) ~= "table" then
				error("Wrong type of condition " .. spec .. ": " .. type(spec))
			end
			local predicate = spec[1]
			if predicate == "has" then
				return tag_set_matches_spec_tag(spec[2]), 3
			elseif predicate == "hasall" then
				for _, tag in ipairs(spec[2]) do
					if not tag_set_matches_spec_tag(tag) then
						return false, 3
					end
				end
				return true, 3
			elseif predicate == "hasany" then
				for _, tag in ipairs(spec[2]) do
					if tag_set_matches_spec_tag(tag) then
						return true, 3
					end
				end
				return false, 3
			elseif predicate == "tags=" then
				local normalized_spec_tag_sets = export.normalize_tag_set(spec[2], lang)
				if #normalized_spec_tag_sets > 1 then
					error("Internal error: No support for conjoined shortcuts in category/label specs in "
						.. "[[Module:form of/cats]] when processing spec tag set " .. table.concat(spec[2], "|"))
				end
				local normalized_spec_tag_set = normalized_spec_tag_sets[1]
				-- Check for and disallow two-level multipart tags in the specs. FIXME: Remove this when we remove
				-- support for two-level multipart tags.
				for _, tag in ipairs(normalized_spec_tag_set) do
					if type(tag) == "table" then
						for _, subtag in ipairs(tag) do
							if type(subtag) == "table" then
								error("Internal error: No support for two-level multipart tags in category/label specs"
									.. "[[Module:form of/cats]] when processing spec tag set "
									.. table.concat(spec[2], "|"))
							end
						end
					end
				end
				-- Allow tags to be in different orders, and multipart tags to be in different orders. To handle this,
				-- we first check that both tag set tags and spec tags have the same length. If so, we sort the
				-- multipart tags in the tag set tags and spec tags, and then check that all tags in the spec tags are
				-- in the tag set tags.
				if #tag_set ~= #normalized_spec_tag_set then
					return false, 3
				end
				local tag_set_tags = m_table.deepcopy(tag_set)
				for i=1,#tag_set_tags do
					if type(tag_set_tags[i]) == "table" then
						table.sort(tag_set_tags[i])
					end
					if type(normalized_spec_tag_set[i]) == "table" then
						table.sort(normalized_spec_tag_set[i])
					end
				end
				for i=1,#tag_set_tags do
					if not m_table.contains(tag_set_tags, normalized_spec_tag_set[i]) then
						return false, 3
					end
				end
				return true, 3
			elseif predicate == "p=" then
				return POS == export.normalize_pos(spec[2]), 3
			elseif predicate == "pany" then
				for _, specpos in ipairs(spec[2]) do
					if POS == export.normalize_pos(specpos) then
						return true, 3
					end
				end
				return false, 3
			elseif predicate == "pexists" then
				return POS ~= nil, 2
			elseif predicate == "not" then
				local condval = check_condition(spec[2])
				return not condval, 3
			elseif predicate == "and" then
				local condval = check_condition(spec[2])
				if condval then
					condval = check_condition(spec[3])
				end
				return condval, 4
			elseif predicate == "or" then
				local condval = check_condition(spec[2])
				if not condval then
					condval = check_condition(spec[3])
				end
				return condval, 4
			elseif predicate == "call" then
				return fn(call_named_function(spec[2], "condition")), 3
			else
				error("Unrecognized predicate: " .. predicate)
			end
		end

		-- Process a given spec. This checks any conditions in the spec against the
		-- tag set, and insert any resulting categories into `categories`. Return value
		-- is true if the outermost condition evaluated to true and a category was inserted
		-- (this is used in {"cond" ...} conditions, which stop when a subcondition evaluates
		-- to true).
		local function process_spec(spec)
			if not spec then
				return false
			elseif type(spec) == "string" then
				-- A category. Substitute POS request with user-specified part of speech or default.
				spec = rsub(spec, "<<p=(.-)>>", function(default)
					return POS or export.normalize_pos(default)
				end)
				table.insert(categories, lang:getFullName() .. " " .. spec)
				return true
			elseif type(spec) == "table" and spec.labels then
				-- A label spec.
				for _, label in ipairs(spec.labels) do
					m_table.insertIfNot(labels, label)
				end
				return true
			elseif type(spec) ~= "table" then
				error("Wrong type of specification " .. spec .. ": " .. type(spec))
			end
			local predicate = spec[1]
			if predicate == "multi" then
				-- WARNING! #spec doesn't work for objects loaded from loadData()
				for i, sp in ipairs(spec) do
					if i > 1 then
						process_spec(sp)
					end
				end
				return true
			elseif predicate == "cond" then
				-- WARNING! #spec doesn't work for objects loaded from loadData()
				for i, sp in ipairs(spec) do
					if i > 1 and process_spec(sp) then
						return true
					end
				end
				return false
			elseif predicate == "call" then
				return process_spec(call_named_function(spec[2], "spec"))
			else
				local condval, ifspec = check_condition(spec)
				if condval then
					process_spec(spec[ifspec])
					return true
				else
					process_spec(spec[ifspec + 1])
					-- FIXME: Are we sure this is correct?
					return false
				end
			end
		end

		local langcode = lang:getCode()
		local langspecs = m_cats[langcode]
		if langspecs then
			for _, spec in ipairs(langspecs) do
				process_spec(spec)
			end
		end
		local full_code = lang:getFullCode()
		if full_code ~= langcode then
			local langspecs = m_cats[full_code]
			if langspecs then
				for _, spec in ipairs(langspecs) do
					process_spec(spec)
				end
			end
		end
		if full_code ~= "und" then
			local langspecs = m_cats["und"]
			if langspecs then
				for _, spec in ipairs(langspecs) do
					process_spec(spec)
				end
			end
		end
	end

	return categories, labels
end


--[==[
Implementation of templates that display inflection tags, such as the general {{tl|inflection of}}, semi-specific
variants such as {{tl|participle of}}, and specific variants such as {{tl|past participle of}}. `data` contains all the
information controlling the display, with the following fields:

* `.lang`: ('''''required''''') Language to use when looking up language-specific inflection tags, categories and
  labels, and for displaying categories and labels.
* `.tags`: ('''''required''' unless `.tag_sets` is given'') List of non-canonicalized inflection tags. Multiple tag sets
  can be indicated by a {";"} as one of the tags, and tag-set properties may be attached to the last tag of a tag set.
  The tags themselves may come directly from the user (as in {{tl|inflection of}}); come partly from the user (as in
  {{tl|participle of}}, which adds the tag `part` to user-specified inflection tags); or be entirely specified by the
  template (as in {{tl|past participle of}}).
* `.tag_sets`: ('''''required''' unless `.tags` is given'') List of non-canonicalized tag sets and associated
  per-tag-set properties. Each element of the list is an object of the form
  { {tags = {"TAG", "TAG", ...}, labels = {"LABEL", "LABEL", ...}}. If `.tag_sets` is specified, `.tags` should not be
  given and vice-versa. Specifying `.tag_sets` in place of tags allowed per-tag set labels to be specified; otherwise,
  there is no advantage. [[Module:pt-gl-inflections]] uses this functionality to supply labels like {"Brazil"} and
  {"Portugal"} associated with specific tag sets.
* `.lemmas`: ('''''recommended''''') List of objects describing the lemma(s) of which the term in question is a
  non-lemma form. These are passed directly to {full_link()} in [[Module:links]]. Each object should have at minimum a
  `.lang` field containing the language of the lemma and a `.term` field containing the lemma itself. Each object is
  formatted using {full_link()} and then if there are more than one, they are joined using {serialCommaJoin()} in
  [[Module:table]]. Alternatively, `.lemmas` can be a string, which is displayed directly. If omitted entirely, no lemma
  links are shown and the connecting "of" is also omitted.
* `.lemma_face`: ('''''recommended''''') "Face" to use when displaying the lemma objects. Usually should be set to
  {"term"}.
* `.POS`: ('''''recommended''''') Categorizing part-of-speech tag. Comes from the {{para|p}} or {{para|POS}} argument of
  {{tl|inflection of}}.
* `.pagename`: Page name of "current" page or nil to use the actual page title; for testing purposes.
* `.enclitics`: List of enclitics to display after the lemmas, in parens.
* `.no_format_categories`: If true, don't format the categories derived from the inflection tags; just return them.
* `.sort`: Sort key for formatted categories. Ignored when `.no_format_categories` = {true}.
* `.nocat`: Suppress computation of categories (even if `.no_format_categories` is not given).
* `.notext`: Disable display of all tag text and `inflection of` text. (FIXME: Maybe not implemented correctly.)
* `.capfirst`: Capitalize the first word displayed.
* `.pretext`: Additional text to display before the inflection tags, but after any top-level labels.
* `.posttext`: Additional text to display after the lemma links.
* `.text_classes`: CSS classes used to wrap the tag text and lemma links. Default is
  {"form-of-definition use-with-mention"} for the tag text, {"form-of-definition-link"} for the lemma links. (FIXME:
  Should separate out the lemma links into their own field.)
`.joiner`: Override the joiner (normally a slash) used to join multipart tags. You should normally not specify this.

A typical call might look like this (for {{m+|es|amo}}): {
	local lang = require("Module:languages").getByCode("es")

	local lemma_obj = {
		lang = lang,
		term = "amar",
	}

	return m_form_of.tagged_inflections({
		lang = lang, tags = {"1", "s", "pres", "ind"}, lemmas = {lemma_obj}, lemma_face = "term", POS = "verb"
	})
}

Normally, one value is returned, the formatted text, which has appended to it the formatted categories derived from the
tag-set-related categories generated by the specs in [Module:form of/cats]]. To suppress this, set
`data.no_format_categories` = {true}, in which case two values are returned, the formatted text without any formatted
categories appended and a list of the categories to be formatted.

NOTE: There are two sets of categories that may be generated: (1) categories derived directly from the tag sets, as
specified in [[Module:form of/cats]]; (2) categories derived from tag-set labels, either (a) set explicitly by the
caller in `data.tag_sets`, (b) specified by the user using `<lb:...>` attached to the last tag in a tag set, or
(c) specified in [[Module:form of/cats]]. The second type (label-related categories) are currently not returned in
the second return value of {tagged_inflections()}, and are currently inserted into the output text even if
`data.no_format_categories` is set to {true}; but they can be suppressed by setting `data.nocat` = {true} (which also
suppresses the first type of categories, those derived directly from tag sets, even if `data.no_format_categories` is
set to {true}).
]==]
function export.tagged_inflections(data)
	if not data.tags and not data.tag_sets then
		error("First argument must be a table of arguments, and `.tags` or `.tag_sets` must be specified")
	end
	if data.tags and data.tag_sets then
		error("Both `.tags` and `.tag_sets` cannot be specified")
	end
	local tag_sets = data.tag_sets
	if not tag_sets then
		tag_sets = export.split_tag_set(data.tags)
		for i, tag_set in ipairs(tag_sets) do
			tag_sets[i] = export.parse_tag_set_properties(tag_set)
		end
	end

	local inflections = {}
	local categories = {}
	for _, tag_set in ipairs(tag_sets) do
		local normalized_tag_sets = export.normalize_tag_set(tag_set.tags, data.lang, "do-track")

		for _, normalized_tag_set in ipairs(normalized_tag_sets) do
			local cur_infl = {}
			local this_categories, this_labels = export.fetch_categories_and_labels(normalized_tag_set, data.lang,
				data.POS, data.pagename, type(data.lemmas) == "table" and data.lemmas or nil)
			if not data.nocat then
				m_table.extendList(categories, this_categories)
			end
			local cur_infl = export.get_tag_set_display_form(normalized_tag_set, data.lang, data.joiner)
			if #cur_infl > 0 then
				if tag_set.labels then
					this_labels = m_table.append(tag_set.labels, this_labels)
				end
				table.insert(inflections, {infl_text = cur_infl, labels = this_labels})
			end
		end
	end

	local overall_labels, need_per_tag_set_labels
	for _, inflection in ipairs(inflections) do
		if overall_labels == nil then
			overall_labels = inflection.labels
		elseif not m_table.deepEquals(overall_labels, inflection.labels) then
			need_per_tag_set_labels = true
			overall_labels = nil
			break
		end
	end

	if not need_per_tag_set_labels then
		for _, inflection in ipairs(inflections) do
			inflection.labels = nil
		end
	end

	local format_data = m_table.shallowcopy(data)

	local function format_labels(labels, notext)
		if labels and #labels > 0 then
			return require(labels_module).show_labels { labels = labels, lang = data.lang, sort = data.sort, nocat = data.nocat } ..
				(notext and (data.pretext or "") == "" and "" or " ")
		else
			return ""
		end
	end

	local of_text = data.lemmas and " of" or ""
	local formatted_text
	if #inflections == 1 then
		if need_per_tag_set_labels then
			error("Internal error: need_per_tag_set_labels should not be set with one inflection")
		end
		format_data.text = format_labels(overall_labels, data.notext) .. (data.pretext or "") .. (data.notext and "" or
			((data.capfirst and require("Module:string utilities").ucfirst(inflections[1].infl_text) or inflections[1].infl_text) .. of_text))
		formatted_text = export.format_form_of(format_data)
	else
		format_data.text = format_labels(overall_labels, data.notext) .. (data.pretext or "") .. (data.notext and "" or
			((data.capfirst and "Inflection" or "inflection") .. of_text))
		format_data.posttext = (data.posttext or "") .. ":"
		local link = export.format_form_of(format_data)
		local text_classes = data.text_classes or "form-of-definition use-with-mention"
		for i, inflection in ipairs(inflections) do
			inflections[i] = "\n## " .. format_labels(inflection.labels, false) ..
				"<span class='" .. text_classes .. "'>" .. inflection.infl_text .. "</span>"
		end
		formatted_text = link .. table.concat(inflections)
	end

	if not data.no_format_categories then
		if #categories > 0 then
			formatted_text = formatted_text .. require("Module:utilities").format_categories(categories, data.lang,
				data.sort, nil, export.force_cat)
		end
		return formatted_text
	end
	return formatted_text, categories
end


--[==[
Given a tag set, return a flattened list all Wikidata ID's of all tags in the tag set. FIXME: Only used in a debugging
function in [[Module:se-verbs]]; move there.
]==]
function export.to_Wikidata_IDs(tag_set, lang, skip_tags_without_ids)
	local ret = {}

	local function get_wikidata_id(tag)
		local data = export.lookup_tag(tag, lang)

		if not data or not data[export.WIKIDATA] then
			if not skip_tags_without_ids then
				error('The tag "' .. tag .. '" does not have a Wikidata ID defined in the form-of data modules')
			else
				return nil
			end
		else
			return ("Q%s"):format(data[export.WIKIDATA])
		end
	end

	local normalized_tag_sets = export.normalize_tag_set(tag_set, lang)
	for _, tag_set in ipairs(normalized_tag_sets) do
		for _, tag in ipairs(tag_set) do
			if type(tag) == "table" then
				for _, subtag in ipairs(tag) do
					if type(subtag) == "table" then
						-- two-level multipart tag; FIXME: delete support for this
						for _, subsubtag in ipairs(subtag) do
							table.insert(ret, get_wikidata_id(subsubtag))
						end
					else
						table.insert(ret, get_wikidata_id(subtag))
					end
				end
			else
				table.insert(ret, get_wikidata_id(tag))
			end
		end
	end

	return ret
end


function export.dump_form_of_data(frame)
	local data = {
		data = require(export.form_of_data_module),
		data2 = require(export.form_of_data2_module)
	}
	return require("Module:JSON").toJSON(data)
end


function export.finalize_tag_data(tags, shortcuts)
	local function process_shortcut(name, shortcut)
		-- If the shortcut is already in the list, then there is a duplicate.
		if shortcuts[shortcut] then
			error("The shortcut \"" .. shortcut .. "\" (for the inflection tag \"" .. name .. "\") conflicts with an existing shortcut for the tag \"" .. shortcuts[shortcut] .. "\".")
		elseif tags[shortcut] then
			error("The shortcut \"" .. shortcut .. "\" (for the inflection tag \"" .. name .. "\") conflicts with an existing tag with that name.")
		end

		shortcuts[shortcut] = name
	end
	for name, data in pairs(tags) do
		local data_shortcuts = data[export.SHORTCUTS]
		if data_shortcuts then
			if type(data_shortcuts) == "string" then
				process_shortcut(name, data_shortcuts)
			else
				for _, shortcut in ipairs(data_shortcuts) do
					process_shortcut(name, shortcut)
				end
			end
		end
	end
end


return export
Module:form of: Difference between revisions

Latest revision as of 18:12, 12 August 2024

Navigation menu

Search