Module:form of: Difference between revisions
Jump to navigation
Jump to search
No edit summary |
No edit summary |
||
(10 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
local export = {} | |||
export.force_cat = false -- for testing; set to true to display categories even on non-mainspace pages | |||
local m_links = require("Module:links") | local m_links = require("Module:links") | ||
local m_string_utils = require("Module:string utilities") | |||
local m_table = require("Module:table") | local m_table = require("Module:table") | ||
local | local parse_utilities_module = "Module:parse utilities" | ||
local labels_module = "Module:labels" | |||
local utilities_module = "Module:utilities" | |||
export.form_of_pos_module = "Module:form of/pos" | |||
export.form_of_functions_module = "Module:form of/functions" | |||
export.form_of_cats_module = "Module:form of/cats" | |||
export.form_of_lang_data_module_prefix = "Module:form of/lang-data/" | |||
export.form_of_data_module = "Module:form of/data" | |||
export.form_of_data2_module = "Module:form of/data2" | |||
local ulen = | local ulen = m_string_utils.len | ||
local rsubn = | local rsubn = m_string_utils.gsub | ||
local rmatch = | local rmatch = m_string_utils.match | ||
local rsplit = | local rsplit = m_string_utils.split | ||
export.TAG_TYPE = 1 | |||
export.GLOSSARY = 2 | |||
export.SHORTCUTS = 3 | |||
export.WIKIDATA = 4 | |||
export.APPENDIX = true | |||
export.WP = false | |||
export.WIKT = 0 | |||
--[==[ | |||
Set listing the languages with lang-specific tags. If a language isn't listed here, the tags for that language won't be | |||
recognized. | |||
]==] | |||
export.langs_with_lang_specific_tags = { | |||
["en"] = true, | |||
["got"] = true, | |||
["lt"] = true, | |||
["lv"] = true, | |||
["nl"] = true, | |||
["pi"] = true, | |||
["sw"] = true, | |||
["ttj"] = true, | |||
} | |||
--[==[ intro: | |||
This module implements the underlying processing of {{tl|form of}}, {{tl|inflection of}} and specific variants such as | |||
{{tl|past participle of}} and {{tl|alternative spelling of}}. Most of the logic in this file is to handle tags in | |||
{{tl|inflection of}}. Other related files: | |||
* [[Module:form of/templates]] contains the majority of the logic that implements the templates themselves. | |||
* [[Module:form of/data]] is a data-only file containing information on the more common inflection tags, listing the | |||
tags, their shortcuts, the category they belong to (tense-aspect, case, gender, voice-valence, etc.), the appropriate | |||
glossary link and the wikidata ID. | |||
* [[Module:form of/data2]] is a data-only file containing information on the less common inflection tags, in the same | |||
format as [[Module:form of/data]]. | |||
* [[Module:form of/lang-data/LANGCODE]] is a data-only file containing information on the language-specific inflection | |||
tags for the language with code LANGCODE, in the same format as [[Module:form of/data]]. Language-specific tags | |||
override general tags. | |||
* [[Module:form of/cats]] is a data-only file listing the language-specific categories that are added when the | |||
appropriate combinations of tags are seen for a given language. | |||
* [[Module:form of/pos]] is a data-only file listing the recognized parts of speech and their abbreviations, used for | |||
categorization. FIXME: This should be unified with the parts of speech listed in [[Module:links]]. | |||
* [[Module:form of/functions]] contains functions for use with [[Module:form of/data]] and [[Module:form of/cats]]. | |||
They are contained in this module because data-only modules can't contain code. The functions in this file are of two | |||
types: | |||
*# Display handlers allow for customization of the display of multipart tags (see below). Currently there is only | |||
one such handler, for handling multipart person tags such as `1//2//3`. | |||
*# Cat functions allow for more complex categorization logic, and are referred to by name in [[Module:form of/cats]]. | |||
Currently no such functions exist. | |||
The following terminology is used in conjunction with {{tl|inflection of}}: | |||
]=] | * A ''tag'' is a single grammatical item, as specified in a single numbered parameter of {{tl|inflection of}}. Examples | ||
are `masculine`, `nominative`, or `first-person`. Tags may be abbreviated, e.g. `m` for `masculine`, `nom` for | |||
`nominative`, or `1` for `first-person`. Such abbreviations are called ''aliases'', and some tags have multiple | |||
equivalent aliases (e.g. `p` or `pl` for `plural`). The full, non-abbreviated form of a tag is called its | |||
''canonical form''. | |||
* The ''display form'' of a tag is the way it's displayed to the user. Usually the displayed text of the tag is the same | |||
as its canonical form, and it normally functions as a link to a glossary entry explaining the tag. Usually the link is | |||
to an entry in [[Appendix:Glossary]], but sometimes the tag is linked to an individual dictionary entry or to a | |||
Wikipedia entry. Occasionally, the display text differs from the canonical form of the tag. An example is the tag | |||
`comparative case`, which has the display text read as simply `comparative`. Normally, tags referring to cases don't | |||
have the word "case" in them, but in this case the tag `comparative` was already used as an alias for the tag | |||
`comparative degree`, so the tag was named `comparative case` to avoid clashing. A similar situation occurs with | |||
`adverbial case` vs. the grammar tag `adverbial` (as in `adverbial participle`). | |||
* A ''tag set'' is an ordered list of tags, which together express a single inflection, for example, `1|s|pres|ind`, | |||
which can be expanded to canonical-form tags as `first-person|singular|present|indicative`. | |||
* A ''conjoined tag set'' is a tag set that consists of multiple individual tag sets separated by a semicolon, e.g. | |||
`1|s|pres|ind|;|2|s|imp`, which specifies two tag sets, `1|s|pres|ind` as above and `2|s|imp` (in canonical form, | |||
`second-person|singular|imperative`). Multiple tag sets specified in a single call to {{tl|inflection of}} are | |||
specified in this fashion. Conjoined tag sets can also occur in list-tag shortcuts. | |||
* A ''multipart tag'' is a tag that embeds multiple tags within it, such as `f//n` or `nom//acc//voc`. These are used in | |||
the case of [[syncretism]], when the same form applies to multiple inflections. Examples are the Spanish present | |||
subjunctive, where the first-person and third-person singular have the same form (e.g. {{m|es|siga}} from | |||
{{m|es|seguir|t=to follow}}), or Latin third-declension adjectives, where the dative and ablative plural of all | |||
genders have the same form (e.g. {{m|la|omnibus}} from {{m|la|omnis|t=all}}). These would be expressed respectively as | |||
`1//3|s|pres|sub` and `dat//abl|m//f//n|p`, where the use of the multipart tag compactly encodes the syncretism and | |||
avoids the need to individually list out all of the inflections. Multipart tags currently display as a list separated | |||
by a slash, e.g. ''dative/ablative'' or ''masculine/feminine/neuter'' where each individual word is linked | |||
appropriately. As a special case, multipart tags involving persons display specially; for example, the multipart tag | |||
`1//2//3` displays as ''first-, second- and third-person'', with the word "person" occurring only once. | |||
* A ''two-level multipart tag'' is a special type of multipart tag that joins two or more tag sets instead of joining | |||
individual tags. The tags within the tag set are joined by a colon, e.g. `1:s//3:p`, which is displayed as | |||
''first-person singular and third-person plural'', e.g. for use with the form {{m|grc|μέλλον}} of the verb | |||
{{m|grc|μέλλω|t=to intend}}, which uses the tag set `1:s//3:p|impf|actv|indc|unaugmented` to express the syncretism | |||
between the first singular and third plural forms of the imperfect active indicative unaugmented conjugation. | |||
Two-level multipart tags should be used sparingly; if in doubt, list out the inflections separately. [FIXME: Make | |||
two-level multipart tags obsolete.] | |||
* A ''shortcut'' is a tag that expands to any type of tag described above, or to any type of tag set described above. | |||
Aliases are a particular type of shortcut whose expansion is a single non-multipart tag. | |||
* A ''multipart shortcut'' is a shortcut that expands into a multipart tag, for example `123`, which expands to the | |||
multipart tag `1//2//3`. Only the most common such combinations exist as shortcuts. | |||
* A ''list shortcut'' is a special type of shortcut that expands to a list of tags instead of a single tag. For example, | |||
the shortcut `1s` expands to `1|s` (first-person singular). Only the most common such combinations exist as shortcuts. | |||
* A ''conjoined shortcut'' is a special type of list shortcut that consists of a conjoined tag set (multiple logical tag | |||
sets). For example, the English language-specific shortcut `ed-form` expands to `spast|;|past|part`, expressing the | |||
common syncretism between simple past and past participle in English (and in this case, `spast` is itself a list | |||
shortcut that expands to `simple|past`). | |||
]==] | |||
-- version of rsubn() that discards all but the first return value | -- version of rsubn() that discards all but the first return value | ||
Line 121: | Line 129: | ||
local function normalize_index(list, index) | |||
if index < 0 then | |||
return #list + index + 1 | |||
end | |||
return index | |||
end | |||
-- FIXME, consider moving to [[Module:table]] | |||
-- Return true if the list `tags1`, treated as a set, is a subset of the list `tags2`, also treated as a set. | |||
local function is_subset(tags1, tags2) | |||
tags1 = m_table.listToSet(tags1) | |||
tags2 = m_table.listToSet(tags2) | |||
for tag, _ in pairs(tags1) do | |||
if not tags2[tag] then | |||
return false | |||
end | |||
end | |||
return true | |||
end | |||
-- FIXME, move to [[Module:table]] | |||
local function slice(list, i, j) | |||
--checkType("slice", 1, list, "table") | |||
--checkType("slice", 2, i, "number", true) | |||
--checkType("slice", 3, j, "number", true) | |||
if i == nil then | |||
i = 1 | |||
else | |||
i = normalize_index(list, i) | |||
end | |||
j = normalize_index(list, j or -1) | |||
local retval = {} | |||
local k = 0 | |||
for index = i, j do | |||
k = k + 1 | |||
retval[k] = list[index] | |||
end | |||
return retval | |||
end | |||
local function wrap_in_span(text, classes) | |||
return ("<span class='%s'>%s</span>"):format(classes, text) | |||
end | |||
--[==[ | |||
Lowest-level implementation of form-of templates, including the general {{tl|form of}} as well as those that deal with | |||
inflection tags, such as the general {{tl|inflection of}}, semi-specific variants such as {{tl|participle of}}, and | |||
specific variants such as {{tl|past participle of}}. `data` contains all the information controlling the display, with | |||
the following fields: | |||
* `.text`: Text to insert before the lemmas. Wrapped in the value of `.text_classes`, or its default; see below. | |||
* `.lemmas`: List of objects describing the lemma(s) of which the term in question is a non-lemma form. These are passed | |||
directly to {full_link()} in [[Module:links]]. Each object should have at minimum a `.lang` field containing the | |||
language of the lemma and a `.term` field containing the lemma itself. Each object is formatted using {full_link()} | |||
and then if there are more than one, they are joined using {serialCommaJoin()} in [[Module:table]]. Alternatively, | |||
`.lemmas` can be a string, which is displayed directly, or omitted, to show no lemma links and omit the connecting | |||
text. | |||
* `.lemma_face`: "Face" to use when displaying the lemma objects. Usually should be set to {"term"}. | |||
* `.enclitics`: List of enclitics to display after the lemmas, in parens. | |||
* `.base_lemmas`: List of base lemmas to display after the lemmas, in the case where the lemmas in `.lemmas` are | |||
themselves forms of another lemma (the base lemma), e.g. a comparative, superlative or participle. Each object is of | |||
the form { { paramobj = PARAM_OBJ, lemmas = {LEMMA_OBJ, LEMMA_OBJ, ...} }} where PARAM_OBJ describes the properties | |||
of the base lemma parameter (i.e. the relationship between the intermediate and base lemmas) and LEMMA_OBJ is an | |||
object suitable to be passed to {full_link()} in [[Module:links]]. PARAM_OBJ is of the format | |||
{ { param = "PARAM", tags = {"TAG", "TAG", ...} } where PARAM is the name of the parameter to {{tl|inflection of}} | |||
etc. that holds the base lemma(s) of the specified relationship and the tags describe the relationship, such as | |||
{ {"comd"}} or { {"past", "part"}}. | |||
* `.text_classes`: CSS classes used to wrap the tag text and lemma links. Default is {"form-of-definition use-with-mention"} | |||
for the tag text and lemma links, and additionally {"form-of-definition-link"} specifically for the lemma links. | |||
(FIXME: Should separate out the lemma links into their own field.) | |||
* `.posttext`: Additional text to display after the lemma links. | |||
]==] | |||
function export.format_form_of(data) | function export.format_form_of(data) | ||
if type(data) ~= "table" then | if type(data) ~= "table" then | ||
error("First argument must now be a table of arguments") | error("Internal error: First argument must now be a table of arguments") | ||
end | end | ||
local text_classes = data.text_classes or "form-of-definition use-with-mention" | |||
local text_classes = data.text_classes or "" | local lemma_classes = data.text_classes or "form-of-definition-link" | ||
local | |||
local parts = {} | local parts = {} | ||
table.insert(parts, "<span class='" .. text_classes .. "'>") | local function ins(text) | ||
table.insert(parts, text) | |||
if data.text ~= "" and data. | end | ||
ins("<span class='" .. text_classes .. "'>") | |||
ins(data.text) | |||
if data.text ~= "" and data.lemmas then | |||
ins(" ") | |||
end | end | ||
if data. | if data.lemmas then | ||
if type(data.lemmas) == "string" then | |||
if type(data. | ins(wrap_in_span(data.lemmas, lemma_classes)) | ||
else | else | ||
table.insert( | local formatted_terms = {} | ||
for _, lemma in ipairs(data.lemmas) do | |||
table.insert(formatted_terms, wrap_in_span( | |||
m_links.full_link(lemma, data.lemma_face), lemma_classes | |||
)) | |||
end | |||
ins(m_table.serialCommaJoin(formatted_terms)) | |||
end | |||
end | |||
if data.enclitics and #data.enclitics > 0 then | |||
-- The outer parens need to be outside of the text_classes span so they show in upright instead of italic, or | |||
-- they will clash with upright parens generated by link annotations such as transliterations and pos=. | |||
ins("</span>") | |||
local formatted_terms = {} | |||
for _, enclitic in ipairs(data.enclitics) do | |||
-- FIXME, should we have separate clitic face and/or classes? | |||
table.insert(formatted_terms, wrap_in_span( | |||
m_links.full_link(enclitic, data.lemma_face, nil, "show qualifiers"), lemma_classes | |||
)) | |||
end | end | ||
ins(" (") | |||
ins(wrap_in_span("with enclitic" .. (#data.enclitics > 1 and "s" or "") .. " ", text_classes)) | |||
ins(m_table.serialCommaJoin(formatted_terms)) | |||
ins(")") | |||
ins("<span class='" .. text_classes .. "'>") | |||
end | end | ||
if data.base_lemmas and #data.base_lemmas > 0 then | |||
for _, base_lemma in ipairs(data.base_lemmas) do | |||
ins(", the </span>") | |||
ins(export.tagged_inflections { | |||
lang = base_lemma.lemmas[1].lang, | |||
tags = base_lemma.paramobj.tags, | |||
lemmas = base_lemma.lemmas, | |||
lemma_face = data.lemma_face, | |||
no_format_categories = true, | |||
nocat = true, | |||
text_classes = data.text_classes, | |||
}) | |||
ins("<span class='" .. text_classes .. "'>") | |||
end | |||
end | |||
-- FIXME, should posttext go before enclitics? If so we need to have separate handling for the | |||
-- final colon when there are multiple tag sets in tagged_inflections(). | |||
if data.posttext then | if data.posttext then | ||
ins(data.posttext) | |||
end | end | ||
ins("</span>") | |||
return table.concat(parts) | return table.concat(parts) | ||
end | end | ||
--[==[ | |||
return tag:find("[[", nil, true) or tag:find("|", nil, true) or | Return true if `tag` contains an internal link or HTML. | ||
]==] | |||
function export.is_link_or_html(tag) | |||
return tag:find("[[", nil, true) or tag:find("|", nil, true) or tag:find("<", nil, true) | |||
end | end | ||
-- Look up a tag (either a shortcut of any sort of a canonical long-form tag) | --[==[ | ||
Look up a tag (either a shortcut of any sort of a canonical long-form tag) and return its expansion. The expansion | |||
will be a string unless the shortcut is a list-tag shortcut such as `1s`; in that case, the expansion will be a | |||
list. The caller must handle both cases. Only one level of expansion happens; hence, `acc` expands to {"accusative"}, | |||
`1s` expands to { {"1", "s"}} (not to { {"first", "singular"}}) and `123` expands to {"1//2//3"}. The expansion will be | |||
the same as the passed-in tag in the following circumstances: | |||
# The tag is `;` (this is special-cased, and no lookup is done). | |||
# The tag is a multipart tag such as `nom//acc` (this is special-cased, and no lookup is done). | |||
# The tag contains a raw link (this is special-cased, and no lookup is done). | |||
# The tag contains HTML (this is special-cased, and no lookup is done). | |||
# The tag is already a canonical long-form tag. | |||
# The tag is unrecognized. | |||
This function first looks up in the lang-specific data module [[Module:form of/lang-data/LANGCODE]], then in | |||
[[Module:form of/data]] (which includes more common non-lang-specific tags) and finally (only if the tag is not | |||
recognized as a shortcut or canonical tag, and is not of types 1-4 above) in [[Module:form of/data2]]. | |||
If the expansion is a string and is different from the tag, track it if `do_track` is true. | |||
-- | ]==] | ||
function export.lookup_shortcut(tag, lang, do_track) | |||
function export.lookup_shortcut(tag) | |||
-- If there is HTML or a link in the tag, return it directly; don't try | -- If there is HTML or a link in the tag, return it directly; don't try | ||
-- to look it up, which will fail. | -- to look it up, which will fail. | ||
if tag == ";" or tag:find("//", nil, true) or is_link_or_html(tag) then | if tag == ";" or tag:find("//", nil, true) or export.is_link_or_html(tag) then | ||
return tag | return tag | ||
end | end | ||
local | local expansion | ||
local langcode = lang and lang:getCode() | |||
-- check for shortcuts (which will cause [[Module:form of/data2]] to be | if langcode and export.langs_with_lang_specific_tags[langcode] then | ||
local langdata = mw.loadData(export.form_of_lang_data_module_prefix .. langcode) | |||
-- If this is a canonical long-form tag, just return it, and don't check for shortcuts. This is an | |||
-- optimization; see below. | |||
if langdata.tags[tag] then | |||
return tag | |||
end | |||
expansion = langdata.shortcuts[tag] | |||
end | |||
if not expansion and lang then | |||
-- If the lang we're dealing with is an etym-only lang, try again with the corresponding full language. | |||
local full_langcode = lang:getFullCode() | |||
if full_langcode ~= langcode and export.langs_with_lang_specific_tags[full_langcode] then | |||
local langdata = mw.loadData(export.form_of_lang_data_module_prefix .. full_langcode) | |||
-- If this is a canonical long-form tag, just return it, and don't check for shortcuts. This is an | |||
-- optimization; see below. | |||
if langdata.tags[tag] then | |||
return tag | |||
end | |||
expansion = langdata.shortcuts[tag] | |||
end | |||
end | |||
if not expansion then | |||
local m_data = mw.loadData(export.form_of_data_module) | |||
-- If this is a canonical long-form tag, just return it, and don't check for shortcuts (which will cause | |||
-- [[Module:form of/data2]] to be loaded, because there won't be a shortcut entry in [[Module:form of/data]] -- | |||
-- or, for that matter, in [[Module:form of/data2]]). This is an optimization; the code will still work without | |||
-- it, but will use up more memory. | |||
if m_data.tags[tag] then | |||
return tag | |||
end | |||
expansion = m_data.shortcuts[tag] | |||
end | end | ||
if not expansion then | if not expansion then | ||
local m_data2 = mw.loadData( | local m_data2 = mw.loadData(export.form_of_data2_module) | ||
expansion = m_data2.shortcuts[tag] | expansion = m_data2.shortcuts[tag] | ||
end | end | ||
Line 203: | Line 351: | ||
return tag | return tag | ||
end | end | ||
return expansion | return expansion | ||
end | end | ||
-- Look up a normalized/canonicalized tag and return the data object | --[==[ | ||
Look up a normalized/canonicalized tag and return the data object associated with it. If the tag isn't found, return | |||
-- in [[Module:form of/data]] (which includes more common tags) and then in | nil. This first looks up in the lang-specific data module [[Module:form of/lang-data/LANGCODE]], then in | ||
[[Module:form of/data]] (which includes more common non-lang-specific tags) and then finally in | |||
function export.lookup_tag(tag) | [[Module:form of/data2]]. | ||
local m_data = mw.loadData( | ]==] | ||
function export.lookup_tag(tag, lang) | |||
local langcode = lang and lang:getCode() | |||
if langcode and export.langs_with_lang_specific_tags[langcode] then | |||
local langdata = mw.loadData(export.form_of_lang_data_module_prefix .. langcode) | |||
if langdata.tags[tag] then | |||
return langdata.tags[tag] | |||
end | |||
end | |||
local full_langcode = lang and lang:getFullCode() | |||
if full_langcode and full_langcode ~= langcode and export.langs_with_lang_specific_tags[full_langcode] then | |||
-- If the lang we're dealing with is an etym-only lang, try again with the corresponding full language. | |||
local langdata = mw.loadData(export.form_of_lang_data_module_prefix .. full_langcode) | |||
if langdata.tags[tag] then | |||
return langdata.tags[tag] | |||
end | |||
end | |||
local m_data = mw.loadData(export.form_of_data_module) | |||
local tagobj = m_data.tags[tag] | local tagobj = m_data.tags[tag] | ||
if tagobj then | if tagobj then | ||
return tagobj | return tagobj | ||
end | end | ||
local m_data2 = mw.loadData( | local m_data2 = mw.loadData(export.form_of_data2_module) | ||
local tagobj2 = m_data2.tags[tag] | local tagobj2 = m_data2.tags[tag] | ||
if tagobj2 then | if tagobj2 then | ||
Line 226: | Line 392: | ||
-- Normalize a single tag, which may be a shortcut but should not be a | -- Normalize a single tag, which may be a shortcut but should not be a multipart tag, a multipart shortcut or a list | ||
-- shortcut. | |||
local function normalize_single_tag(tag) | local function normalize_single_tag(tag, lang, do_track) | ||
local expansion = export.lookup_shortcut(tag) | local expansion = export.lookup_shortcut(tag, lang, do_track) | ||
if type(expansion) ~= "string" then | if type(expansion) ~= "string" then | ||
error("Tag '" .. tag .. "' is a list | error("Tag '" .. tag .. "' is a list shortcut, which is not allowed here") | ||
end | end | ||
tag = expansion | tag = expansion | ||
return tag | return tag | ||
end | end | ||
-- Normalize a component of a multipart tag. This should not have any // in it, | --[=[ | ||
Normalize a component of a multipart tag. This should not have any // in it, but may join multiple individual tags with | |||
a colon, and may be a single list-tag shortcut, which is treated as if colon-separated. The return value may be a list | |||
of tags. | |||
]=] | |||
local function normalize_multipart_component(tag, lang, do_track) | |||
local function normalize_multipart_component(tag, | -- If there is HTML or a link in the tag, don't try to split on colon. A colon may legitimately occur in either one, | ||
-- If there is HTML or a link in the tag, don't try to split on colon. | -- and we don't want these things parsed. Note that we don't do this check before splitting on //, which we don't | ||
-- expect to occur in links or HTML; see comment in normalize_tag(). | |||
if export.is_link_or_html(tag) then | |||
if is_link_or_html(tag) then | |||
return tag | return tag | ||
end | end | ||
Line 258: | Line 420: | ||
-- We allow list-tag shortcuts inside of multipart tags, e.g. | -- We allow list-tag shortcuts inside of multipart tags, e.g. | ||
-- '1s//3p'. Check for this now. | -- '1s//3p'. Check for this now. | ||
tag = export.lookup_shortcut(tag) | tag = export.lookup_shortcut(tag, lang, do_track) | ||
if type(tag) == "table" then | if type(tag) == "table" then | ||
-- We found a list-tag shortcut; treat as if colon-separated. | -- We found a list-tag shortcut; treat as if colon-separated. | ||
components = tag | components = tag | ||
else | else | ||
return normalize_single_tag(tag) | return normalize_single_tag(tag, lang, do_track) | ||
end | end | ||
end | end | ||
local normtags = {} | local normtags = {} | ||
for _, component in ipairs(components) do | for _, component in ipairs(components) do | ||
table.insert(normtags, normalize_single_tag(component)) | table.insert(normtags, normalize_single_tag(component, lang, do_track)) | ||
end | end | ||
return normtags | |||
end | end | ||
-- Normalize a single tag. | --[=[ | ||
Normalize a single tag. The return value may be a list (in the case of multipart tags), which will contain nested lists | |||
in the case of two-level multipart tags. | |||
]=] | |||
local function normalize_tag(tag, lang, do_track) | |||
local function normalize_tag(tag, | -- We don't check for links or HTML before splitting on //, which we don't expect to occur in links or HTML. Doing | ||
-- We don't check for links or HTML before splitting on //, which we | -- it this way allows for a tag like '{{lb|grc|Epic}}//{{lb|grc|Ionic}}' to function correctly (the template calls | ||
-- will be expanded before we process the tag, and will contain links and HTML). The only check we do is for a URL, | |||
-- which shouldn't normally occur, but might if the user tries to put an external link into the tag. URL's with // | |||
-- | -- normally have the sequence ://, which should never normally occur when // and : are used in their normal ways. | ||
-- which shouldn't normally occur, but might if the user tries to put | |||
-- sequence ://, which should never normally occur when // and : are | |||
if tag:find("://", nil, true) then | if tag:find("://", nil, true) then | ||
return tag | return tag | ||
Line 299: | Line 452: | ||
local split_tags = rsplit(tag, "//", true) | local split_tags = rsplit(tag, "//", true) | ||
if #split_tags == 1 then | if #split_tags == 1 then | ||
local retval = normalize_multipart_component(tag, | local retval = normalize_multipart_component(tag, lang, do_track) | ||
if type(retval) == "table" then | if type(retval) == "table" then | ||
-- The user gave a tag like '1:s', i.e. with colon but without | -- The user gave a tag like '1:s', i.e. with colon but without //. Allow this, but we need to return a | ||
-- nested list. | |||
-- | |||
return {retval} | return {retval} | ||
end | end | ||
Line 310: | Line 462: | ||
local normtags = {} | local normtags = {} | ||
for _, single_tag in ipairs(split_tags) do | for _, single_tag in ipairs(split_tags) do | ||
table.insert(normtags, normalize_multipart_component(single_tag, | table.insert(normtags, normalize_multipart_component(single_tag, lang, do_track)) | ||
end | end | ||
return normtags | |||
end | end | ||
-- Normalize a tag set (a list of tags) into | --[==[ | ||
Normalize a tag set (a list of tags) into its canonical-form tags. The return value is a list of normalized tag sets | |||
(a list because of there may be conjoined shortcuts among the input tags). A normalized tag set is a list of tag | |||
elements, where each element is either a string (the canonical form of a tag), a list of such strings (in the case of | |||
multipart tags) or a list of lists of such strings (in the case of two-level multipart tags). For example, the multipart | |||
tag `nom//acc//voc` will be represented in canonical form as { {"nominative", "accusative", "vocative"}}, and the | |||
two-level multipart tag `1:s//3:p` will be represented as { {{"first-person", "singular"}, {"third-person", "plural"}}}. | |||
Example 1: | |||
{normalize_tag_set({"nom//acc//voc", "n", "p"})} = { {{{"nominative", "accusative", "vocative"}, "masculine", "plural"}}} | |||
Example 2: | |||
function export. | |||
local | {normalize_tag_set({"ed-form"}, ENGLISH)} = { {{"simple", "past"}, {"past", "participle"}}} | ||
for _, tag in ipairs( | |||
-- Expand the tag, which may generate a new tag (either a | Example 3: | ||
tag = export.lookup_shortcut(tag) | {normalize_tag_set({"archaic", "ed-form"}, ENGLISH)} = { {{"archaic", "simple", "past"}, {"archaic", "past", "participle"}}} | ||
]==] | |||
function export.normalize_tag_set(tag_set, lang, do_track) | |||
local output_tag_set = {} | |||
local saw_semicolon = false | |||
for _, tag in ipairs(tag_set) do | |||
-- Expand the tag, which may generate a new tag (either a fully canonicalized tag, a multipart tag, or a list | |||
-- of tags). | |||
tag = export.lookup_shortcut(tag, lang, do_track) | |||
if type(tag) == "table" then | if type(tag) == "table" then | ||
saw_semicolon = m_table.contains(tag, ";") | |||
if saw_semicolon then | |||
-- If we saw a conjoined shortcut, we need to use a more general algorithm that can expand a single | |||
-- tag set into multiple. | |||
break | |||
end | |||
for _, t in ipairs(tag) do | for _, t in ipairs(tag) do | ||
table.insert( | table.insert(output_tag_set, normalize_tag(t, lang, do_track)) | ||
end | |||
else | |||
table.insert(output_tag_set, normalize_tag(tag, lang, do_track)) | |||
end | |||
end | |||
if not saw_semicolon then | |||
return {output_tag_set} | |||
end | |||
-- Use a more general algorithm that handles conjoined shortcuts. | |||
local output_tag_set = {} | |||
for i, tag in ipairs(tag_set) do | |||
-- Expand the tag, which may generate a new tag (either a fully canonicalized tag, a multipart tag, or a list | |||
-- of tags). | |||
tag = export.lookup_shortcut(tag, lang, do_track) | |||
if type(tag) == "table" then | |||
local output_tag_sets = {} | |||
local shortcut_tag_sets = export.split_tag_set(tag) | |||
local normalized_shortcut_tag_sets = {} | |||
for _, shortcut_tag_set in ipairs(shortcut_tag_sets) do | |||
m_table.extendList(normalized_shortcut_tag_sets, | |||
export.normalize_tag_set(shortcut_tag_set, lang, do_track)) | |||
end | end | ||
local after_tags = slice(tag_set, i + 1) | |||
local normalized_after_tags_sets = export.normalize_tag_set(after_tags, lang, do_track) | |||
for _, normalized_shortcut_tag_set in ipairs(normalized_shortcut_tag_sets) do | |||
for _, normalized_after_tags_set in ipairs(normalized_after_tags_sets) do | |||
table.insert(output_tag_sets, m_table.append(output_tag_set, normalized_shortcut_tag_set, | |||
normalized_after_tags_set)) | |||
end | |||
end | |||
return output_tag_sets | |||
else | else | ||
table.insert( | table.insert(output_tag_set, normalize_tag(tag, lang, do_track)) | ||
end | end | ||
end | end | ||
error("Internal error: Should not get here") | |||
end | end | ||
-- Split a tag set containing two-level multipart tags into one or more tag sets not containing such tags. | function export.combine_multipart_tags(tag_set) | ||
for i, tag in ipairs(tag_set) do | |||
if type(tag) == "table" then | |||
for j, subtag in ipairs(tag) do | |||
if type(subtag) == "table" then | |||
tag[j] = table.concat(subtag, ":") | |||
end | |||
end | |||
tag_set[i] = table.concat(tag, "//") | |||
end | |||
end | |||
return tag_set | |||
end | |||
function export.normalize_tags(tags, lang, recombine_multitags, do_track) | |||
local tag_sets = export.normalize_tag_set(tags, lang, do_track) | |||
if recombine_multitags then | |||
for i, tag_set in ipairs(tag_sets) do | |||
tag_sets[i] = export.combine_multipart_tags(tag_set) | |||
end | |||
return export.combine_tag_sets(tag_sets) | |||
end | |||
return tag_sets | |||
end | |||
--[==[ | |||
Split a tag set containing two-level multipart tags into one or more tag sets not containing such tags. | |||
Single-level multipart tags are left alone. (If we need to, a slight modification of the following code | |||
will also split single-level multipart tags.) This assumes that multipart tags are represented as lists | |||
and two-level multipart tags are represented as lists of lists, as is output by {normalize_tag_set()}. | |||
NOTE: We have to be careful to properly handle imbalanced two-level multipart tags such as | |||
`def:s//p` (or the reverse, `s//def:p`). | |||
]==] | |||
function export.split_two_level_multipart_tag_set(tag_set) | function export.split_two_level_multipart_tag_set(tag_set) | ||
for i, tag in ipairs(tag_set) do | for i, tag in ipairs(tag_set) do | ||
if type(tag) == "table" then | if type(tag) == "table" then | ||
Line 374: | Line 599: | ||
-- We found a two-level multipart tag. | -- We found a two-level multipart tag. | ||
-- (1) Extract the preceding tags. | -- (1) Extract the preceding tags. | ||
local pre_tags = | local pre_tags = slice(tag_set, 1, i - 1) | ||
-- (2) Extract the following tags. | -- (2) Extract the following tags. | ||
local post_tags = | local post_tags = slice(tag_set, i + 1) | ||
-- (3) Loop over each tag set alternant in the two-level multipart tag. | -- (3) Loop over each tag set alternant in the two-level multipart tag. | ||
-- For each alternant, form the tag set consisting of pre_tags + alternant + post_tags, | -- For each alternant, form the tag set consisting of pre_tags + alternant + post_tags, | ||
Line 389: | Line 608: | ||
for _, first_level_tag_set in ipairs(tag) do | for _, first_level_tag_set in ipairs(tag) do | ||
local expanded_tag_set = {} | local expanded_tag_set = {} | ||
m_table.extendList(expanded_tag_set, pre_tags) | |||
-- The second level may have a string or a list. | -- The second level may have a string or a list. | ||
if type(first_level_tag_set) == "table" then | if type(first_level_tag_set) == "table" then | ||
m_table.extendList(expanded_tag_set, first_level_tag_set) | |||
else | else | ||
table.insert(expanded_tag_set, first_level_tag_set) | table.insert(expanded_tag_set, first_level_tag_set) | ||
end | end | ||
m_table.extendList(expanded_tag_set, post_tags) | |||
m_table.extendList(resulting_tag_sets, export.split_two_level_multipart_tag_set(expanded_tag_set)) | |||
end | end | ||
return resulting_tag_sets | return resulting_tag_sets | ||
Line 416: | Line 627: | ||
-- | --[==[ | ||
function export. | Split a tag set that may consist of multiple semicolon-separated tag sets into the component tag sets. | ||
local | ]==] | ||
function export.split_tag_set(tag_set) | |||
local split_tag_sets = {} | |||
local cur_tag_set = {} | local cur_tag_set = {} | ||
for _, tag in ipairs( | for _, tag in ipairs(tag_set) do | ||
if tag == ";" then | if tag == ";" then | ||
if #cur_tag_set > 0 then | if #cur_tag_set > 0 then | ||
table.insert( | table.insert(split_tag_sets, cur_tag_set) | ||
end | end | ||
cur_tag_set = {} | cur_tag_set = {} | ||
Line 431: | Line 644: | ||
end | end | ||
if #cur_tag_set > 0 then | if #cur_tag_set > 0 then | ||
table.insert( | table.insert(split_tag_sets, cur_tag_set) | ||
end | end | ||
return | return split_tag_sets | ||
end | end | ||
export.split_tags_into_tag_sets = export.split_tag_set | |||
-- | --[==[ | ||
Combine multiple tag sets in a tag set group into a simple tag set, with logical tag sets separated by semicolons. | |||
This is the opposite of {split_tag_set()}. | |||
function export. | ]==] | ||
function export.combine_tag_sets(tag_sets) | |||
if #tag_sets == 1 then | |||
return tag_sets[1] | |||
local | end | ||
local combined_tag_set = {} | |||
for _, tag_set in ipairs(tag_sets) do | for _, tag_set in ipairs(tag_sets) do | ||
if #combined_tag_set > 0 then | |||
table.insert( | table.insert(combined_tag_set, ";") | ||
end | end | ||
m_table.extendList(combined_tag_set, tag_set) | |||
end | |||
return tags | |||
end | |||
local tag_set_param_mods = { | |||
lb = { | |||
item_dest = "labels", | |||
convert = function(arg, parse_err) | |||
return rsplit(arg, "//", true) | |||
end, | |||
} | |||
} | |||
--[==[ | |||
Parse tag set properties from a tag set (list of tags). Currently no per-tag properties are recognized, and the only | |||
per-tag-set property recognized is `<lb:...>` for specifing label(s) for the tag set. Per-tag-set properties must be | |||
attached to the last tag. | |||
]==] | |||
function export.parse_tag_set_properties(tag_set) | |||
local function generate_tag_set_obj(last_tag) | |||
tag_set[#tag_set] = last_tag | |||
return {tags = tag_set} | |||
end | |||
local last_tag = tag_set[#tag_set] | |||
-- Check for inline modifier, e.g. מרים<tr:Miryem>. But exclude HTML entry with <span ...>, <i ...>, <br/> or | |||
-- similar in it, caused by wrapping an argument in {{l|...}}, {{af|...}} or similar. Basically, all tags of | |||
-- the sort we parse here should consist of a less-than sign, plus letters, plus a colon, e.g. <lb:...>, so if | |||
-- we see a tag on the outer level that isn't in this format, we don't try to parse it. The restriction to the | |||
-- outer level is to allow generated HTML inside of e.g. qualifier tags, such as foo<q:similar to {{m|fr|bar}}>. | |||
if last_tag:find("<") and not last_tag:find("^[^<]*<[a-z]*[^a-z:]") then | |||
return require(parse_utilities_module).parse_inline_modifiers(last_tag, { | |||
param_mods = tag_set_param_mods, | |||
generate_obj = generate_tag_set_obj, | |||
}) | |||
else | |||
return generate_tag_set_obj(last_tag) | |||
end | end | ||
end | end | ||
function export.normalize_pos(pos) | function export.normalize_pos(pos) | ||
return | if not pos then | ||
return nil | |||
end | |||
return mw.loadData(export.form_of_pos_module)[pos] or pos | |||
end | end | ||
Line 462: | Line 719: | ||
-- passed in must be a string (i.e. it cannot be a list describing a | -- passed in must be a string (i.e. it cannot be a list describing a | ||
-- multipart tag). To handle multipart tags, use get_tag_display_form(). | -- multipart tag). To handle multipart tags, use get_tag_display_form(). | ||
local function get_single_tag_display_form(normtag) | local function get_single_tag_display_form(normtag, lang) | ||
local data = export.lookup_tag(normtag) | local data = export.lookup_tag(normtag, lang) | ||
local display = normtag | |||
-- If the tag has a special display form, use it | -- If the tag has a special display form, use it | ||
if data and data.display then | if data and data.display then | ||
display = data.display | |||
end | end | ||
-- If there is a nonempty glossary index, then show a link to it | -- If there is a nonempty glossary index, then show a link to it | ||
local glossary = data and data[export.GLOSSARY] | |||
if | if glossary ~= nil then | ||
if glossary == export.WIKT then | |||
elseif | display = "[[" .. normtag .. "|" .. display .. "]]" | ||
elseif glossary == export.WP then | |||
display = "[[w:" .. normtag .. "|" .. display .. "]]" | |||
elseif glossary == export.APPENDIX then | |||
display = "[[Appendix:Glossary#" .. mw.uri.anchorEncode(normtag) .. "|" .. display .. "]]" | |||
elseif type(glossary) ~= "string" then | |||
error(("Internal error: Wrong type %s for glossary value %s for tag %s"):format( | |||
type(glossary), mw.dumpObject(glossary), normtag)) | |||
else | else | ||
local link = rmatch(glossary, "^wikt:(.*)") | |||
if link then | |||
display = "[[" .. link .. "|" .. display .. "]]" | |||
end | |||
if not link then | |||
link = rmatch(glossary, "^w:(.*)") | |||
if link then | |||
display = "[[w:" .. link .. "|" .. display .. "]]" | |||
end | |||
end | |||
if not link then | |||
display = "[[Appendix:Glossary#" .. mw.uri.anchorEncode(glossary) .. "|" .. display .. "]]" | |||
end | |||
end | end | ||
end | end | ||
return | return display | ||
end | end | ||
-- Turn a canonicalized tag spec (which describes a single, possibly | --[==[ | ||
Turn a canonicalized tag spec (which describes a single, possibly multipart tag) into the displayed form. The tag spec | |||
may be a string (a canonical-form tag); a list of canonical-form tags (in the case of a simple multipart tag); or a | |||
list of mixed canonical-form tags and lists of such tags (in the case of a two-level multipart tag). `joiner` indicates | |||
how to join the parts of a multipart tag, and can be either {"and"} ("foo and bar", or "foo, bar and baz" for 3 or | |||
more), {"slash"} ("foo/bar"), {"en-dash"} ("foo–bar") or {nil}, which uses the global default found in | |||
{multipart_join_strategy()} in [[Module:form of/functions]]. (NOTE: The global default is {"slash"} and this seems | |||
unlikely to change.) | |||
]==] | |||
function export.get_tag_display_form(tagspec, lang, joiner) | |||
function export.get_tag_display_form(tagspec, joiner) | |||
if type(tagspec) == "string" then | if type(tagspec) == "string" then | ||
return get_single_tag_display_form(tagspec) | return get_single_tag_display_form(tagspec, lang) | ||
end | end | ||
-- We have a multipart tag. See if there's a display handler to | -- We have a multipart tag. See if there's a display handler to display them specially. | ||
for _, handler in ipairs(require(export.form_of_functions_module).display_handlers) do | |||
for _, handler in ipairs( | |||
local displayval = handler(tagspec, joiner) | local displayval = handler(tagspec, joiner) | ||
if displayval then | if displayval then | ||
Line 510: | Line 784: | ||
for _, first_level_tag in ipairs(tagspec) do | for _, first_level_tag in ipairs(tagspec) do | ||
if type(first_level_tag) == "string" then | if type(first_level_tag) == "string" then | ||
table.insert(displayed_tags, get_single_tag_display_form(first_level_tag)) | table.insert(displayed_tags, get_single_tag_display_form(first_level_tag, lang)) | ||
else | else | ||
-- A first-level element of a two-level multipart tag. | -- A first-level element of a two-level multipart tag. Currently we just separate the individual components | ||
-- with spaces, but other ways are possible, e.g. using an underscore, colon, parens or braces. | |||
-- with spaces, but other ways are possible, e.g. using | |||
local components = {} | local components = {} | ||
for _, component in ipairs(first_level_tag) do | for _, component in ipairs(first_level_tag) do | ||
table.insert(components, get_single_tag_display_form(component)) | table.insert(components, get_single_tag_display_form(component, lang)) | ||
end | end | ||
table.insert(displayed_tags, table.concat(components, " ")) | table.insert(displayed_tags, table.concat(components, " ")) | ||
end | end | ||
end | end | ||
return | return require(export.form_of_functions_module).join_multiparts(displayed_tags, joiner) | ||
end | end | ||
-- | --[==[ | ||
-- | Given a normalized tag set (i.e. as output by {normalize_tag_set()}; all tags are in canonical form, multipart tags are | ||
local | represented as lists, and two-level multipart tags as lists of lists), convert to displayed form (a string). See | ||
{get_tag_display_form()} for the meaning of `joiner`. | |||
]==] | |||
function export.get_tag_set_display_form(normalized_tag_set, lang, joiner) | |||
local parts = {} | |||
for _, tagspec in ipairs(normalized_tag_set) do | |||
local to_insert = export.get_tag_display_form(tagspec, lang, joiner) | |||
-- Maybe insert a space before inserting the display form of the tag. We insert a space if | |||
-- (a) we're not the first tag; and | |||
-- (b) the tag we're about to insert doesn't have the "no_space_on_left" property; and | |||
-- (c) the preceding tag doesn't have the "no_space_on_right" property. | |||
-- NOTE: We depend here on the fact that | |||
-- (1) all tags with either of the above properties set have the same display form as canonical form, and | |||
-- (2) all tags with either of the above properties set are single-character tags. | |||
-- The second property is an optimization to avoid looking up display forms resulting from multipart tags, | |||
-- which won't be found and which will trigger loading of [[Module:form of/data2]]. If multichar punctuation is | |||
-- added in the future, it's ok to change the == 1 below to <= 2 or <= 3. | |||
-- | |||
-- If the first property above fails to hold in the future, we need to track the canonical form of each tag | |||
-- (including the previous one) as well as the display form. This would also avoid the need for the == 1 check. | |||
if #parts > 0 then | |||
local most_recent_tagobj = ulen(parts[#parts]) == 1 and export.lookup_tag(parts[#parts], lang) | |||
local to_insert_tagobj = ulen(to_insert) == 1 and export.lookup_tag(to_insert, lang) | |||
if ( | |||
(not most_recent_tagobj or not most_recent_tagobj.no_space_on_right) and | |||
(not to_insert_tagobj or not to_insert_tagobj.no_space_on_left) | |||
) then | |||
table.insert(parts, " ") | |||
end | |||
end | end | ||
table.insert(parts, to_insert) | |||
end | end | ||
return | |||
return table.concat(parts) | |||
end | end | ||
-- | --[==[ | ||
Given a normalized tag set (i.e. as output by {normalize_tag_set()}; all tags are in canonical form, multipart tags are | |||
represented as lists, and two-level multipart tags as lists of lists), fetch the associated categories and labels. | |||
Return two values, a list of categories and a list of labels. `lang` is the language of term represented by the tag set, | |||
function export. | and `POS` is the user-provided part of speech (which may be {nil}). | ||
local m_cats = mw.loadData( | ]==] | ||
function export.fetch_categories_and_labels(normalized_tag_set, lang, POS, pagename, lemmas) | |||
local m_cats = mw.loadData(export.form_of_cats_module) | |||
local categories = {} | local categories = {} | ||
local labels = {} | |||
POS = export.normalize_pos(POS) | POS = export.normalize_pos(POS) | ||
-- First split any two-level multipart tags into multiple sets, to make our life easier. | |||
-- | for _, tag_set in ipairs(export.split_two_level_multipart_tag_set(normalized_tag_set)) do | ||
for _, tag_set in ipairs( | -- Call a named function, either from the lang-specific data in | ||
local function | -- [[Module:form of/lang-specific/LANGCODE/functions]] or in [[Module:form of/functions]]. | ||
local function call_named_function(name, funtype) | |||
local data = { | |||
pagename = pagename or mw.title.getCurrentTitle().subpageText, | |||
lemmas = lemmas, | |||
tag_set = normalized_tag_set, | |||
lang = lang, | |||
POS = POS | |||
} | } | ||
local modules_tried = {} | |||
local function try_lang_specific_module(langcode) | |||
if export.langs_with_lang_specific_tags[langcode] then | |||
local lang_specific_module = export.form_of_lang_data_module_prefix .. langcode .. "/functions" | |||
local langdata = require(utilities_module).safe_require(lang_specific_module) | |||
if langdata then | |||
table.insert(modules_tried, lang_specific_module) | |||
if langdata.cat_functions then | |||
local fn = langdata.cat_functions[name] | |||
if fn then | |||
return fn(data), true | |||
end | |||
end | |||
end | |||
end | |||
return nil, false | |||
end | |||
-- First try lang-specific. | |||
local langcode = lang and lang:getCode() | |||
if langcode then | |||
local retval, found_it = try_lang_specific_module(langcode) | |||
if found_it then | |||
return retval | |||
end | |||
end | |||
-- If the lang we're dealing with is an etym-only lang, try again with the corresponding full language. | |||
local full_langcode = lang and lang:getFullCode() | |||
if full_langcode and full_langcode ~= langcode then | |||
local retval, found_it = try_lang_specific_module(full_langcode) | |||
if found_it then | |||
return retval | |||
end | |||
end | |||
-- Try lang-independent. | |||
table.insert(modules_tried, export.form_of_functions_module) | |||
local fn = require(export.form_of_functions_module).cat_functions[name] | |||
if fn then | |||
return fn(data) | |||
end | |||
for i, modname in ipairs(modules_tried) do | |||
modules_tried[i] = "[[" .. modname .. "]]" | |||
end | |||
error(("No %s function named '%s' in %s"):format(funtype, name, lang_specific_part, | |||
m_table.serialCommaJoin(modules_tried, {conj = "or", dontTag = true}))) | |||
end | end | ||
Line 599: | Line 943: | ||
-- complex when multipart tags are present. | -- complex when multipart tags are present. | ||
local function tag_set_matches_spec_tag(spec_tag) | local function tag_set_matches_spec_tag(spec_tag) | ||
spec_tag = normalize_tag(spec_tag) | spec_tag = normalize_tag(spec_tag, lang) | ||
for _, tag_set_tag in ipairs(tag_set) do | for _, tag_set_tag in ipairs(tag_set) do | ||
if tag_set_tag_matches_spec_tag(tag_set_tag, spec_tag) then | if tag_set_tag_matches_spec_tag(tag_set_tag, spec_tag) then | ||
Line 635: | Line 979: | ||
return false, 3 | return false, 3 | ||
elseif predicate == "tags=" then | elseif predicate == "tags=" then | ||
local | local normalized_spec_tag_sets = export.normalize_tag_set(spec[2], lang) | ||
-- Allow tags to be in different orders, and multipart tags to | if #normalized_spec_tag_sets > 1 then | ||
error("Internal error: No support for conjoined shortcuts in category/label specs in " | |||
.. "[[Module:form of/cats]] when processing spec tag set " .. table.concat(spec[2], "|")) | |||
-- | end | ||
local normalized_spec_tag_set = normalized_spec_tag_sets[1] | |||
-- tag set tags. | -- Check for and disallow two-level multipart tags in the specs. FIXME: Remove this when we remove | ||
if #tag_set ~= # | -- support for two-level multipart tags. | ||
for _, tag in ipairs(normalized_spec_tag_set) do | |||
if type(tag) == "table" then | |||
for _, subtag in ipairs(tag) do | |||
if type(subtag) == "table" then | |||
error("Internal error: No support for two-level multipart tags in category/label specs" | |||
.. "[[Module:form of/cats]] when processing spec tag set " | |||
.. table.concat(spec[2], "|")) | |||
end | |||
end | |||
end | |||
end | |||
-- Allow tags to be in different orders, and multipart tags to be in different orders. To handle this, | |||
-- we first check that both tag set tags and spec tags have the same length. If so, we sort the | |||
-- multipart tags in the tag set tags and spec tags, and then check that all tags in the spec tags are | |||
-- in the tag set tags. | |||
if #tag_set ~= #normalized_spec_tag_set then | |||
return false, 3 | return false, 3 | ||
end | end | ||
Line 650: | Line 1,010: | ||
table.sort(tag_set_tags[i]) | table.sort(tag_set_tags[i]) | ||
end | end | ||
if type( | if type(normalized_spec_tag_set[i]) == "table" then | ||
table.sort( | table.sort(normalized_spec_tag_set[i]) | ||
end | end | ||
end | end | ||
for i=1,#tag_set_tags do | for i=1,#tag_set_tags do | ||
if not m_table.contains(tag_set_tags, | if not m_table.contains(tag_set_tags, normalized_spec_tag_set[i]) then | ||
return false, 3 | return false, 3 | ||
end | end | ||
Line 686: | Line 1,046: | ||
end | end | ||
return condval, 4 | return condval, 4 | ||
elseif | elseif predicate == "call" then | ||
return fn(call_named_function(spec[2], "condition")), 3 | |||
else | else | ||
error("Unrecognized predicate: " .. predicate) | error("Unrecognized predicate: " .. predicate) | ||
Line 706: | Line 1,062: | ||
return false | return false | ||
elseif type(spec) == "string" then | elseif type(spec) == "string" then | ||
-- Substitute POS request with user-specified part of speech | -- A category. Substitute POS request with user-specified part of speech or default. | ||
spec = rsub(spec, "<<p=(.-)>>", function(default) | spec = rsub(spec, "<<p=(.-)>>", function(default) | ||
return POS or export.normalize_pos(default) | return POS or export.normalize_pos(default) | ||
end) | end) | ||
table.insert(categories, lang: | table.insert(categories, lang:getFullName() .. " " .. spec) | ||
return true | |||
elseif type(spec) == "table" and spec.labels then | |||
-- A label spec. | |||
for _, label in ipairs(spec.labels) do | |||
m_table.insertIfNot(labels, label) | |||
end | |||
return true | return true | ||
elseif type(spec) ~= "table" then | elseif type(spec) ~= "table" then | ||
Line 734: | Line 1,095: | ||
return false | return false | ||
elseif predicate == "call" then | elseif predicate == "call" then | ||
return process_spec(call_named_function(spec[2], "spec")) | |||
else | else | ||
local condval, ifspec = check_condition(spec) | local condval, ifspec = check_condition(spec) | ||
Line 752: | Line 1,109: | ||
end | end | ||
local | local langcode = lang:getCode() | ||
local langspecs = m_cats[langcode] | |||
if langspecs then | if langspecs then | ||
for _, spec in ipairs(langspecs) do | for _, spec in ipairs(langspecs) do | ||
Line 758: | Line 1,116: | ||
end | end | ||
end | end | ||
local full_code = lang:getFullCode() | |||
if full_code ~= langcode then | |||
local langspecs = m_cats[full_code] | |||
if langspecs then | |||
for _, spec in ipairs(langspecs) do | |||
process_spec(spec) | |||
end | |||
end | |||
end | |||
if full_code ~= "und" then | |||
local langspecs = m_cats["und"] | local langspecs = m_cats["und"] | ||
if langspecs then | if langspecs then | ||
Line 768: | Line 1,135: | ||
end | end | ||
return categories | return categories, labels | ||
end | end | ||
--[==[ | |||
if not data.tags then | Implementation of templates that display inflection tags, such as the general {{tl|inflection of}}, semi-specific | ||
error("First argument must | variants such as {{tl|participle of}}, and specific variants such as {{tl|past participle of}}. `data` contains all the | ||
information controlling the display, with the following fields: | |||
* `.lang`: ('''''required''''') Language to use when looking up language-specific inflection tags, categories and | |||
labels, and for displaying categories and labels. | |||
* `.tags`: ('''''required''' unless `.tag_sets` is given'') List of non-canonicalized inflection tags. Multiple tag sets | |||
can be indicated by a {";"} as one of the tags, and tag-set properties may be attached to the last tag of a tag set. | |||
The tags themselves may come directly from the user (as in {{tl|inflection of}}); come partly from the user (as in | |||
{{tl|participle of}}, which adds the tag `part` to user-specified inflection tags); or be entirely specified by the | |||
template (as in {{tl|past participle of}}). | |||
* `.tag_sets`: ('''''required''' unless `.tags` is given'') List of non-canonicalized tag sets and associated | |||
per-tag-set properties. Each element of the list is an object of the form | |||
{ {tags = {"TAG", "TAG", ...}, labels = {"LABEL", "LABEL", ...}}. If `.tag_sets` is specified, `.tags` should not be | |||
given and vice-versa. Specifying `.tag_sets` in place of tags allowed per-tag set labels to be specified; otherwise, | |||
there is no advantage. [[Module:pt-gl-inflections]] uses this functionality to supply labels like {"Brazil"} and | |||
{"Portugal"} associated with specific tag sets. | |||
* `.lemmas`: ('''''recommended''''') List of objects describing the lemma(s) of which the term in question is a | |||
non-lemma form. These are passed directly to {full_link()} in [[Module:links]]. Each object should have at minimum a | |||
`.lang` field containing the language of the lemma and a `.term` field containing the lemma itself. Each object is | |||
formatted using {full_link()} and then if there are more than one, they are joined using {serialCommaJoin()} in | |||
[[Module:table]]. Alternatively, `.lemmas` can be a string, which is displayed directly. If omitted entirely, no lemma | |||
links are shown and the connecting "of" is also omitted. | |||
* `.lemma_face`: ('''''recommended''''') "Face" to use when displaying the lemma objects. Usually should be set to | |||
{"term"}. | |||
* `.POS`: ('''''recommended''''') Categorizing part-of-speech tag. Comes from the {{para|p}} or {{para|POS}} argument of | |||
{{tl|inflection of}}. | |||
* `.pagename`: Page name of "current" page or nil to use the actual page title; for testing purposes. | |||
* `.enclitics`: List of enclitics to display after the lemmas, in parens. | |||
* `.no_format_categories`: If true, don't format the categories derived from the inflection tags; just return them. | |||
* `.sort`: Sort key for formatted categories. Ignored when `.no_format_categories` = {true}. | |||
* `.nocat`: Suppress computation of categories (even if `.no_format_categories` is not given). | |||
* `.notext`: Disable display of all tag text and `inflection of` text. (FIXME: Maybe not implemented correctly.) | |||
* `.capfirst`: Capitalize the first word displayed. | |||
* `.pretext`: Additional text to display before the inflection tags, but after any top-level labels. | |||
* `.posttext`: Additional text to display after the lemma links. | |||
* `.text_classes`: CSS classes used to wrap the tag text and lemma links. Default is | |||
{"form-of-definition use-with-mention"} for the tag text, {"form-of-definition-link"} for the lemma links. (FIXME: | |||
Should separate out the lemma links into their own field.) | |||
`.joiner`: Override the joiner (normally a slash) used to join multipart tags. You should normally not specify this. | |||
A typical call might look like this (for {{m+|es|amo}}): { | |||
local lang = require("Module:languages").getByCode("es") | |||
local lemma_obj = { | |||
lang = lang, | |||
term = "amar", | |||
} | |||
return m_form_of.tagged_inflections({ | |||
lang = lang, tags = {"1", "s", "pres", "ind"}, lemmas = {lemma_obj}, lemma_face = "term", POS = "verb" | |||
}) | |||
} | |||
Normally, one value is returned, the formatted text, which has appended to it the formatted categories derived from the | |||
tag-set-related categories generated by the specs in [Module:form of/cats]]. To suppress this, set | |||
`data.no_format_categories` = {true}, in which case two values are returned, the formatted text without any formatted | |||
categories appended and a list of the categories to be formatted. | |||
NOTE: There are two sets of categories that may be generated: (1) categories derived directly from the tag sets, as | |||
specified in [[Module:form of/cats]]; (2) categories derived from tag-set labels, either (a) set explicitly by the | |||
caller in `data.tag_sets`, (b) specified by the user using `<lb:...>` attached to the last tag in a tag set, or | |||
(c) specified in [[Module:form of/cats]]. The second type (label-related categories) are currently not returned in | |||
the second return value of {tagged_inflections()}, and are currently inserted into the output text even if | |||
`data.no_format_categories` is set to {true}; but they can be suppressed by setting `data.nocat` = {true} (which also | |||
suppresses the first type of categories, those derived directly from tag sets, even if `data.no_format_categories` is | |||
set to {true}). | |||
]==] | |||
function export.tagged_inflections(data) | |||
if not data.tags and not data.tag_sets then | |||
error("First argument must be a table of arguments, and `.tags` or `.tag_sets` must be specified") | |||
end | |||
if data.tags and data.tag_sets then | |||
error("Both `.tags` and `.tag_sets` cannot be specified") | |||
end | end | ||
local | local tag_sets = data.tag_sets | ||
if not tag_sets then | |||
tag_sets = export.split_tag_set(data.tags) | |||
for i, tag_set in ipairs(tag_sets) do | |||
tag_sets[i] = export.parse_tag_set_properties(tag_set) | |||
end | |||
end | |||
local inflections = {} | local inflections = {} | ||
local categories = {} | |||
for _, tag_set in ipairs(tag_sets) do | |||
local normalized_tag_sets = export.normalize_tag_set(tag_set.tags, data.lang, "do-track") | |||
for _, normalized_tag_set in ipairs(normalized_tag_sets) do | |||
local cur_infl = {} | |||
local this_categories, this_labels = export.fetch_categories_and_labels(normalized_tag_set, data.lang, | |||
data.POS, data.pagename, type(data.lemmas) == "table" and data.lemmas or nil) | |||
if | if not data.nocat then | ||
m_table.extendList(categories, this_categories) | |||
end | end | ||
local cur_infl = export.get_tag_set_display_form(normalized_tag_set, data.lang, data.joiner) | |||
cur_infl | |||
if #cur_infl > 0 then | if #cur_infl > 0 then | ||
if tag_set.labels then | |||
this_labels = m_table.append(tag_set.labels, this_labels) | |||
if | |||
end | end | ||
table.insert(inflections, {infl_text = cur_infl, labels = this_labels}) | |||
end | end | ||
end | end | ||
end | end | ||
if | local overall_labels, need_per_tag_set_labels | ||
for _, inflection in ipairs(inflections) do | |||
if overall_labels == nil then | |||
overall_labels = inflection.labels | |||
elseif not m_table.deepEquals(overall_labels, inflection.labels) then | |||
need_per_tag_set_labels = true | |||
overall_labels = nil | |||
break | |||
end | |||
end | end | ||
local format_data = require( | if not need_per_tag_set_labels then | ||
for _, inflection in ipairs(inflections) do | |||
inflection.labels = nil | |||
end | |||
end | |||
local format_data = m_table.shallowcopy(data) | |||
local function format_labels(labels, notext) | |||
if labels and #labels > 0 then | |||
return require(labels_module).show_labels { labels = labels, lang = data.lang, sort = data.sort, nocat = data.nocat } .. | |||
(notext and (data.pretext or "") == "" and "" or " ") | |||
else | |||
return "" | |||
end | |||
end | |||
local of_text = data.lemmas and " of" or "" | |||
local formatted_text | |||
if #inflections == 1 then | if #inflections == 1 then | ||
format_data.text = | if need_per_tag_set_labels then | ||
error("Internal error: need_per_tag_set_labels should not be set with one inflection") | |||
end | |||
format_data.text = format_labels(overall_labels, data.notext) .. (data.pretext or "") .. (data.notext and "" or | |||
((data.capfirst and require("Module:string utilities").ucfirst(inflections[1].infl_text) or inflections[1].infl_text) .. of_text)) | |||
formatted_text = export.format_form_of(format_data) | |||
else | else | ||
format_data.text = data.notext and "" or ((data.capfirst and "Inflection" or "inflection") .. | format_data.text = format_labels(overall_labels, data.notext) .. (data.pretext or "") .. (data.notext and "" or | ||
((data.capfirst and "Inflection" or "inflection") .. of_text)) | |||
format_data.posttext = (data.posttext or "") .. ":" | format_data.posttext = (data.posttext or "") .. ":" | ||
local link = export.format_form_of(format_data) | local link = export.format_form_of(format_data) | ||
local text_classes = data.text_classes or "" | local text_classes = data.text_classes or "form-of-definition use-with-mention" | ||
for i, inflection in ipairs(inflections) do | |||
inflections[i] = "\n## " .. format_labels(inflection.labels, false) .. | |||
"<span class='" .. text_classes .. "'>" .. inflection.infl_text .. "</span>" | |||
end | |||
formatted_text = link .. table.concat(inflections) | |||
end | end | ||
if not data.no_format_categories then | |||
if #categories > 0 then | |||
formatted_text = formatted_text .. require("Module:utilities").format_categories(categories, data.lang, | |||
data.sort, nil, export.force_cat) | |||
end | |||
return formatted_text | |||
end | |||
return formatted_text, categories | |||
end | end | ||
--[==[ | |||
Given a tag set, return a flattened list all Wikidata ID's of all tags in the tag set. FIXME: Only used in a debugging | |||
function in [[Module:se-verbs]]; move there. | |||
]==] | |||
function export.to_Wikidata_IDs(tag_set, lang, skip_tags_without_ids) | |||
local ret = {} | local ret = {} | ||
local function get_wikidata_id(tag) | local function get_wikidata_id(tag) | ||
local data = export.lookup_tag(tag, lang) | |||
local data = export.lookup_tag(tag) | |||
if not data or not data. | if not data or not data[export.WIKIDATA] then | ||
if not skip_tags_without_ids then | if not skip_tags_without_ids then | ||
error( | error('The tag "' .. tag .. '" does not have a Wikidata ID defined in the form-of data modules') | ||
else | else | ||
return nil | return nil | ||
end | end | ||
else | else | ||
return data. | return ("Q%s"):format(data[export.WIKIDATA]) | ||
end | end | ||
end | end | ||
for | local normalized_tag_sets = export.normalize_tag_set(tag_set, lang) | ||
for _, tag_set in ipairs(normalized_tag_sets) do | |||
for _, tag in ipairs(tag_set) do | |||
if type(tag) == "table" then | |||
table.insert( | for _, subtag in ipairs(tag) do | ||
if type(subtag) == "table" then | |||
-- two-level multipart tag; FIXME: delete support for this | |||
for _, subsubtag in ipairs(subtag) do | |||
table.insert(ret, get_wikidata_id(subsubtag)) | |||
end | |||
else | |||
table.insert(ret, get_wikidata_id(subtag)) | |||
end | |||
end | |||
else | |||
table.insert(ret, get_wikidata_id(tag)) | |||
end | end | ||
end | end | ||
end | end | ||
Line 897: | Line 1,353: | ||
function export.dump_form_of_data(frame) | function export.dump_form_of_data(frame) | ||
local data = { | local data = { | ||
data = require( | data = require(export.form_of_data_module), | ||
data2 = require( | data2 = require(export.form_of_data2_module) | ||
} | } | ||
return require("Module:JSON").toJSON(data) | return require("Module:JSON").toJSON(data) | ||
end | |||
function export.finalize_tag_data(tags, shortcuts) | |||
local function process_shortcut(name, shortcut) | |||
-- If the shortcut is already in the list, then there is a duplicate. | |||
if shortcuts[shortcut] then | |||
error("The shortcut \"" .. shortcut .. "\" (for the inflection tag \"" .. name .. "\") conflicts with an existing shortcut for the tag \"" .. shortcuts[shortcut] .. "\".") | |||
elseif tags[shortcut] then | |||
error("The shortcut \"" .. shortcut .. "\" (for the inflection tag \"" .. name .. "\") conflicts with an existing tag with that name.") | |||
end | |||
shortcuts[shortcut] = name | |||
end | |||
for name, data in pairs(tags) do | |||
local data_shortcuts = data[export.SHORTCUTS] | |||
if data_shortcuts then | |||
if type(data_shortcuts) == "string" then | |||
process_shortcut(name, data_shortcuts) | |||
else | |||
for _, shortcut in ipairs(data_shortcuts) do | |||
process_shortcut(name, shortcut) | |||
end | |||
end | |||
end | |||
end | |||
end | end | ||
return export | return export |
Latest revision as of 18:12, 12 August 2024
- The following documentation is located at Module:form of/doc.[edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
local export = {}
export.force_cat = false -- for testing; set to true to display categories even on non-mainspace pages
local m_links = require("Module:links")
local m_string_utils = require("Module:string utilities")
local m_table = require("Module:table")
local parse_utilities_module = "Module:parse utilities"
local labels_module = "Module:labels"
local utilities_module = "Module:utilities"
export.form_of_pos_module = "Module:form of/pos"
export.form_of_functions_module = "Module:form of/functions"
export.form_of_cats_module = "Module:form of/cats"
export.form_of_lang_data_module_prefix = "Module:form of/lang-data/"
export.form_of_data_module = "Module:form of/data"
export.form_of_data2_module = "Module:form of/data2"
local ulen = m_string_utils.len
local rsubn = m_string_utils.gsub
local rmatch = m_string_utils.match
local rsplit = m_string_utils.split
export.TAG_TYPE = 1
export.GLOSSARY = 2
export.SHORTCUTS = 3
export.WIKIDATA = 4
export.APPENDIX = true
export.WP = false
export.WIKT = 0
--[==[
Set listing the languages with lang-specific tags. If a language isn't listed here, the tags for that language won't be
recognized.
]==]
export.langs_with_lang_specific_tags = {
["en"] = true,
["got"] = true,
["lt"] = true,
["lv"] = true,
["nl"] = true,
["pi"] = true,
["sw"] = true,
["ttj"] = true,
}
--[==[ intro:
This module implements the underlying processing of {{tl|form of}}, {{tl|inflection of}} and specific variants such as
{{tl|past participle of}} and {{tl|alternative spelling of}}. Most of the logic in this file is to handle tags in
{{tl|inflection of}}. Other related files:
* [[Module:form of/templates]] contains the majority of the logic that implements the templates themselves.
* [[Module:form of/data]] is a data-only file containing information on the more common inflection tags, listing the
tags, their shortcuts, the category they belong to (tense-aspect, case, gender, voice-valence, etc.), the appropriate
glossary link and the wikidata ID.
* [[Module:form of/data2]] is a data-only file containing information on the less common inflection tags, in the same
format as [[Module:form of/data]].
* [[Module:form of/lang-data/LANGCODE]] is a data-only file containing information on the language-specific inflection
tags for the language with code LANGCODE, in the same format as [[Module:form of/data]]. Language-specific tags
override general tags.
* [[Module:form of/cats]] is a data-only file listing the language-specific categories that are added when the
appropriate combinations of tags are seen for a given language.
* [[Module:form of/pos]] is a data-only file listing the recognized parts of speech and their abbreviations, used for
categorization. FIXME: This should be unified with the parts of speech listed in [[Module:links]].
* [[Module:form of/functions]] contains functions for use with [[Module:form of/data]] and [[Module:form of/cats]].
They are contained in this module because data-only modules can't contain code. The functions in this file are of two
types:
*# Display handlers allow for customization of the display of multipart tags (see below). Currently there is only
one such handler, for handling multipart person tags such as `1//2//3`.
*# Cat functions allow for more complex categorization logic, and are referred to by name in [[Module:form of/cats]].
Currently no such functions exist.
The following terminology is used in conjunction with {{tl|inflection of}}:
* A ''tag'' is a single grammatical item, as specified in a single numbered parameter of {{tl|inflection of}}. Examples
are `masculine`, `nominative`, or `first-person`. Tags may be abbreviated, e.g. `m` for `masculine`, `nom` for
`nominative`, or `1` for `first-person`. Such abbreviations are called ''aliases'', and some tags have multiple
equivalent aliases (e.g. `p` or `pl` for `plural`). The full, non-abbreviated form of a tag is called its
''canonical form''.
* The ''display form'' of a tag is the way it's displayed to the user. Usually the displayed text of the tag is the same
as its canonical form, and it normally functions as a link to a glossary entry explaining the tag. Usually the link is
to an entry in [[Appendix:Glossary]], but sometimes the tag is linked to an individual dictionary entry or to a
Wikipedia entry. Occasionally, the display text differs from the canonical form of the tag. An example is the tag
`comparative case`, which has the display text read as simply `comparative`. Normally, tags referring to cases don't
have the word "case" in them, but in this case the tag `comparative` was already used as an alias for the tag
`comparative degree`, so the tag was named `comparative case` to avoid clashing. A similar situation occurs with
`adverbial case` vs. the grammar tag `adverbial` (as in `adverbial participle`).
* A ''tag set'' is an ordered list of tags, which together express a single inflection, for example, `1|s|pres|ind`,
which can be expanded to canonical-form tags as `first-person|singular|present|indicative`.
* A ''conjoined tag set'' is a tag set that consists of multiple individual tag sets separated by a semicolon, e.g.
`1|s|pres|ind|;|2|s|imp`, which specifies two tag sets, `1|s|pres|ind` as above and `2|s|imp` (in canonical form,
`second-person|singular|imperative`). Multiple tag sets specified in a single call to {{tl|inflection of}} are
specified in this fashion. Conjoined tag sets can also occur in list-tag shortcuts.
* A ''multipart tag'' is a tag that embeds multiple tags within it, such as `f//n` or `nom//acc//voc`. These are used in
the case of [[syncretism]], when the same form applies to multiple inflections. Examples are the Spanish present
subjunctive, where the first-person and third-person singular have the same form (e.g. {{m|es|siga}} from
{{m|es|seguir|t=to follow}}), or Latin third-declension adjectives, where the dative and ablative plural of all
genders have the same form (e.g. {{m|la|omnibus}} from {{m|la|omnis|t=all}}). These would be expressed respectively as
`1//3|s|pres|sub` and `dat//abl|m//f//n|p`, where the use of the multipart tag compactly encodes the syncretism and
avoids the need to individually list out all of the inflections. Multipart tags currently display as a list separated
by a slash, e.g. ''dative/ablative'' or ''masculine/feminine/neuter'' where each individual word is linked
appropriately. As a special case, multipart tags involving persons display specially; for example, the multipart tag
`1//2//3` displays as ''first-, second- and third-person'', with the word "person" occurring only once.
* A ''two-level multipart tag'' is a special type of multipart tag that joins two or more tag sets instead of joining
individual tags. The tags within the tag set are joined by a colon, e.g. `1:s//3:p`, which is displayed as
''first-person singular and third-person plural'', e.g. for use with the form {{m|grc|μέλλον}} of the verb
{{m|grc|μέλλω|t=to intend}}, which uses the tag set `1:s//3:p|impf|actv|indc|unaugmented` to express the syncretism
between the first singular and third plural forms of the imperfect active indicative unaugmented conjugation.
Two-level multipart tags should be used sparingly; if in doubt, list out the inflections separately. [FIXME: Make
two-level multipart tags obsolete.]
* A ''shortcut'' is a tag that expands to any type of tag described above, or to any type of tag set described above.
Aliases are a particular type of shortcut whose expansion is a single non-multipart tag.
* A ''multipart shortcut'' is a shortcut that expands into a multipart tag, for example `123`, which expands to the
multipart tag `1//2//3`. Only the most common such combinations exist as shortcuts.
* A ''list shortcut'' is a special type of shortcut that expands to a list of tags instead of a single tag. For example,
the shortcut `1s` expands to `1|s` (first-person singular). Only the most common such combinations exist as shortcuts.
* A ''conjoined shortcut'' is a special type of list shortcut that consists of a conjoined tag set (multiple logical tag
sets). For example, the English language-specific shortcut `ed-form` expands to `spast|;|past|part`, expressing the
common syncretism between simple past and past participle in English (and in this case, `spast` is itself a list
shortcut that expands to `simple|past`).
]==]
-- version of rsubn() that discards all but the first return value
local function rsub(term, foo, bar)
local retval = rsubn(term, foo, bar)
return retval
end
local function normalize_index(list, index)
if index < 0 then
return #list + index + 1
end
return index
end
-- FIXME, consider moving to [[Module:table]]
-- Return true if the list `tags1`, treated as a set, is a subset of the list `tags2`, also treated as a set.
local function is_subset(tags1, tags2)
tags1 = m_table.listToSet(tags1)
tags2 = m_table.listToSet(tags2)
for tag, _ in pairs(tags1) do
if not tags2[tag] then
return false
end
end
return true
end
-- FIXME, move to [[Module:table]]
local function slice(list, i, j)
--checkType("slice", 1, list, "table")
--checkType("slice", 2, i, "number", true)
--checkType("slice", 3, j, "number", true)
if i == nil then
i = 1
else
i = normalize_index(list, i)
end
j = normalize_index(list, j or -1)
local retval = {}
local k = 0
for index = i, j do
k = k + 1
retval[k] = list[index]
end
return retval
end
local function wrap_in_span(text, classes)
return ("<span class='%s'>%s</span>"):format(classes, text)
end
--[==[
Lowest-level implementation of form-of templates, including the general {{tl|form of}} as well as those that deal with
inflection tags, such as the general {{tl|inflection of}}, semi-specific variants such as {{tl|participle of}}, and
specific variants such as {{tl|past participle of}}. `data` contains all the information controlling the display, with
the following fields:
* `.text`: Text to insert before the lemmas. Wrapped in the value of `.text_classes`, or its default; see below.
* `.lemmas`: List of objects describing the lemma(s) of which the term in question is a non-lemma form. These are passed
directly to {full_link()} in [[Module:links]]. Each object should have at minimum a `.lang` field containing the
language of the lemma and a `.term` field containing the lemma itself. Each object is formatted using {full_link()}
and then if there are more than one, they are joined using {serialCommaJoin()} in [[Module:table]]. Alternatively,
`.lemmas` can be a string, which is displayed directly, or omitted, to show no lemma links and omit the connecting
text.
* `.lemma_face`: "Face" to use when displaying the lemma objects. Usually should be set to {"term"}.
* `.enclitics`: List of enclitics to display after the lemmas, in parens.
* `.base_lemmas`: List of base lemmas to display after the lemmas, in the case where the lemmas in `.lemmas` are
themselves forms of another lemma (the base lemma), e.g. a comparative, superlative or participle. Each object is of
the form { { paramobj = PARAM_OBJ, lemmas = {LEMMA_OBJ, LEMMA_OBJ, ...} }} where PARAM_OBJ describes the properties
of the base lemma parameter (i.e. the relationship between the intermediate and base lemmas) and LEMMA_OBJ is an
object suitable to be passed to {full_link()} in [[Module:links]]. PARAM_OBJ is of the format
{ { param = "PARAM", tags = {"TAG", "TAG", ...} } where PARAM is the name of the parameter to {{tl|inflection of}}
etc. that holds the base lemma(s) of the specified relationship and the tags describe the relationship, such as
{ {"comd"}} or { {"past", "part"}}.
* `.text_classes`: CSS classes used to wrap the tag text and lemma links. Default is {"form-of-definition use-with-mention"}
for the tag text and lemma links, and additionally {"form-of-definition-link"} specifically for the lemma links.
(FIXME: Should separate out the lemma links into their own field.)
* `.posttext`: Additional text to display after the lemma links.
]==]
function export.format_form_of(data)
if type(data) ~= "table" then
error("Internal error: First argument must now be a table of arguments")
end
local text_classes = data.text_classes or "form-of-definition use-with-mention"
local lemma_classes = data.text_classes or "form-of-definition-link"
local parts = {}
local function ins(text)
table.insert(parts, text)
end
ins("<span class='" .. text_classes .. "'>")
ins(data.text)
if data.text ~= "" and data.lemmas then
ins(" ")
end
if data.lemmas then
if type(data.lemmas) == "string" then
ins(wrap_in_span(data.lemmas, lemma_classes))
else
local formatted_terms = {}
for _, lemma in ipairs(data.lemmas) do
table.insert(formatted_terms, wrap_in_span(
m_links.full_link(lemma, data.lemma_face), lemma_classes
))
end
ins(m_table.serialCommaJoin(formatted_terms))
end
end
if data.enclitics and #data.enclitics > 0 then
-- The outer parens need to be outside of the text_classes span so they show in upright instead of italic, or
-- they will clash with upright parens generated by link annotations such as transliterations and pos=.
ins("</span>")
local formatted_terms = {}
for _, enclitic in ipairs(data.enclitics) do
-- FIXME, should we have separate clitic face and/or classes?
table.insert(formatted_terms, wrap_in_span(
m_links.full_link(enclitic, data.lemma_face, nil, "show qualifiers"), lemma_classes
))
end
ins(" (")
ins(wrap_in_span("with enclitic" .. (#data.enclitics > 1 and "s" or "") .. " ", text_classes))
ins(m_table.serialCommaJoin(formatted_terms))
ins(")")
ins("<span class='" .. text_classes .. "'>")
end
if data.base_lemmas and #data.base_lemmas > 0 then
for _, base_lemma in ipairs(data.base_lemmas) do
ins(", the </span>")
ins(export.tagged_inflections {
lang = base_lemma.lemmas[1].lang,
tags = base_lemma.paramobj.tags,
lemmas = base_lemma.lemmas,
lemma_face = data.lemma_face,
no_format_categories = true,
nocat = true,
text_classes = data.text_classes,
})
ins("<span class='" .. text_classes .. "'>")
end
end
-- FIXME, should posttext go before enclitics? If so we need to have separate handling for the
-- final colon when there are multiple tag sets in tagged_inflections().
if data.posttext then
ins(data.posttext)
end
ins("</span>")
return table.concat(parts)
end
--[==[
Return true if `tag` contains an internal link or HTML.
]==]
function export.is_link_or_html(tag)
return tag:find("[[", nil, true) or tag:find("|", nil, true) or tag:find("<", nil, true)
end
--[==[
Look up a tag (either a shortcut of any sort of a canonical long-form tag) and return its expansion. The expansion
will be a string unless the shortcut is a list-tag shortcut such as `1s`; in that case, the expansion will be a
list. The caller must handle both cases. Only one level of expansion happens; hence, `acc` expands to {"accusative"},
`1s` expands to { {"1", "s"}} (not to { {"first", "singular"}}) and `123` expands to {"1//2//3"}. The expansion will be
the same as the passed-in tag in the following circumstances:
# The tag is `;` (this is special-cased, and no lookup is done).
# The tag is a multipart tag such as `nom//acc` (this is special-cased, and no lookup is done).
# The tag contains a raw link (this is special-cased, and no lookup is done).
# The tag contains HTML (this is special-cased, and no lookup is done).
# The tag is already a canonical long-form tag.
# The tag is unrecognized.
This function first looks up in the lang-specific data module [[Module:form of/lang-data/LANGCODE]], then in
[[Module:form of/data]] (which includes more common non-lang-specific tags) and finally (only if the tag is not
recognized as a shortcut or canonical tag, and is not of types 1-4 above) in [[Module:form of/data2]].
If the expansion is a string and is different from the tag, track it if `do_track` is true.
]==]
function export.lookup_shortcut(tag, lang, do_track)
-- If there is HTML or a link in the tag, return it directly; don't try
-- to look it up, which will fail.
if tag == ";" or tag:find("//", nil, true) or export.is_link_or_html(tag) then
return tag
end
local expansion
local langcode = lang and lang:getCode()
if langcode and export.langs_with_lang_specific_tags[langcode] then
local langdata = mw.loadData(export.form_of_lang_data_module_prefix .. langcode)
-- If this is a canonical long-form tag, just return it, and don't check for shortcuts. This is an
-- optimization; see below.
if langdata.tags[tag] then
return tag
end
expansion = langdata.shortcuts[tag]
end
if not expansion and lang then
-- If the lang we're dealing with is an etym-only lang, try again with the corresponding full language.
local full_langcode = lang:getFullCode()
if full_langcode ~= langcode and export.langs_with_lang_specific_tags[full_langcode] then
local langdata = mw.loadData(export.form_of_lang_data_module_prefix .. full_langcode)
-- If this is a canonical long-form tag, just return it, and don't check for shortcuts. This is an
-- optimization; see below.
if langdata.tags[tag] then
return tag
end
expansion = langdata.shortcuts[tag]
end
end
if not expansion then
local m_data = mw.loadData(export.form_of_data_module)
-- If this is a canonical long-form tag, just return it, and don't check for shortcuts (which will cause
-- [[Module:form of/data2]] to be loaded, because there won't be a shortcut entry in [[Module:form of/data]] --
-- or, for that matter, in [[Module:form of/data2]]). This is an optimization; the code will still work without
-- it, but will use up more memory.
if m_data.tags[tag] then
return tag
end
expansion = m_data.shortcuts[tag]
end
if not expansion then
local m_data2 = mw.loadData(export.form_of_data2_module)
expansion = m_data2.shortcuts[tag]
end
if not expansion then
return tag
end
return expansion
end
--[==[
Look up a normalized/canonicalized tag and return the data object associated with it. If the tag isn't found, return
nil. This first looks up in the lang-specific data module [[Module:form of/lang-data/LANGCODE]], then in
[[Module:form of/data]] (which includes more common non-lang-specific tags) and then finally in
[[Module:form of/data2]].
]==]
function export.lookup_tag(tag, lang)
local langcode = lang and lang:getCode()
if langcode and export.langs_with_lang_specific_tags[langcode] then
local langdata = mw.loadData(export.form_of_lang_data_module_prefix .. langcode)
if langdata.tags[tag] then
return langdata.tags[tag]
end
end
local full_langcode = lang and lang:getFullCode()
if full_langcode and full_langcode ~= langcode and export.langs_with_lang_specific_tags[full_langcode] then
-- If the lang we're dealing with is an etym-only lang, try again with the corresponding full language.
local langdata = mw.loadData(export.form_of_lang_data_module_prefix .. full_langcode)
if langdata.tags[tag] then
return langdata.tags[tag]
end
end
local m_data = mw.loadData(export.form_of_data_module)
local tagobj = m_data.tags[tag]
if tagobj then
return tagobj
end
local m_data2 = mw.loadData(export.form_of_data2_module)
local tagobj2 = m_data2.tags[tag]
if tagobj2 then
return tagobj2
end
return nil
end
-- Normalize a single tag, which may be a shortcut but should not be a multipart tag, a multipart shortcut or a list
-- shortcut.
local function normalize_single_tag(tag, lang, do_track)
local expansion = export.lookup_shortcut(tag, lang, do_track)
if type(expansion) ~= "string" then
error("Tag '" .. tag .. "' is a list shortcut, which is not allowed here")
end
tag = expansion
return tag
end
--[=[
Normalize a component of a multipart tag. This should not have any // in it, but may join multiple individual tags with
a colon, and may be a single list-tag shortcut, which is treated as if colon-separated. The return value may be a list
of tags.
]=]
local function normalize_multipart_component(tag, lang, do_track)
-- If there is HTML or a link in the tag, don't try to split on colon. A colon may legitimately occur in either one,
-- and we don't want these things parsed. Note that we don't do this check before splitting on //, which we don't
-- expect to occur in links or HTML; see comment in normalize_tag().
if export.is_link_or_html(tag) then
return tag
end
local components = rsplit(tag, ":", true)
if #components == 1 then
-- We allow list-tag shortcuts inside of multipart tags, e.g.
-- '1s//3p'. Check for this now.
tag = export.lookup_shortcut(tag, lang, do_track)
if type(tag) == "table" then
-- We found a list-tag shortcut; treat as if colon-separated.
components = tag
else
return normalize_single_tag(tag, lang, do_track)
end
end
local normtags = {}
for _, component in ipairs(components) do
table.insert(normtags, normalize_single_tag(component, lang, do_track))
end
return normtags
end
--[=[
Normalize a single tag. The return value may be a list (in the case of multipart tags), which will contain nested lists
in the case of two-level multipart tags.
]=]
local function normalize_tag(tag, lang, do_track)
-- We don't check for links or HTML before splitting on //, which we don't expect to occur in links or HTML. Doing
-- it this way allows for a tag like '{{lb|grc|Epic}}//{{lb|grc|Ionic}}' to function correctly (the template calls
-- will be expanded before we process the tag, and will contain links and HTML). The only check we do is for a URL,
-- which shouldn't normally occur, but might if the user tries to put an external link into the tag. URL's with //
-- normally have the sequence ://, which should never normally occur when // and : are used in their normal ways.
if tag:find("://", nil, true) then
return tag
end
local split_tags = rsplit(tag, "//", true)
if #split_tags == 1 then
local retval = normalize_multipart_component(tag, lang, do_track)
if type(retval) == "table" then
-- The user gave a tag like '1:s', i.e. with colon but without //. Allow this, but we need to return a
-- nested list.
return {retval}
end
return retval
end
local normtags = {}
for _, single_tag in ipairs(split_tags) do
table.insert(normtags, normalize_multipart_component(single_tag, lang, do_track))
end
return normtags
end
--[==[
Normalize a tag set (a list of tags) into its canonical-form tags. The return value is a list of normalized tag sets
(a list because of there may be conjoined shortcuts among the input tags). A normalized tag set is a list of tag
elements, where each element is either a string (the canonical form of a tag), a list of such strings (in the case of
multipart tags) or a list of lists of such strings (in the case of two-level multipart tags). For example, the multipart
tag `nom//acc//voc` will be represented in canonical form as { {"nominative", "accusative", "vocative"}}, and the
two-level multipart tag `1:s//3:p` will be represented as { {{"first-person", "singular"}, {"third-person", "plural"}}}.
Example 1:
{normalize_tag_set({"nom//acc//voc", "n", "p"})} = { {{{"nominative", "accusative", "vocative"}, "masculine", "plural"}}}
Example 2:
{normalize_tag_set({"ed-form"}, ENGLISH)} = { {{"simple", "past"}, {"past", "participle"}}}
Example 3:
{normalize_tag_set({"archaic", "ed-form"}, ENGLISH)} = { {{"archaic", "simple", "past"}, {"archaic", "past", "participle"}}}
]==]
function export.normalize_tag_set(tag_set, lang, do_track)
local output_tag_set = {}
local saw_semicolon = false
for _, tag in ipairs(tag_set) do
-- Expand the tag, which may generate a new tag (either a fully canonicalized tag, a multipart tag, or a list
-- of tags).
tag = export.lookup_shortcut(tag, lang, do_track)
if type(tag) == "table" then
saw_semicolon = m_table.contains(tag, ";")
if saw_semicolon then
-- If we saw a conjoined shortcut, we need to use a more general algorithm that can expand a single
-- tag set into multiple.
break
end
for _, t in ipairs(tag) do
table.insert(output_tag_set, normalize_tag(t, lang, do_track))
end
else
table.insert(output_tag_set, normalize_tag(tag, lang, do_track))
end
end
if not saw_semicolon then
return {output_tag_set}
end
-- Use a more general algorithm that handles conjoined shortcuts.
local output_tag_set = {}
for i, tag in ipairs(tag_set) do
-- Expand the tag, which may generate a new tag (either a fully canonicalized tag, a multipart tag, or a list
-- of tags).
tag = export.lookup_shortcut(tag, lang, do_track)
if type(tag) == "table" then
local output_tag_sets = {}
local shortcut_tag_sets = export.split_tag_set(tag)
local normalized_shortcut_tag_sets = {}
for _, shortcut_tag_set in ipairs(shortcut_tag_sets) do
m_table.extendList(normalized_shortcut_tag_sets,
export.normalize_tag_set(shortcut_tag_set, lang, do_track))
end
local after_tags = slice(tag_set, i + 1)
local normalized_after_tags_sets = export.normalize_tag_set(after_tags, lang, do_track)
for _, normalized_shortcut_tag_set in ipairs(normalized_shortcut_tag_sets) do
for _, normalized_after_tags_set in ipairs(normalized_after_tags_sets) do
table.insert(output_tag_sets, m_table.append(output_tag_set, normalized_shortcut_tag_set,
normalized_after_tags_set))
end
end
return output_tag_sets
else
table.insert(output_tag_set, normalize_tag(tag, lang, do_track))
end
end
error("Internal error: Should not get here")
end
function export.combine_multipart_tags(tag_set)
for i, tag in ipairs(tag_set) do
if type(tag) == "table" then
for j, subtag in ipairs(tag) do
if type(subtag) == "table" then
tag[j] = table.concat(subtag, ":")
end
end
tag_set[i] = table.concat(tag, "//")
end
end
return tag_set
end
function export.normalize_tags(tags, lang, recombine_multitags, do_track)
local tag_sets = export.normalize_tag_set(tags, lang, do_track)
if recombine_multitags then
for i, tag_set in ipairs(tag_sets) do
tag_sets[i] = export.combine_multipart_tags(tag_set)
end
return export.combine_tag_sets(tag_sets)
end
return tag_sets
end
--[==[
Split a tag set containing two-level multipart tags into one or more tag sets not containing such tags.
Single-level multipart tags are left alone. (If we need to, a slight modification of the following code
will also split single-level multipart tags.) This assumes that multipart tags are represented as lists
and two-level multipart tags are represented as lists of lists, as is output by {normalize_tag_set()}.
NOTE: We have to be careful to properly handle imbalanced two-level multipart tags such as
`def:s//p` (or the reverse, `s//def:p`).
]==]
function export.split_two_level_multipart_tag_set(tag_set)
for i, tag in ipairs(tag_set) do
if type(tag) == "table" then
-- We saw a multipart tag. Check if any of the parts are two-level.
local saw_two_level_tag = false
for _, first_level_tag in ipairs(tag) do
if type(first_level_tag) == "table" then
saw_two_level_tag = true
break
end
end
if saw_two_level_tag then
-- We found a two-level multipart tag.
-- (1) Extract the preceding tags.
local pre_tags = slice(tag_set, 1, i - 1)
-- (2) Extract the following tags.
local post_tags = slice(tag_set, i + 1)
-- (3) Loop over each tag set alternant in the two-level multipart tag.
-- For each alternant, form the tag set consisting of pre_tags + alternant + post_tags,
-- and recursively split that tag set.
local resulting_tag_sets = {}
for _, first_level_tag_set in ipairs(tag) do
local expanded_tag_set = {}
m_table.extendList(expanded_tag_set, pre_tags)
-- The second level may have a string or a list.
if type(first_level_tag_set) == "table" then
m_table.extendList(expanded_tag_set, first_level_tag_set)
else
table.insert(expanded_tag_set, first_level_tag_set)
end
m_table.extendList(expanded_tag_set, post_tags)
m_table.extendList(resulting_tag_sets, export.split_two_level_multipart_tag_set(expanded_tag_set))
end
return resulting_tag_sets
end
end
end
return {tag_set}
end
--[==[
Split a tag set that may consist of multiple semicolon-separated tag sets into the component tag sets.
]==]
function export.split_tag_set(tag_set)
local split_tag_sets = {}
local cur_tag_set = {}
for _, tag in ipairs(tag_set) do
if tag == ";" then
if #cur_tag_set > 0 then
table.insert(split_tag_sets, cur_tag_set)
end
cur_tag_set = {}
else
table.insert(cur_tag_set, tag)
end
end
if #cur_tag_set > 0 then
table.insert(split_tag_sets, cur_tag_set)
end
return split_tag_sets
end
export.split_tags_into_tag_sets = export.split_tag_set
--[==[
Combine multiple tag sets in a tag set group into a simple tag set, with logical tag sets separated by semicolons.
This is the opposite of {split_tag_set()}.
]==]
function export.combine_tag_sets(tag_sets)
if #tag_sets == 1 then
return tag_sets[1]
end
local combined_tag_set = {}
for _, tag_set in ipairs(tag_sets) do
if #combined_tag_set > 0 then
table.insert(combined_tag_set, ";")
end
m_table.extendList(combined_tag_set, tag_set)
end
return tags
end
local tag_set_param_mods = {
lb = {
item_dest = "labels",
convert = function(arg, parse_err)
return rsplit(arg, "//", true)
end,
}
}
--[==[
Parse tag set properties from a tag set (list of tags). Currently no per-tag properties are recognized, and the only
per-tag-set property recognized is `<lb:...>` for specifing label(s) for the tag set. Per-tag-set properties must be
attached to the last tag.
]==]
function export.parse_tag_set_properties(tag_set)
local function generate_tag_set_obj(last_tag)
tag_set[#tag_set] = last_tag
return {tags = tag_set}
end
local last_tag = tag_set[#tag_set]
-- Check for inline modifier, e.g. מרים<tr:Miryem>. But exclude HTML entry with <span ...>, <i ...>, <br/> or
-- similar in it, caused by wrapping an argument in {{l|...}}, {{af|...}} or similar. Basically, all tags of
-- the sort we parse here should consist of a less-than sign, plus letters, plus a colon, e.g. <lb:...>, so if
-- we see a tag on the outer level that isn't in this format, we don't try to parse it. The restriction to the
-- outer level is to allow generated HTML inside of e.g. qualifier tags, such as foo<q:similar to {{m|fr|bar}}>.
if last_tag:find("<") and not last_tag:find("^[^<]*<[a-z]*[^a-z:]") then
return require(parse_utilities_module).parse_inline_modifiers(last_tag, {
param_mods = tag_set_param_mods,
generate_obj = generate_tag_set_obj,
})
else
return generate_tag_set_obj(last_tag)
end
end
function export.normalize_pos(pos)
if not pos then
return nil
end
return mw.loadData(export.form_of_pos_module)[pos] or pos
end
-- Return the display form of a single canonical-form tag. The value
-- passed in must be a string (i.e. it cannot be a list describing a
-- multipart tag). To handle multipart tags, use get_tag_display_form().
local function get_single_tag_display_form(normtag, lang)
local data = export.lookup_tag(normtag, lang)
local display = normtag
-- If the tag has a special display form, use it
if data and data.display then
display = data.display
end
-- If there is a nonempty glossary index, then show a link to it
local glossary = data and data[export.GLOSSARY]
if glossary ~= nil then
if glossary == export.WIKT then
display = "[[" .. normtag .. "|" .. display .. "]]"
elseif glossary == export.WP then
display = "[[w:" .. normtag .. "|" .. display .. "]]"
elseif glossary == export.APPENDIX then
display = "[[Appendix:Glossary#" .. mw.uri.anchorEncode(normtag) .. "|" .. display .. "]]"
elseif type(glossary) ~= "string" then
error(("Internal error: Wrong type %s for glossary value %s for tag %s"):format(
type(glossary), mw.dumpObject(glossary), normtag))
else
local link = rmatch(glossary, "^wikt:(.*)")
if link then
display = "[[" .. link .. "|" .. display .. "]]"
end
if not link then
link = rmatch(glossary, "^w:(.*)")
if link then
display = "[[w:" .. link .. "|" .. display .. "]]"
end
end
if not link then
display = "[[Appendix:Glossary#" .. mw.uri.anchorEncode(glossary) .. "|" .. display .. "]]"
end
end
end
return display
end
--[==[
Turn a canonicalized tag spec (which describes a single, possibly multipart tag) into the displayed form. The tag spec
may be a string (a canonical-form tag); a list of canonical-form tags (in the case of a simple multipart tag); or a
list of mixed canonical-form tags and lists of such tags (in the case of a two-level multipart tag). `joiner` indicates
how to join the parts of a multipart tag, and can be either {"and"} ("foo and bar", or "foo, bar and baz" for 3 or
more), {"slash"} ("foo/bar"), {"en-dash"} ("foo–bar") or {nil}, which uses the global default found in
{multipart_join_strategy()} in [[Module:form of/functions]]. (NOTE: The global default is {"slash"} and this seems
unlikely to change.)
]==]
function export.get_tag_display_form(tagspec, lang, joiner)
if type(tagspec) == "string" then
return get_single_tag_display_form(tagspec, lang)
end
-- We have a multipart tag. See if there's a display handler to display them specially.
for _, handler in ipairs(require(export.form_of_functions_module).display_handlers) do
local displayval = handler(tagspec, joiner)
if displayval then
return displayval
end
end
-- No display handler.
local displayed_tags = {}
for _, first_level_tag in ipairs(tagspec) do
if type(first_level_tag) == "string" then
table.insert(displayed_tags, get_single_tag_display_form(first_level_tag, lang))
else
-- A first-level element of a two-level multipart tag. Currently we just separate the individual components
-- with spaces, but other ways are possible, e.g. using an underscore, colon, parens or braces.
local components = {}
for _, component in ipairs(first_level_tag) do
table.insert(components, get_single_tag_display_form(component, lang))
end
table.insert(displayed_tags, table.concat(components, " "))
end
end
return require(export.form_of_functions_module).join_multiparts(displayed_tags, joiner)
end
--[==[
Given a normalized tag set (i.e. as output by {normalize_tag_set()}; all tags are in canonical form, multipart tags are
represented as lists, and two-level multipart tags as lists of lists), convert to displayed form (a string). See
{get_tag_display_form()} for the meaning of `joiner`.
]==]
function export.get_tag_set_display_form(normalized_tag_set, lang, joiner)
local parts = {}
for _, tagspec in ipairs(normalized_tag_set) do
local to_insert = export.get_tag_display_form(tagspec, lang, joiner)
-- Maybe insert a space before inserting the display form of the tag. We insert a space if
-- (a) we're not the first tag; and
-- (b) the tag we're about to insert doesn't have the "no_space_on_left" property; and
-- (c) the preceding tag doesn't have the "no_space_on_right" property.
-- NOTE: We depend here on the fact that
-- (1) all tags with either of the above properties set have the same display form as canonical form, and
-- (2) all tags with either of the above properties set are single-character tags.
-- The second property is an optimization to avoid looking up display forms resulting from multipart tags,
-- which won't be found and which will trigger loading of [[Module:form of/data2]]. If multichar punctuation is
-- added in the future, it's ok to change the == 1 below to <= 2 or <= 3.
--
-- If the first property above fails to hold in the future, we need to track the canonical form of each tag
-- (including the previous one) as well as the display form. This would also avoid the need for the == 1 check.
if #parts > 0 then
local most_recent_tagobj = ulen(parts[#parts]) == 1 and export.lookup_tag(parts[#parts], lang)
local to_insert_tagobj = ulen(to_insert) == 1 and export.lookup_tag(to_insert, lang)
if (
(not most_recent_tagobj or not most_recent_tagobj.no_space_on_right) and
(not to_insert_tagobj or not to_insert_tagobj.no_space_on_left)
) then
table.insert(parts, " ")
end
end
table.insert(parts, to_insert)
end
return table.concat(parts)
end
--[==[
Given a normalized tag set (i.e. as output by {normalize_tag_set()}; all tags are in canonical form, multipart tags are
represented as lists, and two-level multipart tags as lists of lists), fetch the associated categories and labels.
Return two values, a list of categories and a list of labels. `lang` is the language of term represented by the tag set,
and `POS` is the user-provided part of speech (which may be {nil}).
]==]
function export.fetch_categories_and_labels(normalized_tag_set, lang, POS, pagename, lemmas)
local m_cats = mw.loadData(export.form_of_cats_module)
local categories = {}
local labels = {}
POS = export.normalize_pos(POS)
-- First split any two-level multipart tags into multiple sets, to make our life easier.
for _, tag_set in ipairs(export.split_two_level_multipart_tag_set(normalized_tag_set)) do
-- Call a named function, either from the lang-specific data in
-- [[Module:form of/lang-specific/LANGCODE/functions]] or in [[Module:form of/functions]].
local function call_named_function(name, funtype)
local data = {
pagename = pagename or mw.title.getCurrentTitle().subpageText,
lemmas = lemmas,
tag_set = normalized_tag_set,
lang = lang,
POS = POS
}
local modules_tried = {}
local function try_lang_specific_module(langcode)
if export.langs_with_lang_specific_tags[langcode] then
local lang_specific_module = export.form_of_lang_data_module_prefix .. langcode .. "/functions"
local langdata = require(utilities_module).safe_require(lang_specific_module)
if langdata then
table.insert(modules_tried, lang_specific_module)
if langdata.cat_functions then
local fn = langdata.cat_functions[name]
if fn then
return fn(data), true
end
end
end
end
return nil, false
end
-- First try lang-specific.
local langcode = lang and lang:getCode()
if langcode then
local retval, found_it = try_lang_specific_module(langcode)
if found_it then
return retval
end
end
-- If the lang we're dealing with is an etym-only lang, try again with the corresponding full language.
local full_langcode = lang and lang:getFullCode()
if full_langcode and full_langcode ~= langcode then
local retval, found_it = try_lang_specific_module(full_langcode)
if found_it then
return retval
end
end
-- Try lang-independent.
table.insert(modules_tried, export.form_of_functions_module)
local fn = require(export.form_of_functions_module).cat_functions[name]
if fn then
return fn(data)
end
for i, modname in ipairs(modules_tried) do
modules_tried[i] = "[[" .. modname .. "]]"
end
error(("No %s function named '%s' in %s"):format(funtype, name, lang_specific_part,
m_table.serialCommaJoin(modules_tried, {conj = "or", dontTag = true})))
end
-- Given a tag from the current tag set (which may be a list in case of a multipart tag),
-- and a tag from a categorization spec, check that the two match.
-- (1) If both are strings, we just check for equality.
-- (2) If the spec tag is a string and the tag set tag is a list (i.e. it originates from a
-- multipart tag), we check that the spec tag is in the list. This is because we want to treat
-- multipart tags in user-specified tag sets as if the user had specified multiple tag sets.
-- For example, if the user said "1//3|s|pres|ind" and the categorization spec says {"has", "1"},
-- we want this to match, because "1//3|s|pres|ind" should be treated equivalently to two tag
-- sets "1|s|pres|ind" and "3|s|pres|ind", and the former matches the categorization spec.
-- (3) If the spec tag is a list (i.e. it originates from a multipart tag), we check that the
-- tag set tag is also a list and is a superset of the spec tag. For example, if the categorization
-- spec says {"has", "1//3"}, then the tag set tag must be a multipart tag that has both "1" and "3"
-- in it. "1//3" works, as does "1//2//3".
local function tag_set_tag_matches_spec_tag(tag_set_tag, spec_tag)
if type(spec_tag) == "table" then
if type(tag_set_tag) == "table" and is_subset(spec_tag, tag_set_tag) then
return true
end
elseif type(tag_set_tag) == "table" then
if m_table.contains(tag_set_tag, spec_tag) then
return true
end
elseif tag_set_tag == spec_tag then
return true
end
return false
end
-- Check that the current tag set matches the given spec tag. This means that any of the tags
-- in the current tag set match, according to tag_set_tag_matches_spec_tag(); see above. If the
-- current tag set contains only string tags (i.e. no multipart tags), and the spec tag is a
-- string (i.e. not a multipart tag), this boils down to list containment, but it gets more
-- complex when multipart tags are present.
local function tag_set_matches_spec_tag(spec_tag)
spec_tag = normalize_tag(spec_tag, lang)
for _, tag_set_tag in ipairs(tag_set) do
if tag_set_tag_matches_spec_tag(tag_set_tag, spec_tag) then
return true
end
end
return false
end
-- Check whether the given spec matches the current tag set. Two values are returned:
-- (1) whether the spec matches the tag set; (2) the index of the category to add if
-- the spec matches.
local function check_condition(spec)
if type(spec) == "boolean" then
return spec
elseif type(spec) ~= "table" then
error("Wrong type of condition " .. spec .. ": " .. type(spec))
end
local predicate = spec[1]
if predicate == "has" then
return tag_set_matches_spec_tag(spec[2]), 3
elseif predicate == "hasall" then
for _, tag in ipairs(spec[2]) do
if not tag_set_matches_spec_tag(tag) then
return false, 3
end
end
return true, 3
elseif predicate == "hasany" then
for _, tag in ipairs(spec[2]) do
if tag_set_matches_spec_tag(tag) then
return true, 3
end
end
return false, 3
elseif predicate == "tags=" then
local normalized_spec_tag_sets = export.normalize_tag_set(spec[2], lang)
if #normalized_spec_tag_sets > 1 then
error("Internal error: No support for conjoined shortcuts in category/label specs in "
.. "[[Module:form of/cats]] when processing spec tag set " .. table.concat(spec[2], "|"))
end
local normalized_spec_tag_set = normalized_spec_tag_sets[1]
-- Check for and disallow two-level multipart tags in the specs. FIXME: Remove this when we remove
-- support for two-level multipart tags.
for _, tag in ipairs(normalized_spec_tag_set) do
if type(tag) == "table" then
for _, subtag in ipairs(tag) do
if type(subtag) == "table" then
error("Internal error: No support for two-level multipart tags in category/label specs"
.. "[[Module:form of/cats]] when processing spec tag set "
.. table.concat(spec[2], "|"))
end
end
end
end
-- Allow tags to be in different orders, and multipart tags to be in different orders. To handle this,
-- we first check that both tag set tags and spec tags have the same length. If so, we sort the
-- multipart tags in the tag set tags and spec tags, and then check that all tags in the spec tags are
-- in the tag set tags.
if #tag_set ~= #normalized_spec_tag_set then
return false, 3
end
local tag_set_tags = m_table.deepcopy(tag_set)
for i=1,#tag_set_tags do
if type(tag_set_tags[i]) == "table" then
table.sort(tag_set_tags[i])
end
if type(normalized_spec_tag_set[i]) == "table" then
table.sort(normalized_spec_tag_set[i])
end
end
for i=1,#tag_set_tags do
if not m_table.contains(tag_set_tags, normalized_spec_tag_set[i]) then
return false, 3
end
end
return true, 3
elseif predicate == "p=" then
return POS == export.normalize_pos(spec[2]), 3
elseif predicate == "pany" then
for _, specpos in ipairs(spec[2]) do
if POS == export.normalize_pos(specpos) then
return true, 3
end
end
return false, 3
elseif predicate == "pexists" then
return POS ~= nil, 2
elseif predicate == "not" then
local condval = check_condition(spec[2])
return not condval, 3
elseif predicate == "and" then
local condval = check_condition(spec[2])
if condval then
condval = check_condition(spec[3])
end
return condval, 4
elseif predicate == "or" then
local condval = check_condition(spec[2])
if not condval then
condval = check_condition(spec[3])
end
return condval, 4
elseif predicate == "call" then
return fn(call_named_function(spec[2], "condition")), 3
else
error("Unrecognized predicate: " .. predicate)
end
end
-- Process a given spec. This checks any conditions in the spec against the
-- tag set, and insert any resulting categories into `categories`. Return value
-- is true if the outermost condition evaluated to true and a category was inserted
-- (this is used in {"cond" ...} conditions, which stop when a subcondition evaluates
-- to true).
local function process_spec(spec)
if not spec then
return false
elseif type(spec) == "string" then
-- A category. Substitute POS request with user-specified part of speech or default.
spec = rsub(spec, "<<p=(.-)>>", function(default)
return POS or export.normalize_pos(default)
end)
table.insert(categories, lang:getFullName() .. " " .. spec)
return true
elseif type(spec) == "table" and spec.labels then
-- A label spec.
for _, label in ipairs(spec.labels) do
m_table.insertIfNot(labels, label)
end
return true
elseif type(spec) ~= "table" then
error("Wrong type of specification " .. spec .. ": " .. type(spec))
end
local predicate = spec[1]
if predicate == "multi" then
-- WARNING! #spec doesn't work for objects loaded from loadData()
for i, sp in ipairs(spec) do
if i > 1 then
process_spec(sp)
end
end
return true
elseif predicate == "cond" then
-- WARNING! #spec doesn't work for objects loaded from loadData()
for i, sp in ipairs(spec) do
if i > 1 and process_spec(sp) then
return true
end
end
return false
elseif predicate == "call" then
return process_spec(call_named_function(spec[2], "spec"))
else
local condval, ifspec = check_condition(spec)
if condval then
process_spec(spec[ifspec])
return true
else
process_spec(spec[ifspec + 1])
-- FIXME: Are we sure this is correct?
return false
end
end
end
local langcode = lang:getCode()
local langspecs = m_cats[langcode]
if langspecs then
for _, spec in ipairs(langspecs) do
process_spec(spec)
end
end
local full_code = lang:getFullCode()
if full_code ~= langcode then
local langspecs = m_cats[full_code]
if langspecs then
for _, spec in ipairs(langspecs) do
process_spec(spec)
end
end
end
if full_code ~= "und" then
local langspecs = m_cats["und"]
if langspecs then
for _, spec in ipairs(langspecs) do
process_spec(spec)
end
end
end
end
return categories, labels
end
--[==[
Implementation of templates that display inflection tags, such as the general {{tl|inflection of}}, semi-specific
variants such as {{tl|participle of}}, and specific variants such as {{tl|past participle of}}. `data` contains all the
information controlling the display, with the following fields:
* `.lang`: ('''''required''''') Language to use when looking up language-specific inflection tags, categories and
labels, and for displaying categories and labels.
* `.tags`: ('''''required''' unless `.tag_sets` is given'') List of non-canonicalized inflection tags. Multiple tag sets
can be indicated by a {";"} as one of the tags, and tag-set properties may be attached to the last tag of a tag set.
The tags themselves may come directly from the user (as in {{tl|inflection of}}); come partly from the user (as in
{{tl|participle of}}, which adds the tag `part` to user-specified inflection tags); or be entirely specified by the
template (as in {{tl|past participle of}}).
* `.tag_sets`: ('''''required''' unless `.tags` is given'') List of non-canonicalized tag sets and associated
per-tag-set properties. Each element of the list is an object of the form
{ {tags = {"TAG", "TAG", ...}, labels = {"LABEL", "LABEL", ...}}. If `.tag_sets` is specified, `.tags` should not be
given and vice-versa. Specifying `.tag_sets` in place of tags allowed per-tag set labels to be specified; otherwise,
there is no advantage. [[Module:pt-gl-inflections]] uses this functionality to supply labels like {"Brazil"} and
{"Portugal"} associated with specific tag sets.
* `.lemmas`: ('''''recommended''''') List of objects describing the lemma(s) of which the term in question is a
non-lemma form. These are passed directly to {full_link()} in [[Module:links]]. Each object should have at minimum a
`.lang` field containing the language of the lemma and a `.term` field containing the lemma itself. Each object is
formatted using {full_link()} and then if there are more than one, they are joined using {serialCommaJoin()} in
[[Module:table]]. Alternatively, `.lemmas` can be a string, which is displayed directly. If omitted entirely, no lemma
links are shown and the connecting "of" is also omitted.
* `.lemma_face`: ('''''recommended''''') "Face" to use when displaying the lemma objects. Usually should be set to
{"term"}.
* `.POS`: ('''''recommended''''') Categorizing part-of-speech tag. Comes from the {{para|p}} or {{para|POS}} argument of
{{tl|inflection of}}.
* `.pagename`: Page name of "current" page or nil to use the actual page title; for testing purposes.
* `.enclitics`: List of enclitics to display after the lemmas, in parens.
* `.no_format_categories`: If true, don't format the categories derived from the inflection tags; just return them.
* `.sort`: Sort key for formatted categories. Ignored when `.no_format_categories` = {true}.
* `.nocat`: Suppress computation of categories (even if `.no_format_categories` is not given).
* `.notext`: Disable display of all tag text and `inflection of` text. (FIXME: Maybe not implemented correctly.)
* `.capfirst`: Capitalize the first word displayed.
* `.pretext`: Additional text to display before the inflection tags, but after any top-level labels.
* `.posttext`: Additional text to display after the lemma links.
* `.text_classes`: CSS classes used to wrap the tag text and lemma links. Default is
{"form-of-definition use-with-mention"} for the tag text, {"form-of-definition-link"} for the lemma links. (FIXME:
Should separate out the lemma links into their own field.)
`.joiner`: Override the joiner (normally a slash) used to join multipart tags. You should normally not specify this.
A typical call might look like this (for {{m+|es|amo}}): {
local lang = require("Module:languages").getByCode("es")
local lemma_obj = {
lang = lang,
term = "amar",
}
return m_form_of.tagged_inflections({
lang = lang, tags = {"1", "s", "pres", "ind"}, lemmas = {lemma_obj}, lemma_face = "term", POS = "verb"
})
}
Normally, one value is returned, the formatted text, which has appended to it the formatted categories derived from the
tag-set-related categories generated by the specs in [Module:form of/cats]]. To suppress this, set
`data.no_format_categories` = {true}, in which case two values are returned, the formatted text without any formatted
categories appended and a list of the categories to be formatted.
NOTE: There are two sets of categories that may be generated: (1) categories derived directly from the tag sets, as
specified in [[Module:form of/cats]]; (2) categories derived from tag-set labels, either (a) set explicitly by the
caller in `data.tag_sets`, (b) specified by the user using `<lb:...>` attached to the last tag in a tag set, or
(c) specified in [[Module:form of/cats]]. The second type (label-related categories) are currently not returned in
the second return value of {tagged_inflections()}, and are currently inserted into the output text even if
`data.no_format_categories` is set to {true}; but they can be suppressed by setting `data.nocat` = {true} (which also
suppresses the first type of categories, those derived directly from tag sets, even if `data.no_format_categories` is
set to {true}).
]==]
function export.tagged_inflections(data)
if not data.tags and not data.tag_sets then
error("First argument must be a table of arguments, and `.tags` or `.tag_sets` must be specified")
end
if data.tags and data.tag_sets then
error("Both `.tags` and `.tag_sets` cannot be specified")
end
local tag_sets = data.tag_sets
if not tag_sets then
tag_sets = export.split_tag_set(data.tags)
for i, tag_set in ipairs(tag_sets) do
tag_sets[i] = export.parse_tag_set_properties(tag_set)
end
end
local inflections = {}
local categories = {}
for _, tag_set in ipairs(tag_sets) do
local normalized_tag_sets = export.normalize_tag_set(tag_set.tags, data.lang, "do-track")
for _, normalized_tag_set in ipairs(normalized_tag_sets) do
local cur_infl = {}
local this_categories, this_labels = export.fetch_categories_and_labels(normalized_tag_set, data.lang,
data.POS, data.pagename, type(data.lemmas) == "table" and data.lemmas or nil)
if not data.nocat then
m_table.extendList(categories, this_categories)
end
local cur_infl = export.get_tag_set_display_form(normalized_tag_set, data.lang, data.joiner)
if #cur_infl > 0 then
if tag_set.labels then
this_labels = m_table.append(tag_set.labels, this_labels)
end
table.insert(inflections, {infl_text = cur_infl, labels = this_labels})
end
end
end
local overall_labels, need_per_tag_set_labels
for _, inflection in ipairs(inflections) do
if overall_labels == nil then
overall_labels = inflection.labels
elseif not m_table.deepEquals(overall_labels, inflection.labels) then
need_per_tag_set_labels = true
overall_labels = nil
break
end
end
if not need_per_tag_set_labels then
for _, inflection in ipairs(inflections) do
inflection.labels = nil
end
end
local format_data = m_table.shallowcopy(data)
local function format_labels(labels, notext)
if labels and #labels > 0 then
return require(labels_module).show_labels { labels = labels, lang = data.lang, sort = data.sort, nocat = data.nocat } ..
(notext and (data.pretext or "") == "" and "" or " ")
else
return ""
end
end
local of_text = data.lemmas and " of" or ""
local formatted_text
if #inflections == 1 then
if need_per_tag_set_labels then
error("Internal error: need_per_tag_set_labels should not be set with one inflection")
end
format_data.text = format_labels(overall_labels, data.notext) .. (data.pretext or "") .. (data.notext and "" or
((data.capfirst and require("Module:string utilities").ucfirst(inflections[1].infl_text) or inflections[1].infl_text) .. of_text))
formatted_text = export.format_form_of(format_data)
else
format_data.text = format_labels(overall_labels, data.notext) .. (data.pretext or "") .. (data.notext and "" or
((data.capfirst and "Inflection" or "inflection") .. of_text))
format_data.posttext = (data.posttext or "") .. ":"
local link = export.format_form_of(format_data)
local text_classes = data.text_classes or "form-of-definition use-with-mention"
for i, inflection in ipairs(inflections) do
inflections[i] = "\n## " .. format_labels(inflection.labels, false) ..
"<span class='" .. text_classes .. "'>" .. inflection.infl_text .. "</span>"
end
formatted_text = link .. table.concat(inflections)
end
if not data.no_format_categories then
if #categories > 0 then
formatted_text = formatted_text .. require("Module:utilities").format_categories(categories, data.lang,
data.sort, nil, export.force_cat)
end
return formatted_text
end
return formatted_text, categories
end
--[==[
Given a tag set, return a flattened list all Wikidata ID's of all tags in the tag set. FIXME: Only used in a debugging
function in [[Module:se-verbs]]; move there.
]==]
function export.to_Wikidata_IDs(tag_set, lang, skip_tags_without_ids)
local ret = {}
local function get_wikidata_id(tag)
local data = export.lookup_tag(tag, lang)
if not data or not data[export.WIKIDATA] then
if not skip_tags_without_ids then
error('The tag "' .. tag .. '" does not have a Wikidata ID defined in the form-of data modules')
else
return nil
end
else
return ("Q%s"):format(data[export.WIKIDATA])
end
end
local normalized_tag_sets = export.normalize_tag_set(tag_set, lang)
for _, tag_set in ipairs(normalized_tag_sets) do
for _, tag in ipairs(tag_set) do
if type(tag) == "table" then
for _, subtag in ipairs(tag) do
if type(subtag) == "table" then
-- two-level multipart tag; FIXME: delete support for this
for _, subsubtag in ipairs(subtag) do
table.insert(ret, get_wikidata_id(subsubtag))
end
else
table.insert(ret, get_wikidata_id(subtag))
end
end
else
table.insert(ret, get_wikidata_id(tag))
end
end
end
return ret
end
function export.dump_form_of_data(frame)
local data = {
data = require(export.form_of_data_module),
data2 = require(export.form_of_data2_module)
}
return require("Module:JSON").toJSON(data)
end
function export.finalize_tag_data(tags, shortcuts)
local function process_shortcut(name, shortcut)
-- If the shortcut is already in the list, then there is a duplicate.
if shortcuts[shortcut] then
error("The shortcut \"" .. shortcut .. "\" (for the inflection tag \"" .. name .. "\") conflicts with an existing shortcut for the tag \"" .. shortcuts[shortcut] .. "\".")
elseif tags[shortcut] then
error("The shortcut \"" .. shortcut .. "\" (for the inflection tag \"" .. name .. "\") conflicts with an existing tag with that name.")
end
shortcuts[shortcut] = name
end
for name, data in pairs(tags) do
local data_shortcuts = data[export.SHORTCUTS]
if data_shortcuts then
if type(data_shortcuts) == "string" then
process_shortcut(name, data_shortcuts)
else
for _, shortcut in ipairs(data_shortcuts) do
process_shortcut(name, shortcut)
end
end
end
end
end
return export