48,357
edits
No edit summary |
No edit summary |
||
| Line 2: | Line 2: | ||
local m_links = require("Module:links") | local m_links = require("Module:links") | ||
local | local m_str_utils = require("Module:string utilities") | ||
local m_table = require("Module:table") | local m_table = require("Module:table") | ||
local put = require("Module:parse utilities") | local put = require("Module:parse utilities") | ||
local headword_data_module = "Module:headword/data" | |||
local script_utilities_module = "Module:script utilities" | |||
local table_tools_module = "Module:table tools" | |||
local | local is_callable = require("Module:fun").is_callable | ||
local split = m_str_utils.split | |||
local rfind = mw.ustring.find | local rfind = mw.ustring.find | ||
local rmatch = mw.ustring.match | local rmatch = mw.ustring.match | ||
local rsubn = mw.ustring.gsub | local rsubn = mw.ustring.gsub | ||
local ucfirst = m_str_utils.ucfirst | |||
local unpack = unpack or table.unpack -- Lua 5.2 compatibility | |||
local dump = mw.dumpObject | |||
-- version of rsubn() that discards all but the first return value | -- version of rsubn() that discards all but the first return value | ||
| Line 15: | Line 22: | ||
local retval = rsubn(term, foo, bar) | local retval = rsubn(term, foo, bar) | ||
return retval | return retval | ||
end | end | ||
| Line 34: | Line 37: | ||
-- | --[==[ intro: | ||
The following code is used in building up the inflection of terms in inflected languages, where a term can potentially | The following code is used in building up the inflection of terms in inflected languages, where a term can potentially | ||
consist of several inflected words, each surrounded by fixed text, and a given slot (e.g. accusative singular) of a | consist of several inflected words, each surrounded by fixed text, and a given slot (e.g. accusative singular) of a | ||
| Line 65: | Line 44: | ||
terminology is helpful to understand: | terminology is helpful to understand: | ||
* A '''term''' is a word or multiword expression that can be inflected. A multiword term may in turn consist of several | |||
single-word inflected terms with surrounding fixed text. A term belongs to a particular '''part of speech''' (e.g. | |||
* A | noun, verb, adjective, etc.). | ||
single-word inflected terms with surrounding fixed text. A term belongs to a particular | * An '''inflection dimension''' is a particular dimension over which a term may be inflected, such as case, number, | ||
gender, person, tense, mood, voice, aspect, etc. | |||
* | * The '''lemma''' is the particular form of a term under which the term is entered into a dictionary. For example, for | ||
* The | |||
verbs, it is most commonly the infinitive, but this differs for some languages: e.g. Latin, Greek and Bulgarian use | verbs, it is most commonly the infinitive, but this differs for some languages: e.g. Latin, Greek and Bulgarian use | ||
the first-person singular present indicative (active voice in the case of Latin and Greek); Sanskrit and Macedonian | the first-person singular present indicative (active voice in the case of Latin and Greek); Sanskrit and Macedonian | ||
| Line 92: | Line 55: | ||
third-person singular masculine past (aka "perfect"); etc. For nouns, the lemma form is most commonly the nominative | third-person singular masculine past (aka "perfect"); etc. For nouns, the lemma form is most commonly the nominative | ||
singular, but e.g. for Old French it is the objective singular and for Sanskrit it is the root. | singular, but e.g. for Old French it is the objective singular and for Sanskrit it is the root. | ||
]=] | * A '''slot''' is a particular combination of inflection dimensions. An example might be "accusative plural" for a noun, | ||
or "first-person singular present indicative" for a verb. Slots are named in a language-specific fashion. For | |||
example, the slot "accusative plural" might have a name `accpl`, while "first-person singular present indicative" | |||
might be variously named `pres1s`, `pres_ind_1_sg`, etc. Each slot is filled with zero or more '''forms'''. | |||
* A '''form''' is a particular inflection of a slot for a particular term. Note that a given slot may (and often does) | |||
have more than one associated form; these different forms are termed '''variants'''. An example is | |||
{{m+|de|Bug||bow (of a ship)}}, which has two genitive singular forms ''Buges'' and ''Bugs''; two plural forms in all | |||
cases, e.g. nominative plural ''Buge'' and ''Büge''; and two dative singular forms ''Bug'' and rare/archaic ''Buge''. | |||
The form variants for a given slot are ordered, and generally should have the more common and/or preferred variants | |||
first, along with rare, archaic or obsolete variants last (if they are included at all). | |||
* Forms are described using '''form objects''', which are Lua objects taking the form | |||
`{form="``form_value``", translit="``manual_translit``", footnotes={"``footnote``", "``footnote``", ...}}`. | |||
(Additional '''metadata''' may be present in a form object, although the support for preserving such metadata when | |||
transformations are applied to form objects isn't yet complete.) ``form_value`` is a '''form value''' specifying the | |||
value of the form itself in the term's script. ``manual_translit`` specifies optional manual transliteration for the | |||
form, in case (a) the form value is in a different script; and (b) either the form's automatic transliteration is | |||
incorrect and needs to be overridden, or the language of the term has no automatic transliteration (e.g. in the case | |||
of Persian and Hebrew). ``footnote`` is a footnote to be attached to the form in question, and should be e.g. | |||
{"[archaic]"} or {"[only in the meaning 'to succeed (an officeholder)']"}, i.e. the string must be surrounded by | |||
brackets and should begin with a lowercase letter and not end in a period/full stop. When such footnotes are converted | |||
to actual footnotes in a table of inflected forms, the brackets will be removed, the first letter will be capitalized | |||
and a period/full stop will be added to the end. (However, when such footnotes are used as qualifiers in headword | |||
lines, only the brackets will be removed, with no capitalization or final period.) Note that only ``form_value`` is | |||
mandatory. | |||
* A list of zero or more form objects is termed a '''form object list''', or usually just a '''form list'''. Such lists | |||
are ordered and go into form tables (see below). | |||
* A '''form table''' is a Lua table (i.e. a dictionary) describing all the possible inflections of a given term. The | |||
keys in such a table are slots (strings) and the values are form lists. '''NOTE:''' All inflection code assumes and | |||
maintains the invariant that no two slots, and no two forms in a single slot, share the same form object (by | |||
reference, i.e. the Lua object describing a form object should never be shared in two places). This allows for safely | |||
side-effecting form objects in certain sorts of operations. This same invariant necessarily applies to the Lua list | |||
objects containing the form objects, but does '''NOT''' apply to metadata inside of form objects. In particular, a | |||
list of footnotes may well be shared among different form objects. This means it is '''NOT''' safe to side-effect | |||
such lists, and in fact no code in this module that manipulates footnote lists will ever side-effect such lists; they | |||
are treated as immutable. | |||
* Some functions, to save memory, accept and work with abbreviated forms of form objects and/or form lists. | |||
Specifically, an '''abbreviated form object''' is either a form object or a string, the latter corresponding to a form | |||
object whose form value is the string and all other properties are nil. Similarly, an '''abbreviated form list''' is | |||
either a single abbreviated form object or a list of such objects, i.e. any of a string, form object or list of | |||
strings and/or form objects. Functions that do not accept such abbreviated structures may be said to insist on being | |||
passed form objects in '''general form''', or form lists in '''general list form'''. | |||
* Each slot is associated with an '''accelerator tag set''', which is a list of inflection tags that are used when | |||
generating an accelerator entry for the forms in the slot (see [[WT:ACCEL]]). For example, the first singular present | |||
indicative of a verb might have slot name `pres_1sg` and corresponding accelerator tag set `1|s|pres|ind`. As shown, | |||
the accelerator tag set is a string consisting of inflection tags (as used in {{tl|inflection of}}) separated by `|`. | |||
Despite the terminology ''tag set'', the tags in a tag set are ordered, although the same tag should never occur | |||
twice. | |||
* Some inflected terms are '''multiword''', i.e. they consist of multiple '''words''', where each word is generally | |||
separated by spaces or sometimes hyphens. In such a term, some of the words inflect, while others remain fixed. Words | |||
that inflect are termed '''inflecting words''' (or more correctly '''inflecting parts''', since in some circumstances, | |||
parts of a word can inflect). The '''fixed text''' is all the parts of a multiword term that do not inflect. | |||
* The descriptor that describes how a given term inflects is called an '''inflection spec''', and consists of the lemma | |||
form of the term itself, annotated with an '''angle bracket spec''' after each inflecting word. As the name implies, | |||
an angle bracket spec is surrounded by angle brackets (`<...>`). A simple example is {{m+|de|Feder||feather}}, whose | |||
inflection spec looks like `Feder<f>`, where `f` specifies the feminine gender. In this case, although there are | |||
several properties that could be specified between angle brackets, all except the gender are optional and have been | |||
left out, indicating that defaults should be used. Another example is {{m+|de|Baske||Basque person}}, whose inflection | |||
spec looks like `Baske<m.weak>`, where `m` specifies the masculine gender and `weak` specifies the weak inflection. | |||
Note that individual components of an angle bracket spec like `m` and `weak` are termed '''indicators''' and are | |||
separated by periods/full stops. A slightly more complex example is {{m+|de|Zeitgeist||zeitgeist}}, whose inflection | |||
spec looks like `Zeitgeist<m,es:s,er>` and which specifies three things in a single '''compound indicator''': `m` (the | |||
masculine gender); `es:s` (the genitive singular, which can end in either ''-es'' or ''-s''); and `er` (the | |||
nominative plural, which ends in ''-er''). | |||
* If there are several inflecting words in a term, each one will be followed by its own angle bracket spec. An example | |||
is {{m+|de|schwarzes Loch||black hole}}, whose inflection spec looks like `schwarzes<+> Loch<n,es:s,^er>`. Here, | |||
the adjective ''schwarzes'' (the nominative neuter singular of {{m|de|schwarz||black}}) is followed by the angle | |||
bracket spec `<+>` specifying that it inflects as an adjective, and the noun ''Loch'' has the angle bracket spec | |||
`<n,es:s,^er>`, indicating (similarly to the above example) that it is neuter, has a genitive singular in either | |||
''-es'' or ''-s'', and has a nominative plural in ''-er'' with umlaut, hence ''Löcher'' (the `^` specifies that the | |||
form requires umlaut). | |||
* Sometimes a given term has multiple ways of inflecting that differ in ways that can't be specified using a single | |||
angle bracket spec. This is supported using '''alternants''', which are specified using double parentheses. (This is | |||
so that terms that themselves contain parentheses can be specified without interference.) An example is | |||
{{m+|uk|русин||Rusyn}}, which can be stressed either as ''ру́син'' (stress on the first syllable and following accent | |||
paradigm ''a'', hence genitive singular ''ру́сина'') or ''руси́н'' (stress on the second syllable and following accent | |||
paradigm ''b'', hence genitive singular ''русина́''; note how the stress moves onto the ending, in accordance with the | |||
accent paradigm). This is specified using `((ру́син<pr>,руси́н<b.pr>))`, i.e. separate the alternants with a comma | |||
and surround them with double parentheses. (Here, `pr` means that the terms belong to the personal animacy class, and | |||
`b` specifies the accent paradigm; paradigm ''a'' is the default and hence is omitted.) | |||
* Note that occasionally, parts of a single space-delimited word can inflect separately. An example is | |||
{{m+|la|rōsmarīnus||rosemary}}, which is a compound of {{m+|la|rōs||dew}} and {{m|la|marīnus||marine, of the sea}}. | |||
In this compound, both parts of the compound can inflect separately; hence genitive singular ''rōrismarīnī'', | |||
accusative singular ''rōremmarīnum'', etc. Alternatively, only the second part inflects; hence genitive singular | |||
''rōsmarīnī'', accusative singular ''rōsmarīnum'', etc. This is specified as | |||
`((rōs/rōr<3.M>marīnus<2>,rōsmarīnus<2>))`. Here, the term {{m|la|rōs}} by itself would have inflection spec | |||
`rōs/rōr<3.M>` (indicating that it is third declension masculine with a non-nominative-singular stem ''rōr-'') and | |||
the term {{m|la|marīnus}} would have inflection spec `marīnus<2>` (indicating that it is second declension; the masculine | |||
gender is inferred from the ''-us'' ending). When combined in a single inflection spec, the doubly-inflecting | |||
alternant is written `rōs/rōr<3.M>marīnus<2>`, with each inflecting part followed by its corresponding angle bracket | |||
spec, and the singly-inflecting alternant is written `rōsmarīnus<2>`. As this example shows, the two alternants | |||
need not correspond in how many inflecting parts there are. It should also be noted that fixed text can surround | |||
an alternant and it is even possible to supply multiple alternants in a single inflection spec (e.g. if the term | |||
has two words in it and each word requires an alternant to inflect). | |||
* The result of parsing a single angle bracket spec is stored into a '''word spec'''. The structure of a word spec is | |||
fairly arbitrary and is determined by the user-written `parse_indicator_spec` function, but always contains a form | |||
table under the `forms` key that is populated during inflection (see below). A parameter or local variable that holds | |||
a word spec is conventionally named `base` for historical reasons. Word specs are grouped together into a structure | |||
termed a '''multiword spec''', which describes one or more word specs along with the fixed text in between and around | |||
the inflected words. Multiword specs are in turn grouped into structures termed '''alternant specs''', indicating | |||
the distinct alternants and the words in each alternant. Finally, multiword specs and alternant specs are grouped into | |||
an '''alternant multiword spec''', which is the top-level object describing an inflection spec. Each of these | |||
different specs has a form table in it stored in the `forms` key that is populated during the inflection process and | |||
contains the form objects that specify the inflections of this part of the full multiword term. (It should be noted | |||
that the term '''spec''' is overloaded to mean two different things: the user-specified descriptor that specifies the | |||
lemma form of the term and associated inflection, and the associated internal Lua object that encapsulates all | |||
information derived from the descriptor, along with later-generated information on how to inflect the term(s) being | |||
described.) | |||
* Among these various "spec" structures, the two most important are the top-level alternant multiword spec and the | |||
bottom-level word spec or "base". You will rarely find it necessary to manipulate the intermediate structures or | |||
concern yourself with the details of their formation. | |||
* The term ''form'' is unfortunately overloaded in various modules to mean several things. In particular, for | |||
historical reasons, the form value inside of a form object is stored using the key `form`; the form table inside of an | |||
alternant multiword spec, a word spec (or "base") and the intermediate structures is stored using the key `forms`; and | |||
the accelerator tag set is internally referred to in [[WT:ACCEL]] as a "form". To avoid confusion, the following | |||
conventions are followed in code in this module, and should be followed for code in invoking modules as well: | |||
*# Functions that accept form objects often name the relevant parameter `form` (if a single form object is required) or | |||
`forms` (if a list of form objects, aka form list, is required). | |||
*# Functions that accept abbreviated form objects should (but don't always) indicate this by naming the parameter | |||
`abform` (for a single abbreviated form object) or `abforms` (for an abbreviated form list). | |||
*# Functions that accept a form value (the native-script string portion of a form object, stored for historical reasons | |||
in the `.form` property) should '''not''' call such a parameter `form`, but instead use something that makes clear | |||
that a form value is required, such as `formval` or sometimes just `val`. | |||
*# Similarly, functions that accept a form table should '''not''' call such a parameter `forms` (although for historical | |||
reasons the form table in an alternant multiword spec is stored in the field `forms`). Instead, use `formtable` or | |||
`formtab`, or similar name that makes clear that the value is a form table (i.e. a map from slot to form list). | |||
====Footnote handling==== | |||
Each form can have one or more attached footnotes. The form of a footnote as specified by the user and stored in form | |||
objects is e.g. {"[archaic]"} or {"[only in the meaning 'to succeed (an officeholder)']"}, i.e. the string must be | |||
surrounded by brackets and should begin with a lowercase letter and not end in a period/full stop. When such footnotes | |||
are converted to actual footnotes in a table of inflected forms, the brackets will be removed, the first letter will be | |||
capitalized and a period/full stop will be added to the end. (However, when such footnotes are used as qualifiers in | |||
headword lines, only the brackets will be removed, with no capitalization or final period.) | |||
When merging two forms into one, such as when concatenating the form objects of two inflected words in a multiword term | |||
or deduplicating form objects sharing the same form value during `show_forms()`, the footnotes are generally combined as | |||
well. This means that if one form object has footnotes and the other doesn't, the resulting form object inherits the | |||
footnotes of the object that has them, and if both form objects have footnotes, the resulting form object gets all | |||
footnotes from both source form objects, with duplicates removed. However, when inserting a form into a form table slot | |||
that already has a form whose form value and translit are identical to the new form, the behavior is different. Under | |||
normal circumstances the footnotes of the new form are ''not'' incorporated into those of the existing form (if | |||
any), but are simply dropped. To understand why this makes sense, consider a term that has two possible forms of its | |||
lemma (e.g. two forms differing in stress or in vowel length), where the second form is archaic, rare, colloquial or the | |||
like, and has an attached footnote indicating this. An example of this is {{m+|ru|кожух||sheepskin coat; bullet shell}}, | |||
where the form ''кожу́х'' with accent pattern ''b'' is more common overall but the form ''ко́жух'' with accent pattern | |||
''c(1)'' is more common among professionals. On first glance, this could be indicated using | |||
`((кожу́х<b>,ко́жух<c(1).[professional usage only]>))`. But some forms of these two declensions are the same (in | |||
particular, the genitive, dative, instrumental and prepositional plural). If for these slots, the footnotes of the | |||
duplicate forms were combined (i.e. the footnotes of the second declension pattern were added to the already-existing | |||
form taken from the first declension pattern), these forms would wrongly be labeled as ''professional usage only''. | |||
For this reason, it makes more sense to drop the footnotes of the second form when deduplicating. | |||
The same sort of behavior makes sense when a single lemma can have two different declensions, the second of which | |||
requires a footnote and where some forms in the two declensions are shared. An example of this is | |||
{{m+|uk|окови́та||strong, high-quality liquor}}, which can be inflected adjectivally or (rarely) nominally. This would be | |||
indicated as `((окови́та<sg.+>,окови́та<sg.[rare]>))` where the `+` indicates adjectival declension and the `sg` indicates | |||
that this term only exists in the singular. Here, the two declensions differ in the genitive, dative/locative and | |||
vocative (respectively, adjectival ''окови́тої'', ''окови́тій'', ''окови́та'' vs. nominal ''окови́ти'', ''окови́ті'', | |||
''окови́то'') but are the same in the accusative (''окови́ту'') and instrumental (''окови́тою''). Again, dropping the | |||
footnotes of the second form when deduplicating is correct and including them would be wrong. | |||
This behavior can be changed by attaching a '''footnote modifier''' to the footnote associated with the second form. A | |||
footnote modifier is a symbol attached to the beginning of a footnote, directly following the opening bracket. The | |||
following modifiers are currently recognized: | |||
* `!` or `+`: If placed on a footnote of the second form, combine that footnote with those of the first form (if any) | |||
rather than dropping it. | |||
* `*`: If placed on a footnote of the first form, drop that footnote when merging a second form with any footnotes. | |||
An example where the `*` modifier makes sense is a modification of the above example with {{m+|ru|кожух}}. If we | |||
notated it as `((кожу́х<b.[more common among laymen]>,ко́жух<c(1).[more common among professionals]>))`, the shared forms | |||
would wrongly have the footnote ''more common among laymen'' when in fact they are the only possible forms. If instead | |||
we used `((кожу́х<b.[*more common among laymen]>,ко́жух<c(1).[more common among professionals]>))`, the shared forms | |||
would correctly have no footnote. | |||
Finally, be aware of '''old-style footnote symbols'''. For compatibility reasons, some inflection implementations | |||
support a system whereby footnote symbols (consisting of numbers; certain ASCII symbols such as `*`, `~`, `@`, `#`, | |||
`+`, etc.; and a large number of Unicode symbols) are directly attached to form values and the footnotes themselves | |||
specified manually using the `footnotes` property passed to `show_forms()`. This is allowed only when | |||
`allow_footnote_symbols` is set and is highly deprecated. All uses of such symbols should be converted to standard | |||
footnotes and the support for such symbols removed. | |||
]==] | |||
| Line 104: | Line 247: | ||
-- Insert a form ( | --[==[ | ||
Insert a form object (see above) into a list of such objects. If the form is already present (i.e. both the form | |||
value and translit, if any, match), the footnotes of the existing and new form might be combined (specifically, | |||
footnotes in the new form beginning with `!` will be combined). | |||
]==] | |||
function export.insert_form_into_list(list, form) | function export.insert_form_into_list(list, form) | ||
-- Don't do anything if the form object or the form inside it is nil. This simplifies | -- Don't do anything if the form object or the form inside it is nil. This simplifies | ||
| Line 146: | Line 291: | ||
-- The behavior here has changed; track cases where the old behavior might | -- The behavior here has changed; track cases where the old behavior might | ||
-- be needed by adding ! to the footnote. | -- be needed by adding ! to the footnote. | ||
local any_footnotes_with_bang = false | local any_footnotes_with_bang = false | ||
for _, footnote in ipairs(form.footnotes) do | for _, footnote in ipairs(form.footnotes) do | ||
| Line 159: | Line 303: | ||
listform.footnotes = {} | listform.footnotes = {} | ||
else | else | ||
listform.footnotes = m_table. | listform.footnotes = m_table.shallowCopy(listform.footnotes) | ||
end | end | ||
for _, footnote in ipairs(form.footnotes) do | for _, footnote in ipairs(form.footnotes) do | ||
| Line 187: | Line 331: | ||
end | end | ||
-- Insert a form ( | --[==[ | ||
Insert a form object (see above) into the given slot in the given form table. ``form`` can be {nil}, in which case | |||
nothing happens. | |||
]==] | |||
function export.insert_form(formtable, slot, form) | function export.insert_form(formtable, slot, form) | ||
-- Don't do anything if the form object or the form inside it is nil. This simplifies | -- Don't do anything if the form object or the form inside it is nil. This simplifies | ||
| Line 203: | Line 349: | ||
-- Insert a list of | --[==[ | ||
Insert a list of form objects (see above) into the given slot in the given form table. ``forms`` can be {nil}, | |||
in which case nothing happens. | |||
]==] | |||
function export.insert_forms(formtable, slot, forms) | function export.insert_forms(formtable, slot, forms) | ||
if not forms then | if not forms then | ||
| Line 216: | Line 363: | ||
function export.identity( | --[==[ | ||
return | Identity mapping function. | ||
]==] | |||
function export.identity(formval, translit) | |||
return formval, translit | |||
end | end | ||
local function form_value_transliterable(formval) | |||
return formval ~= "?" and formval ~= "—" | |||
end | |||
local function call_map_function_str(str, fun) | local function call_map_function_str(str, fun) | ||
| Line 225: | Line 379: | ||
return "?" | return "?" | ||
end | end | ||
local | local newformval, newtranslit = fun(str) | ||
if newtranslit then | if newtranslit then | ||
return {form= | return {form=newformval, translit=newtranslit} | ||
else | else | ||
return | return newformval | ||
end | end | ||
end | end | ||
-- FIXME: This doesn't correctly handle metadata. | |||
local function call_map_function_obj(form, fun) | local function call_map_function_obj(form, fun) | ||
if form.form == "?" then | if form.form == "?" then | ||
return {form = "?", footnotes = form.footnotes} | return {form = "?", footnotes = form.footnotes} | ||
end | end | ||
local | local newformval, newtranslit = fun(form.form, form.translit) | ||
return {form= | return {form = newformval, translit = newtranslit, footnotes = form.footnotes} | ||
end | end | ||
-- Map a function over the form values in | --[==[ | ||
Map a function over the form values in ``forms`` (a list of form objects in "general list form"; see above). If an | |||
input form value is {"?"}, it is preserved on output and the function is not called. Otherwise, the function is | |||
called with two arguments, the original form and manual translit; if manual translit isn't relevant, it's fine to | |||
declare the function with only one argument. The return value is either a single value (the new form) or two values | |||
(the new form and new manual translit). The footnotes (if any) from the input form objects are preserved on output. Uses | |||
`insert_form_into_list()` to insert the resulting form objects into the returned list in case two different forms map | |||
to the same thing. | |||
FIXME: Expand this to correctly handle metadata, or create a variant that correctly handles metadata. | |||
]==] | |||
function export.map_forms(forms, fun) | function export.map_forms(forms, fun) | ||
if not forms then | if not forms then | ||
| Line 262: | Line 420: | ||
-- Map a list-returning function over the form values in | --[==[ | ||
Map a list-returning function over the form values in ``forms`` (a list of form objects in "general list form"; see | |||
above). If an input form value is {"?"}, it is preserved on output and the function is not called. Otherwise, the | |||
function is called with two arguments, the original form and manual translit; if manual translit isn't relevant, it's | |||
fine to declare the function with only one argument. The return value of the function can be {nil} or an abbreviated | |||
form list (i.e. anything that is convertible into a general list form, such as a single form value, a list of form | |||
values, a form object or a list of form objects). For each form object in the return value, the footnotes of that form | |||
object (if any) are combined with any footnotes from the input form object, and the result inserted into the returned | |||
list using `insert_form_into_list()` in case two different forms map to the same thing. | |||
FIXME: Expand this to correctly handle metadata, or create a variant that correctly handles metadata. | |||
]==] | |||
function export.flatmap_forms(forms, fun) | function export.flatmap_forms(forms, fun) | ||
if not forms then | if not forms then | ||
| Line 276: | Line 439: | ||
for _, form in ipairs(forms) do | for _, form in ipairs(forms) do | ||
local funret = form.form == "?" and {"?"} or fun(form.form, form.translit) | local funret = form.form == "?" and {"?"} or fun(form.form, form.translit) | ||
for _, fr in ipairs(funret) do | if funret then | ||
funret = export.convert_to_general_list_form(funret) | |||
for _, fr in ipairs(funret) do | |||
local newform = { | |||
form = fr.form, | |||
newform | translit = fr.translit, | ||
footnotes = export.combine_footnotes(form.footnotes, fr.footnotes) | |||
} | |||
export.insert_form_into_list(retval, newform) | |||
end | end | ||
end | end | ||
end | end | ||
| Line 290: | Line 455: | ||
-- Map a function over the form values in | --[==[ | ||
Map a function over the form values in ``abforms`` (an abbreviated form list). If the input form value is {"?"}, it is | |||
preserved on output and the function is not called. If ``first_only`` is given and ``abforms`` is a list, only map over | |||
the first element. Return value is of the same form as ``abforms``, unless ``abforms`` is a string and the function | |||
returns both form value and manual translit (in which case the return value is a form object). The function is called | |||
with two arguments, the original form value and manual translit; if manual translit isn't relevant, it's fine to declare | |||
the function with only one argument. The return value is either a single value (the new form value) or two values (the | |||
new form value and new manual translit). The footnotes (if any) from the input form objects are preserved on output. | |||
FIXME: This function is used only in [[Module:bg-verb]] and should be moved into that module. | |||
function export.map_form_or_forms( | ]==] | ||
if not | function export.map_form_or_forms(abforms, fun, first_only) | ||
if not abforms then | |||
return nil | return nil | ||
elseif type( | elseif type(abforms) == "string" then | ||
return call_map_function_str( | return call_map_function_str(abforms, fun) | ||
elseif | elseif abforms.form then | ||
return call_map_function_obj( | return call_map_function_obj(abforms, fun) | ||
else | else | ||
local retval = {} | local retval = {} | ||
for i, | for i, abform in ipairs(abforms) do | ||
if first_only then | if first_only then | ||
return export.map_form_or_forms( | return export.map_form_or_forms(abform, fun) | ||
end | end | ||
table.insert(retval, export.map_form_or_forms( | table.insert(retval, export.map_form_or_forms(abform, fun)) | ||
end | end | ||
return retval | return retval | ||
| Line 320: | Line 486: | ||
-- Combine two sets of footnotes. If either is nil, just return the other, and if both are nil, return nil. | --[==[ | ||
Combine two sets of footnotes. If either is {nil}, just return the other, and if both are {nil}, return {nil}. | |||
]==] | |||
function export.combine_footnotes(notes1, notes2) | function export.combine_footnotes(notes1, notes2) | ||
if not notes1 and not notes2 then | if not notes1 and not notes2 then | ||
| Line 331: | Line 499: | ||
return notes1 | return notes1 | ||
end | end | ||
local combined = m_table. | local combined = m_table.shallowCopy(notes1) | ||
for _, note in ipairs(notes2) do | for _, note in ipairs(notes2) do | ||
m_table.insertIfNot(combined, note) | m_table.insertIfNot(combined, note) | ||
| Line 339: | Line 507: | ||
-- Expand a given footnote (as specified by the user, including the surrounding brackets) into the form to be inserted | --[==[ | ||
Expand a given footnote (as specified by the user, including the surrounding brackets) into the form to be inserted | |||
into the final generated table. If ``no_parse_refs`` is not given and the footnote is a reference (of the form | |||
{"[ref:...]"}), parse and return the specified reference(s). Two values are returned, `footnote_string` (the expanded | |||
footnote, or nil if the second value is present) and `references` (a list of objects of the form | |||
`{text = ``text``, name = ``name``, group = ``group``}` if the footnote is a reference and ``no_parse_refs`` is not | |||
given, otherwise {nil}). Unless ``return_raw`` is given, the returned footnote string is capitalized and has a final | |||
period added. | |||
]==] | |||
function export.expand_footnote_or_references(note, return_raw, no_parse_refs) | function export.expand_footnote_or_references(note, return_raw, no_parse_refs) | ||
local _, notetext = extract_footnote_modifiers(note) | local _, notetext = extract_footnote_modifiers(note) | ||
| Line 361: | Line 532: | ||
notetext = footnote_abbrevs[notetext] | notetext = footnote_abbrevs[notetext] | ||
else | else | ||
local split_notes = | local split_notes = split(notetext, "<(.-)>") | ||
for i, split_note in ipairs(split_notes) do | for i, split_note in ipairs(split_notes) do | ||
if i % 2 == 0 then | if i % 2 == 0 then | ||
| Line 376: | Line 547: | ||
notetext = table.concat(split_notes) | notetext = table.concat(split_notes) | ||
end | end | ||
return return_raw and notetext or | return return_raw and notetext or ucfirst(notetext) .. "." | ||
end | end | ||
function export. | --[==[ | ||
Convert a list of footnotes to qualifiers and references for use in [[Module:headword]] or similar. Returns two values, | |||
a list of qualifiers (possibly {nil}) and a list of reference structures (possibly {nil}), following the structure | |||
defined in [[Module:references]]. | |||
]==] | |||
function export.convert_footnotes_to_qualifiers_and_references(footnotes) | |||
if not footnotes then | if not footnotes then | ||
return nil | return nil | ||
| Line 415: | Line 583: | ||
-- Combine | --[==[ | ||
Combine an abbreviated form object (either a string or a table) with additional footnotes, possibly replacing the form | |||
value and/or translit in the process. Normally called in one of two ways: | |||
(1) `combine_form_and_footnotes(``abform``, ``addl_footnotes``, ``new_formval``, ``new_translit``)` where ``abform`` | |||
is an existing abbreviated form object; ``addl_footnotes`` is either {nil}, a single string (a footnote) or a list | |||
of footnotes; ``new_formval`` is either {nil} or the new form value to substitute; and ``new_translit`` is either | |||
{nil} or the new translit string to substitute. | |||
(2) `combine_form_and_footnotes(``form_value``, ``footnotes``)`, where ``form_value`` is a form value (a string) and | |||
``footnotes`` is either {nil}, a single string (a footnote) or a list of footnotes. | |||
In either case, a form object is returned, preserving as many properties as possible from any existing form object in | |||
``abform``. Do the minimal amount of work; e.g. if ``abform`` is a form object and ``addl_footnotes``, ``new_formval`` | |||
and ``new_translit`` are all {nil}, the same object as passed in is returned. Under no circumstances is the existing | |||
function export.combine_form_and_footnotes( | form object side-effected. | ||
'''FIXME:''' This does not correctly preserve metadata. | |||
]==] | |||
function export.combine_form_and_footnotes(abform, addl_footnotes, new_formval, new_translit) | |||
if type(addl_footnotes) == "string" then | if type(addl_footnotes) == "string" then | ||
addl_footnotes = {addl_footnotes} | addl_footnotes = {addl_footnotes} | ||
end | end | ||
if not addl_footnotes and not | if not addl_footnotes and not new_formval and not new_translit then | ||
return | return abform | ||
end | end | ||
if type( | if type(abform) == "string" then | ||
new_formval = new_formval or abform | |||
return {form = | return {form = new_formval, translit = new_translit, footnotes = addl_footnotes} | ||
end | end | ||
abform = m_table.shallowCopy(abform) | |||
if | if new_formval then | ||
abform.form = new_formval | |||
end | end | ||
if new_translit then | if new_translit then | ||
abform.translit = new_translit | |||
end | end | ||
if addl_footnotes then | if addl_footnotes then | ||
abform.footnotes = export.combine_footnotes(abform.footnotes, addl_footnotes) | |||
end | end | ||
return | return abform | ||
end | end | ||
-- | --[==[ | ||
Convert an abbreviated form list (either a string, form object, or list of either) into general list form. If | |||
``footnotes`` is supplied, then for each form in the form list, combine the form's footnotes with ``footnotes``. | |||
This function does not side-effect any of the objects passed into ``abforms``, but will return ``abforms`` | |||
function export.convert_to_general_list_form( | unchanged if already in general list form and ``footnotes`` is {nil}. | ||
'''FIXME:''' This does not correctly preserve metadata. | |||
]==] | |||
function export.convert_to_general_list_form(abforms, footnotes) | |||
if type(footnotes) == "string" then | if type(footnotes) == "string" then | ||
footnotes = {footnotes} | footnotes = {footnotes} | ||
end | end | ||
if type( | if type(abforms) == "string" then | ||
return {{form = | return {{form = abforms, footnotes = footnotes}} | ||
elseif | elseif abforms.form then | ||
return {export.combine_form_and_footnotes( | return {export.combine_form_and_footnotes(abforms, footnotes)} | ||
elseif not footnotes then | elseif not footnotes then | ||
-- Check if already in general list form and return directly if so. | -- Check if already in general list form and return directly if so. | ||
local must_convert = false | local must_convert = false | ||
for _, form in ipairs( | for _, form in ipairs(abforms) do | ||
if type(form) == "string" then | if type(form) == "string" then | ||
must_convert = true | must_convert = true | ||
| Line 475: | Line 651: | ||
end | end | ||
if not must_convert then | if not must_convert then | ||
return | return abforms | ||
end | end | ||
end | end | ||
local retval = {} | local retval = {} | ||
for _, form in ipairs( | for _, form in ipairs(abforms) do | ||
if type(form) == "string" then | if type(form) == "string" then | ||
table.insert(retval, {form = form, footnotes = footnotes}) | table.insert(retval, {form = form, footnotes = footnotes}) | ||
| Line 500: | Line 676: | ||
-- Combine `stems` and `endings` and store into slot `slot` of form table ` | local function lang_or_func_transliterate(func, lang, text) | ||
local retval | |||
if func then | |||
retval = func(text) | |||
else | |||
retval = (lang:transliterate(text)) | |||
end | |||
-- FIXME! Hack to work around bug in ...:transliterate(). Remove me as soon as this bug is fixed. | |||
function export.add_forms( | if not retval and (text == " " or text == "-" or text == "?") then | ||
retval = text | |||
end | |||
if not retval then | |||
error(("Unable to transliterate text '%s'"):format(text)) | |||
end | |||
return retval | |||
end | |||
--[==[ | |||
Combine ``stems`` and ``endings`` and store into slot ``slot`` of form table ``formtable``. Either of ``stems`` and | |||
``endings`` can be {nil} or an abbreviated form list. The combination of a given stem and ending happens using | |||
``combine_stem_ending``, which takes two parameters (stem and ending, each a string) and returns one value (a string). | |||
If manual transliteration is present in either ``stems`` or ``endings``, ``lang`` (a language object or a function of | |||
one argument to transliterate a string) along with ``combine_stem_ending_tr`` (a function for combining manual | |||
transliterations that works much like ``combine_stem_ending``) must be given. ``footnotes``, if specified, is a list of | |||
additional footnotes to attach to the resulting inflections (stem+ending combinations). The resulting inflections are | |||
inserted into the form table using `insert_form()`, in case of duplication. | |||
]==] | |||
function export.add_forms(formtable, slot, stems, endings, combine_stem_ending, lang, combine_stem_ending_tr, footnotes) | |||
if stems == nil or endings == nil then | if stems == nil or endings == nil then | ||
return | return | ||
| Line 517: | Line 713: | ||
end | end | ||
return combine_stem_ending(stem, ending) | return combine_stem_ending(stem, ending) | ||
end | |||
local function transliterate(text) | |||
return lang_or_func_transliterate(is_callable(lang) and lang or nil, lang, text) | |||
end | end | ||
if type(stems) == "string" and type(endings) == "string" then | if type(stems) == "string" and type(endings) == "string" then | ||
export.insert_form( | export.insert_form(formtable, slot, {form = combine(stems, endings), footnotes = footnotes}) | ||
elseif type(stems) == "string" and is_table_of_strings(endings) then | elseif type(stems) == "string" and is_table_of_strings(endings) then | ||
for _, ending in ipairs(endings) do | for _, ending in ipairs(endings) do | ||
export.insert_form( | export.insert_form(formtable, slot, {form = combine(stems, ending), footnotes = footnotes}) | ||
end | end | ||
else | else | ||
| Line 531: | Line 730: | ||
local footnotes = nil | local footnotes = nil | ||
if stem.footnotes and ending.footnotes then | if stem.footnotes and ending.footnotes then | ||
footnotes = m_table. | footnotes = m_table.shallowCopy(stem.footnotes) | ||
for _, footnote in ipairs(ending.footnotes) do | for _, footnote in ipairs(ending.footnotes) do | ||
m_table.insertIfNot(footnotes, footnote) | m_table.insertIfNot(footnotes, footnote) | ||
| Line 546: | Line 745: | ||
error("Internal error: With manual translit, 'lang' and 'combine_stem_ending_tr' must be passed to 'add_forms'") | error("Internal error: With manual translit, 'lang' and 'combine_stem_ending_tr' must be passed to 'add_forms'") | ||
end | end | ||
local stem_tr = stem.translit or | local stem_tr = stem.translit or transliterate(m_links.remove_links(stem.form)) | ||
local ending_tr = ending.translit or | local ending_tr = ending.translit or transliterate(m_links.remove_links(ending.form)) | ||
new_translit = combine_stem_ending_tr(stem_tr, ending_tr) | new_translit = combine_stem_ending_tr(stem_tr, ending_tr) | ||
end | end | ||
export.insert_form( | export.insert_form(formtable, slot, {form = new_form, translit = new_translit, footnotes = footnotes}) | ||
end | end | ||
end | end | ||
| Line 557: | Line 756: | ||
function export.add_multiple_forms( | --[==[ | ||
Combine any number of form components and store into slot ``slot`` of form table ``formtable``. ``components`` is a list of abbreviated form | |||
if # | lists which should be concatenated similarly to how `add_forms()` does it, and stored in ``slot`` along with any footnotes in ``footnotes``. | ||
More specifically: | |||
# If there are no components, nothing happens. | |||
# If there is one component, it is converted to general list form and `insert_forms()` is called. | |||
# If there are two components, they are treated as stems and endings respectively and `add_forms()` is called. | |||
# If there are three or more components, they are concatenated left-to-right in the manner of a `reduce()` operation: the first two components | |||
are combined using `add_forms()` and stored into a temporary table, then the next component is combined with the result of the previous | |||
operation, etc. In the last combination, footnotes in `footnotes` are combined in, and the result stored into `formtable`. | |||
This should generally be used when you are likely to have three or more components, as in [[Module:ar-verb]] (prefixes, stems and endings) | |||
and [[Module:de-verb]] (which in some situations has five components combined together). ``combine_stem_ending``, ``lang``, | |||
``combine_stem_ending_tr`` and ``footnotes`` are as in `add_forms()`. | |||
]==] | |||
function export.add_multiple_forms(formtable, slot, components, combine_stem_ending, lang, combine_stem_ending_tr, footnotes) | |||
if #components == 0 then | |||
return | return | ||
elseif # | elseif #components == 1 then | ||
local | local forms = export.convert_to_general_list_form(components[1], footnotes) | ||
export.insert_forms( | export.insert_forms(formtable, slot, forms) | ||
elseif # | elseif #components == 2 then | ||
local stems = | local stems = components[1] | ||
local endings = | local endings = components[2] | ||
export.add_forms( | export.add_forms(formtable, slot, stems, endings, combine_stem_ending, lang, combine_stem_ending_tr, footnotes) | ||
else | else | ||
local prev = | local prev = components[1] | ||
for i=2,# | for i=2, #components do | ||
local | local temptable = {} | ||
export.add_forms( | export.add_forms(temptable, slot, prev, components[i], combine_stem_ending, lang, combine_stem_ending_tr, | ||
i == #components and footnotes or nil) | |||
prev = | prev = temptable[slot] | ||
end | end | ||
export.insert_forms( | export.insert_forms(formtable, slot, prev) | ||
end | end | ||
end | end | ||
| Line 584: | Line 795: | ||
local function iterate_slot_list_or_table(props, do_slot) | local function iterate_slot_list_or_table(props, do_slot) | ||
if props.slot_list then | if props.slot_list then | ||
for _, | for _, slot_and_accel_tag_set in ipairs(props.slot_list) do | ||
local slot, | local slot, accel_tag_set = unpack(slot_and_accel_tag_set) | ||
do_slot(slot, | do_slot(slot, accel_tag_set) | ||
end | end | ||
else | else | ||
for slot, | for slot, accel_tag_set in pairs(props.slot_table) do | ||
do_slot(slot, | do_slot(slot, accel_tag_set) | ||
end | end | ||
end | end | ||
| Line 596: | Line 807: | ||
local function parse_before_or_post_text(props, text, segments, lemma_is_last | function export.default_split_bracketed_runs_into_words(bracketed_runs, data) | ||
-- If the text begins with a hyphen, include the hyphen in the set of allowed characters | |||
-- for an inflected segment. This way, e.g. conjugating "-ir" is treated as a regular | |||
-- -ir verb rather than a hyphen + irregular [[ir]]. | |||
local is_suffix = (not data or data.text_index == 1) and rfind(bracketed_runs[1], "^%-") | |||
local split_pattern = is_suffix and " " or "[ %-]" | |||
return put.split_alternating_runs(bracketed_runs, split_pattern, "preserve splitchar") | |||
end | |||
local function props_transliterate(props, text) | |||
return lang_or_func_transliterate(props.transliterate, props.lang, text) | |||
end | |||
local function parse_before_or_post_text(data) | |||
local props, text, text_index, segments, lemma_is_last = data.props, data.text, data.text_index, data.segments, | |||
data.lemma_is_last | |||
-- Call parse_balanced_segment_run() to keep multiword links together. | -- Call parse_balanced_segment_run() to keep multiword links together. | ||
local bracketed_runs = put.parse_balanced_segment_run(text, "[", "]") | local bracketed_runs = put.parse_balanced_segment_run(text, "[", "]") | ||
| Line 606: | Line 834: | ||
end | end | ||
if not space_separated_groups then | if not space_separated_groups then | ||
space_separated_groups = export.default_split_bracketed_runs_into_words(bracketed_runs, data) | |||
space_separated_groups = | |||
end | end | ||
| Line 631: | Line 854: | ||
end | end | ||
saw_manual_translit = true | saw_manual_translit = true | ||
local split = | local split = split(component, "//", "plain") | ||
if #split ~= 2 then | if #split ~= 2 then | ||
error("Term with translit or respelling should have only one // in it: " .. component) | error("Term with translit or respelling should have only one // in it: " .. component) | ||
| Line 651: | Line 874: | ||
for j, parsed_component in ipairs(parsed_components) do | for j, parsed_component in ipairs(parsed_components) do | ||
if not parsed_components_translit[j] then | if not parsed_components_translit[j] then | ||
parsed_components_translit[j] = | parsed_components_translit[j] = props_transliterate(props, m_links.remove_links(parsed_component)) | ||
end | end | ||
end | end | ||
| Line 673: | Line 895: | ||
The return value is a table of the form | The return value is a table of the form | ||
{ | { | ||
word_specs = { | word_specs = {``word_spec``, ``word_spec``, ...}, | ||
post_text = " | post_text = "``text-at-end``", | ||
post_text_no_links = " | post_text_no_links = "``text-at-end-no-links``", | ||
post_text_translit = " | post_text_translit = "``manual-translit-of-text-at-end``" or nil (if no manual translit or respelling was specified in the post-text) | ||
} | } | ||
where | where ``word_spec`` describes an individual inflected word and "``text-at-end``" is any raw text that may occur | ||
after all inflected words. Individual words or linked text (including multiword text) may be given manual | after all inflected words. Individual words or linked text (including multiword text) may be given manual | ||
transliteration or respelling in languages that support this using | transliteration or respelling in languages that support this using ``text``//``translit`` or ``text``//``respelling``. | ||
Each | Each ``word_spec`` is of the form returned by parse_indicator_spec(): | ||
{ | { | ||
lemma = " | lemma = "``lemma``", | ||
before_text = " | before_text = "``text-before-word``", | ||
before_text_no_links = " | before_text_no_links = "``text-before-word-no-links``", | ||
before_text_translit = " | before_text_translit = "``manual-translit-of-text-before-word``" or nil (if no manual translit or respelling was specified in the before-text) | ||
-- Fields as described in parse_indicator_spec() | -- Fields as described in parse_indicator_spec() | ||
... | ... | ||
| Line 747: | Line 969: | ||
for i = 2, #segments - 1, 2 do | for i = 2, #segments - 1, 2 do | ||
local before_text, before_text_translit, lemma = | local before_text, before_text_translit, lemma = | ||
parse_before_or_post_text | parse_before_or_post_text { | ||
props = props, | |||
text = segments[i - 1], | |||
text_index = i - 1, | |||
segments = segments, | |||
lemma_is_last = true | |||
} | |||
local base = props.parse_indicator_spec(segments[i], lemma) | local base = props.parse_indicator_spec(segments[i], lemma) | ||
base.before_text = before_text | base.before_text = before_text | ||
| Line 756: | Line 984: | ||
end | end | ||
multiword_spec.post_text, multiword_spec.post_text_translit = | multiword_spec.post_text, multiword_spec.post_text_translit = | ||
parse_before_or_post_text | parse_before_or_post_text { | ||
props = props, | |||
text = segments[#segments], | |||
text_index = #segments, | |||
segments = segments, | |||
lemma_is_last = false | |||
} | |||
multiword_spec.post_text_no_links = m_links.remove_links(multiword_spec.post_text) | multiword_spec.post_text_no_links = m_links.remove_links(multiword_spec.post_text) | ||
return multiword_spec | return multiword_spec | ||
| Line 766: | Line 1,000: | ||
The return value is a table of the form | The return value is a table of the form | ||
{ | { | ||
alternants = { | alternants = {``multiword_spec``, ``multiword_spec``, ...} | ||
} | } | ||
where | where ``multiword_spec`` describes a given alternant and is as returned by parse_multiword_spec(). | ||
]=] | ]=] | ||
local function parse_alternant(alternant, props) | local function parse_alternant(alternant, props) | ||
| Line 784: | Line 1,018: | ||
--[=[ | --[==[ | ||
Top-level parsing function. Parse text describing one or more inflected words. | Top-level parsing function. Parse text describing one or more inflected words. `text` is the inflected text to parse, | ||
`text` is the inflected text to parse, which generally has <...> specs following words to | which generally has `<...>` specs following words to be inflected, and may have alternants indicated using double | ||
be inflected, and may have alternants indicated using double parens. Examples: | parens. Examples: | ||
"[[медичний|меди́чна]]<+> [[сестра́]]<*,*#.pr>" (Ukrainian, for | * {"[[медичний|меди́чна]]<+> [[сестра́]]<*,*#.pr>"} (Ukrainian, for {{m|uk|меди́чна сестра́||nurse|lit=medical sister}}); | ||
"((ру́син<pr>,руси́н<b.pr>))" (Ukrainian, for | * {"((ру́син<pr>,руси́н<b.pr>))"} (Ukrainian, for {{m|uk|русин||Rusyn}}, with two possible stress patterns); | ||
"पंचायती//पंचाय*ती राज<M>" (Hindi, for | * {"पंचायती//पंचाय*ती राज<M>"} (Hindi, for {{m|hi|पंचायती राज||village council}}, with phonetic respelling in the | ||
"((<M>,<M.plstem:फ़तूह.dirpl:फ़तूह>))" (Hindi, for | before-text component); | ||
"" (for any number of Hindi adjectives, where the lemma is omitted and taken from the pagename, and the angle bracket spec <> is assumed) | * {"((<M>,<M.plstem:फ़तूह.dirpl:फ़तूह>))"} (Hindi, for {{m|hi|फ़तह||win, victory}} when used on that page, where the lemma | ||
"काला<+>धन<M>" (Hindi, for | is omitted and taken from the pagename); | ||
* {""} (for any number of Hindi adjectives, where the lemma is omitted and taken from the pagename, and the angle | |||
bracket spec <> is assumed); | |||
* {"काला<+>धन<M>"} (Hindi, for {{m|hi|कालाधन||black money}}, showing that closed compounds where each part is declined | |||
can be correctly handled). | |||
`props` is an object specifying properties used during parsing, as follows: | `props` is an object specifying properties used during parsing, as follows: | ||
{ | |||
parse_indicator_spec = | ```{ | ||
lang = | parse_indicator_spec = __function__(``angle_bracket_spec``, ``lemma``) `''(required)''`, | ||
transliterate_respelling = | lang = __lang object__, | ||
split_bracketed_runs_into_words = | transliterate_respelling = __function__(``respelling_or_translit``) `''(optional)''`, | ||
allow_default_indicator = | split_bracketed_runs_into_words = __function__(``bracket_split_runs``) `''(optional)''`, | ||
angle_brackets_omittable = | allow_default_indicator = __boolean__, | ||
allow_blank_lemma = | angle_brackets_omittable = __boolean__, | ||
} | allow_blank_lemma = __boolean__, | ||
}``` | |||
`parse_indicator_spec` is a required function that takes two arguments, a string surrounded by angle brackets and the | `parse_indicator_spec` is a required function that takes two arguments, a string surrounded by angle brackets and the | ||
lemma, and should return | lemma, and should return an arbitrary object containing properties describing the indicators inside of the angle | ||
brackets). | brackets. This object is often called a '''base''' and given the argument name `base` in inflection code. | ||
`lang` is the language object for the language in question; only needed if manual translit or respelling may be present | `lang` is the language object for the language in question; only needed if manual translit or respelling may be present | ||
using //. | using `//`. | ||
`transliterate_respelling` is a function that is only needed if respelling is allowed in place of manual translit after | `transliterate_respelling` is a function that is only needed if respelling is allowed in place of manual translit after | ||
//. It takes one argument, the respelling or translit, and should return the transliteration of any respelling but | `//`. It takes one argument, the respelling or translit, and should return the transliteration of any respelling but | ||
return any translit unchanged. | return any translit unchanged. | ||
| Line 822: | Line 1,061: | ||
inflected is vs. surrounding fixed text. It takes one argument, the result of splitting the original text on brackets, | inflected is vs. surrounding fixed text. It takes one argument, the result of splitting the original text on brackets, | ||
and should return alternating runs of words and split characters, or nil to apply the default algorithm. Specifically, | and should return alternating runs of words and split characters, or nil to apply the default algorithm. Specifically, | ||
the value passed in is the result of calling `parse_balanced_segment_run(text, "[", "]")` from | the value passed in is the result of calling `parse_balanced_segment_run(``text``, "[", "]")` from | ||
[[Module:parse utilities]] on the original text, and the default version of this function calls | [[Module:parse utilities]] on the original text, and the default version of this function calls | ||
`split_alternating_runs(bracketed_runs, pattern, "preserve splitchar")`, where `bracketed_runs` is the value passed in | `split_alternating_runs(``bracketed_runs``, ``pattern``, "preserve splitchar")`, where ``bracketed_runs`` is the value | ||
and `pattern` splits on either spaces or hyphens (unless the text begins with a hyphen, in which case splitting is only | passed in and ``pattern`` splits on either spaces or hyphens (unless the text begins with a hyphen, in which case | ||
on spaces, so that suffixes can be inflected). | splitting is only on spaces, so that suffixes can be inflected). | ||
`allow_default_indicator` should be true if an empty indicator in angle brackets <> can be omitted and should be | `allow_default_indicator` should be {true} if an empty indicator in angle brackets `<>` can be omitted and should be | ||
automatically added at the end of the multiword text (if no alternants) or at the end of each alternant (if alternants | automatically added at the end of the multiword text (if no alternants) or at the end of each alternant (if alternants | ||
present). | present). | ||
`angle_brackets_omittable` should be true if angle brackets can be omitted around a non-empty indicator in the presence | `angle_brackets_omittable` should be {true} if angle brackets can be omitted around a non-empty indicator in the | ||
of a blank lemma. In this case, if the combined indicator spec has no angle brackets, they will be added around the | presence of a blank lemma. In this case, if the combined indicator spec has no angle brackets, they will be added around | ||
indicator (or around all indicators, if alternants are present). This only makes sense when `allow_blank_lemma` is | the indicator (or around all indicators, if alternants are present). This only makes sense when `allow_blank_lemma` is | ||
specified. | specified. | ||
`allow_blank_lemma` should be true if a blank lemma is allowed; in such a case, the calling function should | `allow_blank_lemma` should be {true} if a blank lemma is allowed; in such a case, the calling function should | ||
substitute a default lemma, typically taken from the pagename. | substitute a default lemma, typically taken from the pagename. | ||
The return value is a table of the form | The return value is a table referred to as an '''alternant multiword spec''', and is of the form | ||
{ | |||
alternant_or_word_specs = { | ```{ | ||
post_text = " | alternant_or_word_specs = {``alternant_or_word_spec``, ``alternant_or_word_spec``, ...}, | ||
post_text_no_links = " | post_text = "``text_at_end``", | ||
post_text_translit = " | post_text_no_links = "``text_at_end_no_links``", | ||
} | post_text_translit = "``translit_of_text_at_end``" `(or nil)`, | ||
}``` | |||
where `alternant_or_word_spec` is either an '''alternant spec''' as returned by `parse_alternant()` or a | |||
'''multiword spec''' as described in the comment above `parse_multiword_spec()`. An alternant spec looks as follows: | |||
```{ | |||
alternants = {``multiword_spec``, ``multiword_spec``, ...}, | |||
before_text = "``text_before_alternant``", | |||
before_text_no_links = "``text_before_alternant_no_links``", | |||
before_text_translit = "``translit_of_text_before_alternant``" `(or nil)`, | |||
}``` | |||
i.e. it is like what is returned by `parse_alternant()` but has extra `before_text`, `before_text_no_links` and `before_text_translit` fields. | |||
]==] | |||
i.e. it is like what is returned by parse_alternant() but has extra `before_text` | |||
and `before_text_no_links` fields. | |||
]=] | |||
function export.parse_inflected_text(text, props) | function export.parse_inflected_text(text, props) | ||
if props.angle_brackets_omittable and not props.allow_blank_lemma then | if props.angle_brackets_omittable and not props.allow_blank_lemma then | ||
| Line 865: | Line 1,105: | ||
end | end | ||
local alternant_multiword_spec = {alternant_or_word_specs = {}} | local alternant_multiword_spec = {alternant_or_word_specs = {}} | ||
local alternant_segments = | local alternant_segments = split(text, "(%(%(.-%)%))") | ||
local last_post_text, last_post_text_no_links, last_post_text_translit | local last_post_text, last_post_text_no_links, last_post_text_translit | ||
for i = 1, #alternant_segments do | for i = 1, #alternant_segments do | ||
| Line 891: | Line 1,131: | ||
alternant_multiword_spec.post_text_no_links = last_post_text_no_links | alternant_multiword_spec.post_text_no_links = last_post_text_no_links | ||
alternant_multiword_spec.post_text_translit = last_post_text_translit | alternant_multiword_spec.post_text_translit = last_post_text_translit | ||
-- Save boolean properties from `props`. We need at least `allow_default_indicator` when implementing | |||
-- `reconstruct_original_spec()`. | |||
alternant_multiword_spec.allow_default_indicator = props.allow_default_indicator | |||
alternant_multiword_spec.angle_brackets_omittable = props.angle_brackets_omittable | |||
alternant_multiword_spec.allow_blank_lemma = props.allow_blank_lemma | |||
return alternant_multiword_spec | return alternant_multiword_spec | ||
end | end | ||
-- Inflect alternants in | -- Inflect alternants in ``alternant_spec`` (an object as returned by parse_alternant()). | ||
-- This sets the form values in ` | -- This sets the form values in ```alternant_spec``.forms` for all slots. | ||
-- (If a given slot has no values, it will not be present in ` | -- (If a given slot has no values, it will not be present in ```alternant_spec``.forms`). | ||
local function inflect_alternants(alternant_spec, props) | local function inflect_alternants(alternant_spec, props) | ||
alternant_spec.forms = {} | alternant_spec.forms = {} | ||
| Line 913: | Line 1,158: | ||
--[=[ | --[=[ | ||
Subfunction of | Subfunction of `inflect_multiword_or_alternant_multiword_spec()`. This is used in building up the inflections of | ||
multiword expressions. The basic purpose of this function is to append a set of forms representing the inflections of | multiword expressions. The basic purpose of this function is to append a set of forms representing the inflections of | ||
a given inflected term in a given slot onto the existing forms for that slot. Given a multiword expression potentially | a given inflected term in a given slot onto the existing forms for that slot. Given a multiword expression potentially | ||
| Line 922: | Line 1,167: | ||
Specifically, `formtable` is a table of per-slot forms, where the key is a slot and the value is a list of form objects | Specifically, `formtable` is a table of per-slot forms, where the key is a slot and the value is a list of form objects | ||
(objects of the form {form= | (objects of the form {form=``form``, translit=``manual_translit``, footnotes=``footnotes``}). `slot` is the slot in question. | ||
`forms` specifies the forms to be appended onto the existing forms, and is likewise a list of form objects. `props` | `forms` specifies the forms to be appended onto the existing forms, and is likewise a list of form objects. `props` | ||
is the same as in | is the same as in `inflect_multiword_or_alternant_multiword_spec()`. `before_text` is the fixed text that goes before | ||
the forms to be added. `before_text_no_links` is the same as `before_text` but with any links (i.e. hyperlinks of the | |||
of the form [[ | form [[``term``]] or [[``term``|``display``]]) converted into raw terms using remove_links() in [[Module:links]], and | ||
`before_text_translit` is optional manual translit of `before_text_no_links`. | `before_text_translit` is optional manual translit of `before_text_no_links`. | ||
| Line 945: | Line 1,190: | ||
-- Reject combination due to non-matching variant codes. | -- Reject combination due to non-matching variant codes. | ||
else | else | ||
local | local new_formval | ||
local new_translit | local new_translit | ||
if old_form.form == "?" or form.from == "?" then | if old_form.form == "?" or form.from == "?" then | ||
new_formval = "?" | |||
else | else | ||
new_formval = old_form.form .. before_text .. form.form | |||
if old_form.translit or before_text_translit or form.translit then | if old_form.translit or before_text_translit or form.translit then | ||
if not props.lang then | if not props.lang then | ||
| Line 956: | Line 1,201: | ||
end | end | ||
if not before_text_translit then | if not before_text_translit then | ||
before_text_translit = (props | before_text_translit = props_transliterate(props, before_text_no_links) or "" | ||
end | end | ||
local old_translit = old_form.translit or (props | local old_translit = | ||
local translit = form.translit or (props | old_form.translit or props_transliterate(props, m_links.remove_links(old_form.form)) or "" | ||
local translit = | |||
form.translit or props_transliterate(props, m_links.remove_links(form.form)) or "" | |||
new_translit = old_translit .. before_text_translit .. translit | new_translit = old_translit .. before_text_translit .. translit | ||
end | end | ||
end | end | ||
local new_formobj | |||
local new_footnotes = export.combine_footnotes(old_form.footnotes, form.footnotes) | local new_footnotes = export.combine_footnotes(old_form.footnotes, form.footnotes) | ||
if new_formval == form.form and new_translit == form.translit then | |||
-- Automatically preserve metadata when possible. | |||
new_formobj = m_table.shallowCopy(form) | |||
new_formobj.footnotes = new_footnotes | |||
else | |||
local new_footnotes = export.combine_footnotes(old_form.footnotes, form.footnotes) | |||
new_formobj = {form=new_formval, translit=new_translit, footnotes=new_footnotes} | |||
if props.combine_metadata then | |||
props.combine_metadata { | |||
slot = slot, | |||
dest_form = new_formobj, | |||
form1 = old_form, | |||
form2 = form, | |||
between_text = before_text, | |||
between_text_no_links = before_text_no_links, | |||
between_text_translit = before_text_translit, | |||
} | |||
end | |||
end | |||
table.insert(ret_forms, new_formobj) | |||
end | end | ||
end | end | ||
| Line 973: | Line 1,239: | ||
--[=[ | --[==[ | ||
Top-level inflection function. Create the inflections of a noun, verb, adjective or similar. ` | Top-level inflection function. Create the inflections of a noun, verb, adjective or similar. `alternant_multiword_spec` | ||
is as | |||
returned by `parse_inflected_text` and describes the properties of the term to be inflected, including all the | returned by `parse_inflected_text` and describes the properties of the term to be inflected, including all the | ||
user-provided inflection specifications (e.g. the number, gender, conjugation/declension/etc. of each word) and the | user-provided inflection specifications (e.g. the number, gender, conjugation/declension/etc. of each word) and the | ||
| Line 980: | Line 1,247: | ||
stored into the `.forms` property of `multiword_spec`. This property holds a table whose keys are slots (i.e. ID's | stored into the `.forms` property of `multiword_spec`. This property holds a table whose keys are slots (i.e. ID's | ||
of individual inflected forms, such as "pres_1sg" for the first-person singular present indicative tense of a verb) | of individual inflected forms, such as "pres_1sg" for the first-person singular present indicative tense of a verb) | ||
and whose values are lists of the form { form = | and whose values are lists of the form `{ form = ``form``, translit = ``manual_translit_or_nil``, footnotes = ``footnote_list_or_nil``}`, | ||
where | where ``form`` is a string specifying the value of the form (e.g. "ouço" for the first-person singular present indicative | ||
of the Portuguese verb [[ouvir]]); | of the Portuguese verb [[ouvir]]); ``manual_translit_or_nil`` is the corresponding manual transliteration if needed (i.e. | ||
if the form is in a non-Latin script and the automatic transliteration is incorrect or unavailable), otherwise nil; | if the form is in a non-Latin script and the automatic transliteration is incorrect or unavailable), otherwise nil; | ||
and | and ``footnote_list_or_nil`` is a list of footnotes to be attached to the form, or nil for no footnotes. Note that | ||
currently footnotes must be surrounded by brackets, e.g. "[archaic]", and should not begin with a capital letter or end | currently footnotes must be surrounded by brackets, e.g. "[archaic]", and should not begin with a capital letter or end | ||
with a period. (Conversion from "[archaic]" to "Archaic." happens automatically.) | with a period. (Conversion from "[archaic]" to "Archaic." happens automatically.) | ||
This function has no return value, but modifies `multiword_spec` in-place, adding the `forms` table as described above. | This function has no return value, but modifies `multiword_spec` in-place, adding the `forms` table as described above. | ||
| Line 993: | Line 1,260: | ||
`props` is an object specifying properties used during inflection, as follows: | `props` is an object specifying properties used during inflection, as follows: | ||
`slot_list` is a list of two-element lists of slots and associated accelerator | ```{ | ||
slot_list = {{"``slot``", "``accel``"}, {"``slot``", "``accel``"}, ...}, | |||
e.g. if | slot_table = {``slot`` = "``accel``", ``slot`` = "``accel``", ...}, | ||
during show_forms(), which takes the same `slot_list` as a property upon input. | skip_slot = nil `or` __function__(slot), | ||
lang = nil `or` __lang_object__, | |||
inflect_word_spec = __function__(base), | |||
get_variants = nil `or` __function__(formval), | |||
combine_metadata = nil `or` __function__(data), | |||
include_user_specified_links = __boolean__, | |||
}``` | |||
`slot_list` is a list of two-element lists of slots and associated accelerator tags. ``slot`` is arbitrary but should | |||
correspond with slot names as generated by `inflect_word_spec`. ``accel`` is the corresponding accelerator tags; e.g. if | |||
``slot`` is "pres_1sg", ``accel`` might be "1|s|pres|ind". ``accel`` is actually unused during inflection, but is used | |||
during `show_forms()`, which takes the same `slot_list` as a property upon input. | |||
`slot_table` is a table mapping slots to associated accelerator | `slot_table` is a table mapping slots to associated accelerator tags and serves the same function as `slot_list`. Only | ||
`slot_list`. Only one of `slot_list` or `slot_table` must be given. For new code it is preferable to use `slot_list` | one of `slot_list` or `slot_table` must be given. For new code it is preferable to use `slot_list` because this allows | ||
because this allows you to control the order of processing slots, which may occasionally be important. | you to control the order of processing slots, which may occasionally be important. | ||
`skip_slot` is a function of one argument, a slot name, and should return a boolean indicating whether to skip the | `skip_slot` is a function of one argument, a slot name, and should return a boolean indicating whether to skip the | ||
| Line 1,020: | Line 1,289: | ||
(Manual transliteration is allowed if the `lang` object is set in the `props` passed to parse_inflected_text().) | (Manual transliteration is allowed if the `lang` object is set in the `props` passed to parse_inflected_text().) | ||
`inflect_word_spec` is the function to do the actual inflection. It is passed a single argument, which is a | `inflect_word_spec` is the function to do the actual inflection. It is passed a single argument, which is a ``word_spec`` | ||
object describing the word to be inflected and the user-provided inflection specifications. It is exactly the same as | object describing the word to be inflected and the user-provided inflection specifications. It is exactly the same as | ||
was returned by the `parse_indicator_spec` function provided in the `props` sent on input to `parse_inflected_text`, but | was returned by the `parse_indicator_spec` function provided in the `props` sent on input to `parse_inflected_text`, but | ||
has additional fields describing the word to be inflected and the surrounding text, as follows: | has additional fields describing the word to be inflected and the surrounding text, as follows: | ||
{ | |||
lemma = " | ```{ | ||
before_text = " | lemma = "``lemma``", | ||
before_text_no_links = " | before_text = "``text-before-word``", | ||
before_text_translit = " | before_text_no_links = "``text-before-word-no-links``", | ||
before_text_translit = "``manual-translit-of-text-before-word``" or nil (if no manual translit or respelling was specified in the before-text) | |||
-- Fields as described in parse_indicator_spec() | -- Fields as described in parse_indicator_spec() | ||
... | ... | ||
} | }``` | ||
Here | Here ``lemma`` is the word to be inflected as specified by the user (including any links if so given), and the | ||
`before_text*` fields describe the raw text preceding the word to be inflected. Any other fields in this object are as | `before_text*` fields describe the raw text preceding the word to be inflected. Any other fields in this object are as | ||
set by `parse_inflected_text`, and describe things like the gender, number, conjugation/declension, etc. as specified | set by `parse_inflected_text`, and describe things like the gender, number, conjugation/declension, etc. as specified | ||
by the user in the <...> spec following the word to be inflected. | by the user in the <...> spec following the word to be inflected. | ||
`inflect_word_spec` should initialize the `.forms` property of the passed-in | `inflect_word_spec` should initialize the `.forms` property of the passed-in ``word_spec`` object to the inflected forms of | ||
the word in question. The value of this property is a table of the same format as the `.forms` property that is | the word in question. The value of this property is a table of the same format as the `.forms` property that is | ||
ultimately generated by inflect_multiword_or_alternant_multiword_spec() and described above near the top of this | ultimately generated by inflect_multiword_or_alternant_multiword_spec() and described above near the top of this | ||
documentation: i.e. a table whose keys are slots and whose values are lists of the form | documentation: i.e. a table whose keys are slots and whose values are lists of the form | ||
{ form = | `{ form = ``form``, translit = ``manual_translit_or_nil``, footnotes = ``footnote_list_or_nil``}`. | ||
`get_variants` is either nil or a function of one argument (a string, | `get_variants` is either {nil} or a function of one argument (a string, a form value). The purpose of | ||
this function is to ensure that in a multiword term where a given slot has more than one possible variant, the final | this function is to ensure that in a multiword term where a given slot has more than one possible variant, the final | ||
output has only parallel variants in it. For example, feminine nouns and adjectives in Russian have two possible | output has only parallel variants in it. For example, feminine nouns and adjectives in Russian have two possible | ||
| Line 1,068: | Line 1,338: | ||
See [[Module:hi-verb]] and [[Module:hi-common]] for an example of doing this in a generalized fashion. (Look for | See [[Module:hi-verb]] and [[Module:hi-common]] for an example of doing this in a generalized fashion. (Look for | ||
add_variant_codes(), get_variants() and remove_variant_codes().) | add_variant_codes(), get_variants() and remove_variant_codes().) | ||
`combine_metadata` is a function that is invoked when combining two form objects along with in-between text and | |||
storing into a destination form object. When this happens, if the form value and translit in the first form object | |||
are empty and the in-between text is likewise empty (which regularly happens when appending the form object describing | |||
the first word in a multiword expression to empty base text), the second form object is simply shallow-copied along with | |||
all of its metadata, and any footnotes are combined appropriately (normally the first form object in such a case won't | |||
have footnotes). Otherwise, a new form object is constructed by combining the form values, translit and footnotes from | |||
the two objects and in-between text, and calling `combine_metadata` to combine any other metadata. Leave this | |||
unspecified if there is no additional metadata or if you don't want any metadata carried over. (Examples of metadata | |||
that should generally not be carried over are glosses of individual words, sense ID's and similar word-level properties | |||
that can't easily be combined to generate a multiword equivalent. Examples of metadata that should be carried over and | |||
combined are qualifiers, labels and certain boolean properties such as an uncertainty flag indicating that a given form | |||
is uncertain. For some metadata, it is more complex; for example, if both source words have the same gender or part of | |||
speech, the destination should keep that value, but if they differ, it may be safest to leave the field blank.) This | |||
function, if specified, is called with a single argument as follows: | |||
```{ | |||
slot = "__string__", | |||
dest_form = __formobj__, | |||
form1 = __formobj__, | |||
form2 = __formobj__, | |||
between_text = "__string__", | |||
between_text_no_links = "__string__", | |||
between_text_translit = "__string__" `or` nil | |||
}``` | |||
Here, `slot` is the slot whose forms are being constructed. `dest_form` is the destination form object into which the | |||
combined metadata should be written, and is pre-populated with appropriate `form`, `translit` and `footnotes` fields. | |||
`form1` and `form2` are the two source forms being combined, and `between_text` is the text to be inserted between the | |||
two source forms. `between_text_no_links` is the same as `between_text` but with double-bracket links removed, and | |||
`between_text_translit` is the manual transliteration of `between_text_no_links`, if specified. The function should | |||
return nothing, but should side-effect `dest_form` as appropriate. | |||
`include_user_specified_links`, if given, ensures that user-specified links in the raw text surrounding a given word | `include_user_specified_links`, if given, ensures that user-specified links in the raw text surrounding a given word | ||
are preserved in the output. If omitted or set to false, such links will be removed and the whole multiword expression | are preserved in the output. If omitted or set to false, such links will be removed and the whole multiword expression | ||
will be linked. | will be linked. | ||
]=] | ]==] | ||
function export.inflect_multiword_or_alternant_multiword_spec(multiword_spec, props) | function export.inflect_multiword_or_alternant_multiword_spec(multiword_spec, props) | ||
multiword_spec.forms = {} | multiword_spec.forms = {} | ||
| Line 1,134: | Line 1,436: | ||
function export.get_footnote_text( | function export.get_footnote_text(footnotes, footnote_obj) | ||
if not | if not footnotes then | ||
return "" | return "" | ||
end | end | ||
local link_indices = {} | local link_indices = {} | ||
local all_refs = {} | local all_refs = {} | ||
for _, footnote in ipairs( | for _, footnote in ipairs(footnotes) do | ||
local refs | local refs | ||
footnote, refs = export.expand_footnote_or_references(footnote) | footnote, refs = export.expand_footnote_or_references(footnote) | ||
| Line 1,149: | Line 1,451: | ||
this_noteindex = footnote_obj.noteindex | this_noteindex = footnote_obj.noteindex | ||
footnote_obj.noteindex = footnote_obj.noteindex + 1 | footnote_obj.noteindex = footnote_obj.noteindex + 1 | ||
table.insert(footnote_obj.notes, '<sup style="color: red">' .. this_noteindex .. '</sup>' .. footnote) | table.insert(footnote_obj.notes, '<sup style="color: var(--wikt-palette-red, red)">' .. this_noteindex .. '</sup>' .. footnote) | ||
footnote_obj.seen_notes[footnote] = this_noteindex | footnote_obj.seen_notes[footnote] = this_noteindex | ||
end | end | ||
| Line 1,196: | Line 1,498: | ||
local link_text | local link_text | ||
if #link_indices > 0 then | if #link_indices > 0 then | ||
link_text = '<sup style="color: red">' .. table.concat(link_indices, ",") .. '</sup>' | link_text = '<sup style="color: var(--wikt-palette-red, red)">' .. table.concat(link_indices, ",") .. '</sup>' | ||
else | else | ||
link_text = "" | link_text = "" | ||
| Line 1,209: | Line 1,511: | ||
-- Add links around words in a term. If multiword_only, do it only in multiword terms. | --[==[ | ||
function export.add_links( | Add links around words in a term. If multiword_only, do it only in multiword terms. | ||
]==] | |||
function export.add_links(form, multiword_only) | |||
if form == "" or form == " " then | if form == "" or form == " " then | ||
return form | return form | ||
| Line 1,229: | Line 1,533: | ||
-- Remove redundant link surrounding entire term. | --[==[ | ||
Remove redundant link surrounding entire term. | |||
]==] | |||
function export.remove_redundant_links(term) | function export.remove_redundant_links(term) | ||
return rsub(term, "^%[%[([^%[%]|]*)%]%]$", "%1") | return rsub(term, "^%[%[([^%[%]|]*)%]%]$", "%1") | ||
| Line 1,235: | Line 1,541: | ||
-- Add links to all before and after text; for use in inflection modules that preserve links in multiword lemmas and | --[==[ | ||
Add links to all before and after text; for use in inflection modules that preserve links in multiword lemmas and | |||
include links in non-lemma forms rather than allowing the entire form to be a link. If `remember_original`, remember | |||
the original user-specified before/after text so we can reconstruct the original spec later. `add_links` is a | |||
function of one argument to add links to a given piece of text; if unspecified, it defaults to `export.add_links`. | |||
]==] | |||
function export.add_links_to_before_and_after_text(alternant_multiword_spec, remember_original, add_links) | function export.add_links_to_before_and_after_text(alternant_multiword_spec, remember_original, add_links) | ||
add_links = add_links or export.add_links | add_links = add_links or export.add_links | ||
| Line 1,263: | Line 1,571: | ||
--[=[ | --[==[ | ||
Convert the forms in ` | Reconstruct the original overall spec from the output of parse_inflected_text(), so we can use it in the | ||
language-specific acceleration module in the implementation of {{tl|pt-verb form of}} and the like. `props` is an | |||
into strings. Each form list turns into a string consisting of a comma-separated list of linked forms, with accelerators | optional table of properties. Currently only `preprocess_angle_bracket_spec` is recognized, and is an optional function | ||
(unless `no_accel` is set in a given form). `props` is a table used in generating the strings, as follows: | of one argument that is called to process an angle-bracket spec before inserting into the reconstructed spec. | ||
{ | ]==] | ||
lang = | function export.reconstruct_original_spec(alternant_multiword_spec, props) | ||
lemmas = {" | local parts = {} | ||
slot_list = {{" | props = props or {} | ||
slot_table = { | |||
include_translit = | local function ins(txt) | ||
create_footnote_obj = nil or | table.insert(parts, txt) | ||
canonicalize = nil or | end | ||
transform_accel_obj = nil or | local function insert_angle_bracket_spec(spec) | ||
join_spans = nil or | if props.preprocess_angle_bracket_spec then | ||
allow_footnote_symbols = | spec = props.preprocess_angle_bracket_spec(spec) | ||
footnotes = nil or {" | end | ||
} | ins(spec) | ||
end | |||
for _, alternant_or_word_spec in ipairs(alternant_multiword_spec.alternant_or_word_specs) do | |||
ins(alternant_or_word_spec.user_specified_before_text) | |||
if alternant_or_word_spec.alternants then | |||
ins("((") | |||
for i, multiword_spec in ipairs(alternant_or_word_spec.alternants) do | |||
if i > 1 then | |||
ins(",") | |||
end | |||
for _, word_spec in ipairs(multiword_spec.word_specs) do | |||
ins(word_spec.user_specified_before_text) | |||
ins(word_spec.user_specified_lemma) | |||
insert_angle_bracket_spec(word_spec.angle_bracket_spec) | |||
end | |||
ins(multiword_spec.user_specified_post_text) | |||
end | |||
ins("))") | |||
else | |||
ins(alternant_or_word_spec.user_specified_lemma) | |||
insert_angle_bracket_spec(alternant_or_word_spec.angle_bracket_spec) | |||
end | |||
end | |||
ins(alternant_multiword_spec.user_specified_post_text) | |||
local retval = table.concat(parts) | |||
if alternant_multiword_spec.allow_default_indicator then | |||
-- As a special case, if we see e.g. "amar<>", remove the <>. Don't do this if there are spaces or alternants. | |||
if not retval:find(" ") and not retval:find("%(%(") then | |||
local retval_no_angle_brackets = retval:match("^(.*)<>$") | |||
if retval_no_angle_brackets then | |||
return retval_no_angle_brackets | |||
end | |||
end | |||
end | |||
return retval | |||
end | |||
--[==[ | |||
Convert the forms in ``formtable`` (a form table, whose keys are slots and whose values are lists of form objects, each | |||
of which is a table of the form `{form = ``form``, translit = ``manual_translit_or_nil``, footnotes = ``footnote_list_or_nil``, no_accel = ``true_to_suppress_accelerators``, ...}`) | |||
into strings. The form table is side-effected. Each form list turns into a string consisting of a comma-separated list | |||
of linked forms, with accelerators (unless `no_accel` is set in a given form object). If `include_translit` is | |||
specified, each string consists of a comma-separated list of form values (each formatted as a link), an HTML | |||
`<br/>`, and a comma-separated list of transliterations. `props` is a table used in generating the strings, as | |||
follows: | |||
```{ | |||
lang = __lang_object__, | |||
lemmas = {"``lemma``", "``lemma``", ...}, | |||
slot_list = {{"``slot``", "``accel``"}, {"``slot``", "``accel``"}, ...}, | |||
slot_table = {``slot`` = "``accel``", ``slot`` = "``accel``", ...}, | |||
include_translit = __boolean__, | |||
create_footnote_obj = nil `or` __function__(), | |||
canonicalize = nil `or` __function__(formval), | |||
preprocess_forms = nil `or` __function__(data), | |||
no_deduplicate_forms = __boolean__, | |||
combine_metadata_during_dedup = nil `or` __function__(data), | |||
transform_accel_obj = nil `or` __function__(slot, form, accel_obj), | |||
format_forms = nil `or` __function__(data), | |||
generate_link = nil `or` __function__(data), | |||
format_tr = nil `or` __function__(data), | |||
join_spans = nil `or` __function__(data), | |||
allow_footnote_symbols = __boolean__, | |||
footnotes = nil `or` {"``extra_footnote``", "``extra_footnote``", ...}, | |||
}``` | |||
`lemmas` is the list of lemmas, used in the accelerators. | `lemmas` is the list of lemmas, used in the accelerators. | ||
`slot_list` is a list of two-element lists of slots and associated accelerator | `slot_list` is a list of two-element lists of slots and associated accelerator tag sets. ``slot`` should correspond | ||
slots generated during inflect_multiword_or_alternant_multiword_spec(). | to slots generated during `inflect_multiword_or_alternant_multiword_spec()`. ``accel`` is the corresponding accelerator | ||
e.g. if | tag set; e.g. if ``slot`` is "pres_1sg", ``accel`` might be "1|s|pres|ind". ``accel`` is used in generating entries for | ||
(see [[WT:ACCEL]]). | accelerator support (see [[WT:ACCEL]]). | ||
`slot_table` is a table mapping slots to associated accelerator | `slot_table` is a table mapping slots to associated accelerator tag sets and serves the same function as `slot_list`. | ||
`slot_list`. Only one of `slot_list` or `slot_table` must be given. For new code it is preferable to use `slot_list` | Only one of `slot_list` or `slot_table` must be given. For new code it is preferable to use `slot_list` because this | ||
because this allows you to control the order of processing slots, which may occasionally be important. | allows you to control the order of processing slots, which may occasionally be important. | ||
`include_translit`, if given, causes transliteration to be included in the generated strings. | `include_translit`, if given, causes transliteration to be included in the generated strings. | ||
The function works as follows: | |||
# Create an object to hold footnotes (customizable using `create_footnote_obj`). | |||
# Generate the comma-separated lemma form values and store in `.lemma` in the form table. | |||
# Loop over the slots specified using `slot_list` or `slot_table`. For each slot: | |||
## Canonicalize the form values (customizable using `canonicalize`; by default does nothing). | |||
## Preprocess the forms (customizable using `preprocess_forms`; by default does nothing). | |||
## Unless `no_deduplicate_forms` is set, deduplicate forms in a slot sharing the same form value but possibly different | |||
transliteration. (This happens e.g. in Russian, where it is relatively common for a given form to have two possible | |||
transliterations, one reflecting a more nativized pronunciation where Cyrillic е triggers palatalization of the | |||
preceding consonant, and one reflecting a more "foreign" pronunciation where this palatalization does not happen. In | |||
such a case, the automatic transliteration would normally suffice for the more nativized pronunciation but the more | |||
"foreign" pronunciation will need manual transliteration.) As part of deduplication, footnotes will be combined using | |||
`combine_footnotes`; distinct manual transliterations will be combined into a list (meaning the `translit` field of | |||
form objects in some subsequent `props` functions may hold a list; this will be noted when possible); and any | |||
remaining metadata will be combined using the `combine_metadata_during_dedup` method, if provided. | |||
## Add acceleration to all forms. The acceleration tag set associated with a given form comes from `slot_list` or | |||
`slot_table`, i.e. all forms in a given slot have the same tag set. However, different forms will have different | |||
associated transliterations stored into the accelerator object associated with the form, as well as possibly | |||
different lemmas. In particular, when there are multiple lemma forms, this is often due to alternative ways to | |||
pronounce the lemma (e.g. alternative stress positions or vowel lengths), and there are often associated non-lemma | |||
forms that match each lemma. An example given in the introduction is {{m+|uk|русин||Rusyn}}, stressed in the lemma | |||
as ''ру́син'' or ''руси́н'' with associated genitive singulars ''ру́сина'' and ''русина́''. We would like the | |||
auto-generated accelerator entry for {{m|uk|русина}} to show the variant ''ру́сина'' as having lemma ''ру́син'' and | |||
the variant ''русина́'' as having the lemma ''руси́н'', rather than showing both variants as having both lemmas, | |||
which is less accurate. As a result, the code that generates acceleration objects for forms matches up forms and | |||
lemmas one-to-one if possible. If this is not possible, the matching is usually one lemma to many forms, as in | |||
{{m+|uk|міст||bridge}} with genitive singular ''мо́сту'' or ''моста́'' (in which case all forms get the same lemma), or | |||
many lemmas to one form, as in {{m+|uk|черга||turn, queue}} stressed either ''че́рга'' or ''черга́'' with nominative | |||
singular only ''че́рги'' (in which case the single form gets associated with all lemmas). If there are multiple lemmas and | |||
multiple forms, the algorithm attempts to align them as evenly as possible (e.g. two lemma variants to four forms | |||
means the first two forms get assigned the first lemma variant and the last two forms get assigned the second lemma | |||
variant); this is often going to be incorrect, but (a) there's unlikely to be a single algorithm that works in all | |||
such circumstances, and (b) these cases are very rare. Finally, note the following: | |||
##* No acceleration is assigned to a form if any of the following apply: (a) there are no lemmas given in | |||
`props.lemmas`; (b) the `no_accel` key in the form object has a non-falsy value; (c) the form value of the form is | |||
{"?"} or an em-dash ({"—"}); (d) the accelerator tag set is given as a hyphen ({"-"}); or (e) the form value contains | |
an internal link. | |||
##* The accelerator code sets the `formval_for_link` key in each form object to the version of the form value that | |||
should be passed to `full_link()` in [[Module:links]]. This is usually the same as the passed-in form value, but | |||
differs when `props.allow_footnote_symbols` is specified and an old-style footnote symbol is attached to the form | |||
(the removed footnote symbol is stored in the `formval_old_style_footnote_symbol` key), and also differs when the | |||
entire form value is surrounded with a redundant internal link (which is removed). | |||
##* The resulting accelerator object can be modified (or replaced entirely) by the `transform_accel_obj` function. This | |||
is used, for example, in [[Module:es-verb]], [[Module:pt-verb]] and other Romance-language verb conjugation modules | |||
(likewise [[Module:ar-verb]]) to replace the tag set with the original verb spec used to generate the verb, so that | |||
the accelerator code can generate the appropriate call to {{tl|es-verb form of}}, {{tl|pt-verb form of}} or the | |||
like, which computes the inflections, instead of directly listing the inflections. | |||
## Format the forms into strings. The entire default process can be replaced using `format_forms`; otherwise the default | |||
algorithm works as follows: | |||
### Generate the '''form value spans''', with one entry (a linked HTML-ized version of the form value) per form. This | |||
can be customized using `generate_link`. (Various modules do this. For example, the Arabic verb module includes | |||
qualifiers, labels, ID's and the like that can be specified by the user; the Portuguese and reintegrated Galician | |||
verb modules italicize certain superseded or otherwise less-desirable forms instead of linking them normally; the | |||
German verb module adds {{m|de|dass}} to subjunctive forms and optional pronouns to imperative forms; and the German | |||
adjective module adds articles to adjective forms normally accompanied by articles and the equivalent of "he/she is" | |||
etc. to predicate forms.) The default uses `full_link()` in [[Module:links]] (with transliteration generation | |||
disabled) concatenated with the appropriate footnote symbol(s) (if any). | |||
### Generate the '''transliteration spans''', with one entry per distinct translit, auto-generated if manual translit | |||
isn't available. Note that, due to the earlier form value deduplication step, there may be multiple translits per | |||
form object. These translits are themselves deduplicated to get the list of spans. (Such duplication can happen, for | |||
example, in Arabic with terms containing a glottal stop in them; there may be multiple ways of spelling the glottal | |||
stop or ''hamza'' in Arabic, but only one way of transliterating it.) Each span consists of an object specifying | |||
the translit minus any attached old-style footnote symbols (which are only allowed if | |||
`props.allow_footnote_symbols` is set); the attached old-style footnote symbol, which is always an empty string when | |||
`props.allow_footnote_symbols` is not set; and the list of (new-style) footnotes. These objects are then converted | |||
to formatted strings, either using `format_tr` if supplied or else calling `tag_translit()` in | |||
[[Module:script utilities]] and concatenating the appropriate footnote symbol(s) (if any). | |||
### Combine the form value and transliteration spans. If `join_spans` is supplied, use it; otherwise, concatenate the | |||
form value spans (comma-separated) and (if available) transliteration spans (comma-separated), and (if appropriate) | |||
combine them using {<br />}. | |||
`create_footnote_obj` is an optional function of no arguments to create the footnote object used to track footnotes; | `create_footnote_obj` is an optional function of no arguments to create the footnote object used to track footnotes; | ||
see | see `create_footnote_obj()`. Customizing it is useful to prepopulate the footnote table using `get_footnote_text()`. | ||
`canonicalize` is an optional function of one argument (a form value) to canonicalize each form before processing; it | |||
can return nil for no change. The most common purpose of this function is to remove variant codes from the form value. | |||
See the documentation for `inflect_multiword_or_alternant_multiword_spec()` for a description of variant codes and their | |||
purpose. | |||
` | `preprocess_forms` is an optional function of one argument (a table of properties) to preprocess the form objects as | ||
a whole. It runs after `canonicalize` (meaning that the form values passed in are canonicalized) and before | |||
deduplication and the addition of acceleration info. The property table passed in has the following properties: | |||
* `slot`: The slot being processed. | |||
* `forms`: The list of form objects for this slot. | |||
* `accel_tag_set`: The accelerator tag set for this slot, taken from `slot_list` or `slot_table`. | |||
* `footnote_obj`: The footnote object returned by the `create_footnote_obj` property or the default | |||
`create_footnote_obj()` function. | |||
`preprocess_forms` should return a list of preprocessed form objects, or {nil} to use the passed-in `forms`. If this | |||
function does deduplication, you should set `no_deduplicate_forms` to disable the default deduplication process. | |||
` | `no_deduplicate_forms`, if set, disables the deduplication step (see above). | ||
` | `combine_metadata_during_dedup` is an optional function of one argument (a table of properties) to combine the metadata | ||
of deduplicated form objects. The property table passed in has the following properties: | |||
* `slot`: The slot being processed. | |||
* `existing_form`: The existing form object into which a duplicated form is being combined. | |||
* `dup_form`: The duplicated form being combined into `existing_form`. | |||
* `existing_form_pos`: The one-based position of the existing form in the deduplicated form list (not necessarily its | |||
original position). | |||
* `dup_form_pos`: The one-based position of the duplicated form in its original list. | |||
The following should be noted about the form objects passed in: | |||
# The form values in `.form` have been canonicalized using `.canonicalize`, if provided. | |||
# The form values in `existing_form` and `dup_form` are always the same. | |||
# The footnotes in `existing_form` have already been combined with those in `dup_form`. | |||
# If there was manual translit either in `existing_form` (prior to deduplication) or in `dup_form`, there will be | |||
manual translit in `existing_form.translit` that is a list and combines any previous accumulated translits in | |||
`existing_form` as well as the translit in `dup_form` (even if one of them was specified as {nil} indicating an | |||
automatic translit). This means that the translit in `existing_form.translit` is always either {nil} or a list of | |||
strings (and the same applies to `dup_form.translit`). | |||
`transform_accel_obj` is an optional function of three arguments (slot, formobj, accel_obj) to transform the default | `transform_accel_obj` is an optional function of three arguments (``slot``, ``formobj``, ``accel_obj``) to transform the | ||
constructed accelerator object in `accel_obj` into an object that should be passed to full_link() in [[Module:links]]. | default constructed accelerator object in ``accel_obj`` into an object that should be passed to `full_link()` in | ||
It should return the new accelerator object, or nil for no acceleration. | [[Module:links]]. It should return the new accelerator object, or {nil} for no acceleration. (If {nil} is returned, | ||
object passed in. NOTE: This is called even when the passed-in `accel_obj` is nil ( | the corresponding form has no acceleration; this is unlike most customization functions, where returning {nil} causes | ||
the default algorithm to be invoked.) The function can destructively modify the accelerator object passed in. | |||
'''NOTE''': This is called even when the passed-in ``accel_obj`` is {nil} (see the (a) through (e) reasons above why no | |||
acceleration may be assigned to a form). Thus, your code needs to do something sensible in this case. The description | |||
above of how `show_forms()` works includes various examples of modules that supply a `transform_accel_obj` function and | |
the reasons for doing so. | |||
` | `format_forms`, if supplied, is a function that entirely replaces the formatting portion of `show_forms()`. An example | ||
of why you might want to do this is to get a different layout than the default, e.g. one where translit is displayed | |||
next to each form value instead of the form values and translits grouped and displayed on separate lines. Under normal | |||
circumstances, you should not do this, but instead customize the functions that replace specific parts of the default | |||
formatting algorithm (see below). This function is passed one argument (a table of properties) and should return a | |||
string (the formatted forms, ready to store into the slot in the form table) or {nil} to proceed with the default | |||
algorithm (see above). The property table passed in has the following properties: | |||
* `slot`: The slot being processed. | |||
* `forms`: The list of form objects, deduplicated and with accelerator info added. | |||
* `footnote_obj`: The footnote object returned by the `create_footnote_obj` property or the default | |||
`create_footnote_obj()` function. | |||
The following should be noted about the form objects in `forms`: | |||
# There are extra fields `formval_for_link`, `formval_old_style_footnote_symbol` and `accel_obj`. The first two are as | |||
described above under the paragraph beginning "Add acceleration to all forms" under "The function works as follows". | |||
The third one is the accelerator object in the format expected by [[Module:links]]. | |||
# The `translit` field, if non-{nil}, is a list of transliterations rather than a single transliteration; this is due to | |||
the form value deduplication step. | |||
`allow_footnote_symbols`, if given, causes any footnote symbols attached to forms (e.g. numbers, asterisk) to be | `generate_link` is an optional function to generate the link text for a given form value. It is passed a single argument | ||
separated off, placed outside the links, and superscripted. In this case, `footnotes` should be a list of footnotes | (a table of properties) and should return a string, the formatted link. If it returns {nil}, the default algorithm (see | ||
above) is invoked. The property table passed in has the following properties: | |||
* `slot`: The slot being processed. | |||
* `form`: The form to be converted to a formatted link. As with the `format_forms` function described above, the form | |||
objects passed in contain extra fields `formval_for_link`, `formval_old_style_footnote_symbol` and `accel_obj` (all | |||
of which will normally be used), and the `translit` field, if non-{nil}, is a list. | |||
* `pos`: The one-based position of the form being processed, in the list of form value spans. Rarely used. | |||
* `footnote_obj`: The footnote object returned by the `create_footnote_obj` property or the default | |||
`create_footnote_obj()` function. Normally used in order to get the (new-style) footnote symbol associated with any | |||
footnotes in `footnotes`. | |||
The description above of how `show_forms()` works includes various examples of modules that supply a `generate_link` | |
function and the reasons for doing so. | |||
`format_tr` is an optional function to generate the formatted text for a given transliteration. It is passed a single | |||
argument (a table of properties) and should return a string, the formatted transliteration text. If it returns {nil}, | |||
the default algorithm (see above) is invoked. The property table passed in has the following properties: | |||
* `slot`: The slot being processed. | |||
* `tr_for_tag`: The transliteration to process, where old-style footnote symbols have been removed. | |||
* `old_style_footnote_symbol`: The removed old-style footnote symbol, or a blank string if no symbol was removed. | |||
* `pos`: The one-based position of the transliteration being processed, in the list of transliteration spans. Rarely | |||
used. | |||
* `footnotes`: The list of footnotes associated with all form objects with this transliteration. (If there were multiple | |||
form objects with the same transliteration, the list of footnotes will have been generated using | |||
`combine_footnotes()`.) | |||
* `footnote_obj`: The footnote object returned by the `create_footnote_obj` property or the default | |||
`create_footnote_obj()` function. Normally used in order to get the (new-style) footnote symbol associated with any | |||
footnotes in `footnotes`. | |||
`join_spans` is an optional function to join the processed form value and transliteration spans into a formatted string. | |||
It is passed a single argument (a table of properties) and should return the final string to store into the form table | |||
slot. If it returns {nil}, the default algorithm (see above) is invoked. The property table passed in has the following | |||
properties: | |||
* `slot`: The slot being processed. | |||
* `formval_spans`: A list of strings, the formatted form value spans. | |||
* `tr_spans`: A list of strings, the formatted transliteration spans. If there is no transliteration, this will be an | |||
empty list. | |||
A custom `join_spans` is provided by [[Module:de-verb]], which concatenates the form value spans vertically (using | |||
{"<br />"}) instead of horizontally using a comma, as is normal; this is because there is no translit and the form | |||
values are often long, containing extra words attached during `generate_link()`. The only exception is the `aux` slot | |||
holding the auxiliaries, which is concatenated horizontally using {" or "}. [[Module:de-adjective]] similarly provides | |||
a custom `join_spans` function that concatenates the form value spans vertically. | |||
`allow_footnote_symbols`, if given, causes any old-style footnote symbols attached to forms (e.g. numbers, asterisk) to | |||
be separated off, placed outside the links, and superscripted. In this case, `footnotes` should be a list of footnotes | |||
(preceded by footnote symbols, which are superscripted). These footnotes are combined with any footnotes found in the | (preceded by footnote symbols, which are superscripted). These footnotes are combined with any footnotes found in the | ||
forms and placed into `forms.footnotes`. This mechanism of specifying footnotes is provided for backward compatibility | forms and placed into `forms.footnotes`. This mechanism of specifying footnotes is provided for backward compatibility | ||
with certain existing inflection modules and should not be used for new modules. Instead, use the regular footnote | with certain existing inflection modules and should not be used for new modules. Instead, use the regular footnote | ||
mechanism specified using the `footnotes` property attached to each form object. | mechanism specified using the `footnotes` property attached to each form object. | ||
]=] | ]==] | ||
function export.show_forms( | function export.show_forms(formtable, props) | ||
local footnote_obj = props.create_footnote_obj and props.create_footnote_obj() or export.create_footnote_obj() | local footnote_obj = props.create_footnote_obj and props.create_footnote_obj() or export.create_footnote_obj() | ||
local function | local function fetch_formval_and_translit(entry, remove_links) | ||
local | local formval, translit | ||
if type(entry) == "table" then | if type(entry) == "table" then | ||
formval, translit = entry.form, entry.translit | |||
else | else | ||
formval = entry | |||
end | end | ||
if remove_links then | if remove_links then | ||
formval = m_links.remove_links(formval) | |||
end | end | ||
return | return formval, translit | ||
end | end | ||
local | local lemma_formvals = {} | ||
for _, lemma in ipairs(props.lemmas) do | for _, lemma in ipairs(props.lemmas) do | ||
local | local lemma_formval, _ = fetch_formval_and_translit(lemma) | ||
m_table.insertIfNot( | m_table.insertIfNot(lemma_formvals, lemma_formval) | ||
end | |||
formtable.lemma = #lemma_formvals > 0 and table.concat(lemma_formvals, ", ") or | |||
mw.loadData(headword_data_module).pagename | |||
-- For safety, since we in-place modify `lemmas` usually before processing a given slot, make a copy. | |||
local props_lemmas = m_table.shallowCopy(props.lemmas) | |||
for i, lemma in ipairs(props_lemmas) do | |||
props_lemmas[i] = m_table.shallowCopy(lemma) | |||
end | end | ||
local function | local function do_slot(slot, accel_tag_set) | ||
local formobjs = formtable[slot] | |||
if formobjs then | |||
if type(formobjs) ~= "table" then | |||
error("Internal error: For slot '" .. slot .. "', expected table but saw " .. dump(formobjs)) | |||
end | |||
-- Maybe canonicalize the form values (e.g. remove variant codes and monosyllabic accents). | |||
if props.canonicalize then | |||
for _, form in ipairs(formobjs) do | |||
form.form = props.canonicalize(form.form) or form.form | |||
end | |||
end | |||
-- Preprocess the forms as a whole if called for. | |||
if props.preprocess_forms then | |||
formobjs = props.preprocess_forms { | |||
slot = slot, | |||
forms = formobjs, | |||
accel_tag_set = accel_tag_set, | |||
footnote_obj = footnote_obj, | |||
} or formobjs | |||
end | |||
-- Maybe deduplicate form values (happens e.g. in Russian with two terms with the same Russian form but | |||
-- different translits). | |||
if not props.no_deduplicate_forms then | |||
local deduped_formobjs = {} | |||
for i, form in ipairs(formobjs) do | |||
local function combine_forms(existing_form, dup_form, pos) | |||
assert(existing_form.form == dup_form.form) | |||
-- Combine footnotes. | |||
existing_form.footnotes = export.combine_footnotes(existing_form.footnotes, dup_form.footnotes) | |||
-- If translit is being generated, and there's manual translit associated with either form, we | |||
-- need to generate any missing translits and combine them, taking into account the fact that a | |||
-- translit value may actually be a list of translits (particularly with the existing form if we | |||
-- already combined an item with manual translit into it). | |||
if props.include_translit and form_value_transliterable(existing_form.form) and ( | |||
existing_form.translit or dup_form.translit) then | |||
local combined_translit | |||
if not existing_form.translit then | |||
combined_translit = { | |||
props_transliterate(props, m_links.remove_links(existing_form.form)) | |||
} | |||
elseif type(existing_form.translit) == "string" then | |||
combined_translit = {existing_form.translit} | |||
else | |||
combined_translit = existing_form.translit | |||
end | |||
local dup_form_translit = dup_form.translit | |||
if not dup_form_translit then | |||
-- dup_form.form is the same as existing_form.form (see assert above), but this is | |||
-- defensive programming in case that changes | |||
dup_form_translit = {props_transliterate(props, m_links.remove_links(dup_form.form))} | |||
elseif type(dup_form_translit) == "string" then | |||
dup_form_translit = {dup_form_translit} | |||
end | |||
for _, translit in ipairs(dup_form_translit) do | |||
m_table.insertIfNot(combined_translit, translit) | |||
end | |||
existing_form.translit = combined_translit | |||
end | |||
if props.combine_metadata_during_dedup then | |||
props.combine_metadata_during_dedup { | |||
slot = slot, | |||
existing_form = existing_form, | |||
existing_form_pos = pos, | |||
dup_form = dup_form, | |||
dup_form_pos = i, | |||
} | |||
end | |||
end | |||
m_table.insertIfNot(deduped_formobjs, form, { | |||
key = function(form) return form.form end, | |||
combine = combine_forms, | |||
}) | |||
end | |||
formobjs = deduped_formobjs | |||
end | end | ||
for i, form in ipairs( | |||
local | -- Add acceleration info to form objects. | ||
for i, form in ipairs(formobjs) do | |||
local formval = form.form | |||
if not form_value_transliterable(formval) then | |||
form.formval_for_link = formval | |||
form.formval_old_style_footnote_symbol = "" | |||
else | else | ||
local | local formval_for_link, formval_old_style_footnote_symbol | ||
if props.allow_footnote_symbols then | if props.allow_footnote_symbols then | ||
formval_for_link, formval_old_style_footnote_symbol = | |||
require(table_tools_module).get_notes(formval) | |||
else | else | ||
formval_for_link = formval | |||
formval_old_style_footnote_symbol = "" | |||
end | end | ||
-- remove redundant link surrounding entire form | -- remove redundant link surrounding entire form | ||
formval_for_link = export.remove_redundant_links(formval_for_link) | |||
form.formval_for_link = formval_for_link | |||
form.formval_old_style_footnote_symbol = formval_old_style_footnote_symbol | |||
-------------------- Compute the accelerator object. ----------------- | -------------------- Compute the accelerator object. ----------------- | ||
| Line 1,394: | Line 1,993: | ||
-- Check if form still has links; if so, don't add accelerators because the resulting entries will | -- Check if form still has links; if so, don't add accelerators because the resulting entries will | ||
-- be wrong. | -- be wrong. | ||
if | if props_lemmas[1] and not form.no_accel and accel_tag_set ~= "-" and | ||
not rfind(formval_for_link, "%[%[") then | |||
-- If there is more than one form or more than one lemma, things get tricky. Often, there are | -- If there is more than one form or more than one lemma, things get tricky. Often, there are | ||
-- the same number of forms as lemmas, e.g. for Ukrainian [[зимовий]] "wintry; winter (rel.)", | -- the same number of forms as lemmas, e.g. for Ukrainian [[зимовий]] "wintry; winter (rel.)", | ||
| Line 1,413: | Line 2,013: | ||
-- form was generated by which lemma, but that is significant extra work for little gain. | -- form was generated by which lemma, but that is significant extra work for little gain. | ||
local first_lemma, last_lemma | local first_lemma, last_lemma | ||
if # | if #formobjs >= #props_lemmas then | ||
-- More forms than lemmas. Try to even out the forms assigned per lemma. | -- More forms than lemmas. Try to even out the forms assigned per lemma. | ||
local forms_per_lemma = math.ceil(# | local forms_per_lemma = math.ceil(#formobjs / #props_lemmas) | ||
first_lemma = math.floor((i - 1) / forms_per_lemma) + 1 | first_lemma = math.floor((i - 1) / forms_per_lemma) + 1 | ||
last_lemma = first_lemma | last_lemma = first_lemma | ||
else | else | ||
-- More lemmas than forms. Try to even out the lemmas assigned per form. | -- More lemmas than forms. Try to even out the lemmas assigned per form. | ||
local lemmas_per_form = math.ceil(# | local lemmas_per_form = math.ceil(#props_lemmas / #formobjs) | ||
first_lemma = (i - 1) * lemmas_per_form + 1 | first_lemma = (i - 1) * lemmas_per_form + 1 | ||
last_lemma = math.min(first_lemma + lemmas_per_form - 1, # | last_lemma = math.min(first_lemma + lemmas_per_form - 1, #props_lemmas) | ||
end | end | ||
local accel_lemma, accel_lemma_translit | local accel_lemma, accel_lemma_translit | ||
if first_lemma == last_lemma then | if first_lemma == last_lemma then | ||
accel_lemma, accel_lemma_translit = | accel_lemma, accel_lemma_translit = | ||
fetch_formval_and_translit(props_lemmas[first_lemma], "remove links") | |||
else | else | ||
accel_lemma = {} | accel_lemma = {} | ||
accel_lemma_translit = {} | accel_lemma_translit = {} | ||
for j=first_lemma, last_lemma do | for j=first_lemma, last_lemma do | ||
local this_lemma = | local this_lemma = props_lemmas[j] | ||
local this_accel_lemma, this_accel_lemma_translit = | local this_accel_lemma, this_accel_lemma_translit = | ||
fetch_formval_and_translit(props_lemmas[j], "remove links") | |||
-- Do not use table.insert() especially for the translit because it may be nil and in | -- Do not use table.insert() especially for the translit because it may be nil and in | ||
-- that case we want gaps in the array. | -- that case we want gaps in the array. | ||
accel_lemma[j - first_lemma + 1] = this_accel_lemma | accel_lemma[j - first_lemma + 1] = this_accel_lemma | ||
accel_lemma_translit[j - first_lemma + 1] = this_accel_lemma_translit | accel_lemma_translit[j - first_lemma + 1] = this_accel_lemma_translit | ||
end | |||
end | |||
local accel_translit | |||
if props.include_translit and form.translit then | |||
if type(form.translit) == "table" then | |||
accel_translit = table.concat(form.translit, ", ") | |||
elseif type(form.translit) == "string" then | |||
accel_translit = form.translit | |||
else | |||
error(("Internal error: For slot '%s', form translit is not a table or string: %s"): | |||
format(slot, dump(accel_translit))) | |||
end | end | ||
end | end | ||
accel_obj = { | accel_obj = { | ||
form = | form = accel_tag_set, | ||
translit = | translit = accel_translit, | ||
lemma = accel_lemma, | lemma = accel_lemma, | ||
lemma_translit = props.include_translit and accel_lemma_translit or nil, | lemma_translit = props.include_translit and accel_lemma_translit or nil, | ||
| Line 1,451: | Line 2,065: | ||
accel_obj = props.transform_accel_obj(slot, form, accel_obj) | accel_obj = props.transform_accel_obj(slot, form, accel_obj) | ||
end | end | ||
form.accel_obj = accel_obj | |||
end | |||
end | |||
-- Format the form objects into a string for insertion into the table. | |||
local formatted_forms | |||
if props.format_forms then | |||
formatted_forms = props.format_forms { | |||
slot = slot, | |||
forms = forms, | |||
footnote_obj = footnote_obj, | |||
} | |||
end | |||
if not formatted_forms then | |||
-- Default algorithm: Separate form values and translits and concatenate on separate lines. | |||
-- Form values have already been deduplicated but we may need to deduplicate translits (this happens | |||
-- e.g. in Arabic where there may be multiple ways of spelling a hamza in the Arabic script but only | |||
-- one way in transliteration). | |||
local formval_spans = {} | |||
local tr_spans = {} | |||
for i, form in ipairs(formobjs) do | |||
local link | |||
if props.generate_link then | if props.generate_link then | ||
link = props.generate_link | link = props.generate_link { | ||
slot = slot, | |||
pos = i, | |||
form = form, | |||
footnote_obj = footnote_obj, | |||
} | |||
end | |||
if not link then | |||
link = m_links.full_link { | |||
lang = props.lang, term = form.formval_for_link, tr = "-", accel = form.accel_obj | |||
} .. form.formval_old_style_footnote_symbol .. | |||
export.get_footnote_text(form.footnotes, footnote_obj) | |||
end | |||
formval_spans[i] = link | |||
if props.include_translit then | |||
-- Note that if there is an attached old-style footnote symbol, we transliterate it. | |||
local translits = form.translit or props_transliterate(props, m_links.remove_links(form.form)) | |||
if type(translits) == "string" then | |||
translits = {translits} | |||
end | |||
for _, tr in ipairs(translits) do | |||
local tr_for_tag, tr_old_style_footnote_symbol | |||
if props.allow_footnote_symbols then | |||
tr_for_tag, tr_old_style_footnote_symbol = require(table_tools_module).get_notes(tr) | |||
else | |||
tr_for_tag = tr | |||
tr_old_style_footnote_symbol = "" | |||
end | |||
m_table.insertIfNot(tr_spans, { | |||
tr_for_tag = tr_for_tag, | |||
old_style_footnote_symbol = tr_old_style_footnote_symbol, | |||
footnotes = form.footnotes, | |||
}, { | |||
key = function(trobj) return trobj.tr_for_tag end, | |||
combine = function(tr, newtr) | |||
-- Combine footnotes. | |||
tr.footnotes = export.combine_footnotes(tr.footnotes, newtr.footnotes) | |||
tr.old_style_footnote_symbol = tr.old_style_footnote_symbol .. | |||
newtr.old_style_footnote_symbol | |||
end, | |||
}) | |||
end | |||
end | end | ||
end | end | ||
for i, tr_span in ipairs(tr_spans) do | |||
local formatted_tr | |||
if props.format_tr then | |||
formatted_tr = props.format_tr { | |||
slot = slot, | |||
pos = i, | |||
tr_for_tag = tr_span.tr_for_tag, | |||
old_style_footnote_symbol = tr_span.old_style_footnote_symbol, | |||
if | footnotes = tr_span.footnotes, | ||
footnote_obj = footnote_obj, | |||
} | |||
end | |||
if not formatted_tr then | |||
formatted_tr = require(script_utilities_module).tag_translit(tr_span.tr_for_tag, props.lang, | |||
"default", " style=\"color: var(--wikt-palette-grey-8,#888);\"") .. tr_span.old_style_footnote_symbol .. | |||
export.get_footnote_text(tr_span.footnotes, footnote_obj) | |||
end | end | ||
tr_spans[i] = formatted_tr | |||
end | end | ||
if props.join_spans then | |||
formatted_forms = props.join_spans { | |||
slot = slot, | |||
formval_spans = formval_spans, | |||
tr_spans = tr_spans, | |||
} | |||
end | end | ||
if not formatted_forms then | |||
if | local formval_span = table.concat(formval_spans, ", ") | ||
local tr_span | |||
if #tr_spans > 0 then | |||
tr_span = table.concat(tr_spans, ", ") | |||
end | |||
if tr_span then | |||
formatted_forms = formval_span .. "<br />" .. tr_span | |||
else | |||
formatted_forms = formval_span | |||
end | |||
end | end | ||
end | end | ||
formtable[slot] = formatted_forms | |||
else | else | ||
formtable[slot] = "—" | |||
end | end | ||
end | end | ||
| Line 1,511: | Line 2,184: | ||
if props.footnotes then | if props.footnotes then | ||
for _, note in ipairs(props.footnotes) do | for _, note in ipairs(props.footnotes) do | ||
local symbol, entry = | local symbol, entry = require(table_tools_module).get_initial_notes(note) | ||
table.insert(all_notes, symbol .. entry) | table.insert(all_notes, symbol .. entry) | ||
end | end | ||
end | end | ||
formtable.footnote = table.concat(all_notes, "<br />") | |||
end | end | ||
return export | return export | ||