45,340
edits
(Created page with "local export = {} local m_links = require("Module:links") local m_string_utilities = require("Module:string utilities") local m_table = require("Module:table") local rsplit...") |
No edit summary |
||
Line 4: | Line 4: | ||
local m_string_utilities = require("Module:string utilities") | local m_string_utilities = require("Module:string utilities") | ||
local m_table = require("Module:table") | local m_table = require("Module:table") | ||
local put = require("Module:parse utilities") | |||
local rsplit = mw.text.split | local rsplit = mw.text.split | ||
Line 16: | Line 17: | ||
end | end | ||
-- Tracking hook. Tracking is stubbed out here: `page` (the tracking key, e.g. "combining-footnotes") is ignored and
-- the function unconditionally returns true.
local function track(page)
	return true
end
local footnote_abbrevs = { | local footnote_abbrevs = { | ||
Line 30: | Line 34: | ||
------------------------------------------------------------------------------------------------------------ | |||
-- PARSING CODE -- | |||
------------------------------------------------------------------------------------------------------------ | |||
-- | |||
-- FIXME: Callers of this code should call [[Module:parse utilities]] directly.
--
export.parse_balanced_segment_run = put.parse_balanced_segment_run | |||
export.parse_multi_delimiter_balanced_segment_run = put.parse_multi_delimiter_balanced_segment_run | |||
export.split_alternating_runs = put.split_alternating_runs | |||
-- FIXME: Older entry point. Call split_alternating_runs_and_frob_raw_text() directly.
-- Like split_alternating_runs() but strips spaces from both ends of the odd-numbered elements (only in
-- odd-numbered runs if preserve_splitchar is given). Effectively we leave alone the footnotes and splitchars
-- themselves, but otherwise strip extraneous spaces. Spaces in the middle of an element are also left alone.
--
-- This is a thin wrapper: it records a tracking key and then delegates to
-- put.split_alternating_runs_and_frob_raw_text(), passing put.strip_spaces as the function applied to the raw text.
function export.split_alternating_runs_and_strip_spaces(segment_runs, splitchar, preserve_splitchar)
	track("split-alternating-runs-and-strip-spaces")
	return put.split_alternating_runs_and_frob_raw_text(segment_runs, splitchar, put.strip_spaces, preserve_splitchar)
end
-- | ------------------------------------------------------------------------------------------------------------ | ||
-- | -- INFLECTION CODE -- | ||
-- | ------------------------------------------------------------------------------------------------------------ | ||
-- | |||
--[=[ | --[=[ | ||
The following code is used in building up the inflection of terms in inflected languages, where a term can potentially | |||
consist of several inflected words, each surrounded by fixed text, and a given slot (e.g. accusative singular) of a | |||
given word can potentially consist of multiple possible inflected forms. In addition, each form may be associated with | |||
The | a manual transliteration and/or a list of footnotes (or qualifiers, in the case of headword lines). The following | ||
terminology is helpful to understand: | |||
* An `inflection dimension` is a particular dimension over which a term may be inflected, such as case, number, gender, | |||
as | person, tense, mood, voice, aspect, etc. | ||
* A `term` is a word or multiword expression that can be inflected. A multiword term may in turn consist of several | |||
single-word inflected terms with surrounding fixed text. A term belongs to a particular `part of speech` (e.g. noun, | |||
verb, adjective, etc.). | |||
* A `slot` is a particular combination of inflection dimensions. An example might be "accusative plural" for a noun, | |||
or "first-person singular present indicative" for a verb. Slots are named in a language-specific fashion. For | |||
example, the slot "accusative plural" might have a name "accpl", while "first-person singular present indicative" | |||
might be variously named "pres1s", "pres_ind_1_sg", etc. Each slot is filled with zero or more `forms`. | |||
* A `form` is a particular inflection of a slot for a particular term. Forms are described using `form objects`, which | |||
are Lua objects taking the form {form="FORM", translit="MANUAL_TRANSLIT", footnotes={"FOOTNOTE", "FOOTNOTE", ...}}. | |||
FORM is a `form string` specifying the value of the form itself. MANUAL_TRANSLIT specifies optional manual | |||
transliteration for the form, in case (a) the form string is in a different script; and (b) either the form's | |||
automatic transliteration is incorrect and needs to be overridden, or the language of the term has no automatic | |||
transliteration (e.g. in the case of Persian and Hebrew). FOOTNOTE is a footnote to be attached to the form in | |||
question, and should be e.g. "[archaic]" or "[only in the meaning 'to succeed (an officeholder)']", i.e. the string | |||
must be surrounded by brackets and should begin with a lowercase letter and not end in a period/full stop. When such | |||
footnotes are converted to actual footnotes in a table of inflected forms, the brackets will be removed, the first | |||
letter will be capitalized and a period/full stop will be added to the end. (However, when such footnotes are used | |||
as qualifiers in headword lines, only the brackets will be removed, with no capitalization or final period.) Note | |||
that only FORM is mandatory. | |||
* The `lemma` is the particular form of a term under which the term is entered into a dictionary. For example, for | |||
verbs, it is most commonly the infinitive, but this differs for some languages: e.g. Latin, Greek and Bulgarian use | |||
the first-person singular present indicative (active voice in the case of Latin and Greek); Sanskrit and Macedonian | |||
use the third-person singular present indicative (active voice in the case of Sanskrit); Hebrew and Arabic use the | |||
third-person singular masculine past (aka "perfect"); etc. For nouns, the lemma form is most commonly the nominative | |||
singular, but e.g. for Old French it is the objective singular and for Sanskrit it is the root. | |||
]=] | ]=] | ||
-- Split a bracketed footnote such as "[archaic]" or "[!archaic]" into its modifier and its text.
-- Returns two values: the modifier (at most one of the characters `!`, `*` or `+` directly after the opening bracket,
-- or the empty string if there is none) and the remaining text between the brackets. Throws an error if `footnote`
-- is not surrounded by brackets.
-- NOTE(review): `rmatch` is defined outside this excerpt; presumably a string-match function -- confirm.
local function extract_footnote_modifiers(footnote)
	local footnote_mods, footnote_without_mods = rmatch(footnote, "^%[([!*+]?)(.*)%]$")
	if not footnote_mods then
		error("Saw footnote '" .. footnote .. "' not surrounded by brackets")
	end
	return footnote_mods, footnote_without_mods
end
-- Insert a form (an object of the form {form=FORM, translit=MANUAL_TRANSLIT, footnotes=FOOTNOTES}) | -- Insert a form (an object of the form {form=FORM, translit=MANUAL_TRANSLIT, footnotes=FOOTNOTES}) into a list of such | ||
-- forms. If the form is already present, the footnotes of the existing and new form might be combined (specifically, | |||
-- footnotes in the new form beginning with ! will be combined). | |||
function export.insert_form_into_list(list, form) | function export.insert_form_into_list(list, form) | ||
-- Don't do anything if the form object or the form inside it is nil. This simplifies | -- Don't do anything if the form object or the form inside it is nil. This simplifies | ||
Line 282: | Line 118: | ||
-- Form already present; maybe combine footnotes. | -- Form already present; maybe combine footnotes. | ||
if form.footnotes then | if form.footnotes then | ||
-- Check to see if there are existing footnotes with *; if so, remove them. | |||
if listform.footnotes then | |||
local any_footnotes_with_asterisk = false | |||
for _, footnote in ipairs(listform.footnotes) do | |||
local footnote_mods, _ = extract_footnote_modifiers(footnote) | |||
if rfind(footnote_mods, "%*") then | |||
any_footnotes_with_asterisk = true | |||
break | |||
end | |||
end | |||
if any_footnotes_with_asterisk then | |||
local filtered_footnotes = {} | |||
for _, footnote in ipairs(listform.footnotes) do | |||
local footnote_mods, _ = extract_footnote_modifiers(footnote) | |||
if not rfind(footnote_mods, "%*") then | |||
table.insert(filtered_footnotes, footnote) | |||
end | |||
end | |||
if #filtered_footnotes > 0 then | |||
listform.footnotes = filtered_footnotes | |||
else | |||
listform.footnotes = nil | |||
end | |||
end | |||
end | |||
-- The behavior here has changed; track cases where the old behavior might | -- The behavior here has changed; track cases where the old behavior might | ||
-- be needed by adding ! to the footnote. | -- be needed by adding ! to the footnote. | ||
track("combining-footnotes") | |||
local any_footnotes_with_bang = false | local any_footnotes_with_bang = false | ||
for _, footnote in ipairs(form.footnotes) do | for _, footnote in ipairs(form.footnotes) do | ||
if rfind( | local footnote_mods, _ = extract_footnote_modifiers(footnote) | ||
if rfind(footnote_mods, "[!+]") then | |||
any_footnotes_with_bang = true | any_footnotes_with_bang = true | ||
break | break | ||
Line 299: | Line 163: | ||
for _, footnote in ipairs(form.footnotes) do | for _, footnote in ipairs(form.footnotes) do | ||
local already_seen = false | local already_seen = false | ||
if rfind( | local footnote_mods, footnote_without_mods = extract_footnote_modifiers(footnote) | ||
if rfind(footnote_mods, "[!+]") then | |||
for _, existing_footnote in ipairs(listform.footnotes) do | for _, existing_footnote in ipairs(listform.footnotes) do | ||
local existing_footnote_mods, existing_footnote_without_mods = | |||
extract_footnote_modifiers(existing_footnote) | |||
if existing_footnote_without_mods == footnote_without_mods then | |||
already_seen = true | already_seen = true | ||
break | break | ||
Line 349: | Line 216: | ||
-- Identity mapping function: returns `form` and `translit` unchanged. Usable wherever a function taking a form and
-- manual translit and returning the same pair is expected.
function export.identity(form, translit)
	return form, translit
end
-- single value (the new form) or two values (the new form and new manual translit). | |||
-- Apply mapping function `fun` to a form given as a plain string `str`.
-- If `str` is "?", return "?" without calling `fun`. Otherwise call `fun(str)`; if it returns a second (translit)
-- value, return a form object {form=..., translit=...}, else return just the new form value.
local function call_map_function_str(str, fun)
	if str == "?" then
		return "?"
	end
	local newform, newtranslit = fun(str)
	if newtranslit then
		-- A bare string cannot carry translit, so promote the result to a form object.
		return {form = newform, translit = newtranslit}
	else
		return newform
	end
end
-- Apply mapping function `fun` to a form object `form` ({form=..., translit=..., footnotes=...}).
-- If the form string is "?", return a new object {form = "?"} carrying only the original footnotes (no translit),
-- without calling `fun`. Otherwise call `fun(form.form, form.translit)` and return a new form object built from its
-- two return values plus the original footnotes. The input object is not modified; the footnotes list is shared.
local function call_map_function_obj(form, fun)
	if form.form == "?" then
		return {form = "?", footnotes = form.footnotes}
	end
	local newform, newtranslit = fun(form.form, form.translit)
	return {form = newform, translit = newtranslit, footnotes = form.footnotes}
end
-- Map a function over the form values in FORMS (a list of form objects of the form {form=FORM, | |||
-- translit=MANUAL_TRANSLIT, footnotes=FOOTNOTES}). If the input form is "?", it is preserved on output and the | |||
-- function is not called. The function is called with two arguments, the original form and manual translit; if manual | |||
-- translit isn't relevant, it's fine to declare the function with only one argument. The return value is either a | |||
-- single value (the new form) or two values (the new form and new manual translit). The footnotes (if any) from the | |||
-- input form objects are preserved on output. Uses insert_form_into_list() to insert the resulting form objects into | |||
-- the returned list in case two different forms map to the same thing. | |||
function export.map_forms(forms, fun) | function export.map_forms(forms, fun) | ||
if not forms then | if not forms then | ||
Line 362: | Line 256: | ||
local retval = {} | local retval = {} | ||
for _, form in ipairs(forms) do | for _, form in ipairs(forms) do | ||
export.insert_form_into_list(retval, call_map_function_obj(form, fun)) | |||
export.insert_form_into_list(retval, | |||
end | end | ||
return retval | return retval | ||
Line 370: | Line 262: | ||
-- Map a list-returning function over the form values in FORMS (a list of objects of the form | -- Map a list-returning function over the form values in FORMS (a list of form objects of the form {form=FORM, | ||
-- translit=MANUAL_TRANSLIT, footnotes=FOOTNOTES}). If the input form is "?", it is preserved on output and the | |||
-- function is not called. The function is called with two arguments, the original form and manual translit; if manual | |||
-- translit isn't relevant, it's fine to declare the function with only one argument. The return value is either a list | |||
-- | -- of forms or a list of form objects of the form {form=FORM, translit=MANUAL_TRANSLIT}. The footnotes (if any) from | ||
-- | -- the input form objects are preserved on output. Uses insert_form_into_list() to insert the resulting form objects | ||
-- into the returned list in case two different forms map to the same thing. | |||
function export.flatmap_forms(forms, fun) | function export.flatmap_forms(forms, fun) | ||
if not forms then | if not forms then | ||
Line 383: | Line 275: | ||
local retval = {} | local retval = {} | ||
for _, form in ipairs(forms) do | for _, form in ipairs(forms) do | ||
local funret = fun(form.form, form.translit) | local funret = form.form == "?" and {"?"} or fun(form.form, form.translit) | ||
for _, fr in ipairs(funret) do | for _, fr in ipairs(funret) do | ||
local newform | local newform | ||
Line 398: | Line 290: | ||
-- Map a function over the form values in FORMS (a single string, a | -- Map a function over the form values in FORMS (a single string, a form object of the form {form=FORM, | ||
-- translit=MANUAL_TRANSLIT, footnotes=FOOTNOTES}, or a list of either of the previous two types). If the input form is | |||
-- "?", it is preserved on output and the function is not called. If FIRST_ONLY is given and FORMS is a list, only map | |||
-- over the first element. Return value is of the same form as FORMS, unless FORMS is a string and the function returns | |||
-- arguments, the original form and manual translit; if manual translit isn't relevant, | -- both form and manual translit (in which case the return value is a form object). The function is called with two | ||
-- arguments, the original form and manual translit; if manual translit isn't relevant, it's fine to declare the | |||
-- function with only one argument. The return value is either a single value (the new form) or two values (the new | |||
function export.map_form_or_forms(forms, | -- form and new manual translit). The footnotes (if any) from the input form objects are preserved on output. | ||
if forms | -- | ||
-- FIXME: This function is used only in [[Module:bg-verb]] and should be moved into that module. | |||
function export.map_form_or_forms(forms, fun, first_only) | |||
if not forms then | |||
return nil | return nil | ||
elseif type(forms) == "string" then | elseif type(forms) == "string" then | ||
return | return call_map_function_str(forms, fun) | ||
elseif forms.form then | elseif forms.form then | ||
return call_map_function_obj(forms, fun) | |||
else | else | ||
local retval = {} | local retval = {} | ||
for i, form in ipairs(forms) do | for i, form in ipairs(forms) do | ||
if first_only then | if first_only then | ||
return export.map_form_or_forms(form, | return export.map_form_or_forms(form, fun) | ||
end | end | ||
table.insert(retval, export.map_form_or_forms(form, | table.insert(retval, export.map_form_or_forms(form, fun)) | ||
end | end | ||
return retval | return retval | ||
Line 429: | Line 320: | ||
-- Combine two sets of footnotes. If either is nil, just return the other, and if both are nil, | -- Combine two sets of footnotes. If either is nil, just return the other, and if both are nil, return nil. | ||
function export.combine_footnotes(notes1, notes2) | function export.combine_footnotes(notes1, notes2) | ||
if not notes1 and not notes2 then | if not notes1 and not notes2 then | ||
Line 449: | Line 339: | ||
-- Expand a given footnote (as specified by the user, including the surrounding brackets) | -- Expand a given footnote (as specified by the user, including the surrounding brackets) into the form to be inserted | ||
-- into the final generated table. If `no_parse_refs` is not given and the footnote is a reference (of the form | |||
function export. | -- '[ref:...]'), parse and return the specified reference(s). Two values are returned, `footnote_string` (the expanded | ||
local notetext = | -- footnote, or nil if the second value is present) and `references` (a list of objects of the form | ||
if not notetext then | -- {text = TEXT, name = NAME, group = GROUP} if the footnote is a reference and `no_parse_refs` is not given, otherwise | ||
-- nil). Unless `return_raw` is given, the returned footnote string is capitalized and has a final period added. | |||
function export.expand_footnote_or_references(note, return_raw, no_parse_refs) | |||
local _, notetext = extract_footnote_modifiers(note) | |||
if not no_parse_refs and notetext:find("^ref:") then | |||
-- a reference | |||
notetext = rsub(notetext, "^ref:", "") | |||
local parsed_refs = require("Module:references").parse_references(notetext) | |||
for i, ref in ipairs(parsed_refs) do | |||
if type(ref) == "string" then | |||
parsed_refs[i] = {text = ref} | |||
end | |||
end | |||
return nil, parsed_refs | |||
end | end | ||
if footnote_abbrevs[notetext] then | if footnote_abbrevs[notetext] then | ||
Line 474: | Line 376: | ||
notetext = table.concat(split_notes) | notetext = table.concat(split_notes) | ||
end | end | ||
return m_string_utilities.ucfirst(notetext) .. "." | return return_raw and notetext or m_string_utilities.ucfirst(notetext) .. "." | ||
end | end | ||
-- Older entry point. Equivalent to expand_footnote_or_references(note, false, "no parse refs"), i.e. `return_raw` is
-- not set (so the footnote string is returned capitalized with a final period) and `no_parse_refs` is set (so a
-- footnote of the form '[ref:...]' is not parsed as a reference).
-- FIXME: Convert all uses to use expand_footnote_or_references() instead.
function export.expand_footnote(note)
	track("expand-footnote")
	return export.expand_footnote_or_references(note, false, "no parse refs")
end
-- Split a list of footnotes into qualifiers and references.
-- Each footnote is expanded with expand_footnote_or_references(footnote, "return raw"). If that yields a list of
-- references, they are appended to the returned reference list; otherwise the raw expanded footnote string is
-- appended to the returned qualifier list. Returns two values, `quals` and `refs`; each is nil if nothing of that
-- kind was seen. Returns nil if `footnotes` is nil.
function export.fetch_headword_qualifiers_and_references(footnotes)
	if not footnotes then
		return nil
	end
	local quals, refs
	for _, qualifier in ipairs(footnotes) do
		local this_footnote, this_refs = export.expand_footnote_or_references(qualifier, "return raw")
		if this_refs then
			-- Accumulate into our own list rather than adopting the first returned list, so that later appends
			-- never modify a table owned by the reference parser.
			refs = refs or {}
			for _, ref in ipairs(this_refs) do
				table.insert(refs, ref)
			end
		else
			quals = quals or {}
			table.insert(quals, this_footnote)
		end
	end
	return quals, refs
end
-- Combine a form (either a string or a table) with additional footnotes, possibly replacing the form string and/or
-- translit in the process. Normally called in one of two ways:
-- (1) combine_form_and_footnotes(FORM_OBJ, ADDL_FOOTNOTES, NEW_FORM, NEW_TRANSLIT) where FORM_OBJ is an existing
--     form object (a table of the form {form = FORM, translit = TRANSLIT, footnotes = FOOTNOTES, ...}); ADDL_FOOTNOTES
--     is either nil, a single string (a footnote) or a list of footnotes; NEW_FORM is either nil or the new form
--     string to substitute; and NEW_TRANSLIT is either nil or the new translit string to substitute.
-- (2) combine_form_and_footnotes(FORM_STRING, FOOTNOTES), where FORM_STRING is a string and FOOTNOTES is either nil,
--     a single string (a footnote) or a list of footnotes.
--
-- Normally a form object (a table of the form {form = FORM, translit = TRANSLIT, footnotes = FOOTNOTES, ...}) is
-- returned, preserving as many properties as possible from any existing form object in FORM_OR_FORM_OBJ. Do the
-- minimal amount of work; e.g. if FORM_OR_FORM_OBJ is a form object and ADDL_FOOTNOTES, NEW_FORM and NEW_TRANSLIT are
-- all nil, the same object as passed in is returned. Under no circumstances is the existing form object side-effected.
--
-- NOTE: When ADDL_FOOTNOTES, NEW_FORM and NEW_TRANSLIT are all nil, FORM_OR_FORM_OBJ is returned as-is, so a string
-- input comes back as a string, not as a form object.
function export.combine_form_and_footnotes(form_or_form_obj, addl_footnotes, new_form, new_translit)
	if type(addl_footnotes) == "string" then
		addl_footnotes = {addl_footnotes}
	end
	if not addl_footnotes and not new_form and not new_translit then
		return form_or_form_obj
	end
	if type(form_or_form_obj) == "string" then
		new_form = new_form or form_or_form_obj
		return {form = new_form, translit = new_translit, footnotes = addl_footnotes}
	end
	-- Work on a shallow copy so the caller's form object is never modified.
	form_or_form_obj = m_table.shallowcopy(form_or_form_obj)
	if new_form then
		form_or_form_obj.form = new_form
	end
	if new_translit then
		form_or_form_obj.translit = new_translit
	end
	if addl_footnotes then
		form_or_form_obj.footnotes = export.combine_footnotes(form_or_form_obj.footnotes, addl_footnotes)
	end
	return form_or_form_obj
end
Line 506: | Line 455: | ||
-- Combine a single form (either a string or object {form = FORM, footnotes = FOOTNOTES, ...}) or a list of same
-- along with footnotes and return a list of forms where each returned form is an object
-- {form = FORM, footnotes = FOOTNOTES, ...}. If WORD_OR_WORDS is already in general list form and FOOTNOTES is nil,
-- return WORD_OR_WORDS directly rather than copying it. FOOTNOTES may be nil, a single string (a footnote) or a list
-- of footnotes.
function export.convert_to_general_list_form(word_or_words, footnotes)
	if type(footnotes) == "string" then
		footnotes = {footnotes}
	end
	if type(word_or_words) == "string" then
		return {{form = word_or_words, footnotes = footnotes}}
	elseif word_or_words.form then
		return {export.combine_form_and_footnotes(word_or_words, footnotes)}
	elseif not footnotes then
		-- Check if already in general list form and return directly if so.
		local must_convert = false
		for _, form in ipairs(word_or_words) do
			if type(form) == "string" then
				must_convert = true
				break
			end
		end
		if not must_convert then
			return word_or_words
		end
	end
	-- A list containing strings and/or needing footnotes attached: build a fresh list of form objects.
	local retval = {}
	for _, form in ipairs(word_or_words) do
		if type(form) == "string" then
			table.insert(retval, {form = form, footnotes = footnotes})
		else
			table.insert(retval, export.combine_form_and_footnotes(form, footnotes))
		end
	end
	return retval
end
Line 536: | Line 500: | ||
function export.add_forms(forms, slot, stems, endings, combine_stem_ending, | -- Combine `stems` and `endings` and store into slot `slot` of form table `forms`. Either of `stems` and `endings` can | ||
-- be nil, a single string, a list of strings, a form object or a list of form objects. The combination of a given stem | |||
-- and ending happens using `combine_stem_ending`, which takes two parameters (stem and ending, each a string) and | |||
-- returns one value (a string). If manual transliteration is present in either `stems` or `endings`, `lang` (a | |||
-- language object) along with `combine_stem_ending_tr` (a function like `combine_stem_ending` for combining manual | |||
-- transliteration) must be given. `footnotes`, if specified, is a list of additional footnotes to attach to the | |||
-- resulting inflections (stem+ending combinations). The resulting inflections are inserted into the form table using | |||
-- export.insert_form(), in case of duplication. | |||
function export.add_forms(forms, slot, stems, endings, combine_stem_ending, lang, combine_stem_ending_tr, footnotes) | |||
if stems == nil or endings == nil then | if stems == nil or endings == nil then | ||
return | return | ||
end | |||
local function combine(stem, ending) | |||
if stem == "?" or ending == "?" then | |||
return "?" | |||
end | |||
return combine_stem_ending(stem, ending) | |||
end | end | ||
if type(stems) == "string" and type(endings) == "string" then | if type(stems) == "string" and type(endings) == "string" then | ||
export.insert_form(forms, slot, {form = | export.insert_form(forms, slot, {form = combine(stems, endings), footnotes = footnotes}) | ||
elseif type(stems) == "string" and is_table_of_strings(endings) then | elseif type(stems) == "string" and is_table_of_strings(endings) then | ||
for _, ending in ipairs(endings) do | for _, ending in ipairs(endings) do | ||
export.insert_form(forms, slot, {form = | export.insert_form(forms, slot, {form = combine(stems, ending), footnotes = footnotes}) | ||
end | end | ||
else | else | ||
Line 563: | Line 540: | ||
footnotes = ending.footnotes | footnotes = ending.footnotes | ||
end | end | ||
local new_form = | local new_form = combine(stem.form, ending.form) | ||
local new_translit | local new_translit | ||
if stem.translit or ending.translit then | if new_form ~= "?" and (stem.translit or ending.translit) then | ||
if not lang or not combine_stem_ending_tr then | if not lang or not combine_stem_ending_tr then | ||
error("Internal error: With manual translit, 'lang' and 'combine_stem_ending_tr' must be passed to 'add_forms'") | error("Internal error: With manual translit, 'lang' and 'combine_stem_ending_tr' must be passed to 'add_forms'") | ||
end | end | ||
local stem_tr = stem.translit or lang:transliterate(m_links.remove_links(stem.form)) | local stem_tr = stem.translit or (lang:transliterate(m_links.remove_links(stem.form))) | ||
local ending_tr = ending.translit or lang:transliterate(m_links.remove_links(ending.form)) | local ending_tr = ending.translit or (lang:transliterate(m_links.remove_links(ending.form))) | ||
new_translit = combine_stem_ending_tr(stem_tr, ending_tr) | new_translit = combine_stem_ending_tr(stem_tr, ending_tr) | ||
end | end | ||
Line 585: | Line 562: | ||
return | return | ||
elseif #sets_of_forms == 1 then | elseif #sets_of_forms == 1 then | ||
local formset = | local formset = export.convert_to_general_list_form(sets_of_forms[1], footnotes) | ||
export.insert_forms(forms, slot, formset) | export.insert_forms(forms, slot, formset) | ||
elseif #sets_of_forms == 2 then | elseif #sets_of_forms == 2 then | ||
Line 620: | Line 597: | ||
local function parse_before_or_post_text(props, text, segments, lemma_is_last) | local function parse_before_or_post_text(props, text, segments, lemma_is_last) | ||
-- Call parse_balanced_segment_run() to keep multiword links together. | -- Call parse_balanced_segment_run() to keep multiword links together. | ||
local bracketed_runs = | local bracketed_runs = put.parse_balanced_segment_run(text, "[", "]") | ||
-- Split on space or hyphen. Use preserve_splitchar so we know whether the separator was | -- Split normally on space or hyphen (but customizable). Use preserve_splitchar so we know whether the separator was | ||
-- a space or hyphen. | -- a space or hyphen. | ||
local space_separated_groups = | local space_separated_groups | ||
is_suffix and " " or "[ %-]", "preserve splitchar") | if props.split_bracketed_runs_into_words then | ||
space_separated_groups = props.split_bracketed_runs_into_words(bracketed_runs) | |||
end | |||
if not space_separated_groups then | |||
-- If the text begins with a hyphen, include the hyphen in the set of allowed characters | |||
-- for an inflected segment. This way, e.g. conjugating "-ir" is treated as a regular | |||
-- -ir verb rather than a hyphen + irregular [[ir]]. | |||
local is_suffix = rfind(text, "^%-") | |||
local split_pattern = is_suffix and " " or "[ %-]" | |||
space_separated_groups = put.split_alternating_runs(bracketed_runs, split_pattern, "preserve splitchar") | |||
end | |||
local parsed_components = {} | local parsed_components = {} | ||
Line 669: | Line 652: | ||
if not parsed_components_translit[j] then | if not parsed_components_translit[j] then | ||
parsed_components_translit[j] = | parsed_components_translit[j] = | ||
props.lang:transliterate(m_links.remove_links(parsed_component)) | (props.lang:transliterate(m_links.remove_links(parsed_component))) | ||
end | end | ||
end | end | ||
Line 748: | Line 731: | ||
word_specs = {} | word_specs = {} | ||
} | } | ||
if not disable_allow_default_indicator | if not disable_allow_default_indicator then | ||
if #segments == 1 then | |||
if props.allow_default_indicator then | |||
table.insert(segments, "<>") | |||
table.insert(segments, "") | |||
elseif props.angle_brackets_omittable then | |||
segments[1] = "<" .. segments[1] .. ">" | |||
table.insert(segments, 1, "") | |||
table.insert(segments, "") | |||
end | |||
end | |||
end | end | ||
-- Loop over every other segment. The even-numbered segments are angle-bracket specs while | -- Loop over every other segment. The even-numbered segments are angle-bracket specs while | ||
Line 783: | Line 774: | ||
local parsed_alternants = {} | local parsed_alternants = {} | ||
local alternant_text = rmatch(alternant, "^%(%((.*)%)%)$") | local alternant_text = rmatch(alternant, "^%(%((.*)%)%)$") | ||
local segments = | local segments = put.parse_balanced_segment_run(alternant_text, "<", ">") | ||
local comma_separated_groups = | local comma_separated_groups = put.split_alternating_runs(segments, "%s*,%s*") | ||
local alternant_spec = {alternants = {}} | local alternant_spec = {alternants = {}} | ||
for _, comma_separated_group in ipairs(comma_separated_groups) do | for _, comma_separated_group in ipairs(comma_separated_groups) do | ||
Line 807: | Line 798: | ||
`props` is an object specifying properties used during parsing, as follows: | `props` is an object specifying properties used during parsing, as follows: | ||
{ | { | ||
parse_indicator_spec = FUNCTION_TO_PARSE_AN_INDICATOR_SPEC (required | parse_indicator_spec = FUNCTION_TO_PARSE_AN_INDICATOR_SPEC (required), | ||
lang = LANG_OBJECT, | |||
transliterate_respelling = FUNCTION_TO_TRANSLITERATE_RESPELLING, | |||
split_bracketed_runs_into_words = nil or FUNCTION_TO_SPLIT_BRACKETED_RUNS_INTO_WORDS, | |||
lang = LANG_OBJECT | allow_default_indicator = BOOLEAN_OR_NIL, | ||
transliterate_respelling = FUNCTION_TO_TRANSLITERATE_RESPELLING | angle_brackets_omittable = BOOLEAN_OR_NIL, | ||
allow_blank_lemma = BOOLEAN_OR_NIL, | |||
allow_blank_lemma = BOOLEAN_OR_NIL | |||
} | } | ||
`parse_indicator_spec` is a required function that takes two arguments, a string surrounded by angle brackets and the | |||
lemma, and should return a word_spec object containing properties describing the indicators inside of the angle | |||
brackets. | |||
`lang` is the language object for the language in question; only needed if manual translit or respelling may be present | |||
using //. | |||
`transliterate_respelling` is a function that is only needed if respelling is allowed in place of manual translit after | |||
//. It takes one argument, the respelling or translit, and should return the transliteration of any respelling but | |||
return any translit unchanged. | |||
`split_bracketed_runs_into_words` is an optional function to split the passed-in text into words. It is used, for | |||
example, to determine what text constitutes a word when followed by an angle-bracket spec, i.e. what the lemma to be | |||
inflected is vs. surrounding fixed text. It takes one argument, the result of splitting the original text on brackets, | |||
and should return alternating runs of words and split characters, or nil to apply the default algorithm. Specifically, | |||
the value passed in is the result of calling `parse_balanced_segment_run(text, "[", "]")` from | |||
[[Module:parse utilities]] on the original text, and the default version of this function calls | |||
`split_alternating_runs(bracketed_runs, pattern, "preserve splitchar")`, where `bracketed_runs` is the value passed in | |||
and `pattern` splits on either spaces or hyphens (unless the text begins with a hyphen, in which case splitting is only | |||
on spaces, so that suffixes can be inflected). | |||
`allow_default_indicator` should be true if an empty indicator in angle brackets <> can be omitted and should be | |||
automatically added at the end of the multiword text (if no alternants) or at the end of each alternant (if alternants | |||
present). | |||
`angle_brackets_omittable` should be true if angle brackets can be omitted around a non-empty indicator in the presence | |||
of a blank lemma. In this case, if the combined indicator spec has no angle brackets, they will be added around the | |||
indicator (or around all indicators, if alternants are present). This only makes sense when `allow_blank_lemma` is | |||
specified. | |||
`allow_blank_lemma` should be true if a blank lemma is allowed; in such a case, the calling function should | |||
substitute a default lemma, typically taken from the pagename. | |||
The return value is a table of the form | The return value is a table of the form | ||
Line 847: | Line 861: | ||
]=] | ]=] | ||
function export.parse_inflected_text(text, props) | function export.parse_inflected_text(text, props) | ||
if props.angle_brackets_omittable and not props.allow_blank_lemma then | |||
error("If 'angle_brackets_omittable' is specified, so should 'allow_blank_lemma'") | |||
end | |||
local alternant_multiword_spec = {alternant_or_word_specs = {}} | local alternant_multiword_spec = {alternant_or_word_specs = {}} | ||
local alternant_segments = m_string_utilities.capturing_split(text, "(%(%(.-%)%))") | local alternant_segments = m_string_utilities.capturing_split(text, "(%(%(.-%)%))") | ||
Line 852: | Line 869: | ||
for i = 1, #alternant_segments do | for i = 1, #alternant_segments do | ||
if i % 2 == 1 then | if i % 2 == 1 then | ||
local segments = | local segments = put.parse_balanced_segment_run(alternant_segments[i], "<", ">") | ||
-- Disable allow_default_indicator if alternants are present and we're processing | -- Disable allow_default_indicator if alternants are present and we're processing | ||
-- the non-alternant text. Otherwise we will try to treat the non-alternant text | -- the non-alternant text. Otherwise we will try to treat the non-alternant text | ||
Line 875: | Line 892: | ||
alternant_multiword_spec.post_text_translit = last_post_text_translit | alternant_multiword_spec.post_text_translit = last_post_text_translit | ||
return alternant_multiword_spec | return alternant_multiword_spec | ||
end | end | ||
Line 905: | Line 911: | ||
local function append_forms(props, formtable, slot, forms, before_text, before_text_no_links, | |||
--[=[ | |||
Subfunction of export.inflect_multiword_or_alternant_multiword_spec(). This is used in building up the inflections of | |||
multiword expressions. The basic purpose of this function is to append a set of forms representing the inflections of | |||
a given inflected term in a given slot onto the existing forms for that slot. Given a multiword expression potentially | |||
consisting of several inflected terms along with fixed text in between, we work iteratively from left to right, adding | |||
the new forms onto the existing ones. Normally, all combinations of new and existing forms are created, meaning if | |||
there are M existing forms and N new ones, we will end up with M*N forms. However, some of these combinations can be | |||
rejected using the variant mechanism (see the description of get_variants below). | |||
Specifically, `formtable` is a table of per-slot forms, where the key is a slot and the value is a list of form objects | |||
(objects of the form {form=FORM, translit=MANUAL_TRANSLIT, footnotes=FOOTNOTES}). `slot` is the slot in question. | |||
`forms` specifies the forms to be appended onto the existing forms, and is likewise a list of form objects. `props` | |||
is the same as in export.inflect_multiword_or_alternant_multiword_spec(). `before_text` is the fixed text that goes | |||
before the forms to be added. `before_text_no_links` is the same as `before_text` but with any links (i.e. hyperlinks | |||
of the form [[TERM]] or [[TERM|DISPLAY]]) converted into raw terms using remove_links() in [[Module:links]], and | |||
`before_text_translit` is optional manual translit of `before_text_no_links`. | |||
Note that the value "?" in a form is "infectious" in that if either the existing or new form has the value "?", the | |||
resulting combination will also be "?". This allows "?" to be used to mean "unknown". | |||
]=] | |||
local function append_forms(props, formtable, slot, forms, before_text, before_text_no_links, before_text_translit) | |||
if not forms then | if not forms then | ||
return | return | ||
Line 919: | Line 945: | ||
-- Reject combination due to non-matching variant codes. | -- Reject combination due to non-matching variant codes. | ||
else | else | ||
local new_form | local new_form | ||
local new_translit | local new_translit | ||
if old_form.translit or before_text_translit or form.translit then | if old_form.form == "?" or form.from == "?" then | ||
new_form = "?" | |||
else | |||
new_form = old_form.form .. before_text .. form.form | |||
if old_form.translit or before_text_translit or form.translit then | |||
if not props.lang then | |||
error("Internal error: If manual translit is given, 'props.lang' must be set") | |||
end | |||
if not before_text_translit then | |||
before_text_translit = (props.lang:transliterate(before_text_no_links)) or "" | |||
end | |||
local old_translit = old_form.translit or (props.lang:transliterate(m_links.remove_links(old_form.form))) or "" | |||
local translit = form.translit or (props.lang:transliterate(m_links.remove_links(form.form))) or "" | |||
new_translit = old_translit .. before_text_translit .. translit | |||
end | end | ||
end | end | ||
local new_footnotes = export.combine_footnotes(old_form.footnotes, form.footnotes) | local new_footnotes = export.combine_footnotes(old_form.footnotes, form.footnotes) | ||
Line 942: | Line 973: | ||
--[=[ | |||
Top-level inflection function. Create the inflections of a noun, verb, adjective or similar. `multiword_spec` is as | |||
returned by `parse_inflected_text` and describes the properties of the term to be inflected, including all the | |||
user-provided inflection specifications (e.g. the number, gender, conjugation/declension/etc. of each word) and the | |||
surrounding text. `props` indicates how to do the actual inflection (see below). The resulting inflected forms are | |||
stored into the `.forms` property of `multiword_spec`. This property holds a table whose keys are slots (i.e. ID's | |||
of individual inflected forms, such as "pres_1sg" for the first-person singular present indicative tense of a verb) | |||
and whose values are lists of the form { form = FORM, translit = MANUAL_TRANSLIT_OR_NIL, footnotes = FOOTNOTE_LIST_OR_NIL}, | |||
where FORM is a string specifying the value of the form (e.g. "ouço" for the first-person singular present indicative | |||
of the Portuguese verb [[ouvir]]); MANUAL_TRANSLIT_OR_NIL is the corresponding manual transliteration if needed (i.e. | |||
if the form is in a non-Latin script and the automatic transliteration is incorrect or unavailable), otherwise nil; | |||
and FOOTNOTE_LIST_OR_NIL is a list of footnotes to be attached to the form, or nil for no footnotes. Note that | |||
currently footnotes must be surrounded by brackets, e.g. "[archaic]", and should not begin with a capital letter or end | |||
with a period. (Conversion from "[archaic]" to "Archaic." happens automatically.) | |||
This function has no return value, but modifies `multiword_spec` in-place, adding the `forms` table as described above. | |||
After calling this function, call show_forms() on the `forms` table to convert the forms and footnotes given in this | |||
table to strings suitable for display. | |||
`props` is an object specifying properties used during inflection, as follows: | |||
{ | |||
slot_list = {{"SLOT", "ACCEL"}, {"SLOT", "ACCEL"}, ...}, | |||
slot_table = {SLOT = "ACCEL", SLOT = "ACCEL", ...}, | |||
skip_slot = FUNCTION_TO_SKIP_A_SLOT or nil, | |||
lang = LANG_OBJECT or nil, | |||
inflect_word_spec = FUNCTION_TO_INFLECT_AN_INDIVIDUAL_WORD, | |||
get_variants = FUNCTION_TO_RETURN_A_VARIANT_CODE or nil, | |||
include_user_specified_links = BOOLEAN, | |||
} | |||
`slot_list` is a list of two-element lists of slots and associated accelerator inflections. SLOT is arbitrary but | |||
should correspond with slot names as generated by `inflect_word_spec`. ACCEL is the corresponding accelerator form; | |||
e.g. if SLOT is "pres_1sg", ACCEL might be "1|s|pres|ind". ACCEL is actually unused during inflection, but is used | |||
during show_forms(), which takes the same `slot_list` as a property upon input. | |||
`slot_table` is a table mapping slots to associated accelerator inflections and serves the same function as | |||
`slot_list`. Only one of `slot_list` or `slot_table` must be given. For new code it is preferable to use `slot_list` | |||
because this allows you to control the order of processing slots, which may occasionally be important. | |||
`skip_slot` is a function of one argument, a slot name, and should return a boolean indicating whether to skip the | |||
given slot during inflection. It can be used, for example, to skip singular slots if the overall term being inflected | |||
is plural-only, and vice-versa. | |||
`lang` is a language object. This is only used to generate manual transliteration. If the language is written in the | |||
Latin script or manual transliteration cannot be specified in the input to parse_inflected_text(), this can be omitted. | |||
(Manual transliteration is allowed if the `lang` object is set in the `props` passed to parse_inflected_text().) | |||
`inflect_word_spec` is the function to do the actual inflection. It is passed a single argument, which is a WORD_SPEC | |||
object describing the word to be inflected and the user-provided inflection specifications. It is exactly the same as | |||
was returned by the `parse_indicator_spec` function provided in the `props` sent on input to `parse_inflected_text`, but | |||
has additional fields describing the word to be inflected and the surrounding text, as follows: | |||
{ | |||
lemma = "LEMMA", | |||
before_text = "TEXT-BEFORE-WORD", | |||
before_text_no_links = "TEXT-BEFORE-WORD-NO-LINKS", | |||
before_text_translit = "MANUAL-TRANSLIT-OF-TEXT-BEFORE-WORD" or nil (if no manual translit or respelling was specified in the before-text) | |||
-- Fields as described in parse_indicator_spec() | |||
... | |||
} | |||
Here LEMMA is the word to be inflected as specified by the user (including any links if so given), and the | |||
`before_text*` fields describe the raw text preceding the word to be inflected. Any other fields in this object are as | |||
set by `parse_inflected_text`, and describe things like the gender, number, conjugation/declension, etc. as specified | |||
by the user in the <...> spec following the word to be inflected. | |||
`inflect_word_spec` should initialize the `.forms` property of the passed-in WORD_SPEC object to the inflected forms of | |||
the word in question. The value of this property is a table of the same format as the `.forms` property that is | |||
ultimately generated by inflect_multiword_or_alternant_multiword_spec() and described above near the top of this | |||
documentation: i.e. a table whose keys are slots and whose values are lists of the form | |||
{ form = FORM, translit = MANUAL_TRANSLIT_OR_NIL, footnotes = FOOTNOTE_LIST_OR_NIL}. | |||
`get_variants` is either nil or a function of one argument (a string, the value of an individual form). The purpose of | |||
this function is to ensure that in a multiword term where a given slot has more than one possible variant, the final | |||
output has only parallel variants in it. For example, feminine nouns and adjectives in Russian have two possible | |||
endings, one typically in -ой (-oj) and the other in -ою (-oju). If we have a feminine adjective-noun combination (or | |||
a hyphenated feminine noun-noun combination, or similar), and we don't specify `get_variants`, we'll end up with four | |||
values for the instrumental singular: one where both adjective and noun end in -ой, one where both end in -ою, and | |||
two where one of the words ends in -ой and the other in -ою. In general if we have N words each with K variants, we'll | |||
end up with an explosion of K^N possibilities. `get_variants` avoids this by returning a variant code (an arbitrary | |||
string) for each variant. If two words each have a non-empty variant code, and the variant codes disagree, the | |||
combination will be rejected. If `get_variants` is not provided, or either variant code is an empty string, or the | |||
variant codes agree, the combination is allowed. | |||
The recommended way to use `get_variants` is as follows: | |||
1. During inflection in `inflect_word_spec`, add a special character or string to each of the variants generated for a | |||
given slot when there is more than one. (As an optimization, do this only when there is more than one word being | |||
inflected.) Special Unicode characters can be used for this purpose, e.g. U+FFF0, U+FFF1, ..., U+FFFD, which have | |||
no meaning in Unicode. | |||
2. Specify `get_variants` as a function that pulls out and returns the special character(s) or string included in the | |||
variant forms. | |||
3. When calling show_forms(), specify a `canonicalize` function that removes the variant code character(s) or string | |||
from each form before converting to the display form. | |||
See [[Module:hi-verb]] and [[Module:hi-common]] for an example of doing this in a generalized fashion. (Look for | |||
add_variant_codes(), get_variants() and remove_variant_codes().) | |||
`include_user_specified_links`, if given, ensures that user-specified links in the raw text surrounding a given word | |||
are preserved in the output. If omitted or set to false, such links will be removed and the whole multiword expression | |||
will be linked. | |||
]=] | |||
function export.inflect_multiword_or_alternant_multiword_spec(multiword_spec, props) | function export.inflect_multiword_or_alternant_multiword_spec(multiword_spec, props) | ||
multiword_spec.forms = {} | multiword_spec.forms = {} | ||
Line 949: | Line 1,080: | ||
if word_spec.alternants then | if word_spec.alternants then | ||
inflect_alternants(word_spec, props) | inflect_alternants(word_spec, props) | ||
else | else | ||
props.inflect_word_spec(word_spec) | props.inflect_word_spec(word_spec) | ||
Line 977: | Line 1,106: | ||
end) | end) | ||
end | end | ||
end | end | ||
Line 1,005: | Line 1,129: | ||
seen_notes = {}, | seen_notes = {}, | ||
noteindex = 1, | noteindex = 1, | ||
seen_refs = {}, | |||
} | } | ||
end | end | ||
Line 1,014: | Line 1,139: | ||
end | end | ||
local link_indices = {} | local link_indices = {} | ||
local all_refs = {} | |||
for _, footnote in ipairs(form.footnotes) do | for _, footnote in ipairs(form.footnotes) do | ||
footnote = export. | local refs | ||
local this_noteindex = footnote_obj.seen_notes[footnote] | footnote, refs = export.expand_footnote_or_references(footnote) | ||
if footnote then | |||
local this_noteindex = footnote_obj.seen_notes[footnote] | |||
if not this_noteindex then | |||
-- Generate a footnote index. | |||
this_noteindex = footnote_obj.noteindex | |||
footnote_obj.noteindex = footnote_obj.noteindex + 1 | |||
table.insert(footnote_obj.notes, '<sup style="color: red">' .. this_noteindex .. '</sup>' .. footnote) | |||
footnote_obj.seen_notes[footnote] = this_noteindex | |||
end | |||
m_table.insertIfNot(link_indices, this_noteindex) | |||
end | |||
if refs then | |||
for _, ref in ipairs(refs) do | |||
if not ref.name then | |||
local this_refhash = footnote_obj.seen_refs[ref.text] | |||
if not this_refhash then | |||
-- Different text needs to have different auto-generated names, globally across the entire page, | |||
-- including across different invocations of {{it-verb}} or {{it-conj}}. The easiest way to accomplish | |||
-- this is to use a message-digest hashing function. It does not have to be cryptographically secure | |||
-- (MD5 is insecure); it just needs to have low probability of collisions. | |||
this_refhash = mw.hash.hashValue("md5", ref.text) | |||
footnote_obj.seen_refs[ref.text] = this_refhash | |||
end | |||
ref.autoname = this_refhash | |||
end | |||
-- I considered using "n" as the default group rather than nothing, to more clearly distinguish regular | |||
-- footnotes from references, but this requires referencing group "n" as <references group="n"> below, | |||
-- which is non-obvious. | |||
m_table.insertIfNot(all_refs, ref) | |||
end | |||
end | end | ||
end | end | ||
table.sort(link_indices) | table.sort(link_indices) | ||
return '<sup style="color: red">' .. table.concat(link_indices, ",") .. '</sup>' | local function sort_refs(r1, r2) | ||
-- FIXME, we are now sorting on an arbitrary hash. Should we keep track of the order we | |||
-- saw the autonamed references and sort on that? | |||
if r1.autoname and r2.name then | |||
return true | |||
elseif r1.name and r2.autoname then | |||
return false | |||
elseif r1.name and r2.name then | |||
return r1.name < r2.name | |||
else | |||
return r1.autoname < r2.autoname | |||
end | |||
end | |||
table.sort(all_refs, sort_refs) | |||
for i, ref in ipairs(all_refs) do | |||
local refargs = {name = ref.name or ref.autoname, group = ref.group} | |||
all_refs[i] = mw.getCurrentFrame():extensionTag("ref", ref.text, refargs) | |||
end | |||
local link_text | |||
if #link_indices > 0 then | |||
link_text = '<sup style="color: red">' .. table.concat(link_indices, ",") .. '</sup>' | |||
else | |||
link_text = "" | |||
end | |||
local ref_text = table.concat(all_refs) | |||
if link_text ~= "" and ref_text ~= "" then | |||
return link_text .. "<sup>,</sup>" .. ref_text | |||
else | |||
return link_text .. ref_text | |||
end | |||
end | |||
-- Add links around the words of `term`.
--
-- `term` is the text to link. An empty string or a single space is returned unchanged, as is any text that already
-- contains a link (i.e. contains "[["). Otherwise, if the text contains whitespace or punctuation, [[Module:headword]]
-- is consulted: if it considers the text multiword, its add_multiword_links() is used to link the individual words.
-- If the result still contains no link and `multiword_only` is not set, the entire term is wrapped in [[...]].
--
-- `multiword_only`, if true, suppresses that final whole-term wrapping, so that only multiword terms get linked.
--
-- Returns the (possibly) linked text.
function export.add_links(term, multiword_only)
	if term == "" or term == " " then
		return term
	end
	if not term:find("%[%[") then
		-- Optimization: avoid loading [[Module:headword]] on single-word terms.
		if rfind(term, "[%s%p]") then
			local m_headword = require("Module:headword")
			if m_headword.head_is_multiword(term) then
				term = m_headword.add_multiword_links(term)
			end
		end
		-- add_multiword_links() may already have inserted links; only wrap the whole term if it did not.
		if not multiword_only and not term:find("%[%[") then
			term = "[[" .. term .. "]]"
		end
	end
	return term
end
-- Strip a link that wraps the entire term, i.e. turn "[[foo]]" into "foo". Text is left alone unless it consists of
-- exactly one [[...]] link whose contents have no brackets and no "|" (so two-part links [[foo|bar]] and text with
-- several links are not touched).
function export.remove_redundant_links(term)
	local whole_term_link = "^%[%[([^%[%]|]*)%]%]$"
	return rsub(term, whole_term_link, "%1")
end
-- Run a linking function over every `before_text` and `post_text` field in `alternant_multiword_spec`, replacing each
-- field with the linked result. This is meant for inflection modules that preserve links in multiword lemmas and put
-- links inside non-lemma forms rather than letting the entire form become a single link.
--
-- The fields visited are: `before_text` of each top-level alternant-or-word spec; for each alternant inside an
-- alternant spec, `before_text` of each of its word specs plus the alternant's own `post_text`; and finally the
-- top-level `post_text`.
--
-- If `remember_original` is set, the pre-linking value of each field FIELD is first saved under the key
-- "user_specified_FIELD" on the same object, so the original user-specified spec can be reconstructed later.
-- `add_links` is a function of one argument that links a piece of text; it defaults to `export.add_links`.
function export.add_links_to_before_and_after_text(alternant_multiword_spec, remember_original, add_links)
	local link_fn = add_links or export.add_links

	-- Link spec[key] in place, optionally stashing the unlinked text first.
	local function relink(spec, key)
		local unlinked = spec[key]
		if remember_original then
			spec["user_specified_" .. key] = unlinked
		end
		spec[key] = link_fn(unlinked)
	end

	for _, top_level_spec in ipairs(alternant_multiword_spec.alternant_or_word_specs) do
		relink(top_level_spec, "before_text")
		local alternants = top_level_spec.alternants
		if alternants then
			for _, alternant in ipairs(alternants) do
				for _, inner_word_spec in ipairs(alternant.word_specs) do
					relink(inner_word_spec, "before_text")
				end
				relink(alternant, "post_text")
			end
		end
	end

	relink(alternant_multiword_spec, "post_text")
end
Line 1,038: | Line 1,270: | ||
{ | { | ||
lang = LANG_OBJECT, | lang = LANG_OBJECT, | ||
lemmas = | lemmas = {"LEMMA", "LEMMA", ...}, | ||
slot_list = {{"SLOT", "ACCEL"}, {"SLOT", "ACCEL"}, ...}, | |||
slot_table = {SLOT = "ACCEL", SLOT = "ACCEL", ...}, | |||
include_translit = BOOLEAN, | include_translit = BOOLEAN, | ||
create_footnote_obj = FUNCTION_TO_CREATE_FOOTNOTE_OBJ, | create_footnote_obj = nil or FUNCTION_TO_CREATE_FOOTNOTE_OBJ, | ||
canonicalize = FUNCTION_TO_CANONICALIZE_EACH_FORM, | canonicalize = nil or FUNCTION_TO_CANONICALIZE_EACH_FORM, | ||
transform_link = FUNCTION_TO_TRANSFORM_EACH_LINK, | transform_link = nil or FUNCTION_TO_TRANSFORM_EACH_LINK, | ||
join_spans = FUNCTION_TO_JOIN_SPANS, | transform_accel_obj = nil or FUNCTION_TO_TRANSFORM_EACH_ACCEL_OBJ, | ||
join_spans = nil or FUNCTION_TO_JOIN_SPANS, | |||
allow_footnote_symbols = BOOLEAN, | allow_footnote_symbols = BOOLEAN, | ||
footnotes = | footnotes = nil or {"EXTRA_FOOTNOTE", "EXTRA_FOOTNOTE", ...}, | ||
} | } | ||
`lemmas` is the list of lemmas, used in the accelerators. | `lemmas` is the list of lemmas, used in the accelerators. | ||
`slot_list` is a list of two-element lists of slots and associated accelerator inflections. | |||
`slot_table` is a table mapping slots to associated accelerator inflections. | `slot_list` is a list of two-element lists of slots and associated accelerator inflections. SLOT should correspond to | ||
slots generated during inflect_multiword_or_alternant_multiword_spec(). ACCEL is the corresponding accelerator form; | |||
e.g. if SLOT is "pres_1sg", ACCEL might be "1|s|pres|ind". ACCEL is used in generating entries for accelerator support | |||
(see [[WT:ACCEL]]). | |||
`slot_table` is a table mapping slots to associated accelerator inflections and serves the same function as | |||
`slot_list`. Only one of `slot_list` or `slot_table` must be given. For new code it is preferable to use `slot_list` | |||
because this allows you to control the order of processing slots, which may occasionally be important. | |||
`include_translit`, if given, causes transliteration to be included in the generated strings. | |||
`create_footnote_obj` is an optional function of no arguments to create the footnote object used to track footnotes; | `create_footnote_obj` is an optional function of no arguments to create the footnote object used to track footnotes; | ||
see export.create_footnote_obj(). Customizing it is useful to prepopulate the footnote table using | |||
export.get_footnote_text(). | |||
`canonicalize` is an optional function of one argument (a form) to canonicalize each form before processing; it can return nil | |||
`canonicalize` is an optional function of one argument (a form) to canonicalize each form before processing; it can | |||
`transform_link` is an optional function to transform a linked form prior to further processing | return nil for no change. The most common purpose of this function is to remove variant codes from the form. See the | ||
documentation for inflect_multiword_or_alternant_multiword_spec() for a description of variant codes and their purpose. | |||
`join_spans` is an optional function of three arguments (slot, orig_spans, tr_spans) where the spans in question are after | `generate_link` is an optional function to generate the link text for a given form. It is passed four arguments (slot, | ||
form, origentry, accel_obj) where `slot` is the slot being processed, `form` is the specific form object to generate a | |||
link for, `origentry` is the actual text to convert into a link, and `accel_obj` is the accelerator object to include | |||
in the link. If nil is returned, the default algorithm will apply, which is to call | |||
the links, and superscripted. In this case, `footnotes` should be a list of footnotes (preceded by footnote symbols, which are | `full_link{lang = lang, term = origentry, tr = "-", accel = accel_obj}` from [[Module:links]]. This can be used e.g. to | ||
superscripted). These footnotes are combined with any footnotes found in the forms and placed into `forms.footnotes`. | customize the appearance of the link. Note that the link should not include any transliteration because it is handled | ||
specially (all transliterations are grouped together). | |||
`transform_link` is an optional function to transform a linked form prior to further processing. It is passed three | |||
arguments (slot, link, link_tr) and should return the transformed link (or if translit is active, it should return two | |||
values, the transformed link and corresponding translit). It can return nil for no change. `transform_link` is used, | |||
for example, in [[Module:de-verb]], where it adds the appropriate pronoun ([[ich]], [[du]], etc.) to finite verb forms, | |||
and adds [[dass]] before special subordinate-clause variants of finite verb forms. | |||
`transform_accel_obj` is an optional function of three arguments (slot, formobj, accel_obj) to transform the default | |||
constructed accelerator object in `accel_obj` into an object that should be passed to full_link() in [[Module:links]]. | |||
It should return the new accelerator object, or nil for no acceleration. It can destructively modify the accelerator | |||
object passed in. NOTE: This is called even when the passed-in `accel_obj` is nil (either because the accelerator in | |||
`slot_table` or `slot_list` is "-", or because the form contains links, or because for some reason there is no lemma | |||
available). It is used e.g. in [[Module:es-verb]] and [[Module:pt-verb]] to replace the form with the original verb | |||
spec used to generate the verb, so that the accelerator code can generate the appropriate call to {{es-verb form of}} | |||
or {{pt-verb form of}}, which computes the inflections, instead of directly listing the inflections. | |||
`join_spans` is an optional function of three arguments (slot, orig_spans, tr_spans) where the spans in question are | |||
after linking and footnote processing. It should return a string (the joined spans) or nil for the default algorithm, | |||
which separately joins the orig_spans and tr_spans with commas and puts a newline between them. | |||
`allow_footnote_symbols`, if given, causes any footnote symbols attached to forms (e.g. numbers, asterisk) to be | |||
separated off, placed outside the links, and superscripted. In this case, `footnotes` should be a list of footnotes | |||
(preceded by footnote symbols, which are superscripted). These footnotes are combined with any footnotes found in the | |||
forms and placed into `forms.footnotes`. This mechanism of specifying footnotes is provided for backward compatibility | |||
with certain existing inflection modules and should not be used for new modules. Instead, use the regular footnote | |||
mechanism specified using the `footnotes` property attached to each form object. | |||
]=] | ]=] | ||
function export.show_forms(forms, props) | function export.show_forms(forms, props) | ||
local footnote_obj = props.create_footnote_obj and props.create_footnote_obj() or export.create_footnote_obj() | local footnote_obj = props.create_footnote_obj and props.create_footnote_obj() or export.create_footnote_obj() | ||
-- Extract the form and manual transliteration from a lemma or form entry.
-- `entry` is either a plain value (normally a string) that is itself the form, in which case there is no
-- translit, or a table whose `form` and `translit` fields are read. If `remove_links` is truthy, the form is
-- passed through m_links.remove_links() before being returned.
-- Returns two values: the form and the translit (nil for non-table entries or tables without `translit`).
local function fetch_form_and_translit(entry, remove_links)
	-- Default to treating the entry itself as the form; tables override both values below.
	local form, translit = entry, nil
	if type(entry) == "table" then
		form, translit = entry.form, entry.translit
	end
	if remove_links then
		form = m_links.remove_links(form)
	end
	return form, translit
end
local lemma_forms = {} | local lemma_forms = {} | ||
for _, lemma in ipairs(props.lemmas) do | for _, lemma in ipairs(props.lemmas) do | ||
local lemma_form, _ = fetch_form_and_translit(lemma) | |||
m_table.insertIfNot(lemma_forms, lemma_form) | |||
end | end | ||
forms.lemma = #lemma_forms > 0 and table.concat(lemma_forms, ", ") or mw.title.getCurrentTitle().text | forms.lemma = #lemma_forms > 0 and table.concat(lemma_forms, ", ") or mw.title.getCurrentTitle().text | ||
local function get_accelerator_for_form(slot, formobj, i, origentry, accel_form) | |||
end | |||
local m_table_tools = require("Module:table tools") | local m_table_tools = require("Module:table tools") | ||
Line 1,096: | Line 1,371: | ||
local tr_spans = {} | local tr_spans = {} | ||
local orignotes, trnotes = "", "" | local orignotes, trnotes = "", "" | ||
if type(formvals) ~= "table" then | |||
error("Internal error: For slot '" .. slot .. "', expected table but saw " .. mw.dumpObject(formvals)) | |||
end | |||
for i, form in ipairs(formvals) do | for i, form in ipairs(formvals) do | ||
local orig_text = props.canonicalize and props.canonicalize(form.form) or form.form | local orig_text = props.canonicalize and props.canonicalize(form.form) or form.form | ||
Line 1,110: | Line 1,388: | ||
-- remove redundant link surrounding entire form | -- remove redundant link surrounding entire form | ||
origentry = export.remove_redundant_links(origentry) | origentry = export.remove_redundant_links(origentry) | ||
-------------------- Compute the accelerator object. ----------------- | |||
local accel_obj | local accel_obj | ||
-- | -- Check if form still has links; if so, don't add accelerators because the resulting entries will | ||
-- be wrong. | |||
if | if props.lemmas[1] and not form.no_accel and accel_form ~= "-" and not rfind(origentry, "%[%[") then | ||
-- If there is more than one form or more than one lemma, things get tricky. Often, there are | |||
-- the same number of forms as lemmas, e.g. for Ukrainian [[зимовий]] "wintry; winter (rel.)", | |||
-- which can be stressed зимо́вий or зимови́й with corresponding masculine/neuter genitive | |||
-- singulars зимо́вого or зимово́го etc. In this case, usually the forms and lemmas match up so | |||
-- we do this. If there are different numbers of forms than lemmas, it's usually one lemma | |||
-- against several forms e.g. Ukrainian [[міст]] "bridge" with genitive singular мо́сту or моста́ | |||
-- (accent patterns b or c) or [[ложка|ло́жка]] "spoon" with nominative plural ло́жки or ложки́ | |||
-- (accent patterns a or c). Here, we should assign the same lemma to both forms. The opposite | |||
-- can happen, e.g. [[черга]] "turn, queue" stressed че́рга or черга́ with nominative plural only | |||
-- че́рги (accent patterns a or d). Here we should assign both lemmas to the same form. In more | |||
-- complicated cases, with more than one lemma and form and different numbers of each, we try | |||
-- to align them as much as possible, e.g. if there are somehow eight forms and three lemmas, | |||
-- we assign lemma 1 to forms 1-3, lemma 2 to forms 4-6 and lemma 3 to forms 7 and 8, and | |||
-- conversely if there are somehow three forms and eight lemmas. This is likely to be wrong, but | |||
-- (a) there's unlikely to be a single algorithm that works in all such circumstances, and (b) | |||
-- these cases are vanishingly rare or nonexistent. Properly we should try to remember which | |||
-- form was generated by which lemma, but that is significant extra work for little gain. | |||
local first_lemma, last_lemma | |||
if #formvals >= #props.lemmas then | |||
-- More forms than lemmas. Try to even out the forms assigned per lemma. | |||
local forms_per_lemma = math.ceil(#formvals / #props.lemmas) | |||
first_lemma = math.floor((i - 1) / forms_per_lemma) + 1 | |||
last_lemma = first_lemma | |||
else | |||
-- More lemmas than forms. Try to even out the lemmas assigned per form. | |||
local lemmas_per_form = math.ceil(#props.lemmas / #formvals) | |||
first_lemma = (i - 1) * lemmas_per_form + 1 | |||
last_lemma = math.min(first_lemma + lemmas_per_form - 1, #props.lemmas) | |||
end | |||
local accel_lemma, accel_lemma_translit | |||
if first_lemma == last_lemma then | |||
accel_lemma, accel_lemma_translit = fetch_form_and_translit(props.lemmas[first_lemma], "remove links") | |||
else | |||
accel_lemma = {} | |||
accel_lemma_translit = {} | |||
for j=first_lemma, last_lemma do | |||
local this_lemma = props.lemmas[j] | |||
local this_accel_lemma, this_accel_lemma_translit = fetch_form_and_translit(props.lemmas[j], "remove links") | |||
-- Do not use table.insert() especially for the translit because it may be nil and in | |||
-- that case we want gaps in the array. | |||
accel_lemma[j - first_lemma + 1] = this_accel_lemma | |||
accel_lemma_translit[j - first_lemma + 1] = this_accel_lemma_translit | |||
end | |||
end | |||
accel_obj = { | accel_obj = { | ||
form = accel_form, | form = accel_form, | ||
Line 1,122: | Line 1,447: | ||
} | } | ||
end | end | ||
link = m_links.full_link{lang = props.lang, term = origentry, tr = "-", accel = accel_obj} | -- Postprocess if requested. | ||
if props.transform_accel_obj then | |||
accel_obj = props.transform_accel_obj(slot, form, accel_obj) | |||
end | |||
-------------------- Generate link to form. ----------------- | |||
if props.generate_link then | |||
link = props.generate_link(slot, form, origentry, accel_obj) | |||
end | |||
link = link or m_links.full_link{lang = props.lang, term = origentry, tr = "-", accel = accel_obj} | |||
end | end | ||
local tr = props.include_translit and (form.translit or props.lang:transliterate(m_links.remove_links(orig_text))) or nil | local tr = props.include_translit and (form.translit or (props.lang:transliterate(m_links.remove_links(orig_text)))) or nil | ||
local trentry | local trentry | ||
if props.allow_footnote_symbols and tr then | if props.allow_footnote_symbols and tr then | ||
Line 1,184: | Line 1,519: | ||
-- Given a list of form objects (each of which is a table of the form
-- {form=FORM, translit=MANUAL_TRANSLIT, footnotes=FOOTNOTES}), concatenate them into a
-- FORM//TRANSLIT,FORM//TRANSLIT,... string (or FORM,FORM,... if no translit), replacing embedded | signs
-- with <!>. The `footnotes` field is not read. Returns nil if `forms` is nil or false.
function export.concat_forms_in_slot(forms)
	if not forms then
		return nil
	end
	local new_vals = {}
	for _, v in ipairs(forms) do
		local form = v.form
		if v.translit then
			form = form .. "//" .. v.translit
		end
		-- The replacement runs after the translit is appended, so | signs in the translit are escaped too.
		table.insert(new_vals, rsub(form, "|", "<!>"))
	end
	return table.concat(new_vals, ",")
end
return export | return export |