Module:affix: Difference between revisions

Undo revision 380101 by Sware (talk)
Tag: Undo
No edit summary
 
(13 intermediate revisions by the same user not shown)
Line 6: Line 6:
local m_str_utils = require("Module:string utilities")
local m_str_utils = require("Module:string utilities")
local m_table = require("Module:table")
local m_table = require("Module:table")
local en_utilities_module = "Module:en-utilities"
local etymology_module = "Module:etymology"
local etymology_module = "Module:etymology"
local pron_qualifier_module = "Module:pron qualifier"
local pron_qualifier_module = "Module:pron qualifier"
local scripts_module = "Module:scripts"
local scripts_module = "Module:scripts"
local utilities_module = "Module:utilities"
local utilities_module = "Module:utilities"
-- Export this so the category code in [[Module:category tree/poscatboiler/data/terms by etymology]] can access it.
-- Export this so the category code in [[Module:category tree/etymology]] can access it.
export.affix_lang_data_module_prefix = "Module:affix/lang-data/"
export.affix_lang_data_module_prefix = "Module:affix/lang-data/"


Line 18: Line 19:
local rfind = m_str_utils.find
local rfind = m_str_utils.find
local rmatch = m_str_utils.match
local rmatch = m_str_utils.match
local pluralize = m_str_utils.pluralize
local pluralize = require(en_utilities_module).pluralize
local u = m_str_utils.char
local u = m_str_utils.char
local ucfirst = m_str_utils.ucfirst
local ucfirst = m_str_utils.ucfirst
local unpack = unpack or table.unpack -- Lua 5.2 compatibility


-- Export this so the category code in [[Module:category tree/poscatboiler/data/terms by etymology]] can access it.
--[==[
Build a lookup table in which every entry of the list `variants` is a key whose value is `canonical`. The list is
walked with ipairs(), so only its sequence part is considered. A new table is returned on every call.
]==]
function export.affix_variants(canonical, variants)
	local variant_to_canonical = {}
	for _, variant_form in ipairs(variants) do
		-- Every variant spelling points at the same canonical affix.
		variant_to_canonical[variant_form] = canonical
	end
	return variant_to_canonical
end
 
--[==[
Build an ID-keyed mapping table. The returned table always has a `default` key holding `default`; if `ids` is given,
each of its key/value pairs is then copied in (so a key named `default` in `ids` replaces the default value).
]==]
function export.id_mapping(default, ids)
	local by_id = {}
	by_id.default = default
	if not ids then
		-- No per-ID targets supplied; only the default entry exists.
		return by_id
	end
	for affix_id, mapped_target in pairs(ids) do
		by_id[affix_id] = mapped_target
	end
	return by_id
end
 
--[==[
Build a table keyed by affix variant. `id_variants` maps an ID to a list of variants; for each variant in each list,
the result holds a fresh table from `export.id_mapping()` whose `default` entry and whose entry for that ID are both
`base`. If the same variant is listed under more than one ID, the entry written last wins.
]==]
function export.id_mapping_with_affix_variants(base, id_variants)
	local by_variant = {}
	for affix_id, variant_list in pairs(id_variants) do
		for _, variant_form in ipairs(variant_list) do
			-- A separate mapping table is created per variant, so entries never share a table.
			local targets_for_id = {[affix_id] = base}
			by_variant[variant_form] = export.id_mapping(base, targets_for_id)
		end
	end
	return by_variant
end
 
--[==[
Shallow-merge any number of tables into a newly created table. Arguments are processed left to right, so a key present
in several arguments takes its value from the last one that has it. Arguments that are nil or false are skipped.
]==]
function export.merge_tables(...)
	local merged = {}
	-- select('#', ...) counts every argument, including nil ones in the middle or at the end.
	local arg_count = select('#', ...)
	for position = 1, arg_count do
		-- Parentheses keep only the argument at `position`, dropping the ones after it.
		local source = (select(position, ...))
		if source then
			for key, value in pairs(source) do
				merged[key] = value
			end
		end
	end
	return merged
end
 
-- Export this so the category code in [[Module:category tree/etymology]] can access it.
export.langs_with_lang_specific_data = {
export.langs_with_lang_specific_data = {
["az"] = true,
["az"] = true,
["fi"] = true,
["fi"] = true,
["fr"] = true,
["izh"] = true,
["izh"] = true,
["la"] = true,
["la"] = true,
["sah"] = true,
["sah"] = true,
["tr"] = true,
["tr"] = true,
["trk-pro"] = true,
}
}


Line 57: Line 102:
===About different types of affixes ("template", "display", "link", "lookup" and "category"):===
===About different types of affixes ("template", "display", "link", "lookup" and "category"):===


* A "template affix" is an affix in its source form as it appears in a template call. Generally, a template affix has
* A "template affix" is an affix in its source form as it appears in a template call. Generally, a template affix has an
   an attached template hyphen (see above) to indicate that it is an affix and indicate what type of affix it is
   attached template hyphen (see above) to indicate that it is an affix and indicate what type of affix it is (prefix,
  (prefix, suffix, interfix/infix or circumfix), but some of the older-style templates such as {{tl|suffix}},
  suffix, interfix or circumfix), but some of the older-style templates such as {{tl|suffix}}, {{tl|prefix}},
  {{tl|prefix}}, {{tl|confix}}, etc. have "positional" affixes where the presence of the affix in a certain position
  {{tl|confix}}, etc. have "positional" affixes where the presence of the affix in a certain position (e.g. the second
  (e.g. the second or third parameter) indicates that it is a certain type of affix, whether or not it has an attached
  or third parameter) indicates that it is a certain type of affix, whether or not it has an attached template hyphen.
  template hyphen.
* A "display affix" is the corresponding affix as it is actually displayed to the user. The display affix may differ
* A "display affix" is the corresponding affix as it is actually displayed to the user. The display affix may differ
   from the template affix for various reasons:
   from the template affix for various reasons:
Line 72: Line 116:
     languages have differences between the "template hyphen" specified in the template (which always needs to be
     languages have differences between the "template hyphen" specified in the template (which always needs to be
specified somehow or other in templates like {{tl|affix}}, to indicate that the term is an affix and what type of
specified somehow or other in templates like {{tl|affix}}, to indicate that the term is an affix and what type of
affix it is) and the display hyphen (see above), with corresponding differences between template and display affixes.
affix it is) and the display hyphen (see above), with corresponding differences between template and display
affixes.
* A (regular) "link affix" is the affix that is linked to when the affix is shown to the user. The link affix is usually
* A (regular) "link affix" is the affix that is linked to when the affix is shown to the user. The link affix is usually
   the same as the display affix, but will differ in one of three circumstances:
   the same as the display affix, but will differ in one of three circumstances:
Line 78: Line 123:
     inline modifiers or piped links, as described above under "display affix".
     inline modifiers or piped links, as described above under "display affix".
   *# For certain languages, certain affixes are mapped to canonical form using language-specific mappings. For example,
   *# For certain languages, certain affixes are mapped to canonical form using language-specific mappings. For example,
in Finnish, the adjective-forming suffix [[-kas]] appears as [[-käs]] after front vowels, but logically both
in Finnish, the adjective-forming suffix {{m|fi|-kas}} appears as {{m|fi|-käs}} after front vowels, but logically
forms are the same suffix and should be linked and categorized the same. Similarly, in Latin, the negative and
both forms are the same suffix and should be linked and categorized the same. Similarly, in Latin, the negative and
intensive prefixes spelled [[in-]] (etymologically two distinct prefixes) appear variously as [[il-]], [[im-]] or
intensive prefixes spelled {{m|la|in-}} (etymologically two distinct prefixes) appear variously as {{m|la|il-}},
[[ir-]] before certain consonants. Mappings are supplied in [[Module:affix/lang-data/LANGCODE]] to convert
{{m|la|im-}} or {{m|la|ir-}} before certain consonants. Mappings are supplied in [[Module:affix/lang-data/LANGCODE]]
Finnish [[-käs]] to [[-kas]] for linking and categorization purposes. Note that the affixes in the mappings use
to convert Finnish {{m|fi|-käs}} to {{m|fi|-kas}} for linking and categorization purposes. Note that the affixes in
"lookup hyphens" to indicate the different types of affixes, which is usually the same as the template hyphen but
the mappings use "lookup hyphens" to indicate the different types of affixes, which is usually the same as the
differs for Arabic scripts, because there are multiple possible template hyphens recognized but only one lookup
template hyphen but differs for Arabic scripts, because there are multiple possible template hyphens recognized but
hyphen (tatweel). The form of the affix as used to look up in the mapping tables is called the "lookup affix";
only one lookup hyphen (tatweel). The form of the affix as used to look up in the mapping tables is called the
see below.
"lookup affix"; see below.
* A "stripped link affix" is a link affix that has been passed through the language's `makeEntryName()` function, which
* A "stripped link affix" is a link affix that has been passed through the language's `stripDiacritics()` function, which
   may strip certain diacritics: e.g. macrons in Latin and Old English (indicating length); acute and grave accents in
   may strip certain diacritics: e.g. macrons in Latin and Old English (indicating length); acute and grave accents in
   Russian and various other Slavic languages (indicating stress); vowel diacritics in most Arabic-script languages; and
   Russian and various other Slavic languages (indicating stress); vowel diacritics in most Arabic-script languages; and
Line 99: Line 144:
link.
link.
   *# If no entry is found, the affix is then looked up in a modified link form (specifically, the modified display
   *# If no entry is found, the affix is then looked up in a modified link form (specifically, the modified display
form passed through the language's `makeEntryName()` function, which strips out certain diacritics, but with the
form passed through the language's `stripDiacritics()` function, which strips out certain diacritics, but with the
lookup hyphen re-added if it was stripped out, as in the case of tatweel in many Arabic-script languages).
lookup hyphen re-added if it was stripped out, as in the case of tatweel in many Arabic-script languages).
   The reason for this double lookup procedure is to allow for mappings that are sensitive to the extra diacritics, but
   The reason for this double lookup procedure is to allow for mappings that are sensitive to the extra diacritics, but
   also allow for mappings that are not sensitive in this fashion (e.g. Russian [[-ливый]] occurs both stressed and
   also allow for mappings that are not sensitive in this fashion (e.g. Russian {{m|ru|-ливый}} occurs both stressed and
   unstressed, but is the same suffix either way).
   unstressed, but is the same suffix either way).
* A "category affix" is the affix as it appears in categories such as [[:Category:Finnish terms suffixed with -kas]].
* A "category affix" is the affix as it appears in categories such as [[:Category:Finnish terms suffixed with -kas|
  The category affix is currently always the same as the stripped link affix. This means that for Arabic-script
  Category:Finnish terms suffixed with ''-kas'']]. The category affix is currently always the same as the stripped link
  languages, it may or may not have a tatweel, even if the corresponding display affix and regular link affix have a
  affix. This means that for Arabic-script languages, it may or may not have a tatweel, even if the corresponding display
  tatweel. As mentioned above, makeEntryName() strips tatweel for Arabic, Persian and Urdu, but not for Ottoman Turkish.
  affix and regular link affix have a tatweel. As mentioned above, stripDiacritics() strips tatweel for Arabic, Persian
  Hence affix categories for Arabic, Persian and Urdu will be missing the tatweel, but affix categories for
  and Urdu, but not for Ottoman Turkish. Hence affix categories for Arabic, Persian and Urdu will be missing the
  Ottoman Turkish will have it. An additional complication is that if the template affix contains a ZWNJ, the display
  tatweel, but affix categories for Ottoman Turkish will have it. An additional complication is that if the template
  (and hence the link and category affixes) will have no hyphen attached in any case.
  affix contains a ZWNJ, the display (and hence the link and category affixes) will have no hyphen attached in any case.
]==]
]==]


Line 137: Line 182:
local ZWNJ = u(0x200C) -- zero-width non-joiner
local ZWNJ = u(0x200C) -- zero-width non-joiner
local template_hyphens = {
local template_hyphens = {
-- This covers all Arabic scripts. See above.
["Arab"] = "ـ" .. ZWNJ .. "-", -- tatweel + zero-width non-joiner + regular hyphen
["Arab"] = "ـ" .. ZWNJ .. "-", -- tatweel + zero-width non-joiner + regular hyphen
["Hebr"] = "־", -- Hebrew-specific hyphen termed "maqqef"
["Hebr"] = "־", -- Hebrew-specific hyphen termed "maqqef"
-- This covers all Arabic scripts. See above.
["Mong"] = "᠊",
["Mong"] = "᠊",
["mnc-Mong"] = "᠊",
["sjo-Mong"] = "᠊",
["xwo-Mong"] = "᠊",
-- FIXME! What about the following right-to-left scripts?
-- FIXME! What about the following right-to-left scripts?
-- Adlm (Adlam)
-- Adlm (Adlam)
Line 224: Line 266:
["Thaa"] = no_display_hyphen,
["Thaa"] = no_display_hyphen,
["Thai"] = no_display_hyphen,
["Thai"] = no_display_hyphen,
["Tibt"] = no_display_hyphen,
}
}


Line 232: Line 275:
local function glossary_link(entry, text)
local function glossary_link(entry, text)
text = text or entry
text = text or entry
return "[[wikt:Appendix:Glossary#" .. entry .. "|" .. text .. "]]"
return "[[Appendix:Glossary#" .. entry .. "|" .. text .. "]]"
end
 
 
-- Record one or more tracking pages under the "affix/" prefix by calling [[Module:debug/track]].
-- `page` is either a single page name (string) or a list of page names. When a list is given, a new prefixed list is
-- built rather than overwriting the caller's table, so a list that is reused across calls is not prefixed twice
-- ("affix/affix/...").
-- NOTE(review): only the sequence part of a list is forwarded; this assumes [[Module:debug/track]] reads nothing else
-- from the table -- confirm.
local function track(page)
	local prefixed
	if type(page) == "table" then
		prefixed = {}
		for i, pg in ipairs(page) do
			prefixed[i] = "affix/" .. pg
		end
	else
		prefixed = "affix/" .. page
	end
	require("Module:debug/track")(prefixed)
end
end


Line 401: Line 456:
for i, cat in ipairs(data.categories) do
for i, cat in ipairs(data.categories) do
if type(cat) == "table" then
if type(cat) == "table" then
data.categories[i] = require(utilities_module).format_categories({lang:getFullName() .. " " .. cat.cat},
data.categories[i] = require(utilities_module).format_categories(lang:getFullName() .. " " .. cat.cat,
lang, cat.sort_key, cat.sort_base, force_cat)
lang, cat.sort_key, cat.sort_base, force_cat)
else
else
data.categories[i] = require(utilities_module).format_categories({lang:getFullName() .. " " .. cat}, lang,
data.categories[i] = require(utilities_module).format_categories(lang:getFullName() .. " " .. cat, lang,
data.data.sort_key, nil, force_cat)
data.data.sort_key, nil, force_cat)
end
end
Line 410: Line 465:
cattext = table.concat(data.categories)
cattext = table.concat(data.categories)
end
end
local result = table.concat(data.parts_formatted, " +‎ ") .. (data.data.lit and ", literally " ..
local result = table.concat(data.parts_formatted, not data.separator_already_added and " +‎ " or nil) ..
m_links.mark(data.data.lit, "gloss") or "")
(data.data.lit and ", literally " .. m_links.mark(data.data.lit, "gloss") or "")
local q = data.data.q
local q = data.data.q
local qq = data.data.qq
local qq = data.data.qq
local l = data.data.l
local l = data.data.l
local ll = data.data.ll
local ll = data.data.ll
if q and q[1] or qq and qq[1] or l and l[1] or ll and ll[1] then
local infl = data.data.infl
if q and q[1] or qq and qq[1] or l and l[1] or ll and ll[1] or infl and infl[1] then
result = require(pron_qualifier_module).format_qualifiers {
result = require(pron_qualifier_module).format_qualifiers {
lang = lang,
lang = lang,
Line 424: Line 480:
l = l,
l = l,
ll = ll,
ll = ll,
infl = infl,
}
}
end
end


return result .. cattext
return result .. cattext
end
--[==[
Older positional-argument entry point for `join_formatted_parts()`. The `lang`, `nocat`, `sort_key`, `lit` and
`force_cat` arguments are packed into the `data` sub-table, and `parts_formatted` and `categories` are passed through
unchanged. FIXME: Convert callers.
]==]
function export.concat_parts(lang, parts_formatted, categories, nocat, sort_key, lit, force_cat)
	-- Settings that join_formatted_parts() reads from its `data` field.
	local overall_data = {
		lang = lang,
		nocat = nocat,
		sort_key = sort_key,
		lit = lit,
		force_cat = force_cat,
	}
	return export.join_formatted_parts {
		data = overall_data,
		parts_formatted = parts_formatted,
		categories = categories,
	}
end
end


Line 461: Line 500:




-- Remove links and call lang:makeEntryName(term).
-- Remove links and call lang:stripDiacritics(term).
local function make_entry_name_no_links(lang, term)
local function strip_diacritics_no_links(lang, term)
-- Double parens because makeEntryName() returns multiple values. Yuck.
return lang:stripDiacritics(m_links.remove_links(term))
return (lang:makeEntryName(m_links.remove_links(term)))
end
end


Line 503: Line 541:
to access information for constructing the categories added by `format_derived()`.
to access information for constructing the categories added by `format_derived()`.
]==]
]==]
function export.link_term(part, data)
function export.link_term(part, data, include_separator)
local result
local result


Line 509: Line 547:
result = require(etymology_module).format_derived {
result = require(etymology_module).format_derived {
lang = data.lang,
lang = data.lang,
terminfo = part,
terms = {part},
sources = {part.lang},
sort_key = data.sort_key,
sort_key = data.sort_key,
nocat = data.nocat,
nocat = data.nocat,
template_name = "affix",
qualifiers_labels_on_outside = true,
borrowing_type = data.borrowing_type,
borrowing_type = data.borrowing_type,
force_cat = data.force_cat or debug_force_cat,
force_cat = data.force_cat or debug_force_cat,
}
}
else
else
-- language (e.g. in a pseudo-loan).
result = m_links.full_link(part, "term", nil, "show qualifiers")
result = m_links.full_link(part, "term_i")
end
end


if part.q and part.q[1] or part.qq and part.qq[1] or part.l and part.l[1] or part.ll and part.ll[1] or
if include_separator and part.separator then
part.refs and part.refs[1] then
return part.separator .. result
result = require(pron_qualifier_module).format_qualifiers {
else
lang = part.lang,
return result
text = result,
q = part.q,
qq = part.qq,
l = part.l,
ll = part.ll,
refs = part.refs,
}
end
end
return result
end
end


Line 612: Line 643:
end
end


if not affix_type then
if affix_type == "non-affix" then
return term
return term
elseif affix_type == "circumfix" then
elseif affix_type == "circumfix" then
Line 689: Line 720:
if mapping then
if mapping then
if type(mapping) == "table" then
if type(mapping) == "table" then
mapping = mapping[affix_id or false]
mapping = mapping[affix_id] or mapping.default or mapping[affix_id or false]
if mapping then
if mapping then
return mapping
return mapping
Line 721: Line 752:
end
end


-- Double parens because makeEntryName() returns multiple values. Yuck.
return do_lookup(affix) or do_lookup(lang:stripDiacritics(affix)) or nil
return do_lookup(affix) or do_lookup((lang:makeEntryName(affix))) or nil
end
end


Line 728: Line 758:
--[==[
--[==[
For a given template term in a given language (see the definition of "template affix" near the top of the file),
For a given template term in a given language (see the definition of "template affix" near the top of the file),
possibly in an explicitly specified script `sc` (but usually nil), return the term's affix type ({"prefix"}, {"infix"},
possibly in an explicitly specified script `sc` (but usually nil), return the term's affix type ({"prefix"},
{"suffix"}, {"circumfix"} or {nil} for non-affix) along with the corresponding link and display affixes (see definitions
{"interfix"}, {"suffix"}, {"circumfix"} or {"non-affix"}) along with the corresponding link and display affixes
near the top of the file); also the corresponding lookup affix (if `return_lookup_affix` is specified). The term passed
(see definitions near the top of the file); also the corresponding lookup affix (if `return_lookup_affix` is specified).
in should already have any fragment (after the # sign) parsed off of it. Four values are returned: `affix_type`,
The term passed in should already have any fragment (after the # sign) parsed off of it. Four values are returned:
`link_term`, `display_term` and `lookup_term`. The affix type can be passed in instead of autodetected (pass in {false}
`affix_type`, `link_term`, `display_term` and `lookup_term`. The affix type can be passed in instead of autodetected; in
if the term is not an affix); in this case, the template term need not have any attached hyphens, and the appropriate
this case, the template term need not have any attached hyphens, and the appropriate hyphens will be added in the
hyphens will be added in the appropriate places. If `do_affix_mapping` is specified, look up the affix in the
appropriate places. If `do_affix_mapping` is specified, look up the affix in the lang-specific affix mappings, as
lang-specific affix mappings, as described in the comment at the top of the file; otherwise, the link and display terms
described in the comment at the top of the file; otherwise, the link and display terms will always be the same. (They
will always be the same. (They will be the same in any case if the template term has a bracketed link in it or is not
will be the same in any case if the template term has a bracketed link in it or is not an affix.) If
an affix.) If `return_lookup_affix` is given, the fourth return value contains the term with appropriate lookup hyphens
`return_lookup_affix` is given, the fourth return value contains the term with appropriate lookup hyphens in the
in the appropriate places; otherwise, it is the same as the display term. (This functionality is used in
appropriate places; otherwise, it is the same as the display term. (This functionality is used in
[[Module:category tree/poscatboiler/data/affixes and compounds]] to convert link affixes into lookup affixes so that
[[Module:category tree/affixes and compounds]] to convert link affixes into lookup affixes so that they can be looked up
they can be looked up in the affix mapping tables.)
in the affix mapping tables.)
]==]
]==]
local function parse_term_for_affixes(term, lang, sc, affix_type, do_affix_mapping, return_lookup_affix, affix_id)
local function parse_term_for_affixes(term, lang, sc, affix_type, do_affix_mapping, return_lookup_affix, affix_id)
if not term then
if not term then
return nil, nil, nil, nil
return "non-affix", nil, nil, nil
end
end


if term == "^" then
-- Indicates a null term to emulate the behavior of {{suffix|foo||bar}}.
term = ""
return "non-affix", term, term, term
end
if term:find("^%^") then
if term:find("^%^") then
-- If term begins with ^, it's not an affix no matter what. Strip off the ^ and return "no affix".
-- HACK! ^ at the beginning of Korean languages has a special meaning, triggering capitalization of the
term = usub(term, 2)
-- transliteration. Don't interpret it as "force non-affix" for those languages.
return nil, term, term, term
local langcode = lang:getCode()
if langcode ~= "ko" and langcode ~= "okm" and langcode ~= "jje" then
-- Formerly we allowed ^ to force non-affix type; this is now handled using an inline modifier
-- <naf>, <root>, etc. Throw an error for the moment when the old way is encountered.
error("Use of ^ to force non-affix status is no longer supported; use an inline modifier <naf> or <root> " ..
"after the component")
end
end
end


Line 763: Line 805:
thyph = "([" .. thyph .. "])"
thyph = "([" .. thyph .. "])"


if affix_type == nil then
if not affix_type then
if rfind(term, thyph .. " " .. thyph) then
if rfind(term, thyph .. " " .. thyph) then
affix_type = "circumfix"
affix_type = "circumfix"
Line 770: Line 812:
local has_ending_hyphen = rfind(term, thyph .. "$")
local has_ending_hyphen = rfind(term, thyph .. "$")
if has_beginning_hyphen and has_ending_hyphen then
if has_beginning_hyphen and has_ending_hyphen then
affix_type = "infix"
affix_type = "interfix"
elseif has_ending_hyphen then
elseif has_ending_hyphen then
affix_type = "prefix"
affix_type = "prefix"
elseif has_beginning_hyphen then
elseif has_beginning_hyphen then
affix_type = "suffix"
affix_type = "suffix"
else
affix_type = "non-affix"
end
end
end
end
Line 780: Line 824:


local link_term, display_term, lookup_term
local link_term, display_term, lookup_term
if affix_type then
if affix_type == "non-affix" then
link_term = term
display_term = term
lookup_term = term
else
display_term = reconstruct_term_per_hyphens(term, affix_type, scode, thyph, dhyph)
display_term = reconstruct_term_per_hyphens(term, affix_type, scode, thyph, dhyph)
if do_affix_mapping then
if do_affix_mapping then
Line 800: Line 848:
lookup_term = display_term
lookup_term = display_term
end
end
else
link_term = term
display_term = term
lookup_term = term
end
end


Line 819: Line 863:
is of the wrong type). Three values are returned: the link term, display term and lookup term. This function is a thin
is of the wrong type). Three values are returned: the link term, display term and lookup term. This function is a thin
wrapper around `parse_term_for_affixes`; see the comments above that function for more information. Note that this
wrapper around `parse_term_for_affixes`; see the comments above that function for more information. Note that this
function is exposed externally because it is called by [[Module:category tree/poscatboiler/data/affixes and compounds]];
function is exposed externally because it is called by [[Module:category tree/affixes and compounds]]; see the comment
see the comment in `parse_term_for_affixes` for more information.
in `parse_term_for_affixes` for more information.
]==]
]==]
function export.make_affix(term, lang, sc, affix_type, do_affix_mapping, return_lookup_affix, affix_id)
function export.make_affix(term, lang, sc, affix_type, do_affix_mapping, return_lookup_affix, affix_id)
if not (affix_type == "prefix" or affix_type == "suffix" or affix_type == "circumfix" or affix_type == "infix" or
if not (affix_type == "prefix" or affix_type == "suffix" or affix_type == "circumfix" or affix_type == "infix" or
affix_type == "interfix") then
affix_type == "interfix" or affix_type == "non-affix") then
error("Internal error: Invalid affix type " .. (affix_type or "(nil)"))
error("Internal error: Invalid affix type " .. (affix_type or "(nil)"))
end
end
Line 839: Line 883:


--[==[
--[==[
Implementation of {{tl|affix}} and {{tl|surface analysis}}. `data` contains all the information describing the affixes to
Core categorization logic for affixes. This is shared between show_affix(), show_compound_like() and
be displayed, and contains the following:
get_affix_categories_only(). Returns the categories array and other metadata needed for formatting.
 
* `.lang` ('''required'''): Overall language object. Different from term-specific language objects (see `.parts` below).
* `.sc`: Overall script object (usually omitted). Different from term-specific script objects.
* `.parts` ('''required'''): List of objects describing the affixes to show. The general format of each object is as would
          be passed to `full_link()`, except that the `.lang` field should be missing unless the term is of a language
  different from the overall `.lang` value (in such a case, the language name is shown along with the term and
  an additional "derived from" category is added). '''WARNING''': The data in `.parts` will be destructively
  modified.
* `.pos`: Overall part of speech (used in categories, defaults to {"terms"}). Different from term-specific part of speech.
* `.sort_key`: Overall sort key. Normally omitted except e.g. in Japanese.
* `.type`: Type of compound, if the parts in `.parts` describe a compound. Strictly optional, and if supplied, the
  compound type is displayed before the parts (normally capitalized, unless `.nocap` is given).
* `.nocap`: Don't capitalize the first letter of text displayed before the parts (relevant only if `.type` or
    `.surface_analysis` is given).
* `.notext`: Don't display any text before the parts (relevant only if `.type` or `.surface_analysis` is given).
* `.nocat`: Disable all categorization.
* `.lit`: Overall literal definition. Different from term-specific literal definitions.
* `.force_cat`: Always display categories, even on userspace pages.
* `.surface_analysis`: Implement {{surface analysis}}; adds `By surface analysis, ` before the parts.
 
'''WARNING''': This destructively modifies both `data` and the individual structures within `.parts`.
]==]
]==]
function export.show_affix(data)
local function generate_affix_categories(data)
data.pos = data.pos or default_pos
data.pos = data.pos or default_pos
data.pos = pluralize(data.pos)
data.pos = pluralize(data.pos)
Line 872: Line 895:


-- Process each part
-- Process each part
local parts_formatted = {}
local whole_words = 0
local whole_words = 0
local is_affix_or_compound = false
local is_affix_or_compound = false
Line 882: Line 904:
data.parts[i] = part
data.parts[i] = part
canonicalize_part(part, data.lang, data.sc)
canonicalize_part(part, data.lang, data.sc)
-- Determine affix type and get link and display terms (see text at top of file). Store them in the part
-- Determine affix type and get link and display terms (see text at top of file). Store them in the part
-- (in fields that won't clash with fields used by full_link() in [[Module:links]] or link_term()), so they
-- (in fields that won't clash with fields used by full_link() in [[Module:links]] or link_term()), so they
-- can be used in the loop below when categorizing.
-- can be used in the loop below when categorizing.
part.affix_type, part.affix_link_term, part.affix_display_term = parse_term_for_affixes(part.term,
part.affix_type, part.affix_link_term, part.affix_display_term = parse_term_for_affixes(part.term,
part.lang, part.sc, nil, not part.alt, nil, part.id)
part.lang, part.sc, part.type, not part.alt, nil, part.id)


-- If link_term is an empty string, either a bare ^ was specified or an empty term was used along with inline
-- If link_term is an empty string, either a bare ^ was specified or an empty term was used along with inline
Line 895: Line 916:
-- redundant alt text.
-- redundant alt text.
part.alt = part.alt or (part.affix_display_term ~= part.affix_link_term and part.affix_display_term) or nil
part.alt = part.alt or (part.affix_display_term ~= part.affix_link_term and part.affix_display_term) or nil
-- Make a link for the part.
table.insert(parts_formatted, export.link_term(part, data))
end
end


-- Now do categorization.
if not data.noaffixcat then
for i, part in ipairs_with_gaps(data.parts) do
-- Now do categorization.
local affix_type = part.affix_type
for i, part in ipairs_with_gaps(data.parts) do
if affix_type then
local affix_type = part.affix_type
is_affix_or_compound = true
if affix_type ~= "non-affix" then
-- We cannot distinguish interfixes from infixes by appearance. Prefer interfixes; infixes will need to
is_affix_or_compound = true
-- use {{infix}}.
if affix_type == "infix" then affix_type = "interfix" end
-- Make a sort key. For the first part, use the second part as the sort key; the intention is that if the
 
-- term has a prefix, sorting by the prefix won't be very useful so we sort by what follows, which is
-- Make a sort key. For the first part, use the second part as the sort key; the intention is that if the
-- presumably the root.
-- term has a prefix, sorting by the prefix won't be very useful so we sort by what follows, which is
local part_sort_base = nil
-- presumably the root.
local part_sort = part.sort or data.sort_key
local part_sort_base = nil
local part_sort = part.sort or data.sort_key
if i == 1 and data.parts[2] and data.parts[2].term then
 
local part2 = data.parts[2]
if i == 1 and data.parts[2] and data.parts[2].term then
-- If the second-part link term is empty, the user requested an unlinked term; avoid a wikitext error
local part2 = data.parts[2]
-- by using the alt value if available.
-- If the second-part link term is empty, the user requested an unlinked term; avoid a wikitext error
part_sort_base = ine(part2.affix_link_term) or ine(part2.alt)
-- by using the alt value if available.
if part_sort_base then
part_sort_base = ine(part2.affix_link_term) or ine(part2.alt)
part_sort_base = strip_diacritics_no_links(part2.lang, part_sort_base)
if part_sort_base then
end
part_sort_base = make_entry_name_no_links(part2.lang, part_sort_base)
end
if part.pos and rfind(part.pos, "patronym") then
table.insert(categories, {cat = "patronymics", sort_key = part_sort, sort_base = part_sort_base})
end
if data.pos ~= "terms" and part.pos and rfind(part.pos, "diminutive") then
table.insert(categories, {cat = "diminutive " .. data.pos, sort_key = part_sort,
sort_base = part_sort_base})
end
-- Don't add a '*fixed with' category if the link term is empty or is in a different language.
if ine(part.affix_link_term) and not part.part_lang then
table.insert(categories, {cat = data.pos .. " " .. affix_type .. "ed with " ..
strip_diacritics_no_links(part.lang, part.affix_link_term) ..
(part.id and " (" .. part.id .. ")" or ""),
sort_key = part_sort, sort_base = part_sort_base})
end
else
whole_words = whole_words + 1
if whole_words == 2 then
is_affix_or_compound = true
table.insert(categories, "compound " .. data.pos)
end
end
end
end
end
-- Make sure there was either an affix or a compound (two or more non-affix terms).
if not is_affix_or_compound and not data.allow_no_affixes_or_compounds then
error("The parameters did not include any affixes, and the term is not a compound. Please provide at least one affix.")
end
end


if part.pos and rfind(part.pos, "patronym") then
return text_sections, categories, borrowing_type
table.insert(categories, {cat = "patronymics", sort_key = part_sort, sort_base = part_sort_base})
end
end


if data.pos ~= "terms" and part.pos and rfind(part.pos, "diminutive") then
--[==[
table.insert(categories, {cat = "diminutive " .. data.pos, sort_key = part_sort,
Implementation of {{tl|affix}} and {{tl|surface analysis}}. `data` contains all the information describing the affixes to
sort_base = part_sort_base})
be displayed, and contains the following:
end


-- Don't add a '*fixed with' category if the link term is empty or is in a different language.
* `.lang` ('''required'''): Overall language object. Different from term-specific language objects (see `.parts` below).
if ine(part.affix_link_term) and not part.part_lang then
* `.sc`: Overall script object (usually omitted). Different from term-specific script objects.
table.insert(categories, {cat = data.pos .. " " .. affix_type .. "ed with " ..
* `.parts` ('''required'''): List of objects describing the affixes to show. The general format of each object is as would
make_entry_name_no_links(part.lang, part.affix_link_term) ..
          be passed to `full_link()`, except that the `.lang` field should be missing unless the term is of a language
(part.id and " (" .. part.id .. ")" or ""),
  different from the overall `.lang` value (in such a case, the language name is shown along with the term and
sort_key = part_sort, sort_base = part_sort_base})
  an additional "derived from" category is added). '''WARNING''': The data in `.parts` will be destructively
end
  modified.
else
* `.pos`: Overall part of speech (used in categories, defaults to {"terms"}). Different from term-specific part of speech.
whole_words = whole_words + 1
* `.sort_key`: Overall sort key. Normally omitted except e.g. in Japanese.
* `.type`: Type of compound, if the parts in `.parts` describe a compound. Strictly optional, and if supplied, the
  compound type is displayed before the parts (normally capitalized, unless `.nocap` is given).
* `.nocap`: Don't capitalize the first letter of text displayed before the parts (relevant only if `.type` or
    `.surface_analysis` is given).
* `.notext`: Don't display any text before the parts (relevant only if `.type` or `.surface_analysis` is given).
* `.nocat`: Disable all categorization.
* `.noaffixcat`: Disable affix (and compound) categorization. Relevant for e.g. blends, which may otherwise
                be incorrectly categorized as compound terms.
* `.lit`: Overall literal definition. Different from term-specific literal definitions.
* `.force_cat`: Always display categories, even on userspace pages.
* `.surface_analysis`: Implement {{surface analysis}}; adds `By surface analysis, ` before the parts.


if whole_words == 2 then
'''WARNING''': This destructively modifies both `data` and the individual structures within `.parts`.
is_affix_or_compound = true
]==]
table.insert(categories, "compound " .. data.pos)
function export.show_affix(data)
end
local text_sections, categories, borrowing_type = generate_affix_categories(data)
end
end


-- Make sure there was either an affix or a compound (two or more regular terms).
-- Process each part for display
if not is_affix_or_compound then
local parts_formatted = {}
error("The parameters did not include any affixes, and the term is not a compound. Please provide at least one affix.")
for i, part in ipairs_with_gaps(data.parts) do
-- Make a link for the part
table.insert(parts_formatted, export.link_term(part, data, "include_separator"))
end
end


Line 961: Line 1,018:
text = ucfirst(text)
text = ucfirst(text)
end
end
table.insert(text_sections, 1, text)
table.insert(text_sections, 1, text)
end
end


table.insert(text_sections, export.join_formatted_parts { data = data, parts_formatted = parts_formatted,
table.insert(text_sections, export.join_formatted_parts { data = data, parts_formatted = parts_formatted,
categories = categories })
categories = categories, separator_already_added = true })
return table.concat(text_sections)
return table.concat(text_sections)
end
--[==[
Return just the categories computed by `generate_affix_categories()` for `data`, with no text output or formatting.
The other values returned by `generate_affix_categories()` are discarded. Per the original note, this is used by
Module:etymon to get affix categorization.

The result is an array of category objects, where each entry is either a string (simple category name) or a table with
keys `cat`, `sort_key` and `sort_base` for more complex categorization.

`data` should have the same structure as passed to `show_affix()`:
* `.lang` (required): Overall language object.
* `.parts` (required): Array of affix part objects with `.term`, `.lang`, `.id`, etc.
* `.pos`: Part of speech (defaults to "terms").
* `.sort_key`: Overall sort key for categories.

'''WARNING''': This destructively modifies both `data` and the individual structures within `.parts`.
]==]
function export.get_affix_categories_only(data)
	-- Only the second return value (the category list) is of interest here.
	local _, affix_categories = generate_affix_categories(data)
	return affix_categories
end
end


Line 973: Line 1,049:
function export.show_surface_analysis(data)
function export.show_surface_analysis(data)
data.surface_analysis = true
data.surface_analysis = true
data.allow_no_affixes_or_compounds = true
return export.show_affix(data)
return export.show_affix(data)
end
end
Line 983: Line 1,060:
]==]
]==]
function export.show_compound(data)
function export.show_compound(data)
local data_for_cats = m_table.shallowCopy(data)
data_for_cats.parts = {}
for k, part in pairs(data.parts) do
data_for_cats.parts[k] = m_table.shallowCopy(part)
end
data_for_cats.allow_no_affixes_or_compounds = true
local categories = export.get_affix_categories_only(data_for_cats)
data.pos = data.pos or default_pos
data.pos = data.pos or default_pos
data.pos = pluralize(data.pos)
data.pos = pluralize(data.pos)
 
local text_sections, categories, borrowing_type =
local text_sections, _, borrowing_type =
process_etymology_type(data.type, data.nocap, data.notext, #data.parts > 0)
process_etymology_type(data.type, data.nocap, data.notext, #data.parts > 0)
data.borrowing_type = borrowing_type
data.borrowing_type = borrowing_type
 
local parts_formatted = {}
local parts_formatted = {}
table.insert(categories, "compound " .. data.pos)
table.insert(categories, "compound " .. data.pos)
Line 999: Line 1,084:
-- Determine affix type and get link and display terms (see text at top of file).
-- Determine affix type and get link and display terms (see text at top of file).
local affix_type, link_term, display_term = parse_term_for_affixes(part.term, part.lang, part.sc,
local affix_type, link_term, display_term = parse_term_for_affixes(part.term, part.lang, part.sc,
nil, not part.alt, nil, part.id)
part.type, not part.alt, nil, part.id)


-- If the term is an infix, recognize it as such (which means e.g. that we will display the term without
-- If the term is an interfix or the type was explicitly given, recognize it as such (which means e.g. that we
-- hyphens for East Asian languages). Otherwise, ignore the fact that it looks like an affix and display as
-- will display the term without hyphens for East Asian languages). Otherwise, ignore the fact that it looks
-- specified in the template (but pay attention to the detected affix type for certain tracking purposes).
-- like an affix and display as specified in the template (but pay attention to the detected affix type for
if affix_type == "infix" then
-- certain tracking purposes).
if affix_type == "interfix" or (part.type and part.type ~= "non-affix") then
-- If link_term is an empty string, either a bare ^ was specified or an empty term was used along with
-- If link_term is an empty string, either a bare ^ was specified or an empty term was used along with
-- inline modifiers. The intention in either case is not to link the term. Don't add a '*fixed with'
-- inline modifiers. The intention in either case is not to link the term. Don't add a '*fixed with'
Line 1,011: Line 1,097:
-- redundant alt text.
-- redundant alt text.
if link_term and link_term ~= "" and not part.part_lang then
if link_term and link_term ~= "" and not part.part_lang then
table.insert(categories, {cat = data.pos .. " interfixed with " .. make_entry_name_no_links(part.lang,
table.insert(categories, {cat = data.pos .. " " .. affix_type .. "ed with " ..
link_term), sort_key = part.sort or data.sort_key})
strip_diacritics_no_links(part.lang, link_term), sort_key = part.sort or data.sort_key})
end
end
part.term = link_term ~= "" and link_term or nil
part.term = link_term ~= "" and link_term or nil
part.alt = part.alt or (display_term ~= link_term and display_term) or nil
part.alt = part.alt or (display_term ~= link_term and display_term) or nil
else
else
if affix_type then
if affix_type ~= "non-affix" then
local langcode = data.lang:getCode()
local langcode = data.lang:getCode()
-- If `data.lang` is an etymology-only language, track both using its code and its full parent's code.
track { affix_type, affix_type .. "/lang/" .. langcode }
local full_langcode = data.lang:getFullCode()
local full_langcode = data.lang:getFullCode()
if langcode ~= full_langcode then
track(affix_type .. "/lang/" .. full_langcode)
end
else
else
whole_words = whole_words + 1
whole_words = whole_words + 1
end
end
end
end
table.insert(parts_formatted, export.link_term(part, data))
table.insert(parts_formatted, export.link_term(part, data, "include_separator"))
end
 
if whole_words == 1 then
track("one whole word")
elseif whole_words == 0 then
track("looks like confix")
end
end


table.insert(text_sections, export.join_formatted_parts { data = data, parts_formatted = parts_formatted,
table.insert(text_sections, export.join_formatted_parts { data = data, parts_formatted = parts_formatted,
categories = categories })
categories = categories, separator_already_added = true })
return table.concat(text_sections)
return table.concat(text_sections)
end
end
Line 1,039: Line 1,136:
]==]
]==]
function export.show_compound_like(data)
function export.show_compound_like(data)
local parts_formatted = {}
data.allow_no_affixes_or_compounds = true
local categories = {}
local text_sections, categories, borrowing_type = generate_affix_categories(data)


if data.cat then
if data.cat then
Line 1,046: Line 1,143:
end
end


-- Make links out of all the parts
-- Process each part for display
for i, part in ipairs(data.parts) do
local parts_formatted = {}
canonicalize_part(part, data.lang, data.sc)
for i, part in ipairs_with_gaps(data.parts) do
table.insert(parts_formatted, export.link_term(part, data))
-- Make a link for the part
table.insert(parts_formatted, export.link_term(part, data, "include_separator"))
end
end


local text_sections = {}
if #data.parts > 0 and data.oftext then
table.insert(text_sections, 1, " " .. data.oftext .. " ")
end
if data.text then
if data.text then
table.insert(text_sections, data.text)
table.insert(text_sections, 1, data.text)
end
if #data.parts > 0 and data.oftext then
table.insert(text_sections, " ")
table.insert(text_sections, data.oftext)
table.insert(text_sections, " ")
end
end
table.insert(text_sections, export.join_formatted_parts { data = data, parts_formatted = parts_formatted,
table.insert(text_sections, export.join_formatted_parts { data = data, parts_formatted = parts_formatted,
categories = categories })
categories = categories, separator_already_added = true })
return table.concat(text_sections)
return table.concat(text_sections)
end
end
Line 1,098: Line 1,194:
part.ts = export.make_affix(part.ts, part.lang, Latn, affix_type)
part.ts = export.make_affix(part.ts, part.lang, Latn, affix_type)
end
end
-- Emit tracking entries when the affix type detected for `part` differs from the type the calling template expects.
--
-- `template` is the tracking-key prefix (callers pass names such as "circumfix" or "prefix"); `part` is the part
-- object to check (may be nil); `expected_affix_type` is the expected type, or nil when the part is expected to be a
-- base term (tracked under the name "base"). Nothing is tracked when `part` is nil or already has a `.type` set.
local function track_wrong_affix_type(template, part, expected_affix_type)
	-- Skip missing parts and parts that already carry a `.type`.
	if not part or part.type then
		return
	end
	-- Only the first return value (the detected affix type) matters here.
	local detected_type = parse_term_for_affixes(part.term, part.lang, part.sc)
	if detected_type == expected_affix_type then
		return
	end

	local slot_name = expected_affix_type or "base"
	local lang_code = part.lang:getCode()
	local full_lang_code = part.lang:getFullCode()
	local debug_track = require("Module:debug/track")

	-- Build the keys incrementally, from least to most specific.
	local slot_key = template .. "/" .. slot_name
	local type_key = slot_key .. "/" .. (detected_type or "none")
	debug_track {
		template,
		slot_key,
		type_key,
		type_key .. "/lang/" .. lang_code
	}
	-- If `part.lang` is an etymology-only language, track both using its code and its full parent's code.
	if full_lang_code ~= lang_code then
		debug_track(type_key .. "/lang/" .. full_lang_code)
	end
end


local function insert_affix_category(categories, pos, affix_type, part, sort_key, sort_base)
local function insert_affix_category(categories, pos, affix_type, part, sort_key, sort_base)
-- Don't add a '*fixed with' category if the link term is empty or is in a different language.
-- Don't add a '*fixed with' category if the link term is empty or is in a different language.
if part.term and not part.part_lang then
if part.term and not part.part_lang then
local cat = pos .. " " .. affix_type .. "ed with " .. make_entry_name_no_links(part.lang, part.term) ..
local cat = pos .. " " .. affix_type .. "ed with " .. strip_diacritics_no_links(part.lang, part.term) ..
(part.id and " (" .. part.id .. ")" or "")
(part.id and " (" .. part.id .. ")" or "")
if sort_key or sort_base then
if sort_key or sort_base then
Line 1,126: Line 1,247:
make_part_into_affix(data.prefix, data.lang, data.sc, "prefix")
make_part_into_affix(data.prefix, data.lang, data.sc, "prefix")
make_part_into_affix(data.suffix, data.lang, data.sc, "suffix")
make_part_into_affix(data.suffix, data.lang, data.sc, "suffix")
track_wrong_affix_type("circumfix", data.prefix, "prefix")
track_wrong_affix_type("circumfix", data.base, nil)
track_wrong_affix_type("circumfix", data.suffix, "suffix")


-- Create circumfix term.
-- Create circumfix term.
Line 1,143: Line 1,268:
local sort_base
local sort_base
if data.base.term then
if data.base.term then
sort_base = make_entry_name_no_links(data.base.lang, data.base.term)
sort_base = strip_diacritics_no_links(data.base.lang, data.base.term)
end
end


Line 1,152: Line 1,277:
-- Insert the categories, but don't add a '*fixed with' category if the link term is in a different language.
-- Insert the categories, but don't add a '*fixed with' category if the link term is in a different language.
if not data.prefix.part_lang then
if not data.prefix.part_lang then
table.insert(categories, {cat=data.pos .. " circumfixed with " .. make_entry_name_no_links(data.prefix.lang,
table.insert(categories, {cat=data.pos .. " circumfixed with " .. strip_diacritics_no_links(data.prefix.lang,
circumfix), sort_key=data.sort_key, sort_base=sort_base})
circumfix), sort_key=data.sort_key, sort_base=sort_base})
end
end
Line 1,173: Line 1,298:
make_part_into_affix(data.prefix, data.lang, data.sc, "prefix")
make_part_into_affix(data.prefix, data.lang, data.sc, "prefix")
make_part_into_affix(data.suffix, data.lang, data.sc, "suffix")
make_part_into_affix(data.suffix, data.lang, data.sc, "suffix")
track_wrong_affix_type("confix", data.prefix, "prefix")
track_wrong_affix_type("confix", data.base, nil)
track_wrong_affix_type("confix", data.suffix, "suffix")


-- Make links out of all the parts.
-- Make links out of all the parts.
Line 1,178: Line 1,307:
local prefix_sort_base
local prefix_sort_base
if data.base and data.base.term then
if data.base and data.base.term then
prefix_sort_base = make_entry_name_no_links(data.base.lang, data.base.term)
prefix_sort_base = strip_diacritics_no_links(data.base.lang, data.base.term)
elseif data.suffix.term then
elseif data.suffix.term then
prefix_sort_base = make_entry_name_no_links(data.suffix.lang, data.suffix.term)
prefix_sort_base = strip_diacritics_no_links(data.suffix.lang, data.suffix.term)
end
end


Line 1,213: Line 1,342:
-- Hyphenate the affixes and apply any affix mappings.
-- Hyphenate the affixes and apply any affix mappings.
make_part_into_affix(data.infix, data.lang, data.sc, "infix")
make_part_into_affix(data.infix, data.lang, data.sc, "infix")
track_wrong_affix_type("infix", data.base, nil)
track_wrong_affix_type("infix", data.infix, "infix")


-- Make links out of all the parts.
-- Make links out of all the parts.
Line 1,243: Line 1,375:
make_part_into_affix(prefix, data.lang, data.sc, "prefix")
make_part_into_affix(prefix, data.lang, data.sc, "prefix")
end
end
for i, prefix in ipairs(data.prefixes) do
track_wrong_affix_type("prefix", prefix, "prefix")
end
track_wrong_affix_type("prefix", data.base, nil)


-- Make links out of all the parts.
-- Make links out of all the parts.
Line 1,252: Line 1,390:
first_sort_base = ine(data.prefixes[2].term) or ine(data.prefixes[2].alt)
first_sort_base = ine(data.prefixes[2].term) or ine(data.prefixes[2].alt)
if first_sort_base then
if first_sort_base then
first_sort_base = make_entry_name_no_links(data.prefixes[2].lang, first_sort_base)
first_sort_base = strip_diacritics_no_links(data.prefixes[2].lang, first_sort_base)
end
end
elseif data.base then
elseif data.base then
first_sort_base = ine(data.base.term) or ine(data.base.alt)
first_sort_base = ine(data.base.term) or ine(data.base.alt)
if first_sort_base then
if first_sort_base then
first_sort_base = make_entry_name_no_links(data.base.lang, first_sort_base)
first_sort_base = strip_diacritics_no_links(data.base.lang, first_sort_base)
end
end
end
end
Line 1,291: Line 1,429:
for i, suffix in ipairs(data.suffixes) do
for i, suffix in ipairs(data.suffixes) do
make_part_into_affix(suffix, data.lang, data.sc, "suffix")
make_part_into_affix(suffix, data.lang, data.sc, "suffix")
end
track_wrong_affix_type("suffix", data.base, nil)
for i, suffix in ipairs(data.suffixes) do
track_wrong_affix_type("suffix", suffix, "suffix")
end
end