Module:usex: Difference between revisions

From Linguifex
Jump to navigation Jump to search
No edit summary
No edit summary
 
(One intermediate revision by the same user not shown)
Line 1: Line 1:
local export = {}
local export = {}
local debug_track_module = "Module:debug/track"
local links_module = "Module:links"
local scripts_module = "Module:scripts"
local script_utilities_module = "Module:script utilities"
local string_utilities_module = "Module:string utilities"
local usex_data_module = "Module:usex/data"
local m_str_utils = require(string_utilities_module)
local rsubn = m_str_utils.gsub
local rsplit = m_str_utils.split
local rfind = m_str_utils.find
local uupper = m_str_utils.upper
local ulen = m_str_utils.len
local u = m_str_utils.char


local translit_data = mw.loadData("Module:transliteration/data")
local translit_data = mw.loadData("Module:transliteration/data")
local needs_translit = translit_data[1]
local needs_translit = translit_data[1]
local BRACKET_SUB = u(0xFFF0)
local original_text = "<small>''original:''</small> "
-- 100 more or less corresponds to the setting of 30 for the example text alone as formerly used in
-- {{hi-x}} and {{ur-x}}, taking into account transliteration, gloss and formatting characters.
-- FIXME: We should have different widths for desktop vs. mobile and generate the appropriate CSS so
-- both are handled correctly.
local MAX_INLINE_WIDTH = 100 -- In characters. HACK! FIXME! Do this a better way.
-- List of scripts whose characters are double-width/full-width.
local double_width_scripts = {"Hani", "Hrkt", "Hang"}


-- microformat2 classes, see https://phabricator.wikimedia.org/T138709
-- microformat2 classes, see https://phabricator.wikimedia.org/T138709
local class = {
local css_classes = {
container_ux = 'h-usage-example',
container_ux = 'h-usage-example',
container_quotation = 'h-quotation',
container_quotation = 'h-quotation',
Line 15: Line 42:
-- transliteration = 'e-transliteration',
-- transliteration = 'e-transliteration',
-- transcription = 'e-transcription',
-- transcription = 'e-transcription',
normalization = 'e-normalization',
literally = 'e-literally',
literally = 'e-literally',
qualifier = 'e-qualifier',
source = 'e-source',
source = 'e-source',
footer = 'e-footer'
footer = 'e-footer'
}
}


-- helper functions
-- helper functions
local function track(page)
require(debug_track_module)("usex/" .. page)
return true
end
-- version of rsubn() that discards all but the first return value
local function rsub(term, foo, bar)
local retval = rsubn(term, foo, bar)
return retval
end


local function wrap(tag, class, text, lang)
local function wrap(tag, class, text, lang)
Line 39: Line 82:
local function div(class, text) return wrap('div', class, text) end
local function div(class, text) return wrap('div', class, text) end


function export.format_usex(data)
-- Remove any HTML from the formatted text and resolve links, since the extra characters don't contribute to the
local namespace = mw.title.getCurrentTitle().nsText
-- displayed length.
local function convert_to_raw_text(text)
text = rsub(text, "<.->", "")
if text:find("%[%[") then
text = require(links_module).remove_links(text)
end
return text
end


local lang, sc, usex, translation, transliteration, transcription, noenum,
local function get_character_width(text)
inline, ref, quote, lit, substs, qualifiers, source, nocat, brackets, footer,
local charsets = {}
sortkey, added_class =
for _, script in ipairs(double_width_scripts) do
data.lang, data.sc, data.usex, data.translation, data.transliteration,
table.insert(charsets, require(scripts_module).getByCode(script):getCharacters())
data.transcription, data.noenum, data.inline, data.ref, data.quote,
end
data.lit, data.substs, data.qualifiers, data.source, data.nocat,
local single_width_chars = ulen(rsub(text, "[" .. table.concat(charsets) .. "]", ""))
data.brackets, data.footer, data.sortkey, data.class
local total_chars = ulen(text)
local double_width_chars = total_chars - single_width_chars
return single_width_chars + 2 * double_width_chars
end


--[[
--[==[
if lang:getType() == "reconstructed" or namespace == "Reconstruction" then
Apply the substitutions in `subst` (from the {{para|subst}} parameter or similar) to the example or quotation in
error("Reconstructed languages and reconstructed terms cannot have usage examples, as we have no record of their use.")
`usex` after removing links, returning the resulting text. `track`, if supplied, is a function of one argument that is
used to insert tracking categories: one for any call to this function, another if a single / is used in the `subst`
argument.
]==]
function export.apply_subst(usex, subst, track)
local subbed_usex = require(links_module).remove_links(usex)
local function do_track(page)
if track then
track(page)
end
return true
end
end
]]
 
if subst then
if lit then
-- [[Special:WhatLinksHere/Wiktionary:Tracking/usex/subst]]
lit = "(literally, “" .. span(class.literally, lit) .. ")"
do_track("subst")
subst = rsplit(subst, ",")
for _, subpair in ipairs(subst) do
-- [[Special:WhatLinksHere/Wiktionary:Tracking/usex/subst-single-slash]]
local subsplit = rsplit(subpair, rfind(subpair, "//") and "//" or do_track("subst-single-slash") and "/")
subbed_usex = rsub(subbed_usex, subsplit[1], subsplit[2])
end
end
end


if source then
return subbed_usex
source = "(" .. span(class.source, source) .. ")"
end
end
 
--[=[
Process parameters for usex text (either the primary text or the original text) and associated annotations. On input,
the following fields are recognized in `data` (all are optional except as marked):
 
* `lang`: Language object of text; may be an etymology language (REQUIRED).
* `termlang`: The language object of the term being illustrated, which may be different from the language of the main
  quotation text and should always be based off of the main text, not the original text. Used for
  categories. May be an etymology language (REQUIRED).
* `usex`: Text of usex/quotation.
* `sc`: Script object of text.
* `tr`: Manual transliteration.
* `ts`: Transcription.
* `norm`: Normalized version of text.
* `normsc`: Script object of normalized version of text, or "auto".
* `subst`: String of substitutions for transliteration purposes.
* `quote`: If non-nil, this is a quotation (using {{tl|quote}} or {{tl|quote-*}}) instead of a usage example (using
  {{tl|usex}}). If it has the specific value "quote-meta", this is a quotation with citation (invoked from
  {{tl|quote-*}}). This controls the CSS class used to display the quotation, as well as the face used to tag the usex
  (which in turn results in the usex being upright text if a quotation, and italic text if a usage example).
* `title`: Title object of the current page (REQUIRED).
* `q`: List of left qualifiers.
* `qq`: List of right qualifiers.
* `ref`: String to display directly after any right qualifier, with no space. (FIXME: Should be converted into
        an actual ref.)
* `nocat`: Overall `data.nocat` value.
* `categories`: List to insert categories into (REQUIRED).
* `example_type`: Either "quotation" (if `quote` specified) or "usage example" (otherwise) (REQUIRED).
 
On output, return an object with four fields:
* `usex`: Formatted usex, including qualifiers attached to both sides and `ref` attached to the right. Always specified.
* `tr`: Formatted transliteration; may be nil.
* `ts`: Formatted transcription; may be nil.
* `norm`: Formatted normalized version of usex; may be nil.
]=]
local function process_usex_text(data)
local lang = data.lang
local termlang = data.termlang
local usex = data.usex
local sc = data.sc
local tr = data.tr
local ts = data.ts
local norm = data.norm
local normsc = data.normsc
local subst = data.subst
local quote = data.quote
local leftq = data.q
local rightq = data.qq
local ref = data.ref
local nocat = data.nocat
local categories = data.categories
local example_type = data.example_type
local title = data.title


if footer then
if normsc == "auto" then
footer = span(class.footer, footer)
normsc = nil
elseif not normsc then
normsc = sc
end
end
local example_type = quote and "quote" or "usage example" -- used in error messages
local categories = {}


if not sc then
if not sc then
sc = require("Module:scripts").findBestScript(usex, lang)
sc = lang:findBestScript(usex)
end
if not normsc and norm then
normsc = lang:findBestScript(norm)
end
end
local langcode = lang:getFullCode()


-- tr=- means omit transliteration altogether
-- tr=- means omit transliteration altogether
if transliteration == "-" then
if tr == "-" then
transliteration = nil
tr = nil
else
else
-- Try to auto-transliterate
-- Try to auto-transliterate.
if not transliteration and usex then
if not tr then
local subbed_usex = require("Module:links").remove_links(usex)
-- First, try transliterating the normalization, if supplied.
 
if norm and normsc and not normsc:getCode():find("Lat") then -- Latn, Latf, Latg, pjt-Latn
if substs then
local subbed_norm = export.apply_subst(norm, subst, track)
tr = (lang:transliterate(subbed_norm, normsc))
local substs = mw.text.split(substs, ",")
end
for _, subpair in ipairs(substs) do
-- If no normalization, or the normalization is in a Latin script, or the transliteration of the
local subsplit = mw.text.split(subpair, mw.ustring.find(subpair, "//") and "//" or "/")
-- normalization failed, fall back to transliterating the usex.
subbed_usex = mw.ustring.gsub(subbed_usex, subsplit[1], subsplit[2])
if not tr then
end
local subbed_usex = export.apply_subst(usex, subst, track)
tr = (lang:transliterate(subbed_usex, sc))
end
-- If the language doesn't have capitalization and is specified in [[Module:usex/data]], then capitalize any sentences.
-- Exclamation marks and question marks need to be unescaped then re-escaped.
if tr and mw.loadData(usex_data_module).capitalize_sentences[langcode] then
tr = tr:gsub("&#x21;", "!")
:gsub("&#x3F;", "?")
tr = rsub(tr, "%f[^%z%p%s](.)(.-[%.%?!‽])", function(m1, m2)
return uupper(m1) .. m2
end)
tr = tr:gsub("!", "&#x21;")
:gsub("%?", "&#x3F;")
end
end
transliteration = lang:transliterate(subbed_usex, sc)
end
end


-- If there is still no transliteration, then add a cleanup category
-- If there is still no transliteration, then add a cleanup category.
if not transliteration and needs_translit[lang] then
if not tr and needs_translit[langcode] and not sc:getCode():find("Lat") and sc:getCode() ~= "None" then
table.insert(categories, "Requests for transliteration of " .. lang:getCanonicalName() .. " terms")
table.insert(categories, ("Requests for transliteration of %s %ss"):format(lang:getCanonicalName(),
example_type))
end
end
end
end
if transliteration then
if tr then
transliteration = require("Module:script utilities").tag_translit(transliteration, lang:getCode(), "usex")
tr = require(script_utilities_module).tag_translit(tr, langcode, "usex")
end
end
if transcription then
if ts then
transcription = require("Module:script utilities").tag_transcription(transcription, lang:getCode(), "usex")
ts = require(script_utilities_module).tag_transcription(ts, langcode, "usex")
ts = "/" .. ts .. "/"
end
end


if translation == "-" then
local function do_language_and_script_tagging(usex, lang, sc, css_class)
translation = nil
usex = require(links_module).embedded_language_links{term = usex, lang = lang, sc = sc}
table.insert(categories, "Omitted translation in the main namespace")
elseif translation then
translation = span(class.translation, translation)
elseif lang:getCode() ~= "en" and lang:getCode() ~= "mul" and lang:getCode() ~= "und" then
-- add trreq category if translation is unspecified and language is not english, translingual or undetermined
table.insert(categories, "Requests for translations of " .. lang:getCanonicalName() .. " usage examples")
translation = "<small>(please add an English translation of this " .. example_type .. ")</small>"
end
 
if usex then
if usex:find("[[", 1, true) then
usex = require("Module:links").language_link({term = usex, lang = lang}, false)
end
local face
local face
Line 130: Line 256:
end
end
if not nocat and (namespace == "" or namespace == "Reconstruction" or namespace == "Contionary") then
usex = require(script_utilities_module).tag_text(usex, lang, sc, face, css_class)
if quote then
 
table.insert(categories, lang:getCanonicalName() .. " terms with quotations")
return usex
end
 
if usex then
usex = do_language_and_script_tagging(usex, lang, sc,
quote == "quote-meta" and css_classes.quotation_with_citation or
quote and css_classes.quotation or css_classes.example)
if not nocat then
-- Only add [[Citations:foo]] to [[:Category:LANG terms with quotations]] if [[foo]] exists.
local ok_to_add_cat
if title.nsText ~= "Citations" then
ok_to_add_cat = true
else
else
table.insert(categories, lang:getCanonicalName() .. " terms with usage examples")
-- Here we don't want to use the subpage text because we check [[Citations:foo]] against [[foo]] and
-- if there's a slash in what follows 'Citations:', we want to check against the full page with the
-- slash.
local mainspace_title = mw.title.new(title.text)
if mainspace_title and mainspace_title.exists then
ok_to_add_cat = true
end
end
if ok_to_add_cat then
-- Categories beginning with the language name should use full languages as that's what the poscat
-- system requires, but 'Requests for' categories can use etymology-only languages.
table.insert(categories, ("%s terms with %ss"):format(termlang:getFullName(), example_type))
end
end
end
end
usex = require("Module:script utilities").tag_text(usex, lang, sc, face,
quote == "quote-meta" and class.quotation_with_citation or
quote and class.quotation or class.example)
else
else
if transliteration then
if tr then
table.insert(categories, "Requests for native script in " .. lang:getCanonicalName() .. " usage examples")
table.insert(categories, ("Requests for native script in %s %ss"):format(lang:getCanonicalName(),
example_type))
end
end
-- TODO: Trigger some kind of error here
-- TODO: Trigger some kind of error here
usex = "<small>(please add the primary text of this " .. example_type .. ")</small>"
usex = "<small>(please add the primary text of this " .. example_type .. ")</small>"
end
if norm then
-- Use brackets in HTML entity format just to make sure we don't interfere with links; add brackets before
-- script tagging so that if the script tagging increases the font size, the brackets get increased too.
norm = "&#91;" .. norm .. "&#93;"
norm = do_language_and_script_tagging(norm, lang, normsc, css_classes.normalization)
end
end


local result = {}
local result = {}
 
if sc:getDirection() == "rtl" then
if leftq and #leftq > 0 then
usex = "&rlm;" .. usex .. "&lrm;"
table.insert(result, span(css_classes.qualifier, require("Module:qualifier").format_qualifier(leftq)) .. " ")
end
end
table.insert(result, usex)
table.insert(result, usex)
if rightq and #rightq > 0 then
table.insert(result, " " .. span(css_classes.qualifier, require("Module:qualifier").format_qualifier(rightq)))
end
if ref and ref ~= "" then
track("ref")
table.insert(result, ref)
end
return {
usex = table.concat(result),
tr = tr,
ts = ts,
norm = norm
}
end
--[==[
Format a usex or quotation. Implementation of {{tl|ux}}, {{tl|quote}} and {{tl|quote-*}} templates (e.g.
{{tl|quote-book}}, {{tl|quote-journal}}, {{tl|quote-web}}, etc.). FIXME: Should also be used by {{tl|Q}} and
[[Module:Quotations]].
Takes a single object `data`, containing the following fields:
* `usex`: The text of the usex or quotation to format. Semi-mandatory (a maintenance line is displayed if missing).
* `lang`: The language object of the text. Mandatory. May be an etymology language.
* `termlang`: The language object of the term, which may be different from the language of the text. Defaults to `lang`.
              Used for categories. May be an etymology language.
* `sc`: The script object of the text. Autodetected if not given.
* `quote`: If specified, this is a quotation rather than a usex (uses a different CSS class that affects formatting).
* `inline`: If specified, format the usex or quotation inline (on one line).
* `translation`: Translation of the usex or quotation, if in a foreign language.
* `lit`: Literal translation (if the translation in `translation` is idiomatic and differs significantly from the
literal translation).
* `normalization`: Normalized version of the usex or quotation (esp. for older languages where nonstandard spellings
  were common).
* `normsc`: Script object of the normalized text. If unspecified, use the script object given in `sc` if any, otherwise
            do script detection on the normalized text. If "auto", do script detection on the normalized text even if
a script was specified in `sc`.
* `transliteration`: Transliteration of the usex. If unspecified, transliterate the normalization if specified and not
                    in a Latin script and transliterable, otherwise fall back to transliterating the usex text.
* `transcription`: Transcription of the usex, for languages where the transliteration differs significantly from the
                  pronunciation.
* `subst`: String indicating substitutions to perform on the usex/quotation and normalization prior to transliterating
          them. Multiple substs are comma-separated and individual substs are of the form FROM//TO where FROM is a
  Lua pattern and TO is a Lua replacement spec. (FROM/TO is also recognized if no // is present in the
  substitution.)
* `q`: If specified, a list of left qualifiers to display before the usex/quotation text.
* `qq`: If specified, a list of right qualifiers to display after the usex/quotation text.
* `qualifiers`: If specified, a list of right qualifiers to display after the usex/quotation text, for compatibility
                purposes.
* `ref`: Reference text to display directly after the right qualifiers. (FIXME: Instead, this should be actual
        references.)
* `orig`: Original text, if the primary text of the usex or quotation is a translation.
* `origlang`: The language object of the original text. Mandatory if original text given. May be an etymology language.
* `origsc`: The script object of the original text. Autodetected if not given.
* `orignorm`: Normalized version of the original text (esp. for older languages where nonstandard spellings were
              common).
* `orignormsc`: Script object of the normalized original text. If unspecified, use the script object given in `origsc`
                if any, otherwise do script detection on the normalized original text. If "auto", do script detection
                on the normalized text even if a script was specified in `origsc`.
* `origtr`: Transliteration of the original text. If unspecified, transliterate the normalized original text if
            specified and not in a Latin script and transliterable, otherwise fall back to transliterating the original
            text.
* `origts`: Transcription of the original text, for languages where the transliteration differs significantly from the
            pronunciation.
* `origsubst`: String indicating substitutions to perform on the original text and normalization thereof prior to
              transliterating them. Multiple substs are comma-separated and individual substs are of the form FROM//TO
              where FROM is a Lua pattern and TO is a Lua replacement spec. (FROM/TO is also recognized if no // is
              present in the substitution.)
* `origq`: If specified, a list of left qualifiers to display before the original text.
* `origqq`: If specified, a list of right qualifiers to display after the original text.
* `origref`: Reference text to display directly after the right qualifiers of the original text. (FIXME: Instead, this
            should be actual references.)
* `source`: Source of the quotation, displayed in parens after the quotation text.
* `footer`: Footer displaying miscellaneous information, shown after the quotation. (Typically this should be in a
            small font.)
* `nocat`: Suppress categorization.
* `noreq`: Suppress request for translation when no translation provided.
* `sortkey`: Sort key for categories.
* `brackets`: If specified, show a bracket at the end (used with brackets= in {{tl|quote-*}} templates, which show the
              bracket at the beginning, to indicate a mention rather than a use).
* `class`: Additional CSS class surrounding the entire formatted text.
]==]
function export.format_usex(data)
local lang = data.lang
local termlang = data.termlang or lang
local translation = data.translation
local quote = data.quote
local lit = data.lit
local source = data.source
local brackets = data.brackets
local footer = data.footer
local sortkey = data.sortkey
local noreq = data.noreq
local title
if data.pagename then -- for testing, doc pages, etc.
title = mw.title.new(data.pagename)
if not title then
error(("Bad value for `data.pagename`: '%s'"):format(data.pagename))
end
else
title = mw.title.getCurrentTitle()
end
--[[
if title.nsText == "Reconstruction" or lang:hasType("reconstructed") then
error("Reconstructed languages and reconstructed terms cannot have usage examples, as we have no record of their use.")
end
]]
if #qualifiers > 0 then
if lit then
table.insert(result, " " .. require("Module:qualifier").format_qualifier(qualifiers))
lit = "(literally, “" .. span(css_classes.literally, lit) .. "”)"
end
 
if source then
source = "(" .. span(css_classes.source, source) .. ")"
end
 
if footer then
footer = span(css_classes.footer, footer)
end
end
table.insert(result, ref)
local example_type = quote and "quotation" or "usage example" -- used in error messages and categories
local categories = {}
if inline then
 
if transliteration then
local usex_obj = process_usex_text {
table.insert(result, " " .. transliteration)
lang = lang,
if transcription then
termlang = termlang,
table.insert(result, " /" .. transcription .. "/")
usex = data.usex,
sc = data.sc,
tr = data.transliteration,
ts = data.transcription,
norm = data.normalization,
normsc = data.normsc,
subst = data.subst,
quote = data.quote,
title = title,
q = data.q,
qq = data.qq,
ref = data.ref,
nocat = data.nocat,
categories = categories,
example_type = example_type,
}
 
local orig_obj = data.orig and process_usex_text {
lang = data.origlang,
-- Any categories derived from the original text should use the language of the main text or the term inside it,
-- not the language of the original text.
termlang = termlang,
usex = data.orig,
sc = data.origsc,
tr = data.origtr,
ts = data.origts,
norm = data.orignorm,
normsc = data.orignormsc,
subst = data.origsubst,
quote = data.quote,
title = title,
q = data.origq,
qq = data.origqq,
ref = data.origref,
nocat = data.nocat,
categories = categories,
example_type = example_type,
} or nil
 
if translation == "-" then
translation = nil
table.insert(categories, ("%s %ss with omitted translation"):format(lang:getFullName(),
example_type))
elseif translation then
translation = span(css_classes.translation, translation)
elseif not noreq then
local langcode = lang:getFullCode()
local origlangcode = data.origlang and data.origlang:getFullCode()
if langcode ~= "en" and langcode ~= "mul" and langcode ~= "und" and origlangcode ~= "en" then
-- add trreq category if translation is unspecified and language is not english, translingual or
-- undetermined
table.insert(categories, ("Requests for translations of %s %ss"):format(lang:getCanonicalName(),
example_type))
if quote then
translation = "<small>(please [[WT:Quotations#Adding translations to quotations|add an English translation]] of this "
.. example_type .. ")</small>"
else
translation = "<small>(please add an English translation of this " .. example_type .. ")</small>"
end
end
end
 
local function generate_inline_usex()
local result = {}
local function ins(text)
table.insert(result, text)
end
 
ins(usex_obj.usex)
 
local function insert_annotations(obj)
if obj.norm then
ins(" " .. obj.norm)
end
if obj.tr or obj.ts then
ins(" ―")
if obj.tr then
ins(" " .. obj.tr)
end
if obj.ts then
ins(" " .. obj.ts)
end
end
end
elseif transcription then
end
table.insert(result, " ― /" .. transcription .. "/")
 
insert_annotations(usex_obj)
 
if orig_obj then
ins(" (")
ins("[" .. original_text .. orig_obj.usex .. "]")
insert_annotations(orig_obj)
ins(")")
end
end


if translation then
if translation then
table.insert(result, " ― " .. translation)
ins(" ― " .. translation)
end
end


if lit then
if lit then
table.insert(result, " " .. lit)
ins(" " .. lit)
end
end
if source then
if source then
table.insert(result, " " .. source)
ins(" " .. source)
end
end


if footer then
if footer then
table.insert(result, " " .. footer)
ins(" " .. footer)
end
end


if brackets then
if data.brackets then
table.insert(result, "]")
ins("]")
end
end
elseif transliteration or translation or transcription or lit or source or footer then
table.insert(result, "<dl>")
local closing_tag = ""


if transliteration then
return table.concat(result)
table.insert(result, closing_tag)
end
table.insert(result, "<dd>" .. transliteration)
 
closing_tag = "</dd>"
local function generate_multiline_usex()
end
local result = {}
local function ins(text)
if transcription then
table.insert(result, text)
table.insert(result, closing_tag)
table.insert(result, "<dd>/" .. transcription .. "/")
closing_tag = "</dd>"
end
if translation then
table.insert(result, closing_tag)
table.insert(result, "<dd>" .. translation)
closing_tag = "</dd>"
end
end


if lit then
ins(usex_obj.usex)
table.insert(result, closing_tag)
local any_usex_annotations = usex_obj.tr or usex_obj.ts or usex_obj.norm or translation or lit
table.insert(result, "<dd>" .. lit)
local any_orig_annotations = orig_obj and (orig_obj.tr or orig_obj.ts or orig_obj.norm)
closing_tag = "</dd>"
if any_usex_annotations or orig_obj or source or footer then
ins("<dl>")
 
local function insert_dd(text)
if text then
ins("<dd>")
ins(text)
if data.brackets then
ins(BRACKET_SUB)
end
ins("</dd>")
end
end
 
insert_dd(usex_obj.norm)
insert_dd(usex_obj.tr)
insert_dd(usex_obj.ts)
 
if orig_obj then
insert_dd("[" .. original_text .. orig_obj.usex .. "]")
if any_orig_annotations then
ins("<dd><dl>")
insert_dd(orig_obj.norm)
insert_dd(orig_obj.tr)
insert_dd(orig_obj.ts)
ins("</dl></dd>")
end
end
 
insert_dd(translation)
insert_dd(lit)
 
if source or footer then
if any_usex_annotations then
ins("<dd><dl>")
end
insert_dd(source)
insert_dd(footer)
if any_usex_annotations then
ins("</dl></dd>")
end
end
 
ins("</dl>")
elseif data.brackets then
ins(BRACKET_SUB)
end
end


local extra_indent, closing_extra_indent
result = table.concat(result)
if transliteration or transcription or translation or lit then
if data.brackets then
extra_indent = "<dd><dl><dd>"
result = result:gsub("^(.*)" .. BRACKET_SUB, "%1]"):gsub(BRACKET_SUB, "")
closing_extra_indent = "</dd></dl></dd>"
else
extra_indent = "<dd>"
closing_extra_indent = "</dd>"
end
if source then
table.insert(result, closing_tag)
table.insert(result, extra_indent .. source)
closing_tag = closing_extra_indent
end
end


if footer then
return result
table.insert(result, closing_tag)
end
table.insert(result, extra_indent .. footer)
closing_tag = closing_extra_indent
end


if brackets then
local is_inline
table.insert(result, "]")
if data.inline == "auto" then
result = generate_inline_usex()
if get_character_width(convert_to_raw_text(result)) > MAX_INLINE_WIDTH then
result = generate_multiline_usex()
is_inline = false
else
is_inline = true
end
end
elseif data.inline then
table.insert(result, closing_tag)
result = generate_inline_usex()
 
is_inline = true
table.insert(result, "</dl>")
else
else
if brackets then
result = generate_multiline_usex()
table.insert(result, "]")
is_inline = false
end
end
end
 
result = table.concat(result)
local class = quote and css_classes.container_quotation or css_classes.container_ux
local class = quote and class.container_quotation or class.container_ux
if data.class then
if added_class then
class = class .. " " .. data.class
class = class .. " " .. added_class
end
result = div(class, result)
result = result .. require("Module:utilities").format_categories(categories, lang, sortkey)
if noenum then
result = "\n: " .. result
end
end
return result
result = (is_inline and span or div)(class, result)
return result .. require("Module:utilities").format_categories(categories, lang, sortkey)
end
end


return export
return export

Latest revision as of 10:42, 6 August 2024

Documentation for this module may be created at Module:usex/doc

local export = {}

local debug_track_module = "Module:debug/track"
local links_module = "Module:links"
local scripts_module = "Module:scripts"
local script_utilities_module = "Module:script utilities"
local string_utilities_module = "Module:string utilities"
local usex_data_module = "Module:usex/data"

local m_str_utils = require(string_utilities_module)

local rsubn = m_str_utils.gsub
local rsplit = m_str_utils.split
local rfind = m_str_utils.find
local uupper = m_str_utils.upper
local ulen = m_str_utils.len
local u = m_str_utils.char

-- Read-only data table loaded from [[Module:transliteration/data]]; only its first element is used here.
local translit_data = mw.loadData("Module:transliteration/data")
-- Lookup table consulted when deciding whether to add a "Requests for transliteration" category.
-- NOTE(review): presumably keyed by language code -- confirm against [[Module:transliteration/data]].
local needs_translit = translit_data[1]

-- Placeholder character (U+FFF0) inserted into generated output at positions where a closing bracket may be needed;
-- it is swapped for "]" or removed once the output has been assembled.
local BRACKET_SUB = u(0xFFF0)
-- Label placed in front of the original-language text when the primary text is a translation.
local original_text = "<small>''original:''</small> "

-- 100 more or less corresponds to the setting of 30 for the example text alone as formerly used in
-- {{hi-x}} and {{ur-x}}, taking into account transliteration, gloss and formatting characters.
-- FIXME: We should have different widths for desktop vs. mobile and generate the appropriate CSS so
-- both are handled correctly.
local MAX_INLINE_WIDTH = 100 -- In characters. HACK! FIXME! Do this a better way.
-- List of scripts whose characters are double-width/full-width.
-- Each code is looked up through the scripts module, and characters of these scripts count as two when measuring
-- the display width of a formatted example.
local double_width_scripts = {"Hani", "Hrkt", "Hang"}

-- microformat2 classes, see https://phabricator.wikimedia.org/T138709
-- Maps a logical role within a formatted usage example or quotation to the CSS class string emitted for it.
local css_classes = {
	-- Classes for the element wrapping the whole formatted usage example / quotation.
	container_ux = 'h-usage-example',
	container_quotation = 'h-quotation',
	-- Classes passed along when tagging the primary text; which one applies depends on the `quote` setting.
	example = 'e-example',
	quotation = 'e-quotation',
	quotation_with_citation = 'e-quotation cited-passage',
	-- Class for the span around the translation.
	translation = 'e-translation',
	-- The two commented-out entries directly below are intentionally not defined here: those classes are added by
	-- [[Module:script utilities]], using [[Module:script utilities/data]]. The entries after them are live.
--	transliteration = 'e-transliteration',
--	transcription = 'e-transcription',
	-- Classes for the remaining annotations.
	normalization = 'e-normalization',
	literally = 'e-literally',
	qualifier = 'e-qualifier',
	source = 'e-source',
	footer = 'e-footer'
}


-- helper functions

-- Record a tracking entry named "usex/<page>" through the debug tracking module.
-- The tracking module is loaded on demand. Always returns true.
local function track(page)
	local record = require(debug_track_module)
	record("usex/" .. page)
	return true
end


-- Single-result wrapper around rsubn(): performs the same substitution but hands back only the
-- resulting string, dropping any further return values.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the call to exactly one return value.
	return (rsubn(term, foo, bar))
end


-- Build the HTML string `<tag class="class" [lang="lang"]>text</tag>`.
-- `lang` is optional; when it is nil or false no lang attribute is emitted.
-- Returns nil (instead of markup) when either `text` or `class` is missing.
local function wrap(tag, class, text, lang)
	-- Build the optional attribute first, before deciding whether there is anything to wrap.
	local lang_attr = lang and (' lang="' .. lang .. '"') or ""

	if not (text and class) then
		return nil
	end

	local pieces = {
		'<', tag, ' class="', class, '"', lang_attr, '>',
		text,
		'</', tag, '>',
	}
	return table.concat(pieces)
end

-- Wrap `text` in a <span> element carrying CSS class `class`; see wrap() for the nil cases.
local function span(class, text)
	return wrap('span', class, text)
end
-- Wrap `text` in a <div> element carrying CSS class `class`; see wrap() for the nil cases.
local function div(class, text)
	return wrap('div', class, text)
end

-- Reduce formatted text to what is actually displayed: strip anything that looks like an HTML tag and, if wikilinks
-- are present, resolve them via the links module. The removed characters don't contribute to the displayed length.
local function convert_to_raw_text(text)
	local stripped = rsub(text, "<.->", "")
	-- Only load the links module when there is at least one "[[" to process.
	if not stripped:find("%[%[") then
		return stripped
	end
	local unlinked = require(links_module).remove_links(stripped)
	return unlinked
end

-- Estimate the display width of `text` in character cells: a character belonging to one of the scripts listed in
-- `double_width_scripts` counts as two, every other character counts as one.
local function get_character_width(text)
	-- Collect the character sets of all double-width scripts so they can be merged into one pattern class.
	local wide_charsets = {}
	for i = 1, #double_width_scripts do
		local script_obj = require(scripts_module).getByCode(double_width_scripts[i])
		wide_charsets[#wide_charsets + 1] = script_obj:getCharacters()
	end
	local wide_pattern = "[" .. table.concat(wide_charsets) .. "]"

	-- Whatever is left after deleting the wide characters is single-width.
	local narrow_count = ulen(rsub(text, wide_pattern, ""))
	local wide_count = ulen(text) - narrow_count
	return narrow_count + 2 * wide_count
end

--[==[
Apply the substitutions in `subst` (from the {{para|subst}} parameter or similar) to the example or quotation in
`usex` after removing links, returning the resulting text. `track`, if supplied, is a function of one argument that is
used to insert tracking categories: one for any call to this function, another if a single / is used in the `subst`
argument.
]==]
function export.apply_subst(usex, subst, track)
	-- Tracking is optional: this wrapper does nothing when the caller passed no `track` callback.
	local function do_track(page)
		if track then
			track(page)
		end
		return true
	end

	-- Substitutions always operate on the link-free text.
	local subbed_usex = require(links_module).remove_links(usex)
	if not subst then
		return subbed_usex
	end

	-- [[Special:WhatLinksHere/Wiktionary:Tracking/usex/subst]]
	do_track("subst")

	for _, subpair in ipairs(rsplit(subst, ",")) do
		-- Prefer the FROM//TO form; fall back to FROM/TO (and track that usage) when no "//" is present.
		local separator
		if rfind(subpair, "//") then
			separator = "//"
		else
			-- [[Special:WhatLinksHere/Wiktionary:Tracking/usex/subst-single-slash]]
			do_track("subst-single-slash")
			separator = "/"
		end

		local from_to = rsplit(subpair, separator)
		subbed_usex = rsub(subbed_usex, from_to[1], from_to[2])
	end

	return subbed_usex
end

--[=[
Process parameters for usex text (either the primary text or the original text) and associated annotations. On input,
the following fields are recognized in `data` (all are optional except as marked):

* `lang`: Language object of text; may be an etymology language (REQUIRED).
* `termlang`: The language object of the term being illustrated, which may be different from the language of the main
			  quotation text and should always be based off of the main text, not the original text. Used for
			  categories. May be an etymology language (REQUIRED).
* `usex`: Text of usex/quotation.
* `sc`: Script object of text.
* `tr`: Manual transliteration.
* `ts`: Transcription.
* `norm`: Normalized version of text.
* `normsc`: Script object of normalized version of text, or "auto".
* `subst`: String of substitutions for transliteration purposes.
* `quote`: If non-nil, this is a quotation (using {{tl|quote}} or {{tl|quote-*}}) instead of a usage example (using
  {{tl|usex}}). If it has the specific value "quote-meta", this is a quotation with citation (invoked from
  {{tl|quote-*}}). This controls the CSS class used to display the quotation, as well as the face used to tag the usex
  (which in turn results in the usex being upright text if a quotation, and italic text if a usage example).
* `title`: Title object of the current page (REQUIRED).
* `q`: List of left qualifiers.
* `qq`: List of right qualifiers.
* `ref`: String to display directly after any right qualifier, with no space. (FIXME: Should be converted into
         an actual ref.)
* `nocat`: Overall `data.nocat` value.
* `categories`: List to insert categories into (REQUIRED).
* `example_type`: Either "quotation" (if `quote` specified) or "usage example" (otherwise) (REQUIRED).

On output, return an object with four fields:
* `usex`: Formatted usex, including qualifiers attached to both sides and `ref` attached to the right. Always specified.
* `tr`: Formatted transliteration; may be nil.
* `ts`: Formatted transcription; may be nil.
* `norm`: Formatted normalized version of usex; may be nil.
]=]
local function process_usex_text(data)
	-- Copy the input fields into locals; several of them (sc, normsc, tr, ts, norm, usex) are overwritten below with
	-- their resolved or formatted values.
	local lang = data.lang
	local termlang = data.termlang
	local usex = data.usex
	local sc = data.sc
	local tr = data.tr
	local ts = data.ts
	local norm = data.norm
	local normsc = data.normsc
	local subst = data.subst
	local quote = data.quote
	local leftq = data.q
	local rightq = data.qq
	local ref = data.ref
	local nocat = data.nocat
	local categories = data.categories
	local example_type = data.example_type
	local title = data.title

	-- Resolve the script of the normalization. "auto" forces detection below; an unspecified value inherits the
	-- caller-supplied `sc` (this happens before `sc` itself is autodetected, so it may still be nil here).
	if normsc == "auto" then
		normsc = nil
	elseif not normsc then
		normsc = sc
	end

	-- Autodetect whichever scripts are still unknown.
	if not sc then
		sc = lang:findBestScript(usex)
	end
	if not normsc and norm then
		normsc = lang:findBestScript(norm)
	end

	local langcode = lang:getFullCode()

	-- tr=- means omit transliteration altogether
	if tr == "-" then
		tr = nil
	else
		-- Try to auto-transliterate.
		if not tr then
			-- First, try transliterating the normalization, if supplied.
			if norm and normsc and not normsc:getCode():find("Lat") then -- Latn, Latf, Latg, pjt-Latn
				local subbed_norm = export.apply_subst(norm, subst, track)
				-- The parentheses keep only the first return value of transliterate().
				tr = (lang:transliterate(subbed_norm, normsc))
			end
			-- If no normalization, or the normalization is in a Latin script, or the transliteration of the
			-- normalization failed, fall back to transliterating the usex.
			if not tr then
				local subbed_usex = export.apply_subst(usex, subst, track)
				tr = (lang:transliterate(subbed_usex, sc))
			end
			
			-- If the language doesn't have capitalization and is specified in [[Module:usex/data]], then capitalize any sentences.
			-- Exclamation marks and question marks need to be unescaped then re-escaped.
			if tr and mw.loadData(usex_data_module).capitalize_sentences[langcode] then
				tr = tr:gsub("&#x21;", "!")
					:gsub("&#x3F;", "?")
				-- Uppercase the first captured character of each match; the second capture (up to and including the
				-- closing ., ?, ! or ‽) is passed through unchanged.
				tr = rsub(tr, "%f[^%z%p%s](.)(.-[%.%?!‽])", function(m1, m2)
					return uupper(m1) .. m2
				end)
				tr = tr:gsub("!", "&#x21;")
					:gsub("%?", "&#x3F;")
			end
		end

		-- If there is still no transliteration, then add a cleanup category.
		-- This only happens for languages flagged in the transliteration data, and only when the script code
		-- neither contains "Lat" nor equals "None".
		if not tr and needs_translit[langcode] and not sc:getCode():find("Lat") and sc:getCode() ~= "None" then
			table.insert(categories, ("Requests for transliteration of %s %ss"):format(lang:getCanonicalName(),
				example_type))
		end
	end
	-- Apply the "usex" formatting to whatever transliteration and transcription we ended up with.
	if tr then
		tr = require(script_utilities_module).tag_translit(tr, langcode, "usex")
	end
	if ts then
		ts = require(script_utilities_module).tag_transcription(ts, langcode, "usex")
		-- Transcriptions are displayed between slashes.
		ts = "/" .. ts .. "/"
	end

	-- Process embedded language links in `usex` and tag the result with language, script and `css_class`.
	-- Quotations are tagged without a face; usage examples get the "usex" face.
	local function do_language_and_script_tagging(usex, lang, sc, css_class)
		usex = require(links_module).embedded_language_links{term = usex, lang = lang, sc = sc}
		
		local face
		if quote then
			face = nil
		else
			face = "usex"
		end
		
		usex = require(script_utilities_module).tag_text(usex, lang, sc, face, css_class)

		return usex
	end

	if usex then
		-- Pick the CSS class by kind: quotation with citation ("quote-meta"), plain quotation, or usage example.
		usex = do_language_and_script_tagging(usex, lang, sc,
			quote == "quote-meta" and css_classes.quotation_with_citation or
			quote and css_classes.quotation or css_classes.example)
		
		if not nocat then
			-- Only add [[Citations:foo]] to [[:Category:LANG terms with quotations]] if [[foo]] exists.
			local ok_to_add_cat
			if title.nsText ~= "Citations" then
				ok_to_add_cat = true
			else
				-- Here we don't want to use the subpage text because we check [[Citations:foo]] against [[foo]] and
				-- if there's a slash in what follows 'Citations:', we want to check against the full page with the
				-- slash.
				local mainspace_title = mw.title.new(title.text)
				if mainspace_title and mainspace_title.exists then
					ok_to_add_cat = true
				end
			end
			if ok_to_add_cat then
				-- Categories beginning with the language name should use full languages as that's what the poscat
				-- system requires, but 'Requests for' categories can use etymology-only languages.
				table.insert(categories, ("%s terms with %ss"):format(termlang:getFullName(), example_type))
			end
		end
	else
		-- No primary text: if a transliteration exists anyway, request the native-script text.
		if tr then
			table.insert(categories, ("Requests for native script in %s %ss"):format(lang:getCanonicalName(),
				example_type))
		end
		
		-- TODO: Trigger some kind of error here
		usex = "<small>(please add the primary text of this " .. example_type .. ")</small>"
	end

	if norm then
		-- Use brackets in HTML entity format just to make sure we don't interfere with links; add brackets before
		-- script tagging so that if the script tagging increases the font size, the brackets get increased too.
		norm = "&#91;" .. norm .. "&#93;"
		norm = do_language_and_script_tagging(norm, lang, normsc, css_classes.normalization)
	end

	-- Assemble the displayed text: [left qualifiers + space] usex [space + right qualifiers] [ref].
	local result = {}

	if leftq and #leftq > 0 then
		table.insert(result, span(css_classes.qualifier, require("Module:qualifier").format_qualifier(leftq)) .. " ")
	end
	table.insert(result, usex)
	if rightq and #rightq > 0 then
		table.insert(result, " " .. span(css_classes.qualifier, require("Module:qualifier").format_qualifier(rightq)))
	end

	-- A non-empty ref is tracked and appended directly, with no separating space.
	if ref and ref ~= "" then
		track("ref")
		table.insert(result, ref)
	end

	return {
		usex = table.concat(result),
		tr = tr,
		ts = ts,
		norm = norm
	}
end


--[==[
Format a usex or quotation. Implementation of {{tl|ux}}, {{tl|quote}} and {{tl|quote-*}} templates (e.g.
{{tl|quote-book}}, {{tl|quote-journal}}, {{tl|quote-web}}, etc.). FIXME: Should also be used by {{tl|Q}} and
[[Module:Quotations]].

Takes a single object `data`, containing the following fields:

* `usex`: The text of the usex or quotation to format. Semi-mandatory (a maintenance line is displayed if missing).
* `lang`: The language object of the text. Mandatory. May be an etymology language.
* `termlang`: The language object of the term, which may be different from the language of the text. Defaults to `lang`.
              Used for categories. May be an etymology language.
* `sc`: The script object of the text. Autodetected if not given.
* `quote`: If specified, this is a quotation rather than a usex (uses a different CSS class that affects formatting).
* `inline`: If specified, format the usex or quotation inline (on one line).
* `translation`: Translation of the usex or quotation, if in a foreign language.
* `lit`: Literal translation (if the translation in `translation` is idiomatic and differs significantly from the
		 literal translation).
* `normalization`: Normalized version of the usex or quotation (esp. for older languages where nonstandard spellings
				   were common).
* `normsc`: Script object of the normalized text. If unspecified, use the script object given in `sc` if any, otherwise
            do script detection on the normalized text. If "auto", do script detection on the normalized text even if
			a script was specified in `sc`.
* `transliteration`: Transliteration of the usex. If unspecified, transliterate the normalization if specified and not
                     in a Latin script and transliterable, otherwise fall back to transliterating the usex text.
* `transcription`: Transcription of the usex, for languages where the transliteration differs significantly from the
                   pronunciation.
* `subst`: String indicating substitutions to perform on the usex/quotation and normalization prior to transliterating
           them. Multiple substs are comma-separated and individual substs are of the form FROM//TO where FROM is a
		   Lua pattern and TO is a Lua replacement spec. (FROM/TO is also recognized if no // is present in the
		   substitution.)
* `q`: If specified, a list of left qualifiers to display before the usex/quotation text.
* `qq`: If specified, a list of right qualifiers to display after the usex/quotation text.
* `qualifiers`: If specified, a list of right qualifiers to display after the usex/quotation text, for compatibility
                purposes.
* `ref`: Reference text to display directly after the right qualifiers. (FIXME: Instead, this should be actual
         references.)
* `orig`: Original text, if the primary text of the usex or quotation is a translation.
* `origlang`: The language object of the original text. Mandatory if original text given. May be an etymology language.
* `origsc`: The script object of the original text. Autodetected if not given.
* `orignorm`: Normalized version of the original text (esp. for older languages where nonstandard spellings were
              common).
* `orignormsc`: Script object of the normalized original text. If unspecified, use the script object given in `origsc`
                if any, otherwise do script detection on the normalized original text. If "auto", do script detection
                on the normalized text even if a script was specified in `origsc`.
* `origtr`: Transliteration of the original text. If unspecified, transliterate the normalized original text if
            specified and not in a Latin script and transliterable, otherwise fall back to transliterating the original
            text.
* `origts`: Transcription of the original text, for languages where the transliteration differs significantly from the
            pronunciation.
* `origsubst`: String indicating substitutions to perform on the original text and normalization thereof prior to
               transliterating them. Multiple substs are comma-separated and individual substs are of the form FROM//TO
               where FROM is a Lua pattern and TO is a Lua replacement spec. (FROM/TO is also recognized if no // is
               present in the substitution.)
* `origq`: If specified, a list of left qualifiers to display before the original text.
* `origqq`: If specified, a list of right qualifiers to display after the original text.
* `origref`: Reference text to display directly after the right qualifiers of the original text. (FIXME: Instead, this
             should be actual references.)
* `source`: Source of the quotation, displayed in parens after the quotation text.
* `footer`: Footer displaying miscellaneous information, shown after the quotation. (Typically this should be in a
            small font.)
* `nocat`: Suppress categorization.
* `noreq`: Suppress request for translation when no translation provided.
* `sortkey`: Sort key for categories.
* `brackets`: If specified, show a bracket at the end (used with brackets= in {{tl|quote-*}} templates, which show the
              bracket at the beginning, to indicate a mention rather than a use).
* `class`: Additional CSS class surrounding the entire formatted text.
]==]

function export.format_usex(data)
	local lang = data.lang
	-- Categories derived from the term use `termlang`, which falls back to the language of the text.
	local termlang = data.termlang or lang
	local translation = data.translation
	local quote = data.quote
	local lit = data.lit
	local source = data.source
	local brackets = data.brackets
	local footer = data.footer
	local sortkey = data.sortkey
	local noreq = data.noreq

	-- `origlang` is mandatory whenever original text is supplied. Without this check the failure would surface as
	-- an obscure "attempt to index a nil value" error while processing the original text.
	if data.orig and not data.origlang then
		error("`data.origlang` must be specified when `data.orig` is given")
	end

	local title
	if data.pagename then -- for testing, doc pages, etc.
		title = mw.title.new(data.pagename)
		if not title then
			error(("Bad value for `data.pagename`: '%s'"):format(data.pagename))
		end
	else
		title = mw.title.getCurrentTitle()
	end

	--[[
	if title.nsText == "Reconstruction" or lang:hasType("reconstructed") then
		error("Reconstructed languages and reconstructed terms cannot have usage examples, as we have no record of their use.")
	end
	]]

	-- Pre-format the simple annotations that do not need language or script processing.
	if lit then
		lit = "(literally, “" .. span(css_classes.literally, lit) .. "”)"
	end

	if source then
		source = "(" .. span(css_classes.source, source) .. ")"
	end

	if footer then
		footer = span(css_classes.footer, footer)
	end

	local example_type = quote and "quotation" or "usage example" -- used in error messages and categories
	-- Shared category list; both process_usex_text() calls below append to it.
	local categories = {}

	local usex_obj = process_usex_text {
		lang = lang,
		termlang = termlang,
		usex = data.usex,
		sc = data.sc,
		tr = data.transliteration,
		ts = data.transcription,
		norm = data.normalization,
		normsc = data.normsc,
		subst = data.subst,
		quote = quote,
		title = title,
		q = data.q,
		qq = data.qq,
		ref = data.ref,
		nocat = data.nocat,
		categories = categories,
		example_type = example_type,
	}

	local orig_obj = data.orig and process_usex_text {
		lang = data.origlang,
		-- Any categories derived from the original text should use the language of the main text or the term inside it,
		-- not the language of the original text.
		termlang = termlang,
		usex = data.orig,
		sc = data.origsc,
		tr = data.origtr,
		ts = data.origts,
		norm = data.orignorm,
		normsc = data.orignormsc,
		subst = data.origsubst,
		quote = quote,
		title = title,
		q = data.origq,
		qq = data.origqq,
		ref = data.origref,
		nocat = data.nocat,
		categories = categories,
		example_type = example_type,
	} or nil

	if translation == "-" then
		-- translation=- explicitly suppresses the translation (and the request for one).
		translation = nil
		table.insert(categories, ("%s %ss with omitted translation"):format(lang:getFullName(),
			example_type))
	elseif translation then
		translation = span(css_classes.translation, translation)
	elseif not noreq then
		local langcode = lang:getFullCode()
		local origlangcode = data.origlang and data.origlang:getFullCode()
		if langcode ~= "en" and langcode ~= "mul" and langcode ~= "und" and origlangcode ~= "en" then
			-- add trreq category if translation is unspecified and language is not english, translingual or
			-- undetermined
			table.insert(categories, ("Requests for translations of %s %ss"):format(lang:getCanonicalName(),
				example_type))
			if quote then
				translation = "<small>(please [[WT:Quotations#Adding translations to quotations|add an English translation]] of this "
					.. example_type .. ")</small>"
			else
				translation = "<small>(please add an English translation of this " .. example_type .. ")</small>"
			end
		end
	end

	-- Render everything on a single line: text, annotations, original text, translation, literal translation,
	-- source, footer and (if requested) the closing bracket.
	local function generate_inline_usex()
		local parts = {}
		local function ins(text)
			table.insert(parts, text)
		end

		ins(usex_obj.usex)

		-- Append normalization, then transliteration and/or transcription after a "―" separator.
		local function insert_annotations(obj)
			if obj.norm then
				ins(" " .. obj.norm)
			end
			if obj.tr or obj.ts then
				ins(" ―")
				if obj.tr then
					ins(" " .. obj.tr)
				end
				if obj.ts then
					ins(" " .. obj.ts)
				end
			end
		end

		insert_annotations(usex_obj)

		if orig_obj then
			ins(" (")
			ins("[" .. original_text .. orig_obj.usex .. "]")
			insert_annotations(orig_obj)
			ins(")")
		end

		if translation then
			ins(" ― " .. translation)
		end

		if lit then
			ins(" " .. lit)
		end

		if source then
			ins(" " .. source)
		end

		if footer then
			ins(" " .. footer)
		end

		if brackets then
			ins("]")
		end

		return table.concat(parts)
	end

	-- Render the text followed by a <dl> whose <dd> items hold the annotations, one per line.
	local function generate_multiline_usex()
		local parts = {}
		local function ins(text)
			table.insert(parts, text)
		end

		ins(usex_obj.usex)
		local any_usex_annotations = usex_obj.tr or usex_obj.ts or usex_obj.norm or translation or lit
		local any_orig_annotations = orig_obj and (orig_obj.tr or orig_obj.ts or orig_obj.norm)
		if any_usex_annotations or orig_obj or source or footer then
			ins("<dl>")

			-- Emit one <dd> item (skipped when `text` is nil). When a closing bracket is wanted, a placeholder is
			-- left at the end of every item; only the last placeholder is turned into "]" further below.
			local function insert_dd(text)
				if text then
					ins("<dd>")
					ins(text)
					if brackets then
						ins(BRACKET_SUB)
					end
					ins("</dd>")
				end
			end

			insert_dd(usex_obj.norm)
			insert_dd(usex_obj.tr)
			insert_dd(usex_obj.ts)

			if orig_obj then
				insert_dd("[" .. original_text .. orig_obj.usex .. "]")
				if any_orig_annotations then
					-- Annotations of the original text are nested one level deeper.
					ins("<dd><dl>")
					insert_dd(orig_obj.norm)
					insert_dd(orig_obj.tr)
					insert_dd(orig_obj.ts)
					ins("</dl></dd>")
				end
			end

			insert_dd(translation)
			insert_dd(lit)

			if source or footer then
				-- Source and footer are nested only when other annotations of the main text precede them.
				if any_usex_annotations then
					ins("<dd><dl>")
				end
				insert_dd(source)
				insert_dd(footer)
				if any_usex_annotations then
					ins("</dl></dd>")
				end
			end

			ins("</dl>")
		elseif brackets then
			ins(BRACKET_SUB)
		end

		local text = table.concat(parts)
		if brackets then
			-- The greedy "^(.*)" makes the first gsub hit the LAST placeholder, which becomes "]"; the second gsub
			-- removes all remaining placeholders.
			text = text:gsub("^(.*)" .. BRACKET_SUB, "%1]"):gsub(BRACKET_SUB, "")
		end

		return text
	end

	-- NOTE: `result` must be declared local here; assigning it undeclared would create (or, under strict global
	-- checking, fail to create) a global variable.
	local result
	local is_inline
	if data.inline == "auto" then
		-- Try the inline form first and fall back to multiline if the visible text is too wide.
		result = generate_inline_usex()
		if get_character_width(convert_to_raw_text(result)) > MAX_INLINE_WIDTH then
			result = generate_multiline_usex()
			is_inline = false
		else
			is_inline = true
		end
	elseif data.inline then
		result = generate_inline_usex()
		is_inline = true
	else
		result = generate_multiline_usex()
		is_inline = false
	end

	local container_class = quote and css_classes.container_quotation or css_classes.container_ux
	if data.class then
		container_class = container_class .. " " .. data.class
	end
	-- Inline output goes into a <span>, multiline output into a <div>.
	result = (is_inline and span or div)(container_class, result)
	return result .. require("Module:utilities").format_categories(categories, lang, sortkey)
end

-- Hand the table of exported functions (e.g. `apply_subst`, `format_usex`) back to whoever loads this module.
return export