Module:utilities: Difference between revisions

From Linguifex
Jump to navigation Jump to search
No edit summary
No edit summary
Line 1: Line 1:
local export = {}
local headword_data_module = "Module:headword/data"
local languages_module = "Module:languages"
local links_module = "Module:links"
local load_module = "Module:load"
local pages_module = "Module:pages"
local script_utilities_module = "Module:script utilities"
local scripts_module = "Module:scripts"
local string_utilities_module = "Module:string utilities"
local utilities_data_module = "Module:utilities/data"
local mw = mw
local mw = mw
local mw_text = mw.text
local package = package
local table = table


local require = require
local anchor_encode = mw.uri.anchorEncode
local concat = table.concat
local concat = table.concat
local decode_entities = require("Module:string utilities").decode_entities
local format_categories -- Defined below.
local get_current_frame = mw.getCurrentFrame
local insert = table.insert
local ipairs = ipairs
local ipairs = ipairs
local maxn = table.maxn
local require = require
local tonumber = tonumber
local trim = mw_text.trim
local type = type
local type = type
local unstrip = mw_text.unstrip
local unstrip = mw.text.unstrip
local unstripNoWiki = mw_text.unstripNoWiki


local export = {}
--[==[
 
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
do
local function decode_entities(...)
local loaded = package.loaded
decode_entities = require(string_utilities_module).decode_entities
local loader = package.loaders[2]
return decode_entities(...)
 
end
--[==[
Like require, but return false if a module does not exist instead of throwing an error.
local function get_script(...)
Outputs are cached in {package.loaded}, which is faster for all module types, but much faster for nonexistent modules since require will attempt to use the full loader each time (since they don't get cached in {package.loaded}).
get_script = require(scripts_module).getByCode
Note: although nonexistent modules are cached as {false} in {package.loaded}, they still won't work with conventional require, since it uses a falsy check instead of checking the return value is not {nil}.
return get_script(...)
]==]
end
function export.safe_require(modname)
local module = loaded[modname]
local function is_content_page(...)
if module ~= nil then
is_content_page = require(pages_module).is_content_page
return module
return is_content_page(...)
end
end
-- The loader returns a function if the module exists, or nil if it doesn't, and checking this is faster than using pcall with require. If found, we still use require instead of loading and caching directly, because require contains safety checks against infinite loading loops (and we do want those to throw an error).
module = loader(modname)
local function load_data(...)
if module then
load_data = require(load_module).load_data
return require(modname)
return load_data(...)
end
end
loaded[modname] = false
return false
local function remove_links(...)
remove_links = require(links_module).remove_links
return remove_links(...)
end
local function tag_text(...)
tag_text = require(script_utilities_module).tag_text
return tag_text(...)
end
local function trim(...)
trim = require(string_utilities_module).trim
return trim(...)
end
local function uupper(...)
uupper = require(string_utilities_module).upper
return uupper(...)
end
end
end


--[==[
--[==[
Convert decimal to hexadecimal.
Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]
local catfix_scripts
local function get_catfix_scripts()
catfix_scripts, get_catfix_scripts = load_data(utilities_data_module).catfix_scripts, nil
return catfix_scripts
end
local current_title
local function get_current_title()
current_title, get_current_title = mw.title.getCurrentTitle(), nil
return current_title
end
local defaultsort
local function get_defaultsort()
defaultsort, get_defaultsort = load_data(headword_data_module).page.pagename_defaultsort, nil
return defaultsort
end
local pagename
local function get_pagename()
pagename, get_pagename = load_data(headword_data_module).page.encoded_pagename, nil
return pagename
end


Note: About three times as fast as the hex library.
local und
]==]
local function get_und()
function export.dec_to_hex(dec)
und, get_und = require(languages_module).getByCode("und"), nil
dec = tonumber(dec)
return und
if not dec or dec % 1 ~= 0 then
error("Input should be a decimal integer.")
end
end
return ("%x"):format(dec):upper()
 
end


do
do
local function check_level(lvl)
local function handle_url(capture)
if type(lvl) ~= "number" then
return capture:match("https?://[^%s%]]+%s([^%]]+)") or ""
error("Heading levels must be numbers.")
elseif lvl < 1 or lvl > 6 or lvl % 1 ~= 0 then
error("Heading levels must be integers between 1 and 6.")
end
return lvl
end
end
 
--[==[
--[==[
A helper function which iterates over the headings in `text`, which should be the content of a page or (main) section.
A helper function to strip wiki markup, giving the plaintext of what is displayed on the page.]==]
function export.get_plaintext(text)
text = text
:gsub("%[%[", "\1")
:gsub("%]%]", "\2")


Each iteration returns three values: `sec` (the section title), `lvl` (the section level) and `loc` (the index of the section in the given text, from the first equals sign). The section title will be automatically trimmed, and any HTML entities will be resolved.
-- Remove strip markers and HTML tags.
The optional parameter `a` (which should be an integer between 1 and 6) can be used to ensure that only headings of the specified level are iterated over. If `b` is also given, then they are treated as a range.
text = unstrip(text):gsub("<[^<>\1\2]+>", "")
The optional parameters `a` and `b` can be used to specify a range, so that only headings with levels in that range are returned. If only `a` is given ...
]==]
function export.find_headings(text, a, b)
a = a and check_level(a) or nil
b = b and check_level(b) or a or nil
local start, loc, lvl, sec = 1


return function()
-- Parse internal links for the display text, and remove categories.
repeat
text = remove_links(text)
loc, lvl, sec, start = text:match("()%f[^%z\n](==?=?=?=?=?)([^\n]+)%2[\t ]*%f[%z\n]()", start)
lvl = lvl and #lvl
until not (sec and a) or (lvl >= a and lvl <= b)
return sec and trim(decode_entities(sec)) or nil, lvl, loc
end
end


local function get_section(content, name, level)
-- Remove files.
if not (content and name) then
text = text:gsub("\1[Ff][Ii][Ll][Ee]:[^\1\2]+\2", "")
return nil
:gsub("\1[Ii][Mm][Aa][Gg][Ee]:[^\1\2]+\2", "")
elseif name:find("\n", 1, true) then
error("Heading name cannot contain a newline.")
end
level = level and check_level(level) or nil
name = trim(decode_entities(name))
local start
for sec, lvl, loc in export.find_headings(content, level and 1 or nil, level) do
if start and lvl <= level then
return content:sub(start, loc - 1)
elseif not start and (not level or lvl == level) and sec == name then
start, level = loc, lvl
end
end
return start and content:sub(start)
end
 
--[==[
A helper function to return the content of a page section.


`content` is raw wikitext, `name` is the requested section, and `level` is an optional parameter that specifies
-- Parse external links for the display text.
the required section heading level. If `level` is not supplied, then the first section called `name` is returned.
text = text:gsub("%[(https?://[^%[%]]+)%]", handle_url)
`name` can either be a string or table of section names. If a table, each name represents a section that has the
-- Any remaining square brackets aren't involved in links, but must be escaped to avoid creating new links.
next as a subsection. For example, { {"Spanish", "Noun"}} will return the first matching section called "Noun"
:gsub("\1", "&#91;&#91;")
under a section called "Spanish". These do not have to be at adjacent levels ("Noun" might be L4, while "Spanish"
:gsub("\2", "&#93;&#93;")
is L2). If `level` is given, it refers to the last name in the table (i.e. the name of the section to be returned).
:gsub("%[", "&#91;")
 
:gsub("]", "&#93;")
The returned section includes all of its subsections. If no matching section is found, return {nil}.
-- Strip bold, italics and soft hyphens.
]==]
:gsub("('*)'''(.-'*)'''", "%1%2")
function export.get_section(content, names, level)
:gsub("('*)''(.-'*)''", "%1%2")
if type(names) == "string" then
:gsub("­", "")
return get_section(content, names, level)
end
-- Get any HTML entities and trim.
local names_len = maxn(names)
-- Note: don't decode URL percent encoding, as it shouldn't be used in display text and may cause problems if % is used.
if names_len > 6 then
return trim(decode_entities(text))
error("Not possible specify more than 5 subsections: headings only go up to level 6.")
end
for i, name in ipairs(names) do
content = get_section(content, name, i == names_len and level or nil)
end
return content
end
end
 
--[==[
A function which returns the number of the page section which contains the current {#invoke}.
]==]
function export.get_current_section()
local frame = get_current_frame()
-- We determine the section via the heading strip marker count, since they're numbered sequentially, but the only way to do this is to generate a fake heading via frame:preprocess(). The native parser assigns each heading a unique marker, but frame:preprocess() will return copies of older markers if the heading is identical to one further up the page, so the fake heading has to be unique to the page. The best way to do this is to feed it a heading containing a nowiki marker (which we will need later), since those are always unique.
local nowiki_marker = frame:extensionTag("nowiki")
-- Note: heading strip markers have a different syntax to the ones used for tags.
local h = tonumber(frame:preprocess("=" .. nowiki_marker .. "=")
:match("\127'\"`UNIQ%-%-h%-(%d+)%-%-QINU`\"'\127"))
-- For some reason, [[Special:ExpandTemplates]] doesn't generate a heading strip marker, so if that happens we simply abort early.
if not h then
return 0
end
end
-- The only way to get the section number is to increment the heading count, so we store the offset in nowiki strip markers which can be retrieved by procedurally unstripping nowiki markers, counting backwards until we find a match.
local n, offset = tonumber(nowiki_marker:match("\127'\"`UNIQ%-%-nowiki%-([%dA-F]+)%-QINU`\"'\127"), 16)
while not offset and n > 0 do
n = n - 1
offset = unstripNoWiki(("\127'\"`UNIQ--nowiki-%08X-QINU`\"'\127"):format(n))
:match("^HEADING\1(%d+)") -- Prefix "HEADING\1" prevents collisions.
end
offset = offset and (offset + 1) or 0
frame:extensionTag("nowiki", "HEADING\1" .. offset)
return h - offset
end
end


do
do
local L2_sections
local function generate_sortkey(lang, sc, sort_key, sort_base)
--[==[
-- If the sort key is "-", treat the language as undetermined (the default). This is desirable when categorising (e.g.) translation requests, as the pages to be categorised are always in English/Translingual.
A function which returns the name of the L2 language section which contains the current {#invoke}.
if sort_key == "-" then
]==]
lang, sort_key = und or get_und(), nil
function export.get_current_L2()
elseif not lang then
local section = export.get_current_section()
lang = und or get_und()
if section == 0 then
end
return
-- Generate the automatic sort key.
local auto = lang:makeSortKey(sort_base or pagename or get_pagename(), sc)
-- Use the page defaultsort if necessary.
if not auto or auto == "" then
auto = defaultsort or get_defaultsort()
end
end
L2_sections = L2_sections or mw.loadData("Module:headword/data").page.L2_sections
-- If not sort key specified, use the automatic one.
while section > 0 do
if not sort_key or sort_key == "" then
local L2 = L2_sections[section]
return auto
if L2 then
-- Otherwise, if the language is not "und", categorize the manual sort key as either redundant or non-redundant.
return L2
-- FIXME: we should do this for "und" as well, but "Undetermined terms..." does not make sense for translations etc.
end
elseif lang:getCode() ~= "und" then
section = section - 1
return sort_key, "[[Category:" .. lang:getFullName() .. " terms with " .. (
uupper(sort_key) == auto and "redundant" or
"non-redundant non-automated"
) .. " sortkeys|" .. sort_key .. "]]"
end
end
return sort_key
end
end
end


--[==[
A helper function to strip wiki markup, giving the plaintext of what is displayed on the page.
]==]
function export.get_plaintext(text)
text = text
:gsub("%[%[", "\1")
:gsub("%]%]", "\2")
-- Remove strip markers and HTML tags.
text = unstrip(text):gsub("<[^<>\1\2]+>", "")
-- Parse internal links for the display text, and remove categories.
text = require("Module:links").remove_links(text)
-- Remove files.
for _, falsePositive in ipairs({"File", "Image"}) do
text = text:gsub("\1" .. falsePositive .. ":[^\1\2]+\2", "")
end
-- Parse external links for the display text.
text = text:gsub("%[(https?://[^%[%]]+)%]",
function(capture)
return capture:match("https?://[^%s%]]+%s([^%]]+)") or ""
end)
-- Any remaining square brackets aren't involved in links, but must be escaped to avoid creating new links.
:gsub("\1", "&#91;&#91;")
:gsub("\2", "&#93;&#93;")
:gsub("%[", "&#91;")
:gsub("]", "&#93;")
-- Strip bold, italics and soft hyphens.
:gsub("('*)'''(.-'*)'''", "%1%2")
:gsub("('*)''(.-'*)''", "%1%2")
:gsub("­", "")
-- Get any HTML entities.
-- Note: don't decode URL percent encoding, as it shouldn't be used in display text and may cause problems if % is used.
text = decode_entities(text)
return trim(text)
end
do
local title_obj, category_namespaces, page_data, pagename, pagename_defaultsort
--[==[
--[==[
Format the categories with the appropriate sort key.
Format the categories with the appropriate sort key.
* `categories` is a list of categories. Each entry in the list can be either a string (the full category, minus
* `cat` can take two forms:
  the {"Category:"} prefix) or an object. In the latter case, the object should have fields
  ** A string (the full category, minus the {"Category:"} prefix);
  ** `cat`: the full category, minus the {"Category:"} prefix (required);
  ** A list of categories. Each category in the list can be either:
  ** `lang`: optional language object to override the overall `lang`;
  *** A string in the same format as above;
  ** `sort_key`: optional sort key to override the overall `sort_key`;
  *** An object with the fields:
  ** `sort_base`: optional sort base to override the overall `sort_base`;
  **** `cat`: a string in the same format as above (required);
  ** `sc`: optional script object to override the overall `sc`.
  **** `lang`: an optional language object to override the overall `lang`;
  **** `sc`: an optional script object to override the overall `sc`.
  **** `sort_key`: an optional sort key to override the overall `sort_key`;
  **** `sort_base`: an optional sort base to override the overall `sort_base`;
* `lang` is an object encapsulating a language; if {nil}, the object for language code {"und"} (undetermined) will
* `lang` is an object encapsulating a language; if {nil}, the object for language code {"und"} (undetermined) will
  be used. `lang` is used when computing the sort key (either from the subpage name or sort base).
  be used. `lang` is used when computing the sort key (either from the subpage name or sort base).
Line 255: Line 199:
* `sc` is a script object; if nil, the default will be derived from the sort base (or its default value, the
* `sc` is a script object; if nil, the default will be derived from the sort base (or its default value, the
  subpage name) by calling {lang:findBestScript()}. The value of `sc` is used during the sort base normalization
  subpage name) by calling {lang:findBestScript()}. The value of `sc` is used during the sort base normalization
  process; for example, languages with multiple scripts will often have script-specific normalization processes.
  process; for example, languages with multiple scripts will often have script-specific normalization processes.]==]
]==]
function export.format_categories(cat, lang, sort_key, sort_base, force_output, sc)
function export.format_categories(categories, lang, sort_key, sort_base, force_output, sc)
if type(lang) == "table" and not lang.getCode then
error("The second argument to format_categories should be a language object.")
end
 
title_obj = title_obj or mw.title.getCurrentTitle()
category_namespaces = category_namespaces or mw.loadData("Module:utilities/data").category_namespaces
 
if not (
if not (
force_output or
force_output or
category_namespaces[title_obj.namespace] or
is_content_page(current_title or get_current_title()) or
title_obj.prefixedText == "Wiktionary:Sandbox"
current_title.prefixedText == "Wiktionary:Sandbox"
) then
) then
return ""
return ""
elseif not page_data then
elseif type(cat) == "string" then
page_data = mw.loadData("Module:headword/data").page
local this_sort_key, extra = generate_sortkey(lang, sc, sort_key, sort_base)
pagename = page_data.encoded_pagename
return "[[Category:" .. cat .. "|" .. this_sort_key .. "]]" .. (extra or "")
pagename_defaultsort = page_data.pagename_defaultsort
end
end
 
local ret, i, n, default = {}, 0, 0
local extra_categories
-- Iterate over all categories in `cat`.
local function generate_sort_key(lang, sort_key, sort_base, sc)
while true do
-- Generate a default sort key.
i = i + 1
-- If the sort key is "-", bypass the process of generating a sort key altogether. This is desirable when categorising (e.g.) translation requests, as the pages to be categorised are always in English/Translingual.
local category = cat[i]
if sort_key == "-" then
if category == nil then
sort_key = sort_base and sort_base:uupper() or pagename_defaultsort
return concat(ret)
else
lang = lang or require("Module:languages").getByCode("und")
sort_base = lang:makeSortKey(sort_base or pagename, sc) or pagename_defaultsort
if not sort_key or sort_key == "" then
sort_key = sort_base
elseif lang:getCode() ~= "und" then
if not extra_categories then
extra_categories = {}
end
end
end
if not sort_key or sort_key == "" then
sort_key = pagename_defaultsort
end
end
return sort_key
local this_sort_key, extra
end
-- If the category type is a table, use any custom options in it.
 
if type(category) == "table" then
local ret = {}
category, this_sort_key, extra = category.cat, generate_sortkey(
local default_sort_key = generate_sort_key(lang, sort_key, sort_base, sc)
category.lang or lang,
local ins_point = 0
category.sc or sc,
local function process_category(cat)
category.sort_key or sort_key,
local this_sort_key
category.sort_base or sort_base
if type(cat) == "string" then
)
this_sort_key = default_sort_key
-- If `default` has already been determined, use it.
elseif default then
this_sort_key = default
-- Otherwise, calculate `default` and use it.
else
else
this_sort_key = generate_sort_key(cat.lang or lang, cat.sort_key or sort_key,
this_sort_key, extra = generate_sortkey(lang, sc, sort_key, sort_base)
cat.sort_base or sort_base, cat.sc or sc)
default = this_sort_key
cat = cat.cat
end
ins_point = ins_point + 1
ret[ins_point] = "[[Category:" .. cat .. "|" .. this_sort_key .. "]]"
end
 
for _, cat in ipairs(categories) do
process_category(cat)
end
if extra_categories then
for _, cat in ipairs(extra_categories) do
process_category(cat)
end
end
n = n + 1
ret[n] = "[[Category:" .. category .. "|" .. this_sort_key .. "]]" .. (extra or "")
end
end
return concat(ret)
end
end
format_categories = export.format_categories
end
end


do
--[==[
local catfix_scripts
Add a "catfix", which is used on language-specific category pages to add language attributes and often script
classes to all entry names. The addition of language attributes and script classes makes the entry names display
better (using the language- or script-specific styles specified in [[MediaWiki:Common.css]]), which is particularly
important for non-English languages that do not have consistent font support in browsers.


--[==[
Language attributes are added for all languages, but script classes are only added for languages with one script
Add a "catfix", which is used on language-specific category pages to add language attributes and often script
listed in their data file, or for languages that have a default script listed in the {catfix_script} list in
classes to all entry names. The addition of language attributes and script classes makes the entry names display
[[Module:utilities/data]]. Some languages clearly have a default script, but still have other scripts listed in
better (using the language- or script-specific styles specified in [[MediaWiki:Common.css]]), which is particularly
their data file. If those other scripts are not simply scripts like {{cd|Brai}} (Braille), their default script
important for non-English languages that do not have consistent font support in browsers.
needs to be specified. Others do not have a default script.


Language attributes are added for all languages, but script classes are only added for languages with one script
* Serbo-Croatian is regularly written in both the Latin and Cyrillic scripts. Because it uses two scripts,
listed in their data file, or for languages that have a default script listed in the {catfix_script} list in
  Serbo-Croatian cannot have a script class applied to entries in its category pages, as only one script class
[[Module:utilities/data]]. Some languages clearly have a default script, but still have other scripts listed in
  can be specified at a time.
their data file and therefore need their default script to be specified. Others do not have a default script.
* German is usually written in the Latin script ({{cd|Latn}}), but Fraktur ({{cd|Latf}}) is also listed in
  its data file. So German needs an entry in the {catfix_script} list, so that the {{cd|Latn}} (Latin) script
  class will be applied to entries in its category pages.


* Serbo-Croatian is regularly written in both the Latin and Cyrillic scripts. Because it uses two scripts,
To find the scripts listed for a language, go to [[Module:languages]] and use the search box to find the data file
  Serbo-Croatian cannot have a script class applied to entries in its category pages, as only one script class
for the language. To find out what a script code means, search the script code in [[Module:scripts/data]].]==]
  can be specified at a time.
function export.catfix(lang, sc)
* Russian is usually written in the Cyrillic script ({{cd|Cyrl}}), but Braille ({{cd|Brai}}) is also listed in
if not lang or not lang.getCanonicalName then
  its data file. So Russian needs an entry in the {catfix_script} list, so that the {{cd|Cyrl}} (Cyrillic) script
error('The first argument to the function "catfix" should be a language object from [[Module:languages]] or [[Module:etymology languages]].')
  class will be applied to entries in its category pages.
end
if sc and not sc.getCode then
error('The second argument to the function "catfix" should be a script object from [[Module:scripts]].')
end


To find the scripts listed for a language, go to [[Module:languages]] and use the search box to find the data file
-- To add script classes to links on pages created by category boilerplate templates.
for the language. To find out what a script code means, search the script code in [[Module:scripts/data]].
if not sc then
]==]
local code = (catfix_scripts or get_catfix_scripts())[lang:getCode()] or catfix_scripts[lang:getFullCode()]
function export.catfix(lang, sc)
if code then
if not lang or not lang.getCanonicalName then
sc = get_script(code)
error('The first argument to the function "catfix" should be a language object from [[Module:languages]] or [[Module:etymology languages]].')
end
end
if sc and not sc.getCode then
end
error('The second argument to the function "catfix" should be a script object from [[Module:scripts]].')
end
local canonicalName = lang:getCanonicalName()
local nonEtymologicalName = lang:getFullName()


-- To add script classes to links on pages created by category boilerplate templates.
-- If the language only has a single valid candidate script, apply it as the default.
if not sc then
if not sc then
catfix_scripts = catfix_scripts or mw.loadData("Module:utilities/data").catfix_scripts
local scripts = lang:getScripts()
sc = catfix_scripts[lang:getCode()] or catfix_scripts[lang:getFullCode()]
if #scripts == 1 then
if sc then
sc = scripts[1]
sc = require("Module:scripts").getByCode(sc)
else
-- Iterate over scripts. If there is only one when ignoring
-- scripts like Brai, then pick that one.
for _, script in ipairs(scripts) do
if script:getCode() ~= "Brai" then
if sc then
-- Multiple candidates - fail.
sc = nil
break
else
sc = script
end
end
end
end
end
end
local catfix_class = "CATFIX-" .. mw.uri.anchorEncode(canonicalName)
if nonEtymologicalName ~= canonicalName then
catfix_class = catfix_class .. " CATFIX-" .. mw.uri.anchorEncode(nonEtymologicalName)
end
return "<span id=\"catfix\" style=\"display:none;\" class=\"" .. catfix_class .. "\">" ..
require("Module:script utilities").tag_text("&nbsp;", lang, sc, nil) ..
"</span>"
end
end
end
 
-- Hack: using a <ul> tag prevents the parser from automatically generating a <p> tag around the catfix element.
--[==[
return "<ul class=\"catfix\" data-anchor=\"" ..
Implementation of the {{tl|catfix}} template.
anchor_encode(lang:getFullName()) .. "\">" ..
]==]
tag_text("", lang, sc) .. "</ul>" ..
function export.catfix_template(frame)
format_categories("Pages using catfix", nil, nil, nil, true)
local params = {
[1] = { type = "language", required = true },
[2] = { alias_of = "sc" },
["sc"] = { type = "script" },
}
 
local args = require("Module:parameters").process(frame:getParent().args, params)
 
return export.catfix(args[1], args.sc)
end
end


Line 403: Line 315:


If `noErr` is set, the function returns false instead of throwing an error, which allows customised error handling to
If `noErr` is set, the function returns false instead of throwing an error, which allows customised error handling to
be done in the calling function.
be done in the calling function.]==]
]==]
function export.check_object(typ, noErr, ...)
function export.check_object(typ, noErr, ...)
local function fail(message)
if ... == nil then
if noErr then
if noErr then
return false
return false
else
error(message, 3)
end
end
error("Must provide at least one object to check.", 2)
end
end
 
for _, obj in ipairs{...} do
local objs = {...}
if #objs == 0 then
return fail("Must provide at least one object to check.")
end
for _, obj in ipairs(objs) do
if type(obj) ~= "table" or type(obj.hasType) ~= "function" then
if type(obj) ~= "table" or type(obj.hasType) ~= "function" then
return fail("Function expected a " .. typ .. " object, but received a " .. type(obj) .. " instead.")
if noErr then
return false
end
error("Function expected a " .. typ .. " object, but received a " .. type(obj) .. " instead.", 2)
elseif not (typ == "object" or obj:hasType(typ)) then
elseif not (typ == "object" or obj:hasType(typ)) then
for _, wrong_type in ipairs{"family", "language", "script", "Wikimedia language", "writing system"} do
for _, wrong_type in ipairs{"family", "language", "script", "Wikimedia language", "writing system"} do
if obj:hasType(wrong_type) then
if obj:hasType(wrong_type) then
return fail("Function expected a " .. typ .. " object, but received a " .. wrong_type .. " object instead.")
if noErr then
return false
end
error("Function expected a " .. typ .. " object, but received a " .. wrong_type .. " object instead.", 2)
end
end
end
end
return fail("Function expected a " .. typ .. " object, but received another type of object instead.")
if noErr then
return false
end
error("Function expected a " .. typ .. " object, but received another type of object instead.", 2)
end
end
end
end

Revision as of 18:10, 11 January 2025



local export = {}

local headword_data_module = "Module:headword/data"
local languages_module = "Module:languages"
local links_module = "Module:links"
local load_module = "Module:load"
local pages_module = "Module:pages"
local script_utilities_module = "Module:script utilities"
local scripts_module = "Module:scripts"
local string_utilities_module = "Module:string utilities"
local utilities_data_module = "Module:utilities/data"

local mw = mw

local anchor_encode = mw.uri.anchorEncode
local concat = table.concat
local format_categories -- Defined below.
local ipairs = ipairs
local require = require
local type = type
local unstrip = mw.text.unstrip

--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
	-- Lazy loader for `decode_entities`. The first call pulls the real function out of the
	-- string utilities module, rebinds this local to it, and forwards the arguments; every
	-- later call goes straight to the real function.
	local decode_entities
	decode_entities = function(...)
		local target = require(string_utilities_module).decode_entities
		decode_entities = target
		return target(...)
	end
	
	-- Lazy loader for `get_script`. The first call fetches `getByCode` from the scripts
	-- module, rebinds this local to it, and forwards the arguments; later calls hit the
	-- real function directly.
	local get_script
	get_script = function(...)
		local target = require(scripts_module).getByCode
		get_script = target
		return target(...)
	end
	
	-- Lazy loader for `is_content_page`. The first call fetches the function from the pages
	-- module, rebinds this local to it, and forwards the arguments; later calls hit the
	-- real function directly.
	local is_content_page
	is_content_page = function(...)
		local target = require(pages_module).is_content_page
		is_content_page = target
		return target(...)
	end
	
	-- Lazy loader for `load_data`. The first call fetches the function from the load module,
	-- rebinds this local to it, and forwards the arguments; later calls hit the real
	-- function directly.
	local load_data
	load_data = function(...)
		local target = require(load_module).load_data
		load_data = target
		return target(...)
	end
	
	-- Lazy loader for `remove_links`. The first call fetches the function from the links
	-- module, rebinds this local to it, and forwards the arguments; later calls hit the
	-- real function directly.
	local remove_links
	remove_links = function(...)
		local target = require(links_module).remove_links
		remove_links = target
		return target(...)
	end
	
	-- Lazy loader for `tag_text`. The first call fetches the function from the script
	-- utilities module, rebinds this local to it, and forwards the arguments; later calls
	-- hit the real function directly.
	local tag_text
	tag_text = function(...)
		local target = require(script_utilities_module).tag_text
		tag_text = target
		return target(...)
	end
	
	-- Lazy loader for `trim`. The first call fetches the function from the string utilities
	-- module, rebinds this local to it, and forwards the arguments; later calls hit the
	-- real function directly.
	local trim
	trim = function(...)
		local target = require(string_utilities_module).trim
		trim = target
		return target(...)
	end
	
	-- Lazy loader for `uupper`. The first call fetches `upper` from the string utilities
	-- module, rebinds this local to it, and forwards the arguments; later calls hit the
	-- real function directly.
	local uupper
	uupper = function(...)
		local target = require(string_utilities_module).upper
		uupper = target
		return target(...)
	end

--[==[
Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]
	-- Cache slot for the `catfix_scripts` table; nil until first requested.
	local catfix_scripts
	-- One-shot getter, used as `catfix_scripts or get_catfix_scripts()`. It reads the
	-- `catfix_scripts` field from the utilities data module, stores it in the cache slot,
	-- clears itself, and returns the loaded value.
	local function get_catfix_scripts()
		local loaded = load_data(utilities_data_module).catfix_scripts
		catfix_scripts = loaded
		get_catfix_scripts = nil
		return loaded
	end
	
	-- Cache slot for the current page's title object; nil until first requested.
	local current_title
	-- One-shot getter, used as `current_title or get_current_title()`. It stores the result
	-- of mw.title.getCurrentTitle() in the cache slot, clears itself, and returns the value.
	local function get_current_title()
		local title = mw.title.getCurrentTitle()
		current_title = title
		get_current_title = nil
		return title
	end
	
	-- Cache slot for the page's default sort key; nil until first requested.
	local defaultsort
	-- One-shot getter, used as `defaultsort or get_defaultsort()`. It reads
	-- `page.pagename_defaultsort` from the headword data module, stores it in the cache
	-- slot, clears itself, and returns the value.
	local function get_defaultsort()
		local loaded = load_data(headword_data_module).page.pagename_defaultsort
		defaultsort = loaded
		get_defaultsort = nil
		return loaded
	end
	
	-- Cache slot for the encoded page name; nil until first requested.
	local pagename
	-- One-shot getter, used as `pagename or get_pagename()`. It reads
	-- `page.encoded_pagename` from the headword data module, stores it in the cache slot,
	-- clears itself, and returns the value.
	local function get_pagename()
		local loaded = load_data(headword_data_module).page.encoded_pagename
		pagename = loaded
		get_pagename = nil
		return loaded
	end

	-- Cache slot for the language object with code "und"; nil until first requested.
	local und
	-- One-shot getter, used as `und or get_und()`. It asks the languages module for the
	-- "und" language object, stores it in the cache slot, clears itself, and returns the
	-- object.
	local function get_und()
		local lang_obj = require(languages_module).getByCode("und")
		und = lang_obj
		get_und = nil
		return lang_obj
	end


do
	-- gsub callback for single-bracket external links. `capture` is the text between the
	-- brackets; the result is whatever follows the URL and one whitespace character, or
	-- the empty string when the link carries no label.
	local function handle_url(capture)
		local label = capture:match("https?://[^%s%]]+%s([^%]]+)")
		if label then
			return label
		end
		return ""
	end
	
	--[==[
	Strips wiki markup from `text` and returns the plaintext of what is displayed on the page.]==]
	function export.get_plaintext(text)
		-- Stand in single control characters for double brackets, so that the patterns
		-- below can tell internal-link delimiters apart from lone brackets.
		text = text:gsub("%[%[", "\1")
		text = text:gsub("%]%]", "\2")

		-- Drop strip markers, then anything that looks like an HTML tag.
		text = unstrip(text)
		text = text:gsub("<[^<>\1\2]+>", "")

		-- Reduce internal links to their display text; this also removes categories.
		text = remove_links(text)

		-- Discard file and image links (namespace prefix matched case-insensitively).
		text = text:gsub("\1[Ff][Ii][Ll][Ee]:[^\1\2]+\2", "")
		text = text:gsub("\1[Ii][Mm][Aa][Gg][Ee]:[^\1\2]+\2", "")

		-- Reduce external links to their display text.
		text = text:gsub("%[(https?://[^%[%]]+)%]", handle_url)

		-- Whatever brackets remain are not part of a link; escape them as entities so they
		-- cannot form new links. The placeholders are restored as escaped double brackets.
		text = text:gsub("\1", "&#91;&#91;")
		text = text:gsub("\2", "&#93;&#93;")
		text = text:gsub("%[", "&#91;")
		text = text:gsub("]", "&#93;")

		-- Remove bold and italic apostrophe markup, then soft hyphens.
		text = text:gsub("('*)'''(.-'*)'''", "%1%2")
		text = text:gsub("('*)''(.-'*)''", "%1%2")
		text = text:gsub("­", "")
		
		-- Resolve HTML entities and trim the result.
		-- Note: URL percent encoding is deliberately left alone, as it shouldn't be used in display text and may cause problems if % is used.
		return trim(decode_entities(text))
	end
end

do
	-- Work out the sort key for one category.
	-- Returns the sort key to use and, when a manual sort key was given for a language other
	-- than "und", a second value: a tracking category link that records whether the manual
	-- key is redundant (equal to the automatic one once uppercased) or not.
	local function generate_sortkey(lang, sc, sort_key, sort_base)
		local manual = sort_key
		-- A sort key of "-" means "treat the language as undetermined and use the automatic
		-- key". This is desirable when categorising (e.g.) translation requests, as the pages
		-- to be categorised are always in English/Translingual. A missing language also falls
		-- back to "und".
		if manual == "-" or not lang then
			lang = und or get_und()
		end
		if manual == "-" then
			manual = nil
		end

		-- Automatic key from the sort base (or the page name), falling back to the page's
		-- defaultsort when the language yields nothing.
		local base = sort_base or pagename or get_pagename()
		local automatic = lang:makeSortKey(base, sc)
		if not automatic or automatic == "" then
			automatic = defaultsort or get_defaultsort()
		end

		-- No manual sort key specified: the automatic one is the answer.
		if not manual or manual == "" then
			return automatic
		end

		-- FIXME: we should track manual keys for "und" as well, but "Undetermined terms..."
		-- does not make sense for translations etc.
		if lang:getCode() == "und" then
			return manual
		end

		local full_name = lang:getFullName()
		local status
		if uupper(manual) == automatic then
			status = "redundant"
		else
			status = "non-redundant non-automated"
		end
		return manual, "[[Category:" .. full_name .. " terms with " .. status .. " sortkeys|" .. manual .. "]]"
	end

	--[==[
	Format the categories with the appropriate sort key.
	* `cat` can take two forms:
	  ** A string (the full category, minus the {"Category:"} prefix);
	  ** A list of categories. Each category in the list can be either:
	  *** A string in the same format as above;
	  *** An object with the fields:
	  **** `cat`: a string in the same format as above (required);
	  **** `lang`: an optional language object to override the overall `lang`;
	  **** `sc`: an optional script object to override the overall `sc`.
	  **** `sort_key`: an optional sort key to override the overall `sort_key`;
	  **** `sort_base`: an optional sort base to override the overall `sort_base`;
	* `lang` is an object encapsulating a language; if {nil}, the object for language code {"und"} (undetermined) will
	  be used. `lang` is used when computing the sort key (either from the subpage name or sort base).
	* `sort_key` is placed in the category invocation, and indicates how the page will sort in the respective category.
	  Normally '''do not use this'''. Instead, leave it {nil}, and if you need to control the sort order, use
	  {sort_base}, so that language-specific normalization is applied on top of the specified sort base. If neither
	  {sort_key} nor {sort_base} is specified, the default is to apply language-specific normalization to the subpage
	  name; see below.
	* `sort_base` lets you override the default sort key while still maintaining appropriate language-specific
	  normalization. If {nil} is specified, this defaults to the subpage name, which is the portion of the full pagename
	  after subtracting the namespace prefix (and, in certain namespaces such as {User:}, but notably not in the
	  mainspace, after subtracting anything up through the final slash). The actual sort key is derived from the sort
	  base approximately by lowercasing, applying language-specific normalization and then uppercasing; note that the
	  same process is applied in deriving the sort key when no sort base is specified. For example, for French, Spanish,
	  etc. the normalization process maps accented letters to their unaccented equivalents, so that e.g. in French,
	  {{m|fr|ça}} sorts after {{m|fr|ca}} (instead of after the default Wikimedia sort order, which is approximately
	  based on Unicode sort order and places ç after z) and {{m|fr|côté}} sorts after {{m|fr|coté}} (instead of between
	  c and d). Similarly, in Russian the normalization process converts Cyrillic ё to a string consisting of Cyrillic е
	  followed by U+10FFFF, so that effectively ё sorts after е instead of the default Wikimedia sort, which (I think)
	  puts ё after я, the last letter of the Cyrillic alphabet.
	* `force_output` forces normal output in all namespaces. Normally, nothing is output if the page isn't in the main,
	  Appendix:, Thesaurus:, Reconstruction: or Citations: namespaces.
	* `sc` is a script object; if nil, the default will be derived from the sort base (or its default value, the
	  subpage name) by calling {lang:findBestScript()}. The value of `sc` is used during the sort base normalization
	  process; for example, languages with multiple scripts will often have script-specific normalization processes.]==]
	function export.format_categories(cat, lang, sort_key, sort_base, force_output, sc)
		-- Output is produced only when forced, on content pages, or on the sandbox page.
		local allowed = force_output
			or is_content_page(current_title or get_current_title())
			or current_title.prefixedText == "Wiktionary:Sandbox"
		if not allowed then
			return ""
		end

		-- Single category given as a string.
		if type(cat) == "string" then
			local key, tracking = generate_sortkey(lang, sc, sort_key, sort_base)
			return "[[Category:" .. cat .. "|" .. key .. "]]" .. (tracking or "")
		end

		-- List of categories: walk it until the first nil entry.
		local links, shared_key = {}, nil
		local index = 1
		local entry = cat[index]
		while entry ~= nil do
			local name, key, tracking
			if type(entry) == "table" then
				-- Per-category options override the overall ones.
				name = entry.cat
				key, tracking = generate_sortkey(
					entry.lang or lang,
					entry.sc or sc,
					entry.sort_key or sort_key,
					entry.sort_base or sort_base
				)
			elseif shared_key then
				-- Plain string entry, and the overall sort key is already known: reuse it.
				name, key = entry, shared_key
			else
				-- First plain string entry: compute the overall sort key and remember it.
				name = entry
				key, tracking = generate_sortkey(lang, sc, sort_key, sort_base)
				shared_key = key
			end
			links[index] = "[[Category:" .. name .. "|" .. key .. "]]" .. (tracking or "")
			index = index + 1
			entry = cat[index]
		end
		return concat(links)
	end
	format_categories = export.format_categories
end

--[==[
Add a "catfix", which is used on language-specific category pages to add language attributes and often script
classes to all entry names. The addition of language attributes and script classes makes the entry names display
better (using the language- or script-specific styles specified in [[MediaWiki:Common.css]]), which is particularly
important for non-English languages that do not have consistent font support in browsers.

Language attributes are added for all languages, but script classes are only added for languages with one script
listed in their data file, or for languages that have a default script listed in the {catfix_scripts} list in
[[Module:utilities/data]]. Some languages clearly have a default script, but still have other scripts listed in
their data file. If those other scripts are not simply scripts like {{cd|Brai}} (Braille), their default script
needs to be specified. Others do not have a default script.

* Serbo-Croatian is regularly written in both the Latin and Cyrillic scripts. Because it uses two scripts,
  Serbo-Croatian cannot have a script class applied to entries in its category pages, as only one script class
  can be specified at a time.
* German is usually written in the Latin script ({{cd|Latn}}), but Fraktur ({{cd|Latf}}) is also listed in
  its data file. So German needs an entry in the {catfix_scripts} list, so that the {{cd|Latn}} (Latin) script
  class will be applied to entries in its category pages.

To find the scripts listed for a language, go to [[Module:languages]] and use the search box to find the data file
for the language. To find out what a script code means, search the script code in [[Module:scripts/data]].]==]
function export.catfix(lang, sc)
	-- Validate the arguments by checking for the methods the objects are expected to have.
	if not (lang and lang.getCanonicalName) then
		error('The first argument to the function "catfix" should be a language object from [[Module:languages]] or [[Module:etymology languages]].')
	end
	if sc and not sc.getCode then
		error('The second argument to the function "catfix" should be a script object from [[Module:scripts]].')
	end

	-- No script given: look for a default script registered for the language code, then for
	-- the full language code. This adds script classes to links on pages created by category
	-- boilerplate templates.
	if not sc then
		local defaults = catfix_scripts or get_catfix_scripts()
		local code = defaults[lang:getCode()] or defaults[lang:getFullCode()]
		if code then
			sc = get_script(code)
		end
	end

	-- Still nothing: use the language's only script, if it has exactly one.
	if not sc then
		local scripts = lang:getScripts()
		if #scripts == 1 then
			sc = scripts[1]
		else
			-- Otherwise ignore Brai and accept the remaining script only if it is unique.
			local candidate, ambiguous
			for _, script in ipairs(scripts) do
				if script:getCode() ~= "Brai" then
					if candidate then
						-- A second candidate turned up, so there is no default.
						ambiguous = true
						break
					end
					candidate = script
				end
			end
			if not ambiguous then
				sc = candidate
			end
		end
	end

	-- Hack: using a <ul> tag prevents the parser from automatically generating a <p> tag around the catfix element.
	local opening = "<ul class=\"catfix\" data-anchor=\"" .. anchor_encode(lang:getFullName()) .. "\">"
	local tagged = tag_text("", lang, sc)
	return opening .. tagged .. "</ul>" ..
		format_categories("Pages using catfix", nil, nil, nil, true)
end

--[==[
Given a type (as a string) and an arbitrary number of entities, checks whether all of those entities are language,
family, script, writing system or Wikimedia language objects. Useful for error handling in functions that require
one of these kinds of object.

If `noErr` is set, the function returns false instead of throwing an error, which allows customised error handling to
be done in the calling function.]==]
function export.check_object(typ, noErr, ...)
	-- `... == nil` looks at the first entity only, so this also fires when the first entity is nil.
	if ... == nil then
		if noErr then
			return false
		end
		error("Must provide at least one object to check.", 2)
	end
	-- Note: ipairs stops at the first nil, so entities after a nil are not inspected.
	for _, obj in ipairs{...} do
		-- Anything that is a table with a `hasType` function counts as an object.
		local is_object = type(obj) == "table" and type(obj.hasType) == "function"
		if not (is_object and (typ == "object" or obj:hasType(typ))) then
			-- In no-error mode the answer is already known, so return before doing the work
			-- that only serves to build the error message.
			if noErr then
				return false
			end
			local received
			if not is_object then
				received = "a " .. type(obj)
			else
				-- Try to name the kind of object that was passed instead.
				received = "another type of object"
				for _, wrong_type in ipairs{"family", "language", "script", "Wikimedia language", "writing system"} do
					if obj:hasType(wrong_type) then
						received = "a " .. wrong_type .. " object"
						break
					end
				end
			end
			-- Level 2 attributes the error to the function that called check_object.
			error("Function expected a " .. typ .. " object, but received " .. received .. " instead.", 2)
		end
	end
	return true
end

-- Hand the table of public functions back to whoever loads this module.
return export