48,355
edits
No edit summary |
No edit summary |
||
| (2 intermediate revisions by the same user not shown) | |||
| Line 1: | Line 1: | ||
local export = {} | |||
local headword_data_module = "Module:headword/data" | |||
local languages_module = "Module:languages" | |||
local links_module = "Module:links" | |||
local load_module = "Module:load" | |||
local pages_module = "Module:pages" | |||
local script_utilities_module = "Module:script utilities" | |||
local scripts_module = "Module:scripts" | |||
local string_utilities_module = "Module:string utilities" | |||
local utilities_data_module = "Module:utilities/data" | |||
local mw = mw | local mw = mw | ||
local | local anchor_encode = mw.uri.anchorEncode | ||
local concat = table.concat | local concat = table.concat | ||
local | local format_categories -- Defined below. | ||
local ipairs = ipairs | local ipairs = ipairs | ||
local | local require = require | ||
local type = type | local type = type | ||
local unstrip = | local unstrip = mw.text.unstrip | ||
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
-- Lazy loader: on first call, rebinds itself to `decode_entities` from the string utilities
-- module, then forwards the arguments to it.
local function decode_entities(...)
	local target = require(string_utilities_module).decode_entities
	decode_entities = target
	return target(...)
end
-- Lazy loader: on first call, rebinds itself to `getByCode` from the scripts module, then
-- forwards the arguments to it.
local function get_script(...)
	local target = require(scripts_module).getByCode
	get_script = target
	return target(...)
end
-- Lazy loader: on first call, rebinds itself to `is_content_page` from the pages module, then
-- forwards the arguments to it.
local function is_content_page(...)
	local target = require(pages_module).is_content_page
	is_content_page = target
	return target(...)
end
-- Lazy loader: on first call, rebinds itself to `load_data` from the load module, then forwards
-- the arguments to it.
local function load_data(...)
	local target = require(load_module).load_data
	load_data = target
	return target(...)
end
-- Lazy loader: on first call, rebinds itself to `remove_links` from the links module, then
-- forwards the arguments to it.
local function remove_links(...)
	local target = require(links_module).remove_links
	remove_links = target
	return target(...)
end
-- Lazy loader: on first call, rebinds itself to `tag_text` from the script utilities module,
-- then forwards the arguments to it.
local function tag_text(...)
	local target = require(script_utilities_module).tag_text
	tag_text = target
	return target(...)
end
-- Lazy loader: on first call, rebinds itself to `trim` from the string utilities module, then
-- forwards the arguments to it.
local function trim(...)
	local target = require(string_utilities_module).trim
	trim = target
	return target(...)
end
-- Lazy loader: on first call, rebinds itself to `upper` from the string utilities module, then
-- forwards the arguments to it.
local function uupper(...)
	local target = require(string_utilities_module).upper
	uupper = target
	return target(...)
end
--[==[
Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]
-- Cache for the `catfix_scripts` field of the utilities data module; nil until first loaded.
local catfix_scripts
-- One-shot loader: fills `catfix_scripts`, clears its own name so it cannot be called again,
-- and returns the loaded value. Intended to be used as `catfix_scripts or get_catfix_scripts()`.
local function get_catfix_scripts()
	local loaded = load_data(utilities_data_module).catfix_scripts
	catfix_scripts = loaded
	get_catfix_scripts = nil
	return loaded
end
-- Cache for the result of mw.title.getCurrentTitle(); nil until first loaded.
local current_title
-- One-shot loader: fills `current_title`, clears its own name so it cannot be called again,
-- and returns the loaded value. Intended to be used as `current_title or get_current_title()`.
local function get_current_title()
	local loaded = mw.title.getCurrentTitle()
	current_title = loaded
	get_current_title = nil
	return loaded
end
-- Cache for `page.pagename_defaultsort` from the headword data module; nil until first loaded.
local defaultsort
-- One-shot loader: fills `defaultsort`, clears its own name so it cannot be called again, and
-- returns the loaded value. Intended to be used as `defaultsort or get_defaultsort()`.
local function get_defaultsort()
	local loaded = load_data(headword_data_module).page.pagename_defaultsort
	defaultsort = loaded
	get_defaultsort = nil
	return loaded
end
-- Cache for `page.encoded_pagename` from the headword data module; nil until first loaded.
local pagename
-- One-shot loader: fills `pagename`, clears its own name so it cannot be called again, and
-- returns the loaded value. Intended to be used as `pagename or get_pagename()`.
local function get_pagename()
	local loaded = load_data(headword_data_module).page.encoded_pagename
	pagename = loaded
	get_pagename = nil
	return loaded
end
-- Cache for the language object with code "und"; nil until first loaded.
local und
-- One-shot loader: fills `und` via getByCode("und") from the languages module, clears its own
-- name so it cannot be called again, and returns the loaded object. Intended to be used as
-- `und or get_und()`.
local function get_und()
	local loaded = require(languages_module).getByCode("und")
	und = loaded
	get_und = nil
	return loaded
end
do
	-- gsub callback for bracketed external links. `capture` is the text between the single square
	-- brackets (the URL, optionally followed by whitespace and a label). Returns the label, or the
	-- empty string when the link has no label.
	local function handle_url(capture)
		return capture:match("https?://[^%s%]]+%s([^%]]+)") or ""
	end

	--[==[
	A helper function to strip wiki markup, giving the plaintext of what is displayed on the page.

	* `text` is a string of wikitext.

	Returns the string with strip markers, HTML tags, internal and external link syntax, file/image
	links, bold/italic apostrophe markup and soft hyphens removed, HTML entities decoded, and the
	result trimmed.]==]
	function export.get_plaintext(text)
		-- Stand in the control characters \1 and \2 for "[[" and "]]", so that the single-character
		-- patterns below can refer to (or exclude) double brackets.
		text = text
			:gsub("%[%[", "\1")
			:gsub("%]%]", "\2")
		-- Remove strip markers and HTML tags.
		text = unstrip(text):gsub("<[^<>\1\2]+>", "")
		-- Parse internal links for the display text, and remove categories.
		text = remove_links(text)
		-- Remove files.
		text = text:gsub("\1[Ff][Ii][Ll][Ee]:[^\1\2]+\2", "")
			:gsub("\1[Ii][Mm][Aa][Gg][Ee]:[^\1\2]+\2", "")
		-- Parse external links for the display text.
		text = text:gsub("%[(https?://[^%[%]]+)%]", handle_url)
			-- Any remaining square brackets aren't involved in links, but must be escaped to avoid
			-- creating new links. The entities are written out explicitly: replacing a bracket with
			-- a literal bracket would be a no-op.
			-- NOTE(review): presumably decode_entities() below turns these back into brackets in
			-- the returned plaintext - confirm.
			:gsub("\1", "&#91;&#91;")
			:gsub("\2", "&#93;&#93;")
			:gsub("%[", "&#91;")
			:gsub("%]", "&#93;")
			-- Strip bold, italics and soft hyphens.
			:gsub("('*)'''(.-'*)'''", "%1%2")
			:gsub("('*)''(.-'*)''", "%1%2")
			-- U+00AD SOFT HYPHEN, spelled as its UTF-8 bytes so the invisible character cannot be
			-- lost in editing (an empty pattern here would silently strip nothing).
			:gsub("\194\173", "")
		-- Get any HTML entities and trim.
		-- Note: don't decode URL percent encoding, as it shouldn't be used in display text and may
		-- cause problems if % is used.
		return trim(decode_entities(text))
	end
end
do
	-- Work out the sort key for a single category.
	--
	-- Returns the sort key to use and, when a manual sort key was supplied for a language other
	-- than "und", a second value: a tracking-category link recording whether the manual key is
	-- redundant (equal to the automatic one once uppercased) or not.
	local function generate_sortkey(lang, sc, sort_key, sort_base)
		-- If the sort key is "-", treat the language as undetermined (the default). This is
		-- desirable when categorising (e.g.) translation requests, as the pages to be categorised
		-- are always in English/Translingual.
		local use_undetermined = sort_key == "-"
		if use_undetermined then
			sort_key = nil
		end
		if use_undetermined or not lang then
			lang = und or get_und()
		end
		-- Generate the automatic sort key, falling back on the page defaultsort if necessary.
		local auto_key = lang:makeSortKey(sort_base or pagename or get_pagename(), sc)
		if not auto_key or auto_key == "" then
			auto_key = defaultsort or get_defaultsort()
		end
		-- If no sort key was specified, use the automatic one.
		if not sort_key or sort_key == "" then
			return auto_key
		end
		-- FIXME: we should categorize manual sort keys for "und" as well, but "Undetermined
		-- terms..." does not make sense for translations etc.
		if lang:getCode() == "und" then
			return sort_key
		end
		-- Categorize the manual sort key as either redundant or non-redundant.
		local lang_name = lang:getFullName()
		local redundancy = uupper(sort_key) == auto_key and "redundant" or "non-redundant non-automated"
		return sort_key, "[[Category:" .. lang_name .. " terms with " .. redundancy ..
			" sortkeys|" .. sort_key .. "]]"
	end

	--[==[
	Format the categories with the appropriate sort key.
	* `cat` can take two forms:
	** A string (the full category, minus the {"Category:"} prefix);
	** A list of categories. Each category in the list can be either:
	*** A string in the same format as above;
	*** An object with the fields:
	**** `cat`: a string in the same format as above (required);
	**** `lang`: an optional language object to override the overall `lang`;
	**** `sc`: an optional script object to override the overall `sc`;
	**** `sort_key`: an optional sort key to override the overall `sort_key`;
	**** `sort_base`: an optional sort base to override the overall `sort_base`.
	* `lang` is an object encapsulating a language; if {nil}, the object for language code {"und"} (undetermined) will
	  be used. `lang` is used when computing the sort key (either from the subpage name or sort base).
	* `sort_key` is a manual sort key. If {nil} or empty, the automatic sort key is used. If {"-"}, the language is
	  treated as undetermined and the automatic sort key is used. Otherwise it is used as given, and for languages
	  other than {"und"} an extra category recording whether it is redundant is appended.
	* `sort_base` is the text from which the automatic sort key is made, in place of the page name.
	* `force_output`, if set, generates output even when the current page is neither a content page nor
	  [[Wiktionary:Sandbox]]; otherwise an empty string is returned on such pages.
	* `sc` is a script object; if nil, the default will be derived from the sort base (or its default value, the
	  subpage name) by calling {lang:findBestScript()}. The value of `sc` is used during the sort base normalization
	  process; for example, languages with multiple scripts will often have script-specific normalization processes.

	Plain-string categories in a list share one sort key, computed once; any extra category is emitted only with
	the first of them.]==]
	function export.format_categories(cat, lang, sort_key, sort_base, force_output, sc)
		local output_allowed = force_output
			or is_content_page(current_title or get_current_title())
			or current_title.prefixedText == "Wiktionary:Sandbox"
		if not output_allowed then
			return ""
		end
		-- A single category given as a string.
		if type(cat) == "string" then
			local key, extra = generate_sortkey(lang, sc, sort_key, sort_base)
			return "[[Category:" .. cat .. "|" .. key .. "]]" .. (extra or "")
		end
		-- Otherwise walk the list until the first nil entry.
		local output, default_key = {}
		local index = 1
		local entry = cat[index]
		while entry ~= nil do
			local name, key, extra
			if type(entry) == "table" then
				-- Table entries may override any of the overall options.
				name = entry.cat
				key, extra = generate_sortkey(
					entry.lang or lang,
					entry.sc or sc,
					entry.sort_key or sort_key,
					entry.sort_base or sort_base
				)
			else
				name = entry
				if default_key then
					-- The default has already been determined, so reuse it.
					key = default_key
				else
					-- Otherwise, calculate the default and remember it.
					key, extra = generate_sortkey(lang, sc, sort_key, sort_base)
					default_key = key
				end
			end
			output[#output + 1] = "[[Category:" .. name .. "|" .. key .. "]]" .. (extra or "")
			index = index + 1
			entry = cat[index]
		end
		return concat(output)
	end
	format_categories = export.format_categories
end
--[==[
Add a "catfix", which is used on language-specific category pages to add language attributes and often script
classes to all entry names. The addition of language attributes and script classes makes the entry names display
better (using the language- or script-specific styles specified in [[MediaWiki:Common.css]]), which is particularly
important for non-English languages that do not have consistent font support in browsers.

Language attributes are added for all languages, but script classes are only added for languages with one script
listed in their data file (ignoring {{cd|Brai}}), or for languages that have a default script listed in the
{catfix_scripts} list in [[Module:utilities/data]]. Some languages clearly have a default script, but still have
other scripts listed in their data file. If those other scripts are not simply scripts like {{cd|Brai}} (Braille),
their default script needs to be specified. Others do not have a default script.
* Serbo-Croatian is regularly written in both the Latin and Cyrillic scripts. Because it uses two scripts,
  Serbo-Croatian cannot have a script class applied to entries in its category pages, as only one script class
  can be specified at a time.
* German is usually written in the Latin script ({{cd|Latn}}), but Fraktur ({{cd|Latf}}) is also listed in
  its data file. So German needs an entry in the {catfix_scripts} list, so that the {{cd|Latn}} (Latin) script
  class will be applied to entries in its category pages.

To find the scripts listed for a language, go to [[Module:languages]] and use the search box to find the data file
for the language. To find out what a script code means, search the script code in [[Module:scripts/data]].

Throws an error if `lang` is missing or has no `getCanonicalName` field, or if `sc` is given but has no `getCode`
field.]==]
function export.catfix(lang, sc)
	if not (lang and lang.getCanonicalName) then
		error('The first argument to the function "catfix" should be a language object from [[Module:languages]] or [[Module:etymology languages]].')
	end
	if sc and not sc.getCode then
		error('The second argument to the function "catfix" should be a script object from [[Module:scripts]].')
	end
	-- To add script classes to links on pages created by category boilerplate templates: look
	-- for a default script listed under the language's code, then under its full code.
	if not sc then
		local defaults = catfix_scripts or get_catfix_scripts()
		local code = defaults[lang:getCode()] or defaults[lang:getFullCode()]
		if code then
			sc = get_script(code)
		end
	end
	-- If the language only has a single valid candidate script, apply it as the default.
	if not sc then
		local scripts = lang:getScripts()
		if #scripts == 1 then
			sc = scripts[1]
		else
			-- Count the scripts other than Brai. Exactly one candidate is accepted; a second
			-- candidate means there is no unambiguous default, so give up.
			local candidates = 0
			for _, script in ipairs(scripts) do
				if script:getCode() ~= "Brai" then
					candidates = candidates + 1
					if candidates > 1 then
						sc = nil
						break
					end
					sc = script
				end
			end
		end
	end
	-- Hack: using a <ul> tag prevents the parser from automatically generating a <p> tag around
	-- the catfix element.
	local anchor = anchor_encode(lang:getFullName())
	local tagged = tag_text("", lang, sc)
	return "<ul class=\"catfix\" data-anchor=\"" .. anchor .. "\">" .. tagged .. "</ul>"
end
| Line 407: | Line 314: | ||
If `noErr` is set, the function returns false instead of throwing an error, which allows customised error handling to
be done in the calling function.]==]
function export.check_object(typ, noErr, ...) | function export.check_object(typ, noErr, ...) | ||
if ... == nil then | |||
if noErr then | if noErr then | ||
return false | return false | ||
end | end | ||
error("Must provide at least one object to check.", 2) | |||
end | end | ||
for _, obj in ipairs{...} do | |||
for _, obj in ipairs | |||
if type(obj) ~= "table" or type(obj.hasType) ~= "function" then | if type(obj) ~= "table" or type(obj.hasType) ~= "function" then | ||
return | if noErr then | ||
return false | |||
end | |||
error("Function expected a " .. typ .. " object, but received a " .. type(obj) .. " instead.", 2) | |||
elseif not (typ == "object" or obj:hasType(typ)) then | elseif not (typ == "object" or obj:hasType(typ)) then | ||
for _, wrong_type in ipairs{"family", "language", "script", "Wikimedia language", "writing system"} do | for _, wrong_type in ipairs{"family", "language", "script", "Wikimedia language", "writing system"} do | ||
if obj:hasType(wrong_type) then | if obj:hasType(wrong_type) then | ||
return | if noErr then | ||
return false | |||
end | |||
error("Function expected a " .. typ .. " object, but received a " .. wrong_type .. " object instead.", 2) | |||
end | end | ||
end | end | ||
return | if noErr then | ||
return false | |||
end | |||
error("Function expected a " .. typ .. " object, but received another type of object instead.", 2) | |||
end | end | ||
end | end | ||