Module:collation: Difference between revisions
Jump to navigation
Jump to search
(Created page with "local export = {} -- Custom functions for generating a sortkey that will achieve the desired sort -- order. -- name of module and name of exported function local custom_sort_...") |
No edit summary |
||
Line 1: | Line 1: | ||
local export = {} | local export = {} | ||
local require = require | |||
local byte = string.byte | |||
local concat = table.concat | |||
local find = string.find | |||
local get_plaintext = require("Module:utilities").get_plaintext | |||
local match = string.match | |||
local memoize = require("Module:fun").memoize | |||
local remove = table.remove | |||
local sort = table.sort | |||
local string_sort -- defined below as export.string_sort | |||
local sub = string.sub | |||
local trim = mw.text.trim | |||
local type = type | |||
-- Custom functions for generating a sortkey that will achieve the desired sort | -- Custom functions for generating a sortkey that will achieve the desired sort | ||
-- order. | -- order. | ||
-- name of module and name of exported function | -- name of module and name of exported function | ||
local | local custom_funcs = { | ||
ahk = { "Mymr-sortkey", "makeSortKey" }, | |||
aio = { "Mymr-sortkey", "makeSortKey" }, | |||
blk = { "Mymr-sortkey", "makeSortKey" }, | |||
egy = { "egy-utilities", "make_sortkey" }, | egy = { "egy-utilities", "make_sortkey" }, | ||
kac = { "Mymr-sortkey", "makeSortKey" }, | |||
kht = { "Mymr-sortkey", "makeSortKey" }, | |||
ksw = { "Mymr-sortkey", "makeSortKey" }, | |||
kyu = { "Mymr-sortkey", "makeSortKey" }, | |||
["mkh-mmn"] = { "Mymr-sortkey", "makeSortKey" }, | |||
mnw = { "Mymr-sortkey", "makeSortKey" }, | |||
my = { "Mymr-sortkey", "makeSortKey" }, | |||
phk = { "Mymr-sortkey", "makeSortKey" }, | |||
pwo = { "Mymr-sortkey", "makeSortKey" }, | |||
omx = { "Mymr-sortkey", "makeSortKey" }, | |||
shn = { "Mymr-sortkey", "makeSortKey" }, | |||
tjl = { "Mymr-sortkey", "makeSortKey" }, | |||
} | } | ||
local function is_lang_object(lang) | local function is_lang_object(lang) | ||
return type(lang) == "table" and type(lang.getCanonicalName) == "function" | return type(lang) == "table" and type(lang.getCanonicalName) == "function" | ||
end | end | ||
Line 24: | Line 46: | ||
.. ": expected function object, got " .. type(func) .. ".", 2) | .. ": expected function object, got " .. type(func) .. ".", 2) | ||
end | end | ||
return true | |||
end | end | ||
local function make_sortkey_func(lang, make_sortbase) | |||
local langcode = lang:getCode() | |||
local makeDisplayText = lang.makeDisplayText | |||
local | local custom_func = custom_funcs[langcode] | ||
local makeSortKey | |||
if custom_func then | |||
local _makeSortKey = require("Module:" .. custom_func[1])[custom_func[2]] | |||
local | function makeSortKey(_, text) | ||
return _makeSortKey(text, langcode) | |||
local | |||
end | end | ||
else | |||
makeSortKey = lang.makeSortKey | |||
end | end | ||
return make_sortbase and check_function("make_sortkey_func", 2, make_sortbase) and function(element) | |||
return (makeSortKey( | |||
lang, | |||
(makeDisplayText( | |||
lang, | |||
get_plaintext(make_sortbase(element)) | |||
)) | |||
)) | |||
end or function(element) | |||
return (makeSortKey( | |||
lang, | |||
(makeDisplayText( | |||
lang, | |||
get_plaintext(element) | |||
)) | |||
)) | |||
end | end | ||
end | end | ||
-- When comparing two elements with code points outside the BMP, the less-than | |||
local | -- operator treats all code points above U+FFFF as equal because of a bug in | ||
-- glibc. See [[phab:T193096#4161287]]. Instead, compares bytes, which always | |||
-- yields the same result as comparing code points in valid UTF-8 strings. | |||
-- UTF-8-encoded characters that do not belong to the Basic Multilingual Plane | |||
-- (that is, with code points greater than U+FFFF) have byte sequences that | |||
-- begin with the bytes 240 to 244. | |||
do | |||
-- Memoize match because we've set the `simple` flag, which means it | |||
-- should only be used with fixed additional arguments (in this case, the | |||
-- pattern). | |||
local match = memoize(match, true) | |||
function export.string_sort(item1, item2) | |||
if match(item1, "^[^\240-\244]*$") and match(item2, "^[^\240-\244]*$") then | |||
return item1 < item2 | |||
return | |||
end | end | ||
local i = 0 | |||
while true do | |||
return | i = i + 1 | ||
local b1, b2 = byte(item1, i, i), byte(item2, i, i) | |||
if not b1 then | |||
return b2 and true or false | |||
elseif b1 ~= b2 then | |||
return b2 and b1 < b2 or false | |||
end | |||
end | end | ||
end | end | ||
string_sort = export.string_sort | |||
end | end | ||
function export.sort(elems, lang, | function export.sort(elems, lang, make_sortbase) | ||
if not is_lang_object(lang) then | |||
return sort(elems) | |||
end | end | ||
return | local make_sortkey = memoize(make_sortkey_func(lang, make_sortbase), true) | ||
return sort(elems, function(elem1, elem2) | |||
return string_sort(make_sortkey(elem1), make_sortkey(elem2)) | |||
end) | |||
end | end | ||
Line 124: | Line 138: | ||
end | end | ||
local | local m_table = require("Module:table") | ||
local elems = m_table.shallowcopy(args) | |||
local m_languages = require("Module:languages") | local m_languages = require("Module:languages") | ||
local lang | local lang | ||
if args.lang then | if args.lang then | ||
lang = m_languages.getByCode(args.lang) or m_languages.err(args.lang, | lang = m_languages.getByCode(args.lang) or m_languages.err(args.lang, "lang") | ||
else | else | ||
local code = | local code = remove(elems, 1) | ||
code = code and | code = code and trim(code) | ||
lang = m_languages.getByCode(code) or m_languages.err(code, 1) | lang = m_languages.getByCode(code) or m_languages.err(code, 1) | ||
end | end | ||
local i = 1 | |||
while true do | |||
local elem = elems[i] | |||
while elem do | |||
elem = trim(elem, "%s") | |||
if elem ~= "" then | |||
break | |||
end | |||
remove(elems, i) | |||
elem = elems[i] | |||
end | |||
if not elem then | |||
break | |||
elseif not ( -- Strip redundant wikilinks. | |||
not match(elem, "^()%[%[") or | |||
find(elem, "[[", 3, true) or | |||
find(elem, "]]", 3, true) ~= #elem - 1 or | |||
find(elem, "|", 3, true) | |||
) then | |||
elem = sub(elem, 3, -3) | |||
elem = trim(elem, "%s") | |||
end | |||
elems[i] = elem .. "\n" | |||
i = i + 1 | |||
end | |||
elems = m_table.removeDuplicates(elems) | |||
export.sort(elems, lang) | export.sort(elems, lang) | ||
return | |||
return concat(elems, args.sep or "|") | |||
end | end | ||
return export | return export |
Latest revision as of 18:22, 5 August 2024
- The following documentation is located at Module:collation/doc.[edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
local export = {}
local require = require
local byte = string.byte
local concat = table.concat
local find = string.find
local get_plaintext = require("Module:utilities").get_plaintext
local match = string.match
local memoize = require("Module:fun").memoize
local remove = table.remove
local sort = table.sort
local string_sort -- defined below as export.string_sort
local sub = string.sub
local trim = mw.text.trim
local type = type
-- Languages whose sortkeys are generated by a dedicated module function
-- instead of the language object's own makeSortKey method.
-- Each entry maps a language code to a pair:
--   { module name (without the "Module:" prefix), name of the exported function }
local custom_funcs = {
	egy = { "egy-utilities", "make_sortkey" },
}
-- Every code listed here is handled by the same sortkey function.
for _, code in ipairs {
	"ahk", "aio", "blk", "kac", "kht", "ksw", "kyu", "mkh-mmn",
	"mnw", "my", "phk", "pwo", "omx", "shn", "tjl",
} do
	custom_funcs[code] = { "Mymr-sortkey", "makeSortKey" }
end
--- Duck-type check for a language object.
-- @param lang any value
-- @return true if `lang` is a table whose `getCanonicalName` field is a
--   function, false otherwise
local function is_lang_object(lang)
	if type(lang) ~= "table" then
		return false
	end
	return type(lang.getCanonicalName) == "function"
end
--- Argument validator: ensure `func` is a function.
-- @param funcName name of the function whose argument is being checked (used
--   in the error message only)
-- @param argIdx position of the argument (used in the error message only)
-- @param func the value to validate
-- @return true when `func` is a function, so the call can be chained with `and`
-- Raises an error at level 2 (i.e. attributed to the caller) otherwise.
local function check_function(funcName, argIdx, func)
	local actual_type = type(func)
	if actual_type == "function" then
		return true
	end
	error("bad argument #" .. argIdx .. " to " .. funcName
		.. ": expected function object, got " .. actual_type .. ".", 2)
end
--- Build a function that converts an element into its sortkey for `lang`.
-- The returned function passes its input through `get_plaintext`, then the
-- language's `makeDisplayText`, then a sortkey generator: the entry from
-- `custom_funcs` for this language code if one exists, otherwise the
-- language's own `makeSortKey`. Only the first return value of each step is
-- kept.
-- @param lang language object (must provide getCode, makeDisplayText and,
--   when no custom function is registered, makeSortKey)
-- @param make_sortbase optional function applied to each element first, to
--   extract the text to be sorted on; a truthy non-function raises an error
-- @return function(element) returning the sortkey
local function make_sortkey_func(lang, make_sortbase)
	local code = lang:getCode()
	local to_display = lang.makeDisplayText
	local to_sortkey
	local custom = custom_funcs[code]
	if custom then
		local custom_sortkey = require("Module:" .. custom[1])[custom[2]]
		-- Adapt the custom (text, langcode) signature to the method-style
		-- (lang, text) signature used below.
		to_sortkey = function(_, text)
			return custom_sortkey(text, code)
		end
	else
		to_sortkey = lang.makeSortKey
	end

	if not make_sortbase then
		return function(element)
			local display = (to_display(lang, get_plaintext(element)))
			return (to_sortkey(lang, display))
		end
	end

	check_function("make_sortkey_func", 2, make_sortbase)
	return function(element)
		local display = (to_display(lang, get_plaintext(make_sortbase(element))))
		return (to_sortkey(lang, display))
	end
end
-- Comparator for strings that works around a glibc bug: when code points
-- outside the BMP are involved, the less-than operator treats all code points
-- above U+FFFF as equal. See [[phab:T193096#4161287]]. In that case the
-- strings are compared byte by byte instead, which always yields the same
-- result as comparing code points in valid UTF-8 strings.
-- UTF-8-encoded characters outside the Basic Multilingual Plane (code points
-- greater than U+FFFF) have byte sequences that begin with the bytes 240 to
-- 244, so a string containing none of those bytes is safe to compare with
-- the plain operator.
do
	-- Matches only strings that contain no byte in the range 240-244.
	local bmp_only = "^[^\240-\244]*$"
	-- Memoized copy of `match`, created with memoize's `simple` flag set.
	-- As the original note says, a function memoized this way should only be
	-- called with fixed additional arguments; here that is the pattern, which
	-- is always the same.
	local match = memoize(match, true)

	function export.string_sort(item1, item2)
		if match(item1, bmp_only) and match(item2, bmp_only) then
			return item1 < item2
		end
		-- Advance to the first position where the bytes differ or where
		-- item1 runs out.
		local pos = 1
		local b1, b2 = byte(item1, pos), byte(item2, pos)
		while b1 and b1 == b2 do
			pos = pos + 1
			b1, b2 = byte(item1, pos), byte(item2, pos)
		end
		if not b1 then
			-- item1 is exhausted: it sorts first only if item2 is longer.
			return b2 ~= nil
		end
		if not b2 then
			-- item2 is a proper prefix of item1.
			return false
		end
		return b1 < b2
	end

	string_sort = export.string_sort
end
--- Sort `elems` in place.
-- If `lang` is a language object, elements are ordered by their sortkeys for
-- that language (computed via make_sortkey_func and memoized), compared with
-- string_sort. Otherwise the table is sorted with the default comparison and
-- `make_sortbase` is not used.
-- @param elems table to sort
-- @param lang language object, or any other value for a plain sort
-- @param make_sortbase optional function passed on to make_sortkey_func
function export.sort(elems, lang, make_sortbase)
	if is_lang_object(lang) then
		local key_of = memoize(make_sortkey_func(lang, make_sortbase), true)
		local function by_sortkey(a, b)
			return string_sort(key_of(a), key_of(b))
		end
		return sort(elems, by_sortkey)
	end
	return sort(elems)
end
--- Template entry point: sort the given arguments and return them as one
-- joined string. Raises an error unless the template is being substed.
-- Arguments are read from the parent frame when `frame.args.parent` is set,
-- otherwise from `frame.args`. The language comes from the `lang` argument
-- if present, otherwise from the first positional argument. The separator is
-- the `sep` argument, defaulting to "|".
-- @param frame the frame object passed in by the invocation
-- @return the sorted elements, each followed by a newline, joined by the
--   separator
function export.sort_template(frame)
	if not mw.isSubsting() then
		error("This template must be substed.")
	end

	local args = frame.args
	if args.parent then
		args = frame:getParent().args
	end

	local m_table = require("Module:table")
	local elems = m_table.shallowcopy(args)
	local m_languages = require("Module:languages")

	local lang
	if args.lang then
		lang = m_languages.getByCode(args.lang) or m_languages.err(args.lang, "lang")
	else
		-- No explicit language: the first positional element is the code.
		local code = remove(elems, 1)
		if code then
			code = trim(code)
		end
		lang = m_languages.getByCode(code) or m_languages.err(code, 1)
	end

	-- Clean the elements in place: trim each one, drop blank ones, unwrap
	-- redundant wikilinks, and append a newline.
	local idx = 1
	local item = elems[idx]
	while item do
		item = trim(item, "%s")
		if item == "" then
			-- Removing shifts the next element into this slot, so idx is
			-- not advanced.
			remove(elems, idx)
		else
			-- A wikilink is redundant when the whole element is one plain
			-- [[link]]: it opens with "[[", has no further "[[", its first
			-- "]]" is at the very end, and it has no "|".
			local is_plain_link = match(item, "^()%[%[")
				and not find(item, "[[", 3, true)
				and find(item, "]]", 3, true) == #item - 1
				and not find(item, "|", 3, true)
			if is_plain_link then
				item = trim(sub(item, 3, -3), "%s")
			end
			elems[idx] = item .. "\n"
			idx = idx + 1
		end
		item = elems[idx]
	end

	elems = m_table.removeDuplicates(elems)
	export.sort(elems, lang)
	return concat(elems, args.sep or "|")
end
return export