Module:headword/page: Difference between revisions

No edit summary
No edit summary
 
(12 intermediate revisions by 2 users not shown)
Line 1: Line 1:
local require = require
local export = {}
local m_str_utils = require("Module:string utilities")
 
local languages_module = "Module:languages"
local maintenance_category_module = "Module:maintenance category"
local pages_module = "Module:pages"
local string_compare_module = "Module:string/compare"
local string_decode_entities_module = "Module:string/decodeEntities"
local string_remove_comments_module = "Module:string/removeComments"
local string_utilities_module = "Module:string utilities"
local table_module = "Module:table"
local template_parser_module = "Module:template parser"


local mw = mw
local mw = mw
Line 8: Line 17:


local concat = table.concat
local concat = table.concat
local decode_entities = m_str_utils.decode_entities
local encode_entities = m_str_utils.encode_entities
local find = string.find
local find = string.find
local get_category = require("Module:maintenance category").get_category
local format = string.format
local get_etym_lang = require("Module:etymology languages").getByCanonicalName
local gsub = string.gsub
local insert = table.insert
local insert = table.insert
local ipairs = ipairs
local list_to_set = require("Module:table").listToSet
local load_data = mw.loadData
local load_data = mw.loadData
local match = string.match
local match = string.match
local new_title = mw.title.new
local new_title = mw.title.new
local remove_comments = m_str_utils.remove_comments
local pairs = pairs
local split = m_str_utils.split
local require = require
local string_sort = require("Module:collation").string_sort
local sub = string.sub
local sub = string.sub
local toNFC = ustring.toNFC
local toNFC = ustring.toNFC
local toNFD = ustring.toNFD
local toNFD = ustring.toNFD
local type = type
local type_or_class = require("Module:parser").type_or_class
local u = m_str_utils.char
local ugsub = ustring.gsub
local ugsub = ustring.gsub
local uupper = m_str_utils.upper


local langnames = load_data("Module:languages/canonical names")
local function class_else_type(...)
local etym_langnames = load_data("Module:etymology languages/canonical names")
class_else_type = require(template_parser_module).class_else_type
return class_else_type(...)
end


local export = {}
-- Lazy loader: the first call requires the module named by
-- `string_decode_entities_module`, rebinds this local to the loaded function
-- and forwards the arguments to it.
local function decode_entities(...)
	local impl = require(string_decode_entities_module)
	decode_entities = impl
	return impl(...)
end
 
-- Lazy loader: the first call fetches `encode_entities` from the module named
-- by `string_utilities_module`, rebinds this local to it and forwards the
-- arguments.
local function encode_entities(...)
	local impl = require(string_utilities_module).encode_entities
	encode_entities = impl
	return impl(...)
end
 
-- Lazy loader: the first call fetches `get_category` from the module named by
-- `maintenance_category_module`, rebinds this local to it and forwards the
-- arguments.
local function get_category(...)
	local impl = require(maintenance_category_module).get_category
	get_category = impl
	return impl(...)
end
 
-- Lazy loader: the first call fetches `getByCode` from the module named by
-- `languages_module`, rebinds this local to it and forwards the arguments.
local function get_lang(...)
	local impl = require(languages_module).getByCode
	get_lang = impl
	return impl(...)
end
 
-- Lazy loader: the first call fetches `listToSet` from the module named by
-- `table_module`, rebinds this local to it and forwards the arguments.
local function list_to_set(...)
	local impl = require(table_module).listToSet
	list_to_set = impl
	return impl(...)
end
 
-- Lazy loader: the first call fetches `parse` from the module named by
-- `template_parser_module`, rebinds this local to it and forwards the
-- arguments.
local function parse(...)
	local impl = require(template_parser_module).parse
	parse = impl
	return impl(...)
end
 
-- Lazy loader: the first call requires the module named by
-- `string_remove_comments_module`, rebinds this local to the loaded function
-- and forwards the arguments to it.
local function remove_comments(...)
	local impl = require(string_remove_comments_module)
	remove_comments = impl
	return impl(...)
end
 
-- Lazy loader: the first call fetches `physical_to_logical_pagename_if_mammoth`
-- from the module named by `pages_module`, rebinds this local to it and
-- forwards the arguments.
local function physical_to_logical_pagename_if_mammoth(...)
	local impl = require(pages_module).physical_to_logical_pagename_if_mammoth
	physical_to_logical_pagename_if_mammoth = impl
	return impl(...)
end
 
-- Lazy loader: the first call fetches `split` from the module named by
-- `string_utilities_module`, rebinds this local to it and forwards the
-- arguments.
local function split(...)
	local impl = require(string_utilities_module).split
	split = impl
	return impl(...)
end
 
-- Lazy loader: the first call requires the module named by
-- `string_compare_module`, rebinds this local to the loaded function and
-- forwards the arguments to it.
local function string_compare(...)
	local impl = require(string_compare_module)
	string_compare = impl
	return impl(...)
end


-- Convert a numeric list of characters and ranges to the equivalent Lua pattern. WARNING: This destructively modifies
local function uupper(...)
-- the contents of `ranges`.
uupper = require(string_utilities_module).upper
local function char_ranges_to_pattern(ranges)
return uupper(...)
for j, range in ipairs(ranges) do
if type(range) == "table" then
for k, char in ipairs(range) do
range[k] = u(char)
end
ranges[j] = concat(range, "-")
else
ranges[j] = u(range)
end
end
return concat(ranges)
end
end


--[==[
Loaders for objects. Each loader loads data (or some other object) into a variable, which can then be accessed as "foo or get_foo()": the function get_foo assigns the object to "foo" and then returns it. This ensures objects are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]
local langnames
-- Loads "Module:languages/canonical names" via `load_data`, stores the result
-- in `langnames`, clears this loader by setting it to nil, and returns the
-- loaded data.
local function get_langnames()
	local data = load_data("Module:languages/canonical names")
	langnames = data
	get_langnames = nil
	return data
end


-- Combining character data used when categorising unusual characters. These resolve into two patterns, used to find
-- Combining character data used when categorising unusual characters. These resolve into two patterns, used to find
-- single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character +
-- single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character +
-- diacritic(s) + character).
-- diacritic(s) + character).
local comb_chars = {
-- Charsets are in the format used by Unicode's UnicodeSet tool: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp.
single = {
 
{0x0300, 0x034E},
-- Single combining characters.
-- Exclude combining grapheme joiner.
-- Charset: [[:M:]&[:^Canonical_Combining_Class=/^Double_/:]&[:^subhead=Grapheme joiner:]&[:^Variation_Selector=Yes:]]
{0x0350, 0x035B},
-- Note: concatenating hundreds of lines at once gives an error, so () are used every 150 lines to break it up into chunks.
{0x0363, 0x036F},
local comb_chars_single =
{0x0483, 0x0489},
("\204\128-\205\142" .. -- U+0300-U+034E
{0x0591, 0x05BD},
"\205\144-\205\155" .. -- U+0350-U+035B
0x05BF,
"\205\163-\205\175" .. -- U+0363-U+036F
{0x05C1, 0x05C2},
"\210\131-\210\137" .. -- U+0483-U+0489
{0x05C4, 0x05C5},
"\214\145-\214\189" .. -- U+0591-U+05BD
0x05C7,
"\214\191" .. -- U+05BF
{0x0610, 0x061A},
"\215\129" .. -- U+05C1
{0x064B, 0x065F},
"\215\130" .. -- U+05C2
0x0670,
"\215\132" .. -- U+05C4
{0x06D6, 0x06DC},
"\215\133" .. -- U+05C5
{0x06DF, 0x06E4},
"\215\135" .. -- U+05C7
{0x06E7, 0x06E8},
"\216\144-\216\154" .. -- U+0610-U+061A
{0x06EA, 0x06ED},
"\217\139-\217\159" .. -- U+064B-U+065F
0x0711,
"\217\176" .. -- U+0670
{0x0730, 0x074A},
"\219\150-\219\156" .. -- U+06D6-U+06DC
{0x07A6, 0x07B0},
"\219\159-\219\164" .. -- U+06DF-U+06E4
{0x07EB, 0x07F3},
"\219\167" .. -- U+06E7
0x07FD,
"\219\168" .. -- U+06E8
{0x0816, 0x0819},
"\219\170-\219\173" .. -- U+06EA-U+06ED
{0x081B, 0x0823},
"\220\145" .. -- U+0711
{0x0825, 0x0827},
"\220\176-\221\138" .. -- U+0730-U+074A
{0x0829, 0x082D},
"\222\166-\222\176" .. -- U+07A6-U+07B0
{0x0859, 0x085B},
"\223\171-\223\179" .. -- U+07EB-U+07F3
{0x0898, 0x089F},
"\223\189" .. -- U+07FD
{0x08CA, 0x08E1},
"\224\160\150-\224\160\153" .. -- U+0816-U+0819
{0x08E3, 0x0903},
"\224\160\155-\224\160\163" .. -- U+081B-U+0823
{0x093A, 0x093C},
"\224\160\165-\224\160\167" .. -- U+0825-U+0827
{0x093E, 0x094F},
"\224\160\169-\224\160\173" .. -- U+0829-U+082D
{0x0951, 0x0957},
"\224\161\153-\224\161\155" .. -- U+0859-U+085B
{0x0962, 0x0963},
"\224\162\151-\224\162\159" .. -- U+0897-U+089F
{0x0981, 0x0983},
"\224\163\138-\224\163\161" .. -- U+08CA-U+08E1
0x09BC,
"\224\163\163-\224\164\131" .. -- U+08E3-U+0903
{0x09BE, 0x09C4},
"\224\164\186-\224\164\188" .. -- U+093A-U+093C
{0x09C7, 0x09C8},
"\224\164\190-\224\165\143" .. -- U+093E-U+094F
{0x09CB, 0x09CD},
"\224\165\145-\224\165\151" .. -- U+0951-U+0957
0x09D7,
"\224\165\162" .. -- U+0962
{0x09E2, 0x09E3},
"\224\165\163" .. -- U+0963
0x09FE,
"\224\166\129-\224\166\131" .. -- U+0981-U+0983
{0x0A01, 0x0A03},
"\224\166\188" .. -- U+09BC
0x0A3C,
"\224\166\190-\224\167\132" .. -- U+09BE-U+09C4
{0x0A3E, 0x0A42},
"\224\167\135" .. -- U+09C7
{0x0A47, 0x0A48},
"\224\167\136" .. -- U+09C8
{0x0A4B, 0x0A4D},
"\224\167\139-\224\167\141" .. -- U+09CB-U+09CD
0x0A51,
"\224\167\151" .. -- U+09D7
{0x0A70, 0x0A71},
"\224\167\162" .. -- U+09E2
0x0A75,
"\224\167\163" .. -- U+09E3
{0x0A81, 0x0A83},
"\224\167\190" .. -- U+09FE
0x0ABC,
"\224\168\129-\224\168\131" .. -- U+0A01-U+0A03
{0x0ABE, 0x0AC5},
"\224\168\188" .. -- U+0A3C
{0x0AC7, 0x0AC9},
"\224\168\190-\224\169\130" .. -- U+0A3E-U+0A42
{0x0ACB, 0x0ACD},
"\224\169\135" .. -- U+0A47
{0x0AE2, 0x0AE3},
"\224\169\136" .. -- U+0A48
{0x0AFA, 0x0AFF},
"\224\169\139-\224\169\141" .. -- U+0A4B-U+0A4D
{0x0B01, 0x0B03},
"\224\169\145" .. -- U+0A51
0x0B3C,
"\224\169\176" .. -- U+0A70
{0x0B3E, 0x0B44},
"\224\169\177" .. -- U+0A71
{0x0B47, 0x0B48},
"\224\169\181" .. -- U+0A75
{0x0B4B, 0x0B4D},
"\224\170\129-\224\170\131" .. -- U+0A81-U+0A83
{0x0B55, 0x0B57},
"\224\170\188" .. -- U+0ABC
{0x0B62, 0x0B63},
"\224\170\190-\224\171\133" .. -- U+0ABE-U+0AC5
0x0B82,
"\224\171\135-\224\171\137" .. -- U+0AC7-U+0AC9
{0x0BBE, 0x0BC2},
"\224\171\139-\224\171\141" .. -- U+0ACB-U+0ACD
{0x0BC6, 0x0BC8},
"\224\171\162" .. -- U+0AE2
{0x0BCA, 0x0BCD},
"\224\171\163" .. -- U+0AE3
0x0BD7,
"\224\171\186-\224\171\191" .. -- U+0AFA-U+0AFF
{0x0C00, 0x0C04},
"\224\172\129-\224\172\131" .. -- U+0B01-U+0B03
0x0C3C,
"\224\172\188" .. -- U+0B3C
{0x0C3E, 0x0C44},
"\224\172\190-\224\173\132" .. -- U+0B3E-U+0B44
{0x0C46, 0x0C48},
"\224\173\135" .. -- U+0B47
{0x0C4A, 0x0C4D},
"\224\173\136" .. -- U+0B48
{0x0C55, 0x0C56},
"\224\173\139-\224\173\141" .. -- U+0B4B-U+0B4D
{0x0C62, 0x0C63},
"\224\173\149-\224\173\151" .. -- U+0B55-U+0B57
{0x0C81, 0x0C83},
"\224\173\162" .. -- U+0B62
0x0CBC,
"\224\173\163" .. -- U+0B63
{0x0CBE, 0x0CC4},
"\224\174\130" .. -- U+0B82
{0x0CC6, 0x0CC8},
"\224\174\190-\224\175\130" .. -- U+0BBE-U+0BC2
{0x0CCA, 0x0CCD},
"\224\175\134-\224\175\136" .. -- U+0BC6-U+0BC8
{0x0CD5, 0x0CD6},
"\224\175\138-\224\175\141" .. -- U+0BCA-U+0BCD
{0x0CE2, 0x0CE3},
"\224\175\151" .. -- U+0BD7
0x0CF3,
"\224\176\128-\224\176\132" .. -- U+0C00-U+0C04
{0x0D00, 0x0D03},
"\224\176\188" .. -- U+0C3C
{0x0D3B, 0x0D3C},
"\224\176\190-\224\177\132" .. -- U+0C3E-U+0C44
{0x0D3E, 0x0D44},
"\224\177\134-\224\177\136" .. -- U+0C46-U+0C48
{0x0D46, 0x0D48},
"\224\177\138-\224\177\141" .. -- U+0C4A-U+0C4D
{0x0D4A, 0x0D4D},
"\224\177\149" .. -- U+0C55
0x0D57,
"\224\177\150" .. -- U+0C56
{0x0D62, 0x0D63},
"\224\177\162" .. -- U+0C62
{0x0D81, 0x0D83},
"\224\177\163" .. -- U+0C63
0x0DCA,
"\224\178\129-\224\178\131" .. -- U+0C81-U+0C83
{0x0DCF, 0x0DD4},
"\224\178\188" .. -- U+0CBC
0x0DD6,
"\224\178\190-\224\179\132" .. -- U+0CBE-U+0CC4
{0x0DD8, 0x0DDF},
"\224\179\134-\224\179\136" .. -- U+0CC6-U+0CC8
{0x0DF2, 0x0DF3},
"\224\179\138-\224\179\141" .. -- U+0CCA-U+0CCD
0x0E31,
"\224\179\149" .. -- U+0CD5
{0x0E34, 0x0E3A},
"\224\179\150" .. -- U+0CD6
{0x0E47, 0x0E4E},
"\224\179\162" .. -- U+0CE2
0x0EB1,
"\224\179\163" .. -- U+0CE3
{0x0EB4, 0x0EBC},
"\224\179\179" .. -- U+0CF3
{0x0EC8, 0x0ECE},
"\224\180\128-\224\180\131" .. -- U+0D00-U+0D03
{0x0F18, 0x0F19},
"\224\180\187" .. -- U+0D3B
0x0F35,
"\224\180\188" .. -- U+0D3C
0x0F37,
"\224\180\190-\224\181\132" .. -- U+0D3E-U+0D44
0x0F39,
"\224\181\134-\224\181\136" .. -- U+0D46-U+0D48
{0x0F3E, 0x0F3F},
"\224\181\138-\224\181\141" .. -- U+0D4A-U+0D4D
{0x0F71, 0x0F84},
"\224\181\151" .. -- U+0D57
{0x0F86, 0x0F87},
"\224\181\162" .. -- U+0D62
{0x0F8D, 0x0F97},
"\224\181\163" .. -- U+0D63
{0x0F99, 0x0FBC},
"\224\182\129-\224\182\131" .. -- U+0D81-U+0D83
0x0FC6,
"\224\183\138" .. -- U+0DCA
{0x102B, 0x103E},
"\224\183\143-\224\183\148" .. -- U+0DCF-U+0DD4
{0x1056, 0x1059},
"\224\183\150" .. -- U+0DD6
{0x105E, 0x1060},
"\224\183\152-\224\183\159" .. -- U+0DD8-U+0DDF
{0x1062, 0x1064},
"\224\183\178" .. -- U+0DF2
{0x1067, 0x106D},
"\224\183\179" .. -- U+0DF3
{0x1071, 0x1074},
"\224\184\177" .. -- U+0E31
{0x1082, 0x108D},
"\224\184\180-\224\184\186" .. -- U+0E34-U+0E3A
0x108F,
"\224\185\135-\224\185\142" .. -- U+0E47-U+0E4E
{0x109A, 0x109D},
"\224\186\177" .. -- U+0EB1
{0x135D, 0x135F},
"\224\186\180-\224\186\188" .. -- U+0EB4-U+0EBC
{0x1712, 0x1715},
"\224\187\136-\224\187\142" .. -- U+0EC8-U+0ECE
{0x1732, 0x1734},
"\224\188\152" .. -- U+0F18
{0x1752, 0x1753},
"\224\188\153" .. -- U+0F19
{0x1772, 0x1773},
"\224\188\181" .. -- U+0F35
{0x17B4, 0x17D3},
"\224\188\183" .. -- U+0F37
0x17DD,
"\224\188\185" .. -- U+0F39
-- Exclude Mongolian variation selectors.
"\224\188\190" .. -- U+0F3E
{0x1885, 0x1886},
"\224\188\191" .. -- U+0F3F
0x18A9,
"\224\189\177-\224\190\132" .. -- U+0F71-U+0F84
{0x1920, 0x192B},
"\224\190\134" .. -- U+0F86
{0x1930, 0x193B},
"\224\190\135" .. -- U+0F87
{0x1A17, 0x1A1B},
"\224\190\141-\224\190\151" .. -- U+0F8D-U+0F97
{0x1A55, 0x1A5E},
"\224\190\153-\224\190\188" .. -- U+0F99-U+0FBC
{0x1A60, 0x1A7C},
"\224\191\134" .. -- U+0FC6
0x1A7F,
"\225\128\171-\225\128\190" .. -- U+102B-U+103E
{0x1AB0, 0x1ACE},
"\225\129\150-\225\129\153" .. -- U+1056-U+1059
{0x1B00, 0x1B04},
"\225\129\158-\225\129\160" .. -- U+105E-U+1060
{0x1B34, 0x1B44},
"\225\129\162-\225\129\164" .. -- U+1062-U+1064
{0x1B6B, 0x1B73},
"\225\129\167-\225\129\173" .. -- U+1067-U+106D
{0x1B80, 0x1B82},
"\225\129\177-\225\129\180" .. -- U+1071-U+1074
{0x1BA1, 0x1BAD},
"\225\130\130-\225\130\141" .. -- U+1082-U+108D
{0x1BE6, 0x1BF3},
"\225\130\143" .. -- U+108F
{0x1C24, 0x1C37},
"\225\130\154-\225\130\157" .. -- U+109A-U+109D
{0x1CD0, 0x1CD2},
"\225\141\157-\225\141\159" .. -- U+135D-U+135F
{0x1CD4, 0x1CE8},
"\225\156\146-\225\156\149" .. -- U+1712-U+1715
0x1CED,
"\225\156\178-\225\156\180" .. -- U+1732-U+1734
0x1CF4,
"\225\157\146" .. -- U+1752
{0x1CF7, 0x1CF9},
"\225\157\147" .. -- U+1753
{0x1DC0, 0x1DCC},
"\225\157\178" .. -- U+1772
{0x1DCE, 0x1DFB},
"\225\157\179" .. -- U+1773
{0x1DFD, 0x1DFF},
"\225\158\180-\225\159\147") .. -- U+17B4-U+17D3
{0x20D0, 0x20F0},
("\225\159\157" .. -- U+17DD
{0x2CEF, 0x2CF1},
"\225\162\133" .. -- U+1885
0x2D7F,
"\225\162\134" .. -- U+1886
{0x2DE0, 0x2DFF},
"\225\162\169" .. -- U+18A9
{0x302A, 0x302F},
"\225\164\160-\225\164\171" .. -- U+1920-U+192B
{0x3099, 0x309A},
"\225\164\176-\225\164\187" .. -- U+1930-U+193B
{0xA66F, 0xA672},
"\225\168\151-\225\168\155" .. -- U+1A17-U+1A1B
{0xA674, 0xA67D},
"\225\169\149-\225\169\158" .. -- U+1A55-U+1A5E
{0xA69E, 0xA69F},
"\225\169\160-\225\169\188" .. -- U+1A60-U+1A7C
{0xA6F0, 0xA6F1},
"\225\169\191" .. -- U+1A7F
0xA802,
"\225\170\176-\225\171\142" .. -- U+1AB0-U+1ACE
0xA806,
"\225\172\128-\225\172\132" .. -- U+1B00-U+1B04
0xA80B,
"\225\172\180-\225\173\132" .. -- U+1B34-U+1B44
{0xA823, 0xA827},
"\225\173\171-\225\173\179" .. -- U+1B6B-U+1B73
0xA82C,
"\225\174\128-\225\174\130" .. -- U+1B80-U+1B82
{0xA880, 0xA881},
"\225\174\161-\225\174\173" .. -- U+1BA1-U+1BAD
{0xA8B4, 0xA8C5},
"\225\175\166-\225\175\179" .. -- U+1BE6-U+1BF3
{0xA8E0, 0xA8F1},
"\225\176\164-\225\176\183" .. -- U+1C24-U+1C37
0xA8FF,
"\225\179\144-\225\179\146" .. -- U+1CD0-U+1CD2
{0xA926, 0xA92D},
"\225\179\148-\225\179\168" .. -- U+1CD4-U+1CE8
{0xA947, 0xA953},
"\225\179\173" .. -- U+1CED
{0xA980, 0xA983},
"\225\179\180" .. -- U+1CF4
{0xA9B3, 0xA9C0},
"\225\179\183-\225\179\185" .. -- U+1CF7-U+1CF9
0xA9E5,
"\225\183\128-\225\183\140" .. -- U+1DC0-U+1DCC
{0xAA29, 0xAA36},
"\225\183\142-\225\183\187" .. -- U+1DCE-U+1DFB
0xAA43,
"\225\183\189-\225\183\191" .. -- U+1DFD-U+1DFF
{0xAA4C, 0xAA4D},
"\226\131\144-\226\131\176" .. -- U+20D0-U+20F0
{0xAA7B, 0xAA7D},
"\226\179\175-\226\179\177" .. -- U+2CEF-U+2CF1
0xAAB0,
"\226\181\191" .. -- U+2D7F
{0xAAB2, 0xAAB4},
"\226\183\160-\226\183\191" .. -- U+2DE0-U+2DFF
{0xAAB7, 0xAAB8},
"\227\128\170-\227\128\175" .. -- U+302A-U+302F
{0xAABE, 0xAABF},
"\227\130\153" .. -- U+3099
0xAAC1,
"\227\130\154" .. -- U+309A
{0xAAEB, 0xAAEF},
"\234\153\175-\234\153\178" .. -- U+A66F-U+A672
{0xAAF5, 0xAAF6},
"\234\153\180-\234\153\189" .. -- U+A674-U+A67D
{0xABE3, 0xABEA},
"\234\154\158" .. -- U+A69E
{0xABEC, 0xABED},
"\234\154\159" .. -- U+A69F
0xFB1E,
"\234\155\176" .. -- U+A6F0
{0xFE20, 0xFE2F},
"\234\155\177" .. -- U+A6F1
0x101FD,
"\234\160\130" .. -- U+A802
0x102E0,
"\234\160\134" .. -- U+A806
{0x10376, 0x1037A},
"\234\160\139" .. -- U+A80B
{0x10A01, 0x10A03},
"\234\160\163-\234\160\167" .. -- U+A823-U+A827
{0x10A05, 0x10A06},
"\234\160\172" .. -- U+A82C
{0x10A0C, 0x10A0F},
"\234\162\128" .. -- U+A880
{0x10A38, 0x10A3A},
"\234\162\129" .. -- U+A881
0x10A3F,
"\234\162\180-\234\163\133" .. -- U+A8B4-U+A8C5
{0x10AE5, 0x10AE6},
"\234\163\160-\234\163\177" .. -- U+A8E0-U+A8F1
{0x10D24, 0x10D27},
"\234\163\191" .. -- U+A8FF
{0x10EAB, 0x10EAC},
"\234\164\166-\234\164\173" .. -- U+A926-U+A92D
{0x10EFD, 0x10EFF},
"\234\165\135-\234\165\147" .. -- U+A947-U+A953
{0x10F46, 0x10F50},
"\234\166\128-\234\166\131" .. -- U+A980-U+A983
{0x10F82, 0x10F85},
"\234\166\179-\234\167\128" .. -- U+A9B3-U+A9C0
{0x11000, 0x11002},
"\234\167\165" .. -- U+A9E5
{0x11038, 0x11046},
"\234\168\169-\234\168\182" .. -- U+AA29-U+AA36
0x11070,
"\234\169\131" .. -- U+AA43
{0x11073, 0x11074},
"\234\169\140" .. -- U+AA4C
{0x1107F, 0x11082},
"\234\169\141" .. -- U+AA4D
{0x110B0, 0x110BA},
"\234\169\187-\234\169\189" .. -- U+AA7B-U+AA7D
0x110C2,
"\234\170\176" .. -- U+AAB0
{0x11100, 0x11102},
"\234\170\178-\234\170\180" .. -- U+AAB2-U+AAB4
{0x11127, 0x11134},
"\234\170\183" .. -- U+AAB7
{0x11145, 0x11146},
"\234\170\184" .. -- U+AAB8
0x11173,
"\234\170\190" .. -- U+AABE
{0x11180, 0x11182},
"\234\170\191" .. -- U+AABF
{0x111B3, 0x111C0},
"\234\171\129" .. -- U+AAC1
{0x111C9, 0x111CC},
"\234\171\171-\234\171\175" .. -- U+AAEB-U+AAEF
{0x111CE, 0x111CF},
"\234\171\181" .. -- U+AAF5
{0x1122C, 0x11237},
"\234\171\182" .. -- U+AAF6
0x1123E,
"\234\175\163-\234\175\170" .. -- U+ABE3-U+ABEA
0x11241,
"\234\175\172" .. -- U+ABEC
{0x112DF, 0x112EA},
"\234\175\173" .. -- U+ABED
{0x11300, 0x11303},
"\239\172\158" .. -- U+FB1E
{0x1133B, 0x1133C},
"\239\184\160-\239\184\175" .. -- U+FE20-U+FE2F
{0x1133E, 0x11344},
"\240\144\135\189" .. -- U+101FD
{0x11347, 0x11348},
"\240\144\139\160" .. -- U+102E0
{0x1134B, 0x1134D},
"\240\144\141\182-\240\144\141\186" .. -- U+10376-U+1037A
0x11357,
"\240\144\168\129-\240\144\168\131" .. -- U+10A01-U+10A03
{0x11362, 0x11363},
"\240\144\168\133" .. -- U+10A05
{0x11366, 0x1136C},
"\240\144\168\134" .. -- U+10A06
{0x11370, 0x11374},
"\240\144\168\140-\240\144\168\143" .. -- U+10A0C-U+10A0F
{0x11435, 0x11446},
"\240\144\168\184-\240\144\168\186" .. -- U+10A38-U+10A3A
0x1145E,
"\240\144\168\191" .. -- U+10A3F
{0x114B0, 0x114C3},
"\240\144\171\165" .. -- U+10AE5
{0x115AF, 0x115B5},
"\240\144\171\166" .. -- U+10AE6
{0x115B8, 0x115C0},
"\240\144\180\164-\240\144\180\167" .. -- U+10D24-U+10D27
{0x115DC, 0x115DD},
"\240\144\181\169-\240\144\181\173" .. -- U+10D69-U+10D6D
{0x11630, 0x11640},
"\240\144\186\171" .. -- U+10EAB
{0x116AB, 0x116B7},
"\240\144\186\172" .. -- U+10EAC
{0x1171D, 0x1172B},
"\240\144\187\188-\240\144\187\191" .. -- U+10EFC-U+10EFF
{0x1182C, 0x1183A},
"\240\144\189\134-\240\144\189\144" .. -- U+10F46-U+10F50
{0x11930, 0x11935},
"\240\144\190\130-\240\144\190\133" .. -- U+10F82-U+10F85
{0x11937, 0x11938},
"\240\145\128\128-\240\145\128\130" .. -- U+11000-U+11002
{0x1193B, 0x1193E},
"\240\145\128\184-\240\145\129\134" .. -- U+11038-U+11046
0x11940,
"\240\145\129\176" .. -- U+11070
{0x11942, 0x11943},
"\240\145\129\179" .. -- U+11073
{0x119D1, 0x119D7},
"\240\145\129\180" .. -- U+11074
{0x119DA, 0x119E0},
"\240\145\129\191-\240\145\130\130" .. -- U+1107F-U+11082
0x119E4,
"\240\145\130\176-\240\145\130\186" .. -- U+110B0-U+110BA
{0x11A01, 0x11A0A},
"\240\145\131\130" .. -- U+110C2
{0x11A33, 0x11A39},
"\240\145\132\128-\240\145\132\130" .. -- U+11100-U+11102
{0x11A3B, 0x11A3E},
"\240\145\132\167-\240\145\132\180" .. -- U+11127-U+11134
0x11A47,
"\240\145\133\133" .. -- U+11145
{0x11A51, 0x11A5B},
"\240\145\133\134" .. -- U+11146
{0x11A8A, 0x11A99},
"\240\145\133\179" .. -- U+11173
{0x11C2F, 0x11C36},
"\240\145\134\128-\240\145\134\130" .. -- U+11180-U+11182
{0x11C38, 0x11C3F},
"\240\145\134\179-\240\145\135\128" .. -- U+111B3-U+111C0
{0x11C92, 0x11CA7},
"\240\145\135\137-\240\145\135\140" .. -- U+111C9-U+111CC
{0x11CA9, 0x11CB6},
"\240\145\135\142" .. -- U+111CE
{0x11D31, 0x11D36},
"\240\145\135\143" .. -- U+111CF
0x11D3A,
"\240\145\136\172-\240\145\136\183" .. -- U+1122C-U+11237
{0x11D3C, 0x11D3D},
"\240\145\136\190" .. -- U+1123E
{0x11D3F, 0x11D45},
"\240\145\137\129" .. -- U+11241
0x11D47,
"\240\145\139\159-\240\145\139\170" .. -- U+112DF-U+112EA
{0x11D8A, 0x11D8E},
"\240\145\140\128-\240\145\140\131" .. -- U+11300-U+11303
{0x11D90, 0x11D91},
"\240\145\140\187" .. -- U+1133B
{0x11D93, 0x11D97},
"\240\145\140\188" .. -- U+1133C
{0x11EF3, 0x11EF6},
"\240\145\140\190-\240\145\141\132" .. -- U+1133E-U+11344
{0x11F00, 0x11F01},
"\240\145\141\135" .. -- U+11347
0x11F03,
"\240\145\141\136" .. -- U+11348
{0x11F34, 0x11F3A},
"\240\145\141\139-\240\145\141\141" .. -- U+1134B-U+1134D
{0x11F3E, 0x11F42},
"\240\145\141\151" .. -- U+11357
0x13440,
"\240\145\141\162" .. -- U+11362
{0x13447, 0x13455},
"\240\145\141\163" .. -- U+11363
{0x16AF0, 0x16AF4},
"\240\145\141\166-\240\145\141\172" .. -- U+11366-U+1136C
{0x16B30, 0x16B36},
"\240\145\141\176-\240\145\141\180" .. -- U+11370-U+11374
0x16F4F,
"\240\145\142\184-\240\145\143\128" .. -- U+113B8-U+113C0
{0x16F51, 0x16F87},
"\240\145\143\130" .. -- U+113C2
{0x16F8F, 0x16F92},
"\240\145\143\133" .. -- U+113C5
-- Exclude Khitan Small Script filler.
"\240\145\143\135-\240\145\143\138" .. -- U+113C7-U+113CA
{0x16FF0, 0x16FF1},
"\240\145\143\140-\240\145\143\144" .. -- U+113CC-U+113D0
{0x1BC9D, 0x1BC9E},
"\240\145\143\146" .. -- U+113D2
{0x1CF00, 0x1CF2D},
"\240\145\143\161" .. -- U+113E1
{0x1CF30, 0x1CF46},
"\240\145\143\162" .. -- U+113E2
{0x1D165, 0x1D169},
"\240\145\144\181-\240\145\145\134" .. -- U+11435-U+11446
{0x1D16D, 0x1D172},
"\240\145\145\158" .. -- U+1145E
{0x1D17B, 0x1D182},
"\240\145\146\176-\240\145\147\131" .. -- U+114B0-U+114C3
{0x1D185, 0x1D18B},
"\240\145\150\175-\240\145\150\181" .. -- U+115AF-U+115B5
{0x1D1AA, 0x1D1AD},
"\240\145\150\184-\240\145\151\128" .. -- U+115B8-U+115C0
{0x1D242, 0x1D244},
"\240\145\151\156" .. -- U+115DC
{0x1DA00, 0x1DA36},
"\240\145\151\157" .. -- U+115DD
{0x1DA3B, 0x1DA6C},
"\240\145\152\176-\240\145\153\128" .. -- U+11630-U+11640
0x1DA75,
"\240\145\154\171-\240\145\154\183" .. -- U+116AB-U+116B7
0x1DA84,
"\240\145\156\157-\240\145\156\171" .. -- U+1171D-U+1172B
{0x1DA9B, 0x1DA9F},
"\240\145\160\172-\240\145\160\186" .. -- U+1182C-U+1183A
{0x1DAA1, 0x1DAAF},
"\240\145\164\176-\240\145\164\181" .. -- U+11930-U+11935
{0x1E000, 0x1E006},
"\240\145\164\183" .. -- U+11937
{0x1E008, 0x1E018},
"\240\145\164\184" .. -- U+11938
{0x1E01B, 0x1E021},
"\240\145\164\187-\240\145\164\190" .. -- U+1193B-U+1193E
{0x1E023, 0x1E024},
"\240\145\165\128") .. -- U+11940
{0x1E026, 0x1E02A},
("\240\145\165\130" .. -- U+11942
0x1E08F,
"\240\145\165\131" .. -- U+11943
{0x1E130, 0x1E136},
"\240\145\167\145-\240\145\167\151" .. -- U+119D1-U+119D7
0x1E2AE,
"\240\145\167\154-\240\145\167\160" .. -- U+119DA-U+119E0
{0x1E2EC, 0x1E2EF},
"\240\145\167\164" .. -- U+119E4
{0x1E4EC, 0x1E4EF},
"\240\145\168\129-\240\145\168\138" .. -- U+11A01-U+11A0A
{0x1E8D0, 0x1E8D6},
"\240\145\168\179-\240\145\168\185" .. -- U+11A33-U+11A39
{0x1E944, 0x1E94A},
"\240\145\168\187-\240\145\168\190" .. -- U+11A3B-U+11A3E
},
"\240\145\169\135" .. -- U+11A47
double = {
"\240\145\169\145-\240\145\169\155" .. -- U+11A51-U+11A5B
{0x035C, 0x0362},
"\240\145\170\138-\240\145\170\153" .. -- U+11A8A-U+11A99
0x1DCD,
"\240\145\176\175-\240\145\176\182" .. -- U+11C2F-U+11C36
0x1DFC,
"\240\145\176\184-\240\145\176\191" .. -- U+11C38-U+11C3F
},
"\240\145\178\146-\240\145\178\167" .. -- U+11C92-U+11CA7
vs = { -- variation selectors; separated out so that we don't get categories for them
"\240\145\178\169-\240\145\178\182" .. -- U+11CA9-U+11CB6
{0xFE00, 0xFE0F},
"\240\145\180\177-\240\145\180\182" .. -- U+11D31-U+11D36
{0xE0100, 0xE01EF},
"\240\145\180\186" .. -- U+11D3A
}
"\240\145\180\188" .. -- U+11D3C
}
"\240\145\180\189" .. -- U+11D3D
for key, charset in pairs(comb_chars) do
"\240\145\180\191-\240\145\181\133" .. -- U+11D3F-U+11D45
comb_chars[key] = char_ranges_to_pattern(charset)
"\240\145\181\135" .. -- U+11D47
end
"\240\145\182\138-\240\145\182\142" .. -- U+11D8A-U+11D8E
"\240\145\182\144" .. -- U+11D90
"\240\145\182\145" .. -- U+11D91
"\240\145\182\147-\240\145\182\151" .. -- U+11D93-U+11D97
"\240\145\187\179-\240\145\187\182" .. -- U+11EF3-U+11EF6
"\240\145\188\128" .. -- U+11F00
"\240\145\188\129" .. -- U+11F01
"\240\145\188\131" .. -- U+11F03
"\240\145\188\180-\240\145\188\186" .. -- U+11F34-U+11F3A
"\240\145\188\190-\240\145\189\130" .. -- U+11F3E-U+11F42
"\240\145\189\154" .. -- U+11F5A
"\240\147\145\128" .. -- U+13440
"\240\147\145\135-\240\147\145\149" .. -- U+13447-U+13455
"\240\150\132\158-\240\150\132\175" .. -- U+1611E-U+1612F
"\240\150\171\176-\240\150\171\180" .. -- U+16AF0-U+16AF4
"\240\150\172\176-\240\150\172\182" .. -- U+16B30-U+16B36
"\240\150\189\143" .. -- U+16F4F
"\240\150\189\145-\240\150\190\135" .. -- U+16F51-U+16F87
"\240\150\190\143-\240\150\190\146" .. -- U+16F8F-U+16F92
"\240\150\191\164" .. -- U+16FE4
"\240\150\191\176" .. -- U+16FF0
"\240\150\191\177" .. -- U+16FF1
"\240\155\178\157" .. -- U+1BC9D
"\240\155\178\158" .. -- U+1BC9E
"\240\156\188\128-\240\156\188\173" .. -- U+1CF00-U+1CF2D
"\240\156\188\176-\240\156\189\134" .. -- U+1CF30-U+1CF46
"\240\157\133\165-\240\157\133\169" .. -- U+1D165-U+1D169
"\240\157\133\173-\240\157\133\178" .. -- U+1D16D-U+1D172
"\240\157\133\187-\240\157\134\130" .. -- U+1D17B-U+1D182
"\240\157\134\133-\240\157\134\139" .. -- U+1D185-U+1D18B
"\240\157\134\170-\240\157\134\173" .. -- U+1D1AA-U+1D1AD
"\240\157\137\130-\240\157\137\132" .. -- U+1D242-U+1D244
"\240\157\168\128-\240\157\168\182" .. -- U+1DA00-U+1DA36
"\240\157\168\187-\240\157\169\172" .. -- U+1DA3B-U+1DA6C
"\240\157\169\181" .. -- U+1DA75
"\240\157\170\132" .. -- U+1DA84
"\240\157\170\155-\240\157\170\159" .. -- U+1DA9B-U+1DA9F
"\240\157\170\161-\240\157\170\175" .. -- U+1DAA1-U+1DAAF
"\240\158\128\128-\240\158\128\134" .. -- U+1E000-U+1E006
"\240\158\128\136-\240\158\128\152" .. -- U+1E008-U+1E018
"\240\158\128\155-\240\158\128\161" .. -- U+1E01B-U+1E021
"\240\158\128\163" .. -- U+1E023
"\240\158\128\164" .. -- U+1E024
"\240\158\128\166-\240\158\128\170" .. -- U+1E026-U+1E02A
"\240\158\130\143" .. -- U+1E08F
"\240\158\132\176-\240\158\132\182" .. -- U+1E130-U+1E136
"\240\158\138\174" .. -- U+1E2AE
"\240\158\139\172-\240\158\139\175" .. -- U+1E2EC-U+1E2EF
"\240\158\147\172-\240\158\147\175" .. -- U+1E4EC-U+1E4EF
"\240\158\151\174" .. -- U+1E5EE
"\240\158\151\175" .. -- U+1E5EF
"\240\158\163\144-\240\158\163\150" .. -- U+1E8D0-U+1E8D6
"\240\158\165\132-\240\158\165\138") -- U+1E944-U+1E94A
 
-- Double combining characters.
-- Charset: [[:M:]&[:Canonical_Combining_Class=/^Double_/:]&[:^subhead=Grapheme joiner:]&[:^Variation_Selector=Yes:]]
local comb_chars_double =
"\205\156-\205\162" .. -- U+035C-U+0362
"\225\183\141" .. -- U+1DCD
"\225\183\188" -- U+1DFC
-- Variation selectors etc.; separated out so that we don't get categories for them.
-- Charset: [[:M:]&[[:subhead=Grapheme joiner:][:Variation_Selector=Yes:]]].
local comb_chars_other =
"\205\143" .. -- U+034F
"\225\160\139-\225\160\141" .. -- U+180B-U+180D
"\225\160\143" .. -- U+180F
"\239\184\128-\239\184\143" .. -- U+FE00-U+FE0F
"\243\160\132\128-\243\160\135\175" -- U+E0100-U+E01EF


local comb_chars_all = comb_chars.single .. comb_chars.double .. comb_chars.vs
local comb_chars_all = comb_chars_single .. comb_chars_double .. comb_chars_other


comb_chars = {
local comb_chars = {
combined_single = "[^" .. comb_chars_all .. "][" .. comb_chars.single .. comb_chars.vs .. "]+%f[^" .. comb_chars_all .. "]",
combined_single = "[^" .. comb_chars_all .. "][" .. comb_chars_single .. comb_chars_other .. "]+%f[^" .. comb_chars_all .. "]",
combined_double = "[^" .. comb_chars_all .. "][" .. comb_chars.single .. comb_chars.vs .. "]*[" .. comb_chars.double .. "]+[" .. comb_chars_all .. "]*.[" .. comb_chars.single .. comb_chars.vs .. "]*",
combined_double = "[^" .. comb_chars_all .. "][" .. comb_chars_single .. comb_chars_other .. "]*[" .. comb_chars_double .. "]+[" .. comb_chars_all .. "]*.[" .. comb_chars_single .. comb_chars_other .. "]*",
diacritics_single = "[" .. comb_chars.single .. "]",
diacritics_single = "[" .. comb_chars_single .. "]",
diacritics_double = "[" .. comb_chars.double .. "]",
diacritics_double = "[" .. comb_chars_double .. "]",
diacritics_all = "[" .. comb_chars_all .. "]"
diacritics_all = "[" .. comb_chars_all .. "]"
}
}


-- From https://unicode.org/Public/emoji/15.1/emoji-sequences.txt
-- Somewhat curated list from https://unicode.org/Public/emoji/16.0/emoji-sequences.txt.
local emoji_chars = {
-- NOTE: There are lots more emoji sequences involving non-emoji Plane 0 symbols followed by 0xFE0F, which we don't
{0x231A, 0x231B}, -- watch..hourglass done                                          # E0.6  [2] (..)
-- (yet?) handle.
{0x23E9, 0x23EC}, -- fast-forward button..fast down button                          # E0.6  [4] (..)
local emoji_chars =
0x23F0,          -- alarm clock                                                    # E0.6  [1] ()
"\226\140\154" .. -- U+231A (⌚)
0x23F3,          -- hourglass not done                                            # E0.6  [1] ()
"\226\140\155" .. -- U+231B (⌛)
{0x25FD, 0x25FE}, -- white medium-small square..black medium-small square          # E0.6  [2] (..)
"\226\140\168" .. -- U+2328 (⌨)
{0x2614, 0x2615}, -- umbrella with rain drops..hot beverage                        # E0.6  [2] (..)
"\226\143\143" .. -- U+23CF (⏏)
{0x2648, 0x2653}, -- Aries..Pisces                                                  # E0.6  [12] (♈..)
"\226\143\169-\226\143\179" .. -- U+23E9-U+23F3 (⏩-⏳)
0x267F,          -- wheelchair symbol                                              # E0.6  [1] (♿)
"\226\143\184-\226\143\186" .. -- U+23F8-U+23FA (⏸-⏺)
0x2693,          -- anchor                                                        # E0.6  [1] ()
"\226\150\170" .. -- U+25AA (▪)
0x26A1,          -- high voltage                                                  # E0.6  [1] ()
"\226\150\171" .. -- U+25AB (▫)
{0x26AA, 0x26AB}, -- white circle..black circle                                    # E0.6  [2] (⚪..⚫)
"\226\150\182" .. -- U+25B6 (▶)
{0x26BD, 0x26BE}, -- soccer ball..baseball                                          # E0.6  [2] (⚽..⚾)
"\226\151\128" .. -- U+25C0 (◀)
{0x26C4, 0x26C5}, -- snowman without snow..sun behind cloud                        # E0.6  [2] (⛄..⛅)
"\226\151\187-\226\151\190" .. -- U+25FB-U+25FE (◻-◾)
0x26CE,          -- Ophiuchus                                                      # E0.6  [1] (⛎)
"\226\152\128-\226\152\132" .. -- U+2600-U+2604 (☀-☄)
0x26D4,          -- no entry                                                      # E0.6  [1] ()
"\226\152\142" .. -- U+260E ()
0x26EA,          -- church                                                        # E0.6  [1] ()
"\226\152\145" .. -- U+2611 (☑)
{0x26F2, 0x26F3}, -- fountain..flag in hole                                        # E0.6  [2] (..)
"\226\152\148" .. -- U+2614 ()
0x26F5,          -- sailboat                                                      # E0.6  [1] (⛵)
"\226\152\149" .. -- U+2615 (☕)
0x26FA,          -- tent                                                          # E0.6  [1] (⛺)
"\226\152\152" .. -- U+2618 (☘)
0x26FD,          -- fuel pump                                                      # E0.6  [1] (⛽)
"\226\152\157" .. -- U+261D (☝)
0x2705,          -- check mark button                                              # E0.6  [1] (✅)
"\226\152\160" .. -- U+2620 (☠)
{0x270A, 0x270B}, -- raised fist..raised hand                                      # E0.6  [2] (..)
"\226\152\162" .. -- U+2622 (☢)
0x2728,          -- sparkles                                                      # E0.6  [1] (✨)
"\226\152\163" .. -- U+2623 (☣)
0x274C,          -- cross mark                                                    # E0.6  [1] ()
"\226\152\166" .. -- U+2626 (☦)
0x274E,          -- cross mark button                                              # E0.6  [1] ()
"\226\152\170" .. -- U+262A (☪)
{0x2753, 0x2755}, -- red question mark..white exclamation mark                      # E0.6  [3] (❓..❕)
"\226\152\174" .. -- U+262E (☮)
0x2757,          -- red exclamation mark                                          # E0.6  [1] (❗)
"\226\152\175" .. -- U+262F (☯)
{0x2795, 0x2797}, -- plus..divide                                                  # E0.6  [3] (➕..)
"\226\152\184-\226\152\186" .. -- U+2638-U+263A (☸-☺)
0x27B0,          -- curly loop                                                    # E0.6  [1] (➰)
"\226\153\136-\226\153\147" .. -- U+2648-U+2653 (♈-♓)
0x27BF,          -- double curly loop                                              # E1.0  [1] ()
"\226\153\159" .. -- U+265F (♟)
{0x2B1B, 0x2B1C}, -- black large square..white large square                        # E0.6  [2] (⬛..⬜)
"\226\153\160" .. -- U+2660 (♠)
0x2B50,          -- star                                                          # E0.6  [1] ()
"\226\153\163" .. -- U+2663 (♣)
0x2B55,          -- hollow red circle                                              # E0.6  [1] ()
"\226\153\165" .. -- U+2665 (♥)
{0x1F300, 0x1FAFF}, -- emoji in Plane 1
"\226\153\166" .. -- U+2666 (♦)
-- NOTE: There are lots more emoji sequences involving non-emoji Plane 0 symbols followed by 0xFE0F, which we don't
"\226\153\168" .. -- U+2668 (♨)
-- (yet?) handle.
"\226\153\187" .. -- U+267B (♻)
}
"\226\153\190" .. -- U+267E (♾)
emoji_chars = char_ranges_to_pattern(emoji_chars)
"\226\153\191" .. -- U+267F (♿)
"\226\154\146-\226\154\151" .. -- U+2692-U+2697 (⚒-⚗)
"\226\154\153" .. -- U+2699 ()
"\226\154\155" .. -- U+269B (⚛)
"\226\154\156" .. -- U+269C ()
"\226\154\160" .. -- U+26A0 (⚠)
"\226\154\161" .. -- U+26A1 (⚡)
"\226\154\170" .. -- U+26AA (⚪)
"\226\154\171" .. -- U+26AB (⚫)
"\226\154\176" .. -- U+26B0 (⚰)
"\226\154\177" .. -- U+26B1 (⚱)
"\226\154\189" .. -- U+26BD (⚽)
"\226\154\190" .. -- U+26BE (⚾)
"\226\155\132" .. -- U+26C4 (⛄)
"\226\155\133" .. -- U+26C5 (⛅)
"\226\155\136" .. -- U+26C8 (⛈)
"\226\155\142" .. -- U+26CE (⛎)
"\226\155\143" .. -- U+26CF (⛏)
"\226\155\145" .. -- U+26D1 ()
"\226\155\147" .. -- U+26D3 ()
"\226\155\148" .. -- U+26D4 (⛔)
"\226\155\169" .. -- U+26E9 (⛩)
"\226\155\170" .. -- U+26EA (⛪)
"\226\155\176-\226\155\181" .. -- U+26F0-U+26F5 (⛰-⛵)
"\226\155\183-\226\155\186" .. -- U+26F7-U+26FA (⛷-⛺)
"\226\155\189" .. -- U+26FD (⛽)
"\226\156\130" .. -- U+2702 (✂)
"\226\156\133" .. -- U+2705 (✅)
"\226\156\136-\226\156\141" .. -- U+2708-U+270D (✈-✍)
"\226\156\143" .. -- U+270F (✏)
"\226\156\146" .. -- U+2712 (✒)
"\226\156\148" .. -- U+2714 (✔)
"\226\156\150" .. -- U+2716 (✖)
"\226\156\157" .. -- U+271D (✝)
"\226\156\161" .. -- U+2721 (✡)
"\226\156\168" .. -- U+2728 (✨)
"\226\156\179" .. -- U+2733 (✳)
"\226\156\180" .. -- U+2734 ()
"\226\157\132" .. -- U+2744 (❄)
"\226\157\135" .. -- U+2747 ()
"\226\157\140" .. -- U+274C (❌)
"\226\157\142" .. -- U+274E (❎)
"\226\157\147-\226\157\149" .. -- U+2753-U+2755 (❓-❕)
"\226\157\151" .. -- U+2757 (❗)
"\226\157\163" .. -- U+2763 (❣)
"\226\157\164" .. -- U+2764 (❤)
"\226\158\149-\226\158\151" .. -- U+2795-U+2797 (➕-➗)
"\226\158\161" .. -- U+27A1 (➡)
"\226\158\176" .. -- U+27B0 (➰)
"\226\158\191" .. -- U+27BF (➿)
"\226\164\180" .. -- U+2934 ()
"\226\164\181" .. -- U+2935 (⤵)
"\226\172\133-\226\172\135" .. -- U+2B05-U+2B07 (⬅-⬇)
"\226\172\155" .. -- U+2B1B (⬛)
"\226\172\156" .. -- U+2B1C (⬜)
"\226\173\144" .. -- U+2B50 (⭐)
"\226\173\149" .. -- U+2B55 (⭕)
"\227\128\176" .. -- U+3030 (〰)
"\227\128\189" .. -- U+303D (〽)
"\227\138\151" .. -- U+3297 (㊗)
"\227\138\153" .. -- U+3299 (㊙)
"\240\159\128\132" .. -- U+1F004 (🀄)
"\240\159\131\143" .. -- U+1F0CF (🃏)
"\240\159\133\176" .. -- U+1F170 (🅰)
"\240\159\133\177" .. -- U+1F171 (🅱)
"\240\159\133\190" .. -- U+1F17E (🅾)
"\240\159\133\191" .. -- U+1F17F (🅿)
"\240\159\134\142" .. -- U+1F18E (🆎)
"\240\159\134\145-\240\159\134\154" .. -- U+1F191-U+1F19A (🆑-🆚)
"\240\159\136\129" .. -- U+1F201 (🈁)
"\240\159\136\130" .. -- U+1F202 (🈂)
"\240\159\136\154" .. -- U+1F21A (🈚)
"\240\159\136\175" .. -- U+1F22F (🈯)
"\240\159\136\178-\240\159\136\186" .. -- U+1F232-U+1F23A (🈲-🈺)
"\240\159\137\144" .. -- U+1F250 (🉐)
"\240\159\137\145" .. -- U+1F251 (🉑)
"\240\159\140\128-\240\159\153\143" .. -- U+1F300-U+1F64F (🌀-🙏)
"\240\159\154\128-\240\159\155\151" .. -- U+1F680-U+1F6D7 (🚀-🛗)
"\240\159\155\156-\240\159\155\172" .. -- U+1F6DC-U+1F6EC (🛜-🛬)
"\240\159\155\176-\240\159\155\188" .. -- U+1F6F0-U+1F6FC (🛰-🛼)
"\240\159\159\160-\240\159\159\171" .. -- U+1F7E0-U+1F7EB (🟠-🟫)
"\240\159\159\176" .. -- U+1F7F0 (🟰)
"\240\159\164\140-\240\159\169\147" .. -- U+1F90C-U+1FA53 (🤌-🩓)
"\240\159\169\160-\240\159\169\173" .. -- U+1FA60-U+1FA6D (🩠-🩭)
"\240\159\169\176-\240\159\169\188" .. -- U+1FA70-U+1FA7C (🩰-🩼)
"\240\159\170\128-\240\159\170\137" .. -- U+1FA80-U+1FA89 (🪀-🪉)
"\240\159\170\143-\240\159\171\134" .. -- U+1FA8F-U+1FAC6 (🪏-🫆)
"\240\159\171\142-\240\159\171\156" .. -- U+1FACE-U+1FADC (🫎-🫜)
"\240\159\171\159-\240\159\171\169" .. -- U+1FADF-U+1FAE9 (🫟-🫩)
"\240\159\171\176-\240\159\171\184" -- U+1FAF0-U+1FAF8 (🫰-🫸)
 
local unsupported_characters
-- Lazy loader for the `unsupported_characters` lookup. The data table in [[Module:links/data]] is inverted, so that
-- looking up an escape (e.g. the "`...`" sequences found in unsupported-title pagenames) yields the key it was
-- stored under. Call sites use `unsupported_characters or get_unsupported_characters()`, so this runs at most once:
-- it fills the shared upvalue and then sets its own name to nil.
local function get_unsupported_characters()
	local inverted = {}
	-- Publish the table and retire the loader before filling it, exactly as a single multiple assignment.
	unsupported_characters, get_unsupported_characters = inverted, nil
	for original, escape in pairs(load_data("Module:links/data").unsupported_characters) do
		inverted[escape] = original
	end
	return inverted
end


local unsupported_characters = {}
-- Get the list of unsupported titles and invert it (so the keys are pagenames and values are canonical titles).
for k, v in pairs(require("Module:links/data").unsupported_characters) do
local unsupported_titles
unsupported_characters[v] = k
-- Lazy loader for the `unsupported_titles` lookup. Inverts the `unsupported_titles` table from
-- [[Module:links/data]], so that each value of that table becomes a key pointing back at its original key
-- (pagename -> canonical title, per the documentation of `data.unsupported_titles`). Call sites use
-- `unsupported_titles or get_unsupported_titles()`; after the first call the loader sets its own name to nil.
local function get_unsupported_titles()
	local by_pagename = {}
	-- Store the cache and clear the loader in one step before populating the table.
	unsupported_titles, get_unsupported_titles = by_pagename, nil
	local source = load_data("Module:links/data").unsupported_titles
	for canonical, stored_name in pairs(source) do
		by_pagename[stored_name] = canonical
	end
	return by_pagename
end
end


-- Get the list of unsupported titles and invert it (so the keys are pagenames and values are canonical titles).
-- To save on memory, we only cache names with either non-ASCII characters in them or ASCII characters to be removed or
local unsupported_titles = {}
-- transformed (apostrophe, double quote, hyphen).
for k, v in pairs(require("Module:links/data").unsupported_titles) do
local L2_sort_key_cache = {}
unsupported_titles[v] = k
 
--[==[
Return the key under which the L2 heading name `L2` is compared with other L2 headings.

* "Translingual" and "English" get the fixed keys "\1" and "\2" respectively.
* A name made up only of ASCII bytes other than tab..carriage return, the double quote, the apostrophe and the
  hyphen is returned unchanged.
* Any other name is decomposed to NFD; runs of characters from `comb_chars_all` plus the apostrophe-like characters
  are deleted; runs of whitespace and hyphens are collapsed to a single space; and the result is recomposed to NFC.
  These computed keys are memoized in `L2_sort_key_cache`.
]==]
function export.get_L2_sort_key(L2)
	if L2 == "Translingual" then
		return "\1"
	end
	if L2 == "English" then
		return "\2"
	end
	-- Fast path: nothing in the name would be altered by the normalization below.
	if match(L2, "^[%z\1-\b\14-!#-&(-,.-\127]+$") then
		return L2
	end
	local cached = L2_sort_key_cache[L2]
	if cached then
		return cached
	end
	-- Each ugsub call also returns a substitution count; only the string result is kept.
	local decomposed = toNFD(L2)
	local stripped = ugsub(decomposed, "[" .. comb_chars_all .. "'\"ʻʼ]+", "")
	local collapsed = ugsub(stripped, "[%s%-]+", " ")
	local sort_key = toNFC(collapsed)
	L2_sort_key_cache[L2] = sort_key
	return sort_key
end
end


Line 466: Line 698:
   with {mw.ustring.find()}.
   with {mw.ustring.find()}.
* `L2_list`: Ordered list of L2 headings on the page, with the extra key `n` that gives the length of the list.
* `L2_list`: Ordered list of L2 headings on the page, with the extra key `n` that gives the length of the list.
* `L2_sections`: Lookup table of L2 headings on the page, where the key is the section number assigned by the preprocessor, and the value is the L2 heading name. Once an invocation has got its actual section number from get_current_section in [[Module:utilities]], it can use this table to determine its parent L2. TODO: We could expand this to include subsections, to check POS headings are correct etc.
* `L2_sections`: Lookup table of L2 headings on the page, where the key is the section number assigned by the preprocessor, and the value is the L2 heading name. Once an invocation has got its actual section number from get_current_L2 in [[Module:pages]], it can use this table to determine its parent L2. TODO: We could expand this to include subsections, to check POS headings are correct etc.
* `unsupported_titles`: Map from pagenames to canonical titles for unsupported-title pages.
* `unsupported_titles`: Map from pagenames to canonical titles for unsupported-title pages.
* `namespace`: Namespace of the pagename.
* `namespace`: Namespace of the pagename.
* `ns`: Namespace table for the page from mw.site.namespaces (TODO: merge with `namespace` above).
* `ns`: Namespace table for the page from mw.site.namespaces (TODO: merge with `namespace` above).
* `full_raw_pagename`: Full version of the '''RAW''' pagename (i.e. unsupported-title pages aren't canonicalized);
* `full_raw_pagename`: Full version of the '''RAW''' pagename (i.e. unsupported-title pages aren't canonicalized);
   including the namespace and the root (portion before the slash).
   including the namespace and the base (portion before the slash).
* `pagename`: Canonicalized subpage portion of the pagename (unsupported-title pages are canonicalized).
* `pagename`: Canonicalized subpage portion of the pagename (unsupported-title pages are canonicalized).
* `pagename_with_base`: Same as `pagename` in the main namespace; otherwise, the whole pagename without the namespace.
* `decompose_pagename`: Equivalent of `pagename` in NFD decomposition.
* `decompose_pagename`: Equivalent of `pagename` in NFD decomposition.
* `pagename_len`: Length of `pagename` in Unicode chars, where combinations of spacing character + decomposed diacritic
* `pagename_len`: Length of `pagename` in Unicode chars, where combinations of spacing character + decomposed diacritic
Line 483: Line 716:
* `wikitext_topic_cat`: FIXME: Document me.
* `wikitext_topic_cat`: FIXME: Document me.
* `wikitext_langname_cat`: FIXME: Document me.
* `wikitext_langname_cat`: FIXME: Document me.
`no_fetch_content` says to not fetch and parse the content or set a DEFAULTSORT sort key, in order to save time on
test and documentation pages that have lots of template invocations that set `|pagename=`. It turns out nearly all the
time of this function is contained in the line `frame:callParserFunction("DEFAULTSORT", data.pagename_defaultsort)`,
so we skip it on test and documentation pages where it accomplishes nothing in any case.
]==]
]==]


function export.process_page(pagename)
function export.process_page(pagename, no_fetch_content)
local data = {
local data = {
comb_chars = comb_chars,
comb_chars = comb_chars,
emoji_pattern = "[" .. emoji_chars .. "]",
emoji_pattern = "[" .. emoji_chars .. "]",
unsupported_titles = unsupported_titles
unsupported_titles = unsupported_titles or get_unsupported_titles()
}
}
Line 499: Line 737:
local function bad_pagename()
local function bad_pagename()
if not pagename then
if not pagename then
error("Internal error: Something wrong, `data.pagename` not specified but current title containg illegal characters")
error("Internal error: Something wrong, `data.pagename` not specified but current title contains illegal characters")
else
else
error(("Bad value for `data.pagename`: '%s', which must not contain illegal characters"):format(pagename))
error(format("Bad value for `data.pagename`: '%s', which must not contain illegal characters", pagename))
end
end
end
end
Line 512: Line 750:
raw_title = mw.title.getCurrentTitle()
raw_title = mw.title.getCurrentTitle()
end
end
data.namespace = raw_title.nsText
 
local nsText = raw_title.nsText
local namespace_is_reconstruction = nsText == "Reconstruction"
data.namespace = nsText
data.ns = mw.site.namespaces[raw_title.namespace]
data.ns = mw.site.namespaces[raw_title.namespace]
data.full_raw_pagename = raw_title.fullText
local full_raw_pagename = raw_title.fullText
data.full_raw_pagename = full_raw_pagename


local frame = mw.getCurrentFrame()
local frame = mw.getCurrentFrame()
-- WARNING: `content` may be nil, e.g. if we're substing a template like {{ja-new}} on a not-yet-created page
-- WARNING: `content` may be nil, e.g. if we're substing a template like {{ja-new}} on a not-yet-created page
-- or if the module specifies the subpage as `data.pagename` (which many modules do) and we're in an Appendix
-- or if the module specifies the subpage as `data.pagename` (which many modules do) and we're in an Appendix
-- or other non-mainspace page. We used to make the latter an error but there are too many modules that do it,
-- or other non-mainspace page. We used to make the latter an error but there are too many modules that do it,
-- and substing on a nonexistent page is totally legit, and we don't actually need to be able to access the
-- and substing on a nonexistent page is totally legit, and we don't actually need to be able to access the
-- content of the page.
-- content of the page.
local content = raw_title:getContent()
local content = not no_fetch_content and raw_title:getContent() or nil
local content_lang = mw.getContentLanguage()


--Get the pagename.
-- Get the pagename.
pagename = raw_title.subpageText
pagename = physical_to_logical_pagename_if_mammoth(raw_title)
:gsub("^Unsupported titles/(.*)", function(m)
pagename = gsub(pagename, "^Unsupported titles/(.+)", function(m)
insert(cats, "Unsupported titles")
insert(cats, "Unsupported titles")
return unsupported_titles[m] or (m:gsub("`.-`", unsupported_characters))
local title = (unsupported_titles or get_unsupported_titles())[m]
end)
if title then
-- Save pagename, as local variable will be destructively modified.
return title
end
-- Substitute pairs of "`". Those not used for escaping should be escaped as "`grave`", but might not be,
-- so if a pair don't form a match, the closing "`" should become the opening "`" of the next match attempt.
-- This has to be done manually, instead of using gsub.
local open_pos = find(m, "`")
if not open_pos then
return m
end
title = {sub(m, 1, open_pos - 1)}
while true do
local close_pos = find(m, "`", open_pos + 1)
if not close_pos then
-- Add "`" plus any remaining characters.
insert(title, sub(m, open_pos))
break
end
local escape = sub(m, open_pos, close_pos)
local ch = (unsupported_characters or get_unsupported_characters())[escape]
-- Match found, so substitute the character and move to the first "`" after the match if found, or
-- otherwise return.
if ch then
insert(title, ch)
local nxt_pos = close_pos + 1
open_pos = find(m, "`", nxt_pos)
-- Add any characters between the match and the next "`" or end.
if open_pos then
insert(title, sub(m, nxt_pos, open_pos - 1))
else
insert(title, sub(m, nxt_pos))
break
end
-- Match not found, so make the closing "`" the opening "`" of the next attempt.
else
-- Add the failed match, except for the closing "`".
insert(title, sub(m, open_pos, close_pos - 1))
open_pos = close_pos
end
end
return concat(title)
end)
-- Save pagename, as the local variable will be destructively modified.
data.pagename = pagename
data.pagename = pagename
if nsText == "" then
data.pagename_with_base = pagename
else
data.pagename_with_base = raw_title.text
end
-- Decompose the pagename in Unicode normalization form D.
-- Decompose the pagename in Unicode normalization form D.
data.decompose_pagename = toNFD(pagename)
data.decompose_pagename = toNFD(pagename)
Line 544: Line 832:
end
end
pagename = ugsub(pagename, comb_chars.combined_double, explode)
pagename = ugsub(pagename, comb_chars.combined_double, explode)
pagename = ugsub(pagename, comb_chars.combined_single, explode)
pagename = gsub(ugsub(pagename, comb_chars.combined_single, explode), ".[\128-\191]*", explode)
:gsub(".[\128-\191]*", explode)


data.explode_pagename = explode_pagename
data.explode_pagename = explode_pagename
Line 552: Line 839:
-- Generate DEFAULTSORT.
-- Generate DEFAULTSORT.
data.encoded_pagename = encode_entities(data.pagename)
data.encoded_pagename = encode_entities(data.pagename)
data.pagename_defaultsort = require("Module:languages").getByCode("mul"):makeSortKey(data.encoded_pagename)
data.pagename_defaultsort = get_lang("mul"):makeSortKey(data.encoded_pagename)
frame:callParserFunction(
if not no_fetch_content then
"DEFAULTSORT",
frame:callParserFunction("DEFAULTSORT", data.pagename_defaultsort)
data.pagename_defaultsort
end
)
data.raw_defaultsort = uupper(raw_title.text)
data.raw_defaultsort = raw_title.text:uupper()
-- Make `L2_list` and `L2_sections`, note raw wikitext use of {{DEFAULTSORT:}} and {{DISPLAYTITLE:}}, then add categories if any unwanted L1 headings are found, the L2 headings are in the wrong order, or they don't match a canonical language name.
-- Make `L2_list` and `L2_sections`, note raw wikitext use of {{DEFAULTSORT:}} and {{DISPLAYTITLE:}}, then add categories if any unwanted L1 headings are found, the L2 headings are in the wrong order, or they don't match a canonical language name.
-- Note: HTML comments shouldn't be removed from `content` until after this step, as they can affect the result.
-- Note: HTML comments shouldn't be removed from `content` until after this step, as they can affect the result.
do
do
local L2_list, L2_list_len, L2_sections, sort_cache, prev = {}, 0, {}, {}
local L2_list, L2_list_len, L2_sections = {}, 0, {}
local defaultsort, displaytitle, page_has_L1, L2_wrong_order, L2_nonstandard, page_has_arg
local prev, rc
local new_cats, L2_wrong_order = {}
local function get_weight(L2)
local function handle_heading(heading)
if L2 == "Translingual" then
local level = heading.level
return "\1"
elseif L2 == "English" then
return "\2"
elseif match(L2, "^[%z\1-\b\14-!#-&(-,.-\127]+$") then
return L2
end
local weight = sort_cache[L2]
if weight then
return weight
end
weight = toNFC(ugsub(ugsub(toNFD(L2), "[" .. comb_chars_all .. "'\"ʻʼ]+", ""), "[%s%-]+", " "))
sort_cache[L2] = weight
return weight
end
local function handle_heading(node)
local level = node.level
if level > 2 then
if level > 2 then
return
return
end
end
local name = node:get_name()
local name = heading:get_name()
-- Check there are no newline characters in the heading, which might appear after preprocessing (e.g. from an expanded template). In such cases, the preprocessor section count still increments (since it's calculated pre-expansion), but the heading will fail, so we shouldn't increment the L2 count.
-- heading:get_name() will return nil if there are any newline characters in the preprocessed heading name (e.g. from an expanded template). In such cases, the preprocessor section count still increments (since it's calculated pre-expansion), but the heading will fail, so the L2 count shouldn't be incremented.
if find(name, "\n", 1, true) then
if name == nil then
return
return
end
end
L2_list_len = L2_list_len + 1
L2_list_len = L2_list_len + 1
L2_list[L2_list_len] = name
L2_list[L2_list_len] = name
L2_sections[node.section] = name
L2_sections[heading.section] = name
-- We also add any L1s, since they terminate the preceding L2, but add a maintenance category since it's probably a mistake.
-- Also add any L1s, since they terminate the preceding L2, but add a maintenance category since it's probably a mistake.
if level == 1 then
if level == 1 then
page_has_L1 = true
new_cats["Pages with unwanted L1 headings"] = true
end
end
-- Check the heading is in the right order.
-- Check the heading is in the right order.
Line 603: Line 873:
if prev and not (
if prev and not (
L2_wrong_order or
L2_wrong_order or
string_sort(get_weight(prev), get_weight(name))
string_compare(export.get_L2_sort_key(prev), export.get_L2_sort_key(name))
) then
) then
new_cats["Pages with language headings in the wrong order"] = true
L2_wrong_order = true
L2_wrong_order = true
end
end
-- Check it's a canonical language name.
-- Check it's a canonical language name.
if not langnames[name] then
if not (langnames or get_langnames())[name] then
L2_nonstandard = true
new_cats["Pages with nonstandard language headings"] = true
end
end
prev = name
prev = name
end
end
local function handle_template(node)
local function handle_template(template)
local name = node:get_name()
-- Turn off redirect checking except in the Reconstruction namespace because the rc flag is only
-- used in the Reconstruction namespace and the other names are parser functions, which AFAIK can't
-- be redirected to.
local name = template:get_name(nil, not namespace_is_reconstruction and "no_redirect" or nil)
if name == "DEFAULTSORT:" then
if name == "DEFAULTSORT:" then
defaultsort = true
new_cats["Pages with DEFAULTSORT conflicts"] = true
elseif name == "DISPLAYTITLE:" then
elseif name == "DISPLAYTITLE:" then
displaytitle = true
new_cats["Pages with DISPLAYTITLE conflicts"] = true
elseif name == "reconstructed" then
rc = true
end
end
end
end
if content then
if content then
for node in require("Module:template parser").parse(content):__pairs("next_node") do
for node in parse(content):iterate_nodes() do
local node_type = type_or_class(node)
local node_class = class_else_type(node)
if node_type == "heading" then
if node_class == "heading" then
handle_heading(node)
handle_heading(node)
elseif node_type == "template" and not (defaultsort and displaytitle) then
elseif node_class == "template" then
handle_template(node)
handle_template(node)
elseif node_type == "argument" then
elseif node_class == "parameter" then
page_has_arg = true
new_cats["Pages with raw triple-brace template parameters"] = true
end
end
end
end
Line 639: Line 915:
data.L2_list = L2_list
data.L2_list = L2_list
data.L2_sections = L2_sections
data.L2_sections = L2_sections
 
insert(cats, get_category("Pages with entries"))
insert(cats, get_category(format("Pages with %s entr%s", L2_list_len, L2_list_len == 1 and "y" or "ies")))
for cat in pairs(new_cats) do
--insert(cats, get_category(cat))
end
if namespace_is_reconstruction and not rc then
local langname = match(full_raw_pagename, "^Reconstruction:([^/]+)/.")
if langname then
insert(cats, get_category(langname .. " entries missing Template:reconstructed"))
end
end
end
end


Line 676: Line 964:
end
end
t_lang[uupper(decode_entities(sortkey))] = true
t_lang[uupper(decode_entities(sortkey))] = true
end
local function do_iteration(name, sortkey, wikitext_langname_cat)
if langnames[name] then
return add_cat_table(wikitext_langname_cat, name, sortkey)
end
name = etym_langnames[name] and name or content_lang:lcfirst(name)
if etym_langnames[name] then
name = get_etym_lang(name):getFullName()
return add_cat_table(wikitext_langname_cat, name, sortkey)
end
end
end
Line 711: Line 988:
local code = match(cat, "^([%w%-.]+):")
local code = match(cat, "^([%w%-.]+):")
if code then
if code then
return add_cat_table(wikitext_topic_cat, code, sortkey)
add_cat_table(wikitext_topic_cat, code, sortkey)
return
end
end
-- Split by word.
-- Split by word.
cat = split(cat, " ", true, true)
cat = split(cat, " ", true, true)
-- Iterate over the category name, starting with the longest possible name and shaving off the first word until we find one. We do it this way because:
-- Formerly we looked for the language name anywhere in the category. This is simply wrong
-- (a) Going from shortest to longest risks falsely matching (e.g.) German Low German categories as German.
-- because there are no categories like 'Alsatian French lemmas' (only L2 languages
-- (b) Checking the start of category names first risks falsely matching (e.g.) Alsatian French as Alsatian (a variety of Alemannic German), not French.
-- have langname categories), but doing it this way wrongly catches things like [[Category:Shapsug Adyghe]]
-- If no matches are found, then check the start of the category name, shaving off the last word each iteration.
-- in [[Category:Adyghe entries with language name categories using raw markup]].
local cat_len, n, name, done = #cat, 1
local n = #cat - 1
repeat
name = concat(cat, " ", n, cat_len)
done = do_iteration(name, sortkey, wikitext_langname_cat)
if done then
return
end
n = n + 1
until n > cat_len
n = cat_len - 1
if n <= 0 then
if n <= 0 then
return
return
end
end
-- Go from longest to shortest and stop once we've found a language name. Going from shortest
-- to longest or not stopping after a match risks falsely matching (e.g.) German Low German
-- categories as German.
repeat
repeat
name = concat(cat, " ", 1, n)
local name = concat(cat, " ", 1, n)
done = do_iteration(name, sortkey, wikitext_langname_cat)
if (langnames or get_langnames())[name] then
if done then
add_cat_table(wikitext_langname_cat, name, sortkey)
return
return
end
end
Line 769: Line 1,041:
data.wikitext_topic_cat = wikitext_topic_cat
data.wikitext_topic_cat = wikitext_topic_cat
data.wikitext_langname_cat = wikitext_langname_cat
data.wikitext_langname_cat = wikitext_langname_cat
if raw_sortkey then
insert(cats, get_category("Pages with raw sortkeys"))
end
end
end
 
data.cats = {}
return data
return data
end
end


return export
return export