Module:Unicode data: Difference between revisions

From Linguifex
Jump to navigation Jump to search
No edit summary
No edit summary
 
Line 1: Line 1:
local p = {}
local m_str_utils = require("Module:string utilities")
 
local cp = m_str_utils.codepoint
local gcodepoint = m_str_utils.gcodepoint
local gsub = string.gsub
local u = m_str_utils.char
 
local export = {}
local udata = mw.loadData("Module:Unicode data/data")


local floor = math.floor
local floor = math.floor


local function errorf(level, ...)
local function errorf(first_arg, ...)
if type(level) == "number" then
if type(first_arg) == "number" then
return error(string.format(...), level + 1)
return error(string.format(...), first_arg + 1)
else -- level is actually the format string.
else
return error(string.format(level, ...), 2)
return error(string.format(first_arg, ...), 2)
end
end
end
end
Line 13: Line 21:
local function binary_range_search(codepoint, ranges)
local function binary_range_search(codepoint, ranges)
local low, mid, high
local low, mid, high
low, high = 1, ranges.length or require "Module:TableTools".length(ranges)
low, high = 1, ranges.length or require "Module:table".length(ranges)
while low <= high do
while low <= high do
mid = floor((low + high) / 2)
mid = floor((low + high) / 2)
Line 27: Line 35:
return nil, mid
return nil, mid
end
end
p.binary_range_search = binary_range_search
export.binary_range_search = binary_range_search


--[[
local function linear_range_search(codepoint, ranges)
local function linear_range_search(codepoint, ranges)
for i, range in ipairs(ranges) do
for i, range in ipairs(ranges) do
if range[1] <= codepoint and codepoint <= range[2] then
if codepoint < range[1] then
break
elseif codepoint <= range[2] then
return range
return range
end
end
end
end
end
end
--]]


-- Load a module by indexing "loader" with the name of the module minus the
-- Load a module by indexing "loader" with the name of the module minus the
Line 57: Line 65:
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
-- https://www.unicode.org/versions/latest/ch03.pdf
-- For most of the name rules given here, see the subsection
-- "Unicode Name Property" in section 4.8 (Name) and the table 4-8
-- (Name Derivation Rule Prefix Strings):
-- https://www.unicode.org/versions/latest/ch04.pdf
local name_hooks = {
local name_hooks = {
{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
{   0x0000,   0x001F, "<control-%04X>" }, -- C0 control characters
{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
{   0x007F,   0x009F, "<control-%04X>" }, -- DEL and C1 control characters
{  0x3400,  0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
{  0x3400,  0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
{  0x4E00,  0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
{  0x4E00,  0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
Line 80: Line 92:
{  0xF900,  0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0xF900,  0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0xFA70,  0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0xFA70,  0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
{  0xFDD0,  0xFDEF, "<noncharacter-%04X>" },
{  0xFE00,  0xFE0F, function (codepoint) -- Variation Selectors Supplement
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xFE00 + 1)
end},
    {  0x13460,  0x143FA, "EGYPTIAN HIEROGLYPH-%04X" }, -- Egyptian Hieroglyphs Extended-A
{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut
{  0x18800,  0x18AFF, function (codepoint)
{  0x18800,  0x18AFF, function (codepoint)
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
end },
end },
{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
{  0x18B00,  0x18CD5, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script
{  0x18CFF,  0x18CFF, "KHITAN SMALL SCRIPT CHARACTER-%04X" },
{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Supplement
{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
Line 91: Line 110:
{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
end},
end},
{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
{ 0x100000, 0x10FFFD, "<private-use-%04X>" } -- Plane 16 Private Use
{ 0x100000, 0x10FFFD, "<private-use-%04X>" }, -- Plane 16 Private Use
}
}
name_hooks.length = #name_hooks
name_hooks.length = #name_hooks
--[[ Add another - in this line to test the code point ordering in name_hooks.
local i = 1
local function print_it(a, b, c)
if type(c) == "string" then
mw.log(c:format(a), c:format(b))
else
mw.log(c(a), c(b))
end
end
while true do
local first, second = name_hooks[i], name_hooks[i + 1]
if not (first and second) then break end
local message
if not (first[1] < first[2] and first[2] < second[1] and second[1] < second[2]) then
message = "Bad name label ordering at index " .. i .. ":"
elseif second[1] == first[2] + 1 and second[3] == first[3] then
message = "Name hooks can be merged at index " .. i .. ":"
end
if message then
mw.log(message)
print_it(unpack(first))
print_it(unpack(second))
end
i = i + 1
end
--]]


local name_range_cache
local name_range_cache
Line 126: Line 171:
--]]
--]]


function p.is_noncharacter(codepoint)
-- https://www.unicode.org/versions/latest/ch04.pdf, section 4.8
function export.lookup_name(codepoint)
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
-- (Cn) and specifically noncharacters:
-- (Cn) and specifically noncharacters:
-- https://www.unicode.org/faq/private_use.html#nonchar4
-- https://www.unicode.org/faq/private_use.html#nonchar4
return 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
if codepoint >= 0xFFFE and floor(codepoint % 0x10000) >= 0xFFFE then
or floor(codepoint % 0x10000) >= 0xFFFE)
end
 
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.lookup_name(codepoint)
if p.is_noncharacter(codepoint) then
return ("<noncharacter-%04X>"):format(codepoint)
return ("<noncharacter-%04X>"):format(codepoint)
end
end
Line 167: Line 206:
end
end


function p.lookup_image(codepoint)
function export.lookup_image(codepoint)
local data = loader[('images/%03X'):format(codepoint / 0x1000)]
local data = loader[('images/%03X'):format(codepoint / 0x1000)]
Line 175: Line 214:
end
end


local planes = {
function export.lookup_image_emoji(codepoint)
[ 0] = "Basic Multilingual Plane";
local data = loader[('emoji_images/%03X'):format(codepoint / 0x1000)]
[ 1] = "Supplementary Multilingual Plane";
[ 2] = "Supplementary Ideographic Plane";
if data then
[ 3] = "Tertiary Ideographic Plane";
return data[codepoint]
[14] = "Supplementary Special-purpose Plane";
end
[15] = "Supplementary Private Use Area-A";
end
[16] = "Supplementary Private Use Area-B";
}


-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
Line 193: Line 230:
if data then
if data then
-- Unpack doesn't work on tables loaded with mw.loadData.
-- Unpack doesn't work on tables loaded with mw.loadData.
return i, data[1], data[2], data[3]
return i, data[3], data[1], data[2]
end
end
end
end


-- An ipairs-type iterator generator for the list of blocks.
-- An ipairs-type iterator generator for the list of blocks.
function p.enum_blocks()
function export.enum_blocks()
local blocks = loader.blocks
local blocks = loader.blocks
return block_iter, blocks, 0
return block_iter, blocks, 0
end
end


function p.lookup_plane(codepoint)
function export.get_block_range(name)
for i, block in ipairs(loader.blocks) do
if block[3] == name then
return block[1], block[2]
end
end
end
 
function export.lookup_plane(codepoint)
local i = floor(codepoint / 0x10000)
local i = floor(codepoint / 0x10000)
return planes[i] or ("Plane %u"):format(i)
return udata.planes[i] or ("Plane %u"):format(i)
end
end


function p.lookup_block(codepoint)
function export.lookup_block(codepoint)
local blocks = loader.blocks
local blocks = loader.blocks
local range = binary_range_search(codepoint, blocks)
local range = binary_range_search(codepoint, blocks)
Line 218: Line 263:
end
end


function p.get_block_info(name)
function export.get_block_info(name)
for i, block in ipairs(loader.blocks) do
for i, block in ipairs(loader.blocks) do
if block[3] == name then
if block[3] == name then
Line 226: Line 271:
end
end


function p.is_valid_pagename(pagename)
function export.is_valid_pagename(pagename)
local has_nonws = false
local has_nonws = false


for cp in mw.ustring.gcodepoint(pagename) do
for codepoint in gcodepoint(pagename) do
if (cp == 0x0023) -- #
if (codepoint == 0x0023) -- #
or (cp == 0x005B) -- [
or (codepoint == 0x005B) -- [
or (cp == 0x005D) -- ]
or (codepoint == 0x005D) -- ]
or (cp == 0x007B) -- {
or (codepoint == 0x007B) -- {
or (cp == 0x007C) -- |
or (codepoint == 0x007C) -- |
or (cp == 0x007D) -- }
or (codepoint == 0x007D) -- }
or (cp == 0x180E) -- MONGOLIAN VOWEL SEPARATOR
or (codepoint == 0x180E) -- MONGOLIAN VOWEL SEPARATOR
or ((cp >= 0x2000) and (cp <= 0x200A)) -- spaces in General Punctuation block
or ((codepoint >= 0x2000) and (codepoint <= 0x200A)) -- spaces in General Punctuation block
or (cp == 0xFFFD) -- REPLACEMENT CHARACTER
or (codepoint == 0xFFFD) -- REPLACEMENT CHARACTER
then
then
return false
return false
end
end


local printable, result = p.is_printable(cp)
local printable, result = export.is_printable(codepoint)
if not printable then
if not printable then
return false
return false
Line 287: Line 332:
-- already been found to match, or a range whose data is the default if there
-- already been found to match, or a range whose data is the default if there
-- was no match.
-- was no match.
local function memo_lookup(data_module_subpage, match_func, ...)
local function codepoint_lookup(data_module_subpage, match_func, ...)
local dots = { ... }
local dots = { ... }
local cache = {}
local cache = {}
Line 329: Line 374:
}
}
end
end
table.insert(cache, dots_range)
table.sort(cache, compare_ranges)
table.sort(cache, compare_ranges)
end
end
return match_func(codepoint)
return match_func(codepoint, unpack(dots))
end
end
 
-- Return a character's combining class value from [[Module:Unicode data/combining classes]],
-- or otherwise 0, which is treated as the default value.
do
local combining
function export.combining_class(ch)
combining = combining or mw.loadData("Module:Unicode data/combining classes")
return combining[type(ch) == "number" and u(ch) or ch] or 0
end
end
end
end


-- Get a code point's combining class value in [[Module:Unicode data/combining]],
-- FIXME: Some combining characters have a combining class of 0, so this needs rethinking.
-- and return whether this value is not zero. Zero is assigned as the default
function export.is_combining(ch)
-- if the combining class value is not found in this data module.
return export.combining_class(ch) ~= 0
-- That is, return true if character is combining, or false if it is not.
end
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup(
"combining",
function (codepoint, combining_class)
return combining_class and combining_class ~= 0 or false
end,
0)


function p.add_dotted_circle(str)
do
return (mw.ustring.gsub(str, ".",
local function dotted_circle(ch)
function(char)
if export.combining_class(ch) ~= 0 then
if p.is_combining(mw.ustring.codepoint(char)) then
return "" .. ch
return '' .. char
end
end
end
end))
function export.add_dotted_circle(str)
return (gsub(str, ".[\128-\191]*", dotted_circle))
end
end
end


local lookup_control = memo_lookup(
local lookup_control = codepoint_lookup(
"control",
"control",
function (codepoint, ccc)
function (codepoint, ccc)
Line 364: Line 416:
end,
end,
"assigned")
"assigned")
p.lookup_control = lookup_control
export.lookup_control = lookup_control


function p.is_assigned(codepoint)
function export.is_assigned(codepoint)
return lookup_control(codepoint) ~= "unassigned"
return lookup_control(codepoint) ~= "unassigned"
end
end


function p.is_printable(codepoint)
function export.is_printable(codepoint)
local result = lookup_control(codepoint)
local result = lookup_control(codepoint)
return (result == "assigned") or (result == "space-separator"), result
return (result == "assigned") or (result == "space-separator"), result
end
end


function p.is_whitespace(codepoint)
function export.is_whitespace(codepoint)
local result = lookup_control(codepoint)
local result = lookup_control(codepoint)
return (result == "space-separator"), result
return (result == "space-separator"), result
end
end


p.lookup_category = memo_lookup(
export.lookup_category = codepoint_lookup(
"category",
"category",
function (codepoint, category)
function (codepoint, category)
return category
return category
end,
end,
"Cn")
"Cn"
)
 
function export.get_category_long_name(category)
return loader["category"].long_names[category]
end


local lookup_script = memo_lookup(
export.lookup_script = codepoint_lookup(
"scripts",
"scripts",
function (codepoint, script_code)
function (codepoint, script)
return script_code or 'Zzzz'
return script
end,
end,
"Zzzz")
"Zzzz"
p.lookup_script = lookup_script
)
 
function p.get_best_script(str)
-- Check type of argument, because mw.text.decode coerces numbers to strings!
require "libraryUtil".checkType("get_best_script", 1, str, "string")
-- Convert HTML character references (including named character references,
-- or character entities) to characters.
str = mw.text.decode(str, true)
local scripts = {}
for codepoint in mw.ustring.gcodepoint(str) do
local script = lookup_script(codepoint)
-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts.
if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then
scripts[script] = true
end
end
-- If scripts does not contain two or more keys,
-- return first and only key (script code) in table.
if not next(scripts, next(scripts)) then
return next(scripts)
end -- else return majority script, or else "Zzzz"?
end
 
function p.is_Latin(str)
require "libraryUtil".checkType("get_best_script", 1, str, "string")
str = mw.text.decode(str, true)
-- Search for the leading bytes that introduce the UTF-8 encoding of the
-- code points U+0340-U+10FFFF. If they are not found and there is at least
-- one Latin-script character, the string counts as Latin, because the rest
-- of the characters can only be Zyyy, Zinh, and Zzzz.
-- The only scripts found below U+0370 (the first code point of the Greek
-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
-- See the codepage in the [[UTF-8]] article.
if not str:find "[\205-\244]" then
for codepoint in mw.ustring.gcodepoint(str) do
if lookup_script(codepoint) == "Latn" then
return true
end
end
end
local Latn = false
local i = 0; -- indexer for use in error messages
for codepoint in mw.ustring.gcodepoint(str) do
i = i + 1; -- bump the indexer
local script = lookup_script(codepoint)
if script == "Latn" then
Latn = true
elseif not (script == "Zyyy" or script == "Zinh"
or script == "Zzzz") then
return false, i -- abandon as not Latn; identify the offending character's position
end
end
return Latn, (not Latn and i) or nil -- when <Latn> false, return offending charactor's position as second return value; nil else
end


-- Checks that a string contains only characters belonging to right-to-left
function export.get_script_alias(script)
-- scripts, or characters of ignorable scripts.
return loader["scripts"].aliases[script]
function p.is_rtl(str)
require "libraryUtil".checkType("get_best_script", 1, str, "string")
str = mw.text.decode(str, true)
-- Search for the leading bytes that introduce the UTF-8 encoding of the
-- code points U+0580-U+10FFFF. If they are not found, the string can only
-- have characters from a left-to-right script, because the first code point
-- in a right-to-left script is U+0591, in the Hebrew block.
if not str:find "[\214-\244]" then
return false
end
local result = false
local rtl = loader.scripts.rtl
for codepoint in mw.ustring.gcodepoint(str) do
local script = lookup_script(codepoint)
if rtl[script] then
result = true
elseif not (script == "Zyyy" or script == "Zinh"
or script == "Zzzz") then
return false
end
end
return result
end
end


 
function export.get_entry_title(codepoint)
--[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------
if udata.unsupported_title[codepoint] then
 
return udata.unsupported_title[codepoint]
external entry from an {{#invoke:}} to determine if a string of text is rtl.  Strips html and html-like tags so
that those tags don't corrupt the is-rtl-is-not-rtl determination; this added for the cases where the rtl text
has <br /> tags.
 
]]
 
function p.is_rtl_frame (frame)
local str = frame.args[1]; -- get the string from the {{#invoke:}} frame
str = str:gsub ('%b<>', ''); -- strip any html and html-like tags
return p.is_rtl (str); -- return if whatever remains rtl; false else
end
 
 
local function get_codepoint(args, arg)
local codepoint_string = args[arg]
or errorf(2, "Parameter %s is required", tostring(arg))
local codepoint = tonumber(codepoint_string, 16)
or errorf(2, "Parameter %s is not a code point in hexadecimal base",
tostring(arg))
if not (0 <= codepoint and codepoint <= 0x10FFFF) then
errorf(2, "code point in parameter %s out of range", tostring(arg))
end
end
return codepoint
if lookup_control(codepoint) ~= "assigned" then
end
return nil
 
local function get_func(args, arg, prefix)
local suffix = args[arg]
or errorf(2, "Parameter %s is required", tostring(arg))
suffix = mw.text.trim(suffix)
local func_name = prefix .. suffix
local func = p[func_name]
or errorf(2, "There is no function '%s'", func_name)
return func
end
 
-- This function allows any of the "lookup" functions to be invoked. The first
-- parameter is the word after "lookup_"; the second parameter is the code point
-- in hexadecimal base.
function p.lookup(frame)
local func = get_func(frame.args, 1, "lookup_")
local codepoint = get_codepoint(frame.args, 2)
local result = func(codepoint)
if func == p.lookup_name then
-- Prevent code point labels such as <control-0000> from being
-- interpreted as HTML tags.
result = result:gsub("<", "&lt;")
end
return result
end
 
function p.is(frame)
local func = get_func(frame.args, 1, "is_")
-- is_Latin and is_valid_pagename take strings.
if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then
return (func(frame.args[2]))
else -- The rest take code points.
local codepoint = get_codepoint(frame.args, 2)
return (func(codepoint)) -- Adjust to one result.
end
end
return u(codepoint)
end
end


return p
return export

Latest revision as of 13:00, 11 January 2025

Documentation for this module may be created at Module:Unicode data/doc

local m_str_utils = require("Module:string utilities")

local cp = m_str_utils.codepoint
local gcodepoint = m_str_utils.gcodepoint
local gsub = string.gsub
local u = m_str_utils.char

local export = {}
local udata = mw.loadData("Module:Unicode data/data")

local floor = math.floor

-- Raise an error whose message is built with string.format.
-- If the first argument is a number it is the error level (relative to the
-- caller of errorf) and the format string follows; otherwise the first
-- argument is itself the format string and the error is attributed to the
-- caller of errorf.
local function errorf(first_arg, ...)
	local message, level
	if type(first_arg) == "number" then
		-- +1 so that the level is counted from errorf's caller.
		message, level = string.format(...), first_arg + 1
	else
		message, level = string.format(first_arg, ...), 2
	end
	return error(message, level)
end

-- Binary search over an array of ranges, each of the form { low, high, ... },
-- sorted by their low end.
-- The number of ranges is taken from ranges.length when present, otherwise
-- it is computed with [[Module:table]].
-- Returns the range containing codepoint and its index, or nil and the index
-- that was probed last (nil if the array is empty).
local function binary_range_search(codepoint, ranges)
	local first = 1
	local last = ranges.length or require("Module:table").length(ranges)
	local probe

	while first <= last do
		probe = floor((first + last) / 2)
		local candidate = ranges[probe]
		if codepoint < candidate[1] then
			-- Target lies below this range: continue in the lower half.
			last = probe - 1
		elseif codepoint > candidate[2] then
			-- Target lies above this range: continue in the upper half.
			first = probe + 1
		else
			return candidate, probe
		end
	end

	return nil, probe
end
-- The local search helper is also published on the export table.
export.binary_range_search = binary_range_search

-- Linear counterpart of binary_range_search: walks the sorted array of
-- { low, high, ... } ranges from the start and returns the one containing
-- codepoint. Gives up as soon as a range starts above codepoint, and returns
-- nothing when no range matches.
local function linear_range_search(codepoint, ranges)
	for _, candidate in ipairs(ranges) do
		local low, high = candidate[1], candidate[2]
		if codepoint < low then
			-- Ranges are sorted, so no later range can match either.
			return
		end
		if codepoint <= high then
			return candidate
		end
	end
end

-- Lazy loader for the data subpages of this module. Indexing "loader" with a
-- subpage name (the part after "Module:Unicode data/") loads that page with
-- mw.loadData; for instance, loader.blocks gives
-- [[Module:Unicode data/blocks]]. The outcome is stored in the table, so each
-- subpage is requested only once. If loading fails, false is stored and
-- returned instead.
local loader = {}
setmetatable(loader, {
	__index = function (cache, subpage)
		local ok, module_data = pcall(mw.loadData, "Module:Unicode data/" .. subpage)
		local entry = false
		if ok then
			entry = module_data
		end
		rawset(cache, subpage, entry)
		return entry
	end,
})

-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/latest/ch03.pdf
-- For most of the name rules given here, see the subsection
-- "Unicode Name Property" in section 4.8 (Name) and the table 4-8
-- (Name Derivation Rule Prefix Strings):
-- https://www.unicode.org/versions/latest/ch04.pdf
-- Each entry is { first code point, last code point, name }, where name is
-- either a format string that receives the code point or a function that is
-- called with the code point (see generate_name).
-- The entries must stay sorted by code point and must not overlap, because
-- export.lookup_name searches this table with binary_range_search.
local name_hooks = {
	{   0x0000,   0x001F, "<control-%04X>" }, -- C0 control characters
	{   0x007F,   0x009F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		-- The name is assembled from the lead, vowel and trail tables of
		-- [[Module:Unicode data/Hangul]], indexed by the syllable's offset
		-- from U+AC00.
		local Hangul_data = loader.Hangul
		local syllable_index = codepoint - 0xAC00

		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	-- CJK Compatibility Ideographs
	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{   0xFDD0,   0xFDEF, "<noncharacter-%04X>" },
	{   0xFE00,   0xFE0F, function (codepoint) -- Variation Selectors
		-- U+FE00 is VARIATION SELECTOR-1, so the number is the offset plus 1.
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xFE00 + 1)
	end},
    {  0x13460,  0x143FA, "EGYPTIAN HIEROGLYPH-%04X" }, -- Egyptian Hieroglyphs Extended-A
	{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut
	{  0x18800,  0x18AFF, function (codepoint) -- Tangut Components
		-- Numbered in decimal starting from 001 at U+18800.
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{  0x18B00,  0x18CD5, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script
	{  0x18CFF,  0x18CFF, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- single code point
	{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Supplement
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
	{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
		-- Continues the numbering of U+FE00-U+FE0F: U+E0100 is VARIATION SELECTOR-17.
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end},
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }, -- Plane 16 Private Use
}
-- binary_range_search reads this field instead of computing the length itself.
name_hooks.length = #name_hooks
--[[ Add another - in this line to test the code point ordering in name_hooks.
local i = 1
local function print_it(a, b, c)
	if type(c) == "string" then
		mw.log(c:format(a), c:format(b))
	else
		mw.log(c(a), c(b))
	end
end
while true do
	local first, second = name_hooks[i], name_hooks[i + 1]
	if not (first and second) then break end
	local message
	if not (first[1] < first[2] and first[2] < second[1] and second[1] < second[2]) then
		message = "Bad name label ordering at index " .. i .. ":"
	elseif second[1] == first[2] + 1 and second[3] == first[3] then
		message = "Name hooks can be merged at index " .. i .. ":"
	end
	if message then
		mw.log(message)
		print_it(unpack(first))
		print_it(unpack(second))
	end
	i = i + 1
end
--]]

-- The name_hooks entry that matched most recently in export.lookup_name.
-- It is checked first on the next call, before the binary search is run.
local name_range_cache

-- Produce a name from the third field of a name_hooks entry: a function is
-- called with the code point, anything else is used as a format string that
-- receives the code point.
local function generate_name(data, codepoint)
	if type(data) ~= "string" then
		return data(codepoint)
	end
	return data:format(codepoint)
end

--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
	require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
	if codepoint < 0 or 0x10FFFF < codepoint then
		errorf("Codepoint %04X out of range", codepoint)
	end
end
--]]

-- Return the name or code point label for a code point.
-- The answer is looked for in this order: noncharacters at the end of each
-- plane, the algorithmic names and labels in name_hooks, the
-- "names/XXX" data subpage for the code point, and finally the
-- "<reserved-XXXX>" label.
-- See https://www.unicode.org/versions/latest/ch04.pdf, section 4.8.
function export.lookup_name(codepoint)
	-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	-- (U+FDD0-U+FDEF is covered by an entry in name_hooks.)
	local ends_plane = codepoint >= 0xFFFE and floor(codepoint % 0x10000) >= 0xFFFE
	if ends_plane then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	-- Try the previously matched name hook first; fall back to a search and
	-- remember the hook if the search succeeds.
	local hook = name_range_cache
	if not (hook and codepoint >= hook[1] and codepoint <= hook[2]) then
		hook = binary_range_search(codepoint, name_hooks)
		if hook then
			name_range_cache = hook
		end
	end
	if hook then
		return generate_name(hook[3], codepoint)
	end

	-- Names are split over subpages keyed by the code point divided by 0x1000.
	local names = loader[('names/%03X'):format(codepoint / 0x1000)]
	local name = names and names[codepoint]
	if name then
		return name
	end

	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The code point is not a noncharacter, and an assigned character would
	-- have had its name found by now, so it must be reserved.
	return ("<reserved-%04X>"):format(codepoint)
end

-- Look up the entry for a code point in the "images/XXX" data subpage, where
-- XXX is the code point divided by 0x1000 in hexadecimal.
-- Returns nothing if that subpage could not be loaded.
function export.lookup_image(codepoint)
	local subpage = ('images/%03X'):format(codepoint / 0x1000)
	local images = loader[subpage]
	if not images then
		return
	end
	return images[codepoint]
end

-- Look up the entry for a code point in the "emoji_images/XXX" data subpage,
-- where XXX is the code point divided by 0x1000 in hexadecimal.
-- Returns nothing if that subpage could not be loaded.
function export.lookup_image_emoji(codepoint)
	local subpage = ('emoji_images/%03X'):format(codepoint / 0x1000)
	local images = loader[subpage]
	if not images then
		return
	end
	return images[codepoint]
end

-- Intended to hold [[Module:Unicode data/blocks]] once loaded.
-- NOTE(review): nothing visible in this file assigns to or reads this
-- variable; export.enum_blocks and export.lookup_block declare their own
-- local "blocks" taken from loader.blocks. It looks like a leftover --
-- confirm before removing.
local blocks

-- Stateless iterator step used by export.enum_blocks. Given the block list
-- and the previous index, returns the next index followed by that block's
-- third, first and second fields, or nothing once the list is exhausted.
local function block_iter(blocks, i)
	local next_index = i + 1
	local block = blocks[next_index]
	if not block then
		return
	end
	-- Fields are listed by hand because unpack doesn't work on tables
	-- loaded with mw.loadData.
	return next_index, block[3], block[1], block[2]
end

-- Iterator generator for the list of blocks, for use in a generic "for" in
-- the manner of ipairs. Each step yields what block_iter returns: the index,
-- then the block's third, first and second fields.
function export.enum_blocks()
	return block_iter, loader.blocks, 0
end

-- Find the block whose third field equals name and return its first and
-- second fields (the ends of its code point range).
-- Returns nothing if no block has that name.
function export.get_block_range(name)
	for _, entry in ipairs(loader.blocks) do
		local block_name = entry[3]
		if block_name == name then
			local low, high = entry[1], entry[2]
			return low, high
		end
	end
end

-- Return the name of the plane containing a code point, taken from
-- udata.planes, or "Plane N" when no name is listed for plane number N.
function export.lookup_plane(codepoint)
	local plane = floor(codepoint / 0x10000)
	local plane_name = udata.planes[plane]
	if plane_name then
		return plane_name
	end
	return ("Plane %u"):format(plane)
end

-- Return the third field (the name) of the block whose range contains the
-- code point, or "No Block" if no block in loader.blocks contains it.
function export.lookup_block(codepoint)
	local block = binary_range_search(codepoint, loader.blocks)
	if not block then
		return "No Block"
	end
	return block[3]
end

-- Return the whole entry of loader.blocks whose third field equals name.
-- Returns nothing if no block has that name.
function export.get_block_info(name)
	for _, entry in ipairs(loader.blocks) do
		local matches = entry[3] == name
		if matches then
			return entry
		end
	end
end

do
	-- Individual code points that are rejected outright.
	local forbidden = {
		[0x0023] = true, -- #
		[0x005B] = true, -- [
		[0x005D] = true, -- ]
		[0x007B] = true, -- {
		[0x007C] = true, -- |
		[0x007D] = true, -- }
		[0x180E] = true, -- MONGOLIAN VOWEL SEPARATOR
		[0xFFFD] = true, -- REPLACEMENT CHARACTER
	}

	-- Check every code point of a prospective page name. Returns false as
	-- soon as a rejected or non-printable code point is found; otherwise
	-- returns true only if at least one code point is something other than a
	-- space separator (so an empty or all-whitespace name gives false).
	function export.is_valid_pagename(pagename)
		local saw_non_whitespace = false

		for codepoint in gcodepoint(pagename) do
			if forbidden[codepoint]
			or (codepoint >= 0x2000 and codepoint <= 0x200A) -- spaces in General Punctuation block
			then
				return false
			end

			local printable, kind = export.is_printable(codepoint)
			if not printable then
				return false
			end

			saw_non_whitespace = saw_non_whitespace or kind ~= "space-separator"
		end

		return saw_non_whitespace
	end
end

-- Return the items of the array "what" from index "from" (default 1) onwards
-- as multiple values. This stands in for unpack(what, from), which doesn't
-- work on tables loaded with mw.loadData.
-- Items are collected with ipairs, so collection stops at the first nil.
local function manual_unpack(what, from)
	-- Apply the default before "from" is used in arithmetic; previously the
	-- default came too late and omitting "from" raised an error.
	from = from or 1

	-- Fast path: at most one item to return, so no temporary table is needed.
	if what[from + 1] == nil then
		return what[from]
	end

	local result = {}
	for i, item in ipairs(what) do
		if i >= from then
			result[#result + 1] = item
		end
	end
	-- "result" is an ordinary table, so the built-in unpack works on it.
	return unpack(result)
end

-- Ordering function for table.sort: a range sorts before another when its
-- first field (the low end) is smaller.
local function compare_ranges(range1, range2)
	local low1, low2 = range1[1], range2[1]
	return low1 < low2
end

-- Creates a function that looks up a code point in a data module containing
-- "singles" (a code point-to-data map) and "ranges" (a sorted array of arrays
-- holding the low and high code points of a range followed by the data
-- associated with that range).
-- "data_module_subpage" is the name given to "loader" to obtain that module;
-- it is loaded on the first call of the returned function.
-- "match_func" is passed the code point followed by either the data that was
-- found or the default data, and generates the final result.
-- The varargs are the default data used when the code point is neither in
-- "singles" nor inside any range.
-- Ranges that have already matched are remembered and searched first on later
-- calls; when nothing matches, the gap between the neighbouring ranges is
-- remembered together with the default data.
local function codepoint_lookup(data_module_subpage, match_func, ...)
	local defaults = { ... }
	local resolved = {}
	local singles, ranges

	-- Add a range to the remembered ones, keeping them sorted so that they
	-- can be searched with binary_range_search.
	local function remember(range)
		table.insert(resolved, range)
		table.sort(resolved, compare_ranges)
	end

	return function (codepoint)
		if not singles then
			local data_module = loader[data_module_subpage]
			singles, ranges = data_module.singles, data_module.ranges
		end

		local single = singles[codepoint]
		if single then
			return match_func(codepoint, single)
		end

		local remembered = binary_range_search(codepoint, resolved)
		if remembered then
			return match_func(codepoint, manual_unpack(remembered, 3))
		end

		local found, index = binary_range_search(codepoint, ranges)
		if found then
			remember(found)
			return match_func(codepoint, manual_unpack(found, 3))
		end

		-- No match: "index" is the range probed last, which borders the gap
		-- containing the code point on one side or the other.
		local nearest = ranges[index]
		if nearest then
			local gap_low, gap_high
			if codepoint > nearest[2] then
				local following = ranges[index + 1]
				gap_low = nearest[2] + 1
				gap_high = following and following[1] - 1 or 0x10FFFF
			else -- codepoint < nearest[1]
				local preceding = ranges[index - 1]
				gap_low = preceding and preceding[2] + 1 or 0
				gap_high = nearest[1] - 1
			end
			remember({ gap_low, gap_high, unpack(defaults) })
		end

		return match_func(codepoint, unpack(defaults))
	end
end

-- Return the value stored for a character in
-- [[Module:Unicode data/combining classes]], or 0 (treated as the default)
-- when the character is not listed there. The character may be given as a
-- string or as a number, which is first converted to a string with "u".
-- The data module is loaded on first use.
do
	local classes

	function export.combining_class(ch)
		if not classes then
			classes = mw.loadData("Module:Unicode data/combining classes")
		end

		local key = ch
		if type(ch) == "number" then
			key = u(ch)
		end
		return classes[key] or 0
	end
end

-- Return whether a character's combining class is anything other than 0.
-- FIXME: Some combining characters have a combining class of 0, so this needs rethinking.
function export.is_combining(ch)
	local class = export.combining_class(ch)
	return class ~= 0
end

do
	-- One byte followed by any run of bytes in the range 128-191, so that
	-- each match is handed to the callback as one unit.
	local char_pattern = ".[\128-\191]*"

	-- gsub callback: the replacement for a character with a non-zero
	-- combining class is the character preceded by a dotted circle. Returning
	-- nil makes gsub keep the original text.
	local function with_dotted_circle(ch)
		if export.combining_class(ch) == 0 then
			return nil
		end
		return "◌" .. ch
	end

	-- Insert a dotted circle before every character of str that has a
	-- non-zero combining class. Only the resulting string is returned, not
	-- the substitution count.
	function export.add_dotted_circle(str)
		local result = gsub(str, char_pattern, with_dotted_circle)
		return result
	end
end

-- Lookup function over the "control" data subpage. It returns the data
-- recorded there for a code point, or "assigned" when nothing is recorded.
local lookup_control
do
	local function control_or_assigned(codepoint, ccc)
		if ccc then
			return ccc
		end
		return "assigned"
	end

	lookup_control = codepoint_lookup("control", control_or_assigned, "assigned")
end
export.lookup_control = lookup_control

-- Return true unless lookup_control reports the code point as "unassigned".
function export.is_assigned(codepoint)
	local status = lookup_control(codepoint)
	return not (status == "unassigned")
end

-- Return whether lookup_control reports the code point as "assigned" or
-- "space-separator", followed by the value that lookup_control returned.
function export.is_printable(codepoint)
	local status = lookup_control(codepoint)
	local printable = status == "assigned" or status == "space-separator"
	return printable, status
end

-- Return whether lookup_control reports the code point as "space-separator",
-- followed by the value that lookup_control returned.
function export.is_whitespace(codepoint)
	local status = lookup_control(codepoint)
	local is_space = status == "space-separator"
	return is_space, status
end

-- Lookup function over the "category" data subpage. It returns the data
-- recorded there for a code point, with "Cn" as the default.
do
	local function pass_category(codepoint, category)
		return category
	end

	export.lookup_category = codepoint_lookup("category", pass_category, "Cn")
end

-- Return the entry for a category in the long_names table of the "category"
-- data subpage.
function export.get_category_long_name(category)
	local long_names = loader.category.long_names
	return long_names[category]
end

-- Lookup function over the "scripts" data subpage. It returns the data
-- recorded there for a code point, with "Zzzz" as the default.
do
	local function pass_script(codepoint, script)
		return script
	end

	export.lookup_script = codepoint_lookup("scripts", pass_script, "Zzzz")
end

-- Return the entry for a script in the aliases table of the "scripts" data
-- subpage.
function export.get_script_alias(script)
	local aliases = loader.scripts.aliases
	return aliases[script]
end

-- Return the title to use for a code point: the entry in
-- udata.unsupported_title if there is one, otherwise the character itself
-- when lookup_control reports the code point as "assigned", otherwise nil.
function export.get_entry_title(codepoint)
	local override = udata.unsupported_title[codepoint]
	if override then
		return override
	end

	if lookup_control(codepoint) == "assigned" then
		return u(codepoint)
	end
	return nil
end

return export