Module:Unicode data: Difference between revisions

no edit summary
(Created page with "local export = {} local floor = math.floor local function errorf(first_arg, ...) if type(first_arg) == "number" then return error(string.format(...), first_arg + 1) else...")
 
No edit summary
Line 1: Line 1:
local export = {}
local p = {}


local floor = math.floor
local floor = math.floor


local function errorf(first_arg, ...)
local function errorf(level, ...)
if type(first_arg) == "number" then
if type(level) == "number" then
return error(string.format(...), first_arg + 1)
return error(string.format(...), level + 1)
else
else -- level is actually the format string.
return error(string.format(first_arg, ...), 2)
return error(string.format(level, ...), 2)
end
end
end
end
Line 13: Line 13:
local function binary_range_search(codepoint, ranges)
local function binary_range_search(codepoint, ranges)
local low, mid, high
local low, mid, high
low, high = 1, ranges.length or require "Module:table".length(ranges)
low, high = 1, ranges.length or require "Module:TableTools".length(ranges)
while low <= high do
while low <= high do
mid = floor((low + high) / 2)
mid = floor((low + high) / 2)
Line 27: Line 27:
return nil, mid
return nil, mid
end
end
export.binary_range_search = binary_range_search
p.binary_range_search = binary_range_search


--[[
local function linear_range_search(codepoint, ranges)
local function linear_range_search(codepoint, ranges)
for i, range in ipairs(ranges) do
for i, range in ipairs(ranges) do
if codepoint < range[1] then
if range[1] <= codepoint and codepoint <= range[2] then
break
elseif codepoint <= range[2] then
return range
return range
end
end
end
end
end
end
--]]


-- Load a module by indexing "loader" with the name of the module minus the
-- Load a module by indexing "loader" with the name of the module minus the
Line 57: Line 57:
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- Unicode Specification:
-- https://www.unicode.org/versions/latest/ch03.pdf
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
-- For most of the name rules given here, see the subsection
-- "Unicode Name Property" in section 4.8 (Name) and the table 4-8
-- (Name Derivation Rule Prefix Strings):
-- https://www.unicode.org/versions/latest/ch04.pdf
local name_hooks = {
local name_hooks = {
{    0x00,    0x1F, "<control-%04X>" }, -- C0 control characters
{    0x00,    0x1F, "<control-%04X>" }, -- C0 control characters
{    0x7F,    0x9F, "<control-%04X>" }, -- DEL and C1 control characters
{    0x7F,    0x9F, "<control-%04X>" }, -- DEL and C1 control characters
{  0x3400,  0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
{  0x3400,  0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
{  0x4E00,  0x9FFC, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
{  0x4E00,  0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
{  0xAC00,  0xD7A3, function (codepoint) -- Hangul Syllables
{  0xAC00,  0xD7A3, function (codepoint) -- Hangul Syllables
local Hangul_data = loader.Hangul
local Hangul_data = loader.Hangul
Line 84: Line 80:
{  0xF900,  0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0xF900,  0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0xFA70,  0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0xFA70,  0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut
{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
{  0x18800,  0x18AFF, function (codepoint)
{  0x18800,  0x18AFF, function (codepoint)
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
end },
end },
{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut
{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
{  0x18B00,  0x18CD5, "KHITAN SMALL SCRIPT CHARACTER-%04X" },
{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
{  0x20000,  0x2A6DD, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
{  0x2A700,  0x2B734, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
0x2A740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
end},
end},
{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
Line 129: Line 126:
--]]
--]]


-- https://www.unicode.org/versions/latest/ch04.pdf, section 4.8
function p.is_noncharacter(codepoint)
function export.lookup_name(codepoint)
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
-- (Cn) and specifically noncharacters:
-- (Cn) and specifically noncharacters:
-- https://www.unicode.org/faq/private_use.html#nonchar4
-- https://www.unicode.org/faq/private_use.html#nonchar4
if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
return 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
or floor(codepoint % 0x10000) >= 0xFFFE) then
or floor(codepoint % 0x10000) >= 0xFFFE)
end
 
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.lookup_name(codepoint)
if p.is_noncharacter(codepoint) then
return ("<noncharacter-%04X>"):format(codepoint)
return ("<noncharacter-%04X>"):format(codepoint)
end
end
Line 165: Line 167:
end
end


function export.lookup_image(codepoint)
function p.lookup_image(codepoint)
local data = loader[('images/%03X'):format(codepoint / 0x1000)]
local data = loader[('images/%03X'):format(codepoint / 0x1000)]
Line 191: Line 193:
if data then
if data then
-- Unpack doesn't work on tables loaded with mw.loadData.
-- Unpack doesn't work on tables loaded with mw.loadData.
return i, data[3], data[1], data[2]
return i, data[1], data[2], data[3]
end
end
end
end


-- An ipairs-type iterator generator for the list of blocks.
-- An ipairs-type iterator generator for the list of blocks.
function export.enum_blocks()
function p.enum_blocks()
local blocks = loader.blocks
local blocks = loader.blocks
return block_iter, blocks, 0
return block_iter, blocks, 0
end
end


function export.get_block_range(name)
function p.lookup_plane(codepoint)
local range
for i, block in ipairs(loader.blocks) do
if block[3] == name then
range = block
end
end
if range then
return range[1], range[2]
end
end
 
function export.lookup_plane(codepoint)
local i = floor(codepoint / 0x10000)
local i = floor(codepoint / 0x10000)
return planes[i] or ("Plane %u"):format(i)
return planes[i] or ("Plane %u"):format(i)
end
end


function export.lookup_block(codepoint)
function p.lookup_block(codepoint)
local blocks = loader.blocks
local blocks = loader.blocks
local range = binary_range_search(codepoint, blocks)
local range = binary_range_search(codepoint, blocks)
Line 230: Line 218:
end
end


function export.get_block_info(name)
function p.get_block_info(name)
for i, block in ipairs(loader.blocks) do
for i, block in ipairs(loader.blocks) do
if block[3] == name then
if block[3] == name then
Line 238: Line 226:
end
end


function export.is_valid_pagename(pagename)
function p.is_valid_pagename(pagename)
local has_nonws = false
local has_nonws = false


Line 255: Line 243:
end
end


local printable, result = export.is_printable(cp)
local printable, result = p.is_printable(cp)
if not printable then
if not printable then
return false
return false
Line 344: Line 332:
end
end
return match_func(codepoint, unpack(dots))
return match_func(codepoint)
end
end
end
end
Line 354: Line 342:
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
-- more information.
export.is_combining = memo_lookup(
p.is_combining = memo_lookup(
"combining",
"combining",
function (codepoint, combining_class)
function (codepoint, combining_class)
Line 361: Line 349:
0)
0)


function export.add_dotted_circle(str)
function p.add_dotted_circle(str)
return (mw.ustring.gsub(str, ".",
return (mw.ustring.gsub(str, ".",
function(char)
function(char)
if export.is_combining(mw.ustring.codepoint(char)) then
if p.is_combining(mw.ustring.codepoint(char)) then
return '◌' .. char
return '◌' .. char
end
end
Line 376: Line 364:
end,
end,
"assigned")
"assigned")
export.lookup_control = lookup_control
p.lookup_control = lookup_control


function export.is_assigned(codepoint)
function p.is_assigned(codepoint)
return lookup_control(codepoint) ~= "unassigned"
return lookup_control(codepoint) ~= "unassigned"
end
end


function export.is_printable(codepoint)
function p.is_printable(codepoint)
local result = lookup_control(codepoint)
local result = lookup_control(codepoint)
return (result == "assigned") or (result == "space-separator"), result
return (result == "assigned") or (result == "space-separator"), result
end
end


function export.is_whitespace(codepoint)
function p.is_whitespace(codepoint)
local result = lookup_control(codepoint)
local result = lookup_control(codepoint)
return (result == "space-separator"), result
return (result == "space-separator"), result
end
end


export.lookup_category = memo_lookup(
p.lookup_category = memo_lookup(
"category",
"category",
function (codepoint, category)
function (codepoint, category)
Line 399: Line 387:
"Cn")
"Cn")


export.lookup_script = memo_lookup(
local lookup_script = memo_lookup(
"scripts",
"scripts",
function (codepoint, script)
function (codepoint, script_code)
return script
return script_code or 'Zzzz'
end,
end,
"Zzzz")
"Zzzz")
p.lookup_script = lookup_script
-- Returns the script code of a string when exactly one non-ignorable script
-- occurs in it.
--
-- HTML character references in `str` are decoded first. Code points whose
-- script is "Zyyy", "Zinh" or "Zzzz" are ignored.
--
-- @param str  string to examine (a non-string raises a checkType error)
-- @return     the single script code found, or nil when the string contains
--             no non-ignorable script or more than one.
function p.get_best_script(str)
	-- Check type of argument, because mw.text.decode coerces numbers to strings!
	require "libraryUtil".checkType("get_best_script", 1, str, "string")

	-- Convert HTML character references (including named character references,
	-- or character entities) to characters.
	str = mw.text.decode(str, true)

	-- Set of the script codes that occur in the string.
	local scripts = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts.
		if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then
			scripts[script] = true
		end
	end

	-- If scripts does not contain two or more keys,
	-- return first and only key (script code) in table.
	local first = next(scripts)
	if not next(scripts, first) then
		-- Return the key alone: a bare `return next(scripts)` would also
		-- hand back the table value `true` as a second result.
		return first
	end -- else return majority script, or else "Zzzz"?
end


local unsupported_title = {
function p.is_Latin(str)
[0x0020] = "Unsupported titles/Space";
require "libraryUtil".checkType("get_best_script", 1, str, "string")
[0x0023] = "Unsupported titles/Number sign";
str = mw.text.decode(str, true)
[0x002E] = "Unsupported titles/Full stop";
[0x003A] = "Unsupported titles/Colon";
-- Search for the leading bytes that introduce the UTF-8 encoding of the
[0x003C] = "Unsupported titles/Less than";
-- code points U+0340-U+10FFFF. If they are not found and there is at least
[0x003E] = "Unsupported titles/Greater than";
-- one Latin-script character, the string counts as Latin, because the rest
[0x005B] = "Unsupported titles/Left square bracket";
-- of the characters can only be Zyyy, Zinh, and Zzzz.
[0x005D] = "Unsupported titles/Right square bracket";
-- The only scripts found below U+0370 (the first code point of the Greek
[0x005F] = "Unsupported titles/Low line";
-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
[0x007B] = "Unsupported titles/Left curly bracket";
-- See the codepage in the [[UTF-8]] article.
[0x007C] = "Unsupported titles/Vertical line";
if not str:find "[\205-\244]" then
[0x007D] = "Unsupported titles/Right curly bracket";
for codepoint in mw.ustring.gcodepoint(str) do
[0x1680] = "Unsupported titles/Ogham space";
if lookup_script(codepoint) == "Latn" then
[0xFFFD] = "Unsupported titles/Replacement character";
return true
}
end
end
end
local Latn = false
local i = 0; -- indexer for use in error messages
for codepoint in mw.ustring.gcodepoint(str) do
i = i + 1; -- bump the indexer
local script = lookup_script(codepoint)
if script == "Latn" then
Latn = true
elseif not (script == "Zyyy" or script == "Zinh"
or script == "Zzzz") then
return false, i -- abandon as not Latn; identify the offending character's position
end
end
return Latn, (not Latn and i) or nil -- when <Latn> false, return offending charactor's position as second return value; nil else
end
 
-- Checks that a string contains only characters belonging to right-to-left
-- scripts, or characters of ignorable scripts.
--
-- HTML character references in `str` are decoded before the check.
--
-- @param str  string to examine (a non-string raises a checkType error)
-- @return     true when at least one character belongs to a script listed in
--             loader.scripts.rtl and every other character has the script
--             "Zyyy", "Zinh" or "Zzzz"; false otherwise.
function p.is_rtl(str)
	-- Report this function's own name in the bad-argument message.
	require "libraryUtil".checkType("is_rtl", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0580-U+10FFFF. If they are not found, the string can only
	-- have characters from a left-to-right script, because the first code point
	-- in a right-to-left script is U+0591, in the Hebrew block.
	if not str:find "[\214-\244]" then
		return false
	end

	local result = false
	local rtl = loader.scripts.rtl
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		if rtl[script] then
			result = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			-- A character from a script that is neither rtl nor ignorable.
			return false
		end
	end
	return result
end
 
 
--[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------
 
External entry point for {{#invoke:}} that determines whether a string of text is right-to-left (rtl).
HTML and HTML-like tags are stripped first so that they do not corrupt the rtl determination; this was
added for cases where the rtl text contains <br /> tags.
 
]]
 
function p.is_rtl_frame (frame)
	local str = frame.args[1];									-- get the string from the {{#invoke:}} frame
	if str == nil then
		-- Fail with a readable message instead of "attempt to index a nil value".
		error ("Parameter 1 is required", 2);
	end
	str = str:gsub ('%b<>', '');								-- strip any html and html-like tags; assignment keeps only the string result
	return p.is_rtl (str);										-- return if whatever remains rtl; false else
end
 
 
-- Reads parameter `arg` from the table `args` and parses it as a code point
-- written in hexadecimal.
--
-- Raises an error (through errorf, level 2) when the parameter is absent,
-- is not a hexadecimal number, or lies outside 0..0x10FFFF.
-- Returns the code point as a number.
local function get_codepoint(args, arg)
	local raw = args[arg]
	if not raw then
		errorf(2, "Parameter %s is required", tostring(arg))
	end

	local value = tonumber(raw, 16)
	if not value then
		errorf(2, "Parameter %s is not a code point in hexadecimal base",
			tostring(arg))
	end

	local in_range = 0 <= value and value <= 0x10FFFF
	if not in_range then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end

	return value
end
 
-- Resolves a function of this module from an invocation parameter.
--
-- Parameter `arg` of `args` is trimmed and appended to `prefix`; the result
-- is looked up as a field of `p`. Raises an error (through errorf, level 2)
-- when the parameter is absent or no such field exists.
local function get_func(args, arg, prefix)
	local suffix = args[arg]
	if not suffix then
		errorf(2, "Parameter %s is required", tostring(arg))
	end

	local func_name = prefix .. mw.text.trim(suffix)
	local func = p[func_name]
	if not func then
		errorf(2, "There is no function '%s'", func_name)
	end

	return func
end


function export.get_entry_title(codepoint)
-- This function allows any of the "lookup" functions to be invoked. The first
if unsupported_title[codepoint] then
-- parameter is the word after "lookup_"; the second parameter is the code point
return unsupported_title[codepoint]
-- in hexadecimal base.
function p.lookup(frame)
local func = get_func(frame.args, 1, "lookup_")
local codepoint = get_codepoint(frame.args, 2)
local result = func(codepoint)
if func == p.lookup_name then
-- Prevent code point labels such as <control-0000> from being
-- interpreted as HTML tags.
result = result:gsub("<", "&lt;")
end
end
if lookup_control(codepoint) ~= "assigned" then
return result
return nil
end
 
-- Allows any of the "is_" functions to be invoked. The first parameter is the
-- word after "is_"; the second parameter is the value to test.
function p.is(frame)
	local args = frame.args
	local func = get_func(args, 1, "is_")

	-- is_Latin, is_valid_pagename and is_rtl take strings;
	-- the rest take code points.
	local takes_string = func == p.is_Latin
		or func == p.is_valid_pagename
		or func == p.is_rtl

	local argument
	if takes_string then
		argument = args[2]
	else
		argument = get_codepoint(args, 2)
	end

	-- The parentheses adjust the call to one result.
	return (func(argument))
end
return mw.ustring.char(codepoint)
end
end


return export
return p