Module:Unicode data: Difference between revisions

No edit summary
No edit summary
 
Line 8: Line 8:
local export = {}
local export = {}
local udata = mw.loadData("Module:Unicode data/data")
local udata = mw.loadData("Module:Unicode data/data")
local load_commons_dataset = require("Module:Unicode data/datasets").dataset


local floor = math.floor
local floor = math.floor
local unpack = unpack or table.unpack -- Lua 5.2 compatibility


local function errorf(first_arg, ...)
local function errorf(first_arg, ...)
Line 54: Line 56:
__index = function (self, key)
__index = function (self, key)
local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key)
local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key)
if not success then
data = false
end
self[key] = data
return data
end
})
-- Load a dataset from Wikimedia Commons by indexing "commons_dataset_loader"
-- with the name of the dataset minus the "Data:Unicode data/" prefix and the
-- ".tab" suffix. For instance, commons_dataset_loader["names/000"] will load
-- [[commons:Data:Unicode data/names/000.tab]] using
-- [[Module:Unicode data/datasets]]. If a dataset cannot be loaded,
-- false will be returned.
local commons_dataset_loader = setmetatable({}, {
__index = function (self, key)
local success, data = pcall(load_commons_dataset, "Unicode data/" .. key .. ".tab")
if not success then
if not success then
data = false
data = false
Line 70: Line 89:
-- (Name Derivation Rule Prefix Strings):
-- (Name Derivation Rule Prefix Strings):
-- https://www.unicode.org/versions/latest/ch04.pdf
-- https://www.unicode.org/versions/latest/ch04.pdf
-- This list is up to date as of Unicode 17.0.
local name_hooks = {
local name_hooks = {
{  0x0000,  0x001F, "<control-%04X>" }, -- C0 control characters
{  0x0000,  0x001F, "<control-%04X>" }, -- C0 control characters
Line 97: Line 117:
end},
end},
     {  0x13460,  0x143FA, "EGYPTIAN HIEROGLYPH-%04X" }, -- Egyptian Hieroglyphs Extended-A
     {  0x13460,  0x143FA, "EGYPTIAN HIEROGLYPH-%04X" }, -- Egyptian Hieroglyphs Extended-A
{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut
{  0x17000,  0x187FF, "TANGUT IDEOGRAPH-%04X" }, -- Tangut
{  0x18800,  0x18AFF, function (codepoint)
{  0x18800,  0x18AFF, function (codepoint)
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
Line 103: Line 123:
{  0x18B00,  0x18CD5, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script
{  0x18B00,  0x18CD5, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script
{  0x18CFF,  0x18CFF, "KHITAN SMALL SCRIPT CHARACTER-%04X" },
{  0x18CFF,  0x18CFF, "KHITAN SMALL SCRIPT CHARACTER-%04X" },
{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Supplement
{  0x18D00,  0x18D1E, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Supplement
{  0x18D80,  0x18DF2, function (codepoint)
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x18A7F)
end },
{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
{  0x2B820,  0x2CEAD, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
Line 115: Line 138:
{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
{  0x323B0,  0x33479, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension J
{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
Line 192: Line 216:
end
end


local data = loader[('names/%03X'):format(codepoint / 0x1000)]
-- local data = loader[('names/%03X'):format(codepoint / 0x1000)]
local data = commons_dataset_loader[('names/%03X'):format(codepoint / 0x1000)]
if data and data[codepoint] then
if data and data[codepoint] then
Line 207: Line 232:


function export.lookup_image(codepoint)
function export.lookup_image(codepoint)
local data = loader[('images/%03X'):format(codepoint / 0x1000)]
-- local data = loader[('images/%03X'):format(codepoint / 0x1000)]
local data = commons_dataset_loader[('images/%03X'):format(codepoint / 0x1000)]
if data then
if data then
Line 215: Line 241:


function export.lookup_image_emoji(codepoint)
function export.lookup_image_emoji(codepoint)
local data = loader[('emoji_images/%03X'):format(codepoint / 0x1000)]
-- local data = loader[('emoji_images/%03X'):format(codepoint / 0x1000)]
local data = commons_dataset_loader[('emoji_images/%03X'):format(codepoint / 0x1000)]
if data then
if data then