Module:Unicode data: Difference between revisions
No edit summary |
No edit summary |
||
| Line 8: | Line 8: | ||
local export = {} | local export = {} | ||
local udata = mw.loadData("Module:Unicode data/data") | local udata = mw.loadData("Module:Unicode data/data") | ||
local load_commons_dataset = require("Module:Unicode data/datasets").dataset | |||
local floor = math.floor | local floor = math.floor | ||
local unpack = unpack or table.unpack -- Lua 5.2 compatibility | |||
local function errorf(first_arg, ...) | local function errorf(first_arg, ...) | ||
| Line 54: | Line 56: | ||
__index = function (self, key) | __index = function (self, key) | ||
local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key) | local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key) | ||
if not success then | |||
data = false | |||
end | |||
self[key] = data | |||
return data | |||
end | |||
}) | |||
-- Load a dataset from Wikimedia Commons by indexing "commons_dataset_loader" | |||
-- with the name of the dataset page minus the "Data:Unicode data/" prefix and | |||
-- the ".tab" suffix. For instance, commons_dataset_loader["names/000"] will load | |||
-- [[commons:Data:Unicode data/names/000.tab]] with | |||
-- [[Module:Unicode data/datasets]]. If a dataset cannot be loaded, | |||
-- false will be returned. | |||
local commons_dataset_loader = setmetatable({}, { | |||
__index = function (self, key) | |||
local success, data = pcall(load_commons_dataset, "Unicode data/" .. key .. ".tab") | |||
if not success then | if not success then | ||
data = false | data = false | ||
| Line 70: | Line 89: | ||
-- (Name Derivation Rule Prefix Strings): | -- (Name Derivation Rule Prefix Strings): | ||
-- https://www.unicode.org/versions/latest/ch04.pdf | -- https://www.unicode.org/versions/latest/ch04.pdf | ||
-- List up to date for Unicode 17.0 | |||
local name_hooks = { | local name_hooks = { | ||
{ 0x0000, 0x001F, "<control-%04X>" }, -- C0 control characters | { 0x0000, 0x001F, "<control-%04X>" }, -- C0 control characters | ||
| Line 97: | Line 117: | ||
end}, | end}, | ||
{ 0x13460, 0x143FA, "EGYPTIAN HIEROGLYPH-%04X" }, -- Egyptian Hieroglyphs Extended-A | { 0x13460, 0x143FA, "EGYPTIAN HIEROGLYPH-%04X" }, -- Egyptian Hieroglyphs Extended-A | ||
{ 0x17000, | { 0x17000, 0x187FF, "TANGUT IDEOGRAPH-%04X" }, -- Tangut | ||
{ 0x18800, 0x18AFF, function (codepoint) | { 0x18800, 0x18AFF, function (codepoint) | ||
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF) | return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF) | ||
| Line 103: | Line 123: | ||
{ 0x18B00, 0x18CD5, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script | { 0x18B00, 0x18CD5, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script | ||
{ 0x18CFF, 0x18CFF, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, | { 0x18CFF, 0x18CFF, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, | ||
{ 0x18D00, | { 0x18D00, 0x18D1E, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Supplement | ||
{ 0x18D80, 0x18DF2, function (codepoint) | |||
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x18A7F) | |||
end }, | |||
{ 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu | { 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu | ||
{ 0x20000, 0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B | { 0x20000, 0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B | ||
{ 0x2A700, 0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C | { 0x2A700, 0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C | ||
{ 0x2B740, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D | { 0x2B740, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D | ||
{ 0x2B820, | { 0x2B820, 0x2CEAD, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E | ||
{ 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | { 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | ||
{ 0x2EBF0, 0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I | { 0x2EBF0, 0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I | ||
| Line 115: | Line 138: | ||
{ 0x30000, 0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G | { 0x30000, 0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G | ||
{ 0x31350, 0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H | { 0x31350, 0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H | ||
{ 0x323B0, 0x33479, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension J | |||
{ 0xE0100, 0xE01EF, function (codepoint) -- Variation Selectors Supplement | { 0xE0100, 0xE01EF, function (codepoint) -- Variation Selectors Supplement | ||
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) | return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) | ||
| Line 192: | Line 216: | ||
end | end | ||
local data = loader[('names/%03X'):format(codepoint / 0x1000)] | -- local data = loader[('names/%03X'):format(codepoint / 0x1000)] | ||
local data = commons_dataset_loader[('names/%03X'):format(codepoint / 0x1000)] | |||
if data and data[codepoint] then | if data and data[codepoint] then | ||
| Line 207: | Line 232: | ||
function export.lookup_image(codepoint) | function export.lookup_image(codepoint) | ||
local data = loader[('images/%03X'):format(codepoint / 0x1000)] | -- local data = loader[('images/%03X'):format(codepoint / 0x1000)] | ||
local data = commons_dataset_loader[('images/%03X'):format(codepoint / 0x1000)] | |||
if data then | if data then | ||
| Line 215: | Line 241: | ||
function export.lookup_image_emoji(codepoint) | function export.lookup_image_emoji(codepoint) | ||
local data = loader[('emoji_images/%03X'):format(codepoint / 0x1000)] | -- local data = loader[('emoji_images/%03X'):format(codepoint / 0x1000)] | ||
local data = commons_dataset_loader[('emoji_images/%03X'):format(codepoint / 0x1000)] | |||
if data then | if data then | ||