Module:languages/data: Difference between revisions

No edit summary
No edit summary
 
(3 intermediate revisions by the same user not shown)
Line 9: Line 9:
-- UTF-8 encoded strings for some commonly-used diacritics.
-- UTF-8 encoded strings for some commonly-used diacritics.
local c = {
local c = {
prime = u(0x02B9),
grave = u(0x0300),
grave = u(0x0300),
acute = u(0x0301),
acute = u(0x0301),
Line 19: Line 20:
diaer = u(0x0308),
diaer = u(0x0308),
ringabove = u(0x030A),
ringabove = u(0x030A),
hook            = u(0x0309),
dacute = u(0x030B),
dacute = u(0x030B),
caron = u(0x030C),
caron = u(0x030C),
Line 31: Line 33:
cedilla = u(0x0327),
cedilla = u(0x0327),
ogonek = u(0x0328),
ogonek = u(0x0328),
tildebelow      = u(0x0330),
brevebelow = u(0x032E),
brevebelow = u(0x032E),
macronbelow = u(0x0331),
macronbelow = u(0x0331),
Line 46: Line 49:
small_o = u(0x0366),
small_o = u(0x0366),
small_u = u(0x0367),
small_u = u(0x0367),
kamora          = u(0x0484),
keraia = u(0x0374),
dasiapneumata   = u(0x0485),
lowerkeraia = u(0x0375),
psilipneumata   = u(0x0486),
tonos = u(0x0384),
palatalization = u(0x0484),
dasiapneumata = u(0x0485),
psilipneumata = u(0x0486),
kashida = u(0x0640),
kashida = u(0x0640),
fathatan = u(0x064B),
fathatan = u(0x064B),
Line 60: Line 66:
hamzaabove = u(0x0654),
hamzaabove = u(0x0654),
nunghunna = u(0x0658),
nunghunna = u(0x0658),
zwarakay = u(0x0659),
smallv = u(0x065A),
smallv = u(0x065A),
superalef = u(0x0670),
superalef = u(0x0670),
psili = u(0x1FBD),
udatta = u(0x0951),
coronis = u(0x1FBF),
anudatta = u(0x0952),
dottedgrave = u(0x1DC0),
dottedacute = u(0x1DC1),
coronis = u(0x1FBD),
psili = u(0x1FBF),
dasia = u(0x1FEF),
ZWNJ = u(0x200C), -- zero width non-joiner
ZWNJ = u(0x200C), -- zero width non-joiner
ZWJ = u(0x200D), -- zero width joiner
ZWJ = u(0x200D), -- zero width joiner
RSQuo = u(0x2019), -- right single quote
RSQuo = u(0x2019), -- right single quote
kavyka = u(0xA67C),
VS01 = u(0xFE00), -- variation selector 1
VS01 = u(0xFE00), -- variation selector 1
-- Punctuation for the standardChars field.
-- Punctuation for the standardChars field.
Line 98: Line 111:
-- These values are placed here to make it possible to synchronise a group of languages without the need for a dedicated function module.
-- These values are placed here to make it possible to synchronise a group of languages without the need for a dedicated function module.


s["cau-Cyrl-displaytext"] = {
-- cau
from = {"[IlІӀ]", ""},
do
to = {"ӏ", ""}
-- Shared data for the "cau" groups: the diacritics to strip, plus a
-- normalisation of palochka look-alikes that is reused by several entries below.
local cau_remove_diacritics = c.grave .. c.acute .. c.macron
-- Character class of every look-alike to normalise. Each character listed here
-- needs a matching key in `cau_to`.
local cau_from = {"[IlΙІӀᴴ]"}
-- Replacement lookup keyed by the matched character.
-- NOTE(review): assumes the consuming module uses this table as a gsub-style
-- replacement, where a matched character with no key is left unchanged - confirm.
local cau_to = {{
	-- Latin capital "I" is matched by `cau_from` but previously had no entry here,
	-- so it would not have been normalised to the palochka like the other look-alikes.
	["I"] = "ӏ",
	["l"] = "ӏ",
	["Ι"] = "ӏ",
	["І"] = "ӏ",
	["Ӏ"] = "ӏ",
	["ᴴ"] = "ᵸ",
}}

-- Display text only normalises the look-alikes.
s["cau-Cyrl-displaytext"] = {
	from = cau_from,
	to = cau_to,
}

-- Entry names additionally strip the shared diacritics.
s["cau-Cyrl-entryname"] = {
	remove_diacritics = cau_remove_diacritics,
	from = cau_from,
	to = cau_to,
}

-- Latin-script entry names only strip the shared diacritics.
s["cau-Latn-entryname"] = {remove_diacritics = cau_remove_diacritics}
end
 
-- Cyrs
do
	-- Marks listed for removal; the same string is used by both entries below.
	local marks = c.grave .. c.acute .. c.dotabove .. c.diaer .. c.invbreve
		.. c.palatalization .. c.dasiapneumata .. c.psilipneumata
		.. c.dottedgrave .. c.dottedacute .. c.kavyka

	s["Cyrs-entryname"] = {remove_diacritics = marks}

	-- Lookup for the single-character class in `from`, grouped by the base
	-- letter each character is filed under. The p[n] suffixes come from the
	-- table `p` defined elsewhere in this module.
	local single_char_keys = {
		-- г, д, е
		["ґ"] = "г" .. p[1],
		["ꙣ"] = "д" .. p[1],
		["є"] = "е",
		-- ж, з
		["ѕ"] = "ж" .. p[1],
		["ꙃ"] = "ж" .. p[1],
		["ꙅ"] = "ж" .. p[1],
		["ꙁ"] = "з",
		-- и
		["і"] = "и" .. p[1],
		["ꙇ"] = "и" .. p[1],
		["ђ"] = "и" .. p[2],
		["ꙉ"] = "и" .. p[2],
		-- о
		["ѻ"] = "о",
		["ꙩ"] = "о",
		["ꙫ"] = "о",
		["ꙭ"] = "о",
		["ꙮ"] = "о",
		["ꚙ"] = "о",
		["ꚛ"] = "о",
		-- у, х
		["ꙋ"] = "у",
		["ѡ"] = "х" .. p[1],
		["ѿ"] = "х" .. p[1],
		["ꙍ"] = "х" .. p[1],
		["ѽ"] = "х" .. p[1],
		-- ы, ь, ю
		["ꙑ"] = "ы",
		["ѣ"] = "ь" .. p[1],
		["ꙗ"] = "ь" .. p[2],
		["ѥ"] = "ь" .. p[3],
		["ꙕ"] = "ю",
		-- я
		["ѧ"] = "я",
		["ꙙ"] = "я",
		["ѩ"] = "я" .. p[1],
		["ꙝ"] = "я" .. p[1],
		["ꙛ"] = "я" .. p[2],
		["ѫ"] = "я" .. p[3],
		["ѭ"] = "я" .. p[4],
		["ѯ"] = "я" .. p[5],
		["ѱ"] = "я" .. p[6],
		["ѳ"] = "я" .. p[7],
		["ѵ"] = "я" .. p[8],
		["ҁ"] = "я" .. p[9],
	}

	s["Cyrs-sortkey"] = {
		remove_diacritics = marks,
		from = {
			-- 2 chars
			"ї",
			"оу",
			-- 1 char; replaced via the lookup table in the matching `to` slot
			"[ґꙣєѕꙃꙅꙁіꙇђꙉѻꙩꙫꙭꙮꚙꚛꙋѡѿꙍѽꙑѣꙗѥꙕѧꙙѩꙝꙛѫѭѯѱѳѵҁ]",
		},
		to = {
			"и" .. p[1],
			"у",
			single_char_keys,
		},
	}
end
 
-- Grek display text: the two thorn letters become sho, and the listed
-- apostrophe-like characters are unified to the right single quote.
s["Grek-displaytext"] = {
	from = {
		"Þ",
		"þ",
		-- Not tonos, used as the numeral sign in entries.
		"['" .. c.RSQuo .. c.prime .. c.keraia .. c.coronis .. c.psili .. "]",
	},
	to = {
		"Ϸ",
		"ϸ",
		c.RSQuo,
	},
}
}


s["cau-Cyrl-entryname"] = {
s["Grek-entryname"] = {
remove_diacritics = c.grave .. c.acute .. c.macron,
remove_diacritics = c.caron .. c.diaerbelow .. c.brevebelow,
from = s["cau-Cyrl-displaytext"].from,
from = s["Grek-displaytext"].from,
to = s["cau-Cyrl-displaytext"].to
to = {"Ϸ", "ϸ", "'"}
}
}


s["cau-Latn-entryname"] = {remove_diacritics = c.grave .. c.acute .. c.macron}
s["Grek-sortkey"] = {
remove_diacritics = "';·`¨´῀" .. c.grave .. c.acute .. c.diaer .. c.caron .. c.commaabove .. c.revcommaabove .. c.macron .. c.breve .. c.diaerbelow .. c.brevebelow .. c.perispomeni .. c.ypogegrammeni .. c.RSQuo .. c.prime .. c.keraia .. c.lowerkeraia .. c.tonos .. c.coronis .. c.psili .. c.dasia,
from = {"ϝ", "ͷ", "ϛ", "ͱ", "ͺ", "ϳ", "ϻ", "[ϟϙ]", "[ςϲ]", "ͳ"},
to = {"ε" .. p[1], "ε" .. p[2], "ε" .. p[3], "ζ" .. p[1], "ι", "ι" .. p[1], "π" .. p[1], "π" .. p[2], "σ", "ϡ"}
}


s["Cyrs-entryname"] = {remove_diacritics = c.grave .. c.acute ..  c.diaer .. c.kamora .. c.dasiapneumata .. c.psilipneumata}
s["itc-Latn-displaytext"] = {
from = {c.caron},
to = {c.breve},
}


s["Cyrs-sortkey"] = {
s["itc-Latn-entryname"] = {remove_diacritics = c.macron .. c.breve .. c.diaer .. c.caron .. c.dinvbreve}
from = {
"ї", "оу", -- 2 chars
"ґ", "ꙣ", "є", "[ѕꙃꙅ]", "ꙁ", "[іꙇ]", "[ђꙉ]", "[ѻꙩꙫꙭꙮꚙꚛ]", "ꙋ", "[ѡѿꙍѽ]", "ꙑ", "ѣ", "ꙗ", "ѥ", "ꙕ", "[ѧꙙ]", "[ѩꙝ]", "ꙛ", "ѫ", "ѭ", "ѯ", "ѱ", "ѳ", "ѵ", "ҁ" -- 1 char
},
to = {
"и" .. p[1], "у",
"г" .. p[1], "д" .. p[1], "е", "ж" .. p[1], "з", "и" .. p[1], "и" .. p[2], "о", "у", "х" .. p[1], "ы", "ь" .. p[1], "ь" .. p[2], "ь" .. p[3], "ю", "я", "я" .. p[1], "я" .. p[2], "я" .. p[3], "я" .. p[4], "я" .. p[5], "я" .. p[6], "я" .. p[7], "я" .. p[8], "я" .. p[9]
},
}


s["Grek-sortkey"] = {
s["itc-Latn-sortkey"] = {
remove_diacritics = c.grave .. c.acute .. c.diaer .. c.caron .. c.commaabove .. c.revcommaabove .. c.macron .. c.breve .. c.diaerbelow .. c.brevebelow .. c.perispomeni .. c.ypogegrammeni,
remove_diacritics = c.circ .. c.tilde .. c.macron .. c.breve .. c.diaer .. c.caron .. c.zigzag .. c.dmacron .. c.dtilde .. c.dinvbreve .. c.small_a .. c.small_e .. c.small_i .. c.small_o .. c.small_u, -- Chiefly medieval abbreviations.
from = {"ϝ", "ͷ", "ϛ", "ͱ", "ϻ", "ϟ", "ϙ", "ς", "ϡ", "ͳ"},
from = {"", "æ", "[đꝱꟈ]", "", "", "", "[ƚꝉꝲ]", "", "", "[ꝋᵒ]", "œ", "[ꝑꝓꝕ]", "[ꝗꝙ]", "[ꝛꝵꝶꝝ]", "[ꟊˢ]", "[ꝷᵗ]", "ᵘ", "ꝟ", "⁊"},
to = {"ε" .. p[1], "ε" .. p[2], "ε" .. p[3], "ζ" .. p[1], "π" .. p[1], "π" .. p[2], "π" .. p[2], "σ", "ω" .. p[1], "ω" .. p[1]}
to = {"a", "ae", "d", "e", "i", "k", "l", "m", "n", "o", "oe", "p", "q", "r", "s", "t", "u", "v", "&"}
}
}


Line 186: Line 253:


s["Mong-entryname"] = s["Mong-displaytext"]
s["Mong-entryname"] = s["Mong-displaytext"]
s["Polyt-displaytext"] = s["Grek-displaytext"]


s["Polyt-entryname"] = {
s["Polyt-entryname"] = {
remove_diacritics = c.macron .. c.breve .. c.dbrevebelow,
remove_diacritics = c.macron .. c.breve .. c.dbrevebelow,
from = {"[" .. c.RSQuo .. c.psili .. c.coronis .. "]"},
from = s["Grek-entryname"].from,
to = {"'"}
to = s["Grek-entryname"].to
}
}
s["Polyt-sortkey"] = s["Grek-sortkey"]
-- Samr
do
	-- One table serves as both the entry-name and the sort-key data.
	-- The removal set is c.CGJ followed by the range U+0816 "-" U+082D.
	local samr_data = {
		remove_diacritics = c.CGJ .. u(0x0816) .. "-" .. u(0x082D),
	}
	s["Samr-entryname"] = samr_data
	s["Samr-sortkey"] = samr_data
end


s["roa-oil-sortkey"] = {
s["roa-oil-sortkey"] = {
remove_diacritics = c.grave .. c.acute .. c.circ .. c.diaer .. c.ringabove .. c.cedilla .. "'",
remove_diacritics = c.grave .. c.acute .. c.circ .. c.tilde .. c.diaer .. c.ringabove .. c.cedilla .. "'",
from = {"æ", "œ"},
from = {"æ", "œ", "·"},
to = {"ae", "oe"}
to = {"ae", "oe", " "}
}
}


Line 207: Line 287:


s["wen-sortkey"] = {
s["wen-sortkey"] = {
from = {
from = {"ch", "[lłßꞩẜ]", "dz[" .. c.caron .. c.acute .. "]", "[bcefmnoprswz][" .. c.caron .. c.acute .. c.dotabove .. "]"},
"l", -- Ensure "l" comes after "ł".
"b́", "č", "ć", "dź", "ě", "f́", "ch", "ł", "ḿ", "ń", "ó", "ṕ", "ř", "ŕ", "š", "ś", "ẃ", "ž", "ż", "ź"
},
to = {
to = {
"l" .. p[1],
"h" .. p[1],
"b" .. p[1], "c" .. p[1], "c" .. p[2], "d" .. p[1], "e" .. p[1], "f" .. p[1], "h" .. p[1], "l", "m" .. p[1], "n" .. p[1], "o" .. p[1], "p" .. p[1], "r" .. p[1], "r" .. p[2], "s" .. p[1], "s" .. p[2], "w" .. p[1], "z" .. p[1], "z" .. p[2], "z" .. p[3]
{
["l"] = "l" .. p[1], ["ł"] = "l", ["ß"] = "s", ["ꞩ"] = "š", ["ẜ"] = "š",
},
{
["dz" .. c.caron] = "d" .. p[1], ["dz" .. c.acute] = "d" .. p[2]
},
{
["b" .. c.acute] = "b" .. p[1],
["c" .. c.caron] = "c" .. p[1], ["c" .. c.acute] = "c" .. p[2],
["e" .. c.caron] = "e" .. p[1], ["e" .. c.dotabove] = "e" .. p[1],
["f" .. c.acute] = "f" .. p[1],
["m" .. c.acute] = "m" .. p[1],
["n" .. c.acute] = "n" .. p[1],
["o" .. c.acute] = "o" .. p[1],
["p" .. c.acute] = "p" .. p[1],
["r" .. c.caron] = "r" .. p[1], ["r" .. c.acute] = "r" .. p[2],
["s" .. c.caron] = "s" .. p[1], ["s" .. c.acute] = "s" .. p[2],
["w" .. c.acute] = "w" .. p[1],
["z" .. c.caron] = "z" .. p[1], ["z" .. c.acute] = "z" .. p[2],
}
}
}
}
}
Line 219: Line 315:
export.shared = s
export.shared = s


-- Short-term solution to override the standard substitution process, by forcing the module to substitute the entire text in one pass. This results in any PUA characters that are used as stand-ins for formatting being handled by the language-specific substitution process, which is usually undesirable.
-- Short-term solution to override the standard substitution process, by forcing the module to substitute the entire text in one pass, if "cont" is given. This results in any PUA characters that are used as stand-ins for formatting being handled by the language-specific substitution process, which is usually undesirable. If the value is "none" then the formatting tags do not get turned into PUA characters in the first place.
-- This override is provided for languages which use formatting between strings of text which might need to interact with each other (e.g. Korean 값이 transliterates as "gaps-i", but [[값]] has the formatting '''값'''[[-이]]. The normal process would split the text at the second '''.)
-- This override is provided for languages which use formatting between strings of text which might need to interact with each other (e.g. Korean 값이 transliterates as "gaps-i", but [[값]] has the formatting '''값'''[[-이]]. The normal process would split the text at the second '''.)
export.contiguous_substitution = {
export.substitution = {
["ja"] = "tr",
["gmy"] = "none",
["jje"] = "tr",
["ja"] = "cont",
["ko"] = "tr",
["jje"] = "cont",
["ko-ear"] = "tr",
["ko"] = "cont",
["ru"] = "tr",
["ko-ear"] = "cont",
["th-new"] = "tr",
["ru"] = "cont",
["sa"] = "tr",
["th-new"] = "cont",
["zkt"] = "tr",
["sa"] = "cont",
["zkt"] = "cont",
}
}


Line 235: Line 332:
-- being deprecated, so should not be added to on a permanent basis. Temporary additions are permitted under reasonable
-- being deprecated, so should not be added to on a permanent basis. Temporary additions are permitted under reasonable
-- circumstances (e.g. to facilitate changing a language's code). When an alias is no longer used, it should be removed.
-- circumstances (e.g. to facilitate changing a language's code). When an alias is no longer used, it should be removed.
-- Aliases in this table are tracked at [[Wiktionary:Tracking/languages/LANG]]; see e.g.
-- [[Special:WhatLinksHere/Wiktionary:Tracking/languages/RL.]] for the `RL.` alias.
export.aliases = {
export.aliases = {
["CL."] = "la-cla",
["EL."] = "la-ecc",
["EL."] = "la-ecc",
["LL."] = "la-lat",
["LL."] = "la-lat",
["ML."] = "la-med",
["ML."] = "la-med",
["NL."] = "la-new",
["NL."] = "la-new",
["RL."] = "la-ren",
["VL."] = "la-vul",
["VL."] = "la-vul",
["prv"] = "oc-pro",
["nds-DE"] = "nds-de",
["nds-NL"] = "nds-nl",
["roa-oan"] = "roa-ona",
}
}


-- Codes to which are tracked. Note that all aliases listed above are also tracked, so should not be duplicated here.
-- Codes which are tracked. Note that all aliases listed above are also tracked, so should not be duplicated here.
-- Tracking uses the same mechanism described in the comment above `export.aliases`.
export.track = {
export.track = {
-- Codes duplicated been full and etymology-only languages
-- Codes duplicated between full and etymology-only languages.
["lzh-lit"] = true,
["lzh-lit"] = true,
-- Codes actively being converted to families
-- Languages actively being converted to families.
["nan"] = true,
["bh"] = true, -- inc-bih
["nan"] = true, -- zhx-nan
}
}


return export
return export