Module:languages/data: Difference between revisions

no edit summary
(Created page with "local u = mw.ustring.char local export = {} --[=[ Here is a list of the language fields by order of frequency according to User:Erutuon/language_stuff. If the order changes, change the order here for potentially greater efficiency. local fields = { "canonical_name", "wikidata_item", "family", "scripts", "other_names", "ancestors", "type", "translit", "entry_name", "sort_key", "override_translit", "wikimedia_codes", "standard_chars", "wikipedia_article...")
 
No edit summary
 
Line 1: Line 1:
local u = mw.ustring.char
local table = table
local insert = table.insert
local u = require("Module:string/char")
 
local export = {}
local export = {}


--[=[
Here is a list of the language fields by order of frequency according to [[User:Erutuon/language_stuff]].
If the order changes, change the order here for potentially greater efficiency.
local fields = {
"canonical_name",
"wikidata_item",
"family",
"scripts",
"other_names",
"ancestors",
"type",
"translit",
"entry_name",
"sort_key",
"override_translit",
"wikimedia_codes",
"standard_chars",
"wikipedia_article",
"link_tr",
}
--[=[
Insert the fields into the table with their values as their frequency ranking.
{export.most_common_field = 1, export.second_most_common_field = 2, ... }
for i, field in ipairs(fields) do
export[field] = i
end
]=]
-- UTF-8 encoded strings for some commonly-used diacritics.
-- UTF-8 encoded strings for some commonly-used diacritics.
local c = {
local c = {
Line 63: Line 36:
dbrevebelow = u(0x035C),
dbrevebelow = u(0x035C),
dinvbreve = u(0x0361),
dinvbreve = u(0x0361),
small_e = u(0x0364),
kamora          = u(0x0484),
kamora          = u(0x0484),
dasiapneumata  = u(0x0485),
dasiapneumata  = u(0x0485),
Line 91: Line 65:
local braille = {}
local braille = {}
for i = 0x2800, 0x28FF do
for i = 0x2800, 0x28FF do
table.insert(braille, u(i))
insert(braille, u(i))
end
end
c.braille = table.concat(braille)
c.braille = table.concat(braille)
Line 135: Line 109:
s["Grek-sortkey"] = {
s["Grek-sortkey"] = {
remove_diacritics = c.grave .. c.acute .. c.diaer .. c.caron .. c.commaabove .. c.revcommaabove .. c.diaerbelow .. c.brevebelow .. c.perispomeni .. c.ypogegrammeni,
remove_diacritics = c.grave .. c.acute .. c.diaer .. c.caron .. c.commaabove .. c.revcommaabove .. c.diaerbelow .. c.brevebelow .. c.perispomeni .. c.ypogegrammeni,
from = {"ς"},
from = {"ϝ", "ͷ", "ϛ", "ͱ", "ϻ", "ϟ", "ϙ", "ς", "ϡ", "ͳ"},
to = {"σ"}
to = {"ε" .. p[1], "ε" .. p[2], "ε" .. p[3], "ζ" .. p[1], "π" .. p[1], "π" .. p[2], "π" .. p[2], "σ", "ω" .. p[1], "ω" .. p[1]}
}
 
-- Script-code lookup for "Jpan" sort keys: each key is a script code and each
-- value is the name of a "-sortkey" entry.
-- NOTE(review): the Jpan entry names this table's own key; confirm how the
-- consumer resolves that.
s["Jpan-sortkey"] = {
Jpan = "Jpan-sortkey",
Hani = "Hani-sortkey",
Hrkt = "Hira-sortkey", -- sort general kana by normalizing to Hira
Hira = "Hira-sortkey",
Kana = "Kana-sortkey"
}
 
-- Script-code lookup for "Jpan" transliteration: all three codes listed here
-- share the single "Hrkt-translit" entry.
s["Jpan-translit"] = {
Hrkt = "Hrkt-translit",
Hira = "Hrkt-translit",
Kana = "Hrkt-translit"
}
}


local HaniChars = require("Module:scripts").getByCode("Hani"):getCharacters()
local HaniChars = require("Module:scripts").getByCode("Hani"):getCharacters()
-- `漢字(한자)`→`漢字`
-- `가-나-다`→`가나다`
-- `온돌(溫突/溫堗)`→`온돌` ([[ondol]])
s["Kore-entryname"] = {
s["Kore-entryname"] = {
remove_diacritics = u(0x302E) .. u(0x302F),
remove_diacritics = u(0x302E) .. u(0x302F),
from = {"([" .. HaniChars .. "])%(.-%)", "(.)%-(.)", "%([" .. HaniChars .. "]+%)"},
from = {"([" .. HaniChars .. "])%(.-%)", "(.)%-(.)", "%([" .. HaniChars .. "/]+%)"},
to = {"%1", "%1%2"}
to = {"%1", "%1%2"}
}
-- Sort-key substitution for Lisu: "𑾰" is rewritten to "ꓬ" followed by p[1].
-- `p` is defined outside this excerpt (presumably a list of sort-order
-- markers — TODO confirm).
s["Lisu-sortkey"] = {
from = {"𑾰"},
to = {"ꓬ" .. p[1]}
}
}


s["Mong-displaytext"] = {
s["Mong-displaytext"] = {
from = {"([ᠨ-ᡂᡸ])ᠶ([ᠨ-ᡂᡸ])", "([ᠠ-ᡂᡸ])ᠸ([^-ᠧ])", "([ᠠ-ᡂᡸ])ᠸ$"},
from = {"([ᠨ-ᡂᡸ])ᠶ([ᠨ-ᡂᡸ])", "([ᠠ-ᡂᡸ])ᠸ([^᠋ᠠ-ᠧ])", "([ᠠ-ᡂᡸ])ᠸ$"},
to = {"%1ᠢ%2", "%1ᠧ%2", "%1ᠧ"}
to = {"%1ᠢ%2", "%1ᠧ%2", "%1ᠧ"}
}
}


s["Mong-entryname"] = s["Mong-displaytext"]
s["Mong-entryname"] = s["Mong-displaytext"]
-- Entry-name normalisation for "Polyt": lists c.macron, c.breve and
-- c.dbrevebelow under remove_diacritics, and maps any single character from the
-- class built out of c.RSQuo, c.psili and c.coronis to a plain ASCII apostrophe.
s["Polyt-entryname"] = {
remove_diacritics = c.macron .. c.breve .. c.dbrevebelow,
from = {"[" .. c.RSQuo .. c.psili .. c.coronis .. "]"},
to = {"'"}
}


s["roa-oil-sortkey"] = {
s["roa-oil-sortkey"] = {
Line 185: Line 187:
["jje"] = "tr",
["jje"] = "tr",
["ko"] = "tr",
["ko"] = "tr",
["ko-ear"] = "tr",
["ru"] = "tr",
["ru"] = "tr",
["th-new"] = "tr",
["sa"] = "tr",
["zkt"] = "tr",
}
-- Code aliases. The left side is the alias and the right side is the canonical code. NOTE: These are gradually
-- being deprecated, so should not be added to on a permanent basis. Temporary additions are permitted under reasonable
-- circumstances (e.g. to facilitate changing a language's code). When an alias is no longer used, it should be removed.
export.aliases = {
-- Dotted abbreviations, each pointing at an "la-" prefixed code.
["CL."] = "la-cla",
["EL."] = "la-ecc",
["LL."] = "la-lat",
["ML."] = "la-med",
["NL."] = "la-new",
["RL."] = "la-ren",
["VL."] = "la-vul",
-- Undotted alias pointing at an "oc-" prefixed code.
["prv"] = "oc-pro",
}
-- Codes which are tracked. Note that all aliases listed above are also tracked, so should not be duplicated here.
export.track = {
-- Codes duplicated between full and etymology-only languages
["lzh-lit"] = true,
-- Codes actively being converted to families
["nan"] = true,
}
}


return export
return export