|
|
| Line 1: |
Line 1: |
| local m_scripts = require("Module:scripts") | | local export = {} |
| | |
| | -- We can't use mw.loadData() on [[Module:languages/chars]] because [[Module:languages/data]] itself is sometimes loaded |
| | -- using mw.loadData(), and calling mw.loadData() on [[Module:languages/chars]] will insert metatables into the |
| | -- character tables, which the second mw.loadData() will choke on. |
| | local m_chars = require("Module:languages/chars") |
|
| |
|
| local table = table
| |
| local insert = table.insert
| |
| local u = require("Module:string/char") | | local u = require("Module:string/char") |
| | | local c = m_chars.chars |
| local export = {}
| |
| | |
| -- UTF-8 encoded strings for some commonly-used diacritics.
| |
| local c = { | |
| prime = u(0x02B9),
| |
| grave = u(0x0300),
| |
| acute = u(0x0301),
| |
| circ = u(0x0302),
| |
| tilde = u(0x0303),
| |
| macron = u(0x0304),
| |
| overline = u(0x0305),
| |
| breve = u(0x0306),
| |
| dotabove = u(0x0307),
| |
| diaer = u(0x0308),
| |
| ringabove = u(0x030A),
| |
| hook = u(0x0309),
| |
| dacute = u(0x030B),
| |
| caron = u(0x030C),
| |
| lineabove = u(0x030D),
| |
| dgrave = u(0x030F),
| |
| invbreve = u(0x0311),
| |
| commaabove = u(0x0313),
| |
| revcommaabove = u(0x0314),
| |
| dotbelow = u(0x0323),
| |
| diaerbelow = u(0x0324),
| |
| ringbelow = u(0x0325),
| |
| cedilla = u(0x0327),
| |
| ogonek = u(0x0328),
| |
| tildebelow = u(0x0330),
| |
| brevebelow = u(0x032E),
| |
| macronbelow = u(0x0331),
| |
| perispomeni = u(0x0342),
| |
| ypogegrammeni = u(0x0345),
| |
| CGJ = u(0x034F), -- combining grapheme joiner
| |
| zigzag = u(0x035B),
| |
| dbrevebelow = u(0x035C),
| |
| dmacron = u(0x035E),
| |
| dtilde = u(0x0360),
| |
| dinvbreve = u(0x0361),
| |
| small_a = u(0x0363),
| |
| small_e = u(0x0364),
| |
| small_i = u(0x0365),
| |
| small_o = u(0x0366),
| |
| small_u = u(0x0367),
| |
| keraia = u(0x0374),
| |
| lowerkeraia = u(0x0375),
| |
| tonos = u(0x0384),
| |
| palatalization = u(0x0484),
| |
| dasiapneumata = u(0x0485),
| |
| psilipneumata = u(0x0486),
| |
| kashida = u(0x0640),
| |
| fathatan = u(0x064B),
| |
| dammatan = u(0x064C),
| |
| kasratan = u(0x064D),
| |
| fatha = u(0x064E),
| |
| damma = u(0x064F),
| |
| kasra = u(0x0650),
| |
| shadda = u(0x0651),
| |
| sukun = u(0x0652),
| |
| hamzaabove = u(0x0654),
| |
| nunghunna = u(0x0658),
| |
| zwarakay = u(0x0659),
| |
| smallv = u(0x065A),
| |
| superalef = u(0x0670),
| |
| udatta = u(0x0951),
| |
| anudatta = u(0x0952),
| |
| dottedgrave = u(0x1DC0),
| |
| dottedacute = u(0x1DC1),
| |
| coronis = u(0x1FBD),
| |
| psili = u(0x1FBF),
| |
| dasia = u(0x1FEF),
| |
| ZWNJ = u(0x200C), -- zero width non-joiner
| |
| ZWJ = u(0x200D), -- zero width joiner
| |
| RSQuo = u(0x2019), -- right single quote
| |
| kavyka = u(0xA67C),
| |
| VS01 = u(0xFE00), -- variation selector 1
| |
| -- Punctuation for the standardChars field.
| |
| -- Note: characters are literal (i.e. no magic characters).
| |
| punc = " ',-‐‑‒–—…∅",
| |
| -- Range covering all diacritics.
| |
| diacritics = u(0x300) .. "-" .. u(0x34E) ..
| |
| u(0x350) .. "-" .. u(0x36F) ..
| |
| u(0x1AB0) .. "-" .. u(0x1ACE) ..
| |
| u(0x1DC0) .. "-" .. u(0x1DFF) ..
| |
| u(0x20D0) .. "-" .. u(0x20F0) ..
| |
| u(0xFE20) .. "-" .. u(0xFE2F),
| |
| }
| |
| -- Braille characters for the standardChars field.
| |
| local braille = {}
| |
| for i = 0x2800, 0x28FF do
| |
| insert(braille, u(i))
| |
| end
| |
| c.braille = table.concat(braille)
| |
| export.chars = c | | export.chars = c |
| | | local p = m_chars.puaChars |
| -- PUA characters, generally used in sortkeys.
| |
| -- Note: if the limit needs to be increased, do so in powers of 2 (due to the way memory is allocated for tables).
| |
| local p = {} | |
| for i = 1, 32 do
| |
| p[i] = u(0xF000+i-1)
| |
| end
| |
| export.puaChars = p | | export.puaChars = p |
| | local cs = m_chars.chars_substitutions |
| | export.chars_substitutions = cs |
|
| |
|
| | -- FIXME! Many of the script-specific values below can be moved to [[Module:scripts/data]] to serve as script-wide |
| | -- fallback values instead of specifying them for each language using the script. |
| local s = {} | | local s = {} |
| -- These values are placed here to make it possible to synchronise a group of languages without the need for a dedicated function module. | | -- These values are placed here to make it possible to synchronise a group of languages without the need for a dedicated function module. |
| Line 130: |
Line 38: |
| } | | } |
|
| |
|
| s["cau-Cyrl-entryname"] = { | | s["cau-Cyrl-stripdiacritics"] = { |
| remove_diacritics = cau_remove_diacritics, | | remove_diacritics = cau_remove_diacritics, |
| from = cau_from, | | from = cau_from, |
| Line 136: |
Line 44: |
| } | | } |
|
| |
|
| s["cau-Latn-entryname"] = {remove_diacritics = cau_remove_diacritics} | | s["cau-Latn-stripdiacritics"] = {remove_diacritics = cau_remove_diacritics} |
| end
| |
| | |
| -- Cyrs
| |
| do
| |
| local Cyrs_remove_diacritics = c.grave .. c.acute .. c.dotabove .. c.diaer .. c.invbreve .. c.palatalization .. c.dasiapneumata .. c.psilipneumata .. c.dottedgrave .. c.dottedacute .. c.kavyka
| |
|
| |
| | |
| s["Cyrs-entryname"] = {remove_diacritics = Cyrs_remove_diacritics}
| |
| | |
| s["Cyrs-sortkey"] = {
| |
| remove_diacritics = Cyrs_remove_diacritics,
| |
| from = {
| |
| "ї", "оу", -- 2 chars
| |
| "[ґꙣєѕꙃꙅꙁіꙇђꙉѻꙩꙫꙭꙮꚙꚛꙋѡѿꙍѽꙑѣꙗѥꙕѧꙙѩꙝꙛѫѭѯѱѳѵҁ]"
| |
| },
| |
| to = {
| |
| "и" .. p[1], "у", {
| |
| ["ґ"] = "г" .. p[1], ["ꙣ"] = "д" .. p[1], ["є"] = "е", ["ѕ"] = "ж" .. p[1], ["ꙃ"] = "ж" .. p[1],
| |
| ["ꙅ"] = "ж" .. p[1], ["ꙁ"] = "з", ["і"] = "и" .. p[1], ["ꙇ"] = "и" .. p[1], ["ђ"] = "и" .. p[2],
| |
| ["ꙉ"] = "и" .. p[2], ["ѻ"] = "о", ["ꙩ"] = "о", ["ꙫ"] = "о", ["ꙭ"] = "о",
| |
| ["ꙮ"] = "о", ["ꚙ"] = "о", ["ꚛ"] = "о", ["ꙋ"] = "у", ["ѡ"] = "х" .. p[1],
| |
| ["ѿ"] = "х" .. p[1], ["ꙍ"] = "х" .. p[1], ["ѽ"] = "х" .. p[1], ["ꙑ"] = "ы", ["ѣ"] = "ь" .. p[1],
| |
| ["ꙗ"] = "ь" .. p[2], ["ѥ"] = "ь" .. p[3], ["ꙕ"] = "ю", ["ѧ"] = "я", ["ꙙ"] = "я",
| |
| ["ѩ"] = "я" .. p[1], ["ꙝ"] = "я" .. p[1], ["ꙛ"] = "я" .. p[2], ["ѫ"] = "я" .. p[3], ["ѭ"] = "я" .. p[4],
| |
| ["ѯ"] = "я" .. p[5], ["ѱ"] = "я" .. p[6], ["ѳ"] = "я" .. p[7], ["ѵ"] = "я" .. p[8], ["ҁ"] = "я" .. p[9],
| |
| }
| |
| },
| |
| }
| |
| end | | end |
|
| |
| s["Grek-displaytext"] = {
| |
| from = {"Þ", "þ", "['" .. c.RSQuo .. c.prime .. c.keraia .. c.coronis .. c.psili .. "]"}, -- Not tonos, used as the numeral sign in entries.
| |
| to = {"Ϸ", "ϸ", c.RSQuo}
| |
| }
| |
|
| |
| s["Grek-entryname"] = {
| |
| remove_diacritics = c.caron .. c.diaerbelow .. c.brevebelow,
| |
| from = s["Grek-displaytext"].from,
| |
| to = {"Ϸ", "ϸ", "'"}
| |
| }
| |
|
| |
| s["Grek-sortkey"] = {
| |
| remove_diacritics = "';·`¨´῀" .. c.grave .. c.acute .. c.diaer .. c.caron .. c.commaabove .. c.revcommaabove .. c.macron .. c.breve .. c.diaerbelow .. c.brevebelow .. c.perispomeni .. c.ypogegrammeni .. c.RSQuo .. c.prime .. c.keraia .. c.lowerkeraia .. c.tonos .. c.coronis .. c.psili .. c.dasia,
| |
| from = {"ϝ", "ͷ", "ϛ", "ͱ", "ͺ", "ϳ", "ϻ", "[ϟϙ]", "[ςϲ]", "ͳ"},
| |
| to = {"ε" .. p[1], "ε" .. p[2], "ε" .. p[3], "ζ" .. p[1], "ι", "ι" .. p[1], "π" .. p[1], "π" .. p[2], "σ", "ϡ"}
| |
| }
| |
|
| |
|
| s["itc-Latn-displaytext"] = { | | s["itc-Latn-displaytext"] = { |
| Line 189: |
Line 52: |
| } | | } |
|
| |
|
| s["itc-Latn-entryname"] = {remove_diacritics = c.macron .. c.breve .. c.diaer .. c.caron .. c.dinvbreve} | | s["itc-Latn-stripdiacritics"] = {remove_diacritics = c.macron .. c.breve .. c.diaer .. c.caron .. c.dinvbreve} |
|
| |
|
| s["itc-Latn-sortkey"] = { | | s["itc-Latn-sortkey"] = { |
| Line 215: |
Line 78: |
| } | | } |
|
| |
|
| s["jpx-entryname"] = s["jpx-displaytext"] | | s["jpx-stripdiacritics"] = s["jpx-displaytext"] |
|
| |
|
| s["jpx-sortkey"] = { | | s["jpx-sortkey"] = { |
| Line 231: |
Line 94: |
| Kana = "Hrkt-translit" | | Kana = "Hrkt-translit" |
| } | | } |
|
| |
| local HaniChars = m_scripts.getByCode("Hani"):getCharacters()
| |
| -- `漢字(한자)`→`漢字`
| |
| -- `가-나-다`→`가나다`, `가--나--다`→`가-나-다`
| |
| -- `온돌(溫突/溫堗)`→`온돌` ([[ondol]])
| |
| s["Kore-entryname"] = {
| |
| remove_diacritics = u(0x302E) .. u(0x302F),
| |
| from = {"([" .. HaniChars .. "])%(.-%)", "^%-", "%-$", "%-(%-?)", "\1", "%([" .. HaniChars .. "/]+%)"},
| |
| to = {"%1", "\1", "\1", "%1", "-"}
| |
| }
| |
|
| |
| s["Lisu-sortkey"] = {
| |
| from = {"𑾰"},
| |
| to = {"ꓬ" .. p[1]}
| |
| }
| |
|
| |
| s["Mong-displaytext"] = {
| |
| from = {"([ᠨ-ᡂᡸ])ᠶ([ᠨ-ᡂᡸ])", "([ᠠ-ᡂᡸ])ᠸ([^᠋ᠠ-ᠧ])", "([ᠠ-ᡂᡸ])ᠸ$"},
| |
| to = {"%1ᠢ%2", "%1ᠧ%2", "%1ᠧ"}
| |
| }
| |
|
| |
| s["Mong-entryname"] = s["Mong-displaytext"]
| |
|
| |
| s["Polyt-displaytext"] = s["Grek-displaytext"]
| |
|
| |
| s["Polyt-entryname"] = {
| |
| remove_diacritics = c.macron .. c.breve .. c.dbrevebelow,
| |
| from = s["Grek-entryname"].from,
| |
| to = s["Grek-entryname"].to
| |
| }
| |
|
| |
| s["Polyt-sortkey"] = s["Grek-sortkey"]
| |
|
| |
| -- Samr
| |
| do
| |
| s["Samr-entryname"] = {
| |
| remove_diacritics = c.CGJ .. u(0x0816) .. "-" .. u(0x082D),
| |
| }
| |
|
| |
| s["Samr-sortkey"] = s["Samr-entryname"]
| |
| end
| |
|
| |
|
| s["roa-oil-sortkey"] = { | | s["roa-oil-sortkey"] = { |
| Line 278: |
Line 100: |
| to = {"ae", "oe", " "} | | to = {"ae", "oe", " "} |
| } | | } |
|
| |
| s["Tibt-displaytext"] = {
| |
| from = {"ༀ", "༌", "།།", "༚༚", "༚༝", "༝༚", "༝༝", "ཷ", "ཹ", "ེེ", "ོོ"},
| |
| to = {"ཨོཾ", "་", "༎", "༛", "༟", "࿎", "༞", "ྲཱྀ", "ླཱྀ", "ཻ", "ཽ"}
| |
| }
| |
|
| |
| s["Tibt-entryname"] = s["Tibt-displaytext"]
| |
|
| |
|
| s["wen-sortkey"] = { | | s["wen-sortkey"] = { |
| Line 312: |
Line 127: |
| } | | } |
| } | | } |
| | |
| | -- Myanmar dotted form : https://www.unicode.org/Public/UNIDATA/StandardizedVariants.txt |
| | s["aio-displaytext"] = { |
| | from = {"([ကဂငတထပမယလဝဢေၵၸၺႀꩠꩡꩢꩣꩤꩥꩦꩫꩬꩯꩺ])"}, |
| | to = {"%1" .. c.VS01} |
| | } |
| | s["aio-stripdiacritics"] = { |
| | remove_diacritics = c.VS01, |
| | } |
| | |
| | s["phk-displaytext"] = s["aio-displaytext"] |
| | s["phk-stripdiacritics"] = s["aio-stripdiacritics"] |
| | |
| | s["kht-displaytext"] = s["aio-displaytext"] |
| | s["kht-stripdiacritics"] = s["aio-stripdiacritics"] |
|
| |
|
| export.shared = s | | export.shared = s |
|
| |
|
| -- Short-term solution to override the standard substitution process, by forcing the module to substitute the entire text in one pass, if "cont" is given. This results in any PUA characters that are used as stand-ins for formatting being handled by the language-specific substitution process, which is usually undesirable. If the value is "none" then the formatting tags do not get turned into PUA characters in the first place. | | --[==[ var: |
| -- This override is provided for languages which use formatting between strings of text which might need to interact with each other (e.g. Korean 값이 transliterates as "gaps-i", but [[값]] has the formatting '''값'''[[-이]]. The normal process would split the text at the second '''.)
| | Short-term solution to override the standard substitution process, by forcing the module to substitute the entire |
| | text in one pass, if "cont" is given. This results in any PUA characters that are used as stand-ins for formatting being |
| | handled by the language-specific substitution process, which is usually undesirable. If the value is "none" then the |
| | formatting tags do not get turned into PUA characters in the first place. This override is provided for languages which |
| | use formatting between strings of text which might need to interact with each other (e.g. Korean 값이 transliterates as "gaps-i", but [[값]] has the formatting '''값'''[[-이]]. The normal process would split the text at the second '''.) |
| | ]==] |
| export.substitution = { | | export.substitution = { |
| ["gmy"] = "none", | | ["gmy"] = "none", |
| Line 329: |
Line 164: |
| } | | } |
|
| |
|
| -- Code aliases. The left side is the alias and the right side is the canonical code. NOTE: These are gradually | | --[==[ var: |
| -- being deprecated, so should not be added to on a permanent basis. Temporary additions are permitted under reasonable
| | Code aliases. The left side is the alias and the right side is the canonical code. NOTE: These are gradually |
| -- circumstances (e.g. to facilitate changing a language's code). When an alias is no longer used, it should be removed.
| | being deprecated, so should not be added to on a permanent basis. Temporary additions are permitted under reasonable |
| -- Aliases in this table are tracked at [[Wiktionary:Tracking/languages/LANG]]; see e.g.
| | circumstances (e.g. to facilitate changing a language's code). When an alias is no longer used, it should be removed. |
| -- [[Special:WhatLinksHere/Wiktionary:Tracking/languages/RL.]] for the `RL.` alias.
| | Aliases in this table are tracked at [[Wiktionary:Tracking/languages/LANG]]; see e.g. |
| | [[Special:WhatLinksHere/Wiktionary:Tracking/languages/VL.]] for the `VL.` alias. |
| | ]==] |
| export.aliases = { | | export.aliases = { |
| ["EL."] = "la-ecc", | | ["EL."] = "la-ecc", |
| Line 343: |
Line 180: |
| ["nds-NL"] = "nds-nl", | | ["nds-NL"] = "nds-nl", |
| ["roa-oan"] = "roa-ona", | | ["roa-oan"] = "roa-ona", |
| | ["sa-cls"] = "cls", |
| | ["sa-ved"] = "vsn", |
| } | | } |
|
| |
|
| -- Codes which are tracked. Note that all aliases listed above are also tracked, so should not be duplicated here. | | --[==[ var: |
| -- Tracking uses the same mechanism described above in the comment above `export.aliases`.
| | Codes which are tracked. Note that all aliases listed above are also tracked, so should not be duplicated here. |
| | Tracking uses the same mechanism described above in the comment above `export.aliases`. |
| | ]==] |
| export.track = { | | export.track = { |
| -- Codes duplicated between full and etymology-only languages. | | -- Codes duplicated between full and etymology-only languages. |
| ["lzh-lit"] = true, | | ["lzh-lit"] = true, |
| | ["lzh"] = true, |
| -- Languages actively being converted to families. | | -- Languages actively being converted to families. |
| ["bh"] = true, -- inc-bih | | ["bh"] = true, -- inc-bih |