Module:Cyrs-translit: Difference between revisions
Created page with "local export = {} local numbers = mw.loadData("Module:Cyrs-translit/numbers") local ugsub = mw.ustring.gsub local toNFC = mw.ustring.toNFC local toNFD = mw.ustring.toNFD local U = mw.ustring.char local umatch = mw.ustring.match local usub = mw.ustring.sub local ulower = mw.ustring.lower local acute = U(0x301) local grave = U(0x300) local circumflex = U(0x302) local palatalization = U(0x0484) local titlo = U(0x0483) local dasia = U(0x0485) local psili = U(0x0486) local..." |
m 1 revision imported |
||
| (One intermediate revision by one other user not shown) | |||
| Line 14: | Line 14: | ||
local grave = U(0x300) | local grave = U(0x300) | ||
local circumflex = U(0x302) | local circumflex = U(0x302) | ||
local kamora = U(0x0311) | |||
local palatalization = U(0x0484) | local palatalization = U(0x0484) | ||
local titlo = U(0x0483) | local titlo = U(0x0483) | ||
| Line 22: | Line 23: | ||
local breathing = psili .. dasia | local breathing = psili .. dasia | ||
local accent = "[" .. acute .. grave .. circumflex .. breathing .. "]*" | local accent = "[" .. acute .. grave .. circumflex .. breathing .. "]*" | ||
local vowels = " | local vowels = "aAæÆeEiIoOœŒꝏꝎuUyY" | ||
local vowel_or_soft = "[" .. vowels .. "ʹ]" | local vowel_or_soft = "[" .. vowels .. "ʹ]" | ||
| Line 38: | Line 39: | ||
["І"] = 'I', ["і"] = 'i', -- Contrastive with "И". | ["І"] = 'I', ["і"] = 'i', -- Contrastive with "И". | ||
["Й"] = 'J', ["й"] = 'j', | ["Й"] = 'J', ["й"] = 'j', | ||
["Ꙉ"] = ' | ["Ꙉ"] = 'Ǵ', ["ꙉ"] = 'ǵ', | ||
["К"] = 'K', ["к"] = 'k', | ["К"] = 'K', ["к"] = 'k', | ||
["Л"] = 'L', ["л"] = 'l', | ["Л"] = 'L', ["л"] = 'l', | ||
| Line 44: | Line 45: | ||
["Н"] = 'N', ["н"] = 'n', | ["Н"] = 'N', ["н"] = 'n', | ||
["О"] = 'O', ["о"] = 'o', | ["О"] = 'O', ["о"] = 'o', | ||
["Ꚙ"] = 'Ꝏ', ["ꚙ"] = 'ꝏ', | |||
["П"] = 'P', ["п"] = 'p', | ["П"] = 'P', ["п"] = 'p', | ||
["Р"] = 'R', ["р"] = 'r', | ["Р"] = 'R', ["р"] = 'r', | ||
["С"] = 'S', ["с"] = 's', | ["С"] = 'S', ["с"] = 's', | ||
["Т"] = 'T', ["т"] = 't', | ["Т"] = 'T', ["т"] = 't', | ||
["У"] = 'U', ["у"] = 'u', | |||
["Ꙋ"] = 'U', ["ꙋ"] = 'u', | ["Ꙋ"] = 'U', ["ꙋ"] = 'u', | ||
["Ф"] = 'F', ["ф"] = 'f', | ["Ф"] = 'F', ["ф"] = 'f', | ||
["Х"] = 'X', ["х"] = 'x', | ["Х"] = 'X', ["х"] = 'x', | ||
| Line 58: | Line 60: | ||
["Ꙡ"] = 'Ć', ["ꙡ"] = 'ć', -- From a merger of "Ц" and "Ч" in Old Novgorodian. | ["Ꙡ"] = 'Ć', ["ꙡ"] = 'ć', -- From a merger of "Ц" and "Ч" in Old Novgorodian. | ||
["Ч"] = 'Č', ["ч"] = 'č', | ["Ч"] = 'Č', ["ч"] = 'č', | ||
["Џ"] = 'Dž', ["џ"] = 'dž', | |||
["Ш"] = 'Š', ["ш"] = 'š', | ["Ш"] = 'Š', ["ш"] = 'š', | ||
["Щ"] = 'Št', ["щ"] = 'št', | ["Щ"] = 'Št', ["щ"] = 'št', | ||
| Line 65: | Line 68: | ||
["Ѣ"] = 'Ě', ["ѣ"] = 'ě', | ["Ѣ"] = 'Ě', ["ѣ"] = 'ě', | ||
["Ꙓ"] = 'Jě', ["ꙓ"] = 'jě', | ["Ꙓ"] = 'Jě', ["ꙓ"] = 'jě', | ||
["Ю"] = 'Ju', ["ю"] = 'ju', | |||
["Ꙗ"] = 'Ja', ["ꙗ"] = 'ja', | ["Ꙗ"] = 'Ja', ["ꙗ"] = 'ja', | ||
["Ѥ"] = 'Je', ["ѥ"] = 'je', | ["Ѥ"] = 'Je', ["ѥ"] = 'je', | ||
[" | ["Ѧ"] = 'Ę', ["ѧ"] = 'ę', | ||
["Ѫ"] = 'Ǫ', ["ѫ"] = 'ǫ', | ["Ѫ"] = 'Ǫ', ["ѫ"] = 'ǫ', | ||
["Ѩ"] = 'Ję', ["ѩ"] = 'ję', | |||
["Ѭ"] = 'Jǫ', ["ѭ"] = 'jǫ', | ["Ѭ"] = 'Jǫ', ["ѭ"] = 'jǫ', | ||
[" | ["Ꙛ"] = 'Œ', ["ꙛ"] = 'œ', -- Becomes "œ̨". | ||
["Ѯ"] = 'Ks', ["ѯ"] = 'ks', | ["Ѯ"] = 'Ks', ["ѯ"] = 'ks', | ||
["Ѱ"] = 'Ps', ["ѱ"] = 'ps', | ["Ѱ"] = 'Ps', ["ѱ"] = 'ps', | ||
| Line 78: | Line 82: | ||
["Ѷ"] = 'Ü', ["ѷ"] = 'ü', -- Contrastive with "Ѵ". | ["Ѷ"] = 'Ü', ["ѷ"] = 'ü', -- Contrastive with "Ѵ". | ||
["Ҁ"] = 'Q', ["ҁ"] = 'q', | ["Ҁ"] = 'Q', ["ҁ"] = 'q', | ||
[psili] = '', | |||
[kamora] = circumflex, | |||
} | } | ||
| Line 83: | Line 89: | ||
["ᲀ"] = 'в', | ["ᲀ"] = 'в', | ||
["Ґ"] = 'Г', ["ґ"] = 'г', | ["Ґ"] = 'Г', ["ґ"] = 'г', | ||
-- ["Ђ"] = 'Ꙉ', ["ђ"] = 'ꙉ', | |||
["ᲁ"] = 'д', | ["ᲁ"] = 'д', | ||
["Ꙣ"] = 'Д' .. palatalization, ["ꙣ"] = 'д' .. palatalization, | ["Ꙣ"] = 'Д' .. palatalization, ["ꙣ"] = 'д' .. palatalization, | ||
| Line 93: | Line 100: | ||
["Ї"] = 'І', ["ї"] = 'і', | ["Ї"] = 'І', ["ї"] = 'і', | ||
["Ꙇ"] = 'І', ["ꙇ"] = 'і', | ["Ꙇ"] = 'І', ["ꙇ"] = 'і', | ||
-- ["Ћ"] = 'Ꙉ', ["ћ"] = 'ꙉ', | |||
["Ꙥ"] = 'Л' .. palatalization, ["ꙥ"] = 'л' .. palatalization, | ["Ꙥ"] = 'Л' .. palatalization, ["ꙥ"] = 'л' .. palatalization, | ||
["Ꙧ"] = 'М' .. palatalization, ["ꙧ"] = 'м' .. palatalization, | ["Ꙧ"] = 'М' .. palatalization, ["ꙧ"] = 'м' .. palatalization, | ||
| Line 100: | Line 108: | ||
["Ꙫ"] = 'О', ["ꙫ"] = 'о', | ["Ꙫ"] = 'О', ["ꙫ"] = 'о', | ||
["Ꚛ"] = 'О', ["ꚛ"] = 'о', | ["Ꚛ"] = 'О', ["ꚛ"] = 'о', | ||
["Ꙭ"] = 'Ꚙ', ["ꙭ"] = 'ꚙ', | |||
["Ꙭ"] = ' | |||
["ꙮ"] = 'о', | ["ꙮ"] = 'о', | ||
["ᲂ"] = 'о', | ["ᲂ"] = 'о', | ||
| Line 114: | Line 121: | ||
["Я"] = 'Ꙗ', ["я"] = 'ꙗ', | ["Я"] = 'Ꙗ', ["я"] = 'ꙗ', | ||
["Ꙕ"] = 'Ю', ["ꙕ"] = 'ю', | ["Ꙕ"] = 'Ю', ["ꙕ"] = 'ю', | ||
["Ꙙ"] = 'Ѧ', ["ꙙ"] = 'ѧ', | ["Ꙙ"] = 'Ѧ', ["ꙙ"] = 'ѧ', | ||
["Ꙝ"] = 'Ѩ', ["ꙝ"] = 'ѩ', | ["Ꙝ"] = 'Ѩ', ["ꙝ"] = 'ѩ', | ||
} | |||
-- A second round of substitutions, e.g. if the final output isn't a precomposed character, but needs to behave like one during processing. | |||
local final_substitutions = { | |||
["Œ"] = "Œ̨", ["œ"] = "œ̨", | |||
[titlo] = ":", [vzmet] = ":" | |||
} | } | ||
| Line 149: | Line 161: | ||
-- Old Novgorodian | -- Old Novgorodian | ||
lang_letters["zle-ono"] = setmetatable({ | lang_letters["zle-ono"] = setmetatable({ | ||
["Ц"] = ' | ["Ц"] = 'Cʹ', ["ц"] = 'cʹ', | ||
["Ч"] = ' | ["Ч"] = 'Cʹ', ["ч"] = 'cʹ', | ||
["Щ"] = ' | ["Щ"] = 'Sʹcʹ', ["щ"] = 'sʹcʹ', | ||
}, {__index = common_letters}) | }, {__index = common_letters}) | ||
| Line 159: | Line 171: | ||
-- Old Pskovian | -- Old Pskovian | ||
lang_letters["zle-ops"] = setmetatable({ -- In addition to zle-ono above. | lang_letters["zle-ops"] = setmetatable({ -- In addition to zle-ono above. | ||
["Ж"] = ' | ["Ж"] = 'Zʹ', ["ж"] = 'zʹ', | ||
["Ѕ"] = ' | ["Ѕ"] = 'Dzʹ', ["ѕ"] = 'dzʹ', | ||
["З"] = ' | ["З"] = 'Zʹ', ["з"] = 'zʹ', | ||
["С"] = ' | ["С"] = 'Sʹ', ["с"] = 'sʹ', | ||
["Ш"] = ' | ["Џ"] = 'Dzʹ', ["џ"] = 'dzʹ', | ||
["Щ"] = ' | ["Ш"] = 'Sʹ', ["ш"] = 'sʹ', | ||
["Щ"] = 'Sʹk', ["щ"] = 'sʹk', | |||
["Ѣ"] = 'Æ', ["ѣ"] = 'æ', | |||
["Ꙓ"] = 'Jæ', ["ꙓ"] = 'jæ', | |||
}, {__index = lang_letters["zle-ono"]}) | }, {__index = lang_letters["zle-ono"]}) | ||
| Line 185: | Line 200: | ||
end | end | ||
local function | local function handle_rough_breathing(base1, base2, diacritics) | ||
-- | -- Mark rough breathing with "h". | ||
if | local base2_lower = ulower(base2) | ||
return | if not vowels:match(base2_lower) then | ||
return base1 .. base2 .. diacritics .. "h" | |||
end | end | ||
local base1_lower = ulower(base1) | |||
if not vowels:match(base1_lower) then | |||
return ( | return base1 .. (base2_lower == base2 and "h" or "H") .. base2_lower .. diacritics | ||
end | |||
return (base1_lower == base1 and "h" or "H") .. base1 .. base2 .. diacritics | |||
end | end | ||
| Line 268: | Line 286: | ||
-- In some languages, treat "уо" ("uo") as "у" ("u"). | -- In some languages, treat "уо" ("uo") as "у" ("u"). | ||
if uo_is_u[lang] then | if uo_is_u[lang] then | ||
-- Not "ꚙ", which is an orthographically doubled "о". | |||
text = ugsub(text, "([уУѵѴѷѶ]" .. accent .. ")[оО]", "%1") | text = ugsub(text, "([уУѵѴѷѶ]" .. accent .. ")[оО]", "%1") | ||
end | end | ||
-- Treat "оу" ("ou") as "у" ("u"). | -- Treat "оу" ("ou") as "у" ("u") (but not "ꚙ"). | ||
text = ugsub(text, "([оО])(" .. accent .. ")[уУѵѴѷѶ]", handle_ou) | text = ugsub(text, "([оО])(" .. accent .. ")[уУѵѴѷѶ]", handle_ou) | ||
| Line 282: | Line 301: | ||
text = text:gsub(".[\128-\191]*", letters) | text = text:gsub(".[\128-\191]*", letters) | ||
-- Handle any breathing marks. | -- Handle any rough breathing marks. | ||
text = ugsub(toNFD(text), "([ | -- FIXME: this can't handle various edge cases. | ||
text = ugsub(toNFD(text), "(%w)(%w?)([^%w%s]*)[" .. dasia .. "]", handle_rough_breathing) | |||
if umatch(text, "[" .. breathing .. "]") then | if umatch(text, "[" .. breathing .. "]") then | ||
| Line 289: | Line 309: | ||
end | end | ||
-- | -- Final substitutions. | ||
text = | text = text:gsub(".[\128-\191]*", final_substitutions) | ||
return toNFC(text) | return toNFC(text) | ||