Module:Cyrs-translit

From Linguifex
Revision as of 21:49, 3 April 2025 by Sware (talk | contribs) (Created page with "local export = {} local numbers = mw.loadData("Module:Cyrs-translit/numbers") local ugsub = mw.ustring.gsub local toNFC = mw.ustring.toNFC local toNFD = mw.ustring.toNFD local U = mw.ustring.char local umatch = mw.ustring.match local usub = mw.ustring.sub local ulower = mw.ustring.lower local acute = U(0x301) local grave = U(0x300) local circumflex = U(0x302) local palatalization = U(0x0484) local titlo = U(0x0483) local dasia = U(0x0485) local psili = U(0x0486) local...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

Documentation for this module may be created at Module:Cyrs-translit/doc

local export = {}

local numbers = mw.loadData("Module:Cyrs-translit/numbers")

local ugsub = mw.ustring.gsub
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local U = mw.ustring.char
local umatch = mw.ustring.match
local usub = mw.ustring.sub
local ulower = mw.ustring.lower

-- Combining diacritics that can occur in the input, built from their code
-- points: the three accents, then the Cyrillic titlo, palatalization mark,
-- rough (dasia) and smooth (psili) breathings, and the vzmet.
local acute, grave, circumflex = U(0x301), U(0x300), U(0x302)
local titlo, palatalization = U(0x0483), U(0x0484)
local dasia, psili = U(0x0485), U(0x0486)
local vzmet = U(0xA66F)

-- Both breathing marks, unbracketed so they can be dropped into a character
-- class.
local breathing = psili .. dasia
-- Pattern fragment matching zero or more accent or breathing marks.
local accent = "[" .. table.concat({acute, grave, circumflex, breathing}) .. "]*"
-- Latin vowel letters (unbracketed), and the class of a vowel or the prime
-- that stands for the palatalization mark.
local vowels = "aAeEiIoOuUyY"
local vowel_or_soft = "[" .. vowels .. "ʹ]"

-- Default letter table: maps one Cyrillic letter to its Latin
-- transliteration. export.tr uses it directly when `lang` has no entry in
-- lang_letters, and the language-specific tables fall back to it through
-- their __index metatable.
-- NOTE(review): the pairs marked "Contrastive" below (И/І, О/Ѡ, Ѵ/Ѷ) appear
-- to share the same Latin output in this table - confirm that no
-- distinguishing character was lost.
local common_letters = {
	["А"] = 'A', ["а"] = 'a',
	["Б"] = 'B', ["б"] = 'b',
	["В"] = 'V', ["в"] = 'v',
	["Г"] = 'G', ["г"] = 'g',
	["Д"] = 'D', ["д"] = 'd',
	["Е"] = 'E', ["е"] = 'e',
	["Ж"] = 'Ž', ["ж"] = 'ž',
	["Ѕ"] = 'Dz', ["ѕ"] = 'dz',
	["З"] = 'Z', ["з"] = 'z',
	["И"] = 'I', ["и"] = 'i',
	["І"] = 'I', ["і"] = 'i', -- Contrastive with "И".
	["Й"] = 'J', ["й"] = 'j',
	["Ꙉ"] = 'Đ', ["ꙉ"] = 'đ',
	["К"] = 'K', ["к"] = 'k',
	["Л"] = 'L', ["л"] = 'l',
	["М"] = 'M', ["м"] = 'm',
	["Н"] = 'N', ["н"] = 'n',
	["О"] = 'O', ["о"] = 'o',
	["П"] = 'P', ["п"] = 'p',
	["Р"] = 'R', ["р"] = 'r',
	["С"] = 'S', ["с"] = 's',
	["Т"] = 'T', ["т"] = 't',
	["Ꙋ"] = 'U', ["ꙋ"] = 'u',
	["У"] = 'U', ["у"] = 'u',
	["Ф"] = 'F', ["ф"] = 'f',
	["Х"] = 'X', ["х"] = 'x',
	["Ѡ"] = 'O', ["ѡ"] = 'o', -- Contrastive with "О".
	["Ѿ"] = 'Ot', ["ѿ"] = 'ot', -- Becomes "otŭ" as appropriate.
	["Ѽ"] = 'Ô', ["ѽ"] = 'ô',
	["Ц"] = 'C', ["ц"] = 'c',
	["Ꙡ"] = 'Ć', ["ꙡ"] = 'ć', -- From a merger of "Ц" and "Ч" in Old Novgorodian.
	["Ч"] = 'Č', ["ч"] = 'č',
	["Ш"] = 'Š', ["ш"] = 'š',
	["Щ"] = 'Št', ["щ"] = 'št',
	["Ъ"] = 'Ŭ', ["ъ"] = 'ŭ',
	["Ꙑ"] = 'Y', ["ꙑ"] = 'y',
	["Ь"] = 'Ĭ', ["ь"] = 'ĭ',
	["Ѣ"] = 'Ě', ["ѣ"] = 'ě',
	["Ꙓ"] = 'Jě', ["ꙓ"] = 'jě',
	["Ꙗ"] = 'Ja', ["ꙗ"] = 'ja',
	["Ѥ"] = 'Je', ["ѥ"] = 'je',
	["Ю"] = 'Ju', ["ю"] = 'ju',
	["Ѫ"] = 'Ǫ', ["ѫ"] = 'ǫ',
	["Ѭ"] = 'Jǫ', ["ѭ"] = 'jǫ',
	["Ѧ"] = 'Ę', ["ѧ"] = 'ę',
	["Ѩ"] = 'Ję', ["ѩ"] = 'ję',
	["Ѯ"] = 'Ks', ["ѯ"] = 'ks',
	["Ѱ"] = 'Ps', ["ѱ"] = 'ps',
	["Ѳ"] = 'Θ', ["ѳ"] = 'θ',
	["Ѵ"] = 'Ü', ["ѵ"] = 'ü',
	["Ѷ"] = 'Ü', ["ѷ"] = 'ü', -- Contrastive with "Ѵ".
	["Ҁ"] = 'Q', ["ҁ"] = 'q',
}

-- Variant letterforms, each mapped to the canonical Cyrillic letter
-- (followed by the palatalization mark for the "soft" letterforms) that the
-- other tables are keyed on. export.tr applies this table to every character
-- right after accent decomposition, so later steps only see canonical
-- letters. Some entries exist in lowercase only.
local variants = {
	["ᲀ"] = 'в',
	["Ґ"] = 'Г', ["ґ"] = 'г',
	["ᲁ"] = 'д',
	["Ꙣ"] = 'Д' .. palatalization, ["ꙣ"] = 'д' .. palatalization,
	["Є"] = 'Е', ["є"] = 'е',
	["Э"] = 'Е', ["э"] = 'е',
	["Ꙃ"] = 'Ѕ', ["ꙃ"] = 'ѕ',
	["Ꙅ"] = 'Ѕ', ["ꙅ"] = 'ѕ',
	["Ꙁ"] = 'З', ["ꙁ"] = 'з',
	["Ӥ"] = 'И', ["ӥ"] = 'и',
	["Ї"] = 'І', ["ї"] = 'і',
	["Ꙇ"] = 'І', ["ꙇ"] = 'і',
	["Ꙥ"] = 'Л' .. palatalization, ["ꙥ"] = 'л' .. palatalization,
	["Ꙧ"] = 'М' .. palatalization, ["ꙧ"] = 'м' .. palatalization,
	["Ҥ"] = 'Н' .. palatalization, ["ҥ"] = 'н' .. palatalization,
	["Ѻ"] = 'О', ["ѻ"] = 'о',
	["Ꙩ"] = 'О', ["ꙩ"] = 'о',
	["Ꙫ"] = 'О', ["ꙫ"] = 'о',
	["Ꚛ"] = 'О', ["ꚛ"] = 'о',
	["Ꚙ"] = 'О', ["ꚙ"] = 'о',
	["Ꙭ"] = 'О', ["ꙭ"] = 'о',
	["ꙮ"] = 'о',
	["ᲂ"] = 'о',
	["ᲃ"] = 'с',
	["ᲄ"] = 'т',
	["ᲅ"] = 'т',
	["Ѹ"] = 'Ꙋ', ["ѹ"] = 'ꙋ', ["ᲈ"] = 'ꙋ',
	["Ꙍ"] = 'Ѡ', ["ꙍ"] = 'ѡ',
	["Ы"] = 'Ꙑ', ["ы"] = 'ꙑ',
	["ᲆ"] = 'ъ',
	["ᲇ"] = 'ѣ',
	["Я"] = 'Ꙗ', ["я"] = 'ꙗ',
	["Ꙕ"] = 'Ю', ["ꙕ"] = 'ю',
	["Ꙛ"] = 'Ѫ', ["ꙛ"] = 'ѫ',
	["Ꙙ"] = 'Ѧ', ["ꙙ"] = 'ѧ',
	["Ꙝ"] = 'Ѩ', ["ꙝ"] = 'ѩ',
}

-- Word-initial iotation shared by every language: a letter found at the
-- start of a word is swapped for its iotated counterpart (Cyrillic in,
-- Cyrillic out).
local common_iotated_initial = {
	["Ѣ"] = "Ꙓ",
	["ѣ"] = "ꙓ",
}

-- Iotation shared by every language for a letter that directly follows a
-- vowel or a palatalization mark: each key is replaced by its iotated
-- counterpart (Cyrillic in, Cyrillic out).
local common_iotated_after_vowel_or_soft = {
	["Е"] = "Ѥ",
	["е"] = "ѥ",
	["Ѣ"] = "Ꙓ",
	["ѣ"] = "ꙓ",
	["Ѧ"] = "Ѩ",
	["ѧ"] = "ѩ",
}

-- Per-language configuration, each registry keyed by language code.
-- A language without an entry uses the corresponding common table (or, for
-- uo_is_u, simply does not apply the rule).
local lang_letters = {}
local lang_iotated_initial = {}
local lang_iotated_after_vowel_or_soft = {}
local uo_is_u = {}

-- Returns `overrides` with `base` attached as the fallback for missing keys.
local function with_fallback(base, overrides)
	return setmetatable(overrides, {__index = base})
end

-- Old East Slavic
lang_letters["orv"] = with_fallback(common_letters, {
	["Щ"] = "Šč",
	["щ"] = "šč",
})

lang_iotated_initial["orv"] = with_fallback(common_iotated_initial, {
	["Е"] = "Ѥ",
	["е"] = "ѥ",
	["Ѧ"] = "Ѩ",
	["ѧ"] = "ѩ",
})

-- Old Novgorodian
lang_letters["zle-ono"] = with_fallback(common_letters, {
	["Ц"] = "Ć",
	["ц"] = "ć",
	["Ч"] = "Ć",
	["ч"] = "ć",
	["Щ"] = "Ść",
	["щ"] = "ść",
})

-- Shares the Old East Slavic table itself (same object, not a copy).
lang_iotated_initial["zle-ono"] = lang_iotated_initial["orv"]
uo_is_u["zle-ono"] = true

-- Old Pskovian: letters and initial iotation are layered on top of the Old
-- Novgorodian tables above.
lang_letters["zle-ops"] = with_fallback(lang_letters["zle-ono"], {
	["Ж"] = "Ź",
	["ж"] = "ź",
	["Ѕ"] = "Dź",
	["ѕ"] = "dź",
	["З"] = "Ź",
	["з"] = "ź",
	["С"] = "Ś",
	["с"] = "ś",
	["Ш"] = "Ś",
	["ш"] = "ś",
	["Щ"] = "Šk",
	["щ"] = "šk",
})

lang_iotated_initial["zle-ops"] = with_fallback(lang_iotated_initial["zle-ono"], {
	["Ѫ"] = "Ѭ",
	["ѫ"] = "ѭ",
})

lang_iotated_after_vowel_or_soft["zle-ops"] = with_fallback(common_iotated_after_vowel_or_soft, {
	["Ѫ"] = "Ѭ",
	["ѫ"] = "ѭ",
})

uo_is_u["zle-ops"] = true

-- Replacement callback for a vowel (plus accents) followed by izhitsa:
-- keeps the preceding text and swaps the izhitsa for the consonant "в",
-- uppercase only when the izhitsa itself was the capital "Ѵ".
local function handle_v(prev, v)
	if v == "Ѵ" then
		return prev .. "В"
	end
	return prev .. "в"
end

-- Replacement callback for the digraph "оу": yields a single "у" that takes
-- its case from the "о", followed by whatever accents sat on the "о".
local function handle_ou(o, ac)
	local u = "У"
	if ulower(o) == o then
		u = "у"
	end
	return u .. ac
end

-- Replacement callback for a vowel run followed by a breathing mark.
-- A smooth breathing (psili) is simply dropped. Any other breathing is
-- written as a leading "h" - capital "H" if lowercasing changes the run -
-- and the run itself is lowercased.
local function handle_breathing(vowel, br)
	if br ~= psili then
		local lowered = ulower(vowel)
		local aspirate = "H"
		if lowered == vowel then
			aspirate = "h"
		end
		return aspirate .. lowered
	end
	return vowel
end

-- Transliterates Old Cyrillic (script code "Cyrs") text into Latin.
--   text: the string to transliterate.
--   lang: language code; selects the language-specific letter and iotation
--         tables (falling back to the common ones) and whether "уо" is
--         reduced to "у".
--   sc:   script code; when omitted it is detected through Module:languages.
-- Returns the NFC-normalized transliteration, or nil when the script is not
-- "Cyrs". Raises an error if a breathing mark is still present after the
-- breathing step, i.e. it could not be attached to a preceding vowel.
-- The passes below are order-dependent: the Cyrillic-level rewrites all run
-- before the main letter substitution, and the breathing/titlo handling runs
-- on the Latin result.
function export.tr(text, lang, sc)
	-- Detect the script from the text when the caller did not supply one.
	if not sc then
		sc = require("Module:languages").getByCode(lang, nil, true):findBestScript(text):getCode()
	end
	if sc ~= "Cyrs" then
		return nil
	end
	
	-- Keep the untouched input for the error message near the end.
	local input = text
	
	-- Decompose any acute and grave accents.
	-- (Everything is decomposed, then each run that contains neither accent
	-- is recomposed, which leaves only acute and grave as separate marks.)
	text = ugsub(toNFD(text), "[^" .. acute .. grave .. "]+", toNFC)
	
	-- Canonicalize any variants.
	-- The byte-level pattern matches one UTF-8 character at a time: a lead
	-- byte followed by its continuation bytes (\128-\191).
	text = text:gsub(".[\128-\191]*", variants)

	-- Transliterate the palatalization mark as prime.
	text = text:gsub(palatalization, "ʹ")
	
	-- Treat "Ѵ" as the consonant "В" (transliterated "V") in diphthongs that
	-- correspond to Ancient Greek "αυ", "ευ" and "ηυ" (equivalent to "аѵ", "еѵ"
	-- and "иѵ").  Note that "ιυ" ("іѵ") is not a diphthong, and "ου" ("оѵ") is
	-- a long vowel. However, this doesn't apply to "Ѷ", as the diacritic means
	-- it must be treated as a vowel.
	text = ugsub(text, "([аАеЕиИꙗꙖѥѤ]" .. accent .. ")([ѵѴ])", handle_v)
	
	-- Letter table for this language, or the common one if it has none.
	local letters = lang_letters[lang] or common_letters
	
	-- Convert "ѿ" to "ѡт" if followed by a non-iotated vowel (including those
	-- which iotate only after vowels) or a palatalization mark, and "ѡтъ" in
	-- all other cases.
	-- Inside the callback `text` is still the string being scanned (the
	-- assignment only happens once ugsub returns), so `loc` - the position
	-- just past the match - indexes the character that follows "ѿ".
	text = ugsub(text, "([ѿѾ])(" .. accent .. ")()", function(ot, ac, loc)
		ot = (ot == "Ѿ" and "Ѡ" or "ѡ") .. ac .. "т"
		-- Transliterate the following character to classify it. gsub also
		-- returns a match count, which is passed on to toNFD as an extra
		-- argument (presumably ignored there).
		local nxt = toNFD(usub(text, loc, loc):gsub(".[\128-\191]*", letters))
		if not umatch(nxt, "^" .. vowel_or_soft) then
			ot = ot .. "ъ"
		end
		return ot
	end)
	
	-- Handle any vowels which are iotated at the start of words.
	local iotated_initial = lang_iotated_initial[lang] or common_iotated_initial
	-- Not possible to input iotated_initial directly, as mw.ustring.gsub
	-- doesn't respect metamethods...
	-- Returning nil from the callback leaves the matched character unchanged.
	text = ugsub(text, "%f[%w].", function(m)
		return iotated_initial[m]
	end)
	
	-- Handle any vowels which are iotated after another vowel or a
	-- palatalization mark.
	local iotated_after_vowel_or_soft = lang_iotated_after_vowel_or_soft[lang] or common_iotated_after_vowel_or_soft
	text = ugsub(text, "()(" .. accent .. ")(.)", function(loc, ac, letter)
		local iotated = iotated_after_vowel_or_soft[letter]
		if iotated then
			-- Step back to the character just before the match; loc == 0
			-- means the match started at the beginning of the text.
			loc = loc - 1
			local prev = toNFD((loc == 0 and "" or usub(text, loc, loc)):gsub(".[\128-\191]*", letters))
			-- The transliterated previous character must end in a vowel or
			-- prime, optionally followed by non-word characters (such as
			-- the combining marks produced by decomposition).
			if umatch(prev, vowel_or_soft .. "%W*$") then
				return ac .. iotated
			end
		end
	end)
	
	-- Treat "ъі" as "ꙑ", and make "ъ" tense ("ŷ") before "и" or an iotated
	-- vowel.
	text = ugsub(text, "([Ъъ])(" .. accent .. ")()([иИіІ]?)", function(yer, ac, loc, i)
		-- The rule applies when the transliteration of the next character
		-- starts with i/I or j/J.
		local nxt = toNFD(usub(text, loc, loc):gsub(".[\128-\191]*", letters)):match("^[iIjJ]")
		if nxt ~= nil then
			-- "і" is absorbed into "ꙑ"; otherwise a circumflex marks the
			-- tense yer and any captured "и" is kept.
			return (yer == "Ъ" and "Ꙑ" or "ꙑ") .. ((i == "і" or i == "І") and ac or circumflex .. ac .. i)
		end
	end)
	
	-- In some languages, treat "уо" ("uo") as "у" ("u").
	if uo_is_u[lang] then
		text = ugsub(text, "([уУѵѴѷѶ]" .. accent .. ")[оО]", "%1")
	end
	
	-- Treat "оу" ("ou") as "у" ("u").
	text = ugsub(text, "([оО])(" .. accent .. ")[уУѵѴѷѶ]", handle_ou)
	
	-- Substitute any numbers.
	-- Each key of the numbers data is used as a pattern and its value as the
	-- replacement. NOTE(review): pairs() order is unspecified, so this
	-- assumes the entries never overlap - TODO confirm against
	-- Module:Cyrs-translit/numbers.
	for key, repl in pairs(numbers) do
		text = ugsub(text, key, repl)
	end

	-- Main substitution.
	-- (string.gsub is used rather than mw.ustring.gsub so that the __index
	-- fallback of the language tables is honoured.)
	text = text:gsub(".[\128-\191]*", letters)
	
	-- Handle any breathing marks.
	-- Matches a Latin vowel, then the shortest run of vowels and non-word
	-- characters leading up to a breathing mark.
	-- NOTE(review): %W also matches spaces and punctuation, so the run can
	-- start at a vowel that ends the previous word - confirm this is
	-- intended.
	text = ugsub(toNFD(text), "([" .. vowels .. "][" .. vowels .. "%W]-)([" .. breathing .. "])", handle_breathing)
	
	-- Any breathing mark left over was not preceded by a vowel run.
	if umatch(text, "[" .. breathing .. "]") then
		error("Invalid breathing marks in input " .. mw.dumpObject(input))
	end
	
	-- Transliterate the titlo and vzmet as colon.
	text = ugsub(text, "[" .. titlo .. vzmet .. "]", ":")

	return toNFC(text)
end

return export