Module:mn-translit: Difference between revisions

From Linguifex
Jump to navigation Jump to search
Created page with "local export = {} local mn = require("Module:mn-common") local gsub = mw.ustring.gsub local preConv = { ["є"] = "ө", ["ѳ"] = "ө", ["Є"] = "Ө", ["Ѳ"] = "Ө" } local oneChar = { ["а"] = "a", ["б"] = "b", ["в"] = "v", ["г"] = "g", ["д"] = "d", ["е"] = "je", ["ё"] = "jo", ["ж"] = "ž", ["з"] = "z", ["и"] = "i", ["й"] = "j", ["к"] = "k", ["л"] = "l", ["м"] = "m", ["н"] = "n", ["о"] = "o", ["ө"] = "ö", ["п"] = "p", ["р"] = "r", ["с"] = "s", ["..."
 
m Protected "Module:mn-translit": (bot) automatically protect highly visible templates/modules (reference score: 2000+ >= 1000) ([Edit=Allow only autoconfirmed users] (indefinite) [Move=Allow only autoconfirmed users] (indefinite))
Line 2: Line 2:
local mn = require("Module:mn-common")
local mn = require("Module:mn-common")
local gsub = mw.ustring.gsub
local gsub = mw.ustring.gsub
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"


local preConv = {
local preConv = {
["є"] = "ө", ["ѳ"] = "ө",
["є"] = "ө", ["ѳ"] = "ө", ["ї"] = "ү",
["Є"] = "Ө", ["Ѳ"] = "Ө"
["Є"] = "Ө", ["Ѳ"] = "Ө", ["Ї"] = "Ү"
}
}


Line 22: Line 25:
return nil
return nil
end
end
-- Pre-convert any substitute characters.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"
text = text:gsub(UTF8_char, preConv)
-- Decompose (except for "ё" and "й") so that accents can be consistently detected.
-- Decompose (except for "ё" and "й") so that accents can be consistently detected.
text = mw.ustring.toNFD(text)
text = toNFD(text)
text = gsub(text, "[еЕ]̈", mw.ustring.toNFC)
text = gsub(text, "[еЕ]̈", toNFC)
text = gsub(text, "[иИ]̆", mw.ustring.toNFC)
text = gsub(text, "[иИ]̆", toNFC)
-- Pre-convert "є" and "ѳ" to "ө".
text = string.gsub(text, UTF8_char, preConv)
-- Hard sign does nothing if word-final (extremely rare, but attested in borrowings of affected Russian proper nouns like "Коммерсантъ").
-- Hard sign does nothing if word-final (extremely rare, but attested in borrowings of affected Russian proper nouns like "Коммерсантъ").
text = gsub(text, "[Ъъ]([^а-яёөү])", "%1")
text = gsub(text, "[Ъъ]([^а-яёөү])", "%1")
Line 63: Line 64:
-- Do primary substitutions. If still present, Cyrillic "е" becomes "je" and "ю" becomes "ju".
-- Do primary substitutions. If still present, Cyrillic "е" becomes "je" and "ю" becomes "ju".
for digraph, replacement in pairs(twoChars) do
for digraph, replacement in pairs(twoChars) do
text[i].substring = string.gsub(text[i].substring, digraph, replacement)
text[i].substring = text[i].substring:gsub(digraph, replacement)
end
end
text[i].substring = string.gsub(text[i].substring, UTF8_char, oneChar)
text[i].substring = text[i].substring:gsub(UTF8_char, oneChar)
table.insert(text.translit, text[i].substring)
table.insert(text.translit, text[i].substring)
end
end
return mw.ustring.toNFC(table.concat(text.translit, ""))
return toNFC(table.concat(text.translit, ""))
end
end


return export
return export

Revision as of 09:32, 27 April 2024



-- Table of functions exposed by this module; it is returned at the end of the file.
local export = {}
-- Shared Mongolian helper module; this file uses mn.vowelharmony from it.
local mn = require("Module:mn-common")
-- Local aliases for the mw.ustring functions used below.
local gsub = mw.ustring.gsub
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
-- Byte-level pattern for one UTF-8 encoded character: a single ASCII byte
-- (including NUL via %z) or a lead byte in 194-244, followed by any number of
-- continuation bytes (128-191). Used with string.gsub and a lookup table so
-- that each character is looked up as a whole.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"

-- Substitute characters that are replaced by the regular letters "ө"/"ү"
-- (and their capitals) before any other processing.
local preConv = {
	["є"] = "ө", ["Є"] = "Ө",
	["ѳ"] = "ө", ["Ѳ"] = "Ө",
	["ї"] = "ү", ["Ї"] = "Ү",
}

-- Default romanization of each single Cyrillic letter, listed as
-- lowercase/uppercase pairs in alphabetical order.
local oneChar = {
	["а"] = "a", ["А"] = "A",
	["б"] = "b", ["Б"] = "B",
	["в"] = "v", ["В"] = "V",
	["г"] = "g", ["Г"] = "G",
	["д"] = "d", ["Д"] = "D",
	["е"] = "je", ["Е"] = "Je",
	["ё"] = "jo", ["Ё"] = "Jo",
	["ж"] = "ž", ["Ж"] = "Ž",
	["з"] = "z", ["З"] = "Z",
	["и"] = "i", ["И"] = "I",
	["й"] = "j", ["Й"] = "J",
	["к"] = "k", ["К"] = "K",
	["л"] = "l", ["Л"] = "L",
	["м"] = "m", ["М"] = "M",
	["н"] = "n", ["Н"] = "N",
	["о"] = "o", ["О"] = "O",
	["ө"] = "ö", ["Ө"] = "Ö",
	["п"] = "p", ["П"] = "P",
	["р"] = "r", ["Р"] = "R",
	["с"] = "s", ["С"] = "S",
	["т"] = "t", ["Т"] = "T",
	["у"] = "u", ["У"] = "U",
	["ү"] = "ü", ["Ү"] = "Ü",
	["ф"] = "f", ["Ф"] = "F",
	["х"] = "x", ["Х"] = "X",
	["ц"] = "c", ["Ц"] = "C",
	["ч"] = "č", ["Ч"] = "Č",
	["ш"] = "š", ["Ш"] = "Š",
	["щ"] = "šč", ["Щ"] = "Šč",
	["ъ"] = "ʺ", ["Ъ"] = "ʺ",
	["ы"] = "y", ["Ы"] = "Y",
	["ь"] = "ʹ", ["Ь"] = "ʹ",
	["э"] = "e", ["Э"] = "E",
	["ю"] = "ju", ["Ю"] = "Ju",
	["я"] = "ja", ["Я"] = "Ja",
}

-- Two-letter sequences that are romanized as a unit, applied before oneChar.
local twoChars = {
	["ий"] = "ii",
	["Ий"] = "Ii",
}

-- Transliterates Mongolian Cyrillic text into the Latin script.
--   text: the string to transliterate.
--   lang: language code; not used by this function.
--   sc:   script code; anything other than "Cyrl" is rejected.
-- Returns the NFC-normalized romanization, or nil if sc is not "Cyrl".
function export.tr(text, lang, sc)
	if sc ~= "Cyrl" then
		return nil
	end
	-- Pre-convert any substitute characters.
	text = text:gsub(UTF8_char, preConv)
	-- Decompose (except for "ё" and "й") so that accents can be consistently detected.
	text = toNFD(text)
	text = gsub(text, "[еЕ]̈", toNFC)
	text = gsub(text, "[иИ]̆", toNFC)
	-- Hard sign does nothing if word-final (extremely rare, but attested in borrowings of affected Russian proper nouns like "Коммерсантъ").
	-- The negated class lists both cases, so that a word-internal hard sign in all-caps text (i.e. one followed by an uppercase letter) is not mistaken for a word-final one and dropped.
	text = gsub(text, "[Ъъ]([^а-яёөүА-ЯЁӨҮ])", "%1")
	text = gsub(text, "[Ъъ]$", "")
	-- "Е" is not iotated after a consonant and "ю" is not iotated after "ж", "ш", "ч" or "щ", so they must be converted to their non-iotated romanizations in advance. However, a soft sign before "е" or "ю" (as a front vowel) creates a morphemic break, which means iotation does occur (i.e. it acts like a hard sign). These exceptions are converted into the iotated romanizations first; then, all instances of "е" and "ю" (which meet the prerequisites) are converted into non-iotated romanizations. This treats all instances of "ю" as front vowels, so deal with any back vowel instances of "ю" once iterating over the vowel harmonic segments later on, before doing the main substitutions.
	text = gsub(text, "([Ьь])е", "%1je")
	text = gsub(text, "([жчшщЖЧШЩ]ь)ю", "%1ju")
	text = gsub(text, "([^̀́̂аеёиоөуүъыэюяАЕЁИОӨУҮЪЫЬЭЮЯ%s][̀́̂]?)е", "%1e")
	text = gsub(text, "([жчшщЖЧШЩ]ь?)ю","%1u")
	-- Divide into segments by vowel harmony and iterate over them. The result of mn.vowelharmony is only read; the romanized pieces are collected in a separate local list.
	local segments = mn.vowelharmony(text)
	local translit = {}
	for _, segment in ipairs(segments) do
		local sub = segment.substring
		-- "Ю" is "jü/ü" if front harmonic.
		if segment.position == "front" then
			-- Latin "u" (from previous substitution).
			sub = gsub(sub, "u", "ü")
			-- Cyrillic "ю".
			sub = gsub(sub, "ю", "jü")
			sub = gsub(sub, "Ю", "Jü")
			-- "Е" is "jö/ö" if front round harmonic.
			if segment.quality == "rounded" then
				-- Latin "e" (from previous substitution).
				sub = gsub(sub, "e", "ö")
				-- Cyrillic "е".
				sub = gsub(sub, "е", "jö")
				sub = gsub(sub, "Е", "Jö")
			end
		-- If back harmonic, any instances of "ю" that were treated as front vowels need to be treated as back vowels (i.e. the soft sign doesn't create a morphemic break, so iotation does not occur).
		else
			sub = gsub(sub, "([Ьь])ju", "%1u")
		end
		-- Do primary substitutions. If still present, Cyrillic "е" becomes "je" and "ю" becomes "ju".
		-- string.gsub also returns a match count; assigning to a single variable discards it.
		for digraph, replacement in pairs(twoChars) do
			sub = sub:gsub(digraph, replacement)
		end
		sub = sub:gsub(UTF8_char, oneChar)
		translit[#translit + 1] = sub
	end

	return toNFC(table.concat(translit, ""))
end

return export