Module:ru-translit: Difference between revisions

From Linguifex
Jump to navigation Jump to search
Created page with "local export = {} --[=[ FIXME: 1. (DONE) If you write '''Б'''ез, it transliterates to '''B'''jez instead of '''B'''ez, as it should. 2. (DONE) Convert ъ to nothing b..."
 
Bugfix for й at the start of words.
Line 7: Line 7:
1. (DONE) If you write '''Б'''ез, it transliterates to '''B'''jez instead of
1. (DONE) If you write '''Б'''ез, it transliterates to '''B'''jez instead of
   '''B'''ez, as it should.
   '''B'''ez, as it should.
  -- NOTE: This currently doesn't work due to an issue in [[Module:languages]]
  -- which means this module won't see style apostrophes.
2. (DONE) Convert ъ to nothing before comma or other non-letter particle, e.g.
2. (DONE) Convert ъ to nothing before comma or other non-letter particle, e.g.
   in Однимъ словомъ, идешь на чтеніе.
   in Однимъ словомъ, идешь на чтеніе.
Line 13: Line 15:
     translit so that we can display the transformed Cyrillic in the
     translit so that we can display the transformed Cyrillic in the
     "phonetic respelling" notation of {{ru-IPA}}.
     "phonetic respelling" notation of {{ru-IPA}}.
4. (DONE) Convert apostrophe to ъ before transliteration when after a consonant
  and before a vowel (requested by Atitarev).
]=]
]=]


local u = mw.ustring.char
local m_str_utils = require("Module:string utilities")
local rfind = mw.ustring.find
local rsub = mw.ustring.gsub -- WARNING: Don't return this directly in a function, or surround in parens
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ulower = mw.ustring.lower
local usub = mw.ustring.sub


local GR = u(0x0300) -- grave = ̀
local decompose = require("Module:ru-common").decompose
local TEMP_G = u(0xFFF1) -- substitute to preserve g from changing to v
local explode = m_str_utils.explode_utf8
local concat = table.concat
local insert = table.insert
local ipairs = ipairs
local remove = table.remove
local rfind = m_str_utils.find
local rsub = m_str_utils.gsub
local rsplit = m_str_utils.split
local select = select
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local u = m_str_utils.char
 
local AC = u(0x301) -- acute = ́
local GR = u(0x0300) -- grave = ̀
local BR = u(0x0306) -- breve ̆
local DI = u(0x0308) -- diaeresis = ̈
local DIACRITICS = AC .. GR .. BR .. DI ..
u(0x0302) .. -- circumflex ̂
u(0x0304) .. -- macron ̄
u(0x0307) .. -- dot above ̇
u(0x030A) .. -- ring above ̊
u(0x030C) .. -- caron ̌
u(0x030F) .. -- double grave ̏
u(0x0323) .. -- dot below ̣
u(0x0328)    -- ogonek ̨
local TEMP_G = u(0xFFF1) -- substitute to prevent g from changing to v
local word_chars = "%a’%(%)%[%]" .. DIACRITICS


local function ine(x) -- if not empty
local function ine(x) -- if not empty
if x == "" then return nil else return x end
return x ~= "" and x or nil
end
end


-- In this table, we now map Cyrillic е and э to je and e, and handle the
-- Main letter conversion table.
-- post-consonant version (plain e and ɛ) specially.
local letters = {
local tab = {
["а"] = "a", ["б"] = "b", ["в"] = "v", ["г"] = "g", ["д"] = "d", ["е"] = "je", ["ж"] = "ž", ["з"] = "z", ["и"] = "i", ["й"] = "j", ["к"] = "k", ["л"] = "l", ["м"] = "m", ["н"] = "n", ["о"] = "o", ["п"] = "p", ["р"] = "r", ["с"] = "s", ["т"] = "t", ["у"] = "u", ["ф"] = "f", ["х"] = "x", ["ц"] = "c", ["ч"] = "č", ["ш"] = "š", ["щ"] = "šč", ["ъ"] = "ʺ", ["ы"] = "y", ["ь"] = "ʹ", ["э"] = "e", ["ю"] = "ju", ["я"] = "ja",
["А"]="A", ["Б"]="B", ["В"]="V", ["Г"]="G", ["Д"]="D", ["Е"]="Je", ["Ё"]="", ["Ж"]="Ž", ["З"]="Z", ["И"]="I", ["Й"]="J",
["А"] = "A", ["Б"] = "B", ["В"] = "V", ["Г"] = "G", ["Д"] = "D", ["Е"] = "Je", ["Ж"] = "Ž", ["З"] = "Z", ["И"] = "I", ["Й"] = "J", ["К"] = "K", ["Л"] = "L", ["М"] = "M", ["Н"] = "N", ["О"] = "O", ["П"] = "P", ["Р"] = "R", ["С"] = "S", ["Т"] = "T", ["У"] = "U", ["Ф"] = "F", ["Х"] = "X", ["Ц"] = "C", ["Ч"] = "Č", ["Ш"] = "Š", ["Щ"] = "Šč", ["Ъ"] = "ʺ", ["Ы"] = "Y", ["Ь"] = "ʹ", ["Э"] = "E", ["Ю"] = "Ju", ["Я"] = "Ja",
["К"]="K", ["Л"]="L", ["М"]="M", ["Н"]="N", ["О"]="O", ["П"]="P", ["Р"]="R", ["С"]="S", ["Т"]="T", ["У"]="U", ["Ф"]="F",
["Х"]="X", ["Ц"]="C", ["Ч"]="Č", ["Ш"]="Š", ["Щ"]="Šč", ["Ъ"]="ʺ", ["Ы"]="Y", ["Ь"]="ʹ", ["Э"]="E", ["Ю"]="Ju", ["Я"]="Ja",
['а']='a', ['б']='b', ['в']='v', ['г']='g', ['д']='d', ['е']='je', ['ё']='jó', ['ж']='ž', ['з']='z', ['и']='i', ['й']='j',
['к']='k', ['л']='l', ['м']='m', ['н']='n', ['о']='o', ['п']='p', ['р']='r', ['с']='s', ['т']='t', ['у']='u', ['ф']='f',
['х']='x', ['ц']='c', ['ч']='č', ['ш']='š', ['щ']='šč', ['ъ']='ʺ', ['ы']='y', ['ь']='ʹ', ['э']='e', ['ю']='ju', ['я']='ja',
-- Russian style quotes
-- Russian style quotes
['«']='', ['»']='',
["«"] = "", ["»"] = "",
-- archaic, pre-1918 letters
-- archaic, pre-1918 letters
['І']='I', ['і']='i', ['Ѳ']='F', ['ѳ']='f',
["і"] = "i", ["ѳ"] = "f", ["ѣ"] = "jě", ["ѵ"] = "i",
['Ѣ']='Jě', ['ѣ']='jě', ['Ѵ']='I', ['ѵ']='i',
["І"] = "I", ["Ѳ"] = "F", ["Ѣ"] = "Jě", ["Ѵ"] = "I",
-- archaic, pre-1700 letters
-- archaic, pre-1708 letters (most of these are covered by aliases below)
['Ѕ']='Z', ['ѕ']='z', ['Ꙃ']='Z', ['ꙃ']='z', ['Ꙁ']='Z', ['ꙁ']='z',
["ѥ"] = "je", ["ѯ"] = "ks", ["ѱ"] = "ps",
['Ѡ']='O', ['ѡ']='o', ['Ѿ']='Ot', ['ѿ']='ot', ['Ꙋ']='U', ['ꙋ']='u',
["Ѥ"] = "Je", ["Ѯ"] = "Ks", ["Ѱ"] = "Ps",
['Ꙑ']='Y', ['ꙑ']='y', ['Ꙗ']='Ja', ['ꙗ']='ja', ['Ѥ']='Je', ['ѥ']='je',
['Ѧ']='Ja', ['ѧ']='ja', ['Ѩ']='Ja', ['ѩ']='ja', ['Ѫ']='U', ['ѫ']='u', ['Ѭ']='Ju', ['ѭ']='ju',
['Ѯ']='Ks', ['ѯ']='ks', ['Ѱ']='Ps', ['ѱ']='ps', ['Є']='E', ['є']='e', ['Ї']='I', ['ї']='i',
}
}


-- following based on ru-common for use with is_monosyllabic()
-- Treat most archaic letters as aliases. Exceptions:
-- any Cyrillic or Latin vowel, including ёЁ and composed Cyrillic vowels with grave accent;
-- ѥ is not the same as е, because it doesn't lose iotation after a consonant.
-- not including accented Latin vowels except ě (FIXME, might want to change this)
-- ѯ and ѱ can't be treated as aliases, because mapping 1 character to 2
local lowercase_vowels = "аеиоуяэыюіѣѵүѐѝёaeiouyěɛ"
-- can cause the logic which checks the capitalization of adjacent letters to
local uppercase_vowels = "АЕИОУЯЭЫЮІѢѴҮЀЍЁAEIOUYĚƐ"
-- become unreliable. This only affects the uppercase forms, but the lowercase
local vowels = lowercase_vowels .. uppercase_vowels
-- forms are also excepted for consistency.
local aliases = {
["є"] = "е", ["ꙁ"] = "з", ["ꙃ"] = "з", ["ѕ"] = "з", ["ї"] = "і", ["ꙋ"] = "у", ["ѡ"] = "о", ["ѿ"] = "о", ["ꙑ"] = "ы", ["ꙗ"] = "я", ["ѧ"] = "я", ["ѫ"] = "у", ["ѩ"] = "я", ["ѭ"] = "ю",
["Є"] = "Е", ["Ꙁ"] = "З", ["Ꙃ"] = "З", ["Ѕ"] = "З", ["Ї"] = "І", ["Ꙋ"] = "У", ["Ѡ"] = "О", ["Ѿ"] = "О", ["Ꙑ"] = "Ы", ["Ꙗ"] = "Я", ["Ѧ"] = "Я", ["Ѫ"] = "У", ["Ѩ"] = "Я", ["Ѭ"] = "Ю", ["'"] = ""
}


-- FIXME! Doesn't work with ɣ, which gets included in this character set
local plain_e = {
local non_consonants = "[" .. vowels .. "ЪЬъьʹʺ%A]"
["е"] = "e", ["ѣ"] = "ě", ["э"] = "ɛ",
local consonants = "[^" .. vowels .. "ЪЬъьʹʺ%A]"
["Е"] = "E", ["Ѣ"] = "Ě", ["Э"] = "Ɛ"
}


local map_to_plain_e_map = {["Е"] = "E", ["е"] = "e", ["Ѣ"] = "Ě", ["ѣ"] = "ě", ["Э"] = "Ɛ", ["э"] = "ɛ"}
local jo_letters = {
local function map_to_plain_e(pre, e)
["ё"] = "jo", ["ѣ̈"] = "", ["я̈"] = "",
return pre .. map_to_plain_e_map[e]
["Ё"] = "Jo", ["Ѣ̈"] = "", ["Я̈"] = ""
end
}


local map_to_je_map = {["Е"] = "Je", ["е"] = "je", ["Ѣ"] = "Jě", ["ѣ"] = "jě", ["Э"] = "E", ["э"] = "e"}
local vowels = "аеиіоуыѣэюяѥѵaæɐeəɛiɪɨoɵuyʊʉАЕИІОУЫѢЭЮЯѤѴAEƐIOUY"
local function map_to_je(pre, e)
if e == nil then
e = pre
pre = ""
end
return pre .. map_to_je_map[e]
end
 
-- decompose composed grave chars; they will map to uncomposed Latin letters for
-- consistency with other char+grave combinations, and we do this early to
-- avoid problems converting to e or je
local decompose_grave_map = {['ѐ'] = 'е' .. GR, ['Ѐ'] = 'Е' .. GR, ['ѝ'] = 'и' .. GR, ['Ѝ'] = 'И' .. GR}
 
-- True if Cyrillic or decomposed Latin word has no more than one vowel;
-- includes non-syllabic stems such as льд-; copied from ru-common and modified
-- to avoid having to import that module (which would slow things down
-- significantly)
local function is_monosyllabic(word)
return not rfind(word, "[" .. vowels .. "].*[" .. vowels .. "]")
end


-- Apply transformations to the Cyrillic to more closely match pronunciation.
-- Apply transformations to the Cyrillic to more closely match pronunciation.
Line 101: Line 99:
-- special-casing for что and related words.
-- special-casing for что and related words.
function export.apply_tr_fixes(text, noadj, noshto, forceadj)
function export.apply_tr_fixes(text, noadj, noshto, forceadj)
-- decompose composed grave characters before we convert Cyrillic е to
-- normalize any aliases
-- Latin e or je
text = text:gsub(".[\128-\191]*", aliases)
text = rsub(text, "[ѐЀѝЍ]", decompose_grave_map)
-- decompose stress accents without decomposing letters we want to treat
-- as units (e.g. й or ё)
text = decompose(text)


local origtext = text
local origtext = text
Line 112: Line 112:
-- Handle какого-нибудь/-либо/-то; must be done first because of an exception
-- Handle какого-нибудь/-либо/-то; must be done first because of an exception
-- made for бого-, снего-, etc.
-- made for бого-, снего-, etc.
text = rsub(text, "([кКтТ][аА][кК][оеОЕ" .. (forceadj and "аА" or "") .. "][\204\129\204\128]?)([гГ])([оО]%-)", repl)
text = rsub(text, "([кКтТ][аА][кК][оеОЕ" .. (forceadj and "аА" or "") .. "][" .. AC .. GR .. "]?)([гГ])([оО]%-)", repl)
if not forceadj then
if not forceadj then
-- handle много
local function go(text, case)
text = rsub(text, "%f[%a\204\129\204\128]([Мм]но[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
local pattern = rsub(case, "^(.)(.*)(го[" .. AC .. GR .. "]?)(%-?)$", function(m1, m2, m3, m4)
-- handle немного, намного
m1 = "%f[%a" .. AC .. GR .. "]([" .. m1:uupper() .. m1 .. "]"
text = rsub(text, "%f[%a\204\129\204\128]([Нн][еа]мно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
m2 = m2:gsub("\204[\128\129]", "[" .. AC .. GR .. "]?") .. ")"
-- handle до́рого [short form of дорогой, adverb]
m3 = m3:gsub("\204[\128\129]", "[" .. AC .. GR .. "]?")
text = rsub(text, "%f[%a\204\129\204\128]([Дд]о[\204\129\204\128]?ро)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
:gsub("(.*)", "г(%1")
-- handle недо́рого [short form of недорогой, adverb]
m4 = m4 == "-" and "%-)" or ")%f[^%a" .. AC .. GR .. "]"
text = rsub(text, "%f[%a\204\129\204\128]([Нн]едо[\204\129\204\128]?ро)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
return m1 .. m2 .. m3 .. m4
-- handle стро́го
end)
text = rsub(text, "%f[%a\204\129\204\128]([Сс]тро[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
return rsub(text, pattern, "%1" .. TEMP_G .. "%2")
-- handle нестро́го
end
text = rsub(text, "%f[%a\204\129\204\128]([Нн]естро[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
for _, case in ipairs{"мно́го", "н[еа]мно́го", "до́рого", "недо́рого", "стро́го", "нестро́го", "на́строго", "убо́го", "пол[ао]́го"} do
-- handle на́строго
text = go(text, case)
text = rsub(text, "%f[%a\204\129\204\128]([Нн]а[\204\129\204\128]?стро)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
end
-- handle убо́го
text = rsub(text, "%f[%a\204\129\204\128]([Уу]бо[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle поло́го
text = rsub(text, "%f[%a\204\129\204\128]([Пп]оло[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- check for neuter short forms of compound adjectives in -но́гий
-- check for neuter short forms of compound adjectives in -но́гий
if rfind(text, "оно[\204\129\204\128]?го%f[^%a\204\129\204\128]") then
if rfind(text, "но[" .. AC .. GR .. "]?го%f[^%a" .. AC .. GR .. "]") then
-- handle безно́го
for _, case in ipairs{"безно́го", "босоно́го", "веслоно́го", "длинноно́го", "двуно́го", "коротконо́го", "кривоно́го", "одноно́го", "пятино́го", "трёхно́го", "трехно́го", "хромоно́го", "четвероно́го", "шестино́го"} do
text = rsub(text, "%f[%a\204\129\204\128]([Бб]езно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
text = go(text, case)
-- handle босоно́го
end
text = rsub(text, "%f[%a\204\129\204\128]([Бб]осоно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
end
-- handle веслоно́го
for _, case in ipairs{"ого́", "го́го", "ваго́го", "ло́го", "п[ео]́го", "со́го", "То́го", "ле́го", "игого́", "огого́", "альбиньязего", "д[иі]е́го", "бо́лого", "гр[иі]е́го", "манче́го", "пичис[иі]е́го", "тенкодого", "хио́го", "аго-", "его-", "ого-"} do
text = rsub(text, "%f[%a\204\129\204\128]([Вв]еслоно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
text = go(text, case)
-- handle длинноно́го
text = rsub(text, "%f[%a\204\129\204\128]([Дд]линноно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle двуно́го
text = rsub(text, "%f[%a\204\129\204\128]([Дд]вуно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle коротконо́го
text = rsub(text, "%f[%a\204\129\204\128]([Кк]оротконо[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle кривоно́го
text = rsub(text, "%f[%a\204\129\204\128]([Кк]ривоно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle одноно́го
text = rsub(text, "%f[%a\204\129\204\128]([Оо]дноно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle пятино́го
text = rsub(text, "%f[%a\204\129\204\128]([Пп]ятино[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle трёхно́го
text = rsub(text, "%f[%a\204\129\204\128]([Тт]р[ёе][\204\129\204\128]?хно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle хромоно́го
text = rsub(text, "%f[%a\204\129\204\128]([Хх]ромоно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle четвероно́го
text = rsub(text, "%f[%a\204\129\204\128]([Чч]етвероно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle шестино́го
text = rsub(text, "%f[%a\204\129\204\128]([Шш]естино[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
end
end
-- handle пе́го [short form of пе́гий "piebald"]
text = rsub(text, "%f[%a\204\129\204\128]([Пп]е[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle лого, сого, ого
text = rsub(text, "%f[%a\204\129\204\128]([лсЛС]?[Оо][\204\129\204\128]?)г(о[\204\129\204\128]?)%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "%2")
-- handle Того, То́го (but not того or Того́, which have /v/)
text = rsub(text, "%f[%a\204\129\204\128](То́?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle лего
text = rsub(text, "%f[%a\204\129\204\128]([Лл]е[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle игого, огого; note, we substitute TEMP_G for both г's
-- because otherwise the ого- at the beginning gets converted to ово
text = rsub(text, "%f[%a\204\129\204\128]([ИиОо])гог(о[\204\129\204\128]?)%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о" .. TEMP_G .. "%2")
-- handle Диего
text = rsub(text, "%f[%a\204\129\204\128](Дие́?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle бо́лого
text = rsub(text, "%f[%a\204\129\204\128]([Бб]о[\204\129\204\128]?ло)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
-- handle *ого-, *его- (e.g. бого-, снего-)
text = rsub(text, "([ео][\204\129\204\128]?)го%-", "%1" .. TEMP_G .. "о-")
end
end
--handle genitive/accusative endings, which are spelled -ого/-его/-аго
--handle genitive/accusative endings, which are spelled -ого/-его/-аго
Line 183: Line 142:
-- and pronouns, excluding words like много, ого (-аго occurs in
-- and pronouns, excluding words like много, ого (-аго occurs in
-- pre-reform spelling); \204\129 is an acute accent, \204\128 is a grave accent
-- pre-reform spelling); \204\129 is an acute accent, \204\128 is a grave accent
local pattern = "([оеОЕ" .. (forceadj and "аА" or "") .. "][\204\129\204\128]?)([гГ])([оО][\204\129\204\128]?)"
local pattern = "([оеОЕ" .. (forceadj and "аА" or "") .. "][" .. AC .. GR .. "]?)([гГ])([оО][" .. AC .. GR .. "]?)"
local reflexive = "([сС][яЯ][\204\129\204\128]?)"
local reflexive = "([сС][яЯ][" .. AC .. GR .. "]?)"
text = rsub(text, pattern .. "%f[^%a\204\129\204\128]", repl)
text = rsub(text, pattern .. "%f[^%a" .. AC .. GR .. TEMP_G .. "]", repl)
text = rsub(text, pattern .. reflexive .. "%f[^%a\204\129\204\128]", repl)
text = rsub(text, pattern .. reflexive .. "%f[^%a" .. AC .. GR .. TEMP_G .. "]", repl)
-- handle сегодня
-- handle сегодня
text = rsub(text, "%f[%a\204\129\204\128]([Сс]е)г(о[\204\129\204\128]?дня)%f[^%a\204\129\204\128]", "%1в%2")
text = rsub(text, "%f[%a" .. AC .. GR .. "]([Сс]е)г(о[" .. AC .. GR .. "]?дня)%f[^%a" .. AC .. GR .. "]", "%1в%2")
-- handle сегодняшн-
-- handle сегодняшн-
text = rsub(text, "%f[%a\204\129\204\128]([Сс]е)г(о[\204\129\204\128]?дняшн)", "%1в%2")
text = rsub(text, "%f[%a" .. AC .. GR .. "]([Сс]е)г(о[" .. AC .. GR .. "]?дняшн)", "%1в%2")
-- replace TEMP_G with g; must be done after the -go -> -vo changes
-- replace TEMP_G with g; must be done after the -go -> -vo changes
text = rsub(text, TEMP_G, "г")
text = rsub(text, TEMP_G, "г")
Line 199: Line 158:
local ch2sh = {["ч"] = "ш", ["Ч"] = "Ш"}
local ch2sh = {["ч"] = "ш", ["Ч"] = "Ш"}
-- Handle что
-- Handle что
text = rsub(text, "%f[%a\204\129\204\128]([Чч])(то[\204\129\204\128]?)%f[^%a\204\129\204\128]",
text = rsub(text, "%f[%a" .. AC .. GR .. "]([Чч])(то[" .. AC .. GR .. "]?)%f[^%a" .. AC .. GR .. "]",
function(ch, to) return ch2sh[ch] .. to end)
function(ch, to) return ch2sh[ch] .. to end)
-- Handle чтобы, чтоб
-- Handle чтобы, чтоб
text = rsub(text, "%f[%a\204\129\204\128]([Чч])(то[\204\129\204\128]?бы?)%f[^%a\204\129\204\128]",
text = rsub(text, "%f[%a" .. AC .. GR .. "]([Чч])(то[" .. AC .. GR .. "]?бы?)%f[^%a" .. AC .. GR .. "]",
function(ch, to) return ch2sh[ch] .. to end)
function(ch, to) return ch2sh[ch] .. to end)
-- Handle ничто
-- Handle ничто
text = rsub(text, "%f[%a\204\129\204\128]([Нн]и)ч(то[\204\129\204\128]?)%f[^%a\204\129\204\128]", "%1ш%2")
text = rsub(text, "%f[%a" .. AC .. GR .. "]([Нн]и)ч(то[" .. AC .. GR .. "]?)%f[^%a" .. AC .. GR .. "]", "%1ш%2")
end
end


-- Handle мягкий, лёгкий, легчать, etc.
-- Handle мягкий, лёгкий, легчать, etc.
text = rsub(text, "([МмЛл][яеё][\204\129\204\128]?)г([кч])", "%1х%2")
text = rsub(text, "([МмЛл][яеё][" .. AC .. GR .. "]?)г([кч])", "%1х%2")


return origtext, text
return origtext, text
end
end


-- Transliterate after the pronunciation-related transformations of
do
-- export.apply_tr_fixes() have been applied. Called from {{ru-IPA}}.
-- If и, check if it's actually й to avoid wrongly treating it as a vowel.
-- INCLUDE_MONOSYLLABIC_JO_ACCENT is as in export.tr().
local function handle_short_i(word, ch, i, adjust)
function export.tr_after_fixes(text, include_monosyllabic_jo_accent)
if (ch == "и" or ch == "И") and word[i] == BR then
-- Remove word-final hard sign, either utterance-finally or followed by
-- Remove the breve; only used by get_next_char.
-- a non-letter character such as space, comma, period, hyphen, etc.
if adjust then
text = rsub(text, "[Ъъ]$", "")
remove(word, i)
text = rsub(text, "[Ъъ]([%A])", "%1")
end
 
ch = toNFC(ch .. BR)
-- Convert apostrophe to the hard sign between consonant and vowel (i.e.
word[i - 1] = ch
-- in the places where the hard sign normally occurs in modern text).
end
-- Apostrophe is sometimes used to indicate the hard sign; this may have
return ch
-- originated from the forcible removal of the hard sign from printing
end
-- offices in the 1920's, after the implementation of the Russian
-- orthography reform. The if-statement is an optimization; see below.
local function get_prev_char(word, i)
if rfind(text, "'") then
local j, ch = 0
text = rsub(text, "(" .. consonants .. ")'([" .. lowercase_vowels .. "])", "%1ъ%2")
repeat
text = rsub(text, "(" .. consonants .. ")'([" .. uppercase_vowels .. "])", "%1Ъ%2")
j = j + 1
ch = word[i - j]
until not (ch and (DIACRITICS .. "()’"):find(ch, nil, true))
return handle_short_i(word, ch, i - j + 1)
end
-- Find the character that follows position `i` in `word` (a table of
-- characters), stepping over any parentheses in between. The character found
-- and the index just after it are handed to handle_short_i with `adjust` set,
-- and whatever handle_short_i returns is returned.
local function get_next_char(word, i)
	local pos = i + 1
	local ch = word[pos]
	-- Parentheses are transparent here: keep advancing until something else
	-- (possibly nil, at the end of the word) is reached.
	while ch == "(" or ch == ")" do
		pos = pos + 1
		ch = word[pos]
	end
	return handle_short_i(word, ch, pos + 1, true)
end
-- Check if a vowel should be made "plain" (usually by removing the "j"
-- in the transliteration).
-- `this` is the current character, `prev` the preceding character (may be
-- nil), `check` a string of characters to look for, and `in_check` the value
-- to return when `prev` occurs in `check`.
-- Returns `in_check` if `prev` is in the string `check`, and `not in_check`
-- if it isn't. If `prev` is nil, or if `this` and `prev` are both uppercase,
-- returns nil (on the assumption the term is an initialism).
-- Note: We check both because of terms like Романо-д’Эццелино and
-- Комон-л’Эванте, where an uppercase `this` follows a lowercase `prev`
-- (since the apostrophe is ignored).
local function check_plain(this, prev, check, in_check)
	if prev and (this == this:ulower() or prev == prev:ulower()) then
		-- Plain substring search: `prev` must never be interpreted as a Lua
		-- pattern, because it can be a magic character such as "[" (square
		-- brackets count as word characters), which would otherwise raise a
		-- "malformed pattern" error.
		if check:find(prev, 1, true) then
			return in_check
		end
		return not in_check
	end
end
-- Special-case handling for the jo letters (ё, ѣ̈, я̈).
-- If `this` has an entry in `jo_letters`, its transliteration is appended to
-- `output` and true is returned; otherwise nothing is appended and nothing is
-- returned.
local function is_jo_letter(this, prev, output, word, d)
	local translit = jo_letters[this]
	if translit then
		-- After a hushing consonant (ж ч ш щ) the leading "j" is dropped.
		if check_plain(this, prev, "жчшщЖЧШЩ", true) then
			translit = translit:sub(2)
			-- When `this` is its own uppercase form, uppercase what is left
			-- after the first character has been removed.
			if this:uupper() == this then
				translit = translit:uupper()
			end
		end
		insert(output, translit)
		-- Remember where this jo landed in `output`, so it can be given an
		-- implicit primary stress later if necessary (unless it already has
		-- secondary stress; it shouldn't ever come after primary stress, but
		-- just in case it does we shouldn't override it or give the jo two
		-- stress marks).
		if word[d.i + 1] ~= GR then
			d.final_jo = #output
		end
		return true
	end
end
end
 
-- the if-statement below isn't necessary but may speed things up,
local function do_iteration(output, word, d)
-- particularly when include_monosyllabic_jo_accent isn't set, in that
-- Get current, previous and next characters, skipping over brackets, and
-- in the majority of cases where ё doesn't occur, we avoid a pattern find
-- ignoring diacritics for the previous character (which simplifies checks).
-- (in is_monosyllabic()) and three pattern subs. The translit module needs
local this = word[d.i]
-- to be as fast as possible since it may be called hundreds or
local prev = get_prev_char(word, d.i)
-- thousands of times on some pages.
local nxt = get_next_char(word, d.i)
if rfind(text, "[Ёё]") then
-- A word is monosyllabic if it has only one vowel.
-- We need to special-case ё after a "hushing" consonant, which becomes
if vowels:find(this, 1, true) then
-- ó (or o), without j. We also need special cases for monosyllabic ё
d.vowels = d.vowels + 1
-- when INCLUDE_MONOSYLLABIC_JO_ACCENT isn't set, so we don't add the
end
-- accent mark that we would otherwise include.
if nxt == DI then
if not include_monosyllabic_jo_accent and is_monosyllabic(text) and not rfind(text, "^%-") then
d.i = d.i + 1
text = rsub(text, "([жшчщЖШЧЩ])ё","%1o")
this = toNFC(this .. DI)
text = text:gsub("ё", "jo")
if is_jo_letter(this, prev, output, word, d) then
text = text:gsub("Ё", "Jo")
return
else
end
-- in a nonmonosyllabic word where there is a later stressed vowel
elseif nxt == BR then
-- in the same word (e.g. трёхэта́жный), don't put an accent mark on ё.
d.i = d.i + 1
text = rsub(text, "ё([^ %-]-[" .. vowels .. ")", "jo%1")
this = toNFC(this .. BR)
-- same goes if there are two ё's in a word (трёхколёсный, четырёхзвёздный)
-- Note that explicit stress has been found, which prevents any
text = rsub(text, "ё([^ %-]-ё)", "jo%1")
-- implicit stress from being added for jos.
-- same goes if this is a prefix (четырёх-)
elseif this == AC then
text = rsub(text, "ё([^ %-]-%-)$", "jo%1")
d.primary = true
-- same goes if there is a stressed vowel *followed* by ё (this is
-- After a lowercase consonant or at the start of a suffix, е becomes
-- quite rare but occurs e.g. in А́ндзё "Anjō (city in Japan)"
-- e, ѣ becomes ě and э becomes ɛ; after й, this only applies to э.
text = rsub(text, "([" .. vowels .. "]́[^ %-]-)ё", "%1jo")
elseif plain_e[this] and (
-- handle hushing consonant + ё + another accented vowel (e.g. шёлкопряди́льня)
check_plain(this, prev, vowels .. "ʹʺъЪьЬ" .. ((this == "э" or this == "Э") and "" or "йЙ"), false) or
-- (already partly converted by previous regexes)
not prev and d.dash_before
text = rsub(text, "([жшчщЖШЧЩ])j","%1")
) then
-- handle remaining cases of hushing consonant + ё
insert(output, plain_e[this])
text = rsub(text, "([жшчщЖШЧЩ])ё","%1ó")
return
-- conversion of remaining ё will occur as a result of 'tab'.
-- ю becomes u if preceded by ж or ш.
elseif (
(this == "ю" or this == "Ю") and
check_plain(this, prev, "жшЖШ", true)
) then
insert(output, this == "ю" and "u" or "U")
return
-- Make lowercase izhitsa display as -v- after /a/, /e/ and /i/
-- (matching the equivalent Greek digraphs αυ, ευ and ηυ).
elseif (
this == "ѵ" and
prev and ("аеиіѣэяѥaæɐeəɛiɪɨАЕИІѢЭЯѤAEƐI"):find(prev, 1, true)
) then
this = "в"
word[d.i] = "в"
-- Ignore word-final hard signs.
elseif (this == "ъ" or this == "Ъ") and d.i == #word then
return
end
end
insert(output, letters[this] or this)
end
end


-- ю after ж and ш becomes u (e.g. брошюра, жюри)
-- Transliterate after the pronunciation-related transformations of
text = rsub(text, "([жшЖШ])ю","%1u")
-- export.apply_tr_fixes() have been applied. Called from {{ru-IPA}}.
 
-- `jo_accent` is as in export.tr().
-- the if-statement below isn't necessary but may speed things up in that
function export.tr_after_fixes(text, jo_accent)
-- in the majority of cases where the letters below don't occur, we avoid
-- normalize any aliases
-- six pattern subs.
text = toNFC(text:gsub(".[\128-\191]*", aliases))
if rfind(text, "[ЕеѢѣЭэ]") then
local output = {}
-- е after a dash at the beginning of a word becomes e, and э becomes ɛ
-- (like after a consonant)
-- Note: We use ustring gsub because ustring gmatch is bugged, and
text = rsub(text, "^(%-)([ЕеѢѣЭэ])", map_to_plain_e)
-- it's easy to make gsub do the same thing.
text = rsub(text, "(%s%-)([ЕеѢѣЭэ])", map_to_plain_e)
rsub(text, "([^" .. word_chars .. "]*)([" .. word_chars .. "]*)", function(before, word)
-- don't get confused by single quote or parens between consonant and е;
for _, ch in ipairs(explode(before)) do
-- e.g. Б'''ез''', американ(ец)
insert(output, ch)
text = rsub(text, "(" .. consonants .. "['%(%)]*)([ЕеѢѣЭэ])", map_to_plain_e)
end
 
-- FIXME: Do this in one loop instead of splitting by word.
-- This is now the default
word = explode(toNFD(word))
-- е after a vowel or at the beginning of a word becomes je, and э becomes e
local d = {
-- text = rsub(text, "^([ЕеѢѣЭэ])", map_to_je)
i = 0,
-- text = rsub(text, "(" .. non_consonants .. ")([ЕеѢѣЭэ])", map_to_je)
vowels = 0
-- -- need to do it twice in case of sequences of such vowels
}
-- text = rsub(text, "^([ЕеѢѣЭэ])", map_to_je)
-- Prefix if it's preceded by "^-" or " -".
-- text = rsub(text, "(" .. non_consonants .. ")([ЕеѢѣЭэ])", map_to_je)
if output[#output] == "-" then
local prev = output[#output - 1]
if not prev or rfind(prev, "%s") then
d.dash_before = true
end
end
while d.i < #word do
d.i = d.i + 1
do_iteration(output, word, d)
end
-- Add an implicit primary stress to a jo (if applicable).
-- Jos do not implicitly take stress accents if an explicit primary
-- stress is given. Otherwise, the final jo which doesn't have
-- secondary stress takes primary stress.
-- Prefixes do not take implicit primary stress.
-- Primary stress will be shown on monosyllables if either they
-- are a suffix or `jo_accent` is "mono".
if (
jo_accent ~= "none" and
d.final_jo and
(not (d.primary or word[#word] == "-")) and
(jo_accent == "mono" or d.vowels > 1 or d.dash_before)
) then
output[d.final_jo] = output[d.final_jo] .. AC
end
end)
return toNFC(concat(output))
end
end
text = (rsub(text,'.',tab))
return text
end
end


Line 301: Line 354:
-- ё is a special case: it is rendered (j)ó in multisyllabic words and
-- ё is a special case: it is rendered (j)ó in multisyllabic words and
-- monosyllabic words in multi-word phrases, but rendered (j)o without an
-- monosyllabic words in multi-word phrases, but rendered (j)o without an
-- accent in isolated monosyllabic words, unless INCLUDE_MONOSYLLABIC_JO_ACCENT
-- accent in isolated monosyllabic words. This can be overridden with the
-- is specified. (This is used in conjugation and declension tables.)
-- JO_ACCENT parameter: if set to "mono", monosyllabic words will also be
-- given as (j)ó (this is used in conjugation and declension tables); if set
-- to "none", it will always be rendered (j)o.
-- NOADJ disables special-casing for adjectives in -го, while FORCEADJ forces
-- NOADJ disables special-casing for adjectives in -го, while FORCEADJ forces
-- special-casing for adjectives and disables checking for exceptions
-- special-casing for adjectives and disables checking for exceptions
-- (e.g. много). NOSHTO disables special-casing for что and related words.
-- (e.g. много). NOSHTO disables special-casing for что and related words.
function export.tr(text, lang, sc, include_monosyllabic_jo_accent, noadj, noshto, forceadj)
-- As a special case, if `lang` is a language other than "ru", then none of
local origtext, subbed_text = export.apply_tr_fixes(text, noadj, noshto, forceadj)
-- the special transformations are applied, and JO_ACCENT is set to "none".
return export.tr_after_fixes(subbed_text, include_monosyllabic_jo_accent)
-- This is for situations which require Russian transcriptions of Cyrillic,
-- but where the special cases don't make sense (e.g. the Cyrillization of
-- Mandarin, or pidgins such as Russenorsk).
function export.tr(text, lang, sc, jo_accent, noadj, noshto, forceadj)
if (ine(lang) or "ru") ~= "ru" then
return export.tr_after_fixes(text, "none")
end
return export.tr_after_fixes(
select(2, export.apply_tr_fixes(text, noadj, noshto, forceadj)),
jo_accent
)
end
end


Line 318: Line 383:
-- transformations are applied and before translit. It is of the form
-- transformations are applied and before translit. It is of the form
-- FROM/TO,FROM/TO,...
-- FROM/TO,FROM/TO,...
function export.tr_sub(text, include_monosyllabic_jo_accent, noadj, noshto, sub,
function export.tr_sub(text, jo_accent, noadj, noshto, sub,
forceadj)
forceadj)
if type(text) == 'table' then -- called directly from a template
if type(text) == "table" then -- called directly from a template
include_monosyllabic_jo_accent = ine(text.args.include_monosyllabic_jo_accent)
jo_accent = ine(text.args.jo_accent)
noadj = ine(text.args.noadj)
noadj = ine(text.args.noadj)
noshto = ine(text.args.noshto)
noshto = ine(text.args.noshto)
Line 336: Line 401:
end
end


return export.tr(text, nil, nil, include_monosyllabic_jo_accent, noadj, noshto, forceadj)
return export.tr(text, nil, nil, jo_accent, noadj, noshto, forceadj)
end
end


--for adjectives, pronouns
--for adjectives, pronouns
function export.tr_adj(text, include_monosyllabic_jo_accent)
function export.tr_adj(text, jo_accent)
if type(text) == 'table' then -- called directly from a template
if type(text) == "table" then -- called directly from a template
include_monosyllabic_jo_accent = ine(text.args.include_monosyllabic_jo_accent)
jo_accent = ine(text.args.jo_accent)
text = text.args[1]
text = text.args[1]
end
end
Line 349: Line 414:
-- from the noun or adjective modules, it's called with suffix ого, which
-- from the noun or adjective modules, it's called with suffix ого, which
-- would otherwise trigger the exceptional case and be transliterated as ogo
-- would otherwise trigger the exceptional case and be transliterated as ogo
return export.tr(text, nil, nil, include_monosyllabic_jo_accent, false,
return export.tr(text, nil, nil, jo_accent, false,
"noshto", "forceadj")
"noshto", "forceadj")
end
end


return export
return export
-- For Vim, so we get 4-space tabs
-- vim: set ts=4 sw=4 noet:

Revision as of 09:03, 8 January 2025


This module will transliterate Russian language text. It is also used to transliterate Russenorsk, Solombala English, and Taimyr Pidgin Russian. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:ru-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

local export = {}

--[=[

FIXME:

1. (DONE) If you write '''Б'''ез, it transliterates to '''B'''jez instead of
   '''B'''ez, as it should.
   -- NOTE: This currently doesn't work due to an issue in [[Module:languages]]
   -- which means this module won't see style apostrophes.
2. (DONE) Convert ъ to nothing before comma or other non-letter particle, e.g.
   in Однимъ словомъ, идешь на чтеніе.
3. (DONE) Make special-casing for adjectives in -го and for что (and friends)
    be the default, and implement transformations in Cyrillic rather than after
    translit so that we can display the transformed Cyrillic in the
    "phonetic respelling" notation of {{ru-IPA}}.
]=]

local m_str_utils = require("Module:string utilities")

local decompose = require("Module:ru-common").decompose
local explode = m_str_utils.explode_utf8
local concat = table.concat
local insert = table.insert
local ipairs = ipairs
local remove = table.remove
local rfind = m_str_utils.find
local rsub = m_str_utils.gsub
local rsplit = m_str_utils.split
local select = select
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local u = m_str_utils.char

-- Combining marks used throughout this module.
local AC = u(0x301) -- combining acute accent (U+0301); marks explicit primary stress
local GR = u(0x0300) -- combining grave accent (U+0300); checked as secondary stress on jo letters
local BR = u(0x0306) -- combining breve (U+0306); и + breve is recomposed to й
local DI = u(0x0308) -- combining diaeresis (U+0308); forms the jo letters (ё etc.)
-- Every combining mark that may occur inside a word. These are treated as
-- word characters and are skipped when looking for the previous letter.
local DIACRITICS = AC .. GR .. BR .. DI ..
	u(0x0302) .. -- combining circumflex
	u(0x0304) .. -- combining macron
	u(0x0307) .. -- combining dot above
	u(0x030A) .. -- combining ring above
	u(0x030C) .. -- combining caron
	u(0x030F) .. -- combining double grave
	u(0x0323) .. -- combining dot below
	u(0x0328)    -- combining ogonek
-- Placeholder substituted for г in exception words so that the г -> в rule
-- does not apply to them; replaced by г again at the end of that step.
local TEMP_G = u(0xFFF1)
-- Contents of the character class that defines a "word" when the text is
-- split up for transliteration: letters, the right single quotation mark,
-- round and square brackets, and the combining marks above.
local word_chars = "%a’%(%)%[%]" .. DIACRITICS

-- "If not empty": returns `x` unchanged unless it is the empty string or a
-- false value, in which case nil is returned.
local function ine(x)
	if not x or x == "" then
		return nil
	end
	return x
end

-- Main letter conversion table.
local letters = {
	["а"] = "a", ["б"] = "b", ["в"] = "v", ["г"] = "g", ["д"] = "d", ["е"] = "je", ["ж"] = "ž", ["з"] = "z", ["и"] = "i", ["й"] = "j", ["к"] = "k", ["л"] = "l", ["м"] = "m", ["н"] = "n", ["о"] = "o", ["п"] = "p", ["р"] = "r", ["с"] = "s", ["т"] = "t", ["у"] = "u", ["ф"] = "f", ["х"] = "x", ["ц"] = "c", ["ч"] = "č", ["ш"] = "š", ["щ"] = "šč", ["ъ"] = "ʺ", ["ы"] = "y", ["ь"] = "ʹ", ["э"] = "e", ["ю"] = "ju", ["я"] = "ja",
	["А"] = "A", ["Б"] = "B", ["В"] = "V", ["Г"] = "G", ["Д"] = "D", ["Е"] = "Je", ["Ж"] = "Ž", ["З"] = "Z", ["И"] = "I", ["Й"] = "J", ["К"] = "K", ["Л"] = "L", ["М"] = "M", ["Н"] = "N", ["О"] = "O", ["П"] = "P", ["Р"] = "R", ["С"] = "S", ["Т"] = "T", ["У"] = "U", ["Ф"] = "F", ["Х"] = "X", ["Ц"] = "C", ["Ч"] = "Č", ["Ш"] = "Š", ["Щ"] = "Šč", ["Ъ"] = "ʺ", ["Ы"] = "Y", ["Ь"] = "ʹ", ["Э"] = "E", ["Ю"] = "Ju", ["Я"] = "Ja",
	-- Russian style quotes
	["«"] = "“", ["»"] = "”",
	-- archaic, pre-1918 letters
	["і"] = "i", ["ѳ"] = "f", ["ѣ"] = "jě", ["ѵ"] = "i",
	["І"] = "I", ["Ѳ"] = "F", ["Ѣ"] = "Jě", ["Ѵ"] = "I",
	-- archaic, pre-1708 letters (most of these are covered by aliases below)
	["ѥ"] = "je", ["ѯ"] = "ks", ["ѱ"] = "ps",
	["Ѥ"] = "Je", ["Ѯ"] = "Ks", ["Ѱ"] = "Ps",
}

-- Treat most archaic letters as aliases. Exceptions:
-- ѥ is not the same as е, because it doesn't lose iotation after a consonant.
-- ѯ and ѱ can't be treated as aliases, because mapping 1 character to 2 can
-- cause the logic which checks the capitalization of adjacent letters to
-- become unreliable. This only affects the uppercase forms, but the lowercase
-- forms are also excepted for consistency.
local aliases = {
	["є"] = "е", ["ꙁ"] = "з", ["ꙃ"] = "з", ["ѕ"] = "з", ["ї"] = "і", ["ꙋ"] = "у", ["ѡ"] = "о", ["ѿ"] = "о", ["ꙑ"] = "ы", ["ꙗ"] = "я", ["ѧ"] = "я", ["ѫ"] = "у", ["ѩ"] = "я", ["ѭ"] = "ю",
	["Є"] = "Е", ["Ꙁ"] = "З", ["Ꙃ"] = "З", ["Ѕ"] = "З", ["Ї"] = "І", ["Ꙋ"] = "У", ["Ѡ"] = "О", ["Ѿ"] = "О", ["Ꙑ"] = "Ы", ["Ꙗ"] = "Я", ["Ѧ"] = "Я", ["Ѫ"] = "У", ["Ѩ"] = "Я", ["Ѭ"] = "Ю", ["'"] = "’"
}

-- Non-iotated ("plain") transliterations of the e-type letters, used instead
-- of the entries in `letters` when the preceding character calls for it
-- (see do_iteration).
local plain_e = {
	["е"] = "e", ["ѣ"] = "ě", ["э"] = "ɛ",
	["Е"] = "E", ["Ѣ"] = "Ě", ["Э"] = "Ɛ"
}

-- Transliterations of the "jo" letters, i.e. base letter + diaeresis. These
-- are handled separately from `letters` because they may lose their initial
-- "j" and may receive an implicit stress mark (see is_jo_letter).
local jo_letters = {
	["ё"] = "jo", ["ѣ̈"] = "jǒ", ["я̈"] = "jǫ",
	["Ё"] = "Jo", ["Ѣ̈"] = "Jǒ", ["Я̈"] = "Jǫ"
}

-- Characters counted as vowels, both for the syllable count of a word and
-- for deciding whether a following e-type letter keeps its "j". Searched
-- with plain string.find, so this is a flat list, not a pattern.
local vowels = "аеиіоуыѣэюяѥѵaæɐeəɛiɪɨoɵuyʊʉАЕИІОУЫѢЭЮЯѤѴAEƐIOUY"

-- Apply transformations to the Cyrillic to more closely match pronunciation.
-- Returns two values: the "original" text (after alias normalization and the
-- decompose() call, but before any respelling) and the transformed text. If
-- the two are different, {{ru-IPA}} should display a "phonetic respelling"
-- notation.
-- NOADJ disables special-casing for adjectives in -го, while FORCEADJ forces
-- special-casing for adjectives, including those in -аго (pre-reform spelling)
-- and disables checking for exceptions (e.g. много, ого). NOSHTO disables
-- special-casing for что and related words.
-- Note that the exception list only matches lowercase spellings whose first
-- letter may optionally be capitalized.
function export.apply_tr_fixes(text, noadj, noshto, forceadj)
	-- normalize any aliases
	text = text:gsub(".[\128-\191]*", aliases)
	-- decompose stress accents without decomposing letters we want to treat
	-- as units (e.g. й or ё)
	text = decompose(text)

	local origtext = text
	-- The byte-level search below is only an optimization: it skips the
	-- (expensive) Unicode-aware substitutions when the text contains no г/Г
	-- directly followed by о/О. It is deliberately case-insensitive, because
	-- the substitutions below accept [гГ][оО]; a lowercase-only check would
	-- make the result for e.g. "ЕГО" depend on whether some unrelated
	-- lowercase "го" happens to occur elsewhere in the text.
	-- UTF-8: г = \208\179, Г = \208\147, о = \208\190, О = \208\158.
	if not noadj and text:find("\208[\147\179]\208[\158\190]") then
		local v = {["г"] = "в", ["Г"] = "В"}
		local repl = function(e, g, o, sja) return e .. v[g] .. o .. (sja or "") end
		-- Handle какого-нибудь/-либо/-то; must be done first because of an exception
		-- made for бого-, снего-, etc.
		text = rsub(text, "([кКтТ][аА][кК][оеОЕ" .. (forceadj and "аА" or "") .. "][" .. AC .. GR .. "]?)([гГ])([оО]%-)", repl)
		if not forceadj then
			-- Protect one exception word: turn `case` into a pattern that
			-- matches it as a whole word (or as a prefix, if it ends in "-"),
			-- with every stress mark made optional, and replace its final г
			-- with TEMP_G so the general rule below leaves it alone.
			-- In the byte patterns, \204\129 is the acute accent and \204\128
			-- the grave accent.
			local function go(text, case)
				local pattern = rsub(case, "^(.)(.*)(го[" .. AC .. GR .. "]?)(%-?)$", function(m1, m2, m3, m4)
					m1 = "%f[%a" .. AC .. GR .. "]([" .. m1:uupper() .. m1 .. "]"
					m2 = m2:gsub("\204[\128\129]", "[" .. AC .. GR .. "]?") .. ")"
					m3 = m3:gsub("\204[\128\129]", "[" .. AC .. GR .. "]?")
						:gsub("^г(.*)", "г(%1")
					m4 = m4 == "-" and "%-)" or ")%f[^%a" .. AC .. GR .. "]"
					return m1 .. m2 .. m3 .. m4
				end)
				return rsub(text, pattern, "%1" .. TEMP_G .. "%2")
			end
			for _, case in ipairs{"мно́го", "н[еа]мно́го", "до́рого", "недо́рого", "стро́го", "нестро́го", "на́строго", "убо́го", "пол[ао]́го"} do
				text = go(text, case)
			end
			-- check for neuter short forms of compound adjectives in -но́гий
			if rfind(text, "но[" .. AC .. GR .. "]?го%f[^%a" .. AC .. GR .. "]") then
				for _, case in ipairs{"безно́го", "босоно́го", "веслоно́го", "длинноно́го", "двуно́го", "коротконо́го", "кривоно́го", "одноно́го", "пятино́го", "трёхно́го", "трехно́го", "хромоно́го", "четвероно́го", "шестино́го"} do
					text = go(text, case)
				end
			end
			for _, case in ipairs{"ого́", "го́го", "ваго́го", "ло́го", "п[ео]́го", "со́го", "То́го", "ле́го", "игого́", "огого́", "альбиньязего", "д[иі]е́го", "бо́лого", "гр[иі]е́го", "манче́го", "пичис[иі]е́го", "тенкодого", "хио́го", "аго-", "его-", "ого-"} do
				text = go(text, case)
			end
		end
		-- Handle genitive/accusative endings, which are spelled -ого/-его/-аго
		-- (-ogo/-ego/-ago) but transliterated -ovo/-evo/-avo; only for adjectives
		-- and pronouns, excluding words like много, ого (-аго occurs in
		-- pre-reform spelling and is only included with FORCEADJ).
		local pattern = "([оеОЕ" .. (forceadj and "аА" or "") .. "][" .. AC .. GR .. "]?)([гГ])([оО][" .. AC .. GR .. "]?)"
		local reflexive = "([сС][яЯ][" .. AC .. GR .. "]?)"
		text = rsub(text, pattern .. "%f[^%a" .. AC .. GR .. TEMP_G .. "]", repl)
		text = rsub(text, pattern .. reflexive .. "%f[^%a" .. AC .. GR .. TEMP_G .. "]", repl)
		-- handle сегодня
		text = rsub(text, "%f[%a" .. AC .. GR .. "]([Сс]е)г(о[" .. AC .. GR .. "]?дня)%f[^%a" .. AC .. GR .. "]", "%1в%2")
		-- handle сегодняшн-
		text = rsub(text, "%f[%a" .. AC .. GR .. "]([Сс]е)г(о[" .. AC .. GR .. "]?дняшн)", "%1в%2")
		-- replace TEMP_G with g; must be done after the -go -> -vo changes
		text = rsub(text, TEMP_G, "г")
	end

	-- As above, the byte-level search is only an optimization; the patterns
	-- below all require a lowercase "то".
	if not noshto and text:find("то") then
		local ch2sh = {["ч"] = "ш", ["Ч"] = "Ш"}
		-- Handle что
		text = rsub(text, "%f[%a" .. AC .. GR .. "]([Чч])(то[" .. AC .. GR .. "]?)%f[^%a" .. AC .. GR .. "]",
			function(ch, to) return ch2sh[ch] .. to end)
		-- Handle чтобы, чтоб
		text = rsub(text, "%f[%a" .. AC .. GR .. "]([Чч])(то[" .. AC .. GR .. "]?бы?)%f[^%a" .. AC .. GR .. "]",
			function(ch, to) return ch2sh[ch] .. to end)
		-- Handle ничто
		text = rsub(text, "%f[%a" .. AC .. GR .. "]([Нн]и)ч(то[" .. AC .. GR .. "]?)%f[^%a" .. AC .. GR .. "]", "%1ш%2")
	end

	-- Handle мягкий, лёгкий, легчать, etc.
	text = rsub(text, "([МмЛл][яеё][" .. AC .. GR .. "]?)г([кч])", "%1х%2")

	return origtext, text
end

do
	-- Recognize a decomposed й. If `ch` is и/И and `word[i]` is a breve, the
	-- composed letter is written back to `word[i - 1]` and returned, so that
	-- callers do not mistake it for the vowel и. With `adjust` set, the breve
	-- at `word[i]` is also removed from `word` (only get_next_char does this).
	-- In every other case `ch` is returned unchanged.
	local function handle_short_i(word, ch, i, adjust)
		local is_i = ch == "и" or ch == "И"
		if not (is_i and word[i] == BR) then
			return ch
		end
		if adjust then
			remove(word, i)
		end
		local short_i = toNFC(ch .. BR)
		word[i - 1] = short_i
		return short_i
	end
	
	-- Return the nearest character before position `i` in `word` that is not
	-- a combining mark, a round bracket or "’"; nil if there is none. A
	-- decomposed й is returned (and stored) in composed form.
	local function get_prev_char(word, i)
		local skippable = DIACRITICS .. "()’"
		local pos = i - 1
		local ch = word[pos]
		while ch and skippable:find(ch, nil, true) do
			pos = pos - 1
			ch = word[pos]
		end
		return handle_short_i(word, ch, pos + 1)
	end
	
	-- Return the nearest character after position `i` in `word` that is not
	-- a round bracket; nil if there is none. If that character is и followed
	-- by a breve, the breve is removed from `word` and й is returned.
	local function get_next_char(word, i)
		local pos = i + 1
		local ch = word[pos]
		while ch == "(" or ch == ")" do
			pos = pos + 1
			ch = word[pos]
		end
		return handle_short_i(word, ch, pos + 1, true)
	end
	
	-- Check if a vowel should be made "plain" (usually by removing the "j"
	-- in the transliteration), based on the preceding character.
	--   this:     the current character
	--   prev:     the preceding character, or nil if there is none
	--   check:    a flat list of characters (NOT a pattern) to look `prev` up in
	--   in_check: the value to return if `prev` occurs in `check`
	-- Returns `in_check` if `prev` occurs in `check` and `not in_check` if it
	-- does not. Returns nothing (i.e. a false value) if there is no `prev`, or
	-- if `this` and `prev` are both uppercase (on the assumption the term is
	-- an initialism).
	-- Note: We check both because of terms like Романо-д’Эццелино and
	-- Комон-л’Эванте, where an uppercase `this` follows a lowercase `prev`,
	-- (since the apostrophe is ignored).
	local function check_plain(this, prev, check, in_check)
		if prev and (this == this:ulower() or prev == prev:ulower()) then
			-- Plain search: `prev` can be any word character, including "[",
			-- so it must never be interpreted as a Lua pattern (string.match
			-- has no plain mode and would raise "malformed pattern").
			if check:find(prev, 1, true) then
				return in_check
			end
			return not in_check
		end
	end
	
	-- Special-case handling of the jo letters (ё, ѣ̈, я̈). If `this` is one of
	-- them, its transliteration is appended to `output` and true is returned;
	-- otherwise nothing is appended and nothing is returned.
	local function is_jo_letter(this, prev, output, word, d)
		local translit = jo_letters[this]
		if not translit then
			return
		end
		-- After a hushing consonant (ж ч ш щ) the leading "j" is dropped; an
		-- uppercase letter then needs its remaining vowel uppercased again.
		if check_plain(this, prev, "жчшщЖЧШЩ", true) then
			translit = translit:sub(2)
			if this == this:uupper() then
				translit = translit:uupper()
			end
		end
		local slot = #output + 1
		output[slot] = translit
		-- Remember where this jo went, so that it can be given an implicit
		-- primary stress later if necessary. A jo that already carries
		-- secondary stress is not recorded: it should never follow a primary
		-- stress, but if it does we must neither override that stress nor
		-- give the jo two stress marks.
		if word[d.i + 1] ~= GR then
			d.final_jo = slot
		end
		return true
	end
	
	-- Transliterate the character at `word[d.i]`, appending the result (if
	-- any) to `output`.
	--   output: list of output pieces, appended to in place
	--   word:   list of the (decomposed) characters of the current word; may
	--           be modified in place
	--   d:      per-word state: `i` (current index, advanced here when a
	--           following diacritic is consumed), `vowels` (vowel count),
	--           `primary` (explicit primary stress seen), `dash_before`
	--           (read only) and `final_jo` (set by is_jo_letter)
	-- The branches below are mutually exclusive and order-dependent.
	local function do_iteration(output, word, d)
		-- Get current, previous and next characters, skipping over brackets, and
		-- ignoring diacritics for the previous character (which simplifies checks).
		local this = word[d.i]
		local prev = get_prev_char(word, d.i)
		local nxt = get_next_char(word, d.i)
		-- A word is monosyllabic if it has only one vowel.
		-- NOTE(review): this runs before `this` is recomposed below, so an и
		-- that is still followed by a separate breve is counted as a vowel
		-- here; confirm whether that is intended.
		if vowels:find(this, 1, true) then
			d.vowels = d.vowels + 1
		end
		if nxt == DI then
			-- Consume the diaeresis and recompose; jo letters are then handled
			-- entirely by is_jo_letter, anything else falls through to the
			-- generic lookup at the bottom.
			d.i = d.i + 1
			this = toNFC(this .. DI)
			if is_jo_letter(this, prev, output, word, d) then
				return
			end
		elseif nxt == BR then
			-- Consume the breve and recompose (e.g. и + breve -> й).
			d.i = d.i + 1
			this = toNFC(this .. BR)
		elseif this == AC then
			-- Note that explicit stress has been found, which prevents any
			-- implicit stress from being added for jos.
			d.primary = true
		elseif plain_e[this] and (
			check_plain(this, prev, vowels .. "ʹʺъЪьЬ" .. ((this == "э" or this == "Э") and "" or "йЙ"), false) or
			not prev and d.dash_before
		) then
			-- After a lowercase consonant or at the start of a suffix, е becomes
			-- e, ѣ becomes ě and э becomes ɛ; after й, this only applies to э.
			insert(output, plain_e[this])
			return
		elseif (
			(this == "ю" or this == "Ю") and
			check_plain(this, prev, "жшЖШ", true)
		) then
			-- ю becomes u if preceded by ж or ш.
			insert(output, this == "ю" and "u" or "U")
			return
		elseif (
			this == "ѵ" and
			prev and ("аеиіѣэяѥaæɐeəɛiɪɨАЕИІѢЭЯѤAEƐI"):find(prev, 1, true)
		) then
			-- Make lowercase izhitsa display as -v- after /a/, /e/ and /i/
			-- (matching the equivalent Greek digraphs αυ, ευ and ηυ). The
			-- change is also written back to `word`.
			this = "в"
			word[d.i] = "в"
		elseif (this == "ъ" or this == "Ъ") and d.i == #word then
			-- Ignore word-final hard signs.
			return
		end
		-- Generic case: look the character up in the main table, passing
		-- through anything that has no entry.
		insert(output, letters[this] or this)
	end

	-- Transliterate text to which the pronunciation-related transformations
	-- of export.apply_tr_fixes() have already been applied. Called from
	-- {{ru-IPA}}.
	-- `jo_accent` controls the implicit primary stress on jo letters: "none"
	-- never adds one, "mono" also adds one in words with a single vowel, and
	-- any other value adds one only in words with more than one vowel or in
	-- suffixes (words directly preceded by a dash at the start of the text or
	-- after whitespace).
	function export.tr_after_fixes(text, jo_accent)
		-- normalize any aliases
		local normalized = text:gsub(".[\128-\191]*", aliases)
		text = toNFC(normalized)
		local output = {}

		-- Handle one run of non-word characters (`before`, copied through
		-- unchanged) followed by one word.
		local function process(before, word)
			for _, ch in ipairs(explode(before)) do
				insert(output, ch)
			end
			-- FIXME: Do this in one loop instead of splitting by word.
			word = explode(toNFD(word))
			local d = {i = 0, vowels = 0}
			-- The word is a suffix if it's preceded by "^-" or " -".
			if output[#output] == "-" then
				local before_dash = output[#output - 1]
				if not before_dash or rfind(before_dash, "%s") then
					d.dash_before = true
				end
			end
			-- The length is re-read on every pass, since characters may be
			-- removed from `word` while it is being processed.
			while d.i < #word do
				d.i = d.i + 1
				do_iteration(output, word, d)
			end
			-- Add an implicit primary stress to a jo (if applicable): the
			-- final jo without secondary stress takes it, but only if the
			-- word has no explicit primary stress and does not end in "-".
			local eligible = jo_accent ~= "none"
				and d.final_jo
				and not d.primary
				and word[#word] ~= "-"
			-- Monosyllables only show it if they are a suffix or `jo_accent`
			-- is "mono".
			local shown = jo_accent == "mono" or d.vowels > 1 or d.dash_before
			if eligible and shown then
				output[d.final_jo] = output[d.final_jo] .. AC
			end
		end

		-- Note: We use ustring gsub because ustring gmatch is bugged, and
		-- it's easy to make gsub do the same thing. The result is discarded.
		local chunk_pattern = "([^" .. word_chars .. "]*)([" .. word_chars .. "]*)"
		rsub(text, chunk_pattern, process)

		return toNFC(concat(output))
	end
end

-- Transliterates text, which should be a single word or phrase. It should
-- include stress marks, which are then preserved in the transliteration.
-- The text is first respelled by export.apply_tr_fixes() and then passed to
-- export.tr_after_fixes().
-- JO_ACCENT is handed on to export.tr_after_fixes(): by default ё is
-- rendered (j)ó in multisyllabic words and (j)o without an accent in
-- monosyllabic ones; if set to "mono", monosyllabic words are also given as
-- (j)ó (this is used in conjugation and declension tables); if set to
-- "none", it is always rendered (j)o.
-- NOADJ disables special-casing for adjectives in -го, while FORCEADJ forces
-- special-casing for adjectives and disables checking for exceptions
-- (e.g. много). NOSHTO disables special-casing for что and related words.
-- As a special case, if `lang` is a language other than "ru", then none of
-- the special transformations are applied, and JO_ACCENT is set to "none".
-- This is for situations which require Russian transcriptions of Cyrillic,
-- but where the special cases don't make sense (e.g. the Cyrillization of
-- Mandarin, or pidgins such as Russenorsk).
-- `sc` is accepted but not used.
function export.tr(text, lang, sc, jo_accent, noadj, noshto, forceadj)
	local code = ine(lang)
	if code ~= nil and code ~= "ru" then
		return export.tr_after_fixes(text, "none")
	end
	local _, respelled = export.apply_tr_fixes(text, noadj, noshto, forceadj)
	return export.tr_after_fixes(respelled, jo_accent)
end

-- Transliterate with optional special-case substitutions. NOADJ disables
-- special-casing for adjectives in -го, while FORCEADJ forces special-casing
-- for adjectives and disables checking for exceptions (e.g. много).
-- NOSHTO disables special-casing for что and related words. SUB implements
-- arbitrary substitutions in the Cyrillic text, applied before any other
-- transformation and before translit. It is of the form
-- FROM/TO,FROM/TO,...
-- When called directly from a template, `text` is a table and the values
-- are read from its `args` instead: jo_accent, noadj, noshto and sub by
-- name, and the text itself from the first positional argument.
function export.tr_sub(text, jo_accent, noadj, noshto, sub,
	forceadj)
	if type(text) == "table" then -- called directly from a template
		local args = text.args
		jo_accent = ine(args.jo_accent)
		noadj = ine(args.noadj)
		noshto = ine(args.noshto)
		sub = ine(args.sub)
		text = args[1]
	end

	if sub then
		for _, pair in ipairs(rsplit(sub, ",")) do
			local parts = rsplit(pair, "/")
			text = rsub(text, parts[1], parts[2])
		end
	end

	return export.tr(text, nil, nil, jo_accent, noadj, noshto, forceadj)
end

-- Transliterate adjectives and pronouns: что-type special-casing is turned
-- off and adjective special-casing is forced.
-- When called directly from a template, `text` is a table; jo_accent and the
-- text itself (first positional argument) are then read from its `args`.
function export.tr_adj(text, jo_accent)
	if type(text) == "table" then -- called directly from a template
		local args = text.args
		jo_accent = ine(args.jo_accent)
		text = args[1]
	end

	-- "forceadj" is required because tr_adj() is typically called from the
	-- noun or adjective modules with the bare suffix ого, which would
	-- otherwise hit the exception list and be transliterated as ogo.
	local noadj, noshto, forceadj = false, "noshto", "forceadj"
	return export.tr(text, nil, nil, jo_accent, noadj, noshto, forceadj)
end

return export