Module:tt-translit

From Linguifex
Revision as of 12:44, 21 April 2026 by Sware (talk | contribs) (1 revision imported)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

Documentation for this module may be created at Module:tt-translit/doc

local export = {}

local rsubn = mw.ustring.gsub

-- Single-result wrapper around rsubn(): returns the substituted string only.
-- The extra parentheses truncate rsubn()'s results to the first one, so the
-- substitution count never leaks into a caller's argument list.
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end

-- Run the same rsub() substitution over and over until a pass leaves the
-- string unchanged, then return that stable string.
local function rsub_repeatedly(term, foo, bar)
	local before
	repeat
		before = term
		term = rsub(before, foo, bar)
	until term == before
	return term
end

-- Cyrillic-to-Latin lookup table. export.tr applies it one character at a
-- time as its final step (rsub(text, '.', tt)); a character with no entry
-- here is left as it is by that substitution.
local tt = {
	['б']='b',['Б']='B', ['в']='w',['В']='W', ['г']='g',['Г']='G', ['д']='d',['Д']='D',
	['з']='z',['З']='Z', ['й']='y',['Й']='Y', ['к']='k',['К']='K', ['л']='l',['Л']='L',
	['м']='m',['М']='M', ['н']='n',['Н']='N', ['п']='p',['П']='P', ['р']='r',['Р']='R',
	['с']='s',['С']='S', ['т']='t',['Т']='T', ['ф']='f',['Ф']='F', ['х']='x',['Х']='X',
	['ч']='ç',['Ч']='Ç', ['ш']='ş',['Ш']='Ş',
	['җ']='c',['Җ']='C', ['ң']='ñ',['Ң']='Ñ', ['һ']='h',['Һ']='H',
	['ж']='j',['Ж']='J', ['ц']='ts',['Ц']='Ts', ['щ']='şç',['Щ']='Şç', ['ё']='yo',['Ё']='Yo',
	['а']='a',['А']='A', ['ы']='ı',['Ы']='I', ['о']='o',['О']='O', ['у']='u',['У']='U',
	['ә']='ä',['Ә']='Ä', ['э']='e',['Э']='E', ['и']='i',['И']='İ', ['ө']='ö',['Ө']='Ö', ['ү']='ü',['Ү']='Ü',
	-- я/е/ю deliberately have no live entries: the Cyrillic branch of export.tr
	-- rewrites them as й + plain vowel before this table is consulted.
	--['я']='ya',['Я']='Ya', ['е']='ye',['Е']='Ye', ['ю']='yu',['Ю']='Yu',
	--['е']='e',['Е']='E',
	['ь']='ʹ',['Ь']='ʹ', ['ъ']='ʺ',['Ъ']='ʺ',
	-- Intermediate symbols: ҡ/ғ are produced by export.tr (via consonants_soft2hard)
	-- and by tt_Arab_New; Ӹ/ӹ are what export.tr substitutes for the digraph ый.
	['ҡ']='q',['Ҡ']='Q', ['ғ']='ğ',['Ғ']='Ğ', ['Ӹ']='Iy',['ӹ']='ıy',
}

-- Character inventories. export.tr splices these strings into [...] character
-- classes when it builds its patterns.
local consonants = 'БВГДЗЙКЛМНПРСТФХЧШҖҢҺбвгдзйклмнпрстфхчшҗңһЖЦЩжцщҠҒҡғ'
-- Ӹ/ӹ are the single-character stand-ins that export.tr substitutes for ый.
local vowels_hard = 'АЫӸОУаыӹоу'
-- Listed in the same order as vowels_hard (see vowels_hard2soft below).
local vowels_soft = 'ӘЭИӨҮәэиөү'
local vowels_iotated = 'ЯЕЮяею' -- ё is only in loans

-- к/г -> the intermediate letters ҡ/ғ, which the tt table renders as q/ğ.
local consonants_soft2hard = {['К']='Ҡ', ['Г']='Ғ', ['к']='ҡ', ['г']='ғ'}
-- Each hard vowel -> its soft counterpart; covers every character of vowels_hard.
local vowels_hard2soft = {['А']='Ә', ['Ы']='Э', ['Ӹ']='И', ['О']='Ө', ['У']='Ү', ['а']='ә', ['ы']='э', ['ӹ']='и', ['о']='ө', ['у']='ү'}

-- Expansion of an iotated vowel into й + plain vowel. export.tr uses the soft
-- table where its soft-context rules match and the hard table for whatever
-- iotated vowels remain afterwards.
local vowels_iotated_expanded_hard = {['Я']='Йа', ['Е']='Йы', ['Ю']='Йу', ['я']='йа', ['е']='йы', ['ю']='йу'}
local vowels_iotated_expanded_soft = {['Я']='Йә', ['Е']='Йэ', ['Ю']='Йү', ['я']='йә', ['е']='йэ', ['ю']='йү'}

-- Arabic-script -> Cyrillic lookup table used by the 'tt-Arab' branch of
-- export.tr. That branch first substitutes the two-code-point key 'ىُ'
-- explicitly and then maps every remaining character individually; the
-- resulting Cyrillic text is later run through the tt table.
local tt_Arab_New = {
	-- [[s:mul:Рус мәктәпләре өчен татар теле дәреслеге/13]]
	-- XXX: need to investigate the most appropriate Unicode codepoints to use for tt-Arab
	['ا']='а', ['ە']='ә',
	['ب']='б', ['پ']='п', ['ت']='т',
	['ج']='җ', ['چ']='ч', ['ح']='х',
	['د']='д',
	['ر']='р', ['ز']='з', ['ژ']='ж',
	['س']='с', ['ش']='ш',
	['ع']='ғ',
	['ف']='ф', ['ق']='ҡ', ['ک']='к', ['گ']='г', ['ڭ']='ң',
	['ل']='л',
	['م']='м',
	['ن']='н',
	['ۇ']='ө', ['و']='ү', ['ۋ']='в',
	['ه']='һ',
	-- The first key on the next line is a base letter plus a combining mark
	-- (two code points), so a single-character '.' pattern can never match it.
	['ىُ']='э', ['ی']='и',
	-- Several Arabic letters collapse onto one Cyrillic letter.
	['ث']='с', ['خ']='х', ['ذ']='з', ['ص']='с', ['ض']='з', ['ط']='т', ['ظ']='з', ['غ']='ғ',

	-- Mapped to ь/ъ; export.tr inspects these at the start of a word and then strips them there.
	['ئ']='ь',
	['ࢭ']='ъ',

	-- Two sets of digit forms, both mapped to ASCII digits.
	['۱']='1', ['۲']='2', ['۳']='3', ['۴']='4', ['۵']='5',
	['۶']='6', ['۷']='7', ['۸']='8', ['۹']='9', ['۰']='0',
	['١']='1', ['٢']='2', ['٣']='3', ['٤']='4', ['٥']='5',
	['٦']='6', ['٧']='7', ['٨']='8', ['٩']='9', ['٠']='0',

	-- Punctuation.
	['،']=',', ['؟']='?',
}

-- Soft -> hard vowel mapping for the 'tt-Arab' branch of export.tr, applied to
-- a word when that word contains one of ъ, а, ҡ, ғ.
-- Excludes ә/а: the Yaña imlä (reformed Arabic) orthography has separate letters for them.
--local vowels_soft2hard = {['э']='ы', ['и']='ӹ', ['ө']='о', ['ү']='у'}
-- XXX: keep и for now. less unsightly and more common than ый? can и vs. ый even be predicted accurately?
-- (и therefore maps to itself in the live table below.)
local vowels_soft2hard = {['э']='ы', ['и']='и', ['ө']='о', ['ү']='у'}

-- Convert every vowel in a run of hard vowels to its soft counterpart.
-- The q/ğ rules in export.tr capture the vowel run with `+`, so the capture
-- may hold more than one character. Indexing vowels_hard2soft with the whole
-- run would then give nil and the string concatenation would raise an error,
-- so the run is mapped one character at a time instead.
local function soften_vowels(vowel_run)
	return rsub(vowel_run, '.', vowels_hard2soft)
end

-- Transliterate Tatar text into Latin script.
--
-- Parameters:
--   text  string to transliterate
--   lang  language code; accepted but not used by this function
--   sc    script code; 'tt-Arab' selects the Arabic-script branch, any other
--         value (including nil) sends the text through the Cyrillic rules
-- Returns the transliterated string.
function export.tr(text, lang, sc)
	if sc == 'tt-Arab' then
		-- Yaña imlä (reformed Arabic orthography).
		-- automatic insertion of э/ы would be Cool
		-- but maybe we don't have to worry about that
		-- since Yañalif (the Latin alphabet) also omits them.
		-- visualize the continuity between the two.
		-- also, insertion would wreak havoc on İske imlä (the old orthography)

		local zwnj = mw.ustring.char(0x200C) -- ZERO WIDTH NON-JOINER
		local shadda = mw.ustring.char(0x0651) -- SHADDA

		-- quick fixes for İske imlä?
		-- Rewrite alef-initial sequences, first at the very start of the
		-- text and then after any punctuation or whitespace character.
		text = rsub(text, '^او', 'ئو')
		text = rsub(text, '^ای', 'ئی')
		text = rsub(text, '^آ', 'ئا')
		text = rsub(text, '^ا', 'ئە')
		text = rsub(text, '([%p%s])او', '%1ئو')
		text = rsub(text, '([%p%s])ای', '%1ئی')
		text = rsub(text, '([%p%s])آ', '%1ئا')
		text = rsub(text, '([%p%s])ا', '%1ئە')
		-- A heh at the end of the text, or before punctuation, whitespace or
		-- ZWNJ, is replaced by the letter that tt_Arab_New maps to ә.
		text = rsub(text, 'ه$', 'ە')
		text = rsub(text, 'ه([%p%s' .. zwnj .. '])', 'ە%1')
		text = rsub(text, zwnj, '')
		-- Shadda doubles the character it follows.
		text = rsub(text, '(.)' .. shadda, '%1%1')

		-- Arabic -> Cyrillic. The two-code-point key has to go first because
		-- the per-character pass below could never match it.
		text = rsub(text, 'ىُ', tt_Arab_New) -- `э/ы` is not atomic in Unicode
		text = rsub(text, '.', tt_Arab_New)

		-- и/ү next to another vowel are rewritten as the consonants й/в.
		text = rsub(text, 'ии([әэөаү])', 'ий%1')
		text = rsub(text, 'и([әэөаү])', 'й%1')
		text = rsub(text, '([әэөаүи])и', '%1й')
		text = rsub(text, 'ү([әэөаи])', 'в%1')
		text = rsub(text, '([әэөаиү])ү', '%1в')

		-- Word-level fixes; a "word" is a maximal run without punctuation or whitespace.
		text = rsub(text,
			'([^%p%s]+)',
			function(word)
				word = rsub(word, mw.ustring.format('^(ъ?)и([%s])', consonants), '%1й%2')
				word = rsub(word, mw.ustring.format('^(ь)([%s])', consonants), '%1э%2')

				-- Any of these characters makes the whole word take hard vowels.
				if mw.ustring.match(word, '[ъаҡғ]') then
					word = rsub(word, mw.ustring.format('([%s])', vowels_soft), vowels_soft2hard)
				end
				-- The word-initial markers have done their job; drop them.
				word = rsub(word, '^ъ', '')
				word = rsub(word, '^ь', '')

				return word
			end
		)
		-- Cyrillic -> Latin.
		text = rsub(text, '.', tt)
		return text
	end

	-- normalize pure vocalic e
	local uniotated = {['Е']='Э', ['е']='э'}
	text = rsub(text,
		mw.ustring.format('([%s])([Ее])', consonants),
		function(consonant, e)
			return consonant .. uniotated[e]
		end
	)

	-- simplify handling ый: collapse the digraph into a single placeholder letter
	text = rsub(text, 'Ы[Йй]', 'Ӹ')
	text = rsub(text, 'ый', 'ӹ')

	-- Russian loan sounds
	-- XXX: an idea: identify Russian loans by adding an accent mark?
	--text = rsub(text, 'ия', 'ийә')

	-- process iotated soft vowels
	-- (a) iotated vowel, optional consonants, then a soft vowel
	text = rsub(text,
		mw.ustring.format('([%s])([%s]*[%s])', vowels_iotated, consonants, vowels_soft),
		function(vowel_iotated, following)
			return vowels_iotated_expanded_soft[vowel_iotated] .. following
		end
	)
	-- (b) iotated vowel, optional consonants, then a soft sign; the soft sign
	-- itself is dropped from the output
	text = rsub(text,
		mw.ustring.format('([%s])([%s]*)([Ьь])', vowels_iotated, consonants),
		function(vowel_iotated, following)
			return vowels_iotated_expanded_soft[vowel_iotated] .. following
		end
	)
	-- (c) iotated vowel directly after a soft vowel; repeated because one
	-- expansion ends in a soft vowel that can license the next one
	text = rsub_repeatedly(text,
		mw.ustring.format('([%s])([%s])', vowels_soft, vowels_iotated),
		function(preceding, vowel_iotated)
			return preceding .. vowels_iotated_expanded_soft[vowel_iotated]
		end
	)
	-- process iotated hard vowels: everything still iotated gets the hard expansion
	text = rsub(text,
		mw.ustring.format('([%s])', vowels_iotated),
		vowels_iotated_expanded_hard
	)
	-- verbal noun + 3rd person possessive
	text = rsub(text, 'үйэ', 'үвэ')

	-- q/ğ is indicated by using a hard vowel, even in soft vowel words
	text = rsub(text,
		mw.ustring.format('([КГкг])([%s]+)([%s])([Ъъ])', vowels_hard, consonants),
		function(kg, vowel_run, following)
			-- XXX: presumably this is what ъ means here
			return consonants_soft2hard[kg] .. soften_vowels(vowel_run) .. following .. 'ь'
		end
	)
	text = rsub(text,
		mw.ustring.format('([КГкг])([%s]+)([%s]+[%s])', vowels_hard, consonants, vowels_soft),
		function(kg, vowel_run, following)
			return consonants_soft2hard[kg] .. soften_vowels(vowel_run) .. following
		end
	)
	text = rsub(text,
		mw.ustring.format('([КГкг])([%s]+)([%s])([Ьь])', vowels_hard, consonants),
		function(kg, vowel_run, following)
			return consonants_soft2hard[kg] .. soften_vowels(vowel_run) .. following
		end
	)
	-- Any к/г with a hard vowel on either side becomes ҡ/ғ.
	text = rsub(text,
		mw.ustring.format('([%s]?)([КГкг])([%s]?)', vowels_hard, vowels_hard),
		function(preceding, kg, following)
			local beside_hard_vowel = preceding ~= '' or following ~= ''
			return preceding .. (beside_hard_vowel and consonants_soft2hard[kg] or kg) .. following
		end
	)
	-- к/г followed by a hard sign becomes ҡ/ғ and the hard sign is dropped.
	text = rsub(text, '([КГкг])([Ъъ])', function(kg) return consonants_soft2hard[kg] end)

	local any_vowel = vowels_hard .. vowels_soft

	-- excrescent y/w after i/u
	text = rsub_repeatedly(text, '([Ии])([' .. any_vowel .. '])', '%1й%2')
	text = rsub_repeatedly(text, '([УҮуү])([' .. any_vowel .. '])', '%1в%2')

	-- semivocalic w after vowels
	text = rsub(text, '([' .. any_vowel .. '])[УҮуү]', '%1в')

	-- glottal stop after vowels
	text = rsub(text, '([' .. any_vowel .. '])[Ээ]', '%1ь')

	-- Cyrillic -> Latin.
	text = rsub(text, '.', tt)
	return text
end

return export