Module:tt-translit

From Linguifex
Jump to navigation Jump to search

This module will transliterate Tatar language text. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:tt-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

local export = {}

local rsubn = mw.ustring.gsub

-- Single-result wrapper around rsubn(): hands back only the substituted
-- string and drops the replacement count that rsubn() also returns.
local function rsub(term, pattern, repl)
	-- the extra parentheses truncate the call to its first return value
	return (rsubn(term, pattern, repl))
end

-- Keep applying rsub() with the same pattern and replacement until a pass
-- leaves the string untouched, then return that stable string.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

-- Cyrillic-to-Latin letter table. export.tr applies it one character at a
-- time as its last step (rsub(text, '.', tt)); a character with no entry here
-- is left unchanged by that substitution.
local tt = {
	['б']='b',['Б']='B', ['в']='w',['В']='W', ['г']='g',['Г']='G', ['д']='d',['Д']='D',
	['з']='z',['З']='Z', ['й']='y',['Й']='Y', ['к']='k',['К']='K', ['л']='l',['Л']='L',
	['м']='m',['М']='M', ['н']='n',['Н']='N', ['п']='p',['П']='P', ['р']='r',['Р']='R',
	['с']='s',['С']='S', ['т']='t',['Т']='T', ['ф']='f',['Ф']='F', ['х']='x',['Х']='X',
	['ч']='ç',['Ч']='Ç', ['ш']='ş',['Ш']='Ş',
	['җ']='c',['Җ']='C', ['ң']='ñ',['Ң']='Ñ', ['һ']='h',['Һ']='H',
	['ж']='j',['Ж']='J', ['ц']='ts',['Ц']='Ts', ['щ']='şç',['Щ']='Şç', ['ё']='yo',['Ё']='Yo',
	['а']='a',['А']='A', ['ы']='ı',['Ы']='I', ['о']='o',['О']='O', ['у']='u',['У']='U',
	['ә']='ä',['Ә']='Ä', ['э']='e',['Э']='E', ['и']='i',['И']='İ', ['ө']='ö',['Ө']='Ö', ['ү']='ü',['Ү']='Ü',
	-- я/е/ю deliberately have no live entries: export.tr rewrites every one of
	-- them (to э after a consonant, otherwise to й + vowel) before this table
	-- is applied, so the direct mappings below are kept disabled.
	--['я']='ya',['Я']='Ya', ['е']='ye',['Е']='Ye', ['ю']='yu',['Ю']='Yu',
	--['е']='e',['Е']='E',
	['ь']='ʹ',['Ь']='ʹ', ['ъ']='ʺ',['Ъ']='ʺ',
	-- ҡ/ғ and ӹ are produced inside export.tr as intermediate symbols for
	-- q/ğ (via consonants_soft2hard) and for the contracted ый.
	['ҡ']='q',['Ҡ']='Q', ['ғ']='ğ',['Ғ']='Ğ', ['Ӹ']='Iy',['ӹ']='ıy',
}

-- Letter classes. Each string is interpolated into a [...] character class
-- when export.tr builds its patterns, so the order of letters is irrelevant.
-- ҡ/ғ and ӹ are the intermediate symbols for q/ğ and for ый.
local consonants = 'БВГДЗЙКЛМНПРСТФХЧШҖҢҺбвгдзйклмнпрстфхчшҗңһЖЦЩжцщҠҒҡғ'
local vowels_hard = 'АЫӸОУаыӹоу'
local vowels_soft = 'ӘЭИӨҮәэиөү'
local vowels_iotated = 'ЯЕЮяею' -- ё is only in loans

-- к/г -> the intermediate letters that tt renders as q/ğ.
local consonants_soft2hard = {['К']='Ҡ', ['Г']='Ғ', ['к']='ҡ', ['г']='ғ'}
-- Hard vowel -> its soft counterpart; keyed by single letters only.
local vowels_hard2soft = {['А']='Ә', ['Ы']='Э', ['Ӹ']='И', ['О']='Ө', ['У']='Ү', ['а']='ә', ['ы']='э', ['ӹ']='и', ['о']='ө', ['у']='ү'}

-- Expansion of an iotated vowel into й + plain vowel, in the hard and soft
-- variants; export.tr picks the variant from the surrounding letters.
local vowels_iotated_expanded_hard = {['Я']='Йа', ['Е']='Йы', ['Ю']='Йу', ['я']='йа', ['е']='йы', ['ю']='йу'}
local vowels_iotated_expanded_soft = {['Я']='Йә', ['Е']='Йэ', ['Ю']='Йү', ['я']='йә', ['е']='йэ', ['ю']='йү'}

-- Arabic-script-to-Cyrillic table used by the tt-Arab branch of export.tr.
-- The output is Cyrillic, not Latin: the branch post-processes it and then
-- romanizes it with the tt table. Digits and punctuation map straight to
-- their ASCII forms.
local tt_Arab_New = {
	-- [[s:mul:Рус мәктәпләре өчен татар теле дәреслеге/13]]
	-- XXX: need to investigate the most appropriate Unicode codepoints to use for tt-Arab
	['ا']='а', ['ە']='ә',
	['ب']='б', ['پ']='п', ['ت']='т',
	['ج']='җ', ['چ']='ч', ['ح']='х',
	['د']='д',
	['ر']='р', ['ز']='з', ['ژ']='ж',
	['س']='с', ['ش']='ш',
	['ع']='ғ',
	['ف']='ф', ['ق']='ҡ', ['ک']='к', ['گ']='г', ['ڭ']='ң',
	['ل']='л',
	['م']='м',
	['ن']='н',
	['ۇ']='ө', ['و']='ү', ['ۋ']='в',
	['ه']='һ',
	-- the first key below is a two-code-point sequence; export.tr substitutes
	-- it in a separate pass before the per-character pass
	['ىُ']='э', ['ی']='и',
	['ث']='с', ['خ']='х', ['ذ']='з', ['ص']='с', ['ض']='з', ['ط']='т', ['ظ']='з', ['غ']='ғ',

	-- these two become ь/ъ; export.tr strips them again when they are
	-- word-initial (ъ after it has triggered the hard-vowel rewrite)
	['ئ']='ь',
	['ࢭ']='ъ',

	['۱']='1', ['۲']='2', ['۳']='3', ['۴']='4', ['۵']='5',
	['۶']='6', ['۷']='7', ['۸']='8', ['۹']='9', ['۰']='0',
	['١']='1', ['٢']='2', ['٣']='3', ['٤']='4', ['٥']='5',
	['٦']='6', ['٧']='7', ['٨']='8', ['٩']='9', ['٠']='0',

	['،']=',', ['؟']='?',
}

-- Soft vowel -> hard vowel, used by the tt-Arab branch of export.tr for words
-- it treats as hard (those containing ъ, а, ҡ or ғ).
-- ә/а are excluded: yaña imlâ (the "new orthography") has separate letters for them.
--local vowels_soft2hard = {['э']='ы', ['и']='ӹ', ['ө']='о', ['ү']='у'}
-- XXX: keep и as it is for now (identity mapping). It is less unsightly and
-- more common than ый? Can и vs. ый even be predicted accurately?
local vowels_soft2hard = {['э']='ы', ['и']='и', ['ө']='о', ['ү']='у'}

-- Maps every letter of a run of hard vowels to its soft counterpart.
-- vowels_hard2soft is keyed by single letters, while the patterns that feed
-- this helper can capture several vowels at once; looking the whole run up in
-- the table would yield nil and make the caller's concatenation fail.
local function soften_vowels(vowel_run)
	return rsub(vowel_run, '.', vowels_hard2soft)
end

-- Transliterates Arabic-script Tatar text.
-- Steps: normalize a few spellings, map to Cyrillic with tt_Arab_New, repair
-- glides, apply hard-vowel rewriting word by word, then romanize with tt.
local function tr_arabic(text)
	local zwnj = mw.ustring.char(0x200C) -- ZERO WIDTH NON-JOINER
	local shadda = mw.ustring.char(0x0651) -- SHADDA

	-- Yaña imlâ ("new orthography").
	-- Automatic insertion of э/ы would be nice, but maybe it is not needed,
	-- since Yañalif also omits them (this keeps the two visibly continuous).
	-- Insertion would also wreak havoc on iske imlâ ("old orthography").

	-- quick fixes for iske imlâ?
	-- word-initial vowel spellings, first at the start of the text ...
	text = rsub(text, '^او', 'ئو')
	text = rsub(text, '^ای', 'ئی')
	text = rsub(text, '^آ', 'ئا')
	text = rsub(text, '^ا', 'ئە')
	-- ... then after any punctuation or whitespace character
	text = rsub(text, '([%p%s])او', '%1ئو')
	text = rsub(text, '([%p%s])ای', '%1ئی')
	text = rsub(text, '([%p%s])آ', '%1ئا')
	text = rsub(text, '([%p%s])ا', '%1ئە')
	-- ه at the end of the text, or before punctuation/whitespace/ZWNJ, is
	-- rewritten to ە
	text = rsub(text, 'ه$', 'ە')
	text = rsub(text, 'ه([%p%s' .. zwnj .. '])', 'ە%1')
	text = rsub(text, zwnj, '')
	-- a shadda doubles the character it follows
	text = rsub(text, '(.)' .. shadda, '%1%1')

	text = rsub(text, 'ىُ', tt_Arab_New) -- `э/ы` is not atomic in Unicode
	text = rsub(text, '.', tt_Arab_New)

	-- и/ү next to another vowel are rewritten as the consonants й/в
	text = rsub(text, 'ии([әэөаү])', 'ий%1')
	text = rsub(text, 'и([әэөаү])', 'й%1')
	text = rsub(text, '([әэөаүи])и', '%1й')
	text = rsub(text, 'ү([әэөаи])', 'в%1')
	text = rsub(text, '([әэөаиү])ү', '%1в')

	-- patterns for the per-word pass, built once instead of once per word
	local initial_i = mw.ustring.format('^(ъ?)и([%s])', consonants)
	local initial_soft_sign = mw.ustring.format('^(ь)([%s])', consonants)
	local any_soft_vowel = mw.ustring.format('([%s])', vowels_soft)

	-- a "word" is a maximal run without punctuation or whitespace
	text = rsub(text,
		'([^%p%s]+)',
		function(word)
			word = rsub(word, initial_i, '%1й%2')
			word = rsub(word, initial_soft_sign, '%1э%2')

			-- a word containing ъ, а, ҡ or ғ gets hard vowels throughout
			if mw.ustring.match(word, '[ъаҡғ]') then
				word = rsub(word, any_soft_vowel, vowels_soft2hard)
			end
			-- the word-initial signs are dropped once they have been used
			word = rsub(word, '^ъ', '')
			word = rsub(word, '^ь', '')

			return word
		end
	)
	return rsub(text, '.', tt)
end

-- Transliterates Tatar text into the Latin alphabet.
--   text: the string to transliterate.
--   lang: language code; accepted for interface compatibility and not read.
--   sc:   script code; 'tt-Arab' selects the Arabic-script path, any other
--         value is handled as Cyrillic.
-- Returns the transliterated string.
function export.tr(text, lang, sc)
	if sc == 'tt-Arab' then
		return tr_arabic(text)
	end

	-- normalize pure vocalic e: after a consonant, е is the plain vowel э
	local plain_e = {['Е']='Э', ['е']='э'}
	text = rsub(text,
		mw.ustring.format('([%s])([Ее])', consonants),
		function(consonant, e)
			return consonant .. plain_e[e]
		end
	)

	-- simplify handling ый by contracting it to the single symbol ӹ
	text = rsub(text, 'Ы[Йй]', 'Ӹ')
	text = rsub(text, 'ый', 'ӹ')

	-- Russian loan sounds
	-- XXX: an idea: identify Russian loans by adding an accent mark?
	--text = rsub(text, 'ия', 'ийә')

	-- process iotated soft vowels
	-- (1) the next vowel, after any consonants, is soft
	text = rsub(text,
		mw.ustring.format('([%s])([%s]*[%s])', vowels_iotated, consonants, vowels_soft),
		function(vowel_iotated, following)
			return vowels_iotated_expanded_soft[vowel_iotated] .. following
		end
	)
	-- (2) a soft sign follows, after any consonants; the soft sign itself is
	-- dropped from the output
	text = rsub(text,
		mw.ustring.format('([%s])([%s]*)([Ьь])', vowels_iotated, consonants),
		function(vowel_iotated, following)
			return vowels_iotated_expanded_soft[vowel_iotated] .. following
		end
	)
	-- (3) directly after a soft vowel; repeated because an expansion ends in
	-- a soft vowel that may itself precede another iotated vowel
	text = rsub_repeatedly(text,
		mw.ustring.format('([%s])([%s])', vowels_soft, vowels_iotated),
		function(preceding, vowel_iotated)
			return preceding .. vowels_iotated_expanded_soft[vowel_iotated]
		end
	)
	-- process iotated hard vowels: everything still left is expanded as hard
	text = rsub(text,
		mw.ustring.format('([%s])', vowels_iotated),
		function(vowel_iotated)
			return vowels_iotated_expanded_hard[vowel_iotated]
		end
	)
	-- verbal noun + 3rd person possessive
	text = rsub(text, 'үйэ', 'үвэ')

	-- q/ğ is indicated by using a hard vowel, even in soft vowel words
	text = rsub(text,
		mw.ustring.format('([КГкг])([%s]+)([%s])([Ъъ])', vowels_hard, consonants),
		function(kg, vowel_run, following)
			-- XXX: presumably this is what ъ means here
			return consonants_soft2hard[kg] .. soften_vowels(vowel_run) .. following .. 'ь'
		end
	)
	text = rsub(text,
		mw.ustring.format('([КГкг])([%s]+)([%s]+[%s])', vowels_hard, consonants, vowels_soft),
		function(kg, vowel_run, following)
			return consonants_soft2hard[kg] .. soften_vowels(vowel_run) .. following
		end
	)
	text = rsub(text,
		mw.ustring.format('([КГкг])([%s]+)([%s])([Ьь])', vowels_hard, consonants),
		function(kg, vowel_run, following)
			return consonants_soft2hard[kg] .. soften_vowels(vowel_run) .. following
		end
	)
	-- к/г adjacent to a hard vowel is q/ğ. This is done in two passes because
	-- substitution matches never overlap: a single pattern with a vowel on
	-- both sides consumes the vowel after one к/г, so that vowel can no longer
	-- count as the vowel before the next к/г (the final к of хокук was missed).
	text = rsub(text,
		mw.ustring.format('([КГкг])([%s])', vowels_hard),
		function(kg, following)
			return consonants_soft2hard[kg] .. following
		end
	)
	text = rsub(text,
		mw.ustring.format('([%s])([КГкг])', vowels_hard),
		function(preceding, kg)
			return preceding .. consonants_soft2hard[kg]
		end
	)
	-- к/г followed by a hard sign is q/ğ as well; the hard sign is dropped
	text = rsub(text, '([КГкг])([Ъъ])', function(kg) return consonants_soft2hard[kg] end)

	-- excrescent y/w after i/u
	text = rsub_repeatedly(text, '([Ии])([' .. vowels_hard .. vowels_soft .. '])', '%1й%2')
	text = rsub_repeatedly(text, '([УҮуү])([' .. vowels_hard .. vowels_soft .. '])', '%1в%2')

	-- semivocalic w after vowels
	text = rsub(text, '([' .. vowels_hard .. vowels_soft .. '])[УҮуү]', '%1в')

	-- glottal stop after vowels
	text = rsub(text, '([' .. vowels_hard .. vowels_soft .. '])[Ээ]', '%1ь')

	text = rsub(text, '.', tt)
	return text
end

return export