Module:mr-Modi-translit

From Linguifex
Jump to navigation Jump to search

This module transliterates Marathi-language text written in the Modi script. It is also used to transliterate Varhadi. Preferably, do not call the module directly from templates or other modules: from a template, use {{xlit}}; within a module, use Module:languages#Language:transliterate.

For testcases, see Module:mr-Modi-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- Table of functions this module returns to its callers.
local export = {}

-- Helper loaded from Module:string/char; used below as u(0x200D)
-- (presumably returns the character for a code point -- confirm in that module).
local u = require("Module:string/char")
-- Short aliases for the Unicode-aware string functions of the mw.ustring library.
local gsub = mw.ustring.gsub
local find = mw.ustring.find

-- U+200D (zero width joiner); used in `conv` as part of the key built from ra + virama.
local ZWJ = u(0x200D)

-- Modi-to-Latin lookup table used by export.tr.
-- Most keys are a single code point; the few keys that span several code
-- points can only match when the lookup is given the whole sequence.
local conv = {
	-- consonants, one series per row
	['𑘎'] = 'k', ['𑘏'] = 'kh', ['𑘐'] = 'g', ['𑘑'] = 'gh', ['𑘒'] = 'ṅ', -- k-series
	['𑘓'] = 'c', ['𑘔'] = 'ch', ['𑘕'] = 'j', ['𑘖'] = 'jh', ['𑘗'] = 'ñ', -- c-series
	['𑘘'] = 'ṭ', ['𑘙'] = 'ṭh', ['𑘚'] = 'ḍ', ['𑘛'] = 'ḍh', ['𑘜'] = 'ṇ', -- ṭ-series
	['𑘝'] = 't', ['𑘞'] = 'th', ['𑘟'] = 'd', ['𑘠'] = 'dh', ['𑘡'] = 'n', -- t-series
	['𑘢'] = 'p', ['𑘣'] = 'ph', ['𑘤'] = 'b', ['𑘥'] = 'bh', ['𑘦'] = 'm', -- p-series
	['𑘧'] = 'y', ['𑘨'] = 'r', ['𑘩'] = 'l', ['𑘪'] = 'v', ['𑘯'] = 'ḷ',
	['𑘫'] = 'ś', ['𑘬'] = 'ṣ', ['𑘭'] = 's', ['𑘮'] = 'h',
	-- ra + virama + zero width joiner
	['𑘨𑘿' .. ZWJ] = 'r',
	-- disabled: ['𑘕𑘿𑘗'] = 'dny',

	-- dependent vowel signs (written after a consonant)
	-- disabled, only found in script charts: ['𑘱'] = 'i', ['𑘴'] = 'ū',
	['𑘰'] = 'ā',
	['𑘲'] = 'ī',
	['𑘳'] = 'u',
	['𑘵'] = 'ru',
	['𑘹'] = 'e',
	['𑘺'] = 'ai',
	['𑘻'] = 'o',
	['𑘼'] = 'au',
	['𑙀'] = 'ĕ',
	-- disabled: ['𑘰𑙀'] = 'ŏ',

	-- independent vowel letters
	-- disabled, only found in script charts: ['𑘂'] = 'i', ['𑘅'] = 'ū',
	['𑘀'] = 'a',
	['𑘁'] = 'ā',
	['𑘃'] = 'ī',
	['𑘄'] = 'u',
	['𑘆'] = 'ŕ',
	['𑘊'] = 'e',
	['𑘋'] = 'ai',
	['𑘌'] = 'o',
	['𑘍'] = 'au',
	-- independent vowel letter followed by the sign 𑙀
	['𑘀𑙀'] = 'ĕ',
	['𑘊𑙀'] = 'ĕ',
	['𑘁𑙀'] = 'ŏ',

	-- o + ma + virama
	['𑘌𑘦𑘿'] = 'om',

	-- chandrabindu (disabled): ['𑙀𑘽'] = '̃',

	['𑘽'] = 'ṁ', -- anusvara
	['𑘾'] = 'ḥ', -- visarga
	['𑘿'] = '', -- virama: produces no output

	-- digits
	['𑙐'] = '0', ['𑙑'] = '1', ['𑙒'] = '2', ['𑙓'] = '3', ['𑙔'] = '4',
	['𑙕'] = '5', ['𑙖'] = '6', ['𑙗'] = '7', ['𑙘'] = '8', ['𑙙'] = '9',

	-- punctuation
	['𑙁'] = '.', -- danda
	['𑙂'] = '.', -- double danda
	['𑙃'] = '.', -- abbreviation sign
	['+'] = '', -- compound separator: dropped from the output
}

-- Replacement for an anusvara, keyed by the consonant that follows it in
-- reading order (export.tr looks the key up while resolving each anusvara).
-- Values are left in Modi script, except the Latin 'i', and are romanized
-- later together with the rest of the text.
local nasal_assim = {}
for _, group in ipairs({
	{ '𑘒', { '𑘎', '𑘏', '𑘐', '𑘑' } },
	{ '𑘗', { '𑘓', '𑘔', '𑘕', '𑘖' } },
	{ '𑘜', { '𑘘', '𑘙', '𑘚', '𑘛' } },
	{ '𑘦', { '𑘢', '𑘣', '𑘤', '𑘥', '𑘦' } },
	{ 'i', { '𑘧' } },
	{ '𑘩', { '𑘩' } },
	{ '𑘄', { '𑘨', '𑘪', '𑘫', '𑘬', '𑘭', '𑘮' } },
}) do
	local replacement, following = group[1], group[2]
	for _, consonant in ipairs(following) do
		nasal_assim[consonant] = replacement
	end
end

-- Set of three-character clusters (consonant + virama + 𑘩).
-- NOTE(review): nothing in the visible source reads this table; the name
-- suggests "permitted clusters" -- confirm before relying on it.
local perm_cl = {}
for _, cluster in ipairs({ '𑘦𑘿𑘩', '𑘪𑘿𑘩', '𑘡𑘿𑘩' }) do
	perm_cl[cluster] = true
end

-- Character-class bodies, spliced between [ ] when patterns are built.
-- all_cons: every consonant letter.
-- special_cons: a subset of consonants; not referenced in the visible source.
local all_cons, special_cons = '𑘎𑘏𑘐𑘑𑘒𑘓𑘔𑘕𑘖𑘗𑘘𑘙𑘚𑘛𑘝𑘞𑘟𑘠𑘢𑘣𑘤𑘥𑘫𑘬𑘭𑘧𑘨𑘩𑘪𑘮𑘜𑘡𑘦𑘯', '𑘟𑘝𑘧𑘨𑘩𑘪𑘮𑘡𑘦'
-- vowel: a literal '*', the Latin 'a' that export.tr inserts as the inherent
-- vowel, and the dependent vowel signs.
-- vowel_sign: the independent vowel letters (despite the name); its trailing
-- 𑘀 and 𑙀 repeat characters that are already covered, which is harmless
-- inside a character class.
local vowel, vowel_sign = '%*a𑘱𑘳𑘵𑘹𑘻𑘰𑘲𑘴𑘺𑘼𑙀', '𑘀𑘂𑘄𑘊𑘌𑘁𑘃𑘅𑘆𑘋𑘍𑘀𑙀'
-- Applied to a REVERSED word: vowel, consonant, inherent 'a', consonant, then
-- an optional anusvara and a vowel. export.tr removes the 'a' (captures 1-4
-- are kept). The optional anusvara must be the Modi one (𑘽): words handed to
-- this pattern contain only Modi characters and 'a', so the Devanagari
-- anusvara used here before could never match.
local syncope_pattern = '([' .. vowel .. vowel_sign .. '])([' .. all_cons .. '])a([' .. all_cons .. '])([𑘽]?[' .. vowel .. vowel_sign .. '])'

--- Reverses a string character by character rather than byte by byte.
-- @param text string to reverse
-- @return the characters of `text` in reverse order, joined into one string
local function rev_string(text)
	local pieces = {}
	-- One piece per UTF-8 character: a lead byte (NUL via %z, ASCII, or
	-- 0xC2-0xF4) followed by any continuation bytes (0x80-0xBF).
	for piece in string.gmatch(text, "[%z\1-\127\194-\244][\128-\191]*") do
		pieces[#pieces + 1] = piece
	end
	local reversed = require("Module:table").reverse(pieces)
	return table.concat(reversed)
end

-- Class body matching every vowel character: the inserted inherent 'a', the
-- dependent vowel signs and the independent vowel letters.
local any_vowel = vowel .. vowel_sign

-- On a reversed word: a leading (that is, word-final) inherent 'a' whose
-- consonant is in turn preceded by a vowel.
local final_schwa_pattern = '^a([' .. all_cons .. '][' .. any_vowel .. '])'

-- Keys of `conv` that span more than one code point, longest first.
-- Filled in on first use by get_sequence_keys().
local sequence_keys

--- Escapes Lua pattern magic characters so that `s` matches literally.
local function escape_pattern(s)
	return (string.gsub(s, '[%^%$%(%)%%%.%[%]%*%+%-%?]', '%%%0'))
end

--- Returns the multi-code-point keys of `conv`, longest first, in a stable order.
local function get_sequence_keys()
	if sequence_keys then
		return sequence_keys
	end
	local keys, length = {}, {}
	for key in pairs(conv) do
		local n = mw.ustring.len(key)
		if n > 1 then
			keys[#keys + 1] = key
			length[key] = n
		end
	end
	-- pairs() order is unspecified; sort so the result does not depend on it.
	table.sort(keys, function(a, b)
		if length[a] ~= length[b] then
			return length[a] > length[b]
		end
		return a < b
	end)
	sequence_keys = keys
	return keys
end

--- Applies schwa deletion and anusvara resolution to a single word.
-- The word is processed reversed, so the patterns work from its end.
local function process_word(word)
	local rev = rev_string(word)
	-- Drop the word-final inherent 'a' after a vowel + consonant.
	rev = gsub(rev, final_schwa_pattern, '%1')
	-- Delete medial inherent 'a's until none of them matches any more.
	while find(rev, syncope_pattern) do
		rev = gsub(rev, syncope_pattern, '%1%2%3%4')
	end
	-- `succ` is the character after the anusvara in reading order (empty only
	-- when the anusvara ends the word); `prev` is the one before it.
	rev = gsub(rev, '(.?)𑘽(.)', function(succ, prev)
		local nasal
		if succ .. prev == "a" then
			-- word-final anusvara after inherent 'a': ma + virama (still reversed)
			nasal = "𑘿𑘦"
		elseif succ == "" and find(prev, '[' .. vowel .. ']') then
			-- word-final anusvara after another vowel: combining tilde
			nasal = "̃"
		else
			nasal = nasal_assim[succ] or "n"
		end
		return succ .. nasal .. prev
	end)
	return rev_string(rev)
end

--- Transliterates Modi-script text into Latin script.
-- @param text the text to transliterate
-- @param lang language code (not used by this implementation)
-- @param sc script code (not used by this implementation)
-- @return the transliteration, normalized with mw.ustring.toNFC
function export.tr(text, lang, sc)
	-- text = gsub(text, 'ाँ', 'ॉ' .. 'ं')
	-- text = gsub(text, 'ँ', 'ॅ' .. 'ं')
	-- An anusvara directly after a non-vowel, before a space or at the end of
	-- the text, becomes the independent letter 𑘀.
	text = gsub(text, '([^' .. any_vowel .. '])𑘽 ', '%1𑘀 ')
	text = gsub(text, '([^' .. any_vowel .. '])𑘽$', '%1𑘀')
	-- Give every consonant that carries neither a vowel sign nor a virama an
	-- explicit inherent 'a'.
	text = gsub(text, '([' .. all_cons .. '])([' .. vowel .. '𑘿]?)', function(c, d)
		return c .. (d == "" and 'a' or d)
	end)
	-- Rewrite each word where it stands. Substituting by word content instead
	-- would also alter longer words that merely contain a shorter one.
	text = gsub(text, "[𑘀-𑙙a]+", process_word)
	-- Multi-code-point keys of `conv` first: the '.' pass below only ever looks
	-- up one code point, so these sequences would otherwise never match.
	for _, seq in ipairs(get_sequence_keys()) do
		text = gsub(text, escape_pattern(seq), conv)
	end
	text = gsub(text, '.', conv)
	text = gsub(text, 'a([iu])̃', 'a͠%1')
	text = gsub(text, 'aa', 'a')
	-- The longer cluster must be rewritten before the shorter one it contains.
	text = gsub(text, 'ñjñ', 'ndny')
	text = gsub(text, 'jñ', 'dny')
	return mw.ustring.toNFC(text)
end

return export