Module:bn-translit

From Linguifex
Jump to navigation Jump to search

This module will transliterate Bengali language text. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:bn-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- Transliteration for Bengali
local export = {}

local m_str_utils = require("Module:string utilities")

local u = m_str_utils.char
local gmatch = m_str_utils.gmatch
local gsub = m_str_utils.gsub
local len = m_str_utils.len
local match = m_str_utils.match
local sub = m_str_utils.sub
local toNFC = mw.ustring.toNFC

-- Literal "?" (U+003F), produced with the `char` helper from Module:string utilities.
-- export.tr splices it into patterns that look for a question mark after a word.
local QO = u(0x003F) -- question mark

-- Lookup table from Bengali-script characters to their Latin replacements.
-- export.tr passes this table to gsub as the replacement argument, so every
-- matched character (or character + following ় / ’) that is a key here is
-- replaced by the mapped string; matches that are not keys are left as they are.
local char = {
	-- consonants
	["ক"] = "k",
	["খ"] = "kh",
	["গ"] = "g",
	["ঘ"] = "gh",
	["ঙ"] = "ṅ",
	["চ"] = "c",
	["ছ"] = "ch",
	["জ"] = "j",
	["ঝ"] = "jh",
	["ঞ"] = "ñ",
	["ট"] = "ṭ",
	["ঠ"] = "ṭh",
	["ড"] = "ḍ",
	["ঢ"] = "ḍh",
	["ণ"] = "ṇ",
	["ত"] = "t",
	["থ"] = "th",
	["দ"] = "d",
	["ধ"] = "dh",
	["ন"] = "n",
	["প"] = "p",
	["ফ"] = "ph",
	["ব"] = "b",
	["ভ"] = "bh",
	["ম"] = "m",
	["য"] = "j", -- same Latin output as জ
	["র"] = "r",
	["ল"] = "l",
	["শ"] = "ś",
	["ষ"] = "ṣ",
	["স"] = "s",
	["হ"] = "h",
	["ড়"] = "ṛ",
	["ঢ়"] = "ṛh",
	["য়"] = "ẏ",

	-- vowel diacritics (dependent vowel signs); short and long i/u map to the same letter
	["ি"] = "i",
	["ু"] = "u",
	["ৃ"] = "ri",
	["ে"] = "e",
	["ো"] = "ō",
	["া"] = "a",
	["ী"] = "i",
	["ূ"] = "u",
	["ৈ"] = "ōi",
	["ৌ"] = "ōu",

	-- archaic vowel diacritics
	["ৄ"] = "ri",
	["ৢ"] = "li",
	["ৣ"] = "li",

	-- visarga
	["ঃ"] = "ḥ",

	-- independent vowel letters
	["অ"] = "o",
	["ই"] = "i",
	["উ"] = "u",
	["ঋ"] = "ri",
	["এ"] = "e",
	["ও"] = "ō",
	["আ"] = "a",
	["ঈ"] = "i",
	["ঊ"] = "u",
	["ঐ"] = "ōi",
	["ঔ"] = "ōu",

	-- archaic independent vowel letters
	["ৠ"] = "ri",
	["ঌ"] = "li",
	["ৡ"] = "li",

	-- virama: removed from the output (maps to the empty string)
	["্"] = "",

	-- chandrabindu: becomes a combining tilde on the preceding letter
	["ঁ"] = "̃",

	-- avagraha
	['ঽ'] = '’',

	-- anusvara
	["ং"] = "ṅ",

	-- khanda ta
	["ৎ"] = "t",

	-- numerals
	["০"] = "0",
	["১"] = "1",
	["২"] = "2",
	["৩"] = "3",
	["৪"] = "4",
	["৫"] = "5",
	["৬"] = "6",
	["৭"] = "7",
	["৮"] = "8",
	["৯"] = "9",

	-- punctuation
	["।"] = ".", -- dãri
}

-- Pattern fragments shared by export.tr.
-- `consonant`, `vowel` and `vowel_sign` are character-class bodies (without the
-- surrounding brackets). `vowel` also contains the Latin letter "o", which
-- export.tr inserts after a bare consonant as a placeholder for the inherent vowel.
local consonant, vowel, vowel_sign = "ক-হড়-য়", "oা-ৌ’", "অ-ঔ"
-- `c`: exactly one character from the consonant class.
local c = "[" .. consonant .. "]"
-- `cc`: an optional ় followed by one consonant.
-- NOTE(review): the ় comes *before* the consonant here, which fits the reversed
-- words that syncope_pattern is applied to; export.tr also uses `cc` on
-- unreversed text — confirm that is intended.
local cc = "়?" .. c
-- `v`: one character from either vowel class (the trailing "o" duplicates the one in `vowel`).
local v = "[" .. vowel .. vowel_sign .. "o]"
-- Matches vowel-consonant-vowel-consonant, then a placeholder "o", then
-- consonant-(optional ঁ)-vowel. The text on either side of that "o" is captured,
-- so replacing with "%1%2" drops the "o".
local syncope_pattern = "(" .. v .. cc .. v .. cc .. ")o(" .. cc .. "ঁ?" .. v .. ")"

-- Consonants after which export.tr writes a following হ as "'h" (so that e.g.
-- k + h is not read as the digraph "kh").
local deaspirate = "[কগচজটডতদপব]"

--- Returns `text` with its characters in reverse order.
-- Length and slicing go through the module-level `len` and `sub` helpers
-- (from Module:string utilities), so the unit of reversal is whatever those
-- helpers treat as one character. An empty string yields an empty string.
-- The pieces are collected in a table and joined once, instead of growing the
-- result with `..` on every iteration.
-- @param text string to reverse
-- @return the reversed string
local function rev_string(text)
	local length = len(text)
	local pieces = {}
	for i = 1, length do
		local pos = length - i + 1
		pieces[i] = sub(text, pos, pos)
	end
	return table.concat(pieces)
end

--- Transliterates Bengali-script text into Latin script.
-- The function works in three stages:
--   1. Bengali-side preprocessing: a placeholder "o" is inserted after every
--      consonant that has no explicit vowel/virama, and some of those
--      placeholders are removed again word by word.
--   2. Character-by-character replacement through the `char` table.
--   3. A long, ORDER-DEPENDENT list of Latin-side rewrite rules. Later rules rely
--      on the output of earlier ones, so do not reorder them.
-- Rules written as a `$` / trailing-space pair apply the same rewrite at the end
-- of the text and before a space respectively.
-- @param text     string to transliterate
-- @param lang     language code; not used by this function
-- @param sc       script code; not used by this function
-- @param override optional; when equal to "debug" the result is returned even if
--                 untransliterated Bengali characters remain in it
-- @return the NFC-normalised transliteration, or nil when a character in the
--         range ঁ-৽ is still present (and `override` is not "debug")
function export.tr(text, lang, sc, override)
	-- Insert a virama between a consonant and a following independent ও,
	-- except when that consonant is the first character of the text.
	text = gsub(text, "(" .. c .. ")ও", "%1্ও")
	text = gsub(text, "^(" .. c .. ")্ও", "%1ও")

	-- NOTE: the string literals in the next two patterns contain an invisible
	-- joiner character right after the virama (it appears to be U+200C ZWNJ).
	-- Keep them byte-for-byte.
	text = gsub(text, "(" .. c .. ")্‌(" .. c .. ")$", "%1্%2্")
	text = gsub(text, "(" .. c .. ")্‌(" .. c .. ") ", "%1্%2্ ")

	-- ঞ between two vowels becomes a combining tilde on the first vowel.
	text = gsub(text, "(" .. v .. ")ঞ(" .. v .. ")", "%1̃%2")

	-- Give every consonant (with optional ়) that is not followed by a vowel
	-- sign, ’, ? or virama a placeholder "o" for the inherent vowel.
	text = gsub(text, "(" .. c .. "়?)([" .. vowel .. "’?্]?)", function(a, b)
		return a .. (b == "" and "o" or b)
	end)

	-- Per word: work on the reversed word, drop a word-final placeholder "o"
	-- (word-initial once reversed) and repeatedly apply syncope_pattern, then
	-- substitute the result back into the text.
	for word in gmatch(text, "[ঁ-৽o’]+") do
		local orig_word = word
		word = rev_string(word)
		word = gsub(word, "^o(়?" .. c .. ")(ঁ?" .. v .. ")", "%1%2")
		while match(word, syncope_pattern) do
			word = gsub(word, syncope_pattern, "%1%2")
		end
		text = gsub(text, orig_word, rev_string(word))
	end

	-- Keep consonant + হ distinct from the aspirate digraphs (kh, gh, ...).
	text = gsub(text, "(" .. deaspirate .. ")হ", "%1'h")

	-- Conjunct-final ম / য / ব get their own temporary letters (ṃ, y, v) that
	-- the Latin-side rules below resolve.
	text = gsub(text, "্ম", "ṃ")
	text = gsub(text, "্য", "y")
	text = gsub(text, "্ব", "v")

	text = gsub(text, "িত$", "ito")
	text = gsub(text, "িত ", "ito ")

	text = gsub(text, "ৃত$", "rito")
	text = gsub(text, "ৃত ", "rito ")

	text = gsub(text, "িব$", "ibo")
	text = gsub(text, "িব ", "ibo ")

	-- NOTE: the replacement literals below end in virama + the same invisible
	-- joiner character as above. Keep them byte-for-byte.
	text = gsub(text, "র্চ$", "র্চ্‌")
	text = gsub(text, "র্চ ", "র্চ্‌ ")

	text = gsub(text, "ছিল$", "chilo")
	text = gsub(text, "ছিল ", "chilo ")

	-- Move the placeholder "o" from after ম/ফ to directly after র.
	text = gsub(text, "র([মফ])o", "রo%1")

	text = gsub(text, "(" .. cc .. ")o([অআ])", "%1%2")
	text = gsub(text, "(" .. cc .. ")ও", "%1oō")

	-- Table-driven replacement: first try each character together with a
	-- following ় or ’, then every remaining single character.
	text = gsub(text, ".[়’]?", char)
	text = gsub(text, ".", char)

	-- Latin-side character classes used by the rules below.
	local v_Latn = "[oaiueō]̃?"
	local c_Latn = "[bcdḍghjklmṃnṇprsśṣtṭvẇyẏ]"
	local consonants_no_h = "[bcdgjklmnpsśtṭḍ]"

	-- inherent vowel deletion
	text = gsub(text, "(" .. v_Latn .. ")bo([bdps])(" .. v_Latn .. ")", "%1b%2%3")
	text = gsub(text, "(" .. v_Latn .. ")cho([bpt])(" .. v_Latn .. ")", "%1ch%2%3")
	text = gsub(text, "(" .. v_Latn .. ")do([bp])(" .. v_Latn .. ")", "%1d%2%3")
	text = gsub(text, "(" .. v_Latn .. ")dho([bp])(" .. v_Latn .. ")", "%1dh%2%3")

	text = gsub(text, "(" .. v_Latn .. ")go([bpr])(" .. v_Latn .. ")", "%1g%2%3")

	text = gsub(text, "(" .. v_Latn .. ")jo([bpr])(" .. v_Latn .. ")", "%1j%2%3")
	text = gsub(text, "(" .. v_Latn .. ")ko([bmprsśtṭ])(" .. v_Latn .. ")", "%1k%2%3")
	text = gsub(text, "(" .. v_Latn .. ")kho([bmpt])(" .. v_Latn .. ")", "%1kh%2%3")
	text = gsub(text, "(" .. v_Latn .. ")lo([bdp]h?)(" .. v_Latn .. ")", "%1l%2%3")
	text = gsub(text, "(" .. v_Latn .. ")lo([dp]v)(" .. v_Latn .. ")", "%1l%2%3")

	text = gsub(text, "(" .. v_Latn .. ")mo([bckprṛ])(" .. v_Latn .. ")", "%1m%2%3")
	text = gsub(text, "(" .. v_Latn .. ")no([bcglpṭ]?)(" .. v_Latn .. ")", "%1n%2%3")
	text = gsub(text, "(" .. v_Latn .. ")ṅo([blmp]h?)(" .. v_Latn .. ")", "%1ṅ%2%3")
	text = gsub(text, "(" .. v_Latn .. ")po([bcp])(" .. v_Latn .. ")", "%1p%2%3")
	text = gsub(text, "(" .. v_Latn .. ")pho([bdjmtpz]?)(" .. v_Latn .. ")", "%1ph%2%3")

	text = gsub(text, "(" .. v_Latn .. ")ro([bcdghjklsṣś]h?)(" .. v_Latn .. ")", "%1r%2%3")
	text = gsub(text, "(" .. v_Latn .. ")ṣo([bjlmp])(" .. v_Latn .. ")", "%1ṣ%2%3")
	text = gsub(text, "(" .. v_Latn .. ")śo([bgjlmp])(" .. v_Latn .. ")", "%1ś%2%3")
	text = gsub(text, "(" .. v_Latn .. ")so([bjlmp])(" .. v_Latn .. ")", "%1s%2%3")
	text = gsub(text, "(" .. v_Latn .. ")ṭo([bgkp])(" .. v_Latn .. ")", "%1ṭ%2%3")

	text = gsub(text, "(" .. v_Latn .. ")ẏo([j])(" .. v_Latn .. ")", "%1ẏ%2%3")

	-- exceptional
	text = gsub(text, "([cr])ch$", "%1cho")
	text = gsub(text, "([cr])ch ", "%1cho ")
	text = gsub(text, "([cr])ch(" .. QO .. ")", "%1cho%2")

	text = gsub(text, "apon(" .. v_Latn .. ")", "apn%1")
	text = gsub(text, "arbi", "arobi")

	text = gsub(text, "goñjo$", "gonj")
	text = gsub(text, "goñjo ", "gonj ")
	text = gsub(text, "got", "goto")

	text = gsub(text, "hojjo", "hojj")

	text = gsub(text, "ikta$", "ikota")
	text = gsub(text, "ikta ", "ikota ")

	text = gsub(text, "iẏ$", "iẏo")
	text = gsub(text, "iẏ ", "iẏo ")

	text = gsub(text, "ken$", "keno")
	text = gsub(text, "ken ", "keno ")
	text = gsub(text, "ken(" .. QO .. ")", "keno%1")

	text = gsub(text, "korob", "korbo")

	text = gsub(text, "sṭo$", "sṭ")
	text = gsub(text, "sṭo ", "sṭ ")

	-- Only the "o" is dropped; the captured vowel that follows must be kept
	-- (same shape as the "apon" rule above).
	text = gsub(text, "ajon(" .. v_Latn .. ")", "ajn%1")
	text = gsub(text, "(" .. v_Latn .. ")koṭr(" .. v_Latn .. ")", "%1kṭr%2")
	text = gsub(text, "(" .. v_Latn .. ")khost(" .. v_Latn .. ")", "%1khst%2")
	text = gsub(text, "(" .. v_Latn .. ")jost(" .. v_Latn .. ")", "%1jst%2")
	text = gsub(text, "(" .. v_Latn .. ")no(" .. c_Latn .. "h?)(" .. c_Latn .. "h?)(" .. v_Latn .. ")", "%1n%2%3%4")
	text = gsub(text, "(" .. v_Latn .. ")rkoṭ(" .. v_Latn .. ")", "%1rkṭ%2")
	text = gsub(text, "(" .. v_Latn .. ")ṣdh(" .. v_Latn .. ")", "%1ṣodh%2")
	text = gsub(text, "(" .. v_Latn .. ")sm(" .. v_Latn .. ")", "%1śom%2")

	text = gsub(text, "^up(" .. c_Latn .. ")", "upo%1")
	text = gsub(text, " up(" .. c_Latn .. ")", " upo%1")

	-- qualifiers
	text = gsub(text, "(" .. c_Latn .. ")oṭa$", "%1ṭa")
	text = gsub(text, "(" .. c_Latn .. ")oṭa ", "%1ṭa ")
	text = gsub(text, "(" .. c_Latn .. ")oṭi$", "%1ṭi")
	text = gsub(text, "(" .. c_Latn .. ")oṭi ", "%1ṭi ")

	-- Cv
	text = gsub(text, "([bgmr])v", "%1b")
	text = gsub(text, "udv", "udb")
	text = gsub(text, "ttv", "tt")
	text = gsub(text, "jjv", "jj")
	text = gsub(text, "^[sś]v", "ś") -- initial
	text = gsub(text, "([sś])v", "śś") -- medial

	text = gsub(text, "^(" .. consonants_no_h .. "h?)v", "%1") -- initial
	text = gsub(text, " (" .. consonants_no_h .. "h?)v", " %1") -- initial
	text = gsub(text, "([lṅ])(" .. consonants_no_h .. "h?)v", "%1%2")
	text = gsub(text, "(" .. consonants_no_h .. ")v", "%1%1") -- medial
	text = gsub(text, "(" .. consonants_no_h .. ")hv", "%1%1h") -- medial_h

	--ahv, ihv
	text = gsub(text, "ahv", "aubh")
	text = gsub(text, "ihv", "iubh")

	text = gsub(text, "hv", "hb")

	-- kṣ
	text = gsub(text, "^kṣ", "kh") -- initial
	text = gsub(text, " kṣ", " kh") -- initial
	text = gsub(text, "ṅkṣ", "ṅkh") -- after_ṅ
	text = gsub(text, "kṣ", "kkh") -- medial
	text = gsub(text, "kkhṃ", "kkh") -- before_ṃ

	-- sm
	text = gsub(text, "^([ṣs])ṃ(" .. v_Latn .. ")", "ś%2̃") -- initial
	text = gsub(text, "([ṣs])ṃ(" .. v_Latn .. ")", "śś%2̃") -- medial

	-- tm
	text = gsub(text, "^tṃ", "t") -- initial
	text = gsub(text, "tṃ", "tt") -- medial

	-- Any temporary ṃ / ṣ still present falls back to plain m / ś.
	text = gsub(text, "ṃ", "m")
	text = gsub(text, "ṣ", "ś")

	-- visarga deletion
	text = gsub(text, "ḥkh", "kkh")

	-- foreign conjuncts
	text = gsub(text, "([ln])ḍo$", "%1ḍ")
	-- Before-space twin of the rule above (same pattern, trailing space instead of `$`).
	text = gsub(text, "([ln])ḍo ", "%1ḍ ")

	text = gsub(text, "rko$", "rk")
	text = gsub(text, "rko ", "rk ")

	text = gsub(text, "(" .. v_Latn .. ")h$", "%1ho")
	text = gsub(text, "(" .. v_Latn .. ")h ", "%1ho ")

	text = gsub(text, "([glś])aho$", "%1ah")
	text = gsub(text, "([glś])aho ", "%1ah ")

	text = gsub(text, "ṇn", "ṇon")
	text = gsub(text, "ṇ", "n")

	text = gsub(text, "^eya", "ê")
	text = gsub(text, " eya", " ê")
	text = gsub(text, "^oya", "ê")
	text = gsub(text, " oya", " ê")

	text = gsub(text, "^(" .. consonants_no_h .. "h?)ya", "%1ê") -- initial
	text = gsub(text, " (" .. consonants_no_h .. "h?)ya", " %1ê") -- initial
	text = gsub(text, "^(" .. consonants_no_h .. "h?)(" .. consonants_no_h .. "h?)ya", "%1%2ê") -- initial_double
	text = gsub(text, " (" .. consonants_no_h .. "h?)(" .. consonants_no_h .. "h?)ya", " %1%2ê") -- initial_double
	text = gsub(text, "^hya", "hê") -- h_initial
	text = gsub(text, "yal$", "êl") -- final_l

	text = gsub(text, "^jñan", "gên") -- jñan_initial
	text = gsub(text, " jñan", " gên") -- jñan_initial
	text = gsub(text, "jñan", "ggên") -- jñan_medial

	text = gsub(text, "ñ", "n")

	text = gsub(text, "yanḍ", "ênḍ")

	text = gsub(text, "^(" .. consonants_no_h .. "h?)yo", "%1ê") -- initial
	text = gsub(text, " (" .. consonants_no_h .. "h?)yo", " %1ê") -- initial

	-- Cy
	text = gsub(text, "^(" .. consonants_no_h .. "h?)y", "%1") -- initial
	text = gsub(text, "ṅ(" .. consonants_no_h .. "h?)y", "ṅ%1")
	text = gsub(text, "(" .. consonants_no_h .. ")y", "%1%1") -- medial
	text = gsub(text, "(" .. consonants_no_h .. ")hy", "%1%1h") -- medial_h

	-- hy
	text = gsub(text, "^hy", "h") -- initial
	text = gsub(text, " hy", " h") -- initial
	text = gsub(text, "hy", "jjh") -- medial

	-- ry
	text = gsub(text, "ry", "rj")

	text = gsub(text, "ẏo([gklmn])([aeiīōuū])", "ẏ%1%2")
	text = gsub(text, "ẏoō", "ẏō")
	text = gsub(text, "oō$", "ō")

	-- NOTE(review): `consonant` is the Bengali-script class, but the text is
	-- Latin at this point, so this rule only fires on untransliterated
	-- characters; it may have been meant to use c_Latn. Left unchanged.
	text = gsub(text, "([ei])ẏ([" .. consonant .. "])", "%1ẏo%2")
	text = gsub(text, "([ei])ẏ$", "%1ẏo")

	-- rules for changing s to ś (applicable for native words only)
	-- "ŝ" is a temporary marker for an s that becomes ś unless one of the
	-- exceptions below turns it back into s first.
	text = gsub(text, "s(" .. v_Latn .. ")$", "ś%1") -- final
	text = gsub(text, "s(" .. v_Latn .. ") ", "ś%1 ") -- final
	text = gsub(text, "s(" .. v_Latn .. ")", "ŝ%1") -- medial

	text = gsub(text, "([ai])s$", "%1ś")
	text = gsub(text, "([ai])s ", "%1ś ")

	text = gsub(text, "os$", "oŝ")
	text = gsub(text, "os ", "oŝ ")

	-- NOTE(review): unlike the "ŝe" group below there is no " ...oŝ " (space on
	-- both sides) variant here — confirm whether that is intentional.
	text = gsub(text, "^(" .. c_Latn .. ")oŝ$", "%1os")
	text = gsub(text, " (" .. c_Latn .. ")oŝ$", " %1os")
	text = gsub(text, "^(" .. c_Latn .. ")oŝ ", "%1os ")

	-- "^" is only an anchor in the pattern; in a replacement string it would be
	-- emitted literally, so the replacements must not contain it.
	text = gsub(text, "^ŝe(" .. c_Latn .. ")$", "se%1")
	text = gsub(text, " ŝe(" .. c_Latn .. ")$", " se%1")
	text = gsub(text, "^ŝe(" .. c_Latn .. ") ", "se%1 ")
	text = gsub(text, " ŝe(" .. c_Latn .. ") ", " se%1 ")

	text = gsub(text, "ŝalam", "salam")

	text = gsub(text, "ŝ", "ś")

	text = gsub(text, "śl", "sl")
	text = gsub(text, "śr", "sr")
	text = gsub(text, "sp", "śp")
	text = gsub(text, "^śp", "sp")
	text = gsub(text, " śp", " sp")

	text = gsub(text, "śṭh$", "śṭho")

	text = gsub(text, "^([kg]h?)([dḍtṭ])", "%1o%2")
	text = gsub(text, "^(" .. c_Latn .. ")([aou])b$", "%1%2bo")
	text = gsub(text, "^(" .. c_Latn .. ")([aou])b ", "%1%2bo ")

	text = gsub(text, "^([bcdḍghjkmṃnṇprsśṣtṭẇẏ])([aou])bh$", "%1%2bho")
	text = gsub(text, "^([bcdḍghjkmṃnṇprsśṣtṭẇẏ])([aou])bh ", "%1%2bho ")

	text = gsub(text, "lona$", "lna")
	text = gsub(text, "nola$", "nla")

	-- ẏ after ō becomes ẇ, except at the end of the text / before a space.
	text = gsub(text, "ōẏ", "ōẇ")
	text = gsub(text, "ō̃ẏ", "ō̃ẇ")

	text = gsub(text, "ōẇ$", "ōẏ")
	text = gsub(text, "ōẇ ", "ōẏ ")

	text = gsub(text, "oo", "o")

	-- A leftover character from the Bengali range means some input was not
	-- handled: report failure with nil unless the caller asked for debug output.
	-- The check uses the `override` parameter; no global variable is consulted.
	if match(text, "[ঁ-৽]") and override ~= "debug" then
		return nil
	else
		return toNFC(text)
	end
end

return export