Module:sa-utilities/translit/SLP1-to-Deva

From Linguifex
Jump to navigation Jump to search

Documentation for this module may be created at Module:sa-utilities/translit/SLP1-to-Deva/doc

local export = {}

-- SLP1 letters that stand for consonants, and the same letters wrapped in
-- brackets so they can be dropped into a pattern as a character class.
local consonant_list = "kKgGNcCjJYwWqQRtTdDnpPbBmyrlLvSzsh"
local consonant = ("[%s]"):format(consonant_list)
-- SLP1 letters that stand for vowels, plus the matching character class.
local vowel_list = "aAiIuUfFxXeEoO"
local vowel = ("[%s]"):format(vowel_list)

local U = require("Module:string/char")
local gsub = mw.ustring.gsub
local gmatch = mw.ustring.gmatch

-- Combining signs used by the transliterator:
--   virAma  U+094D  suppresses a consonant's inherent vowel
--   svar    U+0951  mark written on syllables tagged "svarita"
--   anud    U+0952  mark written on syllables tagged "anudatta"
local virAma, svar, anud = U(0x94D), U(0x951), U(0x952)

-- Dependent vowel signs, keyed by SLP1 vowel. These replace a vowel that
-- directly follows a consonant; "a" maps to the empty string, so the bare
-- consonant letter is left on its own.
local diacritics = {
	a = "",
	A = "ा",
	i = "ि",
	I = "ी",
	u = "ु",
	U = "ू",
	f = "ृ",
	F = "ॄ",
	x = "ॢ",
	X = "ॣ",
	e = "े",
	E = "ै",
	o = "ो",
	O = "ौ",
}

-- Single-character SLP1 -> Devanagari table, applied to every character that
-- is still untranslated after conjuncts and vowel signs have been handled.
-- Characters without an entry are left as they are.
local tt = {
	-- consonants, one varga per row
	k = "क", K = "ख", g = "ग", G = "घ", N = "ङ",
	c = "च", C = "छ", j = "ज", J = "झ", Y = "ञ",
	w = "ट", W = "ठ", q = "ड", Q = "ढ", R = "ण",
	t = "त", T = "थ", d = "द", D = "ध", n = "न",
	p = "प", P = "फ", b = "ब", B = "भ", m = "म",
	-- semivowels and retroflex l
	y = "य", r = "र", l = "ल", v = "व", L = "ळ",
	-- sibilants and h
	S = "श", z = "ष", s = "स", h = "ह",

	-- independent vowels (short, long)
	a = "अ", A = "आ",
	i = "इ", I = "ई",
	u = "उ", U = "ऊ",
	f = "ऋ", F = "ॠ",
	x = "ऌ", X = "ॡ",
	e = "ए", E = "ऐ",
	o = "ओ", O = "औ",

	["~"] = "ँ", -- chandrabindu
	M = "ं",     -- anusvara
	H = "ः",     -- visarga
	["'"] = "ऽ", -- avagraha

	-- numerals
	["0"] = "०", ["1"] = "१", ["2"] = "२", ["3"] = "३", ["4"] = "४",
	["5"] = "५", ["6"] = "६", ["7"] = "७", ["8"] = "८", ["9"] = "९",

	-- Vedic extensions
	Z = "ᳵ",
	V = "ᳶ",

	-- The accent markers are deliberately not mapped here:
	-- ['/'] = '',
	-- ['\\'] = '',
}

-- Everything down to export.tr depends only on the module-level tables above,
-- so it is built once when the module loads instead of on every call.

-- SLP1 text of one syllable as seen by the accent pre-pass: any run of
-- consonants, spaces, avagraha, H, M or ~, closed by a vowel.
local syllable_run = "[" .. consonant_list .. " 'HM~]*" .. vowel
-- "/" + syllable + syllable + "/": an unmarked syllable between two udattas.
local between_udattas = "(/" .. syllable_run .. ")(" .. syllable_run .. "/)"
-- Two adjacent consonants (they need a virama between them).
local consonant_pair = "(" .. consonant .. ")(" .. consonant .. ")"

-- Builds a set of the Devanagari letters that `tt` assigns to each SLP1
-- character of `slp1_list`.
local function devanagari_set(slp1_list)
	local set = {}
	for ch in gmatch(slp1_list, ".") do
		set[tt[ch]] = true
	end
	return set
end

local dev_consonant = devanagari_set(consonant_list)
local dev_vowel = devanagari_set(vowel_list)

-- Set of dependent vowel signs (the empty sign for "a" is left out).
local vowel_sign = {}
for _, sign in pairs(diacritics) do
	if sign ~= "" then
		vowel_sign[sign] = true
	end
end

-- Signs that may trail a syllable and are kept inside it.
local trailing_mark = {
	["ँ"] = true, ["ं"] = true, ["ः"] = true,
	["ᳵ"] = true, ["ᳶ"] = true,
}

-- Explicit accent markers that may close a syllable.
local accent_name = { ['/'] = 'acute', ['\\'] = 'grave', ['_'] = 'underscore' }

-- Scans the character array `chars` (length `n`) and returns a list of
-- syllable records { last, resume, accent }:
--   last   index of the syllable's final character (accent marker excluded)
--   resume index at which scanning/copying continues (marker skipped)
--   accent "acute", "grave", "underscore" or "none"
-- A syllable is either an independent vowel, or a consonant followed by
-- (virama + consonant)*, an optional closing virama and an optional vowel
-- sign; in both cases any trailing marks are included.
-- NOTE(review): a vowelless consonant (consonant + virama) is recorded as a
-- syllable of its own, so it can receive the svarita that follows an acute;
-- that mark is removed again by the virama clean-up in export.tr. Confirm
-- this is the intended result for multi-word input.
local function split_syllables(chars, n)
	local syllables = {}
	local i = 1
	while i <= n do
		local ch = chars[i]
		local is_consonant = dev_consonant[ch]
		if is_consonant or dev_vowel[ch] then
			local j = i + 1

			if is_consonant then
				-- (virama + consonant)* for conjuncts
				while j <= n - 1 and chars[j] == virAma and dev_consonant[chars[j + 1]] do
					j = j + 2
				end
				-- optional closing virama
				if j <= n and chars[j] == virAma then
					j = j + 1
				end
				-- optional vowel sign
				if j <= n and vowel_sign[chars[j]] then
					j = j + 1
				end
			end

			-- trailing marks (anusvara, candrabindu, visarga, ...)
			while j <= n and trailing_mark[chars[j]] do
				j = j + 1
			end

			-- optional explicit accent marker right after the syllable
			local accent = "none"
			local resume = j
			if j <= n and accent_name[chars[j]] then
				accent = accent_name[chars[j]]
				resume = j + 1
			end

			syllables[#syllables + 1] = { last = j - 1, resume = resume, accent = accent }
			i = resume
		else
			i = i + 1
		end
	end
	return syllables
end

-- Rewrites the `accent` field of each syllable record in place:
-- * "acute" stays as is (no glyph is written for it); every syllable since
--   the previous accented one becomes "anudatta", and the next unmarked
--   syllable becomes "svarita".
-- * "grave" becomes "svarita", with the same "anudatta" back-fill.
-- * "underscore" becomes "anudatta".
local function propagate_accents(syllables)
	local last_accent = 0
	local prev_acute = false
	for si, syl in ipairs(syllables) do
		local accent = syl.accent
		if accent == "acute" or accent == "grave" then
			for j = last_accent + 1, si - 1 do
				syllables[j].accent = "anudatta"
			end
			if accent == "grave" then
				syl.accent = "svarita"
			end
			last_accent = si
			prev_acute = (accent == "acute")
		elseif accent == "underscore" then
			syl.accent = "anudatta"
			last_accent = si
			prev_acute = false
		elseif prev_acute then
			syl.accent = "svarita"
			last_accent = si
			prev_acute = false
		end
	end
end

-- Reassembles the text from `chars`, dropping the explicit "/", "\" and "_"
-- markers that closed a syllable and appending anud/svar after each syllable
-- tagged "anudatta"/"svarita".
local function render(chars, n, syllables)
	local out = {}
	local offset = 1
	for _, syl in ipairs(syllables) do
		-- characters between syllables, then the syllable itself
		for k = offset, syl.last do
			out[#out + 1] = chars[k]
		end
		if syl.accent == "anudatta" then
			out[#out + 1] = anud
		elseif syl.accent == "svarita" then
			out[#out + 1] = svar
		end
		-- continue after the syllable (and after its marker, if it had one)
		offset = syl.resume
	end
	-- whatever follows the last syllable
	for k = offset, n do
		out[#out + 1] = chars[k]
	end
	return table.concat(out)
end

-- Converts the "/", "\" and "_" markers in already-transliterated text into
-- Devanagari accent marks.
local function apply_vedic_accents(text)
	local chars = {}
	for ch in gmatch(text, ".") do
		chars[#chars + 1] = ch
	end
	local n = #chars

	local syllables = split_syllables(chars, n)
	propagate_accents(syllables)
	return render(chars, n, syllables)
end

-- Transliterates SLP1 text to Devanagari.
-- text: SLP1 string; "/" and "\" after a syllable are read as accent markers.
-- lang, sc: not used by this function.
-- Returns the NFC-normalised Devanagari string.
function export.tr(text, lang, sc)
	-- Introduce underscore as extra 'accent' for syllables where dependent
	-- svarita becomes anudatta (viz. between two udatta's). Matches can
	-- overlap on the shared "/", hence two passes.
	text = gsub(text, between_udattas, "%1_%2")
	text = gsub(text, between_udattas, "%1_%2")

	-- Virama between adjacent consonants; two passes so that every adjacent
	-- pair in a longer cluster is covered.
	local joined = "%1" .. virAma .. "%2"
	text = gsub(text, consonant_pair, joined)
	text = gsub(text, consonant_pair, joined)
	-- Virama on a consonant directly before whitespace, a hyphen or the end
	-- of the string.
	text = gsub(text, "(" .. consonant .. ")%f[%s%z-]", "%1" .. virAma)
	-- Vowel after a consonant becomes the dependent vowel sign.
	text = gsub(text, "(" .. consonant .. ")(" .. vowel ..  ")", function(c, v) return c .. diacritics[v] end)
	-- Everything else goes through the one-to-one table.
	text = gsub(text, '.', tt)

	-- Vedic accent
	text = apply_vedic_accents(text)

	-- fix case where accent mark is placed before diacritic
	text = gsub(text, "([".. anud .. svar .. "])([ंँः]+)", "%2%1")

	-- a virAma followed by accent mark should not have the accent mark at all
	text = gsub(text, virAma .. "[".. anud .. svar .. "]", virAma)

	return mw.ustring.toNFC(text)
end

return export