Module:chg-translit

From Linguifex
Jump to navigation Jump to search

This module will transliterate Chagatai language text. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:chg-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- Author: Saam-andar

local export = {}

local m_str_utils = require("Module:string utilities")

local gcodepoint = m_str_utils.gcodepoint
local rfind = m_str_utils.find
local rsubn = m_str_utils.gsub
local rmatch = m_str_utils.match
local rsplit = m_str_utils.split
local U = m_str_utils.char
local unpack = unpack or table.unpack -- Lua 5.2 compatibility

-- Single-result wrapper around rsubn(): yields only the substituted string
-- and drops every additional return value (such as the replacement count).
local function rsub(term, foo, bar)
	-- The enclosing parentheses truncate the call to exactly one value.
	return (rsubn(term, foo, bar))
end

-- Named code points used by the transliteration tables below.
-- Several of these names are not referenced anywhere else in the visible
-- part of this file; they are kept as-is.
local zwnj = U(0x200C) -- zero-width non-joiner
local alif_madda = U(0x622) -- alef with madda above
local alif_hamza_below = U(0x625) -- alef with hamza below
local alif = U(0x627) -- plain alef
local taa_marbuta = U(0x629) -- teh marbuta
local laam = U(0x644) -- lam
local waaw = U(0x648) -- waw
local yaa = U(0x64A) -- (Arabic) yeh
local fatHataan = U(0x64B) -- fathatan (tanwin)
local Dammataan = U(0x64C) -- dammatan (tanwin)
local kasrataan = U(0x64D) -- kasratan (tanwin)
local shadda = U(0x651) -- shadda (gemination mark)
local sukuun = U(0x652) -- sukun
local dagger_alif = U(0x670) -- superscript (dagger) alef
local alif_wasl = U(0x671) -- alef wasla
local fatha = U(0x64E) -- fatha
local kasra = U(0x650) -- kasra
local zamma = U(0x64F) -- damma
local highhmz = U(0x654) -- hamza above (combining)
-- NOTE(review): this literal appears to be the same character as `sukuun`
-- above (U+0652); it is the one actually used in `diacritics` -- confirm
-- and consider unifying the two names.
local sukun = "ْ"
--local zwj = U(0x200D) -- zero-width joiner
local lrm = U(0x200E) -- left-to-right mark
local rlm = U(0x200F) -- right-to-left mark

-- Letters treated as consonants; concatenated into a character class by tr().
local consonants = "بپتثجچحخدذرزژسشصضطظعغفقکگلمنوؤهیئء"
-- Combining marks; concatenated into a character class by tr() and removed
-- from the input before any other processing.
local diacritics = fatha .. kasra .. zamma .. highhmz .. sukun .. shadda .. dagger_alif .. fatHataan .. Dammataan .. kasrataan

-- mapping
-- Lookup table from a source character to its Latin transliteration.
-- tr() consults it once per character of the input; characters that have no
-- entry here are passed through unchanged.
local tt = {
	["آ"] = "ʾā",
	["ا"] = "ʾ",
	["ب"] = "b",
	["پ"] = "p",
	["ت"] = "t",
	["ث"] = "s̱",
	["ج"] = "j",
	["چ"] = "č",
	["ح"] = "ḥ",
	["خ"] = "x",
	["د"] = "d",
	["ذ"] = "ẕ",
	["ر"] = "r",
	["ز"] = "z",
	["ژ"] = "ž",
	["س"] = "s",
	["ش"] = "š",
	["ص"] = "ṣ",
	["ض"] = "ż",
	["ط"] = "ṭ",
	["ظ"] = "ẓ",
	["ع"] = "ʿ",
	["غ"] = "ġ",
	["ف"] = "f",
	["ق"] = "q",
	["ک"] = "k",
	["گ"] = "g",
	["ل"] = "l",
	["م"] = "m",
	["ن"] = "n",
	["و"] = "w",
	["ه"] = "h",
	["ی"] = "y",
	[taa_marbuta] = "t",
	
	["ݣ"] = "ñ",
	
	-- hamza and its carrier forms all map to the same glottal-stop sign
	["ء"] = "ʾ",
	["ئ"] = "ʾ",
	["ؤ"] = "ʾ",
	["أ"] = "ʾ",
	["إ"] = "ʾ",

	[zwnj] = "-", -- ZWNJ (zero-width non-joiner)

	-- ligatures
	["ﻻ"] = "lʾ", -- lam-alif ligature (presentation form)
	-- NOTE(review): the key below spans several code points and contains
	-- shadda and dagger alif; a lookup keyed on one character at a time can
	-- never hit it -- confirm how (or whether) it is meant to be matched.
	["اللّٰه"] = "ʾllh",

	-- kashida
	["ـ"] = "-", -- kashida, no sound

	-- alif_wasla
	[alif_wasl] = "ʾ̃", 

	-- numerals
	["۱"] = "1",
	["۲"] = "2",
	["۳"] = "3",
	["۴"] = "4",
	["۵"] = "5",
	["۶"] = "6",
	["۷"] = "7",
	["۸"] = "8",
	["۹"] = "9",
	["۰"] = "0",

	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
	["«"] = "“", -- quotation mark
	["»"] = "”", -- quotation mark
	["٪"] = "%", -- percent
	["؉"] = "‰", -- per mille
	["٫"] = ".", -- decimal separator
	["٬"] = ",", -- thousands separator
}

-- Main function
--- Transliterates Arabic-script text into Latin characters.
-- Processing steps: remove every character listed in `diacritics`, turn an
-- alif that directly follows a run of `consonants` into "ā", then replace
-- each remaining character through the `tt` table (characters without an
-- entry are kept unchanged).
-- @param text    string to transliterate, or a table with an `args` field
--                (e.g. a Scribunto frame) whose positional arguments 1-3
--                supply text, lang and sc, and whose argument 4, when
--                present, switches `options` to an empty table
-- @param lang    language code; accepted for interface compatibility, not
--                read by this function
-- @param sc      script code; when given and not "Arab" the function
--                returns nil
-- @param options accepted for interface compatibility, not read by this
--                function
-- @return the transliterated string, or nil when the text is missing or
--         empty or the script is not "Arab"
function export.tr(text, lang, sc, options)
	if type(text) == "table" then
		-- Keep a handle on the argument table: `text` is about to be
		-- overwritten with the first argument, so it must not be indexed
		-- again to reach the remaining arguments.
		local args = text.args
		local function nonempty(x) return (x ~= "") and x or nil end
		text, lang, sc = nonempty(args[1]), nonempty(args[2]), nonempty(args[3])
		options = args[4] and {} or nil
	end

	-- Checked after unpacking so that a missing or empty first argument of a
	-- table-style call also yields nil instead of reaching the substitutions.
	if not text or text == "" then
		return nil
	end

	-- Only process if script is Arabic
	if sc and sc ~= "Arab" then
		return nil
	end

	-- Strip diacritics (single-target assignment drops rsubn's extra results)
	text = rsubn(text, "[" .. diacritics .. "]", "")
	-- An alif following one or more consonant letters becomes the vowel "ā"
	text = rsubn(text, "([" .. consonants .. "]+)ا", "%1ā" )

	-- Per-character lookup; characters without a mapping pass through as-is
	text = rsubn(text, ".", function(char)
		return tt[char] or char
	end)

	return text
end

return export