Module:chg-translit

From Linguifex
Revision as of 18:50, 4 October 2025 by wikt>Bartanaqa (Undo revision 87289466 by Bartanaqa (talk) nvm)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

Documentation for this module may be created at Module:chg-translit/doc

-- Author: Saam-andar

local export = {}

local m_str_utils = require("Module:string utilities")

local gcodepoint = m_str_utils.gcodepoint
local rfind = m_str_utils.find
local rsubn = m_str_utils.gsub
local rmatch = m_str_utils.match
local rsplit = m_str_utils.split
local U = m_str_utils.char
local unpack = unpack or table.unpack -- Lua 5.2 compatibility

-- Single-result wrapper around rsubn(): forwards its three arguments and
-- returns only the first value rsubn() produces, dropping any others.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the call to exactly one return value.
	return (rsubn(term, foo, bar))
end

-- Individual Arabic-script characters, built from their Unicode code points
-- with U() (m_str_utils.char). Several of these names are not referenced
-- anywhere else in the visible part of this module; they are kept as-is.
local zwnj = U(0x200C) -- zero-width non-joiner
local alif_madda = U(0x622) -- ARABIC LETTER ALEF WITH MADDA ABOVE
local alif_hamza_below = U(0x625) -- ARABIC LETTER ALEF WITH HAMZA BELOW
local alif = U(0x627) -- ARABIC LETTER ALEF
local taa_marbuta = U(0x629) -- ARABIC LETTER TEH MARBUTA
local laam = U(0x644) -- ARABIC LETTER LAM
local waaw = U(0x648) -- ARABIC LETTER WAW
local yaa = U(0x64A) -- ARABIC LETTER YEH
local fatHataan = U(0x64B) -- ARABIC FATHATAN
local Dammataan = U(0x64C) -- ARABIC DAMMATAN
local kasrataan = U(0x64D) -- ARABIC KASRATAN
local shadda = U(0x651) -- ARABIC SHADDA
local sukuun = U(0x652) -- ARABIC SUKUN
local dagger_alif = U(0x670) -- ARABIC LETTER SUPERSCRIPT ALEF
local alif_wasl = U(0x671) -- ARABIC LETTER ALEF WASLA
local fatha = U(0x64E) -- ARABIC FATHA
local kasra = U(0x650) -- ARABIC KASRA
local zamma = U(0x64F) -- ARABIC DAMMA
local highhmz = U(0x654) -- ARABIC HAMZA ABOVE
-- NOTE(review): this literal appears to be the same character as `sukuun`
-- (U+0652) above; this is the one that `diacritics` uses -- confirm.
local sukun = "ْ"
--local zwj = U(0x200D) -- zero-width joiner
local lrm = U(0x200E) -- left-to-right mark
local rlm = U(0x200F) -- right-to-left mark

-- Letters treated as consonants; export.tr() places this string inside a
-- character class to find an alif that follows one or more of them.
local consonants = "بپتثجچحخدذرزژسشصضطظعغفقکگلمنوؤهیئء"
-- Combining marks that export.tr() removes from the input before mapping
-- the remaining characters through `tt`.
local diacritics = fatha .. kasra .. zamma .. highhmz .. sukun .. shadda .. dagger_alif .. fatHataan .. Dammataan .. kasrataan

-- Mapping from source (Arabic-script) characters to their Latin
-- transliteration. export.tr() looks each character of the input up in this
-- table and leaves any character without an entry unchanged.
local tt = {
	["آ"] = "ʾā",
	["ا"] = "ʾ",
	["ب"] = "b",
	["پ"] = "p",
	["ت"] = "t",
	["ث"] = "s̱",
	["ج"] = "j",
	["چ"] = "č",
	["ح"] = "ḥ",
	["خ"] = "x",
	["د"] = "d",
	["ذ"] = "ẕ",
	["ر"] = "r",
	["ز"] = "z",
	["ژ"] = "ž",
	["س"] = "s",
	["ش"] = "š",
	["ص"] = "ṣ",
	["ض"] = "ż",
	["ط"] = "ṭ",
	["ظ"] = "ẓ",
	["ع"] = "ʿ",
	["غ"] = "ġ",
	["ف"] = "f",
	["ق"] = "q",
	["ک"] = "k",
	["گ"] = "g",
	["ل"] = "l",
	["م"] = "m",
	["ن"] = "n",
	["و"] = "w",
	["ه"] = "h",
	["ی"] = "y",
	[taa_marbuta] = "t",
	
	["ݣ"] = "ñ",
	
	-- hamza and hamza carriers all map to the same symbol
	["ء"] = "ʾ",
	["ئ"] = "ʾ",
	["ؤ"] = "ʾ",
	["أ"] = "ʾ",
	["إ"] = "ʾ",

	[zwnj] = "-", -- ZWNJ (zero-width non-joiner)

	-- ligatures
	["ﻻ"] = "lʾ",
	-- NOTE(review): this key is several characters long and contains
	-- diacritics, but export.tr() strips diacritics first and then looks
	-- characters up one at a time, so this entry looks unreachable -- confirm.
	["اللّٰه"] = "ʾllh",

	-- kashida
	["ـ"] = "-", -- kashida, no sound

	-- alif wasla
	[alif_wasl] = "ʾ̃", 

	-- numerals
	["۱"] = "1",
	["۲"] = "2",
	["۳"] = "3",
	["۴"] = "4",
	["۵"] = "5",
	["۶"] = "6",
	["۷"] = "7",
	["۸"] = "8",
	["۹"] = "9",
	["۰"] = "0",

	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
	["«"] = "“", -- quotation mark
	["»"] = "”", -- quotation mark
	["٪"] = "%", -- percent
	["؉"] = "‰", -- per mille
	["٫"] = ".", -- decimal separator
	["٬"] = ",", -- thousands separator
}

-- Main function.
--
-- Transliterates Arabic-script `text` into Latin script: diacritics are
-- stripped, an alif following consonants becomes "ā", and every remaining
-- character is mapped through the `tt` table (unmapped characters pass
-- through unchanged).
--
-- @param text    The string to transliterate, or a table with an `args`
--                field (e.g. a frame). In the table form, args[1], args[2]
--                and args[3] supply text, lang and sc (empty strings are
--                treated as absent), and `options` becomes an empty table
--                when args[4] is present.
-- @param lang    Language code; accepted for interface compatibility but not
--                used by this function.
-- @param sc      Script code. When given and not equal to "Arab", nothing is
--                transliterated.
-- @param options Accepted for interface compatibility; not used here.
-- @return The transliterated string, or nil when `text` is missing/empty or
--         `sc` names a script other than "Arab".
function export.tr(text, lang, sc, options)
	if type(text) == "table" then
		-- Cache the argument table before `text` is overwritten. Reading
		-- `text.args[4]` after the reassignment would index a string (or
		-- nil) instead of the original table and raise an error.
		local args = text.args
		local function f(x) return (x ~= "") and x or nil end
		text, lang, sc = f(args[1]), f(args[2]), f(args[3])
		options = args[4] and {} or nil
	end

	-- Checked after unpacking so that an empty or missing first argument in
	-- the table form is rejected as well.
	if not text or text == "" then
		return nil
	end

	-- Only process if script is Arabic
	if sc and sc ~= "Arab" then
		return nil
	end

	-- Strip diacritics
	text = rsubn(text, "[" .. diacritics .. "]", "")
	-- An alif preceded by one or more consonants is written as "ā"; any
	-- other alif is left for the `tt` lookup below, which maps it to "ʾ".
	text = rsubn(text, "([" .. consonants .. "]+)ا", "%1ā" )

	-- Map each remaining character through `tt`, keeping unmapped ones as-is.
	text = rsubn(text, ".", function(char)
		return tt[char] or char
	end)

	return text
end

return export