Module:ps-translit

From Linguifex
Jump to navigation Jump to search

This module will transliterate Pashto language text. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:ps-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

local m_str_utils = require("Module:string utilities")

local U = m_str_utils.char
local gsub = m_str_utils.gsub

-- Table returned by this module; public functions are attached to it below.
local export = {}

-- Short-vowel signs, built from their code points.
local zwar, zer, pesh = U(0x64E), U(0x650), U(0x64F)
-- Gemination sign (also called shadda).
local tashdid = U(0x651)
-- Pashto /ə/ sign.
local zwarakay = U(0x659)
-- "No vowel" sign (also called sukun).
local jazm = "ْ"

-- Individual letters referenced by name in the substitution rules.
local he = "ه"
local ain = "ع"
local alif = "ا"
local ye = "ي"
local ye2 = "ےی" -- two ye variants, used as a character class
local ye3 = "ې"
local waw = "و"
local nasal = "ں"

-- Character classes; each string is spliced between [ ] when a pattern is built.
-- `consonants` contains both waw and he.
local consonants = "بپتټثجځچڅحخدډذرړزژږسشښصضطظعغفقکګلمنڼوه"
-- `consonantS` is `consonants` without waw.
local consonantS = "بپتټثجځچڅحخدډذرړزژږسشښصضطظعغفقکګلمنڼه"
-- `consonantS2` is `consonants` plus ye.
local consonantS2 = "بپتټثجځچڅحخدډذرړزژږسشښصضطظعغفقکګلمنڼوهي"
-- Vowel-carrying letters; `vowels2` is the same set without plain alif.
local vowels = "اآیېيۍئےو"
local vowels2 = "آیېيۍئےو"
-- Letters that can stand for a glide or a vowel.
local semivowels = "وي"
local hes = "هح"
-- All combining signs handled by the rules.
local diacritics = "َُِّْٰٙ"
-- The vowel signs only: Zwar, Zer, Pesh, Zwarakay.
local ZZPZ = "َُِٙ"

-- Per-character fallback table. export.tr applies it with
-- gsub(text, '.', mapping) after its contextual rules have run, so an entry
-- here only decides the output for a character that no earlier rule consumed.
-- Long a is written "â" throughout; the ligature entries follow the same
-- convention so that a ligature and its spelled-out form romanise identically.
local mapping = {
	-- letters
	["آ"] = "â", ["ب"] = "b", ["پ"] = "p", ["ت"] = "t", ["ټ"] = "ṭ", ["ث"] = "s̱",
	["ج"] = "j", ["ځ"] = "ź", ["چ"] = "č", ["څ"] = "ś", ["ح"] = "ḥ", ["خ"] = "x",
	["د"] = "d", ["ډ"] = "ḍ", ["ذ"] = "ẕ", ["ر"] = "r", ["ړ"] = "ṛ", ["ز"] = "z", ["ژ"] = "ž", ["ږ"] = "ǵ",
	["س"] = "s", ["ش"] = "š", ["ښ"] = "x̌", ["ص"] = "ṣ", ["ض"] = "ẓ",
	["ط"] = "t̤", ["ظ"] = "z̤", ["ع"] = "ʻ", ["غ"] = "ǧ", ["ف"] = "f", ["ق"] = "q",
	["ک"] = "k", ["ګ"] = "g", ["ڼ"] = "ṇ",
	["ل"] = "l", ["م"] = "m", ["ن"] = "n", ["و"] = "w", ["ه"] = "h", ["ي"] = "y", ["ں"] = "ṉ",

	["ؤ"] = "wə", ["ۍ"] = "əy", ["ئ"] = "əy", ["ې"] = "e", ["ۀ"] = "ə", ["ی"] = "y", ["ے"] = "y",

	-- diacritics
	[zwar] = "a",
	[zer] = "ĭ",
	[pesh] = "ŭ",
	[zwarakay] = "ə",
	[jazm] = "", -- also sukun - no vowel
	[U(0x200C)] = "-", -- ZWNJ (zero-width non-joiner)

	-- ligatures (same "â" as the spelled-out forms)
	["ﻻ"] = "lâ",
	["ﷲ"] = "allâh",

	-- kashida
	["ـ"] = "-", -- kashida, no sound

	-- numerals
	["۱"] = "1", ["۲"] = "2", ["۳"] = "3", ["۴"] = "4", ["۵"] = "5",
	["۶"] = "6", ["۷"] = "7", ["۸"] = "8", ["۹"] = "9", ["۰"] = "0",

	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
	["«"] = "“", -- quotation mark
	["»"] = "”", -- quotation mark
	["٪"] = "%", -- percent
	["؉"] = "‰", -- per mille
	["٫"] = ".", -- decimals
	["٬"] = ",", -- thousand
}

--- Transliterates Pashto text written in Arabic script into Latin script.
--
-- The work happens in four stages:
--   1. every word is wrapped in "#" markers so rules can anchor on word edges;
--   2. an ordered list of contextual substitutions resolves letter/diacritic
--      combinations. Rules run top to bottom and later rules rely on what the
--      earlier ones left behind, so the order must not be changed casually;
--   3. each remaining character is looked up in `mapping`;
--   4. a few Latin-side clean-ups merge leftover short-vowel + glide pairs.
--
-- @param text string to transliterate
-- @param lang language code; not read by this function
-- @param sc script code; not read by this function
-- @return the transliterated string
function export.tr(text, lang, sc)

	-- Mark word boundaries. A literal "#" in the input is parked as "HASHTAG"
	-- first so it cannot be mistaken for a boundary marker.
	text = gsub(text, "#", "HASHTAG")
	text = gsub(text, " | ", "# | #")
	text = gsub(text, "\n", "#\n#")
	text = "##" .. gsub(text, " ", "# #") .. "##"
	-- hashtags now mark the beginning and end of a word

	-- EXCEPTIONS - leave as they are, unless they have been sorted out elsewhere
	text = gsub(text, "ن٘", "ṉ")
	text = gsub(text, "الله", "allâh")

	-- diacritics
	text = gsub(text, pesh .. waw .. jazm, "u")
	text = gsub(text, jazm .. alif, "â")

	-- Initial alif (alif carrying a vowel sign; these patterns are not
	-- anchored on "#", so they apply wherever the sequence occurs)
	text = gsub(text, alif .. zwar .. '([' .. consonantS .. '])', "a%1")
	text = gsub(text, alif .. zer .. ye .. jazm, "i")

	text = gsub(text, alif .. zer, "ĭ")
	text = gsub(text, alif .. waw .. jazm, "o")

	text = gsub(text, alif .. pesh .. waw, "u")
	-- NOTE(review): the rule above already consumes every alif+pesh+waw, so
	-- this one looks unreachable; kept unchanged pending confirmation.
	text = gsub(text, alif .. pesh .. waw .. jazm, "u")
	text = gsub(text, alif .. pesh, "ŭ")

	-- Tashdeed: double the letter that carries it
	text = gsub(text, '([' .. consonantS2 .. '])' .. tashdid, "%1%1")
	text = gsub(text, '([' .. consonantS2 .. '])' .. tashdid .. '([' .. ZZPZ .. '])', "%1%1%2")
	text = gsub(text, '([' .. ZZPZ .. '])' .. ye .. '([' .. ZZPZ .. '])' .. tashdid, "%1yy%2")
	text = gsub(text, '([' .. ZZPZ .. '])' .. waw .. '([' .. ZZPZ .. '])' .. tashdid, "%1ww%2")
	-- For some reason the tashdeed gets pushed after the other diacritics, so this line is necessary for tashdeed to work with other diacritics
	text = gsub(text, '([' .. consonants .. '])' .. '([' .. ZZPZ .. '])' .. tashdid, "%1%1%2")

	-- tanween diacritic / no need to mess about
	text = gsub(text, '([' .. consonants .. '])' .. 'ً' .. alif, "%1an")
	text = gsub(text, alif .. 'ً', "an")
	text = gsub(text, '([' .. consonants .. '])' .. 'ً', "%1an")

	-- tall zwar -- / no need to mess about
	-- (the matched vowel letter itself is dropped; only "á" is emitted)
	text = gsub(text, '([' .. vowels .. '])' .. 'ٰ', "á")
	text = gsub(text, '([' .. consonants .. '])' .. 'ٰ' .. '([' .. vowels .. '])', "%1á")

	-- ʻain
	-- The alif + ʻain rule emits the same "ʻ" as every other ʻain rule and as
	-- `mapping`, rather than a typographic left quotation mark.
	text = gsub(text, alif .. ain, "âʻ")
	text = gsub(text, ain .. alif .. '([' .. consonants .. '])', "ʻâ%1")
	text = gsub(text, '([' .. consonants .. '])' .. ain .. he, "%1ʻa")
	text = gsub(text, '([' .. consonants .. '])' .. '([' .. zer .. pesh .. ']?)' .. ain, "%1%2ʻ")
	text = gsub(text, ain .. zer .. '([' .. consonants .. '])', "ʻĭ%1")
	text = gsub(text, ain .. pesh .. '([' .. consonants .. '])', "ʻŭ%1")
	text = gsub(text, ain .. zer .. ye .. '([' .. consonants .. '])', "ʻi%1")
	text = gsub(text, ain .. pesh .. waw .. '([' .. consonantS .. '])', "ʻu%1")

	--- alif
	text = gsub(text, '([' .. consonants .. '])' .. zwar .. alif, "%1â")
	text = gsub(text, '([' .. consonantS2 .. '])' .. alif, "%1â")
	text = gsub(text, '([' .. consonants .. '])' .. tashdid .. alif, "%1%1â")
	-- word-initial alif + vowel letter (these consume the leading "#")
	text = gsub(text, "#" .. alif .. ye, "i")
	text = gsub(text, "#" .. alif .. waw, "o")
	text = gsub(text, "#" .. alif .. ye3, "e")
	text = gsub(text, '([' .. consonantS2 .. '])' .. alif .. ye .. waw, "%1âyo")

	-- waw
	-- waw carrying its own vowel sign is a consonant
	text = gsub(text, waw .. '([' .. ZZPZ .. '])', "w%1")
	text = gsub(text, 'ُو', "u")


	-- medial/final consonants

	--- (e) -- works
	text = gsub(text, '([' .. consonants .. '])' .. ye .. jazm .. '([' .. consonants .. '])', "%1i%2")

	--- he
	text = gsub(text, '([' .. consonantS2 .. '])' .. zwar .. he .. zer .. ye, "%1ahi")
	text = gsub(text, '([' .. consonantS2 .. '])' .. zwar .. he .. alif, "%1ahâ")
	text = gsub(text, zwar .. he .. '([' .. consonants .. vowels .. '])', "ah%1")
	text = gsub(text, '([' .. consonantS2 .. '])' .. zwar .. he, "%1ah")
	-- word-final he after a consonant is read as "a"
	text = gsub(text, '([' .. consonantS .. '])' .. he .. "#", "%1a")
	text = gsub(text, jazm .. waw .. he, "wa")
	text = gsub(text, jazm .. ye .. he, "ya")
	text = gsub(text, '([' .. vowels2 .. diacritics .. '])' .. '([' .. semivowels .. '])' .. he .. "#", "%1%2a")
	text = gsub(text, '([' .. consonantS .. '])' .. waw .. he .. "#", "%1oh")
	text = gsub(text, '([' .. consonantS .. '])' .. ye .. he .. "#", "%1ih")

	--- waw

	-- waw directly after a consonant is read as "o"
	text = gsub(text, '([' .. consonants .. '])' .. waw, "%1o")

	text = gsub(text, '([' .. consonantS2 .. '])' .. tashdid .. waw, "%1%1o")
	text = gsub(text, zer .. ye .. waw, "io")
	text = gsub(text, '([' .. consonantS2 .. '])' .. ye .. waw .. jazm, "%1iw")
	text = gsub(text, '([' .. consonantS2 .. '])' .. ye .. waw .. "#", "%1yo")
	text = gsub(text, '([' .. consonantS2 .. '])' .. ye .. waw .. '([' .. consonantS .. '])', "%1yo%2")
	text = gsub(text, '([' .. diacritics .. '])' .. '([' .. semivowels .. '])' .. waw, "%1%2o")
	text = gsub(text, ye .. waw, "yo")
	-- NOTE(review): the rule above already consumes every ye+waw, so this
	-- anchored variant looks unreachable; kept unchanged pending confirmation.
	text = gsub(text, "#" .. ye .. waw, "yo")

	text = gsub(text, '([' .. consonants .. '])' .. tashdid .. zer .. ye .. jazm .. alif, "%1%1iyâ")
	text = gsub(text, '([' .. consonants .. '])' .. zer .. ye .. alif, "%1iâ")

	--- ye
	--text = gsub(text, '([' .. consonantS2 .. '])' .. ye .. waw .. ye, "%1iwi")
	--text = gsub(text, '([' .. consonantS2 .. '])' .. waw .. ye .. jazm, "%1oy")

	text = gsub(text, ye .. zwar .. alif, "yâ")
	-- ye after a consonant (or after zer) is read as "i"
	text = gsub(text, '([' .. consonants .. zer .. '])' .. ye .. '([' .. consonantS .. '])', "%1i%2")
	text = gsub(text, '([' .. consonants .. zer .. '])' .. ye .. "#", "%1i")
	-- the ye variants in ye2 get an "a" inserted before them
	text = gsub(text, '([' .. consonantS .. '])' .. '([' .. ye2 .. '])', "%1a%2")
	text = gsub(text, '([' .. diacritics .. '])' .. '([' .. semivowels .. '])' .. '([' .. ye2 .. '])', "%1%2a%3")
	text = gsub(text, "#" .. '([' .. semivowels .. '])' .. '([' .. ye2 .. '])', "%1a%2")


	-- get rid of hashtags (not needed), then restore any literal "#" that was
	-- parked as "HASHTAG" at the start
	text = gsub(text, "#", "")
	text = gsub(text, "HASHTAG", "#")

	-- Convert every remaining single character through `mapping`
	text = gsub(text, '.', mapping)

	-- Latin-side clean-up: merge a short vowel with the glide or long vowel
	-- that follows it, drop a leftover alif directly before "a", and turn a
	-- doubled "a" into "â"
	text = gsub(text, 'ĭy', "i")
	text = gsub(text, 'ŭw', "u")
	text = gsub(text, 'ĭi', "i")
	text = gsub(text, 'ŭu', "u")
	text = gsub(text, "اa", "a")
	text = gsub(text, 'aa', "â")
	--

	return text
end
return export