Module:ks-Arab-translit

From Linguifex
Jump to navigation Jump to search

This module will transliterate text in the Arabic script. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:ks-Arab-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- TODO: long í support
-- TODO: sort out short e vs palatalisation
-- TODO: add rule for CẹC = CyaC

local u = require("Module:string/char")
local gsub = mw.ustring.gsub

local export = {}

-- defined below
local invalid_vowel_combination

local khatma = '۔'
local question_mark = '؟'

local vav = 'و'
local ye = 'ی'
local alif = 'ا'
local he = 'ہ'
local docheshm = u(0x06BE)

local re = 'ر'
local nun = 'ن'

local vw_s_cfu = u(0x0650) -- pesh (i)
local vw_s_ccu = u(0x0655) -- hamza below
local vw_s_cbr = u(0x064F) -- zer (u)
local vw_s_mcu = u(0x0654) -- hamza above
local vw_s_ocu = u(0x064E) -- zabar (a)

local vw_l_cbr = u(0x0657) -- inverted zer
local vw_l_cfu = u(0x0656) -- subscript alif

local hat = u(0x065A) -- V
local inverted_hat = u(0x065B) -- inverted V
local hats = hat .. inverted_hat

local short_vowels_list = vw_s_cfu .. vw_s_ccu .. vw_s_cbr .. vw_s_mcu .. vw_s_ocu 

-- carrier + diacritic combos
local long_u = vav .. vw_l_cbr
local short_o = vav .. inverted_hat
local long_i = ye .. vw_l_cfu
local short_ye = ye .. inverted_hat

local vocalised_carrier = long_u .. short_o .. long_i .. short_ye .. 'ێ' .. 'ۆ' .. 'اٟ' .. 'ۄ'
local standalone_carrier = long_u .. short_o .. long_i .. short_ye .. 'ۆ' .. 'ێ' .. 'ۆ' .. 'اٟ' .. 'ۄ'
local palatalisers = "ۍؠ"

local consonants = "بپتٹجچدڈرڑزژسشفکگلمنهھےثحخذصضطظعغقۍۄیٲآ"
local consonants_latn = "bptṭsjchxdḍzrċsśzʿġfqkglmnhv"

local conv = {
	-- consonants
	['ب'] = 'b', ['پ'] = 'p', ['ت'] = 't', ['ٹ'] = 'ṭ', ['ث'] = 's',
	['ج'] = 'j', ['چ'] = 'c', ['ح'] = 'h', ['خ'] = 'kh',
	['د'] = 'd', ['ڈ'] = 'ḍ', ['ذ'] = 'z',
	['ر'] = 'r', ['ڑ'] = 'ḍ', ['ز'] = 'z', ['ژ'] = 'ċ',
	['س'] = 's', ['ش'] = 'ś', ['ص'] = 's', ['ض'] = 'z',
	['ط'] = 't', ['ظ'] = 'z',
	['ع'] = 'ʿ', ['غ'] = 'ġ',
	['ف'] = 'f', ['ق'] = 'q',
	['ک'] = 'k', ['گ'] = 'g',
	['ل'] = 'l', ['م'] = 'm', ['ن'] = 'n',
	['ہ'] = 'h', ['ھ'] = 'h',

    -- Why is this separate?
	['خ'] = 'kh',
	
	-- always word-final
	['ے'] = 'y', 

    -- short e to be treated separately
	
	-- incorrect palatalisation marker
	['ۍ'] = '\'',

	-- broken/open vowels 
    -- confirm if there are other use cases for these two
	['ۄ'] = 'ọ', ['ؠ'] = 'ẹ', -- optionally ẹ = ya or used at the end to indicate palatalisation
	
	-- a carries long vowels
	['ٲ'] = 'ạ̄', ['آ'] = 'ā',

    ['یٖ'] = 'ī', ['اٟ'] = 'ụ̄',

    -- vowels
    ['ۆ'] = 'o', ['ێ'] = 'e', ['أ'] = 'ạ',
	
	-- numerals
	['۰'] = '0', ['۱'] = '1', ['۲'] = '2', ['۳'] = '3', ['۴'] = '4', ['۵'] = '5', ['۶'] = '6', ['۷'] = '7', ['۸'] = '8', ['۹'] = '9',

}

local short_vowels = {
	-- high vowels
	[u(0x0650)] = 'i', [u(0x0655)] = 'ụ', [u(0x064F)] = 'u', [u(0x065F)] = 'ụ̄',
	
	-- central vowels
	[u(0x0654)] = 'ạ',
	
	-- low vowels
	[u(0x064E)] = 'a',
}

local hardcoded_conv = {
    [u(0x0624)] = 'vạ',
}

local alif = 'ا'
local waw = 'و'
local ye = 'ی'

function export.tr(text, lang, sc)
    if invalid_vowel_combination(text) then
--		require("Module:debug").track("ks-Arab-translit/invalid vowel combination")
		return nil
    end

	text = gsub(text, khatma, '.')
	text = gsub(text, question_mark, '?')

    text = gsub(text, '([' .. palatalisers .. ']%f[%s%z])', '\'')

    -- short e at the end of words is ē with V sign
    text = gsub(text, 'ے' .. hat, "e")

    -- ye with inverted hat is /j/
    -- ? always occurs after a consonant
    text = gsub(text, "([" .. consonants .. "])" .. ye .. inverted_hat .. "([" .. short_vowels_list .. "])", "%1y%2")
    text = gsub(text, "([" .. consonants .. "])" .. short_ye .. alif, "%1yā")
    text = gsub(text, "([" .. consonants .. "])" .. short_ye, "%1y")

    -- nun with hat
    text = gsub(text, nun .. "[" .. hats.. "]", nun)

    -- ye with hat = short e 
    -- only when not final?
    text = gsub(text, ye .. "[" .. hats.. "]", "e")

    -- re with inverted hat followed by short vowel is re + short vowel
--    text = gsub(text, "([" .. consonants .. "])" .. re .. inverted_hat .. "([" .. short_vowels_list .. "])", "%1r%2")

    -- re with inverted hat is re
    text = gsub(text, re .. inverted_hat, re)

    -- interconsonantal vav is a long ō sound
    text = gsub(text,
        '([' .. consonants ..  ']' .. docheshm .. '?)' .. vav .. '([' .. consonants .. '])',
        "%1ō%2")

    -- intervocalic alif is a long a sound
	text = gsub(text, '([' .. consonants .. 'و ' .. '])' .. alif .. '([' .. consonants .. 'و' .. '])', "%1ā%2")

    -- consonant + short vowel + he disregards the he and transliterates the vowel
    text = gsub(text, '([' .. consonants .. '])([' ..  short_vowels_list .. '])' .. he .. '(%s)', "%1%2%3")
    text = gsub(text, '([' .. consonants .. '])([' ..  short_vowels_list .. '])' .. he .. '^', "%1%2")

    -- final he + short vowel disregards the he and transliterates the vowel
    -- should return NIL 
    text = gsub(text, he .. '([' ..  short_vowels_list .. '])$', short_vowels)

    -- word-initial alif + vowelled carrier drops the alif
    text = gsub(text, '^' .. alif .. '([' .. vocalised_carrier .. '])', "%1")

    -- word-initial alif + short vowel diacritic drops the alif
    text = gsub(text, '^' .. alif .. '([' .. short_vowels_list .. '])', "%1")

    -- word-initial alif + vowelled carrier drops the alif
    text = gsub(text, '(%s)' .. alif .. '([' .. vocalised_carrier .. '])', "%1%2")

    -- word-initial alif + short vowel diacritic drops the alif
    text = gsub(text, '(%s)' .. alif .. '([' .. short_vowels_list .. '])', "%1%2")

    -- re with inverted hat is just re
    text = gsub(text, re .. inverted_hat, re)
	
    -- long /u:/ and /i:/
    text = gsub(text, vav .. vw_s_cbr .. vav .. "([" .. consonants .. "])", vav .. "ū%1")
    text = gsub(text, "([" .. consonants  .. "])" .. vw_s_cfu .. ye .. "([" .. consonants .. "])", "%1ī%2")

    -- vav with hat = short o
    text = gsub(text, vav .. "[" .. hats .. "]", "o")

    -- vav with short vowel
    text = gsub(text,
        vav .. "([" .. short_vowels_list .. "])",
        function(c)
            return "v" .. short_vowels[c]
        end)

    -- vav with inverted pish = long u
    text = gsub(text, long_u, "ū")

    -- long i
    text = gsub(text, ye .. vw_l_cfu, 'ī') 

    -- intervocalic ye is a long a sound
    text = gsub(text, '([' .. consonants .. '])' .. ye .. '([' .. consonants .. '])', "%1ē%2")

    -- word-final alif and ye
    text = gsub(text, '([' .. consonants .. '])' .. ye .. '$', "%1ī")
    text = gsub(text, '([' .. consonants .. '])' .. alif .. '$', "%1ā")

    -- regard the consonant + short vowel combinations throughout
	text = gsub(text, '.', short_vowels)

	text = gsub(text, '[أإبپتٹجچدڈرڑزژسشفکگلمنهھےثحخذصضطظعغقۍۄؠٲآۆۆێ]', conv)
	
	-- normal consonants left over
	text = gsub(text, vav, 'v')
	text = gsub(text, 'ہ', 'h')
    text = gsub(text, "ی", "y")

    -- hardcoded consonants left over
    -- Seems to be auto-subbed?
    text = gsub(text, '[ؤ]', hardcoded_conv)

	-- CẹC = CyaC
    text = gsub(text, "([" .. consonants_latn .. "])ẹ([" .. consonants_latn .. "])", "%1ya%2")
	
	return text
end

function invalid_vowel_combination(text) 
    -- if a vowel carrier or a standalone vowel has another vowel on top it should not
    local orig_text = text
    local count
    text, count = gsub(text, "[" .. standalone_carrier .. short_vowels_list .. "]", "")
    if count > 0 then
        return nil
	end
    return #text == 0
end

return export