Module:ar-translit: Difference between revisions

Created page with "-- Authors: Benwing, ZxxZxxZ, Atitarev local export = {} local U = mw.ustring.char local rfind = mw.ustring.find local rsubn = mw.ustring.gsub local rmatch = mw.ustring.matc..."
 
m 1 revision imported
 
(One intermediate revision by one other user not shown)
Line 3: Line 3:
local export = {}
local export = {}


local U = mw.ustring.char
local m_str_utils = require("Module:string utilities")
local rfind = mw.ustring.find
 
local rsubn = mw.ustring.gsub
local gcodepoint = m_str_utils.gcodepoint
local rmatch = mw.ustring.match
local rfind = m_str_utils.find
local rsplit = mw.text.split
local rsubn = m_str_utils.gsub
local gcodepoint = mw.ustring.gcodepoint
local rmatch = m_str_utils.match
local rsplit = m_str_utils.split
local U = m_str_utils.char
local unpack = unpack or table.unpack -- Lua 5.2 compatibility


-- assigned below
-- assigned below
Line 19: Line 22:
end
end


local zwnj = U(0x200c) -- zero-width non-joiner
local zwnj = U(0x200C) -- zero-width non-joiner
local alif_madda = U(0x622)
local alif_madda = U(0x622)
local alif_hamza_below = U(0x625)
local alif_hamza_below = U(0x625)
Line 38: Line 41:
local dagger_alif = U(0x670)
local dagger_alif = U(0x670)
local alif_waSl = U(0x671)
local alif_waSl = U(0x671)
--local zwj = U(0x200d) -- zero-width joiner
--local zwj = U(0x200D) -- zero-width joiner
local lrm = U(0x200e) -- left-to-right mark
local lrm = U(0x200E) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark
local rlm = U(0x200F) -- right-to-left mark
-- Marker inserted after the al- of allaḏī and its variants so that we can
-- implement elision of the a- after a preceding vowel; the marker itself is
-- removed once the elision has been applied.
local alladi_marker = U(0xFFF0)


local tt = {
local tt = {
Line 46: Line 52:
["ب"]="b", ["ت"]="t", ["ث"]="ṯ", ["ج"]="j", ["ح"]="ḥ", ["خ"]="ḵ",
["ب"]="b", ["ت"]="t", ["ث"]="ṯ", ["ج"]="j", ["ح"]="ḥ", ["خ"]="ḵ",
["د"]="d", ["ذ"]="ḏ", ["ر"]="r", ["ز"]="z", ["س"]="s", ["ش"]="š",
["د"]="d", ["ذ"]="ḏ", ["ر"]="r", ["ز"]="z", ["س"]="s", ["ش"]="š",
["ص"]="ṣ", ["ض"]="ḍ", ["ط"]="ṭ", ["ظ"]="ẓ", ["ع"]="ʿ", ["غ"]="ḡ",
["ص"]="ṣ", ["ض"]="ḍ", ["ط"]="ṭ", ["ظ"]="ẓ", ["ع"]="ʕ", ["غ"]="ḡ",
["ف"]="f", ["ق"]="q", ["ك"]="k", ["ڪ"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n",
["ف"]="f", ["ق"]="q", ["ك"]="k", ["ڪ"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n",
["ه"]="h",
["ه"]="h",
Line 57: Line 63:
-- [zwj]="", -- ZWJ (zero-width joiner)
-- [zwj]="", -- ZWJ (zero-width joiner)
-- rare letters
-- rare letters
["پ"]="p", ["چ"]="č", ["ڤ"]="v", ["ڥ"]="v", ["گ"]="g", ["ڨ"]="g", ["ڧ"]="q", ["ڢ"]="f", ["ں"]="n", ["ڭ"]="g",
["پ"]="p", ["چ"]="č", ["ژ"]="ž", ["ڤ"]="v", ["ڥ"]="v", ["گ"]="g",
["ڨ"]="g", ["ڧ"]="q", ["ڢ"]="f", ["ں"]="n", ["ڭ"]="g",
-- semivowels or long vowels, alif, hamza, special letters
-- semivowels or long vowels, alif, hamza, special letters
["ا"]="ā", -- ʾalif
["ا"]="ā", -- ʾalif
-- hamzated letters
-- hamzated letters
["أ"]="ʾ", -- hamza over alif
["أ"]="ʔ", -- hamza over alif
[alif_hamza_below]="ʾ", -- hamza under alif
[alif_hamza_below]="ʔ", -- hamza under alif
["ؤ"]="ʾ", -- hamza over wāw
["ؤ"]="ʔ", -- hamza over wāw
["ئ"]="ʾ", -- hamza over yā
["ئ"]="ʔ", -- hamza over yā
["ء"]="ʾ", -- hamza on the line
["ء"]="ʔ", -- hamza on the line
-- long vowels
-- long vowels
[waaw]="w", --"ū" after ḍamma (u) and not before diacritic
[waaw]="w", --"ū" after ḍamma (u) and not before diacritic
[yaa]="y", --"ī" after kasra (i) and not before diacritic
[yaa]="y", --"ī" after kasra (i) and not before diacritic
[alif_maqSuura]="ā", -- ʾalif maqṣūra
[alif_maqSuura]="ā", -- ʾalif maqṣūra
[alif_madda]="ʾā", -- ʾalif madda
[alif_madda]="ʔā", -- ʾalif madda
[alif_waSl]= "", -- hamzatu l-waṣl
[alif_waSl]= "", -- hamzatu l-waṣl
[dagger_alif] = "ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic)
[dagger_alif] = "ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic)
Line 115: Line 122:
local sun_letters_tr = table.concat(ttsun3, "")
local sun_letters_tr = table.concat(ttsun3, "")


local consonants_needing_vowels = "بتثجحخدذرزسشصضطظعغفقكڪلمنهپچڤگڨڧڢںڭأإؤئءةﷲ"
local consonants_needing_vowels = "بتثجحخدذرزسشصضطظعغفقكڪلمنهپچژڤگڨڧڢںڭأإؤئءةﷲ"
-- consonants on the right side; includes alif madda
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ويآ"
local rconsonants = consonants_needing_vowels .. "ويآ"
Line 129: Line 136:
local before_diacritic_checking_subs = {
local before_diacritic_checking_subs = {
------------ transformations prior to checking for diacritics --------------
------------ transformations prior to checking for diacritics --------------
-- random Koranic marks and presentation forms
{U(0x06E1), sukuun}, -- "Small High Dotless Head of Khah" (variant of sukūn)
{U(0x06DA), ""}, -- "Small High Jeem"
{U(0x06DF), ""}, -- "Small High Rounded Zero" (FIXME: correct?)
{U(0x08F0), U(0x64B)}, -- "Open Fathatan"
{U(0x08F1), U(0x64C)}, -- "Open Dammatan"
{U(0x08F2), U(0x64D)}, -- "Open Kasratan"
{U(0x06E4), ""}, -- "Small High Madda" (FIXME: correct?)
{U(0x06D6), ""}, -- "Small High Ligature Sad with Lam with Alef Maksura" (FIXME: there are others we need to do)
{U(0x06E5), "و"},
{U(0x06E6), "ي"},
-- convert llh for allāh into ll+shadda+dagger-alif+h
-- convert llh for allāh into ll+shadda+dagger-alif+h
{"لله", "للّٰه"},
{"لله", "للّٰه"},
Line 136: Line 154:
-- transliteration process inconvenient, so undo it.
-- transliteration process inconvenient, so undo it.
{"([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. dagger_alif .. "])" .. shadda, shadda .. "%1"},
{"([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. dagger_alif .. "])" .. shadda, shadda .. "%1"},
-- ignore Koranic gemination at beginning of word due to assimilation of preceding consonant
{" ([" .. lconsonants .. "])" .. shadda, " %1"},
-- ignore alif jamīla (otiose alif in 3pl verb forms)
-- ignore alif jamīla (otiose alif in 3pl verb forms)
--    #1: handle ḍamma + wāw + alif (final -ū)
--    #1: handle ḍamma + wāw + alif (final -ū)
Line 181: Line 201:
{alif_waSl .. fatHa .. "?" .. laam, "l-"},
{alif_waSl .. fatHa .. "?" .. laam, "l-"},
-- special casing if the l in al- has a shadda on it (as in الَّذِي "that"),
-- special casing if the l in al- has a shadda on it (as in الَّذِي "that"),
-- so we don't mistakenly double the dash
-- so we don't mistakenly double the dash; insert a special marker here so
{"l%-" .. shadda, "ll"},
-- that we know later to elide the a- after a vowel
{"l%-" .. shadda, "l" .. alladi_marker .. "l"},
-- implement assimilation of sun letters
-- implement assimilation of sun letters
{"l%-[" .. sun_letters .. "]", ttsun2},
{"l%-[" .. sun_letters .. "]", ttsun2},
Line 206: Line 227:


if not force_translit and not has_diacritics(text) then
if not force_translit and not has_diacritics(text) then
require("Module:debug").track("ar-translit/lacking diacritics")
return nil
return nil
end
end
Line 227: Line 249:
-- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un.
-- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un.
text = rsub(text, "^(a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])",
text = rsub(text, "^(a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])",
'%1<span style="color: #888888">t</span>%2')
'%1<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%2')
text = rsub(text, "(" .. space_like_class .. "a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])",
text = rsub(text, "(" .. space_like_class .. "a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])",
'%1<span style="color: #888888">t</span>%2')
'%1<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%2')
text = rsub(text, taa_marbuuTa .. "([" .. fatHa .. Damma .. kasra .. "])", "t%1")
text = rsub(text, taa_marbuuTa .. "([" .. fatHa .. Damma .. kasra .. "])", "t%1")
text = rsub(text, taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. "])",
text = rsub(text, taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. "])",
'<span style="color: #888888">t</span>%1')
'<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%1')
text = rsub(text, ".", {
text = rsub(text, ".", {
[fatHataan] = '<span style="color: #888888">an</span>',
[fatHataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">an</span>',
[kasrataan] = '<span style="color: #888888">in</span>',
[kasrataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">in</span>',
[Dammataan] = '<span style="color: #888888">un</span>'
[Dammataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">un</span>'
})
})
text = rsub(text, "([" .. fatHa .. Damma .. kasra .. "])(" .. space_like_class .. ")",
text = rsub(text, "([" .. fatHa .. Damma .. kasra .. "])(" .. space_like_class .. ")",
function(vowel, space)
function(vowel, space)
vowel_repl = {
vowel_repl = {
[fatHa] = '<span style="color: #888888">a</span> ',
[fatHa] = '<span style="color: var(--wikt-palette-grey-8,#888)">a</span> ',
[kasra] = '<span style="color: #888888">i</span> ',
[kasra] = '<span style="color: var(--wikt-palette-grey-8,#888)">i</span> ',
[Damma] = '<span style="color: #888888">u</span> '
[Damma] = '<span style="color: var(--wikt-palette-grey-8,#888)">u</span> '
}
}
return vowel_repl[vowel] .. space
return vowel_repl[vowel] .. space
Line 249: Line 271:
)
)
text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "]$", {
text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "]$", {
[fatHa] = '<span style="color: #888888">a</span>',
[fatHa] = '<span style="color: var(--wikt-palette-grey-8,#888)">a</span>',
[kasra] = '<span style="color: #888888">i</span>',
[kasra] = '<span style="color: var(--wikt-palette-grey-8,#888)">i</span>',
[Damma] = '<span style="color: #888888">u</span>'
[Damma] = '<span style="color: var(--wikt-palette-grey-8,#888)">u</span>'
})
})
text = rsub(text, '</span><span style="color: #888888">', "")
text = rsub(text, '</span><span style="color: var(--wikt-palette-grey-8,#888)">', "")
elseif omit_i3raab then -- omit ʾiʿrāb in transliteration
elseif omit_i3raab then -- omit ʾiʿrāb in transliteration
text = rsub(text, "[" .. fatHataan .. Dammataan .. kasrataan .. "]", "")
text = rsub(text, "[" .. fatHataan .. Dammataan .. kasrataan .. "]", "")
Line 292: Line 314:
text = rsub(text, "aā", "ā")
text = rsub(text, "aā", "ā")
-- Implement elision of al- after a final vowel. We do this
-- Implement elision of al- after a final vowel. We do this
-- conservatively, only handling elision of the definite article rather
-- conservatively, only handling elision of the definite article and related
-- terms (specifically, relative pronoun الَّذِي (allaḏī) and variants) rather
-- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives
-- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives
-- or form-VII and above verbal nouns) partly because elision in
-- or form-VII and above verbal nouns) partly because elision in
Line 298: Line 321:
-- elision in case of words written with initial bare alif instead of
-- elision in case of words written with initial bare alif instead of
-- properly with hamzated alif. Possibly we should reconsider.
-- properly with hamzated alif. Possibly we should reconsider.
-- At the very least we currently don't handle elision of الَّذِي (allaḏī)
text = rsub(text, "([aiuāīū]'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])",
-- correctly because we special-case it to appear without the hyphen;
-- perhaps we should reconsider that.
text = rsub(text, "([aiuāīū]'* +'*)a([" .. sun_letters_tr .. "]%-)",
"%1%2")
"%1%2")
if gray_i3raab then
if gray_i3raab then
text = rsub(text, "([aiuāīū]'*</span>'* +'*)a([" .. sun_letters_tr .. "]%-)",
text = rsub(text, "([aiuāīū]'*</span>'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])",
"%1%2")
"%1%2")
end
end
-- Special-case the transliteration of allāh, without the hyphen
-- remove indicator of allaḏī, which has served its purpose
text = rsub(text, alladi_marker, "")
-- Special-case the transliteration of allāh, without the hyphen.
text = rsub(text, "^(a?)l%-lāh", "%1llāh")
text = rsub(text, "^(a?)l%-lāh", "%1llāh")
text = rsub(text, "(" .. space_like_class .. "a?)l%-lāh", "%1llāh")
text = rsub(text, "(" .. space_like_class .. "a?)l%-lāh", "%1llāh")
-- Compress multiple spaces, which may occur e.g. when removing Koranic diacritics.
text = rsub(text, "(%s)%s+", "%1")


return text
return text
Line 323: Line 347:
-- If you want to catch places without iʿrāb, comment out the next two lines.
-- If you want to catch places without iʿrāb, comment out the next two lines.
{"[" .. lconsonants .. "]$", ""},
{"[" .. lconsonants .. "]$", ""},
{"[" .. lconsonants .. "](" .. space_like_class .. ")", "%1"},
{"[" .. lconsonants .. "]([%)%]}]?" .. space_like_class .. ")", "%1"},
-- remove consonants (or alif) when followed by diacritics
-- remove consonants (or alif) when followed by diacritics
-- must go after removing shadda
-- must go after removing shadda
Line 350: Line 374:
-- declared as local above
-- declared as local above
function has_diacritics(text)
function has_diacritics(text)
local orig_text = text
local count
local count
text, count = rsubn(text, "[" .. lrm .. rlm .. "]", "")
text, count = rsubn(text, "[" .. lrm .. rlm .. "]", "")
Line 357: Line 382:
for _, sub in ipairs(has_diacritics_subs) do
for _, sub in ipairs(has_diacritics_subs) do
text = rsub(text, unpack(sub))
text = rsub(text, unpack(sub))
end
if #text > 0 then
mw.log(("Check for missing diacritics failed; original text '%s', text without diacritics '%s'"):format(
orig_text, text))
end
end
return #text == 0
return #text == 0
Line 407: Line 436:


return export
return export
-- For Vim, so we get 4-space tabs
-- vim: set ts=4 sw=4 noet: