Module:ar-translit: Difference between revisions
Jump to navigation
Jump to search
Created page with "-- Authors: Benwing, ZxxZxxZ, Atitarev local export = {} local U = mw.ustring.char local rfind = mw.ustring.find local rsubn = mw.ustring.gsub local rmatch = mw.ustring.matc..." |
No edit summary |
||
| Line 3: | Line 3: | ||
local export = {} | local export = {} | ||
local | local m_str_utils = require("Module:string utilities") | ||
local rfind = | |||
local rsubn = | local gcodepoint = m_str_utils.gcodepoint | ||
local rmatch = | local rfind = m_str_utils.find | ||
local rsplit = | local rsubn = m_str_utils.gsub | ||
local | local rmatch = m_str_utils.match | ||
local rsplit = m_str_utils.split | |||
local U = m_str_utils.char | |||
local unpack = unpack or table.unpack -- Lua 5.2 compatibility | |||
-- assigned below | -- assigned below | ||
| Line 19: | Line 22: | ||
end | end | ||
local zwnj = U( | local zwnj = U(0x200C) -- zero-width non-joiner | ||
local alif_madda = U(0x622) | local alif_madda = U(0x622) | ||
local alif_hamza_below = U(0x625) | local alif_hamza_below = U(0x625) | ||
| Line 38: | Line 41: | ||
local dagger_alif = U(0x670) | local dagger_alif = U(0x670) | ||
local alif_waSl = U(0x671) | local alif_waSl = U(0x671) | ||
--local zwj = U( | --local zwj = U(0x200D) -- zero-width joiner | ||
local lrm = U( | local lrm = U(0x200E) -- left-to-right mark | ||
local rlm = U( | local rlm = U(0x200F) -- right-to-left mark | ||
-- Occurs after al- in allaḏī and variants so that we can implement elision of | |||
-- a- after a preceding vowel, after which we remove the marker. | |||
local alladi_marker = U(0xFFF0) | |||
local tt = { | local tt = { | ||
| Line 46: | Line 52: | ||
["ب"]="b", ["ت"]="t", ["ث"]="ṯ", ["ج"]="j", ["ح"]="ḥ", ["خ"]="ḵ", | ["ب"]="b", ["ت"]="t", ["ث"]="ṯ", ["ج"]="j", ["ح"]="ḥ", ["خ"]="ḵ", | ||
["د"]="d", ["ذ"]="ḏ", ["ر"]="r", ["ز"]="z", ["س"]="s", ["ش"]="š", | ["د"]="d", ["ذ"]="ḏ", ["ر"]="r", ["ز"]="z", ["س"]="s", ["ش"]="š", | ||
["ص"]="ṣ", ["ض"]="ḍ", ["ط"]="ṭ", ["ظ"]="ẓ", ["ع"]=" | ["ص"]="ṣ", ["ض"]="ḍ", ["ط"]="ṭ", ["ظ"]="ẓ", ["ع"]="ʕ", ["غ"]="ḡ", | ||
["ف"]="f", ["ق"]="q", ["ك"]="k", ["ڪ"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n", | ["ف"]="f", ["ق"]="q", ["ك"]="k", ["ڪ"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n", | ||
["ه"]="h", | ["ه"]="h", | ||
| Line 57: | Line 63: | ||
-- [zwj]="", -- ZWJ (zero-width joiner) | -- [zwj]="", -- ZWJ (zero-width joiner) | ||
-- rare letters | -- rare letters | ||
["پ"]="p", ["چ"]="č", ["ڤ"]="v", ["ڥ"]="v", ["گ"]="g", ["ڨ"]="g", ["ڧ"]="q", ["ڢ"]="f", ["ں"]="n", ["ڭ"]="g", | ["پ"]="p", ["چ"]="č", ["ژ"]="ž", ["ڤ"]="v", ["ڥ"]="v", ["گ"]="g", | ||
["ڨ"]="g", ["ڧ"]="q", ["ڢ"]="f", ["ں"]="n", ["ڭ"]="g", | |||
-- semivowels or long vowels, alif, hamza, special letters | -- semivowels or long vowels, alif, hamza, special letters | ||
["ا"]="ā", -- ʾalif | ["ا"]="ā", -- ʾalif | ||
-- hamzated letters | -- hamzated letters | ||
["أ"]=" | ["أ"]="ʔ", -- hamza over alif | ||
[alif_hamza_below]=" | [alif_hamza_below]="ʔ", -- hamza under alif | ||
["ؤ"]=" | ["ؤ"]="ʔ", -- hamza over wāw | ||
["ئ"]=" | ["ئ"]="ʔ", -- hamza over yā | ||
["ء"]=" | ["ء"]="ʔ", -- hamza on the line | ||
-- long vowels | -- long vowels | ||
[waaw]="w", --"ū" after ḍamma (u) and not before diacritic | [waaw]="w", --"ū" after ḍamma (u) and not before diacritic | ||
[yaa]="y", --"ī" after kasra (i) and not before diacritic | [yaa]="y", --"ī" after kasra (i) and not before diacritic | ||
[alif_maqSuura]="ā", -- ʾalif maqṣūra | [alif_maqSuura]="ā", -- ʾalif maqṣūra | ||
[alif_madda]=" | [alif_madda]="ʔā", -- ʾalif madda | ||
[alif_waSl]= "", -- hamzatu l-waṣl | [alif_waSl]= "", -- hamzatu l-waṣl | ||
[dagger_alif] = "ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic) | [dagger_alif] = "ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic) | ||
| Line 115: | Line 122: | ||
local sun_letters_tr = table.concat(ttsun3, "") | local sun_letters_tr = table.concat(ttsun3, "") | ||
local consonants_needing_vowels = " | local consonants_needing_vowels = "بتثجحخدذرزسشصضطظعغفقكڪلمنهپچژڤگڨڧڢںڭأإؤئءةﷲ" | ||
-- consonants on the right side; includes alif madda | -- consonants on the right side; includes alif madda | ||
local rconsonants = consonants_needing_vowels .. "ويآ" | local rconsonants = consonants_needing_vowels .. "ويآ" | ||
| Line 129: | Line 136: | ||
local before_diacritic_checking_subs = { | local before_diacritic_checking_subs = { | ||
------------ transformations prior to checking for diacritics -------------- | ------------ transformations prior to checking for diacritics -------------- | ||
-- random Koranic marks and presentation forms | |||
{U(0x06E1), sukuun}, -- "Small High Dotless Head of Khah" (variant of sukūn) | |||
{U(0x06DA), ""}, -- "Small High Jeem" | |||
{U(0x06DF), ""}, -- "Small High Rounded Zero" (FIXME: correct?) | |||
{U(0x08F0), U(0x64B)}, -- "Open Fathatan" | |||
{U(0x08F1), U(0x64C)}, -- "Open Dammatan" | |||
{U(0x08F2), U(0x64D)}, -- "Open Kasratan" | |||
{U(0x06E4), ""}, -- "Small High Madda" (FIXME: correct?) | |||
{U(0x06D6), ""}, -- "Small High Ligature Sad with Lam with Alef Maksura" (FIXME: there are others we need to do) | |||
{U(0x06E5), "و"}, | |||
{U(0x06E6), "ي"}, | |||
-- convert llh for allāh into ll+shadda+dagger-alif+h | -- convert llh for allāh into ll+shadda+dagger-alif+h | ||
{"لله", "للّٰه"}, | {"لله", "للّٰه"}, | ||
| Line 136: | Line 154: | ||
-- transliteration process inconvenient, so undo it. | -- transliteration process inconvenient, so undo it. | ||
{"([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. dagger_alif .. "])" .. shadda, shadda .. "%1"}, | {"([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. dagger_alif .. "])" .. shadda, shadda .. "%1"}, | ||
-- ignore Koranic gemination at beginning of word due to assimilation of preceding consonant | |||
{" ([" .. lconsonants .. "])" .. shadda, " %1"}, | |||
-- ignore alif jamīla (otiose alif in 3pl verb forms) | -- ignore alif jamīla (otiose alif in 3pl verb forms) | ||
-- #1: handle ḍamma + wāw + alif (final -ū) | -- #1: handle ḍamma + wāw + alif (final -ū) | ||
| Line 181: | Line 201: | ||
{alif_waSl .. fatHa .. "?" .. laam, "l-"}, | {alif_waSl .. fatHa .. "?" .. laam, "l-"}, | ||
-- special casing if the l in al- has a shadda on it (as in الَّذِي "that"), | -- special casing if the l in al- has a shadda on it (as in الَّذِي "that"), | ||
-- so we don't mistakenly double the dash | -- so we don't mistakenly double the dash; insert a special marker here so | ||
{"l%-" .. shadda, " | -- that we know later to elide the a- after a vowel | ||
{"l%-" .. shadda, "l" .. alladi_marker .. "l"}, | |||
-- implement assimilation of sun letters | -- implement assimilation of sun letters | ||
{"l%-[" .. sun_letters .. "]", ttsun2}, | {"l%-[" .. sun_letters .. "]", ttsun2}, | ||
| Line 206: | Line 227: | ||
if not force_translit and not has_diacritics(text) then | if not force_translit and not has_diacritics(text) then | ||
require("Module:debug").track("ar-translit/lacking diacritics") | |||
return nil | return nil | ||
end | end | ||
| Line 227: | Line 249: | ||
-- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un. | -- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un. | ||
text = rsub(text, "^(a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])", | text = rsub(text, "^(a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])", | ||
'%1<span style="color: # | '%1<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%2') | ||
text = rsub(text, "(" .. space_like_class .. "a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])", | text = rsub(text, "(" .. space_like_class .. "a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])", | ||
'%1<span style="color: # | '%1<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%2') | ||
text = rsub(text, taa_marbuuTa .. "([" .. fatHa .. Damma .. kasra .. "])", "t%1") | text = rsub(text, taa_marbuuTa .. "([" .. fatHa .. Damma .. kasra .. "])", "t%1") | ||
text = rsub(text, taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. "])", | text = rsub(text, taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. "])", | ||
'<span style="color: # | '<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%1') | ||
text = rsub(text, ".", { | text = rsub(text, ".", { | ||
[fatHataan] = '<span style="color: # | [fatHataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">an</span>', | ||
[kasrataan] = '<span style="color: # | [kasrataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">in</span>', | ||
[Dammataan] = '<span style="color: # | [Dammataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">un</span>' | ||
}) | }) | ||
text = rsub(text, "([" .. fatHa .. Damma .. kasra .. "])(" .. space_like_class .. ")", | text = rsub(text, "([" .. fatHa .. Damma .. kasra .. "])(" .. space_like_class .. ")", | ||
function(vowel, space) | function(vowel, space) | ||
vowel_repl = { | vowel_repl = { | ||
[fatHa] = '<span style="color: # | [fatHa] = '<span style="color: var(--wikt-palette-grey-8,#888)">a</span> ', | ||
[kasra] = '<span style="color: # | [kasra] = '<span style="color: var(--wikt-palette-grey-8,#888)">i</span> ', | ||
[Damma] = '<span style="color: # | [Damma] = '<span style="color: var(--wikt-palette-grey-8,#888)">u</span> ' | ||
} | } | ||
return vowel_repl[vowel] .. space | return vowel_repl[vowel] .. space | ||
| Line 249: | Line 271: | ||
) | ) | ||
text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "]$", { | text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "]$", { | ||
[fatHa] = '<span style="color: # | [fatHa] = '<span style="color: var(--wikt-palette-grey-8,#888)">a</span>', | ||
[kasra] = '<span style="color: # | [kasra] = '<span style="color: var(--wikt-palette-grey-8,#888)">i</span>', | ||
[Damma] = '<span style="color: # | [Damma] = '<span style="color: var(--wikt-palette-grey-8,#888)">u</span>' | ||
}) | }) | ||
text = rsub(text, '</span><span style="color: # | text = rsub(text, '</span><span style="color: var(--wikt-palette-grey-8,#888)">', "") | ||
elseif omit_i3raab then -- omit ʾiʿrāb in transliteration | elseif omit_i3raab then -- omit ʾiʿrāb in transliteration | ||
text = rsub(text, "[" .. fatHataan .. Dammataan .. kasrataan .. "]", "") | text = rsub(text, "[" .. fatHataan .. Dammataan .. kasrataan .. "]", "") | ||
| Line 292: | Line 314: | ||
text = rsub(text, "aā", "ā") | text = rsub(text, "aā", "ā") | ||
-- Implement elision of al- after a final vowel. We do this | -- Implement elision of al- after a final vowel. We do this | ||
-- conservatively, only handling elision of the definite article rather | -- conservatively, only handling elision of the definite article and related | ||
-- terms (specifically, relative pronoun الَّذِي (allaḏī) and variants) rather | |||
-- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives | -- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives | ||
-- or form-VII and above verbal nouns) partly because elision in | -- or form-VII and above verbal nouns) partly because elision in | ||
| Line 298: | Line 321: | ||
-- elision in case of words written with initial bare alif instead of | -- elision in case of words written with initial bare alif instead of | ||
-- properly with hamzated alif. Possibly we should reconsider. | -- properly with hamzated alif. Possibly we should reconsider. | ||
text = rsub(text, "([aiuāīū]'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])", | |||
text = rsub(text, "([aiuāīū]'* +'*)a([" .. sun_letters_tr .. "]%-)", | |||
"%1%2") | "%1%2") | ||
if gray_i3raab then | if gray_i3raab then | ||
text = rsub(text, "([aiuāīū]'*</span>'* +'*)a([" .. sun_letters_tr .. "]%-)", | text = rsub(text, "([aiuāīū]'*</span>'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])", | ||
"%1%2") | "%1%2") | ||
end | end | ||
-- Special-case the transliteration of allāh, without the hyphen | -- remove indicator of allaḏī, which has served its purpose | ||
text = rsub(text, alladi_marker, "") | |||
-- Special-case the transliteration of allāh, without the hyphen. | |||
text = rsub(text, "^(a?)l%-lāh", "%1llāh") | text = rsub(text, "^(a?)l%-lāh", "%1llāh") | ||
text = rsub(text, "(" .. space_like_class .. "a?)l%-lāh", "%1llāh") | text = rsub(text, "(" .. space_like_class .. "a?)l%-lāh", "%1llāh") | ||
-- Compress multiple spaces, which may occur e.g. when removing Koranic diacritics. | |||
text = rsub(text, "(%s)%s+", "%1") | |||
return text | return text | ||
| Line 323: | Line 347: | ||
-- If you want to catch places without iʿrāb, comment out the next two lines. | -- If you want to catch places without iʿrāb, comment out the next two lines. | ||
{"[" .. lconsonants .. "]$", ""}, | {"[" .. lconsonants .. "]$", ""}, | ||
{"[" .. lconsonants .. "](" .. space_like_class .. ")", "%1"}, | {"[" .. lconsonants .. "]([%)%]}]?" .. space_like_class .. ")", "%1"}, | ||
-- remove consonants (or alif) when followed by diacritics | -- remove consonants (or alif) when followed by diacritics | ||
-- must go after removing shadda | -- must go after removing shadda | ||
| Line 350: | Line 374: | ||
-- declared as local above | -- declared as local above | ||
function has_diacritics(text) | function has_diacritics(text) | ||
local orig_text = text | |||
local count | local count | ||
text, count = rsubn(text, "[" .. lrm .. rlm .. "]", "") | text, count = rsubn(text, "[" .. lrm .. rlm .. "]", "") | ||
| Line 357: | Line 382: | ||
for _, sub in ipairs(has_diacritics_subs) do | for _, sub in ipairs(has_diacritics_subs) do | ||
text = rsub(text, unpack(sub)) | text = rsub(text, unpack(sub)) | ||
end | |||
if #text > 0 then | |||
mw.log(("Check for missing diacritics failed; original text '%s', text without diacritics '%s'"):format( | |||
orig_text, text)) | |||
end | end | ||
return #text == 0 | return #text == 0 | ||
| Line 407: | Line 436: | ||
return export | return export | ||
Revision as of 21:10, 11 September 2025
- The following documentation is generated by Module:documentation/functions/translit. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
This module will transliterate Arabic language text. It is also used to transliterate West Circassian, Avar, Chechen, Ingush, and East Circassian.
The module should preferably not be called directly from templates or other modules.
To use it from a template, use {{xlit}}.
Within a module, use Module:languages#Language:transliterate.
For testcases, see Module:ar-translit/testcases.
Functions
tr(text, lang, sc)
- Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
- When the transliteration fails, returns nil.
-- Authors: Benwing, ZxxZxxZ, Atitarev
local export = {}
local m_str_utils = require("Module:string utilities")
local gcodepoint = m_str_utils.gcodepoint
local rfind = m_str_utils.find
local rsubn = m_str_utils.gsub
local rmatch = m_str_utils.match
local rsplit = m_str_utils.split
local U = m_str_utils.char
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
-- assigned below
local has_diacritics
-- Single-result wrapper around rsubn(): performs the same substitution but
-- hands back only the resulting string, dropping the substitution count.
local function rsub(term, foo, bar)
	-- the extra parentheses truncate the call to its first return value
	return (rsubn(term, foo, bar))
end
-- Named code points used throughout the module. In the identifiers, capital
-- letters stand for the emphatic/pharyngeal sounds of the usual scholarly
-- transliteration (H = ḥ, D = ḍ, S = ṣ, T = ṭ), e.g. fatHa = fatḥa.
local zwnj = U(0x200C) -- zero-width non-joiner
local alif_madda = U(0x622) -- ʾalif with madda above
local alif_hamza_below = U(0x625) -- ʾalif with hamza below
local alif = U(0x627) -- plain ʾalif
local taa_marbuuTa = U(0x629) -- tāʾ marbūṭa
local laam = U(0x644) -- lām
local waaw = U(0x648) -- wāw
local alif_maqSuura = U(0x649) -- ʾalif maqṣūra
local yaa = U(0x64A) -- yāʾ
local fatHataan = U(0x64B) -- fatḥatan (tanwīn -an)
local Dammataan = U(0x64C) -- ḍammatan (tanwīn -un)
local kasrataan = U(0x64D) -- kasratan (tanwīn -in)
local fatHa = U(0x64E) -- fatḥa (short a)
local Damma = U(0x64F) -- ḍamma (short u)
local kasra = U(0x650) -- kasra (short i)
local shadda = U(0x651) -- šadda (gemination mark)
local sukuun = U(0x652) -- sukūn (no vowel)
local dagger_alif = U(0x670) -- dagger (superscript) ʾalif
local alif_waSl = U(0x671) -- ʾalif with hamzatu l-waṣl
--local zwj = U(0x200D) -- zero-width joiner
local lrm = U(0x200E) -- left-to-right mark
local rlm = U(0x200F) -- right-to-left mark
-- Occurs after al- in allaḏī and variants so that we can implement elision of
-- a- after a preceding vowel, after which we remove the marker.
-- The marker is purely internal: tr() inserts it and strips it again before
-- returning, so it should never appear in output.
local alladi_marker = U(0xFFF0)
-- Context-free transliteration table: maps a single Arabic-script character
-- to its Latin rendering. tr() applies it to every remaining character only
-- after all context-sensitive rules (long vowels, definite article,
-- tāʾ marbūṭa, ʾiʿrāb handling) have run; characters without an entry are
-- left unchanged by that substitution.
local tt = {
-- consonants
["ب"]="b", ["ت"]="t", ["ث"]="ṯ", ["ج"]="j", ["ح"]="ḥ", ["خ"]="ḵ",
["د"]="d", ["ذ"]="ḏ", ["ر"]="r", ["ز"]="z", ["س"]="s", ["ش"]="š",
["ص"]="ṣ", ["ض"]="ḍ", ["ط"]="ṭ", ["ظ"]="ẓ", ["ع"]="ʕ", ["غ"]="ḡ",
["ف"]="f", ["ق"]="q", ["ك"]="k", ["ڪ"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n",
["ه"]="h",
-- tāʾ marbūṭa (special) - always after a fátḥa (a), silent at the end of
-- an utterance, "t" in ʾiḍāfa or with pronounced tanwīn. We catch
-- most instances of tāʾ marbūṭa before we get to this stage.
[taa_marbuuTa]="t", -- tāʾ marbūṭa = ة
-- control characters
[zwnj]="-", -- ZWNJ (zero-width non-joiner)
-- [zwj]="", -- ZWJ (zero-width joiner)
-- rare letters
["پ"]="p", ["چ"]="č", ["ژ"]="ž", ["ڤ"]="v", ["ڥ"]="v", ["گ"]="g",
["ڨ"]="g", ["ڧ"]="q", ["ڢ"]="f", ["ں"]="n", ["ڭ"]="g",
-- semivowels or long vowels, alif, hamza, special letters
["ا"]="ā", -- ʾalif
-- hamzated letters
["أ"]="ʔ", -- hamza over alif
[alif_hamza_below]="ʔ", -- hamza under alif
["ؤ"]="ʔ", -- hamza over wāw
["ئ"]="ʔ", -- hamza over yā
["ء"]="ʔ", -- hamza on the line
-- long vowels
[waaw]="w", --"ū" after ḍamma (u) and not before diacritic
[yaa]="y", --"ī" after kasra (i) and not before diacritic
[alif_maqSuura]="ā", -- ʾalif maqṣūra
[alif_madda]="ʔā", -- ʾalif madda
[alif_waSl]= "", -- hamzatu l-waṣl
[dagger_alif] = "ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic)
-- short vowels, šádda and sukūn
[fatHataan]="an", -- fatḥatan
[Dammataan]="un", -- ḍammatan
[kasrataan]="in", -- kasratan
[fatHa]="a", -- fatḥa
[Damma]="u", -- ḍamma
[kasra]="i", -- kasra
-- šadda (doubled consonant) deliberately has no entry: tr() expands
-- letter + šadda into a doubled letter before this table is applied
[sukuun]="", --sukūn - no vowel
-- ligatures
["ﻻ"]="lā",
["ﷲ"]="llāh",
-- taṭwīl
["ـ"]="", -- taṭwīl, no sound
-- numerals
["١"]="1", ["٢"]="2", ["٣"]="3", ["٤"]="4", ["٥"]="5",
["٦"]="6", ["٧"]="7", ["٨"]="8", ["٩"]="9", ["٠"]="0",
-- punctuation (leave on separate lines)
["؟"]="?", -- question mark
["«"]='“', -- quotation mark
["»"]='”', -- quotation mark
["٫"]=".", -- decimal point
["٬"]=",", -- thousands separator
["٪"]="%", -- percent sign
["،"]=",", -- comma
["؛"]=";" -- semicolon
}
-- The sun letters: consonants to which the l of the definite article
-- assimilates.
local sun_letters = "تثدذرزسشصضطظلن"
-- For use in implementing sun-letter assimilation of ال (al-)
-- ttsun1: sun letter -> its transliteration.
-- NOTE(review): ttsun1 does not appear to be read anywhere in this file.
local ttsun1 = {}
-- ttsun2: "l-" .. sun letter -> transliterated sun letter .. "-" .. sun letter;
-- used as a gsub replacement table keyed by the whole match, so the article's
-- l is replaced by the (transliterated) following consonant.
local ttsun2 = {}
-- ttsun3: transliterations of the sun letters, in the order of sun_letters.
local ttsun3 = {}
for cp in gcodepoint(sun_letters) do
local ch = U(cp)
ttsun1[ch] = tt[ch]
ttsun2["l-" .. ch] = tt[ch] .. "-" .. ch
table.insert(ttsun3, tt[ch])
end
-- For use in implementing elision of al-
-- (concatenated so it can be dropped inside a [...] character class)
local sun_letters_tr = table.concat(ttsun3, "")
-- Letters treated as consonants by the diacritic-inference rules and by the
-- vocalization check; meant for use inside a [...] character class.
local consonants_needing_vowels = "بتثجحخدذرزسشصضطظعغفقكڪلمنهپچژڤگڨڧڢںڭأإؤئءةﷲ"
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ويآ"
-- consonants on the left side; does not include alif madda
local lconsonants = consonants_needing_vowels .. "وي"
-- Arabic semicolon, comma, question mark; taṭwīl; period, exclamation point,
-- single quote for bold/italic, double quotes for quoted material
local punctuation = "؟،؛" .. "ـ" .. ".!'" .. '"'
-- Characters treated as word boundaries: whitespace plus single and double
-- quotes. space_like is the bare class content, space_like_class the
-- ready-to-use [...] pattern.
local space_like = "%s'" .. '"'
local space_like_class = "[" .. space_like .. "]"
-- Eastern Arabic digits (stripped by the vocalization check)
local numbers = "١٢٣٤٥٦٧٨٩٠"
-- Ordered list of {pattern, replacement} pairs. tr() applies them one after
-- another with rsub() before it checks whether the text is vocalized. The
-- order is significant: later rules rely on the normalisations (and on the
-- Latin "al-" / "l-" placeholders) produced by earlier ones.
local before_diacritic_checking_subs = {
------------ transformations prior to checking for diacritics --------------
-- random Koranic marks and presentation forms
{U(0x06E1), sukuun}, -- "Small High Dotless Head of Khah" (variant of sukūn)
{U(0x06DA), ""}, -- "Small High Jeem"
{U(0x06DF), ""}, -- "Small High Rounded Zero" (FIXME: correct?)
{U(0x08F0), U(0x64B)}, -- "Open Fathatan"
{U(0x08F1), U(0x64C)}, -- "Open Dammatan"
{U(0x08F2), U(0x64D)}, -- "Open Kasratan"
{U(0x06E4), ""}, -- "Small High Madda" (FIXME: correct?)
{U(0x06D6), ""}, -- "Small High Ligature Sad with Lam with Alef Maksura" (FIXME: there are others we need to do)
{U(0x06E5), "و"}, -- "Small Waw" -> regular wāw
{U(0x06E6), "ي"}, -- "Small Yeh" -> regular yāʾ
-- convert llh for allāh into ll+shadda+dagger-alif+h
{"لله", "للّٰه"},
-- shadda+short-vowel (including tanwīn vowels, i.e. -an -in -un) gets
-- replaced with short-vowel+shadda during NFC normalisation, which
-- MediaWiki does for all Unicode strings; however, it makes the
-- transliteration process inconvenient, so undo it.
{"([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. dagger_alif .. "])" .. shadda, shadda .. "%1"},
-- ignore Koranic gemination at beginning of word due to assimilation of preceding consonant
{" ([" .. lconsonants .. "])" .. shadda, " %1"},
-- ignore alif jamīla (otiose alif in 3pl verb forms)
-- #1: handle ḍamma + wāw + alif (final -ū)
{Damma .. waaw .. alif, Damma .. waaw},
-- #2: handle wāw + sukūn + alif (final -w in -aw in defective verbs)
-- this must go before the generation of w, which removes the waw here.
{waaw .. sukuun .. alif, waaw .. sukuun},
-- ignore final alif or alif maqṣūra following fatḥatan (e.g. in accusative
-- singular or words like عَصًا "stick" or هُدًى "guidance"; this is called
-- tanwin nasb)
{fatHataan .. "[" .. alif .. alif_maqSuura .. "]", fatHataan},
-- same but with the fatḥatan placed over the alif or alif maqṣūra
-- instead of over the previous letter (considered a misspelling but
-- common)
{"[" .. alif .. alif_maqSuura .. "]" .. fatHataan, fatHataan},
-- tāʾ marbūṭa should always be preceded by fatḥa, alif, alif madda or
-- dagger alif; infer fatḥa if not
{"([^" .. fatHa .. alif .. alif_madda .. dagger_alif .. "])" .. taa_marbuuTa, "%1" .. fatHa .. taa_marbuuTa},
-- similarly for alif between consonants, possibly marked with shadda
-- (does not apply to initial alif, which is silent when not marked with
-- hamza, or final alif, which might be pronounced as -an)
{"([" .. lconsonants .. "]" .. shadda .. "?)" .. alif .. "([" .. rconsonants .. "])",
"%1" .. fatHa .. alif .. "%2"},
-- infer fatḥa in case of non-fatḥa + alif/alif-maqṣūra + dagger alif
{"([^" .. fatHa .. "])([" .. alif .. alif_maqSuura .. "]" .. dagger_alif .. ")", "%1" .. fatHa .. "%2"},
-- infer kasra in case of hamza-under-alif not + kasra
{alif_hamza_below .. "([^" .. kasra .. kasrataan .. "])", alif_hamza_below .. kasra .. "%1"},
-- ignore dagger alif placed over regular alif or alif maqṣūra
{"([" .. alif .. alif_maqSuura .. "])" .. dagger_alif, "%1"},
----------- rest of these concern definite article alif-lām ----------
-- in kasra/ḍamma + alif + lam, make alif into hamzatu l-waṣl, so we
-- handle cases like بِالتَّوْفِيق (bi-t-tawfīq) correctly
{"([" .. Damma .. kasra .. "])" .. alif .. laam, "%1" .. alif_waSl .. laam},
-- al + consonant + shadda (only recognize word-initially if regular alif): remove shadda
{"^(" .. alif .. fatHa .. "?" .. laam .. "[" .. lconsonants .. "])" .. shadda, "%1"},
{"(" .. space_like_class .. alif .. fatHa .. "?" .. laam .. "[" .. lconsonants .. "])" .. shadda, "%1"},
{"(" .. alif_waSl .. fatHa .. "?" .. laam .. "[" .. lconsonants .. "])" .. shadda, "%1"},
-- handle l- hamzatu l-waṣl or word-initial al-
-- (from here on the article is represented by Latin "al-" / "l-" inside
-- the otherwise still-Arabic text)
{"^" .. alif .. fatHa .. "?" .. laam, "al-"},
{"(" .. space_like_class .. ")" .. alif .. fatHa .. "?" .. laam, "%1al-"},
-- next one for bi-t-tawfīq
{"([" .. Damma .. kasra .. "])" .. alif_waSl .. fatHa .. "?" .. laam, "%1-l-"},
-- next one for remaining hamzatu l-waṣl (at beginning of word)
{alif_waSl .. fatHa .. "?" .. laam, "l-"},
-- special casing if the l in al- has a shadda on it (as in الَّذِي "that"),
-- so we don't mistakenly double the dash; insert a special marker here so
-- that we know later to elide the a- after a vowel
{"l%-" .. shadda, "l" .. alladi_marker .. "l"},
-- implement assimilation of sun letters
-- (table replacement: the whole match "l-" + sun letter is the lookup key)
{"l%-[" .. sun_letters .. "]", ttsun2},
}
-- Transliterate the word(s) in TEXT. LANG (the language) and SC (the script)
-- are ignored. OMIT_I3RAAB means leave out final short vowels (ʾiʿrāb).
-- GRAY_I3RAAB means render transliterated final short vowels (ʾiʿrāb) in
-- gray; it only takes effect when OMIT_I3RAAB is not set.
-- FORCE_TRANSLIT causes even non-vocalized text to be transliterated
-- (normally the function checks for non-vocalized text and returns nil,
-- since such text is ambiguous in transliteration).
--
-- TEXT may also be a frame-like table, in which case the arguments are taken
-- from text.args[1] .. text.args[5] (empty strings count as absent).
--
-- Returns the transliterated string, or nil if the text is not fully
-- vocalized and FORCE_TRANSLIT is not set.
function export.tr(text, lang, sc, omit_i3raab, gray_i3raab, force_translit)
	-- make it possible to call this function from a template
	if type(text) == "table" then
		local function f(x) return (x ~= "") and x or nil end
		-- NOTE(review): the fifth template argument feeds force_translit, so
		-- gray_i3raab cannot be set through this path; kept as is so existing
		-- template calls keep their meaning.
		text, lang, sc, omit_i3raab, force_translit =
			f(text.args[1]), f(text.args[2]), f(text.args[3]), f(text.args[4]), f(text.args[5])
	end

	for _, sub in ipairs(before_diacritic_checking_subs) do
		text = rsub(text, sub[1], sub[2])
	end

	if not force_translit and not has_diacritics(text) then
		require("Module:debug").track("ar-translit/lacking diacritics")
		return nil
	end

	------------ transformations after checking for diacritics --------------
	-- Character-class contents shared by the rules below.
	local short_vowels = fatHa .. Damma .. kasra
	local tanwiin = fatHataan .. Dammataan .. kasrataan
	local any_diacritic = tanwiin .. short_vowels .. shadda .. sukuun .. dagger_alif

	-- Replace plain alif with hamzatu l-waṣl when followed by fatḥa/ḍamma/kasra.
	-- Must go after handling of initial al-, which distinguishes alif-fatḥa
	-- from alif w/hamzatu l-waṣl. Must go before generation of ū and ī, which
	-- eliminate the ḍamma/kasra.
	text = rsub(text, alif .. "([" .. short_vowels .. "])", alif_waSl .. "%1")
	-- ḍamma + waw not followed by a diacritic is ū, otherwise w
	text = rsub(text, Damma .. waaw .. "([^" .. any_diacritic .. "])", "ū%1")
	text = rsub(text, Damma .. waaw .. "$", "ū")
	-- kasra + yaa not followed by a diacritic (or ū from prev step) is ī, otherwise y
	text = rsub(text, kasra .. yaa .. "([^" .. any_diacritic .. "ū])", "ī%1")
	text = rsub(text, kasra .. yaa .. "$", "ī")
	-- convert shadda to double letter.
	text = rsub(text, "(.)" .. shadda, "%1%1")

	if not omit_i3raab and gray_i3raab then -- show ʾiʿrāb grayed in transliteration
		local gray_open = '<span style="color: var(--wikt-palette-grey-8,#888)">'
		local gray_close = '</span>'
		local function gray(s)
			return gray_open .. s .. gray_close
		end

		-- decide whether to gray out the t in ﺓ. If word begins with al- or l-, yes.
		-- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un.
		text = rsub(text, "^(a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. tanwiin .. short_vowels .. "])",
			"%1" .. gray("t") .. "%2")
		text = rsub(text, "(" .. space_like_class .. "a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. tanwiin .. short_vowels .. "])",
			"%1" .. gray("t") .. "%2")
		text = rsub(text, taa_marbuuTa .. "([" .. short_vowels .. "])", "t%1")
		text = rsub(text, taa_marbuuTa .. "([" .. tanwiin .. "])", gray("t") .. "%1")
		text = rsub(text, ".", {
			[fatHataan] = gray("an"),
			[kasrataan] = gray("in"),
			[Dammataan] = gray("un"),
		})
		-- Word-final short vowel before a space-like character. Built once
		-- as a local (it used to be an accidental global re-created on every
		-- match).
		-- NOTE(review): each value ends in a space and the matched separator
		-- is appended as well; the doubled whitespace is collapsed by the
		-- final space compression, but a quote separator keeps the extra
		-- space. Preserved as is - confirm whether that is intended.
		local gray_vowel_before_space = {
			[fatHa] = gray("a") .. " ",
			[kasra] = gray("i") .. " ",
			[Damma] = gray("u") .. " ",
		}
		text = rsub(text, "([" .. short_vowels .. "])(" .. space_like_class .. ")",
			function(vowel, space)
				return gray_vowel_before_space[vowel] .. space
			end
		)
		text = rsub(text, "[" .. short_vowels .. "]$", {
			[fatHa] = gray("a"),
			[kasra] = gray("i"),
			[Damma] = gray("u"),
		})
		-- Merge directly adjacent gray spans into one. The markup contains
		-- pattern-magic characters ("(", ")" and "-"), so it has to be escaped
		-- before being used as a pattern; unescaped it never matches.
		local adjacent_spans = string.gsub(gray_close .. gray_open,
			"[%^%$%(%)%%%.%[%]%*%+%-%?]", "%%%0")
		text = rsub(text, adjacent_spans, "")
	elseif omit_i3raab then -- omit ʾiʿrāb in transliteration
		text = rsub(text, "[" .. tanwiin .. "]", "")
		text = rsub(text, "[" .. short_vowels .. "](" .. space_like_class .. ")", "%1")
		text = rsub(text, "[" .. short_vowels .. "]$", "")
	end

	-- tāʾ marbūṭa should not be rendered by -t if word-final even when
	-- ʾiʿrāb (desinential inflection) is shown; instead, use (t) before
	-- whitespace, nothing when final; but render final -ﺍﺓ and -ﺁﺓ as -āh,
	-- consistent with Wehr's dictionary
	-- Left-to-right or right-to-left mark at end of text will prevent tāʾ marbūṭa
	-- from being transliterated correctly.
	text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
	text = rsub(text, "([" .. alif .. alif_madda .. "])" .. taa_marbuuTa .. "$", "%1h")
	-- Ignore final tāʾ marbūṭa (it appears as "a" due to the preceding
	-- short vowel). Need to do this after graying or omitting word-final
	-- ʾiʿrāb.
	text = rsub(text, taa_marbuuTa .. "$", "")
	text = rsub(text, taa_marbuuTa .. "(%p)", "%1")
	if not omit_i3raab then -- show ʾiʿrāb in transliteration
		text = rsub(text, taa_marbuuTa .. "(" .. space_like_class .. ")", "(t)%1")
	else
		-- When omitting ʾiʿrāb, show all non-absolutely-final instances of
		-- tāʾ marbūṭa as (t), with trailing ʾiʿrāb omitted.
		text = rsub(text, taa_marbuuTa, "(t)")
	end

	-- tatwīl should be rendered as - at beginning or end of word. It will
	-- be rendered as nothing in the middle of a word (FIXME, do we want
	-- this?)
	text = rsub(text, "^ـ", "-")
	text = rsub(text, "(" .. space_like_class .. ")ـ", "%1-")
	text = rsub(text, "ـ$", "-")
	text = rsub(text, "ـ(" .. space_like_class .. ")", "-%1")

	-- Now convert remaining Arabic chars according to table.
	text = rsub(text, ".", tt)
	text = rsub(text, "aā", "ā")

	-- Implement elision of al- after a final vowel. We do this
	-- conservatively, only handling elision of the definite article and related
	-- terms (specifically, relative pronoun الَّذِي (allaḏī) and variants) rather
	-- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives
	-- or form-VII and above verbal nouns) partly because elision in
	-- these cases isn't so common in MSA and partly to avoid excessive
	-- elision in case of words written with initial bare alif instead of
	-- properly with hamzated alif. Possibly we should reconsider.
	local elidable_article = "a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])"
	text = rsub(text, "([aiuāīū]'* +'*)" .. elidable_article, "%1%2")
	if gray_i3raab then
		-- same, but with the closing tag of a grayed final vowel in between
		text = rsub(text, "([aiuāīū]'*</span>'* +'*)" .. elidable_article, "%1%2")
	end
	-- remove indicator of allaḏī, which has served its purpose
	text = rsub(text, alladi_marker, "")

	-- Special-case the transliteration of allāh, without the hyphen.
	text = rsub(text, "^(a?)l%-lāh", "%1llāh")
	text = rsub(text, "(" .. space_like_class .. "a?)l%-lāh", "%1llāh")

	-- Compress multiple spaces, which may occur e.g. when removing Koranic diacritics.
	text = rsub(text, "(%s)%s+", "%1")

	return text
end
-- Ordered list of {pattern, replacement} pairs used by has_diacritics().
-- Each rule strips something that is acceptable in fully vocalized text
-- (punctuation, consonants carrying a diacritic, long-vowel sequences, ...).
-- If anything is left once all rules have been applied, the text is
-- considered to be lacking diacritics. The order of the rules matters, as
-- noted on the individual entries.
local has_diacritics_subs = {
-- FIXME! What about lam-alif ligature?
-- remove punctuation and shadda
-- must go before removing final consonants
{"[" .. punctuation .. shadda .. "]", ""},
-- Remove consonants at end of word or utterance, so that we're OK with
-- words lacking iʿrāb (must go before removing other consonants).
-- If you want to catch places without iʿrāb, comment out the next two lines.
{"[" .. lconsonants .. "]$", ""},
-- (a closing bracket ")", "]" or "}" may sit between the consonant and the
-- space-like character; it is kept together with that character)
{"[" .. lconsonants .. "]([%)%]}]?" .. space_like_class .. ")", "%1"},
-- remove consonants (or alif) when followed by diacritics
-- must go after removing shadda
-- do not remove the diacritics yet because we need them to handle
-- long-vowel sequences of diacritic + pseudo-consonant
{"[" .. lconsonants .. alif .. "]([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. sukuun .. dagger_alif .. "])", "%1"},
-- the following two must go after removing consonants w/diacritics because
-- we only want to treat vocalic wāw/yā' in them (we want to have removed
-- wāw/yā' followed by a diacritic)
-- remove ḍamma + wāw
{Damma .. waaw, ""},
-- remove kasra + yā'
{kasra .. yaa, ""},
-- remove fatḥa/fatḥatan + alif/alif-maqṣūra
{"[" .. fatHataan .. fatHa .. "][" .. alif .. alif_maqSuura .. "]", ""},
-- remove diacritics
{"[" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. sukuun .. dagger_alif .. "]", ""},
-- remove numbers, hamzatu l-waṣl, alif madda
{"[" .. numbers .. "ٱ" .. "آ" .. "]", ""},
-- remove non-Arabic characters
-- (anything outside U+0600-06FF, U+0750-077F, U+08A0-08FF, U+FB50-FDFF and
-- U+FE70-FEFF)
{"[^" .. U(0x0600) .. "-" .. U(0x06FF) .. U(0x0750) .. "-" .. U(0x077F) ..
U(0x08A0) .. "-" .. U(0x08FF) .. U(0xFB50) .. "-" .. U(0xFDFF) ..
U(0xFE70) .. "-" .. U(0xFEFF) .. "]", ""}
}
-- Assigns the `has_diacritics` local that is forward-declared near the top
-- of the file (hence no `local` keyword here).
--
-- Returns true if TEXT is fully vocalized, i.e. nothing remains of it after
-- left-to-right/right-to-left marks have been dropped and every rule in
-- has_diacritics_subs has been applied. A failed check is written to the
-- debug log together with the leftover text; the presence of LRM/RLM marks
-- is recorded through Module:debug tracking.
function has_diacritics(text)
	local original = text
	local remainder, mark_count = rsubn(text, "[" .. lrm .. rlm .. "]", "")
	if mark_count > 0 then
		require("Module:debug").track("ar-translit/lrm or rlm")
	end

	for i = 1, #has_diacritics_subs do
		remainder = rsub(remainder, unpack(has_diacritics_subs[i]))
	end

	local fully_vocalized = #remainder == 0
	if not fully_vocalized then
		mw.log(("Check for missing diacritics failed; original text '%s', text without diacritics '%s'"):format(
			original, remainder))
	end
	return fully_vocalized
end
-- Decide whether the manual transliteration TR is irregular, i.e. differs
-- from what export.tr() generates for ARABIC. Returns false when either
-- argument is missing or empty, and when ARABIC cannot be transliterated
-- automatically. The comparison is made word by word; hyphens are ignored,
-- and a word-final tāʾ marbūṭa, auto-transliterated as "(t)", is accepted
-- as "h" (after ā), "t" or nothing in TR.
function export.irregular_translit(arabic, tr)
	local have_arabic = arabic and arabic ~= ""
	local have_tr = tr and tr ~= ""
	if not (have_arabic and have_tr) then
		return false
	end

	local auto_tr = export.tr(arabic)
	if not auto_tr or auto_tr == tr then
		return false
	end

	local arabic_words = rsplit(arabic, " ")
	local auto_words = rsplit(auto_tr, " ")
	local manual_words = rsplit(tr, " ")
	-- a differing number of words can never be regular
	if #auto_words ~= #manual_words or #auto_words ~= #arabic_words then
		return true
	end

	-- Pattern for a one-letter clitic + optional fatḥa + alif(-waṣl) + lām.
	local clitic_chars = "^[وفكل]" -- separate line to avoid L2R display weirdness
	local clitic_plus_article = clitic_chars .. fatHa .. "?[" .. alif .. alif_waSl .. "]" .. laam

	for idx = 1, #auto_words do
		local auto_word = auto_words[idx]
		local manual_word = manual_words[idx]
		local arabic_word = arabic_words[idx]

		-- Resolve final (t) in auto-translit to t, h or nothing, following
		-- whatever the manual translit uses
		if rfind(auto_word, "%(t%)$") then
			local ending
			if rfind(manual_word, "āh$") then
				ending = "h"
			elseif rfind(manual_word, "t$") then
				ending = "t"
			else
				ending = ""
			end
			auto_word = rsub(auto_word, "%(t%)$", ending)
		end

		-- Resolve clitics + short a + alif-lām, which may get auto-transliterated
		-- to contain long ā, to short a if the manual translit has it; note
		-- that currently in cases with assimilated l, the auto-translit will
		-- fail, so we won't ever get here and don't have to worry about
		-- auto-translit l against manual-translit assimilated char.
		if rfind(arabic_word, clitic_plus_article) and rfind(manual_word, "^[wfkl]a%-") then
			auto_word = rsub(auto_word, "^([wfkl])ā", "%1a")
		end

		-- Ignore hyphens when comparing
		if rsub(auto_word, "%-", "") ~= rsub(manual_word, "%-", "") then
			return true
		end
	end

	return false
end
return export