Module:ar-translit: Difference between revisions
Created page with "-- Authors: Benwing, ZxxZxxZ, Atitarev local export = {} local U = mw.ustring.char local rfind = mw.ustring.find local rsubn = mw.ustring.gsub local rmatch = mw.ustring.matc..." |
m 1 revision imported |
||
| (One intermediate revision by one other user not shown) | |||
| Line 3: | Line 3: | ||
local export = {} | local export = {} | ||
local | local m_str_utils = require("Module:string utilities") | ||
local rfind = | |||
local rsubn = | local gcodepoint = m_str_utils.gcodepoint | ||
local rmatch = | local rfind = m_str_utils.find | ||
local rsplit = | local rsubn = m_str_utils.gsub | ||
local | local rmatch = m_str_utils.match | ||
local rsplit = m_str_utils.split | |||
local U = m_str_utils.char | |||
local unpack = unpack or table.unpack -- Lua 5.2 compatibility | |||
-- assigned below | -- assigned below | ||
| Line 19: | Line 22: | ||
end | end | ||
local zwnj = U( | local zwnj = U(0x200C) -- zero-width non-joiner | ||
local alif_madda = U(0x622) | local alif_madda = U(0x622) | ||
local alif_hamza_below = U(0x625) | local alif_hamza_below = U(0x625) | ||
| Line 38: | Line 41: | ||
local dagger_alif = U(0x670) | local dagger_alif = U(0x670) | ||
local alif_waSl = U(0x671) | local alif_waSl = U(0x671) | ||
--local zwj = U( | --local zwj = U(0x200D) -- zero-width joiner | ||
local lrm = U( | local lrm = U(0x200E) -- left-to-right mark | ||
local rlm = U( | local rlm = U(0x200F) -- right-to-left mark | ||
-- Occurs after al- in allaḏī and variants so that we can implement elision of | |||
-- a- after a preceding vowel, after which we remove the marker. | |||
local alladi_marker = U(0xFFF0) | |||
local tt = { | local tt = { | ||
| Line 46: | Line 52: | ||
["ب"]="b", ["ت"]="t", ["ث"]="ṯ", ["ج"]="j", ["ح"]="ḥ", ["خ"]="ḵ", | ["ب"]="b", ["ت"]="t", ["ث"]="ṯ", ["ج"]="j", ["ح"]="ḥ", ["خ"]="ḵ", | ||
["د"]="d", ["ذ"]="ḏ", ["ر"]="r", ["ز"]="z", ["س"]="s", ["ش"]="š", | ["د"]="d", ["ذ"]="ḏ", ["ر"]="r", ["ز"]="z", ["س"]="s", ["ش"]="š", | ||
["ص"]="ṣ", ["ض"]="ḍ", ["ط"]="ṭ", ["ظ"]="ẓ", ["ع"]=" | ["ص"]="ṣ", ["ض"]="ḍ", ["ط"]="ṭ", ["ظ"]="ẓ", ["ع"]="ʕ", ["غ"]="ḡ", | ||
["ف"]="f", ["ق"]="q", ["ك"]="k", ["ڪ"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n", | ["ف"]="f", ["ق"]="q", ["ك"]="k", ["ڪ"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n", | ||
["ه"]="h", | ["ه"]="h", | ||
| Line 57: | Line 63: | ||
-- [zwj]="", -- ZWJ (zero-width joiner) | -- [zwj]="", -- ZWJ (zero-width joiner) | ||
-- rare letters | -- rare letters | ||
["پ"]="p", ["چ"]="č", ["ڤ"]="v", ["ڥ"]="v", ["گ"]="g", ["ڨ"]="g", ["ڧ"]="q", ["ڢ"]="f", ["ں"]="n", ["ڭ"]="g", | ["پ"]="p", ["چ"]="č", ["ژ"]="ž", ["ڤ"]="v", ["ڥ"]="v", ["گ"]="g", | ||
["ڨ"]="g", ["ڧ"]="q", ["ڢ"]="f", ["ں"]="n", ["ڭ"]="g", | |||
-- semivowels or long vowels, alif, hamza, special letters | -- semivowels or long vowels, alif, hamza, special letters | ||
["ا"]="ā", -- ʾalif | ["ا"]="ā", -- ʾalif | ||
-- hamzated letters | -- hamzated letters | ||
["أ"]=" | ["أ"]="ʔ", -- hamza over alif | ||
[alif_hamza_below]=" | [alif_hamza_below]="ʔ", -- hamza under alif | ||
["ؤ"]=" | ["ؤ"]="ʔ", -- hamza over wāw | ||
["ئ"]=" | ["ئ"]="ʔ", -- hamza over yā | ||
["ء"]=" | ["ء"]="ʔ", -- hamza on the line | ||
-- long vowels | -- long vowels | ||
[waaw]="w", --"ū" after ḍamma (u) and not before diacritic | [waaw]="w", --"ū" after ḍamma (u) and not before diacritic | ||
[yaa]="y", --"ī" after kasra (i) and not before diacritic | [yaa]="y", --"ī" after kasra (i) and not before diacritic | ||
[alif_maqSuura]="ā", -- ʾalif maqṣūra | [alif_maqSuura]="ā", -- ʾalif maqṣūra | ||
[alif_madda]=" | [alif_madda]="ʔā", -- ʾalif madda | ||
[alif_waSl]= "", -- hamzatu l-waṣl | [alif_waSl]= "", -- hamzatu l-waṣl | ||
[dagger_alif] = "ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic) | [dagger_alif] = "ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic) | ||
| Line 115: | Line 122: | ||
local sun_letters_tr = table.concat(ttsun3, "") | local sun_letters_tr = table.concat(ttsun3, "") | ||
local consonants_needing_vowels = " | local consonants_needing_vowels = "بتثجحخدذرزسشصضطظعغفقكڪلمنهپچژڤگڨڧڢںڭأإؤئءةﷲ" | ||
-- consonants on the right side; includes alif madda | -- consonants on the right side; includes alif madda | ||
local rconsonants = consonants_needing_vowels .. "ويآ" | local rconsonants = consonants_needing_vowels .. "ويآ" | ||
| Line 129: | Line 136: | ||
local before_diacritic_checking_subs = { | local before_diacritic_checking_subs = { | ||
------------ transformations prior to checking for diacritics -------------- | ------------ transformations prior to checking for diacritics -------------- | ||
-- random Koranic marks and presentation forms | |||
{U(0x06E1), sukuun}, -- "Small High Dotless Head of Khah" (variant of sukūn) | |||
{U(0x06DA), ""}, -- "Small High Jeem" | |||
{U(0x06DF), ""}, -- "Small High Rounded Zero" (FIXME: correct?) | |||
{U(0x08F0), U(0x64B)}, -- "Open Fathatan" | |||
{U(0x08F1), U(0x64C)}, -- "Open Dammatan" | |||
{U(0x08F2), U(0x64D)}, -- "Open Kasratan" | |||
{U(0x06E4), ""}, -- "Small High Madda" (FIXME: correct?) | |||
{U(0x06D6), ""}, -- "Small High Ligature Sad with Lam with Alef Maksura" (FIXME: there are others we need to do) | |||
{U(0x06E5), "و"}, | |||
{U(0x06E6), "ي"}, | |||
-- convert llh for allāh into ll+shadda+dagger-alif+h | -- convert llh for allāh into ll+shadda+dagger-alif+h | ||
{"لله", "للّٰه"}, | {"لله", "للّٰه"}, | ||
| Line 136: | Line 154: | ||
-- transliteration process inconvenient, so undo it. | -- transliteration process inconvenient, so undo it. | ||
{"([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. dagger_alif .. "])" .. shadda, shadda .. "%1"}, | {"([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. dagger_alif .. "])" .. shadda, shadda .. "%1"}, | ||
-- ignore Koranic gemination at beginning of word due to assimilation of preceding consonant | |||
{" ([" .. lconsonants .. "])" .. shadda, " %1"}, | |||
-- ignore alif jamīla (otiose alif in 3pl verb forms) | -- ignore alif jamīla (otiose alif in 3pl verb forms) | ||
-- #1: handle ḍamma + wāw + alif (final -ū) | -- #1: handle ḍamma + wāw + alif (final -ū) | ||
| Line 181: | Line 201: | ||
{alif_waSl .. fatHa .. "?" .. laam, "l-"}, | {alif_waSl .. fatHa .. "?" .. laam, "l-"}, | ||
-- special casing if the l in al- has a shadda on it (as in الَّذِي "that"), | -- special casing if the l in al- has a shadda on it (as in الَّذِي "that"), | ||
-- so we don't mistakenly double the dash | -- so we don't mistakenly double the dash; insert a special marker here so | ||
{"l%-" .. shadda, " | -- that we know later to elide the a- after a vowel | ||
{"l%-" .. shadda, "l" .. alladi_marker .. "l"}, | |||
-- implement assimilation of sun letters | -- implement assimilation of sun letters | ||
{"l%-[" .. sun_letters .. "]", ttsun2}, | {"l%-[" .. sun_letters .. "]", ttsun2}, | ||
| Line 206: | Line 227: | ||
if not force_translit and not has_diacritics(text) then | if not force_translit and not has_diacritics(text) then | ||
require("Module:debug").track("ar-translit/lacking diacritics") | |||
return nil | return nil | ||
end | end | ||
| Line 227: | Line 249: | ||
-- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un. | -- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un. | ||
text = rsub(text, "^(a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])", | text = rsub(text, "^(a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])", | ||
'%1<span style="color: # | '%1<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%2') | ||
text = rsub(text, "(" .. space_like_class .. "a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])", | text = rsub(text, "(" .. space_like_class .. "a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])", | ||
'%1<span style="color: # | '%1<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%2') | ||
text = rsub(text, taa_marbuuTa .. "([" .. fatHa .. Damma .. kasra .. "])", "t%1") | text = rsub(text, taa_marbuuTa .. "([" .. fatHa .. Damma .. kasra .. "])", "t%1") | ||
text = rsub(text, taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. "])", | text = rsub(text, taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. "])", | ||
'<span style="color: # | '<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%1') | ||
text = rsub(text, ".", { | text = rsub(text, ".", { | ||
[fatHataan] = '<span style="color: # | [fatHataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">an</span>', | ||
[kasrataan] = '<span style="color: # | [kasrataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">in</span>', | ||
[Dammataan] = '<span style="color: # | [Dammataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">un</span>' | ||
}) | }) | ||
text = rsub(text, "([" .. fatHa .. Damma .. kasra .. "])(" .. space_like_class .. ")", | text = rsub(text, "([" .. fatHa .. Damma .. kasra .. "])(" .. space_like_class .. ")", | ||
function(vowel, space) | function(vowel, space) | ||
vowel_repl = { | vowel_repl = { | ||
[fatHa] = '<span style="color: # | [fatHa] = '<span style="color: var(--wikt-palette-grey-8,#888)">a</span> ', | ||
[kasra] = '<span style="color: # | [kasra] = '<span style="color: var(--wikt-palette-grey-8,#888)">i</span> ', | ||
[Damma] = '<span style="color: # | [Damma] = '<span style="color: var(--wikt-palette-grey-8,#888)">u</span> ' | ||
} | } | ||
return vowel_repl[vowel] .. space | return vowel_repl[vowel] .. space | ||
| Line 249: | Line 271: | ||
) | ) | ||
text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "]$", { | text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "]$", { | ||
[fatHa] = '<span style="color: # | [fatHa] = '<span style="color: var(--wikt-palette-grey-8,#888)">a</span>', | ||
[kasra] = '<span style="color: # | [kasra] = '<span style="color: var(--wikt-palette-grey-8,#888)">i</span>', | ||
[Damma] = '<span style="color: # | [Damma] = '<span style="color: var(--wikt-palette-grey-8,#888)">u</span>' | ||
}) | }) | ||
text = rsub(text, '</span><span style="color: # | text = rsub(text, '</span><span style="color: var(--wikt-palette-grey-8,#888)">', "") | ||
elseif omit_i3raab then -- omit ʾiʿrāb in transliteration | elseif omit_i3raab then -- omit ʾiʿrāb in transliteration | ||
text = rsub(text, "[" .. fatHataan .. Dammataan .. kasrataan .. "]", "") | text = rsub(text, "[" .. fatHataan .. Dammataan .. kasrataan .. "]", "") | ||
| Line 292: | Line 314: | ||
text = rsub(text, "aā", "ā") | text = rsub(text, "aā", "ā") | ||
-- Implement elision of al- after a final vowel. We do this | -- Implement elision of al- after a final vowel. We do this | ||
-- conservatively, only handling elision of the definite article rather | -- conservatively, only handling elision of the definite article and related | ||
-- terms (specifically, relative pronoun الَّذِي (allaḏī) and variants) rather | |||
-- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives | -- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives | ||
-- or form-VII and above verbal nouns) partly because elision in | -- or form-VII and above verbal nouns) partly because elision in | ||
| Line 298: | Line 321: | ||
-- elision in case of words written with initial bare alif instead of | -- elision in case of words written with initial bare alif instead of | ||
-- properly with hamzated alif. Possibly we should reconsider. | -- properly with hamzated alif. Possibly we should reconsider. | ||
text = rsub(text, "([aiuāīū]'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])", | |||
text = rsub(text, "([aiuāīū]'* +'*)a([" .. sun_letters_tr .. "]%-)", | |||
"%1%2") | "%1%2") | ||
if gray_i3raab then | if gray_i3raab then | ||
text = rsub(text, "([aiuāīū]'*</span>'* +'*)a([" .. sun_letters_tr .. "]%-)", | text = rsub(text, "([aiuāīū]'*</span>'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])", | ||
"%1%2") | "%1%2") | ||
end | end | ||
-- Special-case the transliteration of allāh, without the hyphen | -- remove indicator of allaḏī, which has served its purpose | ||
text = rsub(text, alladi_marker, "") | |||
-- Special-case the transliteration of allāh, without the hyphen. | |||
text = rsub(text, "^(a?)l%-lāh", "%1llāh") | text = rsub(text, "^(a?)l%-lāh", "%1llāh") | ||
text = rsub(text, "(" .. space_like_class .. "a?)l%-lāh", "%1llāh") | text = rsub(text, "(" .. space_like_class .. "a?)l%-lāh", "%1llāh") | ||
-- Compress multiple spaces, which may occur e.g. when removing Koranic diacritics. | |||
text = rsub(text, "(%s)%s+", "%1") | |||
return text | return text | ||
| Line 323: | Line 347: | ||
-- If you want to catch places without iʿrāb, comment out the next two lines. | -- If you want to catch places without iʿrāb, comment out the next two lines. | ||
{"[" .. lconsonants .. "]$", ""}, | {"[" .. lconsonants .. "]$", ""}, | ||
{"[" .. lconsonants .. "](" .. space_like_class .. ")", "%1"}, | {"[" .. lconsonants .. "]([%)%]}]?" .. space_like_class .. ")", "%1"}, | ||
-- remove consonants (or alif) when followed by diacritics | -- remove consonants (or alif) when followed by diacritics | ||
-- must go after removing shadda | -- must go after removing shadda | ||
| Line 350: | Line 374: | ||
-- declared as local above | -- declared as local above | ||
function has_diacritics(text) | function has_diacritics(text) | ||
local orig_text = text | |||
local count | local count | ||
text, count = rsubn(text, "[" .. lrm .. rlm .. "]", "") | text, count = rsubn(text, "[" .. lrm .. rlm .. "]", "") | ||
| Line 357: | Line 382: | ||
for _, sub in ipairs(has_diacritics_subs) do | for _, sub in ipairs(has_diacritics_subs) do | ||
text = rsub(text, unpack(sub)) | text = rsub(text, unpack(sub)) | ||
end | |||
if #text > 0 then | |||
mw.log(("Check for missing diacritics failed; original text '%s', text without diacritics '%s'"):format( | |||
orig_text, text)) | |||
end | end | ||
return #text == 0 | return #text == 0 | ||
| Line 407: | Line 436: | ||
return export | return export | ||