Module:fulf-translit: Difference between revisions

From Linguifex
Jump to navigation Jump to search
Created page with "-- Authors: Benwing, ZxxZxxZ, Atitarev local export = {} local m_str_utils = require("Module:string utilities") local gcodepoint = m_str_utils.gcodepoint local rfind = m_str_utils.find local rsubn = m_str_utils.gsub local rmatch = m_str_utils.match local rsplit = m_str_utils.split local U = m_str_utils.char local unpack = unpack or table.unpack -- Lua 5.2 compatibility -- assigned below local has_diacritics -- version of rsubn() that discards all but the first retur..."
 
Melinoë (talk | contribs)
No edit summary
Line 1: Line 1:
-- Authors: Benwing, ZxxZxxZ, Atitarev
local export = {}


local export = {}
--Contributors: Malku H₂n̥rés, Sartma, Erutuon, Metaknowledge


local m_str_utils = require("Module:string utilities")
local m_str_utils = require("Module:string utilities")


local gcodepoint = m_str_utils.gcodepoint
local gcodepoint = m_str_utils.gcodepoint
local rfind = m_str_utils.find
local match = m_str_utils.match
local rsubn = m_str_utils.gsub
local s = m_str_utils.gsub
local rmatch = m_str_utils.match
local rsplit = m_str_utils.split
local U = m_str_utils.char
local U = m_str_utils.char
local unpack = unpack or table.unpack -- Lua 5.2 compatibility


-- assigned below
local bidirectional_control_characters =
local has_diacritics
U(0x061C) .. U(0x200E) .. U(0x200F) .. U(0x202A) .. "-" .. U(0x202E)
.. U(0x2066) .. "-" .. U(0x2069)
local word_end = "%f[%s%z" .. bidirectional_control_characters .. "%-]"
local word_start = "%f[^%s%z" .. bidirectional_control_characters .. "%-]"
-- Bidirectional control characters should be avoided as much as possible,
-- but they are easily picked up when copying and pasting, so the module needs
-- to account for them.
-- This list is from [[w:Bidirectional control character]].


-- version of rsubn() that discards all but the first return value
local V = "[aɔɛeiăəou‌āēīōūêôáéíóúḗṓếố][̂̄̆]?́?"
local function rsub(term, foo, bar)
local C = "[ʔḇḡḏhwzḥṭylsʕqrśšṯ'ḵmnfṣbdgptkjc″vḫẓġTZCDK]"
local retval = rsubn(term, foo, bar)
 
return retval
local c = { --direct translit
end
--full char ie. C
    ["א"] = "ʔ",
    ["ב"] = "ḇ",
    ["ג"] = "ḡ",
    ["ד"] = "ḏ",
    ["ה"] = "h",
    ["ו"] = "w",
    ["ז"] = "z",
    ["ח"] = "ḥ",
    ["ט"] = "ṭ",
    ["י"] = "y",
    ["ל"] = "l",
    ["ס"] = "s",
    ["ע"] = "ʕ",
    ["ק"] = "q",
    ["ר"] = "r",
    ["ש"] = "š",
    ["ת"] = "ṯ",
--miscellaneous:
["׳"] = "'", --geresh
    ["־"] = "-", --hyphen
    ["׃"] = " .", --dot
["ׂ"] = "ˊ", --sin dot
["ׁ"] = "ˇ", --shin dot
    ["ּ"] = "·", --dagesh
["֫"] = "^", --oleh
["ֽ"] = "+", --meteg
--niqqud ie. V
["ַ"] = "a",
["ָ"] = "ɔ",
["ֶ"] = "ɛ",
["ֵ"] = "e",
["ִ"] = "i",
["ֳ"] = "ɔ̆",
["ֲ"] = "ă",
["ֱ"] = "ɛ̆",
["ְ"] = "ü",
["ֹ"] = "o",
["ֺ"] = "o",
["ֻ"] = "u",
["ׇ"] = "ɔ",
}
 
local b = { --BH
--when different final form
{"[כך]", "ḵ"},
{"[מם]", "m"},
{"[נן]", "n"},
{"[פף]", "f"},
{"[צץ]", "ṣ"},
 
{"(" .. V .. ")(·?)(+?)(^?)([ˊˇ]?'?)", "%5%2%1%4%3"},  --order: s(h)in dot, geresh, dagesh, vowel (niqqud), oleh, meteg
--bgdkft: fricative + dagesh > stop
{"ḇ·", "b"},
{"ḡ·", "g"},
{"ḏ·", "d"},
{"ṯ·", "t"},
{"ḵ·", "k"},
{"f·", "p"},
--s(h)in dot
{"ß(·?)ˇ", "š%1"},
{"ß(·?)ˊ", "ś%1"},
--vowel lengthenings
{"i([+^]?)y", "ī%1"}, --V > long / _{jw}{no V no dagesh}
{"ī([+^]?" .. V .. ")", "iy%1"},
{"ī·", "iy·"},
{"e([+^]?)y", "ē%1"},
{"ē([+^]?" .. V .. ")", "ey%1"},
{"ɛ([+^]?)y", "E%1"},  --see E > ɛ̄ below
{"E([+^]?" .. V .. ")", "ɛy%1"},
{"(" .. C .. "·?)wo", "%1ō"},
{"(" .. V .. "[+^]?)w·", "%1U"},
{"w·", "ū"},
{"U", "w·"},
{"(" .. C .. "·?)y·", "%1ī"},
--h > circumflex / V_{no V no dagesh}
{"(" .. V .. "[+^]?)h", "%1H"},
{"H(" .. V .. ")", "h%1"},
{"H·", "h"},
{"e([+^]?)H", "ê%1"},
{"o([+^]?)H", "ô%1"},
{"ɛ([+^]?)H", "ɛ̂%1"},
{"ɔ([+^]?)H", "ɔ̂%1"},
{"a([+^]?)H", "â%1"},
 
{"(" .. V .. "[+^]?%s?)(.)·(%s?" .. V .. ")", "%1%2%2%3"},  --dagesh gemination
{"[·ß]", ""},  --deletion of unpointed s(h)ins and useless dageshim
--schwa: Ə means "kept"
{"ə" .. word_end, ""},
{"ə([ḇḡḏḵfṯ])", "Ə%1"},
{"([+‌āēīōūoE])(" .. C .. ")ə", "%1%2Ə"},
{"E", "ɛ̄"},  --see >E above
{"(" .. C .. "ə?" .. C .. ")ə", "%1Ə"},
{"(" .. C .. ")Ə(" .. C .. ")([Əə])", "%1ə%2Ə"},
{word_start .. "([ūw]?a?" .. C .. ")ə", "%1Ə"},
{"ə", ""},
{"Ə", "ə"},


local zwnj = U(0x200C) -- zero-width non-joiner
{"([ʕhḥ])a(" .. word_end .. ")", "^a%1%2"},  --final /a/-guttural inversion
local alif_maddah = U(0x622)
--penultimate stress: segolates & -áyiC
local alif_hamza_below = U(0x625)
{"(" .. C .. "[eɛo])(%+?".. C .. "ɛ" .. C .. ")" .. word_end, "%1^%2"},
local alif = U(0x627)
{"(" .. C .. "a)(%+?".. C .. C .. "?a" .. C ..")" .. word_end, "%1^%2"},
local taa_marbuuTa = U(0x629)
{"ayi(" .. C .. ")" .. word_end, "a^yi%1"},
local laam = U(0x644)
--stress marking
local waaw = U(0x648)
{"a^", "á"},
local alif_maqSuura = U(0x649)
{"e^", "é"},
local yaa = U(0x64A)
{"i^", "í"},
local fatHataan = U(0x64B)
{"o^", "ó"},
local Dammataan = U(0x64C)
{"u^", "ú"},
local kasrataan = U(0x64D)
{"ɛ^", "ɛ́"},
local fatHa = U(0x64E)
{"ɔ^", "ɔ́"},
local Damma = U(0x64F)
{"ā^", "ā́"},
local kasra = U(0x650)
{"ē^", "ḗ"},
local shadda = U(0x651)
{"ī^", "ī́"},
local sukuun = U(0x652)
{"ō^", "ṓ"},
local maddah = U(0x653)
{"ū^", "ū́"},
local sub_alif = U(0x0656)
{"ɛ̄^", "ɛ̄́"},
local dagger_alif = U(0x670)
{"ɔ̄^", "ɔ̄́"},
local alif_waSl = U(0x671)
{"ê^", "ế"},
--local zwj = U(0x200D) -- zero-width joiner
{"ô^", "ố"},
local lrm = U(0x200E) -- left-to-right mark
{"ɛ̂^", "ɛ̂́"},
local rlm = U(0x200F) -- right-to-left mark
{"ɔ̂^", "ɔ̂́"},
-- Occurs after al- in allaḏī and variants so that we can implement elision of
-- a- after a preceding vowel, after which we remove the marker.
local alladi_marker = U(0xFFF0)


local tt = {
{"ɔyw(" .. word_end .. ")", "ɔw%1"}, --irregular…
-- consonants
{"(" .. V .. "[+^]?)([bdgptk])(" .. V .. ")", "%1%2%2%3"}, --dagesh bgdkft gemination
["ب"]="b", ["ت"]="t", ["ث"]="ṯ", ["ج"]="j", ["ح"]="ḥ", ["خ"]="ḵ",
{"f", ""}, --bc p̄ are 2 chars
["د"]="d", ["ذ"]="ḏ", ["ر"]="r", ["ز"]="z", ["س"]="s", ["ش"]="š",
{"%s%.", "."}, --quotes: " ." > "." (esthetics)
["ص"]="ṣ", ["ض"]="ḍ", ["ط"]="ṭ", ["ظ"]="ẓ", ["ع"]="ʕ", ["غ"]="ḡ",
["ف"]="f", ["ق"]="q", ["ک"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n",
["ه"]="h",
-- tāʾ marbūṭa (special) - always after a fátḥa (a), silent at the end of
-- an utterance, "t" in ʾiḍāfa or with pronounced tanwīn. We catch
-- most instances of tāʾ marbūṭa before we get to this stage.
[taa_marbuuTa]="t", -- tāʾ marbūṭa = ة
-- control characters
[zwnj]="-", -- ZWNJ (zero-width non-joiner)
-- [zwj]="", -- ZWJ (zero-width joiner)
-- rare letters
["پ"]="p", ["چ"]="c", ["ژ"]="ž", ["ڤ"]="v", ["ڥ"]="v", ["گ"]="g",
["ڨ"]="g", ["ڧ"]="q", ["ڢ"]="f", ["ں"]="n", ["ڭ"]="g",
-- semivowels or long vowels, alif, hamza, special letters
["ا"]="a", -- ʾalif
-- hamzated letters
["أ"]="ʔ", -- hamza over alif
[alif_hamza_below]="ʔ", -- hamza under alif
["ؤ"]="ʔ", -- hamza over wāw
["ئ"]="ʔ", -- hamza over yā
["ء"]="ʔ", -- hamza on the line
-- long vowels
[waaw]="w", --"ū" after ḍamma (u) and not before diacritic
[yaa]="y", --"ī" after kasra (i) and not before diacritic
[alif_maqSuura]="ā", -- ʾalif maqṣūra
[alif_maddah]="ʔā", -- ʾalif maddah
[alif_waSl]= "", -- hamzatu l-waṣl
[dagger_alif] = "ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic)
-- short vowels, šádda and sukūn
[fatHataan]="an", -- fatḥatan
[Dammataan]="un", -- ḍammatan
[kasrataan]="in", -- kasratan
[fatHa]="a", -- fatḥa
[Damma]="u", -- ḍamma
[kasra]="i", -- kasra
[sub_alif] = "ü", -- subscript ʾalif
[maddah] = "o",
-- šadda - doubled consonant
[sukuun]="", --sukūn - no vowel
-- ligatures
["ﻻ"]="lā",
["ﷲ"]="llāh",
-- taṭwīl
["ـ"]="", -- taṭwīl, no sound
-- numerals
["١"]="1", ["٢"]="2", ["٣"]="3", ["٤"]="4", ["٥"]="5",
["٦"]="6", ["٧"]="7", ["٨"]="8", ["٩"]="9", ["٠"]="0",
-- punctuation (leave on separate lines)
["؟"]="?", -- question mark
["«"]='“', -- quotation mark
["»"]='”', -- quotation mark
["٫"]=".", -- decimal point
["٬"]=",", -- thousands separator
["٪"]="%", -- percent sign
["،"]=",", -- comma
["؛"]=";" -- semicolon
}
}


local sun_letters = "تثدذرزسشصضطظلن"
--MH
-- For use in implementing sun-letter assimilation of ال (al-)
local m = { --direct change
local ttsun1 = {}
["ḏ"] = "d",
local ttsun2 = {}
["ḡ"] = "g",
local ttsun3 = {}
["ś"] = "s",
for cp in gcodepoint(sun_letters) do
["״"] = "″", --gershayim
local ch = U(cp)
["q"] = "k",
ttsun1[ch] = tt[ch]
["ī"] = "i",
ttsun2["l-" .. ch] = tt[ch] .. "-" .. ch
["ū"] = "u",
table.insert(ttsun3, tt[ch])
["́"] = "^", --stress marking conversion below
end
}
-- For use in implementing elision of al-
 
local sun_letters_tr = table.concat(ttsun3, "")
local l = {
--indirect
{"p̄", "f"},
{"[̂̆̄]", ""},
{"ḥ'", "ḫ"},
{"ṯ'", "T"},
{"ṭ'", "ẓ"},
{"g'", "j"},
{"z'", "Z"},
{"ṣ'", "C"},
{"d'", "D"},
{"[rʕ]'", "ġ"},
{"(.)%1", "%1"},
{"[ḇw]", "v"},
{"[ḵḥ]", "K"},
{"[ṯṭ]", "t"},
{"'", ""},
{"[ʔʕ]", "'"},
--above: loss of vowel length, loss of gemination, turning n-grams into 1 char, MH mergers.
 
--schwa
--prefixes
-- {word_start .. "([bvkKlšdm])ə", "%1e"},
-- {"(u[bvkKlšdm])ə", "%1e"},
--initial C clusters
{word_start .. "([rnmly])ə", "%1e"},
{word_start .. "(" .. C .. ")ə([h'])", "%1e%2"},
--internal
{"([ə+]" .. C .. ")ə", "%1e"},
{"(" .. C .. C .. ")ə", "%1e"},
{"[ə+]", ""}, --deletion of remaining schwa and metegim
 
--put here not above to avoid e/ə confusion
{"[āâă]", "a"},
{"[ēêɛ]", "e"},
{"[ōô]", "o"},
{"[ḗế]", "é"},
{"[ṓố]", "ó"},


local consonants_needing_vowels = "بتثجحخدذرزسشصضطظعغفقكڪلمنهپچژڤگڨڧڢںڭأإؤئءةﷲ"
{"(" .. word_start .. "[^áéíóú^]-[aeiouɔ])(" .. C .. "?" .. C .. "?)" .. word_end, "%1^%2"},  --module-explicit default final stress...
-- consonants on the right side; includes alif maddah
--same articulation > schwa insertion
local rconsonants = consonants_needing_vowels .. "ويآ"
{"([bp])([bp])", "%1e%2"},
-- consonants on the left side; does not include alif maddah
{"([vf])([vf])", "%1e%2"},
local lconsonants = consonants_needing_vowels .. "وي"
{"([dt])([dt])", "%1e%2"},
-- Arabic semicolon, comma, question mark; taṭwīl; period, exclamation point,
{"([DTṣ])([DTṣ])", "%1e%2"},
-- single quote for bold/italic, double quotes for quoted material
{"([zs])([zs])", "%1e%2"},
local punctuation = "؟،؛" .. "ـ" .. ".!'" .. '"'
{"([Zš])([Zš])", "%1e%2"},
local space_like = "%s'" .. '"'
{"([jC])([jC])", "%1e%2"},
local space_like_class = "[" .. space_like .. "]"
{"([gk])([gk])", "%1e%2"},
local numbers = "١٢٣٤٥٦٧٨٩٠"
{"(K)(K)", "%1e%2"},
{"(r)(r)", "%1e%2"},
{"''", "'e'"},


local before_diacritic_checking_subs = {
--a/o, including kol
------------ transformations prior to checking for diacritics --------------
{"ɔ(" .. C .. C .. ")", "o%1"},
-- random Koranic marks and presentation forms
{"ɔ(" .. C .. ")" .. word_end, "o%1"},
{U(0x06E1), sukuun}, -- "Small High Dotless Head of Khah" (variant of sukūn)
{"(" .. word_start .. "[kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
{U(0x06DA), ""}, -- "Small High Jeem"
{"([bvkKlšd][ea][kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
{U(0x06DF), ""}, -- "Small High Rounded Zero" (FIXME: correct?)
-- {"(m[ei][kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
{U(0x08F0), U(0x64B)}, -- "Open Fathatan"
{"(" .. word_start .. "u[kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
{U(0x08F1), U(0x64C)}, -- "Open Dammatan"
{"(ha[kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
{U(0x08F2), U(0x64D)}, -- "Open Kasratan"
{"ɔ", "a"},
{U(0x06E4), ""}, -- "Small High maddah" (FIXME: correct?)
{U(0x06D6), ""}, -- "Small High Ligature Sad with Lam with Alef Maksura" (FIXME: there are others we need to do)
{U(0x06E5), "و"},
{U(0x06E6), "ي"},
-- convert llh for allāh into ll+shadda+dagger-alif+h
{"لله", "للّٰه"},
-- shadda+short-vowel (including tanwīn vowels, i.e. -an -in -un) gets
-- replaced with short-vowel+shadda during NFC normalisation, which
-- MediaWiki does for all Unicode strings; however, it makes the
-- transliteration process inconvenient, so undo it.
{"([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. dagger_alif .. "])" .. shadda, shadda .. "%1"},
-- ignore Koranic gemination at beginning of word due to assimilation of preceding consonant
{" ([" .. lconsonants .. "])" .. shadda, " %1"},
-- ignore alif jamīla (otiose alif in 3pl verb forms)
--    #1: handle ḍamma + wāw + alif (final -ū)
{Damma .. waaw .. alif, Damma .. waaw},
--    #2: handle wāw + sukūn + alif (final -w in -aw in defective verbs)
--    this must go before the generation of w, which removes the waw here.
{waaw .. sukuun .. alif, waaw .. sukuun},
-- ignore final alif or alif maqṣūra following fatḥatan (e.g. in accusative
-- singular or words like عَصًا "stick" or هُدًى "guidance"; this is called
-- tanwin nasb)
{fatHataan .. "[" .. alif .. alif_maqSuura .. "]", fatHataan},
-- same but with the fatḥatan placed over the alif or alif maqṣūra
-- instead of over the previous letter (considered a misspelling but
-- common)
{"[" .. alif .. alif_maqSuura .. "]" .. fatHataan, fatHataan},
-- tāʾ marbūṭa should always be preceded by fatḥa, alif, alif maddah or
-- dagger alif; infer fatḥa if not
{"([^" .. fatHa .. alif .. alif_maddah .. dagger_alif .. "])" .. taa_marbuuTa, "%1" .. fatHa .. taa_marbuuTa},
-- similarly for alif between consonants, possibly marked with shadda
-- (does not apply to initial alif, which is silent when not marked with
-- hamza, or final alif, which might be pronounced as -an)
{"([" .. lconsonants .. "]" .. shadda .. "?)" .. alif .. "([" .. rconsonants .. "])",
"%1" .. fatHa .. alif .. "%2"},
-- infer fatḥa in case of non-fatḥa + alif/alif-maqṣūra + dagger alif
{"([^" .. fatHa .. "])([" .. alif .. alif_maqSuura .. "]" .. dagger_alif .. ")", "%1" .. fatHa .. "%2"},
-- infer kasra in case of hamza-under-alif not + kasra
{alif_hamza_below .. "([^" .. kasra .. kasrataan .. "])", alif_hamza_below .. kasra .. "%1"},
-- ignore dagger alif placed over regular alif or alif maqṣūra
{"([" .. alif .. alif_maqSuura .. "])" .. dagger_alif, "%1"},


----------- rest of these concern definite article alif-lām ----------
{"(" .. word_start .. C .. C .. "?" .. V .. ")^(" .. C .. "?" .. C .. "?" .. word_end .. ")", "%1%2"}, --…reader-implicit acute accent in monosyllabic
-- in kasra/ḍamma + alif + lam, make alif into hamzatu l-waṣl, so we
--stress marking
-- handle cases like بِالتَّوْفِيق (bi-t-tawfīq) correctly
{"a^", "á"},
{"([" .. Damma .. kasra .. "])" .. alif .. laam, "%1" .. alif_waSl .. laam},
{"e^", "é"},
-- al + consonant + shadda (only recognize word-initially if regular alif): remove shadda
{"i^", "í"},
{"^(" .. alif .. fatHa .. "?" .. laam .. "[" .. lconsonants .. "])" .. shadda, "%1"},
{"o^", "ó"},
{"(" .. space_like_class .. alif .. fatHa .. "?" .. laam .. "[" .. lconsonants .. "])" .. shadda, "%1"},
{"u^", "ú"},
{"(" .. alif_waSl .. fatHa .. "?" .. laam .. "[" .. lconsonants .. "])" .. shadda, "%1"},
--glottal stops: kept when {CV}'V,
-- handle l- hamzatu l-waṣl or word-initial al-
{"(" .. word_start .. ")'", "%1"},
{"^" .. alif .. fatHa .. "?" .. laam, "al-"},
{"'(" .. C .. ")", "%1"},
{"(" .. space_like_class .. ")" .. alif .. fatHa .. "?" .. laam, "%1al-"},
{"'(" .. word_end .. ")", "%1"},
-- next one for bi-t-tawfīq
--fake digraphs
{"([" .. Damma .. kasra .. "])" .. alif_waSl .. fatHa .. "?" .. laam, "%1-l-"},
{"([szck])h", "%1'h"},
-- next one for remaining hamzatu l-waṣl (at beginning of word)
--one char > displaying
{alif_waSl .. fatHa .. "?" .. laam, "l-"},
{"", "ts"},
-- special casing if the l in al- has a shadda on it (as in الَّذِي "that"),
{"š", "sh"},
-- so we don't mistakenly double the dash; insert a special marker here so
{"T", "t'"},
-- that we know later to elide the a- after a vowel
{"Z", "zh"},
{"l%-" .. shadda, "l" .. alladi_marker .. "l"},
{"C", "ch"},
-- implement assimilation of sun letters
{"D", "d'"},
{"l%-[" .. sun_letters .. "]", ttsun2},
{"K", "kh"},
}
}


-- Transliterate the word(s) in TEXT. LANG (the language) and SC (the script)
 
-- are ignored. OMIT_I3RAAB means leave out final short vowels (ʾiʿrāb).
function export.BH(text)
-- GRAY_I3RAAB means render transliterate short vowels (ʾiʿrāb) in gray.
text = s(s(text, '.', c), "[֣֖֣֑֣֣֧֛֖֥֧֛֥֖֑֣֖֥֔֗֗֙֔]", "") --remove cantillation marks so that it works for quotes too
-- FORCE_TRANSLIT causes even non-vocalized text to be transliterated
for a = 1, #b do
-- (normally the function checks for non-vocalized text and returns nil,
text = s(text, b[a][1], b[a][2])
-- since such text is ambiguous in transliteration).
function export.tr(text, lang, sc, omit_i3raab, gray_i3raab, force_translit)
-- make it possible to call this function from a template
if type(text) == "table" then
local function f(x) return (x ~= "") and x or nil end
text, lang, sc, omit_i3raab, force_translit =
f(text.args[1]), f(text.args[2]), f(text.args[3]), f(text.args[4]), f(text.args[5])
end
end
return text
end


for _, sub in ipairs(before_diacritic_checking_subs) do
function export.BH_tr(text)
text = rsub(text, sub[1], sub[2])
return (s(export.BH(text), "+", "")) --metegim kept for MH
end
 
function export.MH_tr(text)
local acronym = false
text = s(export.BH(text), '.', m) --.BH() to keep metegim, m is applied
if match(text, "″") and not match(text, V) then --acronym = gershayim & no V
text = s(s(s(text, "p̄", "p"), "ḇ", "b"), "ḵ", "k")
acronym = true
end
end
 
for a = 1, #l do --in any case, l is applied
if not force_translit and not has_diacritics(text) then
text = s(text, l[a][1], l[a][2])
require("Module:debug").track("ar-translit/lacking diacritics")
return nil
end
end
if acronym == true then
------------ transformations after checking for diacritics --------------
text = mw.ustring.upper(text)
-- Replace plain alif with hamzatu l-waṣl when followed by fatḥa/ḍamma/kasra.
-- Must go after handling of initial al-, which distinguishes alif-fatḥa
-- from alif w/hamzatu l-waṣl. Must go before generation of ū and ī, which
-- eliminate the ḍamma/kasra.
text = rsub(text, alif .. "([" .. fatHa .. Damma .. kasra .. "])", alif_waSl .. "%1")
-- ḍamma + waw not followed by a diacritic is ū, otherwise w
text = rsub(text, Damma .. waaw .. "([^" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. shadda .. sukuun .. dagger_alif .. "])", "ū%1")
text = rsub(text, Damma .. waaw .. "$", "ū")
-- kasra + yaa not followed by a diacritic (or ū from prev step) is ī, otherwise y
text = rsub(text, kasra .. yaa .. "([^" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. shadda .. sukuun .. dagger_alif .. "ū])", "ī%1")
text = rsub(text, kasra .. yaa .. "$", "ī")
-- convert shadda to double letter.
text = rsub(text, "(.)" .. shadda, "%1%1")
if not omit_i3raab and gray_i3raab then -- show ʾiʿrāb grayed in transliteration
-- decide whether to gray out the t in ﺓ. If word begins with al- or l-, yes.
-- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un.
text = rsub(text, "^(a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])",
'%1<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%2')
text = rsub(text, "(" .. space_like_class .. "a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])",
'%1<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%2')
text = rsub(text, taa_marbuuTa .. "([" .. fatHa .. Damma .. kasra .. "])", "t%1")
text = rsub(text, taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. "])",
'<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%1')
text = rsub(text, ".", {
[fatHataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">an</span>',
[kasrataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">in</span>',
[Dammataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">un</span>'
})
text = rsub(text, "([" .. fatHa .. Damma .. kasra .. "])(" .. space_like_class .. ")",
function(vowel, space)
vowel_repl = {
[fatHa] = '<span style="color: var(--wikt-palette-grey-8,#888)">a</span> ',
[kasra] = '<span style="color: var(--wikt-palette-grey-8,#888)">i</span> ',
[Damma] = '<span style="color: var(--wikt-palette-grey-8,#888)">u</span> '
}
return vowel_repl[vowel] .. space
end
)
text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "]$", {
[fatHa] = '<span style="color: var(--wikt-palette-grey-8,#888)">a</span>',
[kasra] = '<span style="color: var(--wikt-palette-grey-8,#888)">i</span>',
[Damma] = '<span style="color: var(--wikt-palette-grey-8,#888)">u</span>'
})
text = rsub(text, '</span><span style="color: var(--wikt-palette-grey-8,#888)">', "")
elseif omit_i3raab then -- omit ʾiʿrāb in transliteration
text = rsub(text, "[" .. fatHataan .. Dammataan .. kasrataan .. "]", "")
text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "](" .. space_like_class .. ")", "%1")
text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "]$", "")
end
end
-- tāʾ marbūṭa should not be rendered by -t if word-final even when
return text
-- ʾiʿrāb (desinential inflection) is shown; instead, use (t) before
end
-- whitespace, nothing when final; but render final -ﺍﺓ and -ﺁﺓ as -āh,
 
-- consistent with Wehr's dictionary
function export.tr(text, lang, sc)
-- Left-to-right or right-to-left mark at end of text will prevent tāʾ marbūṭa
if not sc then
-- from being transliterated correctly.
sc = require("Module:languages").getByCode(lang, nil, true):findBestScript(text):getCode()
text = string.gsub(text, lrm, "")
text = string.gsub(text, rlm, "")
text = rsub(text, "([" .. alif .. alif_maddah .. "])" .. taa_marbuuTa .. "$", "%1h")
-- Ignore final tāʾ marbūṭa (it appears as "a" due to the preceding
-- short vowel). Need to do this after graying or omitting word-final
-- ʾiʿrāb.
text = rsub(text, taa_marbuuTa .. "$", "")
text = rsub(text, taa_marbuuTa .. "(%p)", "%1")
if not omit_i3raab then -- show ʾiʿrāb in transliteration
text = rsub(text, taa_marbuuTa .. "(" .. space_like_class .. ")", "(t)%1")
else
-- When omitting ʾiʿrāb, show all non-absolutely-final instances of
-- tāʾ marbūṭa as (t), with trailing ʾiʿrāb omitted.
text = rsub(text, taa_marbuuTa, "(t)")
end
end
-- tatwīl should be rendered as - at beginning or end of word. It will
if sc ~= "Hebr" or not match(text, "[ְ-ֻ־ׇ״]") then
-- be rendered as nothing in the middle of a word (FIXME, do we want
return nil
-- this?)
elseif lang == "he" then
text = rsub(text, "", "-")
return export.MH_tr(text)
text = rsub(text, "(" .. space_like_class .. ")ـ",
elseif lang == "hbo" then --though useless
"%1-")
return export.BH_tr(text)
text = rsub(text, "ـ$", "-")
text = rsub(text, "ـ(" .. space_like_class .. ")", "-%1")
-- Now convert remaining Arabic chars according to table.
text = rsub(text, ".", tt)
text = rsub(text, "", "ā")
-- Implement elision of al- after a final vowel. We do this
-- conservatively, only handling elision of the definite article and related
-- terms (specifically, relative pronoun الَّذِي (allaḏī) and variants) rather
-- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives
-- or form-VII and above verbal nouns) partly because elision in
-- these cases isn't so common in MSA and partly to avoid excessive
-- elision in case of words written with initial bare alif instead of
-- properly with hamzated alif. Possibly we should reconsider.
text = rsub(text, "([aiuāīū]'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])",
"%1%2")
if gray_i3raab then
text = rsub(text, "([aiuāīū]'*</span>'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])",
"%1%2")
end
end
-- remove indicator of allaḏī, which has served its purpose
end
text = rsub(text, alladi_marker, "")
-- Special-case the transliteration of allāh, without the hyphen.
text = rsub(text, "^(a?)l%-lāh", "%1llāh")
text = rsub(text, "(" .. space_like_class .. "a?)l%-lāh", "%1llāh")
-- Compress multiple spaces, which may occur e.g. when removing Koranic diacritics.
text = rsub(text, "(%s)%s+", "%1")


return text
function export.tr_all(frame)
return export.BH_tr(frame.args[1]) .. ", " .. export.MH_tr(frame.args[1])
end
end


local has_diacritics_subs = {
--Erutuon's code for code points below
-- FIXME! What about lam-alif ligature?
-- remove punctuation and shadda
-- must go before removing final consonants
{"[" .. punctuation .. shadda .. "]", ""},
-- Remove consonants at end of word or utterance, so that we're OK with
-- words lacking iʿrāb (must go before removing other consonants).
-- If you want to catch places without iʿrāb, comment out the next two lines.
{"[" .. lconsonants .. "]$", ""},
{"[" .. lconsonants .. "]([%)%]}]?" .. space_like_class .. ")", "%1"},
-- remove consonants (or alif) when followed by diacritics
-- must go after removing shadda
-- do not remove the diacritics yet because we need them to handle
-- long-vowel sequences of diacritic + pseudo-consonant
{"[" .. lconsonants .. alif .. "]([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. sukuun .. dagger_alif .. "])", "%1"},
-- the following two must go after removing consonants w/diacritics because
-- we only want to treat vocalic wāw/yā' in them (we want to have removed
-- wāw/yā' followed by a diacritic)
-- remove ḍamma + wāw
{Damma .. waaw, ""},
-- remove kasra + yā'
{kasra .. yaa, ""},
-- remove fatḥa/fatḥatan + alif/alif-maqṣūra
{"[" .. fatHataan .. fatHa .. "][" .. alif .. alif_maqSuura .. "]", ""},
-- remove diacritics
{"[" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. sukuun .. dagger_alif .. "]", ""},
-- remove numbers, hamzatu l-waṣl, alif maddah
{"[" .. numbers .. "ٱ" .. "آ" .. "]", ""},
-- remove non-Arabic characters
{"[^" .. U(0x0600) .. "-" .. U(0x06FF) .. U(0x0750) .. "-" .. U(0x077F) ..
U(0x08A0) .. "-" .. U(0x08FF) .. U(0xFB50) .. "-" .. U(0xFDFF) ..
U(0xFE70) .. "-" .. U(0xFEFF) .. "]", ""}
}


-- declared as local above
--[[
function has_diacritics(text)
local Array = require "Module:array"
local orig_text = text
local function show_code_point_names(text)
local count
if not text then return "" end
text, count = rsubn(text, "[" .. lrm .. rlm .. "]", "")
local names = Array()
if count > 0 then
for cp in gcodepoint(text) do
require("Module:debug").track("ar-translit/lrm or rlm")
-- Remove HEBREW LETTER, HEBREW POINT, etc.
end
local name = require "Module:Unicode data".lookup_name(cp)
for _, sub in ipairs(has_diacritics_subs) do
:gsub(
text = rsub(text, unpack(sub))
"^HEBREW (%w+) ",
end
function(type)
if #text > 0 then
if type == "ACCENT" then return "ACCENT " else return "" end
mw.log(("Check for missing diacritics failed; original text '%s', text without diacritics '%s'"):format(
end)
orig_text, text))
:lower()
names:insert(name)
end
end
return #text == 0
return names:concat ", "
end
end


-- Return true if transliteration TR is an irregular transliteration of
 
-- ARABIC. Return false if ARABIC can't be transliterated. For purposes of
local old_s = s
-- establishing regularity, hyphens are ignored and word-final tāʾ marbūṭa
function s(...)
-- can be transliterated as "(t)", "" or "t".
local old = ...
function export.irregular_translit(arabic, tr)
local new = old_s(...)
if not arabic or arabic == "" or not tr or tr == "" then
if old ~= new then
return false
mw.log(show_code_point_names(old), show_code_point_names(new), ...)
end
local regtr = export.tr(arabic)
if not regtr or regtr == tr then
return false
end
local arwords = rsplit(arabic, " ")
local regwords = rsplit(regtr, " ")
local words = rsplit(tr, " ")
if #regwords ~= #words or #regwords ~= #arwords then
return true
end
for i=1,#regwords do
local regword = regwords[i]
local word = words[i]
local arword = arwords[i]
-- Resolve final (t) in auto-translit to t, h or nothing
if rfind(regword, "%(t%)$") then
regword = rfind(word, "āh$") and rsub(regword, "%(t%)$", "h") or
rfind(word, "t$") and rsub(regword, "%(t%)$", "t") or
rsub(regword, "%(t%)$", "")
end
-- Resolve clitics + short a + alif-lām, which may get auto-transliterated
-- to contain long ā, to short a if the manual translit has it; note
-- that currently in cases with assimilated l, the auto-translit will
-- fail, so we won't ever get here and don't have to worry about
-- auto-translit l against manual-translit assimilated char.
local clitic_chars = "^[وفكل]" -- separate line to avoid L2R display weirdness
if rfind(arword, clitic_chars .. fatHa .. "?[" .. alif .. alif_waSl .. "]" .. laam) and rfind(word, "^[wfkl]a%-") then
regword = rsub(regword, "^([wfkl])ā", "%1a")
end
-- Ignore hyphens when comparing
if rsub(regword, "%-", "") ~= rsub(word, "%-", "") then
return true
end
end
end
return false
return new
end
end
--]]


return export
return export

Revision as of 05:42, 28 June 2026


This module will transliterate Fulfathic language text. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:fulf-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

local export = {}

--Contributors: Malku H₂n̥rés, Sartma, Erutuon, Metaknowledge

local m_str_utils = require("Module:string utilities")

local gcodepoint = m_str_utils.gcodepoint
local match = m_str_utils.match
local s = m_str_utils.gsub
local U = m_str_utils.char

-- Characters (and character ranges) that are treated as word boundaries in
-- addition to whitespace. The string is spliced into a "[...]" set below, so
-- the literal "-" between two code points forms a range:
-- U+202A..U+202E and U+2066..U+2069.
local bidirectional_control_characters =
	U(0x061C) .. U(0x200E) .. U(0x200F) .. U(0x202A) .. "-" .. U(0x202E)
	.. U(0x2066) .. "-" .. U(0x2069)
-- Frontier pattern: matches the empty string where the next character is
-- whitespace, NUL (%z), one of the characters above or "-", and the previous
-- character is not.
local word_end = "%f[%s%z" .. bidirectional_control_characters .. "%-]"
-- Same set, negated: matches the empty string where the next character is
-- none of those and the previous character is one of them.
local word_start = "%f[^%s%z" .. bidirectional_control_characters .. "%-]"
-- Bidirectional control characters should be avoided as much as possible,
-- but they are easily picked up when copying and pasting, so the module needs
-- to account for them.
-- This list is from [[w:Bidirectional control character]].

-- Pattern fragments used to build the rules in tables b and l.
-- V: one romanized vowel letter from the first set, optionally followed by one
-- combining mark from the second set, optionally followed by one more
-- combining mark (presumably the acute accent used for stress -- confirm).
-- NOTE(review): the first set appears to contain an invisible character
-- between "u" and "ā"; edit this line only with a tool that shows it.
local V = "[aɔɛeiăəou‌āēīōūêôáéíóúḗṓếố][̂̄̆]?́?" 
-- C: one romanized consonant. The upper-case letters T Z C D K are one-character
-- placeholders that the last entries of table l turn into t' zh ch d' kh.
local C = "[ʔḇḡḏhwzḥṭylsʕqrśšṯ'ḵmnfṣbdgptkjc″vḫẓġTZCDK]"

-- Character-by-character lookup used by export.BH as the gsub replacement
-- table for the pattern '.': every input character that is a key here is
-- replaced by its value; any other character is left as it is.
-- The letters that have a distinct final form are not listed; they are handled
-- by the first entries of table b.
-- NOTE(review): "ש" yields "š" and the point mapped four lines above the end
-- of this table yields "ü", while table b contains rules that look for "ß"
-- and "ə". Nothing in this table produces those two characters -- confirm
-- whether that is intended.
local c = { --direct translit
	--full char ie. C
    ["א"] = "ʔ",
    ["ב"] = "ḇ",
    ["ג"] = "ḡ",
    ["ד"] = "ḏ",
    ["ה"] = "h",
    ["ו"] = "w",
    ["ז"] = "z",
    ["ח"] = "ḥ",
    ["ט"] = "ṭ",
    ["י"] = "y",
    ["ל"] = "l",
    ["ס"] = "s",
    ["ע"] = "ʕ",
    ["ק"] = "q",
    ["ר"] = "r",
    ["ש"] = "š",
    ["ת"] = "ṯ",
	--miscellaneous:
	-- the values below are one-character markers that the rules in table b
	-- reorder, consume or delete
	["׳"] = "'", --geresh
    ["־"] = "-", --hyphen
    ["׃"] = " .", --dot
	["ׂ"] = "ˊ", --sin dot
	["ׁ"] = "ˇ", --shin dot
    ["ּ"] = "·", --dagesh
	["֫"] = "^", --oleh
	["ֽ"] = "+", --meteg
	--niqqud ie. V
	["ַ"] = "a",
	["ָ"] = "ɔ",
	["ֶ"] = "ɛ",
	["ֵ"] = "e",
	["ִ"] = "i",
	["ֳ"] = "ɔ̆",
	["ֲ"] = "ă",
	["ֱ"] = "ɛ̆",
	["ְ"] = "ü",
	["ֹ"] = "o",
	["ֺ"] = "o",
	["ֻ"] = "u",
	["ׇ"] = "ɔ",
}

-- Ordered {pattern, replacement} rules for the BH romanization.
-- export.BH applies them one after another, each on the result of the previous
-- one, so the position of an entry in this list is significant.
-- The upper-case letters E, U and H and the character Ə are temporary
-- placeholders that a later rule in this list turns back into real output.
-- NOTE(review): several rules below look for "ß" and "ə", but table c emits
-- "š" for the letter shin and "ü" for the point that the schwa rules seem to
-- target, so those rules can only fire if the input already contains "ß"/"ə"
-- -- confirm whether this is intended.
local b = { --BH
	--when different final form
	{"[כך]", "ḵ"},
	{"[מם]", "m"},
	{"[נן]", "n"},
	{"[פף]", "f"},
	{"[צץ]", "ṣ"},

	{"(" .. V .. ")(·?)(+?)(^?)([ˊˇ]?'?)", "%5%2%1%4%3"},  --order: s(h)in dot, geresh, dagesh, vowel (niqqud), oleh, meteg
	--bgdkft: fricative + dagesh > stop
	{"ḇ·", "b"},
	{"ḡ·", "g"},
	{"ḏ·", "d"},
	{"ṯ·", "t"},
	{"ḵ·", "k"},
	{"f·", "p"},
	--s(h)in dot
	{"ß(·?)ˇ", "š%1"},
	{"ß(·?)ˊ", "ś%1"},
	--vowel lengthenings
	{"i([+^]?)y", "ī%1"}, --V > long / _{jw}{no V no dagesh}
	{"ī([+^]?" .. V .. ")", "iy%1"},
	{"ī·", "iy·"},
	{"e([+^]?)y", "ē%1"},
	{"ē([+^]?" .. V .. ")", "ey%1"},
	{"ɛ([+^]?)y", "E%1"},  --see E > ɛ̄ below
	{"E([+^]?" .. V .. ")", "ɛy%1"},
	{"(" .. C .. "·?)wo", "%1ō"},
	-- "U" protects w+dagesh after a vowel from the next rule, then is restored
	{"(" .. V .. "[+^]?)w·", "%1U"},
	{"w·", "ū"},
	{"U", "w·"},
	{"(" .. C .. "·?)y·", "%1ī"},
	--h > circumflex / V_{no V no dagesh}
	{"(" .. V .. "[+^]?)h", "%1H"},
	{"H(" .. V .. ")", "h%1"},
	{"H·", "h"},
	{"e([+^]?)H", "ê%1"},
	{"o([+^]?)H", "ô%1"},
	{"ɛ([+^]?)H", "ɛ̂%1"},
	{"ɔ([+^]?)H", "ɔ̂%1"},
	{"a([+^]?)H", "â%1"},

	{"(" .. V .. "[+^]?%s?)(.)·(%s?" .. V .. ")", "%1%2%2%3"},  --dagesh gemination
	{"[·ß]", ""},  --deletion of unpointed s(h)ins and useless dageshim
	--schwa: Ə means "kept"
	{"ə" .. word_end, ""},
	{"ə([ḇḡḏḵfṯ])", "Ə%1"},
	{"([+‌āēīōūoE])(" .. C .. ")ə", "%1%2Ə"},
	{"E", "ɛ̄"},  --see >E above
	{"(" .. C .. "ə?" .. C .. ")ə", "%1Ə"},
	{"(" .. C .. ")Ə(" .. C .. ")([Əə])", "%1ə%2Ə"},
	{word_start .. "([ūw]?a?" .. C .. ")ə", "%1Ə"},
	-- every schwa not marked as kept is dropped, then the kept ones are restored
	{"ə", ""},
	{"Ə", "ə"},

	{"([ʕhḥ])a(" .. word_end .. ")", "^a%1%2"},  --final /a/-guttural inversion
	--penultimate stress: segolates & -áyiC
	{"(" .. C .. "[eɛo])(%+?".. C .. "ɛ" .. C .. ")" .. word_end, "%1^%2"},
	{"(" .. C .. "a)(%+?".. C .. C .. "?a" .. C ..")" .. word_end, "%1^%2"},
	{"ayi(" .. C .. ")" .. word_end, "a^yi%1"},
	--stress marking
	-- vowel followed by the "^" marker > vowel carrying an acute accent
	{"a^", "á"},
	{"e^", "é"},
	{"i^", "í"},
	{"o^", "ó"},
	{"u^", "ú"},
	{"ɛ^", "ɛ́"},
	{"ɔ^", "ɔ́"},
	{"ā^", "ā́"},
	{"ē^", "ḗ"},
	{"ī^", "ī́"},
	{"ō^", "ṓ"},
	{"ū^", "ū́"},
	{"ɛ̄^", "ɛ̄́"},
	{"ɔ̄^", "ɔ̄́"},
	{"ê^", "ế"},
	{"ô^", "ố"},
	{"ɛ̂^", "ɛ̂́"},
	{"ɔ̂^", "ɔ̂́"},

	{"ɔyw(" .. word_end .. ")", "ɔw%1"},  --irregular…
	{"(" .. V .. "[+^]?)([bdgptk])(" .. V .. ")", "%1%2%2%3"},  --dagesh bgdkft gemination
	{"f", "p̄"},  --bc p̄ are 2 chars
	{"%s%.", "."},  --quotes: " ." > "." (esthetics)
}

--MH
-- Character-by-character lookup used by export.MH_tr as the gsub replacement
-- table for the pattern '.', applied to the output of export.BH. Characters
-- that are not keys here are left unchanged.
local m = { --direct change
	["ḏ"] = "d",
	["ḡ"] = "g",
	["ś"] = "s",
	["״"] = "″", --gershayim
	["q"] = "k",
	["ī"] = "i",
	["ū"] = "u",
	-- turns the combining accent back into the "^" marker that the rules of
	-- table l work with
	["́"] = "^", --stress marking conversion below
}

-- Ordered {pattern, replacement} rules for the MH romanization.
-- export.MH_tr applies them one after another, after table m, each on the
-- result of the previous one, so the position of an entry is significant.
-- The upper-case letters T Z C D K are one-character placeholders introduced
-- near the top of the list and expanded to digraphs by its last entries.
local l = {
	--indirect
	{"p̄", "f"},
	{"[̂̆̄]", ""},
	{"ḥ'", "ḫ"},
	{"ṯ'", "T"},
	{"ṭ'", "ẓ"},
	{"g'", "j"},
	{"z'", "Z"},
	{"ṣ'", "C"},
	{"d'", "D"},
	{"[rʕ]'", "ġ"},
	-- any character immediately followed by itself is reduced to one copy
	{"(.)%1", "%1"},
	{"[ḇw]", "v"},
	{"[ḵḥ]", "K"},
	{"[ṯṭ]", "t"},
	{"'", ""},
	{"[ʔʕ]", "'"},
	--above: loss of vowel length, loss of gemination, turning n-grams into 1 char, MH mergers.

	--schwa
	--prefixes
	-- {word_start .. "([bvkKlšdm])ə", "%1e"},
	-- {"(u[bvkKlšdm])ə", "%1e"},
	--initial C clusters
	{word_start .. "([rnmly])ə", "%1e"},
	{word_start .. "(" .. C .. ")ə([h'])", "%1e%2"},
	--internal
	{"([ə+]" .. C .. ")ə", "%1e"},
	{"(" .. C .. C .. ")ə", "%1e"},
	{"[ə+]", ""}, --deletion of remaining schwa and metegim

	--put here not above to avoid e/ə confusion
	{"[āâă]", "a"},
	{"[ēêɛ]", "e"},
	{"[ōô]", "o"},
	{"[ḗế]", "é"},
	{"[ṓố]", "ó"},

	{"(" .. word_start .. "[^áéíóú^]-[aeiouɔ])(" .. C .. "?" .. C .. "?)" .. word_end, "%1^%2"},  --module-explicit default final stress...
	--same articulation > schwa insertion
	{"([bp])([bp])", "%1e%2"},
	{"([vf])([vf])", "%1e%2"},
	{"([dt])([dt])", "%1e%2"},
	{"([DTṣ])([DTṣ])", "%1e%2"},
	{"([zs])([zs])", "%1e%2"},
	{"([Zš])([Zš])", "%1e%2"},
	{"([jC])([jC])", "%1e%2"},
	{"([gk])([gk])", "%1e%2"},
	{"(K)(K)", "%1e%2"},
	{"(r)(r)", "%1e%2"},
	{"''", "'e'"},

	--a/o, including kol
	{"ɔ(" .. C .. C .. ")", "o%1"},
	{"ɔ(" .. C .. ")" .. word_end, "o%1"},
	{"(" .. word_start .. "[kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
	{"([bvkKlšd][ea][kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
	-- {"(m[ei][kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
	{"(" .. word_start .. "u[kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
	{"(ha[kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
	-- every ɔ not turned into "o" by the rules above becomes "a"
	{"ɔ", "a"},

	{"(" .. word_start .. C .. C .. "?" .. V .. ")^(" .. C .. "?" .. C .. "?" .. word_end .. ")", "%1%2"},  --…reader-implicit acute accent in monosyllabic
	--stress marking
	{"a^", "á"},
	{"e^", "é"},
	{"i^", "í"},
	{"o^", "ó"},
	{"u^", "ú"},
	--glottal stops: kept when {CV}'V,
	{"(" .. word_start .. ")'", "%1"},
	{"'(" .. C .. ")", "%1"},
	{"'(" .. word_end .. ")", "%1"},
	--fake digraphs
	-- an apostrophe keeps s/z/c/k + h apart from the digraphs produced below
	{"([szck])h", "%1'h"},
	--one char > displaying
	{"ṣ", "ts"},
	{"š", "sh"},
	{"T", "t'"},
	{"Z", "zh"},
	{"C", "ch"},
	{"D", "d'"},
	{"K", "kh"},
}


--- Common first stage of both romanizations.
-- Replaces every character through the lookup table c, strips cantillation
-- marks, then runs the ordered rules of table b over the result.
-- The "+" markers produced for metegim are still present in the return value;
-- the callers decide what to do with them.
-- @param text string to romanize
-- @return the romanized string
function export.BH(text)
	local romanized = s(text, '.', c)
	--cantillation marks are removed so that quotations work too
	romanized = s(romanized, "[֣֖֣֑֣֣֧֛֖֥֧֛֥֖֑֣֖֥֔֗֗֙֔]", "")
	for _, rule in ipairs(b) do
		romanized = s(romanized, rule[1], rule[2])
	end
	return romanized
end

--- BH transliteration: the output of export.BH with the meteg markers ("+")
-- removed (export.BH keeps them because export.MH_tr still needs them).
-- @param text string to transliterate
-- @return the transliterated string (exactly one value)
function export.BH_tr(text)
	-- "+" is a magic character in Lua patterns. A bare "+" only works as a
	-- literal because of how the pattern parser treats a quantifier with
	-- nothing before it; "%+" is the documented way to match a literal plus.
	local without_metegim = s(export.BH(text), "%+", "")
	-- assigning to a local drops the substitution count returned by s
	return without_metegim
end

--- MH transliteration.
-- Starts from export.BH (metegim still present), applies the one-to-one
-- replacements of table m, then the ordered rules of table l. Text that
-- contains a gershayim mark and no vowel at all is treated as an acronym:
-- its fricative letters are turned into stops first and the final result is
-- upper-cased.
-- @param text string to transliterate
-- @return the transliterated string
function export.MH_tr(text)
	local result = s(export.BH(text), '.', m)

	--acronym = gershayim & no V
	local is_acronym = match(result, "″") and not match(result, V)
	if is_acronym then
		result = s(result, "p̄", "p")
		result = s(result, "ḇ", "b")
		result = s(result, "ḵ", "k")
	end

	--table l is applied whether or not the text is an acronym
	for _, rule in ipairs(l) do
		result = s(result, rule[1], rule[2])
	end

	if is_acronym then
		return mw.ustring.upper(result)
	end
	return result
end

--- Entry point used by the transliteration infrastructure.
-- @param text string to transliterate
-- @param lang language code; "he" selects export.MH_tr, "hbo" export.BH_tr
-- @param sc script code; when missing it is detected from the text through
--   Module:languages
-- @return the transliteration, or nil when the script is not "Hebr" or the
--   text contains none of the characters in the class tested below
-- NOTE(review): for any other language code the function ends without
-- returning a value -- confirm which branch this module's own language
-- should take.
function export.tr(text, lang, sc)
	sc = sc or require("Module:languages").getByCode(lang, nil, true):findBestScript(text):getCode()

	local transliterable = sc == "Hebr" and match(text, "[ְ-ֻ־ׇ״]")
	if not transliterable then
		return nil
	end

	if lang == "he" then
		return export.MH_tr(text)
	elseif lang == "hbo" then --though useless
		return export.BH_tr(text)
	end
end

--- Template-facing helper: returns both transliterations of the first
-- argument, BH first and MH second, separated by ", ".
-- @param frame frame object whose args[1] holds the text
-- @return string of the form "<BH>, <MH>"
function export.tr_all(frame)
	local input = frame.args[1]
	local biblical = export.BH_tr(input)
	local modern = export.MH_tr(input)
	return biblical .. ", " .. modern
end

--Erutuon's code for code points below

--[[
local Array = require "Module:array"
local function show_code_point_names(text)
	if not text then return "" end
	local names = Array()
	for cp in gcodepoint(text) do
		-- Remove HEBREW LETTER, HEBREW POINT, etc.
		local name = require "Module:Unicode data".lookup_name(cp)
			:gsub(
				"^HEBREW (%w+) ",
				function(type)
					if type == "ACCENT" then return "ACCENT " else return "" end
				end)
			:lower()
		names:insert(name)
	end
	return names:concat ", "
end


local old_s = s
function s(...)
	local old = ...
	local new = old_s(...)
	if old ~= new then
		mw.log(show_code_point_names(old), show_code_point_names(new), ...)
	end
	return new
end
--]]

return export