|
|
| (19 intermediate revisions by the same user not shown) |
| Line 1: |
Line 1: |
| local export = {}
| | -- ts doesn't work, not bothering with it more |
| | |
| --Contributors: Malku H₂n̥rés, Sartma, Erutuon, Metaknowledge | |
| | |
| local m_str_utils = require("Module:string utilities")
| |
| | |
| local gcodepoint = m_str_utils.gcodepoint
| |
| local match = m_str_utils.match
| |
| local s = m_str_utils.gsub
| |
| local U = m_str_utils.char
| |
| | |
-- Bidirectional control characters should be avoided as much as possible,
-- but they are easily picked up when copying and pasting, so the module
-- has to treat them like whitespace when looking for word boundaries.
-- The list comes from [[w:Bidirectional control character]]. The "-"
-- entries turn their neighbours into ranges once the string is placed
-- inside a pattern set.
local bidirectional_control_characters = table.concat({
	U(0x061C), U(0x200E), U(0x200F),
	U(0x202A), "-", U(0x202E),
	U(0x2066), "-", U(0x2069),
})

-- Body of the set shared by both frontier patterns: whitespace, NUL,
-- the bidi controls above and a literal hyphen.
local boundary_set = "%s%z" .. bidirectional_control_characters .. "%-"

-- Frontier where a boundary character begins, i.e. the end of a word.
local word_end = "%f[" .. boundary_set .. "]"
-- Frontier where a non-boundary character begins, i.e. the start of a word.
local word_start = "%f[^" .. boundary_set .. "]"
| |
| | |
-- V: pattern for one transliterated vowel. It is a base vowel letter,
-- optionally followed by one combining mark from the bracketed set,
-- optionally followed by a combining acute accent.
local vowel_letters = "[aɔɛeiăəouāēīōūêôáéíóúḗṓếố]"
local vowel_mark = "[̂̄̆]?"
local vowel_acute = "́?"
local V = vowel_letters .. vowel_mark .. vowel_acute

-- C: pattern set for one transliterated consonant, including the
-- upper-case single-character placeholders used by the rule tables.
local consonant_letters = "ʔḇḡḏhwzḥṭylsʕqrśšṯ'ḵmnfṣbdgptkjc″vḫẓġTZCDK"
local C = "[" .. consonant_letters .. "]"
| |
| | |
-- Direct one-character transliteration map, applied by export.BH with
-- gsub(text, '.', c): every Hebrew character that appears as a key is
-- replaced by its value, everything else is left unchanged.
-- Several values are intermediate markers that the ordered rules in `b`
-- (and later `l`) consume, so they must agree with those rules exactly.
local c = { --direct translit
	--full char ie. C
	["א"] = "ʔ",
	["ב"] = "ḇ",
	["ג"] = "ḡ",
	["ד"] = "ḏ",
	["ה"] = "h",
	["ו"] = "w",
	["ז"] = "z",
	["ח"] = "ḥ",
	["ט"] = "ṭ",
	["י"] = "y",
	["ל"] = "l",
	["ס"] = "s",
	["ע"] = "ʕ",
	["ק"] = "q",
	["ר"] = "r",
	-- Undotted placeholder: the rules in `b` turn "ß" + "ˇ" into "š" and
	-- "ß" + "ˊ" into "ś", and delete a "ß" that carries no dot. Mapping
	-- straight to "š" would leave the dot markers in the output.
	["ש"] = "ß",
	["ת"] = "ṯ",
	--miscellaneous:
	["׳"] = "'", --geresh
	["־"] = "-", --hyphen
	["׃"] = " .", --dot
	["ׂ"] = "ˊ", --sin dot
	["ׁ"] = "ˇ", --shin dot
	["ּ"] = "·", --dagesh
	["֫"] = "^", --oleh
	["ֽ"] = "+", --meteg
	--niqqud ie. V
	["ַ"] = "a",
	["ָ"] = "ɔ",
	["ֶ"] = "ɛ",
	["ֵ"] = "e",
	["ִ"] = "i",
	["ֳ"] = "ɔ̆",
	["ֲ"] = "ă",
	["ֱ"] = "ɛ̆",
	-- Schwa: every schwa rule in `b` and `l` (and the vowel pattern V)
	-- works on "ə", so that is the symbol that has to be emitted here.
	["ְ"] = "ə",
	["ֹ"] = "o",
	["ֺ"] = "o",
	["ֻ"] = "u",
	["ׇ"] = "ɔ",
}
| |
| | |
-- Ordered rewrite rules for the BH transliteration. export.BH applies
-- them first to last as gsub(text, rule[1], rule[2]), so the order is
-- significant: later rules rely on the output of earlier ones.
-- Intermediate markers: "·" dagesh, "+" meteg, "^" stress/oleh,
-- "ˊ"/"ˇ" sin/shin dot, "ß" undotted shin, and the temporaries
-- "E", "U", "H" and "Ə" that are resolved again further down.
local b = { --BH
	--when different final form
	{"[כך]", "ḵ"},
	{"[מם]", "m"},
	{"[נן]", "n"},
	{"[פף]", "f"},
	{"[צץ]", "ṣ"},

	-- "+" is escaped so that it is an explicit literal meteg marker, as in
	-- the "%+?" rules below; a bare "+" there only worked by accident of
	-- how stray quantifiers are parsed. "^" is literal away from the start
	-- of a pattern.
	{"(" .. V .. ")(·?)(%+?)(^?)([ˊˇ]?'?)", "%5%2%1%4%3"}, --order: s(h)in dot, geresh, dagesh, vowel (niqqud), oleh, meteg
	--bgdkft: fricative + dagesh > stop
	{"ḇ·", "b"},
	{"ḡ·", "g"},
	{"ḏ·", "d"},
	{"ṯ·", "t"},
	{"ḵ·", "k"},
	{"f·", "p"},
	--s(h)in dot
	{"ß(·?)ˇ", "š%1"},
	{"ß(·?)ˊ", "ś%1"},
	--vowel lengthenings
	{"i([+^]?)y", "ī%1"}, --V > long / _{jw}{no V no dagesh}
	{"ī([+^]?" .. V .. ")", "iy%1"},
	{"ī·", "iy·"},
	{"e([+^]?)y", "ē%1"},
	{"o([+^]?)w", "ō%1"},
	{"ē([+^]?" .. V .. ")", "ey%1"},
	{"ɛ([+^]?)y", "E%1"}, --see E > ɛ̄ below
	{"E([+^]?" .. V .. ")", "ɛy%1"},
	{"(" .. C .. "·?)wo", "%1ō"},
	{"(" .. V .. "[+^]?)w·", "%1U"}, --protect w+dagesh after a vowel from the next rule
	{"w·", "ū"},
	{"U", "w·"}, --restore the protected w+dagesh
	{"(" .. C .. "·?)y·", "%1ī"},
	--h > circumflex / V_{no V no dagesh}
	{"(" .. V .. "[+^]?)h", "%1H"},
	{"H(" .. V .. ")", "h%1"},
	{"H·", "h"},
	{"e([+^]?)H", "ê%1"},
	{"o([+^]?)H", "ô%1"},
	{"ɛ([+^]?)H", "ɛ̂%1"},
	{"ɔ([+^]?)H", "ɔ̂%1"},
	{"a([+^]?)H", "â%1"},

	{"(" .. V .. "[+^]?%s?)(.)·(%s?" .. V .. ")", "%1%2%2%3"}, --dagesh gemination
	{"[·ß]", ""}, --deletion of unpointed s(h)ins and useless dageshim
	--schwa: Ə means "kept"
	{"ə" .. word_end, ""},
	{"ə([ḇḡḏḵfṯ])", "Ə%1"},
	{"([+āēīōūoE])(" .. C .. ")ə", "%1%2Ə"},
	{"E", "ɛ̄"}, --see >E above
	{"(" .. C .. "ə?" .. C .. ")ə", "%1Ə"},
	{"(" .. C .. ")Ə(" .. C .. ")([Əə])", "%1ə%2Ə"},
	{word_start .. "([ūw]?a?" .. C .. ")ə", "%1Ə"},
	{"ə", ""}, --every schwa not marked as kept is dropped
	{"Ə", "ə"}, --kept schwas get their final spelling

	{"([ʕhḥ])a(" .. word_end .. ")", "^a%1%2"}, --final /a/-guttural inversion
	--penultimate stress: segolates & -áyiC
	{"(" .. C .. "[eɛo])(%+?".. C .. "ɛ" .. C .. ")" .. word_end, "%1^%2"},
	{"(" .. C .. "a)(%+?".. C .. C .. "?a" .. C ..")" .. word_end, "%1^%2"},
	{"ayi(" .. C .. ")" .. word_end, "a^yi%1"},
	--stress marking
	{"a^", "á"},
	{"e^", "é"},
	{"i^", "í"},
	{"o^", "ó"},
	{"u^", "ú"},
	{"ɛ^", "ɛ́"},
	{"ɔ^", "ɔ́"},
	{"ā^", "ā́"},
	{"ē^", "ḗ"},
	{"ī^", "ī́"},
	{"ō^", "ṓ"},
	{"ū^", "ū́"},
	{"ɛ̄^", "ɛ̄́"},
	{"ɔ̄^", "ɔ̄́"},
	{"ê^", "ế"},
	{"ô^", "ố"},
	{"ɛ̂^", "ɛ̂́"},
	{"ɔ̂^", "ɔ̂́"},

	{"ɔyw(" .. word_end .. ")", "ɔw%1"}, --irregular…
	{"(" .. V .. "[+^]?)([bdgptk])(" .. V .. ")", "%1%2%2%3"}, --dagesh bgdkft gemination
	{"f", "p̄"}, --bc p̄ are 2 chars
	{"%s%.", "."}, --quotes: " ." > "." (esthetics)
}
| |
| | |
| --MH
| |
-- One-character substitutions for the MH transliteration. export.MH_tr
-- applies this table to the BH output with gsub(text, '.', m) before the
-- ordered rules in `l` run.
local m = { --direct change
	-- punctuation
	["״"] = "″", --gershayim

	-- the combining acute is turned back into the "^" stress marker
	["́"] = "^", --stress marking conversion below

	-- consonants rewritten to a plain letter
	["ḏ"] = "d",
	["ḡ"] = "g",
	["ś"] = "s",
	["q"] = "k",

	-- long vowels rewritten to the short letter
	["ī"] = "i",
	["ū"] = "u",
}
| |
| | |
| local l = { -- ordered {pattern, replacement} rules; export.MH_tr applies them first to last with gsub, after the map m
| |
| --indirect: rewrites that need more than the one-character lookup done with m
| |
| {"p̄", "f"}, -- the BH rules end by writing f as the two-character p̄; undo that here
| |
| {"[̂̆̄]", ""}, -- strip the combining marks listed in the set
| |
| {"ḥ'", "ḫ"}, -- this rule and the following ones: consonant + geresh mark (') > one character
| |
| {"ṯ'", "T"},
| |
| {"ṭ'", "ẓ"},
| |
| {"g'", "j"},
| |
| {"z'", "Z"},
| |
| {"ṣ'", "C"},
| |
| {"d'", "D"},
| |
| {"[rʕ]'", "ġ"},
| |
| {"(.)%1", "%1"}, -- any character immediately repeated is reduced to one
| |
| {"[ḇw]", "v"},
| |
| {"[ḵḥ]", "K"},
| |
| {"[ṯṭ]", "t"},
| |
| {"'", ""}, -- geresh marks not consumed above are dropped
| |
| {"[ʔʕ]", "'"}, -- from this rule on, ' is the output of this replacement (ʔ or ʕ), not a geresh
| |
| --above: loss of vowel length, loss of gemination, turning n-grams into 1 char, MH mergers.
| |
| | |
| --schwa: ə becomes e in the contexts below, every other ə is deleted
| |
| --prefixes (the two rules below are disabled)
| |
| -- {word_start .. "([bvkKlšdm])ə", "%1e"},
| |
| -- {"(u[bvkKlšdm])ə", "%1e"},
| |
| --initial C clusters
| |
| {word_start .. "([rnmly])ə", "%1e"},
| |
| {word_start .. "(" .. C .. ")ə([h'])", "%1e%2"},
| |
| --internal
| |
| {"([ə+]" .. C .. ")ə", "%1e"},
| |
| {"(" .. C .. C .. ")ə", "%1e"},
| |
| {"[ə+]", ""}, --deletion of remaining schwa and metegim
| |
| | |
| --put here not above to avoid e/ə confusion
| |
| {"[āâă]", "a"},
| |
| {"[ēêɛ]", "e"},
| |
| {"[ōô]", "o"},
| |
| {"[ḗế]", "é"},
| |
| {"[ṓố]", "ó"},
| |
| | |
| {"(" .. word_start .. "[^áéíóú^]-[aeiouɔ])(" .. C .. "?" .. C .. "?)" .. word_end, "%1^%2"}, --module-explicit default final stress...
| |
| --same articulation > schwa insertion (an e is written between the two consonants)
| |
| {"([bp])([bp])", "%1e%2"},
| |
| {"([vf])([vf])", "%1e%2"},
| |
| {"([dt])([dt])", "%1e%2"},
| |
| {"([DTṣ])([DTṣ])", "%1e%2"},
| |
| {"([zs])([zs])", "%1e%2"},
| |
| {"([Zš])([Zš])", "%1e%2"},
| |
| {"([jC])([jC])", "%1e%2"},
| |
| {"([gk])([gk])", "%1e%2"},
| |
| {"(K)(K)", "%1e%2"},
| |
| {"(r)(r)", "%1e%2"},
| |
| {"''", "'e'"},
| |
| | |
| --a/o, including kol: ɔ becomes o in the contexts below, otherwise a
| |
| {"ɔ(" .. C .. C .. ")", "o%1"},
| |
| {"ɔ(" .. C .. ")" .. word_end, "o%1"},
| |
| {"(" .. word_start .. "[kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
| |
| {"([bvkKlšd][ea][kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
| |
| -- {"(m[ei][kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
| |
| {"(" .. word_start .. "u[kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
| |
| {"(ha[kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
| |
| {"ɔ", "a"},
| |
| | |
| {"(" .. word_start .. C .. C .. "?" .. V .. ")^(" .. C .. "?" .. C .. "?" .. word_end .. ")", "%1%2"}, --…reader-implicit acute accent in monosyllabic
| |
| --stress marking: vowel + ^ marker > accented vowel
| |
| {"a^", "á"},
| |
| {"e^", "é"},
| |
| {"i^", "í"},
| |
| {"o^", "ó"},
| |
| {"u^", "ú"},
| |
| --glottal stops: kept when {CV}'V, removed word-initially, before a consonant and word-finally
| |
| {"(" .. word_start .. ")'", "%1"},
| |
| {"'(" .. C .. ")", "%1"},
| |
| {"'(" .. word_end .. ")", "%1"},
| |
| --fake digraphs: ' keeps a real s/z/c/k + h apart from the digraphs written below
| |
| {"([szck])h", "%1'h"},
| |
| --one char > displaying
| |
| {"ṣ", "ts"},
| |
| {"š", "sh"},
| |
| {"T", "t'"},
| |
| {"Z", "zh"},
| |
| {"C", "ch"},
| |
| {"D", "d'"},
| |
| {"K", "kh"},
| |
| }
| |
| | |
| | |
--- Core BH transliteration, shared by BH_tr and MH_tr.
-- Each character is first mapped through the table `c`, then the
-- cantillation marks listed in the set are removed (so that it works for
-- quotes too), and finally every rule of `b` is applied in order.
-- Meteg markers ("+") are still present in the result.
-- @param text string to transliterate
-- @return the transliterated string
function export.BH(text)
	local mapped = s(text, '.', c)
	local result = s(mapped, "[֣֖֣֑֣֣֧֛֖֥֧֛֥֖֑֣֖֥֔֗֗֙֔]", "")
	for _, rule in ipairs(b) do
		result = s(result, rule[1], rule[2])
	end
	return result
end
| |
| | |
--- BH transliteration for display.
-- Runs export.BH and then removes the meteg markers ("+"), which export.BH
-- keeps only because export.MH_tr still needs them.
-- @param text string to transliterate
-- @return the transliterated string (a single value; the substitution
--         count is discarded by the parentheses)
function export.BH_tr(text)
	-- "+" is a pattern quantifier, so the literal marker is written "%+"
	-- instead of relying on how a bare leading "+" happens to be parsed.
	return (s(export.BH(text), "%+", ""))
end
| |
| | |
--- MH transliteration.
-- Starts from export.BH (metegim still present), applies the
-- one-character map `m`, then every rule of `l` in order.
-- A string that contains a gershayim mark and no vowel is treated as an
-- acronym: p̄/ḇ/ḵ are rewritten as p/b/k before the rules run, and the
-- final result is upper-cased.
-- @param text string to transliterate
-- @return the transliterated string
function export.MH_tr(text)
	local result = s(export.BH(text), '.', m) --.BH() to keep metegim, m is applied

	--acronym = gershayim & no V
	local is_acronym = match(result, "″") and not match(result, V)
	if is_acronym then
		result = s(result, "p̄", "p")
		result = s(result, "ḇ", "b")
		result = s(result, "ḵ", "k")
	end

	--in any case, l is applied
	for _, rule in ipairs(l) do
		result = s(result, rule[1], rule[2])
	end

	if is_acronym then
		result = mw.ustring.upper(result)
	end
	return result
end
| |
| | |
--- Transliteration entry point.
-- @param text string to transliterate
-- @param lang language code; selects the MH or BH variant
-- @param sc script code; when absent it is detected from `text` through
--        Module:languages
-- @return the transliteration, or nil when the language code is unknown,
--         the script is not "Hebr", the text has none of the characters in
--         the set below, or `lang` is not one of the handled codes
function export.tr(text, lang, sc)
	if not sc then
		local lang_obj = require("Module:languages").getByCode(lang, nil, true)
		if not lang_obj then
			-- Unknown code: give up instead of indexing a nil value.
			return nil
		end
		sc = lang_obj:findBestScript(text):getCode()
	end

	if sc ~= "Hebr" or not match(text, "[ְ-ֻ־ׇ״]") then
		return nil
	end

	-- NOTE(review): "fulf" does not look like the code one would expect
	-- for the MH variant; it is kept as found. Confirm the intended code.
	if lang == "fulf" then
		return export.MH_tr(text)
	elseif lang == "hbo" then --though useless
		return export.BH_tr(text)
	end

	-- Any other language: no transliteration.
	return nil
end
| |
| | |
--- Template entry point: both transliterations of the first argument.
-- @param frame frame object; frame.args[1] is the text
-- @return the BH and MH transliterations joined by ", "
function export.tr_all(frame)
	local text = frame.args[1]
	local biblical = export.BH_tr(text)
	local modern = export.MH_tr(text)
	return biblical .. ", " .. modern
end
| |
| | |
| --Erutuon's code for code points below
| |
| | |
| --[[
| |
| local Array = require "Module:array"
| |
| local function show_code_point_names(text)
| |
| if not text then return "" end
| |
| local names = Array()
| |
| for cp in gcodepoint(text) do
| |
| -- Remove HEBREW LETTER, HEBREW POINT, etc.
| |
| local name = require "Module:Unicode data".lookup_name(cp)
| |
| :gsub(
| |
| "^HEBREW (%w+) ",
| |
| function(type)
| |
| if type == "ACCENT" then return "ACCENT " else return "" end
| |
| end)
| |
| :lower()
| |
| names:insert(name)
| |
| end
| |
| return names:concat ", "
| |
| end
| |
| | |
| | |
| local old_s = s
| |
| function s(...)
| |
| local old = ...
| |
| local new = old_s(...)
| |
| if old ~= new then
| |
| mw.log(show_code_point_names(old), show_code_point_names(new), ...)
| |
| end
| |
| return new
| |
| end
| |
| --]]
| |
| | |
| return export
| |