Module:grc-translit: Difference between revisions
Created page with "local export = {} local m_data = require('Module:grc-utilities/data') local tokenize = require('Module:grc-utilities').tokenize local ufind = mw.ustring.find local ugsub = m..." |
m 1 revision imported |
||
| (One intermediate revision by one other user not shown) | |||
| Line 1: | Line 1: | ||
local export = {} | local export = {} | ||
local | local m_grc_utils = require("Module:grc-utilities") | ||
local | local m_grc_utils_data = require("Module:grc-utilities/data") | ||
local m_str_utils = require("Module:string utilities") | |||
local | local tokenize = require("Module:grc-utilities").tokenize | ||
local | local canonicalize = m_grc_utils.canonicalize | ||
local concat = table.concat | |||
local insert = table.insert | |||
local split = m_str_utils.split | |||
local u = m_str_utils.char | |||
local ugsub = m_str_utils.gsub | |||
local ulower = m_str_utils.lower | |||
local umatch = mw.ustring.match | |||
local uupper = m_str_utils.upper | |||
-- Diacritics | -- Diacritics | ||
local diacritics = | local diacritic = m_grc_utils_data.diacritic | ||
local diacritics = m_grc_utils_data.diacritics | |||
-- Greek | -- Greek | ||
| Line 19: | Line 25: | ||
local grave = diacritics.grave | local grave = diacritics.grave | ||
local circumflex = diacritics.circum | local circumflex = diacritics.circum | ||
local smooth = diacritics.smooth | local smooth = diacritics.smooth | ||
local rough = diacritics.rough | local rough = diacritics.rough | ||
local breve = diacritics.breve | |||
local macron = diacritics.macron | local macron = diacritics.macron | ||
local subscript = diacritics.subscript | local subscript = diacritics.subscript | ||
local vowel = m_grc_utils_data.vowel | |||
-- Latin | -- Latin | ||
local hat = diacritics.Latin_circum | local hat = diacritics.Latin_circum | ||
-- Pattern for a token that starts with α or υ and ends with an iota subscript
-- (e.g. ᾳ). export.tr adds a macron to the "a"/"u" of such a token.
local au_subscript = "^[αυ].*" .. subscript .. "$"
-- U+037E, the Greek question mark; export.tr converts it to "?".
local question_mark = u(0x37E)
-- Character class of the letters before which export.tr turns γ into "n".
local velar = "[γκξχϙ]"
-- Letters listed here are transliterated by translit_letter as the given Latin
-- letter plus a macron (the macron is left off when a breve follows).
local long_vowels = { -- Macron will be added.
	["η"] = "e",
	["ω"] = "o",
}
local tt = { | local tt = { | ||
| Line 37: | Line 48: | ||
["α"] = "a", | ["α"] = "a", | ||
["ε"] = "e", | ["ε"] = "e", | ||
["ι"] = "i", | ["ι"] = "i", | ||
["ο"] = "o", | ["ο"] = "o", | ||
["υ"] = "u", | ["υ"] = "u", | ||
-- Consonants | -- Consonants | ||
| Line 62: | Line 71: | ||
["χ"] = "kh", | ["χ"] = "kh", | ||
["ψ"] = "ps", | ["ψ"] = "ps", | ||
-- | -- Other letters | ||
["ϛ"] = "st", | |||
["ϝ"] = "w", | ["ϝ"] = "w", | ||
[" | ["ͱ"] = "h", | ||
["ϳ"] = "j", | |||
["ϙ"] = "q", | ["ϙ"] = "q", | ||
[" | ["ϻ"] = "s", | ||
[" | ["ϸ"] = "š", | ||
["ͳ"] = "s", | |||
--["ͷ"] = "v", Differs by dialect. | |||
[" | |||
-- Diacritics | -- Diacritics | ||
-- unchanged: macron, diaeresis, grave, acute | -- unchanged: macron, diaeresis, grave, acute | ||
[smooth] = "", | |||
[smooth] = | [rough] = "", | ||
[rough] = | |||
[circumflex] = hat, | [circumflex] = hat, | ||
[subscript] = | [subscript] = "i", | ||
} | } | ||
-- Finds the first token after index `i` that contains no bracket character
-- (parenthesis, square bracket or curly brace).
--
-- Returns four values:
--   1. the index where the search stopped,
--   2. the token at that index (nil when the tokens ran out),
--   3. that token lowercased with ulower (nil when the tokens ran out),
--   4. the bracket tokens that were skipped, concatenated in order
--      (an empty string when nothing was skipped).
local function get_next_token(tokens, i)
	local first_skipped = i + 1
	local index = first_skipped
	local token = tokens[index]
	-- Step over every token that contains a bracket character.
	while token and token:match("[()[%]{}]") do
		index = index + 1
		token = tokens[index]
	end
	local skipped = concat(tokens, nil, first_skipped, index - 1)
	return index, token, token and ulower(token), skipped
end
-- Transliterates one base character plus the characters trailing it.
--
-- `letter` is the base character and `trail` is the run of %W characters that
-- followed it, as captured by the pattern in do_translit.
--   * A letter found in long_vowels becomes its Latin letter plus a macron,
--     unless `trail` contains a breve, in which case no macron is added.
--   * Any other letter is looked up in `tt`; a letter with no entry is kept.
--   * Every UTF-8 character of `trail` is replaced through `tt` as well, and
--     kept unchanged when `tt` has no entry for it.
-- Returns the resulting string.
local function translit_letter(letter, trail)
	local base = long_vowels[letter]
	if base then
		-- Plain-text search: the breve is a literal character, not a pattern.
		if not trail:find(breve, 1, true) then
			base = base .. macron
		end
	else
		base = tt[letter] or letter
	end
	-- ".[\128-\191]*" matches one UTF-8 character: a byte followed by any
	-- continuation bytes. Assigning to a single local drops gsub's count.
	local marks = trail:gsub(".[\128-\191]*", tt)
	return base .. marks
end
-- Transliterates a single token (export.tr passes it in lowercased).
-- Returns whatever ugsub returns for the final substitution; callers in this
-- file use only the first value, the transliterated string.
local function do_translit(token)
	-- Put iota subscript before accent marks, so that they appear on "i".
	local accents = "([" .. acute .. grave .. circumflex .. "]+)"
	token = ugsub(token, accents .. subscript, subscript .. "%1")
	-- Hand each character, together with the %W run after it, to translit_letter.
	return ugsub(token, "(.)(%W*)", translit_letter)
end
-- Strips every macron from `m` when `m` also contains the Latin circumflex
-- (`hat`); otherwise returns `m` unchanged. Used by insert_translit as the
-- replacement callback for each run of %W characters.
local function remove_macron_if_hat(m)
	-- Plain-text search: `hat` is a literal character, not a pattern.
	if m:find(hat, 1, true) then
		-- The parentheses keep only the string and drop gsub's count.
		return (m:gsub(macron, ""))
	end
	return m
end
-- Cleans up the transliteration of one token, applies capitalisation, and
-- appends it (followed by `suffix`) to the `output` list.
--
--   output            list of output pieces, modified in place
--   translit          transliteration of the token
--   token             the token as it appeared in the text
--   lower_token       the lowercased token; differing from `token` means the
--                     token was capitalised
--   next_token        the following token, or nil
--   next_token_lower  the lowercased following token, or nil
--   suffix            text to emit right after this transliteration
local function insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	-- Remove any duplicate diacritics (this shouldn't really happen).
	local duplicate = "(" .. diacritic .. ")(%W-)%1"
	local n
	repeat
		translit, n = ugsub(translit, duplicate, "%1%2")
	until n == 0
	-- Remove macron from a vowel that has a circumflex.
	translit = ugsub(translit, "%W+", remove_macron_if_hat)
	-- If capitalized, only capitalize the first letter unless the following
	-- token is capitalized as well.
	if token ~= lower_token then
		if next_token == next_token_lower then
			-- Uppercase just the first UTF-8 character. A missing next token
			-- (nil == nil) also lands here.
			translit = translit:gsub("^.[\128-\191]*", uupper)
		else
			translit = uupper(translit)
		end
	end
	insert(output, translit .. suffix)
end
-- Transliterates Greek-script `text` into Latin script.
--
--   text  the string to transliterate
--   lang  language code; only "xbc" (φ becomes "f") and "grc" (runs of ρ
--         end in "rh") get special handling here
--   sc    accepted but not used by this function
--
-- Returns the transliterated string.
function export.tr(text, lang, sc)
	-- A lone rough-breathing sign is simply "h".
	if text == "῾" then
		return "h"
	end

	-- in case of bold/italic text; only works in testcases submodule, not in sandbox, so outcommented
	-- local remove_rough = {
	-- 	['ἱ'] = 'ι', ['ἵ'] = 'ί', ['ἳ'] = 'ὶ', ['ἷ'] = 'ῖ',
	-- 	['ὑ'] = 'υ', ['ὕ'] = 'ύ', ['ὓ'] = 'ὺ', ['ὗ'] = 'ῦ',
	-- }
	-- text = ugsub(text, "([αᾰᾱΑᾸᾹεΕηΗοΟυῠῡΥῨῩωΩ])(\'\'\'?)([ἱἵἳἷὑὕὓὗ])",
	-- 	function(a,b,c)
	-- 		return a .. rough .. b .. remove_rough[c]
	-- 	end)

	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except any that occur in HTML entities. Use split to separate out the
		chunks between any entities.
	]]
	text = split(canonicalize(text), "(&#?%w+;)")
	-- Only every second chunk, starting from the first, is rewritten.
	for i = 1, #text, 2 do
		text[i] = text[i]:gsub(";", "?"):gsub(question_mark, "?")
	end
	text = concat(text)

	-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
	text = text:gsub("·", ";")

	local tokens = tokenize(text)

	-- Now read the tokens. The loop always holds the current token and the
	-- one after it; `suffix` is the bracket text skipped between the two.
	local next_i, next_token, next_token_lower, suffix = get_next_token(tokens, 0)
	-- Any bracket text before the first token goes out first.
	local output = {suffix}
	while next_token do
		local i, token, lower_token, is_rough = next_i, next_token, next_token_lower
		local translit = do_translit(lower_token)
		next_i, next_token, next_token_lower, suffix = get_next_token(tokens, i)

		if lower_token:find("γ") and next_token_lower and umatch(next_token_lower, velar) then
			-- γ before a velar should be <n>
			translit = translit:gsub("g", "n")
		elseif lang == "xbc" and lower_token:find("φ") then
			translit = translit:gsub("ph", "f")
		elseif token == "ρ"..rough then
			translit = "rh"
		elseif token == "ρ"..smooth then
			translit = "r"
		elseif lang == "grc" and lower_token:find("ρ") then
			-- ρ after ρ should be <rh>
			-- Keep adding ρs until they run out. Set is_rough, so that "h" will get appended.
			while next_token_lower and next_token_lower:find("ρ") do
				insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
				i, token, lower_token, is_rough = next_i, next_token, next_token_lower, true
				translit = do_translit(lower_token)
				next_i, next_token, next_token_lower, suffix = get_next_token(tokens, i)
			end
		elseif umatch(lower_token, au_subscript) then
			-- add macron to ᾳ
			translit = translit:gsub("[au]", "%0" .. macron)
		end

		if is_rough or lower_token:find(rough) then
			if umatch(lower_token, vowel) then
				-- Rough breathing on a vowel: "h" goes in front.
				translit = "h" .. translit
			else
				-- Otherwise "h" goes after, unless the last alphanumeric
				-- character of the transliteration is already "h".
				local final = umatch(translit, "(%w)%W*$")
				if final and final ~= "h" then
					translit = translit .. "h"
				end
			end
		end

		insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	end

	return concat(output)
end
return export | return export | ||