Module:grc-translit: Difference between revisions

Created page with "local export = {} local m_data = require('Module:grc-utilities/data') local tokenize = require('Module:grc-utilities').tokenize local ufind = mw.ustring.find local ugsub = m..."
 
my method didn't work in the sandbox
Line 1: Line 1:
local export = {}
local export = {}


local m_data = require('Module:grc-utilities/data')
local m_grc_utils = require("Module:grc-utilities")
local tokenize = require('Module:grc-utilities').tokenize
local m_grc_utils_data = require("Module:grc-utilities/data")
local m_str_utils = require("Module:string utilities")


local ufind = mw.ustring.find
local tokenize = require("Module:grc-utilities").tokenize
local ugsub = mw.ustring.gsub
local U = mw.ustring.char
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper


local UTF8char = '[%z\1-\127\194-\244][\128-\191]*'
local canonicalize = m_grc_utils.canonicalize
local concat = table.concat
local insert = table.insert
local split = m_str_utils.split
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local ulower = m_str_utils.lower
local umatch = mw.ustring.match
local uupper = m_str_utils.upper


-- Diacritics
-- Diacritics
local diacritics = m_data.named
local diacritic = m_grc_utils_data.diacritic
local diacritics = m_grc_utils_data.diacritics


-- Greek
-- Greek
Line 19: Line 25:
local grave = diacritics.grave
local grave = diacritics.grave
local circumflex = diacritics.circum
local circumflex = diacritics.circum
local diaeresis = diacritics.diaeresis
local smooth = diacritics.smooth
local smooth = diacritics.smooth
local rough = diacritics.rough
local rough = diacritics.rough
local breve = diacritics.breve
local macron = diacritics.macron
local macron = diacritics.macron
local breve = diacritics.breve
local subscript = diacritics.subscript
local subscript = diacritics.subscript
local vowel = m_grc_utils_data.vowel


-- Latin
-- Latin
local hat = diacritics.Latin_circum
local hat = diacritics.Latin_circum


local macron_diaeresis = macron .. diaeresis .. "?" .. hat
local au_subscript = "^[αυ].*" .. subscript .. "$"
local a_subscript = '^[αΑ].*' .. subscript .. '$'
local question_mark = u(0x37E)
local velar = 'κγχξ'
local velar = "[γκξχϙ]"
 
local long_vowels = { -- Macron will be added.
["η"] = "e",
["ω"] = "o",
}


local tt = {
local tt = {
Line 37: Line 48:
["α"] = "a",
["α"] = "a",
["ε"] = "e",
["ε"] = "e",
["η"] = "e" .. macron,
["ι"] = "i",
["ι"] = "i",
["ο"] = "o",
["ο"] = "o",
["υ"] = "u",
["υ"] = "u",
["ω"] = "o" .. macron,


-- Consonants
-- Consonants
Line 62: Line 71:
["χ"] = "kh",
["χ"] = "kh",
["ψ"] = "ps",
["ψ"] = "ps",
 
-- Archaic letters
-- Other letters
["ϛ"] = "st",
["ϝ"] = "w",
["ϝ"] = "w",
["ϻ"] = "ś",
["ͱ"] = "h",
["ϳ"] = "j",
["ϙ"] = "q",
["ϙ"] = "q",
["ϡ"] = "š",
["ϻ"] = "s",
["ͷ"] = "v",
["ϸ"] = "š",
["ͳ"] = "s",
-- Incorrect characters: see [[Wiktionary:About Ancient Greek#Miscellaneous]].
--["ͷ"] = "v", Differs by dialect.
-- These are tracked by [[Module:script utilities]].
 
["ϐ"] = "b",
["ϑ"] = "th",
["ϰ"] = "k",
["ϱ"] = "r",
["ϲ"] = "s",
["ϕ"] = "ph",
-- Diacritics
-- Diacritics
-- unchanged: macron, diaeresis, grave, acute
-- unchanged: macron, diaeresis, grave, acute
[breve] = '',
[smooth] = "",
[smooth] = '',
[rough] = "",
[rough] = '',
[circumflex] = hat,
[circumflex] = hat,
[subscript] = 'i',
[subscript] = "i",
}
}
-- Looks for the first token after position `i` that contains none of the
-- bracket characters ( ) [ ] { }.
--
-- Returns four values:
--   1. the index at which the search stopped,
--   2. the token found there (nil if the token list ran out),
--   3. that token passed through `ulower` (nil if there is no token),
--   4. the skipped bracket-containing tokens joined into one string.
local function get_next_token(tokens, i)
	local first = i + 1
	local index = first
	local candidate = tokens[index]
	-- Step over every token that contains a bracket character.
	while candidate and candidate:match("[()[%]{}]") do
		index = index + 1
		candidate = tokens[index]
	end
	local lowered = candidate and ulower(candidate)
	local skipped = concat(tokens, nil, first, index - 1)
	return index, candidate, lowered, skipped
end
-- Transliterates one base character plus the run of characters trailing it.
--
-- `letter` is looked up in `long_vowels` first; a hit receives a macron unless
-- `trail` contains a breve. Otherwise `tt` is consulted, and a character found
-- in neither table is passed through unchanged. Each character of `trail` is
-- then replaced via `tt` and appended to the result.
local function translit_letter(letter, trail)
	local base
	local long = long_vowels[letter]
	if long then
		if trail:find(breve) then
			-- An explicit breve marks the vowel as short: no macron.
			base = long
		else
			base = long .. macron
		end
	else
		base = tt[letter] or letter
	end
	-- Walk the trail one UTF-8 character at a time (a lead byte followed by
	-- its continuation bytes), substituting through `tt`.
	local marks = trail:gsub(".[\128-\191]*", tt)
	return base .. marks
end
-- Transliterates a single token by running `translit_letter` over every
-- character together with the non-alphanumeric characters that follow it.
local function do_translit(token)
	-- Move the iota subscript in front of any accent marks, so that the
	-- accents end up on the resulting "i".
	local accents = "([" .. acute .. grave .. circumflex .. "]+)"
	local reordered = ugsub(token, accents .. subscript, subscript .. "%1")
	return ugsub(reordered, "(.)(%W*)", translit_letter)
end
-- Returns `m` with every macron removed when `m` contains the Latin
-- circumflex (`hat`); otherwise returns `m` unchanged.
local function remove_macron_if_hat(m)
	if not m:find(hat) then
		return m
	end
	local stripped = m:gsub(macron, "")
	return stripped
end
-- Cleans up the transliteration of one token, applies capitalisation, and
-- appends the result followed by `suffix` to the `output` list.
--
-- `token` / `lower_token` are the current token and its lowercase form;
-- `next_token` / `next_token_lower` are the same pair for the following token.
local function insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	-- Collapse duplicated diacritics (this shouldn't really happen); repeat
	-- until a pass makes no replacement.
	local duplicate = "(" .. diacritic .. ")(%W-)%1"
	local replaced
	repeat
		translit, replaced = ugsub(translit, duplicate, "%1%2")
	until replaced == 0

	-- A vowel carrying a circumflex must not also carry a macron.
	translit = ugsub(translit, "%W+", remove_macron_if_hat)

	-- Capitalised token: uppercase only the first character, unless the next
	-- token is capitalised too, in which case uppercase everything.
	local cased
	if token == lower_token then
		cased = translit
	elseif next_token == next_token_lower then
		cased = translit:gsub("^" .. ".[\128-\191]*", uupper)
	else
		cased = uupper(translit)
	end

	insert(output, cased .. suffix)
end


-- Transliterates Greek-script `text` into Latin script and returns the result.
--   text: the string to transliterate; an empty string yields "h".
--   lang: language code; the code below special-cases "xbc" (ph -> f) and
--         "grc" (runs of consecutive ρ).
--   sc:   not referenced anywhere in this function body.
-- NOTE(review): this span interleaves the old and new revisions of the
-- function line by line (each statement appears next to its counterpart from
-- the other revision), so it is not runnable as written. Confirm which
-- revision is intended before editing the logic.
function export.tr(text, lang, sc)
function export.tr(text, lang, sc)
if text == '' then
if text == "" then
return 'h'
return "h"
end
end
-- In case of bold/italic text; only works in the testcases submodule, not in the sandbox, so commented out.
-- local remove_rough = {
-- ['ἱ'] = 'ι', ['ἵ'] = 'ί', ['ἳ'] = 'ὶ', ['ἷ'] = 'ῖ',
--        ['ὑ'] = 'υ', ['ὕ'] = 'ύ', ['ὓ'] = 'ὺ', ['ὗ'] = 'ῦ',
--    }
-- text = ugsub(text, "([αᾰᾱΑᾸᾹεΕηΗοΟυῠῡΥῨῩωΩ])(\'\'\'?)([ἱἵἳἷὑὕὓὗ])",
-- function(a,b,c)
-- return a .. rough .. b .. remove_rough[c]
-- end)
--[[
--[[
Replace semicolon or Greek question mark with regular question mark,
Replace semicolon or Greek question mark with regular question mark,
except after an ASCII alphanumeric character (to avoid converting
except any that occur in HTML entities. Use split to separate out the
semicolons in HTML entities).
chunks between any entities.
]]
]]
text = ugsub(text, "([^A-Za-z0-9])[;" .. U(0x37E) .. "]", "%1?")
text = split(canonicalize(text), "(&#?%w+;)")
for i = 1, #text, 2 do
text[i] = text[i]:gsub(";", "?"):gsub(question_mark, "?")
end
text = concat(text)
 
-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
text = text:gsub("·", ";")
text = text:gsub("·", ";")
 
local tokens = tokenize(text)
local tokens = tokenize(text)


--now read the tokens
--now read the tokens
local output = {}
local next_i, next_token, next_token_lower, suffix = get_next_token(tokens, 0)
for i, token in pairs(tokens) do
local output = {suffix}
-- Convert token to lowercase and substitute each character
while next_token do
-- for its transliteration
local i, token, lower_token, is_rough = next_i, next_token, next_token_lower
local translit = ulower(token):gsub(UTF8char, tt)
local translit = do_translit(lower_token)
next_i, next_token, next_token_lower, suffix = get_next_token(tokens, i)
local next_token = tokens[i + 1]
 
-- γ before a velar should be <n>
if token == 'γ' and next_token and velar:find(next_token, 1, true) then
if lower_token:find("γ") and next_token_lower and umatch(next_token_lower, velar) then
-- γ before a velar should be <n>
translit = translit:gsub("g", "n")
translit = 'n'
elseif lang == "xbc" and lower_token:find("φ") then
elseif token == 'ρ' and tokens[i - 1] == 'ρ' then
translit = translit:gsub("ph", "f")
-- ρ after ρ should be <rh>
elseif token == "ρ"..rough then
translit = 'rh'
translit = "rh"
elseif ufind(token, a_subscript) then
elseif token == "ρ"..smooth then
-- add macron to ᾳ
translit = "r"
translit = ugsub(translit, '([aA])', '%1' .. macron)
-- ρ after ρ should be <rh>
elseif lang == "grc" and lower_token:find("ρ") then
-- Keep adding ρs until they run out. Set is_rough, so that "h" will get appended.
while next_token_lower and next_token_lower:find("ρ") do
insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
i, token, lower_token, is_rough = next_i, next_token, next_token_lower, true
translit = do_translit(lower_token)
next_i, next_token, next_token_lower, suffix = get_next_token(tokens, i)
end
-- add macron to ᾳ
elseif umatch(lower_token, au_subscript) then
translit = translit:gsub("[au]", "%0" .. macron)
end
end
 
if token:find(rough) then
if is_rough or lower_token:find(rough) then
if ufind(token, '^[Ρρ]') then
if umatch(lower_token, vowel) then
translit = translit .. 'h'
translit = "h" .. translit
else -- vowel
else
translit = 'h' .. translit
local final = umatch(translit, "(%w)%W*$")
if final and final ~= "h" then
translit = translit .. "h"
end
end
end
end
end
 
-- Remove macron from a vowel that has a circumflex.
insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
if ufind(translit, macron_diaeresis) then
translit = translit:gsub(macron, '')
end
-- Capitalize first character of transliteration.
if token ~= ulower(token) then
translit = translit:gsub("^" .. UTF8char, uupper)
end
table.insert(output, translit)
end
end
output = table.concat(output)
 
return concat(output)
return output
end
end


return export
return export