Module:grc-translit: Difference between revisions
Jump to navigation
Jump to search
Created page with "local export = {} local m_data = require('Module:grc-utilities/data') local tokenize = require('Module:grc-utilities').tokenize local ufind = mw.ustring.find local ugsub = m..." |
my method didn't work in the sandbox |
||
| Line 1: | Line 1: | ||
local export = {} | local export = {} | ||
local | local m_grc_utils = require("Module:grc-utilities") | ||
local | local m_grc_utils_data = require("Module:grc-utilities/data") | ||
local m_str_utils = require("Module:string utilities") | |||
local | local tokenize = require("Module:grc-utilities").tokenize | ||
local | local canonicalize = m_grc_utils.canonicalize | ||
local concat = table.concat | |||
local insert = table.insert | |||
local split = m_str_utils.split | |||
local u = m_str_utils.char | |||
local ugsub = m_str_utils.gsub | |||
local ulower = m_str_utils.lower | |||
local umatch = mw.ustring.match | |||
local uupper = m_str_utils.upper | |||
-- Diacritics | -- Diacritics | ||
local diacritics = | local diacritic = m_grc_utils_data.diacritic | ||
local diacritics = m_grc_utils_data.diacritics | |||
-- Greek | -- Greek | ||
| Line 19: | Line 25: | ||
local grave = diacritics.grave | local grave = diacritics.grave | ||
local circumflex = diacritics.circum | local circumflex = diacritics.circum | ||
local smooth = diacritics.smooth | local smooth = diacritics.smooth | ||
local rough = diacritics.rough | local rough = diacritics.rough | ||
local breve = diacritics.breve | |||
local macron = diacritics.macron | local macron = diacritics.macron | ||
local subscript = diacritics.subscript | local subscript = diacritics.subscript | ||
local vowel = m_grc_utils_data.vowel | |||
-- Latin | -- Latin | ||
local hat = diacritics.Latin_circum | local hat = diacritics.Latin_circum | ||
local | local au_subscript = "^[αυ].*" .. subscript .. "$" | ||
local question_mark = u(0x37E) | |||
local velar = | local velar = "[γκξχϙ]" | ||
local long_vowels = { -- Macron will be added. | |||
["η"] = "e", | |||
["ω"] = "o", | |||
} | |||
local tt = { | local tt = { | ||
| Line 37: | Line 48: | ||
["α"] = "a", | ["α"] = "a", | ||
["ε"] = "e", | ["ε"] = "e", | ||
["ι"] = "i", | ["ι"] = "i", | ||
["ο"] = "o", | ["ο"] = "o", | ||
["υ"] = "u", | ["υ"] = "u", | ||
-- Consonants | -- Consonants | ||
| Line 62: | Line 71: | ||
["χ"] = "kh", | ["χ"] = "kh", | ||
["ψ"] = "ps", | ["ψ"] = "ps", | ||
-- | -- Other letters | ||
["ϛ"] = "st", | |||
["ϝ"] = "w", | ["ϝ"] = "w", | ||
[" | ["ͱ"] = "h", | ||
["ϳ"] = "j", | |||
["ϙ"] = "q", | ["ϙ"] = "q", | ||
[" | ["ϻ"] = "s", | ||
[" | ["ϸ"] = "š", | ||
["ͳ"] = "s", | |||
--["ͷ"] = "v", Differs by dialect. | |||
[" | |||
-- Diacritics | -- Diacritics | ||
-- unchanged: macron, diaeresis, grave, acute | -- unchanged: macron, diaeresis, grave, acute | ||
[smooth] = "", | |||
[smooth] = | [rough] = "", | ||
[rough] = | |||
[circumflex] = hat, | [circumflex] = hat, | ||
[subscript] = | [subscript] = "i", | ||
} | } | ||
local function get_next_token(tokens, i) | |||
local new = i + 1 | |||
local token = tokens[new] | |||
while token and token:match("[()[%]{}]") do | |||
new = new + 1 | |||
token = tokens[new] | |||
end | |||
return new, token, token and ulower(token), concat(tokens, nil, i + 1, new - 1) | |||
end | |||
local function translit_letter(letter, trail) | |||
local tr = long_vowels[letter] | |||
return (tr and (tr .. (trail:find(breve) and "" or macron)) or tt[letter] or letter) .. trail:gsub(".[\128-\191]*", tt) | |||
end | |||
local function do_translit(token) | |||
-- Put iota subscript before accent marks, so that they appear on "i". | |||
token = ugsub(token, "([" .. acute .. grave .. circumflex .. "]+)" .. subscript, subscript .. "%1") | |||
return ugsub(token, "(.)(%W*)", translit_letter) | |||
end | |||
local function remove_macron_if_hat(m) | |||
return m:find(hat) and m:gsub(macron, "") or m | |||
end | |||
local function insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix) | |||
-- Remove any duplicate diacritics (this shouldn't really happen). | |||
local n | |||
repeat | |||
translit, n = ugsub(translit, "(" .. diacritic .. ")(%W-)%1", "%1%2") | |||
until n == 0 | |||
-- Remove macron from a vowel that has a circumflex. | |||
translit = ugsub(translit, "%W+", remove_macron_if_hat) | |||
-- If capitalized, only capitalize the first letter unless the following token is capitalized as well. | |||
insert( | |||
output, | |||
(token == lower_token and translit or | |||
next_token == next_token_lower and translit:gsub("^" .. ".[\128-\191]*", uupper) or | |||
uupper(translit) | |||
) .. suffix | |||
) | |||
end | |||
function export.tr(text, lang, sc) | function export.tr(text, lang, sc) | ||
if text == | if text == "῾" then | ||
return | return "h" | ||
end | end | ||
-- in case of bold/italic text; only works in testcases submodule, not in sandbox, so outcommented | |||
-- local remove_rough = { | |||
-- ['ἱ'] = 'ι', ['ἵ'] = 'ί', ['ἳ'] = 'ὶ', ['ἷ'] = 'ῖ', | |||
-- ['ὑ'] = 'υ', ['ὕ'] = 'ύ', ['ὓ'] = 'ὺ', ['ὗ'] = 'ῦ', | |||
-- } | |||
-- text = ugsub(text, "([αᾰᾱΑᾸᾹεΕηΗοΟυῠῡΥῨῩωΩ])(\'\'\'?)([ἱἵἳἷὑὕὓὗ])", | |||
-- function(a,b,c) | |||
-- return a .. rough .. b .. remove_rough[c] | |||
-- end) | |||
--[[ | --[[ | ||
Replace semicolon or Greek question mark with regular question mark, | Replace semicolon or Greek question mark with regular question mark, | ||
except | except any that occur in HTML entities. Use split to separate out the | ||
chunks between any entities. | |||
]] | ]] | ||
text = | text = split(canonicalize(text), "(&#?%w+;)") | ||
for i = 1, #text, 2 do | |||
text[i] = text[i]:gsub(";", "?"):gsub(question_mark, "?") | |||
end | |||
text = concat(text) | |||
-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common. | -- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common. | ||
text = text:gsub("·", ";") | text = text:gsub("·", ";") | ||
local tokens = tokenize(text) | local tokens = tokenize(text) | ||
--now read the tokens | --now read the tokens | ||
local output = {} | local next_i, next_token, next_token_lower, suffix = get_next_token(tokens, 0) | ||
local output = {suffix} | |||
while next_token do | |||
local i, token, lower_token, is_rough = next_i, next_token, next_token_lower | |||
local translit = | local translit = do_translit(lower_token) | ||
next_i, next_token, next_token_lower, suffix = get_next_token(tokens, i) | |||
-- γ before a velar should be <n> | |||
if | if lower_token:find("γ") and next_token_lower and umatch(next_token_lower, velar) then | ||
translit = translit:gsub("g", "n") | |||
translit = | elseif lang == "xbc" and lower_token:find("φ") then | ||
elseif token == | translit = translit:gsub("ph", "f") | ||
-- ρ after ρ should be <rh> | elseif token == "ρ"..rough then | ||
translit = | translit = "rh" | ||
elseif token == "ρ"..smooth then | |||
-- add macron to ᾳ | translit = "r" | ||
translit = | -- ρ after ρ should be <rh> | ||
elseif lang == "grc" and lower_token:find("ρ") then | |||
-- Keep adding ρs until they run out. Set is_rough, so that "h" will get appended. | |||
while next_token_lower and next_token_lower:find("ρ") do | |||
insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix) | |||
i, token, lower_token, is_rough = next_i, next_token, next_token_lower, true | |||
translit = do_translit(lower_token) | |||
next_i, next_token, next_token_lower, suffix = get_next_token(tokens, i) | |||
end | |||
-- add macron to ᾳ | |||
elseif umatch(lower_token, au_subscript) then | |||
translit = translit:gsub("[au]", "%0" .. macron) | |||
end | end | ||
if | if is_rough or lower_token:find(rough) then | ||
if | if umatch(lower_token, vowel) then | ||
translit = | translit = "h" .. translit | ||
else | else | ||
translit = | local final = umatch(translit, "(%w)%W*$") | ||
if final and final ~= "h" then | |||
translit = translit .. "h" | |||
end | |||
end | end | ||
end | end | ||
insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix) | |||
end | end | ||
return concat(output) | |||
end | end | ||
return export | return export | ||
Revision as of 10:59, 28 August 2025
- The following documentation is generated by Module:documentation/functions/translit. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
This module will transliterate Ancient Greek language text.
The module should preferably not be called directly from templates or other modules.
To use it from a template, use {{xlit}}.
Within a module, use Module:languages#Language:transliterate.
For testcases, see Module:grc-translit/testcases.
Functions
tr(text, lang, sc) - Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
- When the transliteration fails, returns nil.
local export = {}
local m_grc_utils = require("Module:grc-utilities")
local m_grc_utils_data = require("Module:grc-utilities/data")
local m_str_utils = require("Module:string utilities")
local tokenize = require("Module:grc-utilities").tokenize
local canonicalize = m_grc_utils.canonicalize
local concat = table.concat
local insert = table.insert
local split = m_str_utils.split
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local ulower = m_str_utils.lower
local umatch = mw.ustring.match
local uupper = m_str_utils.upper
-- Diacritics
-- `diacritic` is spliced into a ustring pattern by insert_translit to match
-- one diacritic; `diacritics` is a table of individual marks keyed by name.
-- Both come from Module:grc-utilities/data.
local diacritic = m_grc_utils_data.diacritic
local diacritics = m_grc_utils_data.diacritics
-- Greek
-- Individual marks as they appear in (canonicalized) Greek input.
local acute = diacritics.acute
local grave = diacritics.grave
local circumflex = diacritics.circum
local smooth = diacritics.smooth
local rough = diacritics.rough
local breve = diacritics.breve
local macron = diacritics.macron
local subscript = diacritics.subscript
-- Pattern used by export.tr to test whether a token is a vowel.
local vowel = m_grc_utils_data.vowel
-- Latin
-- Circumflex used in the Latin output in place of the Greek circumflex.
local hat = diacritics.Latin_circum
-- Matches a token that starts with α or υ and ends with an iota subscript.
local au_subscript = "^[αυ].*" .. subscript .. "$"
-- U+037E, the Greek question mark.
local question_mark = u(0x37E)
-- Letters before which γ is transliterated as "n" instead of "g".
local velar = "[γκξχϙ]"
-- Inherently long vowels; translit_letter appends a macron to the Latin
-- letter unless the token carries a breve.
local long_vowels = { -- Macron will be added.
["η"] = "e",
["ω"] = "o",
}
-- Transliteration table: maps one lowercase Greek letter or one combining
-- diacritic to its Latin replacement. translit_letter uses it both for the
-- base letter and, through gsub with a table replacement, for each character
-- of the trailing diacritics. Characters with no entry are left unchanged.
-- η and ω are deliberately absent; they are handled through long_vowels.
local tt = {
-- Vowels
["α"] = "a",
["ε"] = "e",
["ι"] = "i",
["ο"] = "o",
["υ"] = "u",
-- Consonants
["β"] = "b",
["γ"] = "g",
["δ"] = "d",
["ζ"] = "z",
["θ"] = "th",
["κ"] = "k",
["λ"] = "l",
["μ"] = "m",
["ν"] = "n",
["ξ"] = "x",
["π"] = "p",
["ρ"] = "r",
["σ"] = "s",
["ς"] = "s",
["τ"] = "t",
["φ"] = "ph",
["χ"] = "kh",
["ψ"] = "ps",
-- Other letters
["ϛ"] = "st",
["ϝ"] = "w",
["ͱ"] = "h",
["ϳ"] = "j",
["ϙ"] = "q",
["ϻ"] = "s",
["ϸ"] = "š",
["ͳ"] = "s",
--["ͷ"] = "v", Differs by dialect.
-- Diacritics
-- unchanged: macron, diaeresis, grave, acute
-- Breathings are deleted here; export.tr adds "h" for rough breathing itself.
[smooth] = "",
[rough] = "",
[circumflex] = hat,
-- Iota subscript is written out as a full "i".
[subscript] = "i",
}
-- Looks ahead from position `i` in `tokens` for the next token that contains
-- none of the bracket characters ( ) [ ] { }.
-- Returns four values:
--   1. the index of that token (one past the end if none is left),
--   2. the token itself, or nil when the list is exhausted,
--   3. the token lowercased with ulower, or nil when there is no token,
--   4. the concatenation of every bracket token that was stepped over.
local function get_next_token(tokens, i)
	local first = i + 1
	local pos = i
	local candidate
	-- Advance at least once, then keep going while the token holds a bracket.
	repeat
		pos = pos + 1
		candidate = tokens[pos]
	until not (candidate and candidate:match("[()[%]{}]"))
	local lowered = candidate and ulower(candidate)
	local skipped = concat(tokens, nil, first, pos - 1)
	return pos, candidate, lowered, skipped
end
-- gsub callback used by do_translit. `letter` is one character and `trail`
-- is the run of non-alphanumeric characters that follows it (the pattern is
-- "(.)(%W*)"; in practice the trail is presumably combining diacritics).
-- Returns the Latin form of the letter followed by the converted trail.
local function translit_letter(letter, trail)
	local base = long_vowels[letter]
	if base then
		-- η/ω are long by nature: mark them with a macron unless the
		-- token explicitly carries a breve.
		if not trail:find(breve) then
			base = base .. macron
		end
	else
		-- Ordinary letter; anything not in the table passes through as-is.
		base = tt[letter] or letter
	end
	-- Convert the trail one UTF-8 character at a time through the table;
	-- characters without an entry are kept unchanged.
	local converted_trail = trail:gsub(".[\128-\191]*", tt)
	return base .. converted_trail
end
-- Transliterates a single (lowercase) token letter by letter.
-- Returns the results of the final ugsub call unchanged: the transliterated
-- string first; callers in this file only use that first value.
local function do_translit(token)
	-- Move an iota subscript in front of any run of accent marks, so that
	-- after transliteration the accents sit on the "i" it turns into.
	local accent_run = "([" .. acute .. grave .. circumflex .. "]+)"
	local reordered = ugsub(token, accent_run .. subscript, subscript .. "%1")
	-- Each character plus its trailing non-alphanumerics goes to translit_letter.
	return ugsub(reordered, "(.)(%W*)", translit_letter)
end
-- gsub callback: given a run of non-alphanumeric characters `m`, strips every
-- macron from it when the run also contains the Latin circumflex; otherwise
-- returns the run untouched. Always returns exactly one value.
local function remove_macron_if_hat(m)
	if not m:find(hat) then
		return m
	end
	local without_macron = m:gsub(macron, "")
	return without_macron
end
-- Cleans up one token's transliteration, applies capitalization, and appends
-- the result (followed by `suffix`) to the `output` list.
--   output            list of output pieces, appended to in place
--   translit          raw Latin transliteration of the token
--   token/lower_token the original token and its lowercase form
--   next_token/next_token_lower  the following token and its lowercase form
--                     (both nil at the end of the input)
--   suffix            text (skipped bracket tokens) to place after this token
local function insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	-- Collapse repeated diacritics (not expected to occur) until a pass
	-- makes no further substitution.
	local duplicate = "(" .. diacritic .. ")(%W-)%1"
	local replaced
	repeat
		translit, replaced = ugsub(translit, duplicate, "%1%2")
	until replaced == 0
	-- A vowel carrying a circumflex does not also keep its macron.
	translit = ugsub(translit, "%W+", remove_macron_if_hat)
	-- A token that differs from its lowercase form was capitalized.
	if token ~= lower_token then
		if next_token == next_token_lower then
			-- Following token is not capitalized (or absent): uppercase
			-- only the first UTF-8 character.
			translit = translit:gsub("^" .. ".[\128-\191]*", uupper)
		else
			-- Following token is capitalized too: uppercase everything.
			translit = uupper(translit)
		end
	end
	insert(output, translit .. suffix)
end
-- Transliterates Greek-script `text` into Latin script and returns the result
-- as a single string.
--   text  the string to transliterate
--   lang  language code; "grc" enables the doubled-ρ rule ("ρρ" -> "rrh") and
--         "xbc" makes φ come out as "f" instead of "ph"
--   sc    script code; accepted but not read anywhere in this function
function export.tr(text, lang, sc)
-- Input consisting solely of the character ῾ is transliterated directly.
if text == "῾" then
return "h"
end
-- Disabled code, kept for reference: it was meant to handle bold/italic
-- markup inside the text, but reportedly only worked in the testcases
-- submodule and not in the sandbox, so it is commented out.
-- local remove_rough = {
-- ['ἱ'] = 'ι', ['ἵ'] = 'ί', ['ἳ'] = 'ὶ', ['ἷ'] = 'ῖ',
-- ['ὑ'] = 'υ', ['ὕ'] = 'ύ', ['ὓ'] = 'ὺ', ['ὗ'] = 'ῦ',
-- }
-- text = ugsub(text, "([αᾰᾱΑᾸᾹεΕηΗοΟυῠῡΥῨῩωΩ])(\'\'\'?)([ἱἵἳἷὑὕὓὗ])",
-- function(a,b,c)
-- return a .. rough .. b .. remove_rough[c]
-- end)
--[[
Replace semicolon or Greek question mark with regular question mark,
except any that occur in HTML entities. Use split to separate out the
chunks between any entities.
NOTE(review): the loop below assumes split() places the captured entities
at the even indices, so only odd-indexed chunks are rewritten - confirm
against Module:string utilities.
]]
text = split(canonicalize(text), "(&#?%w+;)")
for i = 1, #text, 2 do
text[i] = text[i]:gsub(";", "?"):gsub(question_mark, "?")
end
text = concat(text)
-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
-- This runs after the semicolon replacement above, so the semicolons
-- produced here are not turned into question marks.
text = text:gsub("·", ";")
local tokens = tokenize(text)
-- Now read the tokens, always keeping one token of look-ahead.
-- `suffix` holds any bracket tokens skipped while looking ahead; the ones
-- skipped before the first real token seed the output list.
local next_i, next_token, next_token_lower, suffix = get_next_token(tokens, 0)
local output = {suffix}
while next_token do
-- Make the look-ahead token current; is_rough starts out nil.
local i, token, lower_token, is_rough = next_i, next_token, next_token_lower
local translit = do_translit(lower_token)
-- Refill the look-ahead.
next_i, next_token, next_token_lower, suffix = get_next_token(tokens, i)
-- γ before a velar should be <n>
if lower_token:find("γ") and next_token_lower and umatch(next_token_lower, velar) then
translit = translit:gsub("g", "n")
-- For language code "xbc", φ is <f> rather than <ph>.
elseif lang == "xbc" and lower_token:find("φ") then
translit = translit:gsub("ph", "f")
-- Lowercase ρ with an explicit breathing mark. The comparison is against
-- `token`, not `lower_token`, so a capitalized token does not match here.
elseif token == "ρ"..rough then
translit = "rh"
elseif token == "ρ"..smooth then
translit = "r"
-- ρ after ρ should be <rh>
elseif lang == "grc" and lower_token:find("ρ") then
-- Keep adding ρs until they run out. Set is_rough, so that "h" will get appended.
-- Each pass emits the current ρ and promotes the following ρ to current,
-- so only the last ρ of the run reaches the rough-breathing step below.
while next_token_lower and next_token_lower:find("ρ") do
insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
i, token, lower_token, is_rough = next_i, next_token, next_token_lower, true
translit = do_translit(lower_token)
next_i, next_token, next_token_lower, suffix = get_next_token(tokens, i)
end
-- add macron to ᾳ
-- (the pattern covers tokens starting with α or υ and ending in iota subscript)
elseif umatch(lower_token, au_subscript) then
translit = translit:gsub("[au]", "%0" .. macron)
end
-- Rough breathing: "h" goes before a vowel token and after a consonant
-- token, unless the transliteration already ends in "h".
if is_rough or lower_token:find(rough) then
if umatch(lower_token, vowel) then
translit = "h" .. translit
else
-- Last alphanumeric character, ignoring any trailing diacritics.
local final = umatch(translit, "(%w)%W*$")
if final and final ~= "h" then
translit = translit .. "h"
end
end
end
insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
end
return concat(output)
end
return export