Module:grc-translit

From Linguifex
Jump to navigation Jump to search

This module transliterates Ancient Greek text. It should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:grc-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc and in the language specified by the code lang.
When the transliteration fails, returns nil.

-- Table of functions exposed by this module; it is returned at the end of the file.
local export = {}

-- Modules this transliterator depends on.
local m_grc_utils = require("Module:grc-utilities")
local m_grc_utils_data = require("Module:grc-utilities/data")
local m_str_utils = require("Module:string utilities")

-- NOTE(review): this requires the same module that is already bound to
-- m_grc_utils above; m_grc_utils.tokenize would presumably be equivalent.
local tokenize = require("Module:grc-utilities").tokenize

-- Local aliases for the helper functions used throughout this module.
local canonicalize = m_grc_utils.canonicalize
local concat = table.concat
local insert = table.insert
local split = m_str_utils.split
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local ulower = m_str_utils.lower
-- NOTE(review): taken from mw.ustring directly, unlike the neighbouring
-- aliases, which all come from m_str_utils.
local umatch = mw.ustring.match
local uupper = m_str_utils.upper

-- Diacritics
-- `diacritic` is spliced into a pattern by insert_translit; `diacritics` is a
-- table of individual marks, looked up by name below. Both come from
-- Module:grc-utilities/data.
local diacritic = m_grc_utils_data.diacritic
local diacritics = m_grc_utils_data.diacritics

-- Greek
local acute = diacritics.acute
local grave = diacritics.grave
local circumflex = diacritics.circum
local smooth = diacritics.smooth
local rough = diacritics.rough
local breve = diacritics.breve
local macron = diacritics.macron
local subscript = diacritics.subscript
-- Used as a pattern in export.tr to decide whether a rough-breathing token
-- gets its "h" prepended (vowel) or appended (anything else).
local vowel = m_grc_utils_data.vowel

-- Latin
-- The mark that the Greek circumflex is mapped to in the `tt` table.
local hat = diacritics.Latin_circum

-- Matches a token that starts with α or υ and ends with the iota subscript;
-- export.tr adds a macron to the transliterated vowel of such tokens.
local au_subscript = "^[αυ].*" .. subscript .. "$"
-- Code point 0x37E; export.tr replaces it with a regular "?".
local question_mark = u(0x37E)
-- Letters before which export.tr transliterates γ as "n" instead of "g".
local velar = "[γκξχϙ]"

-- Letters handled separately from `tt`: translit_letter appends a macron to
-- the Latin letter unless the trailing diacritics contain a breve.
local long_vowels = { -- Macron will be added.
	["η"] = "e",
	["ω"] = "o",
}

-- Greek-to-Latin lookup table used by translit_letter, both for the base
-- letter of a token and (through gsub) for every character of the diacritics
-- that trail it. Characters with no entry here are passed through unchanged.
local tt = {
	-- Vowels
	["α"] = "a",
	["ε"] = "e",
	["ι"] = "i",
	["ο"] = "o",
	["υ"] = "u",

	-- Consonants
	["β"] = "b",
	["γ"] = "g",
	["δ"] = "d",
	["ζ"] = "z",
	["θ"] = "th",
	["κ"] = "k",
	["λ"] = "l",
	["μ"] = "m",
	["ν"] = "n",
	["ξ"] = "x",
	["π"] = "p",
	["ρ"] = "r",
	["σ"] = "s",
	["ς"] = "s",
	["τ"] = "t",
	["φ"] = "ph",
	["χ"] = "kh",
	["ψ"] = "ps",

	-- Other letters
	["ϛ"] = "st",
	["ϝ"] = "w",
	["ͱ"] = "h",
	["ϳ"] = "j",
	["ϙ"] = "q",
	["ϻ"] = "s",
	["ϸ"] = "š",
	["ͳ"] = "s",
	--["ͷ"] = "v", Differs by dialect.

	-- Diacritics
	-- unchanged: macron, diaeresis, grave, acute
	-- Breathings are deleted here; the "h" for a rough breathing is added
	-- separately in export.tr.
	[smooth] = "",
	[rough] = "",
	[circumflex] = hat,
	[subscript] = "i",
}

--- Locate the first token after index `i` that contains no bracket character.
-- Tokens containing any of ( ) [ ] { } are skipped over.
-- @param tokens  array of token strings
-- @param i       index to start searching after
-- @return index of the token found (one past the end if none is left)
-- @return the token itself, or nil if the array is exhausted
-- @return the lowercased token, or nil if there is none
-- @return the skipped tokens, concatenated in order ("" if none were skipped)
local function get_next_token(tokens, i)
	local first = i + 1
	local pos = first
	while true do
		local candidate = tokens[pos]
		if not (candidate and candidate:match("[()[%]{}]")) then
			return pos, candidate, candidate and ulower(candidate), concat(tokens, nil, first, pos - 1)
		end
		pos = pos + 1
	end
end

--- Transliterate one base letter together with the diacritics that follow it.
-- @param letter  a single character
-- @param trail   the characters following it (may be empty)
-- @return the Latin rendering of `letter` followed by the rendering of `trail`
local function translit_letter(letter, trail)
	local base
	local long = long_vowels[letter]
	if long then
		-- η/ω get a macron unless the trail explicitly marks them with a breve.
		if trail:find(breve) then
			base = long
		else
			base = long .. macron
		end
	else
		-- Letters without a mapping are passed through as they are.
		base = tt[letter] or letter
	end
	-- Map the trail one UTF-8 character at a time (lead byte plus continuation
	-- bytes); characters missing from `tt` are left alone by gsub.
	local marks = trail:gsub(".[\128-\191]*", tt)
	return base .. marks
end

--- Transliterate a whole (lowercased) token.
-- @param token  the token to transliterate
-- @return whatever ugsub returns for the final substitution; callers use the
--         first value, the transliterated string
local function do_translit(token)
	-- Move the iota subscript in front of any accent marks so that the accents
	-- end up on the transliterated "i".
	local accents_then_subscript = "([" .. acute .. grave .. circumflex .. "]+)" .. subscript
	local reordered = ugsub(token, accents_then_subscript, subscript .. "%1")
	-- Hand each character plus its trailing non-alphanumerics to translit_letter.
	return ugsub(reordered, "(.)(%W*)", translit_letter)
end

--- Strip macrons from a run of marks that also contains the Latin circumflex.
-- @param m  a run of non-alphanumeric characters
-- @return `m` without macrons if it contains the hat, otherwise `m` unchanged
local function remove_macron_if_hat(m)
	if not m:find(hat) then
		return m
	end
	local without_macron = m:gsub(macron, "")
	return without_macron
end

--- Clean up one token's transliteration, apply capitalization and append it
-- (followed by `suffix`) to `output`.
-- @param output            array collecting the pieces of the result
-- @param translit          transliteration of the lowercased token
-- @param token             the token as it appeared in the text
-- @param lower_token       the lowercased token
-- @param next_token        the following token, or nil
-- @param next_token_lower  the lowercased following token, or nil
-- @param suffix            text to append after the transliteration
local function insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	-- Collapse repeated diacritics until none are left (this shouldn't really
	-- happen in the first place).
	local duplicate_pattern = "(" .. diacritic .. ")(%W-)%1"
	local replaced
	repeat
		translit, replaced = ugsub(translit, duplicate_pattern, "%1%2")
	until replaced == 0

	-- A vowel carrying a circumflex must not carry a macron as well.
	translit = ugsub(translit, "%W+", remove_macron_if_hat)

	local cased
	if token == lower_token then
		-- Not capitalized: keep as is.
		cased = translit
	elseif next_token == next_token_lower then
		-- Capitalized, but the following token is not (or there is none):
		-- uppercase only the first character.
		local first_char = "^" .. ".[\128-\191]*"
		cased = translit:gsub(first_char, uupper)
	else
		-- The following token is capitalized too: uppercase everything.
		cased = uupper(translit)
	end

	insert(output, cased .. suffix)
end

--- Transliterate Greek-script text into Latin script.
-- @param text  the text to transliterate
-- @param lang  language code; "grc" enables the ρρ -> "rrh" rule and "xbc"
--              renders φ as "f" instead of "ph"
-- @param sc    script code; accepted for interface compatibility but not
--              read by this function
-- @return the transliterated string
function export.tr(text, lang, sc)
	-- This lone character is transliterated outright.
	if text == "῾" then
		return "h"
	end

	-- in case of bold/italic text; only works in testcases submodule, not in sandbox, so outcommented
--	local remove_rough = {
--		['ἱ'] = 'ι', ['ἵ'] = 'ί', ['ἳ'] = 'ὶ', ['ἷ'] = 'ῖ',
--        ['ὑ'] = 'υ', ['ὕ'] = 'ύ', ['ὓ'] = 'ὺ', ['ὗ'] = 'ῦ',
--    }
--	text = ugsub(text, "([αᾰᾱΑᾸᾹεΕηΗοΟυῠῡΥῨῩωΩ])(\'\'\'?)([ἱἵἳἷὑὕὓὗ])",
--		function(a,b,c)
--			return a .. rough .. b .. remove_rough[c]
--		end)

	-- Turn semicolons and Greek question marks into regular question marks,
	-- but leave HTML entities alone: splitting on the entity pattern puts the
	-- plain-text chunks at the odd indices and the entities at the even ones.
	local chunks = split(canonicalize(text), "(&#?%w+;)")
	for idx = 1, #chunks, 2 do
		local plain = chunks[idx]:gsub(";", "?")
		chunks[idx] = plain:gsub(question_mark, "?")
	end
	text = concat(chunks)

	-- The middle dot is equivalent to a semicolon or a colon; semicolon is
	-- probably the more common reading.
	text = text:gsub("·", ";")

	local tokens = tokenize(text)

	-- Current token, its transliteration, and a one-token lookahead.
	-- `suffix` holds the bracket tokens skipped between current and lookahead.
	local idx, token, lower_token, translit
	local next_idx, next_token, next_token_lower, suffix = get_next_token(tokens, 0)
	local output = {suffix}

	-- Make the lookahead token current, transliterate it and refill the lookahead.
	local function advance()
		idx, token, lower_token = next_idx, next_token, next_token_lower
		translit = do_translit(lower_token)
		next_idx, next_token, next_token_lower, suffix = get_next_token(tokens, idx)
	end

	while next_token do
		advance()
		local force_rough = false

		if lower_token:find("γ") and next_token_lower and umatch(next_token_lower, velar) then
			-- γ before a velar is written <n>.
			translit = translit:gsub("g", "n")
		elseif lang == "xbc" and lower_token:find("φ") then
			translit = translit:gsub("ph", "f")
		elseif token == "ρ" .. rough then
			translit = "rh"
		elseif token == "ρ" .. smooth then
			translit = "r"
		elseif lang == "grc" and lower_token:find("ρ") then
			-- A ρ following a ρ is written <rh>: emit each ρ while another one
			-- follows, and remember that the last one needs its "h".
			while next_token_lower and next_token_lower:find("ρ") do
				insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
				advance()
				force_rough = true
			end
		elseif umatch(lower_token, au_subscript) then
			-- Add a macron to the vowel carrying the iota subscript (e.g. ᾳ).
			translit = translit:gsub("[au]", "%0" .. macron)
		end

		if force_rough or lower_token:find(rough) then
			if umatch(lower_token, vowel) then
				-- Rough breathing on a vowel: the "h" goes in front.
				translit = "h" .. translit
			else
				-- Otherwise the "h" goes at the end, unless one is already there.
				local last_letter = umatch(translit, "(%w)%W*$")
				local needs_h = last_letter and last_letter ~= "h"
				if needs_h then
					translit = translit .. "h"
				end
			end
		end

		insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	end

	return concat(output)
end

return export