Module:grc-translit: Difference between revisions

Revision as of 10:59, 28 August 2025

The following documentation is generated by Module:documentation/functions/translit. ^[edit]

Useful links: subpage list • links • transclusions • testcases • sandbox

This module will transliterate Ancient Greek language text. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:grc-translit/testcases.

Functions

tr(text, lang, sc): Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang. When the transliteration fails, returns nil.

local export = {}

-- Project modules. Each module is required exactly once; every helper taken
-- from Module:grc-utilities comes from the single handle below.
local m_grc_utils = require("Module:grc-utilities")
local m_grc_utils_data = require("Module:grc-utilities/data")
local m_str_utils = require("Module:string utilities")

-- Local aliases for the functions used throughout this module.
local canonicalize = m_grc_utils.canonicalize
local concat = table.concat
local insert = table.insert
local split = m_str_utils.split
-- Previously obtained through a second require() of the same module.
local tokenize = m_grc_utils.tokenize
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local ulower = m_str_utils.lower
local umatch = mw.ustring.match
local uupper = m_str_utils.upper

-- Diacritics: `diacritic` is interpolated into a pattern below, while
-- `diacritics` holds the individual marks by name.
local diacritic, diacritics = m_grc_utils_data.diacritic, m_grc_utils_data.diacritics

-- Greek marks, grouped by kind: accents, breathings, length marks.
local acute, grave, circumflex = diacritics.acute, diacritics.grave, diacritics.circum
local smooth, rough = diacritics.smooth, diacritics.rough
local breve, macron = diacritics.breve, diacritics.macron
local subscript = diacritics.subscript
-- Used as a pattern to test whether a token contains a vowel.
local vowel = m_grc_utils_data.vowel

-- Latin circumflex, the output counterpart of the Greek circumflex.
local hat = diacritics.Latin_circum

-- A token beginning with α or υ and ending in the iota subscript.
local au_subscript = "^[αυ].*" .. subscript .. "$"
-- Greek question mark (U+037E).
local question_mark = u(0x37E)
-- Letters before which γ is transliterated as "n".
local velar = "[γκξχϙ]"

-- Vowels that are always long; a macron is appended to their transliteration
-- unless the trailing diacritics contain a breve.
local long_vowels = { ["η"] = "e", ["ω"] = "o" }

-- Base transliteration table. It is consulted in two ways by translit_letter:
-- as a lookup for the base letter of a token, and as the replacement table
-- for each character of the trailing diacritics. Characters that have no
-- entry here are passed through unchanged.
local tt = {
	-- Vowels
	-- (η and ω are handled separately through long_vowels.)
	["α"] = "a",
	["ε"] = "e",
	["ι"] = "i",
	["ο"] = "o",
	["υ"] = "u",

	-- Consonants
	["β"] = "b",
	["γ"] = "g", -- rewritten to "n" before a velar in export.tr
	["δ"] = "d",
	["ζ"] = "z",
	["θ"] = "th",
	["κ"] = "k",
	["λ"] = "l",
	["μ"] = "m",
	["ν"] = "n",
	["ξ"] = "x",
	["π"] = "p",
	["ρ"] = "r", -- "rh" cases are handled in export.tr
	["σ"] = "s",
	["ς"] = "s", -- final sigma
	["τ"] = "t",
	["φ"] = "ph", -- rewritten to "f" for lang "xbc" in export.tr
	["χ"] = "kh",
	["ψ"] = "ps",

	-- Other letters
	["ϛ"] = "st", -- stigma
	["ϝ"] = "w", -- digamma
	["ͱ"] = "h", -- heta
	["ϳ"] = "j", -- yot
	["ϙ"] = "q", -- koppa
	["ϻ"] = "s", -- san
	["ϸ"] = "š", -- sho
	["ͳ"] = "s", -- sampi
	--["ͷ"] = "v", Differs by dialect.

	-- Diacritics
	-- unchanged: macron, diaeresis, grave, acute
	-- Breathings produce no output here; the "h" for a rough breathing is
	-- added separately in export.tr.
	[smooth] = "",
	[rough] = "",
	-- Greek circumflex becomes the Latin circumflex.
	[circumflex] = hat,
	-- Iota subscript is written out as a full "i".
	[subscript] = "i",
}

-- Finds the first token after index `i` that contains no bracket character.
-- Returns four values: the index of that token, the token itself (nil when
-- the list is exhausted), its lowercase form (nil likewise), and the
-- concatenation of every bracket token that was skipped on the way.
local function get_next_token(tokens, i)
	local pos = i
	local token
	repeat
		pos = pos + 1
		token = tokens[pos]
	until not (token and token:match("[()[%]{}]"))

	local lowered = token and ulower(token)
	local skipped = concat(tokens, nil, i + 1, pos - 1)
	return pos, token, lowered, skipped
end

-- Transliterates one base letter together with its trailing diacritics.
-- Inherently long vowels receive a macron unless the trail carries a breve;
-- other letters are looked up in `tt` and fall back to themselves. Each
-- character of the trail is then mapped through `tt` as well.
local function translit_letter(letter, trail)
	local base = long_vowels[letter]
	if base then
		if not trail:find(breve) then
			base = base .. macron
		end
	else
		base = tt[letter] or letter
	end
	return base .. trail:gsub(".[\128-\191]*", tt)
end

-- Transliterates a single token letter by letter.
local function do_translit(token)
	-- Move the iota subscript in front of any accent marks so that, once it
	-- has become "i", the accents end up on that "i".
	local accent_run = "([" .. acute .. grave .. circumflex .. "]+)"
	local reordered = ugsub(token, accent_run .. subscript, subscript .. "%1")
	-- Hand each character plus its following non-alphanumerics to translit_letter.
	return ugsub(reordered, "(.)(%W*)", translit_letter)
end

-- Given a run of marks, strips every macron from it when the run also
-- contains a Latin circumflex; otherwise returns the run untouched.
local function remove_macron_if_hat(m)
	if not m:find(hat) then
		return m
	end
	-- Parentheses discard gsub's replacement count.
	return (m:gsub(macron, ""))
end

-- Cleans up the transliteration of one token, applies capitalization, and
-- appends the result followed by `suffix` to the `output` list.
local function insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	-- Collapse repeated diacritics until none are left (this shouldn't really happen).
	local duplicate = "(" .. diacritic .. ")(%W-)%1"
	local replaced
	repeat
		translit, replaced = ugsub(translit, duplicate, "%1%2")
	until replaced == 0

	-- A vowel bearing a circumflex does not also keep its macron.
	translit = ugsub(translit, "%W+", remove_macron_if_hat)

	local cased
	if token == lower_token then
		-- The source token was not capitalized.
		cased = translit
	elseif next_token == next_token_lower then
		-- Capitalized token followed by a lowercase one (or by nothing):
		-- uppercase only the first character.
		cased = translit:gsub("^" .. ".[\128-\191]*", uupper)
	else
		-- The following token is capitalized as well: uppercase everything.
		cased = uupper(translit)
	end

	insert(output, cased .. suffix)
end

-- Transliterates Greek-script `text` into Latin script.
--   text: the string to transliterate.
--   lang: language code. "xbc" renders φ as "f" instead of "ph"; "grc"
--         enables the handling of runs of consecutive ρ tokens.
--   sc:   script code; part of the signature but not read by this function.
-- Returns the transliteration as a single string.
function export.tr(text, lang, sc)
	-- A string consisting solely of this character is transliterated as "h".
	if text == "῾" then
		return "h"
	end
	
	-- in case of bold/italic text; only works in testcases submodule, not in sandbox, so outcommented
--	local remove_rough = { 
--		['ἱ'] = 'ι', ['ἵ'] = 'ί', ['ἳ'] = 'ὶ', ['ἷ'] = 'ῖ', 
--        ['ὑ'] = 'υ', ['ὕ'] = 'ύ', ['ὓ'] = 'ὺ', ['ὗ'] = 'ῦ',
--    }
--	text = ugsub(text, "([αᾰᾱΑᾸᾹεΕηΗοΟυῠῡΥῨῩωΩ])(\'\'\'?)([ἱἵἳἷὑὕὓὗ])", 
--		function(a,b,c)
--			return a .. rough .. b .. remove_rough[c]
--		end)
	
	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except any that occur in HTML entities. Use split to separate out the
		chunks between any entities.
	]]
	text = split(canonicalize(text), "(&#?%w+;)")
	-- Only the odd-indexed chunks are rewritten.
	-- NOTE(review): this assumes split places the captured entities at the
	-- even indices -- confirm against Module:string utilities.
	for i = 1, #text, 2 do
		text[i] = text[i]:gsub(";", "?"):gsub(question_mark, "?")
	end
	text = concat(text)

	-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
	-- This runs after the question-mark replacement above, so the semicolon
	-- produced here is kept as a semicolon.
	text = text:gsub("·", ";")

	local tokens = tokenize(text)

	--now read the tokens
	-- Prime the lookahead. Any bracket tokens at the very start of the text
	-- come back as `suffix` and become the first piece of the output.
	local next_i, next_token, next_token_lower, suffix = get_next_token(tokens, 0)
	local output = {suffix}
	while next_token do
		-- Promote the lookahead to the current token. `is_rough` starts out
		-- nil because only three values are assigned to the four names.
		local i, token, lower_token, is_rough = next_i, next_token, next_token_lower
		-- Transliterate the lowercase form; capitalization is re-applied in
		-- insert_translit.
		local translit = do_translit(lower_token)
		-- Advance the lookahead; `suffix` now holds the bracket tokens that
		-- sit between the current token and the next one.
		next_i, next_token, next_token_lower, suffix = get_next_token(tokens, i)

		-- γ before a velar should be <n>
		if lower_token:find("γ") and next_token_lower and umatch(next_token_lower, velar) then
			translit = translit:gsub("g", "n")
		elseif lang == "xbc" and lower_token:find("φ") then
			translit = translit:gsub("ph", "f")
		-- These two comparisons use the original-case token, so they match
		-- lowercase ρ only.
		elseif token == "ρ"..rough then
			translit = "rh"
		elseif token == "ρ"..smooth then
			translit = "r"
		-- ρ after ρ should be <rh>
		elseif lang == "grc" and lower_token:find("ρ") then
			-- Keep adding ρs until they run out. Set is_rough, so that "h" will get appended.
			-- Each earlier ρ is emitted right away; the last one of the run
			-- stays in `token` and is emitted by the final insert_translit
			-- call at the bottom of the outer loop.
			while next_token_lower and next_token_lower:find("ρ") do
				insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
				i, token, lower_token, is_rough = next_i, next_token, next_token_lower, true
				translit = do_translit(lower_token)
				next_i, next_token, next_token_lower, suffix = get_next_token(tokens, i)
			end
		-- add macron to ᾳ
		elseif umatch(lower_token, au_subscript) then
			translit = translit:gsub("[au]", "%0" .. macron)
		end

		-- Rough breathing: "h" goes in front when the token matches the vowel
		-- pattern; otherwise it goes at the end, unless the last alphanumeric
		-- character of the transliteration is already "h".
		if is_rough or lower_token:find(rough) then
			if umatch(lower_token, vowel) then
				translit = "h" .. translit
			else
				local final = umatch(translit, "(%w)%W*$")
				if final and final ~= "h" then
					translit = translit .. "h"
				end
			end
		end

		insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	end

	return concat(output)
end

return export

Module:grc-translit: Difference between revisions

Revision as of 10:59, 28 August 2025

Functions

Navigation menu

Search