Module:la-vul-translit

From Linguifex
Jump to navigation Jump to search

This module will transliterate Vulgar Latin text. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:la-vul-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc and in the language specified by the code lang.
When the transliteration fails, returns nil.

local export = {}

local m_str_utils = require("Module:string utilities")
local m_table = require("Module:table")

local concat = table.concat
local extend = m_table.extend
local insert = table.insert
local list_to_set = m_table.listToSet
local remove = table.remove
local ugmatch = mw.ustring.gmatch
local ugsub = m_str_utils.gsub
local ulen = m_str_utils.len
local umatch = mw.ustring.match
local usub = m_str_utils.sub

-- Vowel rewrite rules. Every rule is a {pattern, replacement} pair that
-- convert_word passes to ugsub, in list order, on each syllable: first the
-- "all" list, then the list selected by the caller's vowel pattern name.
-- Order inside a list matters (e.g. short "i" is rewritten before long "ī"
-- is shortened to "i").
local vowel_patterns = {
	-- Applied for every vowel pattern name.
	["all"] = {
		{"ā", "a"},
		{"ae", "ę"},
		{"áé", "ę́"},
		{"e", "ę"},
		{"o", "ǫ"},
	},

	["It-W"] = {
		{"ē", "ẹ"},
		{"i", "ẹ"},
		{"ī", "i"},
		{"ō", "ọ"},
		{"u", "ọ"},
		{"ū", "u"},
	},

	["E"] = {
		{"ē", "ẹ"},
		{"i", "ẹ"},
		{"ī", "i"},
		{"ō", "o"},
		{"ū", "u"},
	},

	["S"] = {
		{"ē", "e"},
		{"ẹ", "e"},
		{"ī", "i"},
		{"ō", "o"},
		{"ọ", "o"},
		{"ū", "u"},
	},
}

-- Long-form names point at the very same rule lists as the short codes.
for long_name, code in pairs({
	["Italo-Western"] = "It-W",
	["Western"] = "It-W",
	["Eastern"] = "E",
	["Romanian"] = "E",
	["Sardinian"] = "S",
}) do
	vowel_patterns[long_name] = vowel_patterns[code]
end

-- Grapheme -> phoneme-token table used by split_syllables, which always takes
-- the longest key matching at the current position. Any character that is not
-- a key here is passed through unchanged as a one-character token.
local dictionary = {
	-- short vowels
	a = "a",
	e = "e",
	i = "i",
	o = "o",
	u = "u",

	-- long vowels
	["ā"] = "ā",
	["ē"] = "ē",
	["ī"] = "ī",
	["ō"] = "ō",
	["ū"] = "ū",

	-- digraphs treated as a single vowel token
	ae = "ae",
	oe = "ē",
	ai = "aị",
	ei = "ėị",
	au = "aụ",
	eu = "ėụ",

	-- consonants
	b = "b",
	d = "d",
	f = "f",
	c = "c",
	g = "g",
	v = "v",
	x = "x",
	qu = "qŭ",

	-- stress mark, kept as its own token
	["'"] = "'",
}

-- Set of phoneme tokens that count as a syllable nucleus. These are the
-- tokens produced by `dictionary`, not raw input spellings.
-- NOTE(review): "-" is treated as a vowel; presumably it stands in for an
-- unspecified vowel in affixes — confirm against callers.
local vowels = list_to_set({
	-- short
	"a", "e", "i", "o", "u",
	-- long
	"ā", "ē", "ī", "ō", "ū",
	-- digraph / diphthong tokens
	"ae", "oe",
	"aị", "ėị",
	"aụ", "ėụ",
	-- hyphen
	"-",
})

-- Set of consonant sequences accepted at the start of a syllable.
-- split_syllables moves consonants back to the previous syllable until the
-- remaining onset is empty or a member of this set, and raises
-- "onset error" if a syllable still starts with anything else.
local onsets = list_to_set({
	-- single consonants
	"b", "p",
	"d", "t",
	"g", "c", "cu", "qŭ",
	"f", "s", "z",
	"l", "m", "n", "r",
	"j", "v", "w",

	-- consonant + l / r, and ps
	"bl", "br",
	"pl", "pr", "ps",
	"dr", "tr",
	"gl", "gr",
	"cl", "cr",
	"fl", "fr",

	-- s + one consonant
	"sp", "st", "sc", "scu",
	"sl", "sm", "sn", "su",

	-- s + two consonants
	"spr", "str", "scr",
	"spl", "scl",
})

-- Set of consonant sequences accepted at the end of a syllable.
-- split_syllables raises "coda error" when a syllable ends in a non-empty
-- sequence that is not a member of this set.
local codas = list_to_set({
	-- single consonants
	"b", "p", "d", "t", "g", "c",
	"f", "s", "z",
	"l", "m", "n", "r", "j",

	-- s + stop
	"sp", "st", "sc",

	-- l + consonant
	"lp", "lt", "lc",
	"lb", "ld", "lg",
	"lf", "lm",

	-- r + consonant
	"rp", "rt", "rc",
	"rb", "rd", "rg",
	"rf", "rl", "rm", "rn",

	-- nasal + stop
	"mp", "nt", "nc",
	"mb", "nd", "ng",

	-- clusters ending in s (x is listed as a coda of its own)
	"ps", "ts", "cs", "x",
	"ls", "ns", "rs",
	"lcs", "ncs", "rcs",
	"lms", "rls", "rms", "rns",
})

-- Voiceless stop -> voiced counterpart, in the spelling this module works on.
-- The velar pair is c/g: "k" has already been rewritten to "c" by the time the
-- assimilation rules below run, and the input uses the ASCII letter "g".
local voicing = {
	["p"] = "b",
	["t"] = "d",
	["c"] = "g",
}

-- Voiced stop -> voiceless counterpart (the inverse of the table above).
local devoicing = m_table.invert(voicing)

-- These phonetic rules apply to the whole word, not just a syllable.
-- convert_word applies them with ugsub, in this order, before the word is
-- split into syllables. The stress mark is the ASCII apostrophe "'", as in the
-- rest of the module, so every rule that looks across a consonant boundary
-- allows an optional "'" in between.
local word_rules_start = {
	{"h", ""},
	{"k", "c"},
	{"y", "i"},
	{"ȳ", "ī"},
	-- x before c/t is reduced to s
	{"x('?[ct])", "s%1"},
	-- b between vowels becomes v
	{"([aeiouāēīōū]'?)b%f[aeiouāēīōū]", "%1v"},
	-- voiced stop before a voiceless consonant is devoiced
	{"([bdg])('?)%f[cfpqstx]", function (consonant, stress)
		return (devoicing[consonant] or consonant) .. stress
	end},
	-- voiceless stop before a voiced stop is voiced
	{"([cpt])('?)%f[bdg]", function (consonant, stress)
		return (voicing[consonant] or consonant) .. stress
	end},
	-- nasal place assimilation
	{"m('?[cdgqtx])", "n%1"},
	{"n('?[bmp])", "m%1"},
	-- word-final -um loses its m
	{"um$", "u"}
}

-- Repositions a manual stress mark. convert_word rewrites every occurrence of
-- <key> followed by "'" into <value>, i.e. an apostrophe typed after the
-- cluster is moved to the position shown in the value. The table is walked
-- with pairs(), so the rules must not depend on the order of application.
local stress_shift_rules = {
	-- labialised velars and v
	qu = "'qu",
	ngu = "n'gu",
	gu = "'gu",
	v = "'v",

	-- stop / f + liquid: the whole cluster follows the mark
	bl = "'bl",
	pl = "'pl",
	br = "'br",
	pr = "'pr",
	dr = "'dr",
	tr = "'tr",
	gl = "'gl",
	cl = "'cl",
	gr = "'gr",
	cr = "'cr",
	fl = "'fl",
	fr = "'fr",

	-- stop + stop: the mark goes between the two
	ct = "c't",
	pt = "p't",
	gd = "g'd",

	-- s / x + consonant(s): the mark goes after the s / x
	sl = "s'l",
	sm = "s'm",
	sn = "s'n",
	su = "s'u",
	st = "s't",
	xt = "x't",
	spr = "s'pr",
	str = "s'tr",
	scr = "s'cr",
	spl = "s'pl",
	scl = "s'cl",

	nct = "nc't",
}

-- Clean-up rules that convert_word applies (with ugsub, in this order) to the
-- re-joined word after the per-syllable vowel rules and accent placement.
local word_rules_end = {
	-- word-initial non-syllabic i is written j
	{"^ĭ", "j"},

	-- Disabled palatalisation rules, kept for reference:
	-- {"g([ei])", "j%1"},
	-- {"dĭ", "j"},
	-- {"gĭ", "j"},
	-- {"z", "j"},

	-- strip the marks that only served as internal bookkeeping
	{"ė", "e"},
	{"ị", "i"},
	{"ụ", "u"},
	{"ĭ", "i"},
	{"ŭ", "u"},

	-- vowel sequence simplifications
	{"ei", "i"},
	{"ii", "i"},
	{"ee$", "ie"},

	-- word-final e after another (optionally accented) vowel becomes i
	{"([aẹęeọǫou])(́?)e$", "%1%2i"},
}

-- Nasalisation rules per vowel pattern. Each rule is {pattern, replacement}:
-- the pattern matches a vowel, an optional combining acute (captured as %1)
-- and a nasal consonant; nasalize_vowels appends the right-hand context
-- (end of word, or a following s) itself.
--
-- The table accepts the same names as vowel_patterns: the short codes plus
-- the long-form aliases, so that a caller using e.g. "Western" or "Sardinian"
-- gets a rule list here as well instead of nil.
local nasalized = {}

nasalized["It-W"] = {
	{"[ẹęeēi](́?)[nm]", "ẽ%1"},
	{"[ī](́?)[nm]", "ĩ%1"},
	{"[ū](́?)[nm]", "ũ%1"},
	{"[ọǫoōu](́?)[nm]", "õ%1"},
}

nasalized["Italo-Western"] = nasalized["It-W"]
nasalized["Western"] = nasalized["It-W"]

nasalized["E"] = {
	{"[ẹęeēi](́?)[nm]", "ẽ%1"},
	{"[ī](́?)[nm]", "ĩ%1"},
	{"[uū](́?)[nm]", "ũ%1"},
	{"[ọoō](́?)[nm]", "õ%1"},
}

nasalized["Eastern"] = nasalized["E"]
nasalized["Romanian"] = nasalized["E"]

nasalized["S"] = {
	{"[eē](́?)[nm]", "ẽ%1"},
	{"[iī](́?)[nm]", "ĩ%1"},
	{"[uū](́?)[nm]", "ũ%1"},
	{"[oō](́?)[nm]", "õ%1"},
}

nasalized["Sardinian"] = nasalized["S"]

-- Replaces vowel + nasal consonant with a nasalised vowel, both at the end of
-- the word and before s.
--
-- word:   the fully converted word; an accent, if any, is a combining acute
--         directly after its vowel.
-- family: key into `nasalized` selecting the vowel-specific rules.
-- Returns the word with the nasal consonants absorbed into the vowel.
local function nasalize_vowels(word, family)
	-- a/ā behave the same in every family, so they are handled up front.
	-- The optional acute sits between the vowel and the nasal in both
	-- contexts; it is captured so that it can be re-attached to the new vowel.
	word = ugsub(word, "[aā](́?)[nm]$", "ã%1")
	word = ugsub(word, "[aā](́?)[nm]s", "ã%1s")
	for _, rule in ipairs(nasalized[family]) do
		-- word-final vowel + nasal
		word = ugsub(word, rule[1] .. "$", rule[2])
		-- vowel + nasal before s; %1 is the acute captured inside rule[1],
		-- %2 the (normally empty) capture added here
		word = ugsub(word, rule[1] .. "(́?)s", rule[2] .. "%2s")
	end
	return word
end

-- Returns the consonants that precede the first vowel of a syllable, joined
-- into one string. The stress mark "'" is skipped. If the syllable has no
-- vowel, every non-"'" token is returned.
--
-- syll: array of phoneme tokens.
local function get_onset(syll)
	local leading = {}

	for _, phoneme in ipairs(syll) do
		if vowels[phoneme] then
			break
		elseif phoneme ~= "'" then
			leading[#leading + 1] = phoneme
		end
	end

	return concat(leading)
end

-- Returns everything that follows the last vowel of a syllable, joined into
-- one string. Unlike get_onset, a stress mark "'" in that stretch is kept.
-- If the syllable has no vowel, all of its tokens are returned.
--
-- syll: array of phoneme tokens.
local function get_coda(syll)
	-- Walk backwards to find the position of the last vowel (0 if none).
	local last_vowel = #syll
	while last_vowel >= 1 and not vowels[syll[last_vowel]] do
		last_vowel = last_vowel - 1
	end

	-- An empty range (vowel is the final token) concatenates to "".
	return concat(syll, "", last_vowel + 1, #syll)
end

-- Returns the first vowel token of a syllable, or nil when the syllable
-- contains no vowel.
--
-- syll: array of phoneme tokens.
local function get_vowel(syll)
	for _, phoneme in ipairs(syll) do
		if vowels[phoneme] then
			return phoneme
		end
	end
	return nil
end

-- Splits a word into syllables.
--
-- word: the word after the word-level rules; a manual stress mark "'" may be
--       present.
-- Returns an array of syllables, each an array of phoneme tokens. A stress
-- mark, if present, is left as a "'" token inside its syllable. A word
-- without any phoneme yields an empty array; a word without any vowel yields
-- its consonants as one syllable.
-- Raises "onset error:[...]" / "coda error:[...]" when a syllable starts or
-- ends with a consonant sequence that is not listed in `onsets` / `codas`.
local function split_syllables(word)
	-- Step 1: tokenise. At every position take the longest `dictionary` key
	-- that matches; anything unknown becomes a one-character token.
	local phonemes = {}

	while ulen(word) > 0 do
		local longestmatch = ""

		for letter in pairs(dictionary) do
			if ulen(letter) > ulen(longestmatch) and usub(word, 1, ulen(letter)) == letter then
				longestmatch = letter
			end
		end

		if ulen(longestmatch) > 0 then
			insert(phonemes, dictionary[longestmatch])
			word = usub(word, ulen(longestmatch) + 1)
		else
			insert(phonemes, usub(word, 1, 1))
			word = usub(word, 2)
		end
	end

	-- Step 2: provisional syllables. Every vowel closes a syllable, and a
	-- stress mark always opens a new one.
	local syllables, syll = {}, {}

	for _, phoneme in ipairs(phonemes) do
		if phoneme == "'" then
			if #syll > 0 then
				insert(syllables, syll)
			end
			syll = {"'"}
		elseif vowels[phoneme] then
			insert(syll, phoneme)
			insert(syllables, syll)
			syll = {}
		else
			insert(syll, phoneme)
		end
	end

	-- If there are phonemes left, then the word ends in a consonant.
	-- Add them to the last syllable; if there is no syllable at all (the
	-- word has no vowel), the leftovers form the only syllable.
	if #syllables > 0 then
		extend(syllables[#syllables], syll)
	elseif #syll > 0 then
		insert(syllables, syll)
	end

	-- Step 3: split consonant clusters between syllables. An explicit index
	-- is used because syllables may be removed while walking the list; after
	-- a removal the same index is visited again so that no syllable is
	-- skipped.
	local i = 2
	while i <= #syllables do
		local previous, current = syllables[i - 1], syllables[i]

		-- Shift over consonants until the syllable onset is valid
		local onset = get_onset(current)
		while not (onset == "" or onsets[onset]) do
			insert(previous, remove(current, 1))
			onset = get_onset(current)
		end

		-- If the preceding syllable still ends with a vowel, and the current one begins with s + another consonant, or with gn, then shift it over
		if get_coda(previous) == "" and ((current[1] == "s" and not vowels[current[2]]) or (current[1] == "g" and current[2] == "n")) then
			insert(previous, remove(current, 1))
		end

		if get_vowel(current) then
			i = i + 1
		else
			-- No vowel at all in this syllable: merge it into the previous
			-- one and drop it from the list.
			for _ = 1, #current do
				insert(previous, remove(current, 1))
			end
			remove(syllables, i)
		end
	end

	-- Step 4: validate the result.
	for _, s in ipairs(syllables) do
		local onset = get_onset(s)
		if not (onset == "" or onsets[onset]) then
			error("onset error:[" .. onset .. "]")
		end
		local coda = get_coda(s)
		if not (coda == "" or codas[coda]) then
			error("coda error:[" .. coda .. "]")
		end
	end

	return syllables
end

-- Determines which syllable carries the stress.
--
-- syllables: array of syllables, each an array of phoneme tokens.
-- Returns the 1-based index of the stressed syllable (0 for an empty list).
-- Side effect: when a manual stress mark "'" is found, it is removed from its
-- syllable.
local function detect_accent(syllables)
	-- A manual mark overrides everything else.
	for index, syll in ipairs(syllables) do
		for position, phoneme in ipairs(syll) do
			if phoneme == "'" then
				remove(syll, position)
				return index
			end
		end
	end

	local count = #syllables

	-- Zero or one syllable: nothing to choose from.
	if count < 2 then
		return count
	end

	-- Two syllables: always the first.
	if count == 2 then
		return 1
	end

	-- Three or more: the penult is stressed unless it ends in a single
	-- plain (unmarked) vowel letter.
	local penult = syllables[count - 1]
	if not penult[#penult]:match("^[aeiouy]$") then
		return count - 1
	end

	-- ...except that a final syllable starting with b/d/g + l/r keeps the
	-- stress on the penult.
	local ult = syllables[count]
	if ult[2] and (ult[1] .. ult[2]):match("[bdg][lr]") then
		return count - 1
	end

	return count - 2
end

-- Adds a combining acute accent to a syllable string.
--
-- syllable: the syllable as one string, after the vowel rules.
-- Returns the accented syllable. If the syllable contains i followed by
-- a/o/ā/ō, only that second vowel is accented; otherwise every character in
-- the vowel set below receives the accent.
local function place_accent(syllable)
	-- Special case: i before a or o
	local with_glide = ugsub(syllable, "i([aoāō])", "i%1́")
	if with_glide ~= syllable then
		return with_glide
	end

	-- Parentheses keep only the string result of ugsub.
	return (ugsub(syllable, "([aeẹęioọǫuāēīōūėịụ-])", "%1́"))
end

-- Converts a single word.
--
-- raw_word:      one whitespace-free word; leading asterisks are preserved
--                and re-attached to the result.
-- vowel_pattern: any key of `vowel_patterns` that names a family, i.e. a
--                short code ("It-W", "E", "S") or one of its aliases.
-- Returns the converted word.
-- Raises an error for an unknown vowel pattern, and propagates the onset/coda
-- errors of split_syllables.
local function convert_word(raw_word, vowel_pattern)
	-- Resolve aliases to the short family code. Aliases share the rule list
	-- of their family, so table identity tells them apart; "all" is not a
	-- family and is rejected together with unknown names.
	local family_rules = vowel_patterns[vowel_pattern]
	local family
	if family_rules ~= nil then
		for _, code in ipairs({"It-W", "E", "S"}) do
			if vowel_patterns[code] == family_rules then
				family = code
				break
			end
		end
	end
	if not family then
		error("Unknown vowel pattern: " .. tostring(vowel_pattern))
	end

	local asterisks, word = raw_word:match("^(%**)(.*)")

	-- do starting word-based rules
	for _, rule in ipairs(word_rules_start) do
		word = ugsub(word, rule[1], rule[2])
	end

	-- Prothetic i before s + consonant
	if family == "It-W" then
		word = word:gsub("^s+[bcdfglmnprtz]", "i%0")
	end

	-- Move a manual stress mark to the syllable boundary
	for k, v in pairs(stress_shift_rules) do
		word = ugsub(word, k .. "'", v)
	end

	-- Double consonant stress shifts
	word = word:gsub("([bcdfghjklmnprstz])%1'", "%1'%1")

	local syllables = split_syllables(word)
	local accent = detect_accent(syllables)
	
	-- Check antepenult for e, i > j (written i)
	--[[local antepenult = syllables[#syllables - 2]
	local penult = syllables[#syllables - 1]
	
	if antepenult and penult then
		if syllables[accent] == antepenult and umatch(antepenult[#antepenult], "^[eēiī]$") and umatch(penult[#penult], "^[aāoō]$") then
			syllables[#syllables-2][#antepenult] = "ị"
			accent = accent + 1
		end
	end]]--

	-- e/i in hiatus before a, o or u loses its syllabicity; if it carried
	-- the stress, the stress moves on to the following syllable.
	for i, syll in ipairs(syllables) do
		if syllables[i + 1] then
			if umatch(syll[#syll], "^[eēiī]$") and umatch(syllables[i + 1][1], "^[aāoōuū]$") then
				syll[#syll] = "ĭ"
				if syllables[accent] == syll then
					accent = accent + 1
				end
			end
		end
	end

	-- Per-syllable vowel rules: the shared list first, then the family's.
	-- From here on each entry of `syllables` is a string, not a token array.
	for i, syll in ipairs(syllables) do
		syll = concat(syll)
		for _, rule in ipairs(vowel_patterns["all"]) do
			syll = ugsub(syll, rule[1], rule[2])
		end
		for _, rule in ipairs(family_rules) do
			syll = ugsub(syll, rule[1], rule[2])
		end
		--[[if i ~= accent then
			syll = syll:gsub("ẹ", "e")
			syll = syll:gsub("ọ", "o")
		end]]
		syllables[i] = (i == accent and place_accent(syll) or syll)
	end
	
	word = concat(syllables)
	
	for _, rule in ipairs(word_rules_end) do
		word = ugsub(word, rule[1], rule[2])
	end
	
	-- Always pass the short code, which is the key `nasalized` is sure to have.
	word = nasalize_vowels(word, family)
	
	return asterisks .. word
end

-- Converts every whitespace-separated word of a text.
--
-- words:         the input text.
-- vowel_pattern: vowel pattern name, handed to convert_word unchanged.
-- Returns the converted words joined by single spaces.
function export.convert_words(words, vowel_pattern)
	local converted = {}

	for raw_word in ugmatch(words, "%S+") do
		converted[#converted + 1] = convert_word(raw_word, vowel_pattern)
	end

	return concat(converted, " ")
end

return export