Module:grc-utilities: Difference between revisions

From Linguifex
Jump to navigation Jump to search
Created page with "local export = {} local m_script_utils = require("Module:script utilities") local m_links = require("Module:links") local lang = require("Module:languages").getByCode("grc")..."
 
No edit summary
 
Line 1: Line 1:
local export = {}
local export = {}


local m_script_utils = require("Module:script utilities")
local m_data = mw.loadData("Module:grc-utilities/data")
local m_links = require("Module:links")
local m_string_utils = require("Module:string utilities")
 
local concat = table.concat
local full_link = require("Module:links").full_link
local gsub = m_string_utils.gsub
local match = m_string_utils.match
local insert = table.insert
local sparseConcat = require("Module:table").sparseConcat
local standard_diacritics -- defined below
local tag_text = require("Module:script utilities").tag_text
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
 
local lang = require("Module:languages").getByCode("grc")
local lang = require("Module:languages").getByCode("grc")
local sc = require("Module:scripts").getByCode("polytonic")
local sc = require("Module:scripts").getByCode("Polyt")


local m_data = mw.loadData("Module:grc-utilities/data")
local groups = m_data.groups
local groups = m_data.groups
local canonical = m_data.canonical
local diacritic_order = m_data.diacritic_order
local diacritic_order = m_data.diacritic_order
local conversions = m_data.conversions
local diacritical_conversions = m_data.diacritical_conversions
local diacritics = m_data.diacritics
local diacritics = m_data.diacritics
local diacritic = m_data.diacritic
local diacritic = m_data.diacritic
Line 25: Line 37:
local combining_diacritic = m_data.combining_diacritic
local combining_diacritic = m_data.combining_diacritic


local UTF8_char = "[\1-\127\194-\244][\128-\191]*"
local UTF8_char = ".[\128-\191]*"
local basic_Greek = "[\206-\207][\128-\191]" -- excluding first line of Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ
local basic_Greek = "[\206-\207][\128-\191]" -- excluding first line of Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ
local find = mw.ustring.find
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch
local sub = mw.ustring.sub
local gsub = mw.ustring.gsub
local toNFC = mw.ustring.toNFC
local decompose = mw.ustring.toNFD


local info = {}
local info = {}
Line 43: Line 47:
local upsilon_t = { vowel = true, offglide = true }
local upsilon_t = { vowel = true, offglide = true }
-- These don't need any contents.
-- These don't need any contents.
local rho_t = {}
local breathy_cons_t = {}
-- local consonant_t = {}
-- local consonant_t = {}
local diacritic_t = { diacritic = true }
local diacritic_t = { diacritic = true }
Line 51: Line 55:
local function add_info(characters, t)
local function add_info(characters, t)
if type(characters) == "string" then
if type(characters) == "string" then
for character in string.gmatch(characters, UTF8_char) do
for character in characters:gmatch(UTF8_char) do
info[character] = t
info[character] = t
end
end
else
else
for i, character in ipairs(characters) do
for _, character in ipairs(characters) do
info[character] = t
info[character] = t
end
end
Line 71: Line 75:
add_info("Ιι", iota_t)
add_info("Ιι", iota_t)
add_info("Υυ", upsilon_t)
add_info("Υυ", upsilon_t)
-- add_info("ΒΓΔΖΘΚΛΜΝΞΠΡΣΤΦΧΨϜϘϺϷͶϠβγδζθκλμνξπρσςτφχψϝϙϻϸͷϡ", consonant_t)
add_info("ϜϝΡρ", breathy_cons_t)
add_info("Ρρ", rho_t)


local not_recognized = {}
local not_recognized = {}
setmetatable(info, { __index =
setmetatable(info, { __index = function(t, key)
function(t, key)
return not_recognized
return not_recognized
end})
end
})
 
local sparseConcat = require("Module:table").sparseConcat
 
local checkType = require "libraryUtil".checkType
 
local function _check(funcName)
return function(argIndex, arg, expectType, nilOk)
return checkType(funcName, argIndex, arg, expectType, nilOk)
end
end


-- Perform a function on each Unicode character in a string.
-- Perform a function on each Unicode character in a string.
local function forEach(str, func)
local function forEach(str, func)
for char in string.gmatch(str, UTF8_char) do
for char in str:gmatch(UTF8_char) do
func(char)
func(char)
end
end
end
-- This concatenates or inserts a character, then removes it from the text.
local function add(list, index, chars, text)
if not chars then
error("The function add cannot act on a nil character.")
end
if list[index] then
list[index] = list[index] .. chars
else
list[index] = chars
end
-- Basic string function works here.
return text:sub(#chars + 1)
end
end


function export.tag(term, face)
function export.tag(term, face)
return m_script_utils.tag_text(term, lang, sc, face)
return tag_text(term, lang, sc, face)
end
end


function export.link(term, face, alt, tr)
function export.link(term, face, alt, tr)
return m_links.full_link( { term = term, alt = alt, lang = lang, sc = sc, tr = tr }, face)
return full_link({ term = term, alt = alt, lang = lang, sc = sc, tr = tr }, face)
end
end


local function linkNoTag(term, alt)
-- Convert spacing to combining diacritics, and nonstandard to standard polytonic Greek.
return m_links.language_link{ term = term, lang = lang, alt = alt }
function export.standardDiacritics(text)
return toNFD((toNFD(text):gsub(UTF8_char, diacritical_conversions)))
end
end
standard_diacritics = export.standardDiacritics


-- Convert spacing to combining diacritics, and nonstandard to standard polytonic Greek.
-- Convert variant letter forms to the canonical form, and decompose.
function export.standardDiacritics(text)
function export.canonicalize(text)
text = decompose(text)
text = standard_diacritics(text)
-- Compose, since the characters in `canonical` are in form NFC.
text = text:gsub(UTF8_char, conversions)
text = toNFC(text):gsub(UTF8_char .. grave, canonical) -- for ϗ̀
:gsub(UTF8_char, canonical)
return text
-- Decompose on return.
return toNFD(text)
end
end


Line 157: Line 137:
-- The following might have odd results when there
-- The following might have odd results when there
-- are three or more diacritics.
-- are three or more diacritics.
table.insert(output, index, diacritic)
insert(output, index, diacritic)
-- [[Special:WhatLinksHere/Wiktionary:Tracking/grc-utils/too many diacritics]]
require("Module:debug").track("grc-utils/too many diacritics")
--[[
--[[
local m_templates = require("Module:grc-utilities/templates")
local m_templates = require("Module:grc-utilities/templates")
Line 172: Line 154:


function export.reorderDiacritics(text)
function export.reorderDiacritics(text)
local d = diacritics
return (gsub(toNFD(text), combining_diacritic .. combining_diacritic .. "+", reorderDiacriticSequence))
return (gsub(decompose(text),
combining_diacritic .. combining_diacritic .. "+",
reorderDiacriticSequence))
end
end


Line 186: Line 164:
local function make_tokens(text)
local function make_tokens(text)
local tokens, prev_info = {}, {}
local tokens, prev_info = {}, {}
local token_i, vowel_count = 1, 0 -- Vowel count tracks .
local token_i, vowel_count = 1, 0 -- Vowel count tracks.
local prev
local prev, prev_vowel_info
for character in string.gmatch(decompose(text), UTF8_char) do
for character in text:gmatch(UTF8_char) do
local curr_info = info[character]
local curr_info = info[character]
-- Split vowels between tokens if not a diphthong.
-- Split vowels between tokens if not a diphthong.
if curr_info.vowel then
if curr_info.vowel then
vowel_count = vowel_count + 1
vowel_count = vowel_count + 1
if prev and (not (vowel_count == 2 and curr_info.offglide and prev_info.vowel)
if vowel_count == 2 and curr_info.offglide and not (
-- υυ υ, υ
prev_vowel_info == iota_t or -- ιι ι, ι; ιυ → ι, υ
-- ιυ → ι, υ
prev_vowel_info == upsilon_t and curr_info == upsilon_t -- υυ → υ, υ
or prev_info.offglide and curr_info == upsilon_t or curr_info == prev_info) then
) then
vowel_count, prev_vowel_info = 0, nil
elseif prev then
token_i = token_i + 1
token_i = token_i + 1
if prev_info.vowel then
vowel_count, prev_vowel_info = 1, curr_info
vowel_count = 1
else
end
vowel_count, prev_vowel_info = 1, curr_info
elseif vowel_count == 2 then
vowel_count = 0
end
end
tokens[token_i] = (tokens[token_i] or "") .. character
tokens[token_i] = (tokens[token_i] or "") .. character
elseif curr_info.diacritic then
elseif curr_info.diacritic then
vowel_count = 0
vowel_count, prev_vowel_info = 0, nil
tokens[token_i] = (tokens[token_i] or "") .. character
tokens[token_i] = (tokens[token_i] or "") .. character
if prev_info.diacritic or prev_info.vowel then
if prev_info and (prev_info.diacritic or prev_info.vowel) then
if character == diaeresis then
if character == diaeresis or character == subscript then
-- Split the diphthong in the current token if a diaeresis was found:
-- Split the diphthong in the current token if a diaeresis or subscript
-- the first letter, then the second letter plus any diacritics.
-- was found: the first letter, then the second letter plus any diacritics.
local previous_vowel, vowel_with_diaeresis =
local previous_vowel, vowel_with_diaeresis = tokens[token_i]:match("^(" .. basic_Greek .. ")(" .. basic_Greek .. ".+)")
string.match(tokens[token_i],
"^(" .. basic_Greek .. ")(" .. basic_Greek .. ".+)")
if previous_vowel then
if previous_vowel then
tokens[token_i], tokens[token_i + 1] = previous_vowel, vowel_with_diaeresis
tokens[token_i], tokens[token_i + 1] = previous_vowel, vowel_with_diaeresis
Line 227: Line 203:
' couldn’t be split because it does not consist of two Basic Greek characters followed by other characters.')
' couldn’t be split because it does not consist of two Basic Greek characters followed by other characters.')
--]]
--]]
end
-- If there is only a diaeresis, it could still be the first vowel of a
-- diphthong:
-- αὐτοϋιός → αὐ τ ο *ϋι* ό ς
if character == diaeresis and prev_info.vowel then
vowel_count, prev_vowel_info = 1, prev_info
end
end
end
end
elseif prev_info == rho_t then
elseif prev_info == breathy_cons_t then
if curr_info ~= breathing_t then
if curr_info ~= breathing_t then
mw.log(string.format("The character %s in %s should not have the accent %s on it.",
mw.log(("The character %s in %s should not have the accent %s on it."):format(
prev, text, require("Module:grc-utilities/templates").addDottedCircle(character)))
prev, text, require("Module:grc-utilities/templates").addDottedCircle(character)))
end
end
else
elseif prev then
-- prev can be nil when passed a bare diacritic (as in Translingual diacritic entries)
mw.log("The character " .. prev .. " cannot have a diacritic on it.")
mw.log("The character " .. prev .. " cannot have a diacritic on it.")
end
end
Line 252: Line 235:
local cache = {}
local cache = {}
function export.tokenize(text)
function export.tokenize(text)
local decomposed = decompose(text)
text = toNFD(text)
if not cache[decomposed] then
if not cache[text] then
cache[decomposed] = make_tokens(text)
cache[text] = make_tokens(text)
end
end
return cache[decomposed]
return cache[text]
end
end


Line 266: Line 249:
Used by [[Module:grc-pronunciation]]. ]=]
Used by [[Module:grc-pronunciation]]. ]=]
function export.pronunciationOrder(text)
function export.pronunciationOrder(text)
text = export.standardDiacritics(text)
text = standard_diacritics(text)
if match(text, groups[1]) then
if find(text, groups[1]) then
text = gsub(text,
text = gsub(text,
diacritic .. diacritic .. "+",
diacritic .. diacritic .. "+",
function(sequence)
function(sequence)
-- Put breathing and diaeresis first, then accents, then macron or breve
-- Put breathing and diaeresis first, then accents, then macron or breve
return table.concat{
return concat{
match(sequence, groups[2]) or "",
match(sequence, groups[2]) or "",
match(sequence, groups[3]) or "",
match(sequence, groups[3]) or "",
Line 281: Line 262:
}
}
end)
end)
text = gsub(text, macron, spacing_macron) -- combining to spacing macron
text = gsub(text, macron, spacing_macron) -- combining to spacing macron
text = gsub(text, breve, spacing_breve) -- combining to spacing breve
text = gsub(text, breve, spacing_breve) -- combining to spacing breve
end
end
return toNFC(text)
return toNFC(text)
end
-- Returns a table of any ambiguous vowels in the text, language-tagged.
function export.findAmbig(text, noTag)
if (not text) or type(text) ~= "string" then
error("The input to function findAmbig is nonexistent or not a string")
end
local lengthDiacritic = "[" .. macron .. breve .. circumflex .. subscript .. "]"
local aiu_diacritic = "^([" .. "αιυ" .. "])(" .. diacritic .. "*)$"
-- breaks the word into units
local output, vowels = {}, {}
for _, token in ipairs(export.tokenize(text)) do
if not find(token, m_data.consonant) then
local vowel, diacritics = match(
token,
aiu_diacritic
)
if vowel and (diacritics == "" or
not find(diacritics, lengthDiacritic)) then
local diacriticked_vowel
if not noTag then
diacriticked_vowel = export.tag(vowel .. diacritics)
else
diacriticked_vowel = vowel
end
table.insert(output, diacriticked_vowel)
-- Lists the vowel letters that are ambiguous, for categorization purposes.
vowels[mw.ustring.lower(vowel)] = true
end
end
end
return output, vowels
end
end


return export
return export

Latest revision as of 14:27, 7 May 2026



local export = {}

local m_data = mw.loadData("Module:grc-utilities/data")
local m_string_utils = require("Module:string utilities")

local concat = table.concat
local full_link = require("Module:links").full_link
local gsub = m_string_utils.gsub
local match = m_string_utils.match
local insert = table.insert
local sparseConcat = require("Module:table").sparseConcat
local standard_diacritics -- defined below
local tag_text = require("Module:script utilities").tag_text
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD

local lang = require("Module:languages").getByCode("grc")
local sc = require("Module:scripts").getByCode("Polyt")

local groups = m_data.groups
local canonical = m_data.canonical
local diacritic_order = m_data.diacritic_order
local diacritical_conversions = m_data.diacritical_conversions
local diacritics = m_data.diacritics
local diacritic = m_data.diacritic
local macron = diacritics.macron
local breve = diacritics.breve
local spacing_macron = diacritics.spacing_macron
local spacing_breve = diacritics.spacing_breve
local rough = diacritics.rough
local smooth = diacritics.smooth
local diaeresis = diacritics.diaeresis
local acute = diacritics.acute
local grave = diacritics.grave
local circumflex = diacritics.circum
local subscript = diacritics.subscript
local combining_diacritic = m_data.combining_diacritic

-- Byte-level Lua pattern for one UTF-8 encoded character: any single byte
-- followed by zero or more continuation bytes (0x80-0xBF). It is written for
-- the plain string library (`str:gmatch`, `str:gsub`), not for mw.ustring.
local UTF8_char = ".[\128-\191]*"
-- Byte-level pattern for a two-byte sequence whose lead byte is 0xCE or 0xCF,
-- i.e. U+0380-U+03FF.
local basic_Greek = "[\206-\207][\128-\191]" -- excluding first line of Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ

-- Maps a single character (as a UTF-8 string) to one of the property tables
-- below. Entries are registered through add_info.
local info = {}
-- The tables are shared among different characters so that they can be checked
-- for equality if needed, and to use less space.
local vowel_t = { vowel = true }
-- iota_t and upsilon_t have the same fields but are deliberately two distinct
-- tables: make_tokens tells iota and upsilon apart by comparing table identity.
local iota_t = { vowel = true, offglide = true }
local upsilon_t = { vowel = true, offglide = true }
-- These don't need any contents.
local breathy_cons_t = {}
-- local consonant_t = {}
local diacritic_t = { diacritic = true }
-- Needed for equality comparisons.
-- (Same field as diacritic_t, but a separate table so that breathings can be
-- recognized by identity.)
local breathing_t = { diacritic = true }

-- Register the property table `t` in `info` for every character in
-- `characters`.
--   characters: either a string, which is split into UTF-8 characters, or an
--               array of single-character strings (walked with ipairs, so the
--               walk ends at the first nil entry).
--   t:          the property table stored for each of those characters.
local function add_info(characters, t)
	if type(characters) ~= "string" then
		-- Array form: each element is taken as one character.
		for _, character in ipairs(characters) do
			info[character] = t
		end
		return
	end
	-- String form: iterate over the UTF-8 characters it contains.
	for character in characters:gmatch(UTF8_char) do
		info[character] = t
	end
end

-- Register the property table of every character that make_tokens treats
-- specially.

-- Length marks, diaeresis, accents and iota subscript are plain diacritics.
-- NOTE(review): add_info walks this list with ipairs, so if one of these values
-- were nil the entries after it would be skipped silently.
add_info({ macron, breve,
		diaeresis,
		acute, grave, circumflex,
		subscript,
	}, diacritic_t)

-- Breathings get their own table so that they can be told apart from the
-- other diacritics by identity.
add_info({rough, smooth}, breathing_t)
-- Vowels that are not offglides.
add_info("ΑΕΗΟΩαεηοω", vowel_t)
-- Iota and upsilon are vowels that are also marked as offglides.
add_info("Ιι", iota_t)
add_info("Υυ", upsilon_t)
-- Digamma and rho: the letters registered with breathy_cons_t.
add_info("ϜϝΡρ", breathy_cons_t)

-- Sentinel property table handed out for every character that was never
-- registered in `info`. It has no fields, so reading `.vowel`, `.offglide` or
-- `.diacritic` from it yields nil instead of raising an error.
local not_recognized = {}

-- __index handler for `info`: every missing key resolves to the sentinel.
local function unrecognized_info()
	return not_recognized
end

setmetatable(info, { __index = unrecognized_info })

-- Call `func` once for every UTF-8 character of `str`, in order, passing the
-- character as the only argument. Return values of `func` are ignored.
local function forEach(str, func)
	local next_char = str:gmatch(UTF8_char)
	local char = next_char()
	while char do
		func(char)
		char = next_char()
	end
end

-- Tag `term` as Ancient Greek text.
-- Forwards `term` and `face` to `tag_text` from [[Module:script utilities]],
-- together with the module-level language object ("grc") and script object
-- ("Polyt"), and returns whatever `tag_text` returns.
function export.tag(term, face)
	return tag_text(term, lang, sc, face)
end

-- Build a link to an Ancient Greek term.
-- Packs `term`, `alt` and `tr` together with the module-level language ("grc")
-- and script ("Polyt") objects into a data table and hands it, with `face`, to
-- `full_link` from [[Module:links]]. Returns whatever `full_link` returns.
function export.link(term, face, alt, tr)
	local link_data = {
		term = term,
		alt = alt,
		lang = lang,
		sc = sc,
		tr = tr,
	}
	return full_link(link_data, face)
end

-- Convert spacing to combining diacritics, and nonstandard to standard
-- polytonic Greek.
-- The text is decomposed (NFD), every UTF-8 character that has an entry in
-- `diacritical_conversions` is replaced by that entry, and the result is
-- decomposed once more before it is returned.
function export.standardDiacritics(text)
	local decomposed = toNFD(text)
	-- Only the first result of gsub (the string) is kept; the count is dropped.
	local converted = decomposed:gsub(UTF8_char, diacritical_conversions)
	return toNFD(converted)
end
-- Fill in the forward-declared local alias used elsewhere in this module.
standard_diacritics = export.standardDiacritics

-- Convert variant letter forms to the canonical form, and decompose.
-- Returns the canonicalized text in form NFD.
function export.canonicalize(text)
	-- The keys of `canonical` are in form NFC, so compose before looking up.
	local composed = toNFC(standard_diacritics(text))
	-- First pass: a character followed by a combining grave, looked up as a
	-- unit (for ϗ̀).
	local with_grave_done = composed:gsub(UTF8_char .. grave, canonical)
	-- Second pass: every single character.
	local canonicalized = with_grave_done:gsub(UTF8_char, canonical)
	-- Decompose on return.
	return toNFD(canonicalized)
end

--[=[	This function arranges diacritics in the following order:
			1. macron or breve
			2. breathings or diaeresis
			3. acute, circumflex, or grave
			4. iota subscript
		Used by [[Module:typing-aids]].

		`diacritics` is a string of combining diacritics. The return value is
		a string holding the same diacritics, sorted by the position that
		`diacritic_order` assigns to each of them.

		A sequence is expected to contain at most one diacritic per position.
		When it contains more, no error is raised: every diacritic is kept, a
		later diacritic is placed before the earlier ones that share its
		position (except a breve, which is placed after them so that a breve
		follows a macron), and the tracking page
		[[Special:WhatLinksHere/Wiktionary:Tracking/grc-utils/too many diacritics]]
		is pinged.
]=]
local function reorderDiacriticSequence(diacritics)
	-- slots[position] is the ordered list of diacritics for that position and
	-- `positions` records which positions are in use. One list per position
	-- is used instead of table.insert on a single sparse table: on a table
	-- with holes the length operator may report a border below the position,
	-- in which case table.insert overwrites the diacritic already stored
	-- there (or, in newer Lua versions, raises a "position out of bounds"
	-- error), and shifting entries makes later lookups hit the wrong slot.
	local slots, positions = {}, {}
	forEach(diacritics,
		function (diacritic)
			local position = diacritic_order[diacritic]
			local slot = slots[position]
			if not slot then
				slots[position] = { diacritic }
				insert(positions, position)
			else
				if diacritic == breve then
					-- Place breve after macron.
					insert(slot, diacritic)
				else
					insert(slot, 1, diacritic)
				end
				-- Raising an error here was considered upstream but is
				-- disabled; the occurrence is only tracked.
				-- [[Special:WhatLinksHere/Wiktionary:Tracking/grc-utils/too many diacritics]]
				require("Module:debug").track("grc-utils/too many diacritics")
			end
		end)
	table.sort(positions)
	local output = {}
	for i, position in ipairs(positions) do
		output[i] = concat(slots[position])
	end
	return concat(output)
end

-- Decompose `text` (NFD) and pass every run of two or more combining
-- diacritics through reorderDiacriticSequence. Returns the resulting string
-- only (the substitution count from gsub is dropped).
function export.reorderDiacritics(text)
	local sequence_pattern = combining_diacritic .. combining_diacritic .. "+"
	local reordered = gsub(toNFD(text), sequence_pattern, reorderDiacriticSequence)
	return reordered
end

--[=[
		This breaks a word into meaningful "tokens", which are
		individual letters or diphthongs with their diacritics.
		Used by [[Module:grc-accent]] and [[Module:grc-pronunciation]].

		`text` is scanned one UTF-8 character at a time; the only caller in
		this module (export.tokenize) passes text that is already in form NFD,
		so diacritics arrive as separate combining characters after their
		base letter.

		Returns an array of token strings. Problems with diacritic placement
		are reported through mw.log only; no error is raised.
--]=]
local function make_tokens(text)
	-- prev_info starts as an empty table and is afterwards always a table
	-- taken from `info` (whose metatable supplies a fallback), so it is never nil.
	local tokens, prev_info = {}, {}
	-- token_i is the index of the token currently being built.
	-- vowel_count is the number of vowel letters seen in the current run; it
	-- is reset to 0 by a diacritic, by any non-vowel character, and once a
	-- diphthong has been completed.
	local token_i, vowel_count = 1, 0 -- Vowel count tracks.
	-- prev is the previous character (nil before the first one);
	-- prev_vowel_info is the property table of the vowel that a following
	-- offglide could join, or nil when there is none.
	local prev, prev_vowel_info
	for character in text:gmatch(UTF8_char) do
		local curr_info = info[character]
		-- Split vowels between tokens if not a diphthong.
		if curr_info.vowel then
			vowel_count = vowel_count + 1
			-- A second vowel that is an offglide (ι or υ) stays in the current
			-- token, forming a diphthong, unless the first vowel is ι or both
			-- vowels are υ.
			if vowel_count == 2 and curr_info.offglide and not (
				prev_vowel_info == iota_t or -- ιι → ι, ι; ιυ → ι, υ
				prev_vowel_info == upsilon_t and curr_info == upsilon_t -- υυ → υ, υ
			) then
				-- Diphthong completed: the next vowel starts a fresh run.
				vowel_count, prev_vowel_info = 0, nil
			elseif prev then
				-- Any other vowel that is not the first character opens a new token.
				token_i = token_i + 1
				vowel_count, prev_vowel_info = 1, curr_info
			else
				-- First character of the text: it goes into token 1.
				vowel_count, prev_vowel_info = 1, curr_info
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		elseif curr_info.diacritic then
			-- A diacritic always attaches to the current token and ends the
			-- current vowel run.
			vowel_count, prev_vowel_info = 0, nil
			tokens[token_i] = (tokens[token_i] or "") .. character
			if prev_info and (prev_info.diacritic or prev_info.vowel) then
				if character == diaeresis or character == subscript then
					-- Split the diphthong in the current token if a diaeresis or subscript
					-- was found: the first letter, then the second letter plus any diacritics.
					-- (The pattern works on bytes: two two-byte Greek characters
					-- followed by at least one more byte.)
					local previous_vowel, vowel_with_diaeresis = tokens[token_i]:match("^(" .. basic_Greek .. ")(" .. basic_Greek .. ".+)")
					if previous_vowel then
						tokens[token_i], tokens[token_i + 1] = previous_vowel, vowel_with_diaeresis
						token_i = token_i + 1
					else
						-- The vowel preceding the vowel with the diaeresis will already be
						-- placed in the previous token if it has a diacritic:
						-- Περικλῆῐ̈ → Π ε ρ ι κ λ ῆ ῐ̈
						--[[
						mw.log('Diaeresis was found in ' .. text .. ', but the previous token ' ..
							require("Module:Unicode data").add_dotted_circle(tokens[token_i]) ..
							' couldn’t be split because it does not consist of two Basic Greek characters followed by other characters.')
						--]]
					end
					-- If there is only a diaeresis, it could still be the first vowel of a
					-- diphthong:
					-- αὐτοϋιός → αὐ τ ο *ϋι* ό ς
					if character == diaeresis and prev_info.vowel then
						vowel_count, prev_vowel_info = 1, prev_info
					end
				end
			elseif prev_info == breathy_cons_t then
				-- After Ϝ, ϝ, Ρ or ρ only a breathing is accepted silently; any
				-- other diacritic is reported in the debug log.
				if curr_info ~= breathing_t then
					mw.log(("The character %s in %s should not have the accent %s on it."):format(
						prev, text, require("Module:grc-utilities/templates").addDottedCircle(character)))
				end
			elseif prev then
				-- prev can be nil when passed a bare diacritic (as in Translingual diacritic entries)
				mw.log("The character " .. prev .. " cannot have a diacritic on it.")
			end
		else
			-- Any other character (consonant, punctuation, unrecognized) gets a
			-- token of its own and ends the current vowel run.
			vowel_count = 0
			if prev then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		end
		prev = character
		prev_info = curr_info
	end
	return tokens
end

-- Memoized results of make_tokens, keyed by the NFD form of the input.
local cache = {}

-- Split `text` into tokens (see make_tokens). The text is decomposed first and
-- the token array is cached per decomposed string, so repeated calls with
-- equivalent input return the very same table.
function export.tokenize(text)
	local decomposed = toNFD(text)
	local tokens = cache[decomposed]
	if not tokens then
		tokens = make_tokens(decomposed)
		cache[decomposed] = tokens
	end
	return tokens
end

--[=[	Places diacritics in the following order:
			1. breathings or diaeresis
			2. acute, circumflex, or grave
			3. macron or breve
			4. iota subscript
		Used by [[Module:grc-pronunciation]].

		The text is first passed through standardDiacritics. Reordering, and
		the replacement of combining macron and breve by their spacing
		counterparts, only happen when the text matches groups[1]; otherwise
		the standardized text is returned unchanged apart from composition.
		The result is always returned in form NFC.		]=]
function export.pronunciationOrder(text)
	local result = standard_diacritics(text)
	if not match(result, groups[1]) then
		return toNFC(result)
	end

	-- Rebuild one run of diacritics, taking at most one match per group.
	local function reorder(sequence)
		-- Put breathing and diaeresis first, then accents, then macron or breve
		local breathing_or_diaeresis = match(sequence, groups[2]) or ""
		local accent = match(sequence, groups[3]) or ""
		local length_mark = match(sequence, groups[1]) or ""
		local last_group = match(sequence, groups[4]) or ""
		return concat{ breathing_or_diaeresis, accent, length_mark, last_group }
	end

	result = gsub(result, diacritic .. diacritic .. "+", reorder)
	-- combining to spacing macron
	result = gsub(result, macron, spacing_macron)
	-- combining to spacing breve
	result = gsub(result, breve, spacing_breve)
	return toNFC(result)
end

return export