Module:grc-utilities: Difference between revisions
Jump to navigation
Jump to search
Created page with "local export = {} local m_script_utils = require("Module:script utilities") local m_links = require("Module:links") local lang = require("Module:languages").getByCode("grc")..." |
No edit summary |
||
| Line 1: | Line 1: | ||
local export = {} | local export = {} | ||
local | local m_data = mw.loadData("Module:grc-utilities/data") | ||
local | local m_string_utils = require("Module:string utilities") | ||
local concat = table.concat | |||
local full_link = require("Module:links").full_link | |||
local gsub = m_string_utils.gsub | |||
local match = m_string_utils.match | |||
local insert = table.insert | |||
local sparseConcat = require("Module:table").sparseConcat | |||
local standard_diacritics -- defined below | |||
local tag_text = require("Module:script utilities").tag_text | |||
local toNFC = mw.ustring.toNFC | |||
local toNFD = mw.ustring.toNFD | |||
local lang = require("Module:languages").getByCode("grc") | local lang = require("Module:languages").getByCode("grc") | ||
local sc = require("Module:scripts").getByCode(" | local sc = require("Module:scripts").getByCode("Polyt") | ||
local groups = m_data.groups | local groups = m_data.groups | ||
local canonical = m_data.canonical | |||
local diacritic_order = m_data.diacritic_order | local diacritic_order = m_data.diacritic_order | ||
local | local diacritical_conversions = m_data.diacritical_conversions | ||
local diacritics = m_data.diacritics | local diacritics = m_data.diacritics | ||
local diacritic = m_data.diacritic | local diacritic = m_data.diacritic | ||
| Line 25: | Line 37: | ||
local combining_diacritic = m_data.combining_diacritic | local combining_diacritic = m_data.combining_diacritic | ||
local UTF8_char = " | local UTF8_char = ".[\128-\191]*" | ||
local basic_Greek = "[\206-\207][\128-\191]" -- excluding first line of Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ | local basic_Greek = "[\206-\207][\128-\191]" -- excluding first line of Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ | ||
local info = {} | local info = {} | ||
| Line 43: | Line 47: | ||
local upsilon_t = { vowel = true, offglide = true } | local upsilon_t = { vowel = true, offglide = true } | ||
-- These don't need any contents. | -- These don't need any contents. | ||
local | local breathy_cons_t = {} | ||
-- local consonant_t = {} | -- local consonant_t = {} | ||
local diacritic_t = { diacritic = true } | local diacritic_t = { diacritic = true } | ||
| Line 51: | Line 55: | ||
local function add_info(characters, t) | local function add_info(characters, t) | ||
if type(characters) == "string" then | if type(characters) == "string" then | ||
for character in | for character in characters:gmatch(UTF8_char) do | ||
info[character] = t | info[character] = t | ||
end | end | ||
else | else | ||
for | for _, character in ipairs(characters) do | ||
info[character] = t | info[character] = t | ||
end | end | ||
| Line 71: | Line 75: | ||
add_info("Ιι", iota_t) | add_info("Ιι", iota_t) | ||
add_info("Υυ", upsilon_t) | add_info("Υυ", upsilon_t) | ||
add_info("ϜϝΡρ", breathy_cons_t) | |||
local not_recognized = {} | local not_recognized = {} | ||
setmetatable(info, { __index = | setmetatable(info, { __index = function(t, key) | ||
return not_recognized | |||
end}) | |||
}) | |||
-- Perform a function on each Unicode character in a string. | -- Perform a function on each Unicode character in a string. | ||
local function forEach(str, func) | local function forEach(str, func) | ||
for char in | for char in str:gmatch(UTF8_char) do | ||
func(char) | func(char) | ||
end | end | ||
end | end | ||
function export.tag(term, face) | function export.tag(term, face) | ||
return | return tag_text(term, lang, sc, face) | ||
end | end | ||
function export.link(term, face, alt, tr) | function export.link(term, face, alt, tr) | ||
return | return full_link({ term = term, alt = alt, lang = lang, sc = sc, tr = tr }, face) | ||
end | end | ||
-- Convert spacing to combining diacritics, and nonstandard to standard polytonic Greek. | |||
return | function export.standardDiacritics(text) | ||
return toNFD((toNFD(text):gsub(UTF8_char, diacritical_conversions))) | |||
end | end | ||
standard_diacritics = export.standardDiacritics | |||
-- Convert | -- Convert variant letter forms to the canonical form, and decompose. | ||
function export. | function export.canonicalize(text) | ||
text = | text = standard_diacritics(text) | ||
-- Compose, since the characters in `canonical` are in form NFC. | |||
text = text:gsub(UTF8_char, | text = toNFC(text):gsub(UTF8_char .. grave, canonical) -- for ϗ̀ | ||
:gsub(UTF8_char, canonical) | |||
return text | -- Decompose on return. | ||
return toNFD(text) | |||
end | end | ||
| Line 157: | Line 137: | ||
-- The following might have odd results when there | -- The following might have odd results when there | ||
-- are three or more diacritics. | -- are three or more diacritics. | ||
insert(output, index, diacritic) | |||
-- [[Special:WhatLinksHere/Wiktionary:Tracking/grc-utils/too many diacritics]] | |||
require("Module:debug").track("grc-utils/too many diacritics") | |||
--[[ | --[[ | ||
local m_templates = require("Module:grc-utilities/templates") | local m_templates = require("Module:grc-utilities/templates") | ||
| Line 172: | Line 154: | ||
function export.reorderDiacritics(text) | function export.reorderDiacritics(text) | ||
return (gsub(toNFD(text), combining_diacritic .. combining_diacritic .. "+", reorderDiacriticSequence)) | |||
return (gsub( | |||
end | end | ||
| Line 186: | Line 164: | ||
local function make_tokens(text) | local function make_tokens(text) | ||
local tokens, prev_info = {}, {} | local tokens, prev_info = {}, {} | ||
local token_i, vowel_count = 1, 0 -- Vowel count tracks . | local token_i, vowel_count = 1, 0 -- Vowel count tracks. | ||
local prev | local prev, prev_vowel_info | ||
for character in | for character in text:gmatch(UTF8_char) do | ||
local curr_info = info[character] | local curr_info = info[character] | ||
-- Split vowels between tokens if not a diphthong. | -- Split vowels between tokens if not a diphthong. | ||
if curr_info.vowel then | if curr_info.vowel then | ||
vowel_count = vowel_count + 1 | vowel_count = vowel_count + 1 | ||
if | if vowel_count == 2 and curr_info.offglide and not ( | ||
prev_vowel_info == iota_t or -- ιι → ι, ι; ιυ → ι, υ | |||
prev_vowel_info == upsilon_t and curr_info == upsilon_t -- υυ → υ, υ | |||
) then | |||
vowel_count, prev_vowel_info = 0, nil | |||
elseif prev then | |||
token_i = token_i + 1 | token_i = token_i + 1 | ||
vowel_count, prev_vowel_info = 1, curr_info | |||
else | |||
vowel_count, prev_vowel_info = 1, curr_info | |||
vowel_count = | |||
end | end | ||
tokens[token_i] = (tokens[token_i] or "") .. character | tokens[token_i] = (tokens[token_i] or "") .. character | ||
elseif curr_info.diacritic then | elseif curr_info.diacritic then | ||
vowel_count = 0 | vowel_count, prev_vowel_info = 0, nil | ||
tokens[token_i] = (tokens[token_i] or "") .. character | tokens[token_i] = (tokens[token_i] or "") .. character | ||
if prev_info.diacritic or prev_info.vowel then | if prev_info and (prev_info.diacritic or prev_info.vowel) then | ||
if character == diaeresis then | if character == diaeresis or character == subscript then | ||
-- Split the diphthong in the current token if a diaeresis | -- Split the diphthong in the current token if a diaeresis or subscript | ||
-- the first letter, then the second letter plus any diacritics. | -- was found: the first letter, then the second letter plus any diacritics. | ||
local previous_vowel, vowel_with_diaeresis = | local previous_vowel, vowel_with_diaeresis = tokens[token_i]:match("^(" .. basic_Greek .. ")(" .. basic_Greek .. ".+)") | ||
if previous_vowel then | if previous_vowel then | ||
tokens[token_i], tokens[token_i + 1] = previous_vowel, vowel_with_diaeresis | tokens[token_i], tokens[token_i + 1] = previous_vowel, vowel_with_diaeresis | ||
| Line 227: | Line 203: | ||
' couldn’t be split because it does not consist of two Basic Greek characters followed by other characters.') | ' couldn’t be split because it does not consist of two Basic Greek characters followed by other characters.') | ||
--]] | --]] | ||
end | |||
-- If there is only a diaeresis, it could still be the first vowel of a | |||
-- diphthong: | |||
-- αὐτοϋιός → αὐ τ ο *ϋι* ό ς | |||
if character == diaeresis and prev_info.vowel then | |||
vowel_count, prev_vowel_info = 1, prev_info | |||
end | end | ||
end | end | ||
elseif prev_info == | elseif prev_info == breathy_cons_t then | ||
if curr_info ~= breathing_t then | if curr_info ~= breathing_t then | ||
mw.log( | mw.log(("The character %s in %s should not have the accent %s on it."):format( | ||
prev, text, require("Module:grc-utilities/templates").addDottedCircle(character))) | prev, text, require("Module:grc-utilities/templates").addDottedCircle(character))) | ||
end | end | ||
elseif prev then | |||
-- prev can be nil when passed a bare diacritic (as in Translingual diacritic entries) | |||
mw.log("The character " .. prev .. " cannot have a diacritic on it.") | mw.log("The character " .. prev .. " cannot have a diacritic on it.") | ||
end | end | ||
| Line 252: | Line 235: | ||
local cache = {} | local cache = {} | ||
function export.tokenize(text) | function export.tokenize(text) | ||
text = toNFD(text) | |||
if not cache[ | if not cache[text] then | ||
cache[ | cache[text] = make_tokens(text) | ||
end | end | ||
return cache[ | return cache[text] | ||
end | end | ||
| Line 266: | Line 249: | ||
Used by [[Module:grc-pronunciation]]. ]=] | Used by [[Module:grc-pronunciation]]. ]=] | ||
function export.pronunciationOrder(text) | function export.pronunciationOrder(text) | ||
text = | text = standard_diacritics(text) | ||
if match(text, groups[1]) then | |||
if | |||
text = gsub(text, | text = gsub(text, | ||
diacritic .. diacritic .. "+", | diacritic .. diacritic .. "+", | ||
function(sequence) | function(sequence) | ||
-- Put breathing and diaeresis first, then accents, then macron or breve | -- Put breathing and diaeresis first, then accents, then macron or breve | ||
return | return concat{ | ||
match(sequence, groups[2]) or "", | match(sequence, groups[2]) or "", | ||
match(sequence, groups[3]) or "", | match(sequence, groups[3]) or "", | ||
| Line 281: | Line 262: | ||
} | } | ||
end) | end) | ||
text = gsub(text, macron, spacing_macron) -- combining to spacing macron | text = gsub(text, macron, spacing_macron) -- combining to spacing macron | ||
text = gsub(text, breve, spacing_breve) -- combining to spacing breve | text = gsub(text, breve, spacing_breve) -- combining to spacing breve | ||
end | end | ||
return toNFC(text) | return toNFC(text) | ||
end | end | ||
return export | return export | ||
Latest revision as of 14:27, 7 May 2026
- This module lacks a documentation subpage. Please create it.
- Useful links: subpage list • links • transclusions • testcases • sandbox
local export = {}
local m_data = mw.loadData("Module:grc-utilities/data")
local m_string_utils = require("Module:string utilities")
local concat = table.concat
local full_link = require("Module:links").full_link
local gsub = m_string_utils.gsub
local match = m_string_utils.match
local insert = table.insert
local sparseConcat = require("Module:table").sparseConcat
local standard_diacritics -- defined below
local tag_text = require("Module:script utilities").tag_text
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local lang = require("Module:languages").getByCode("grc")
local sc = require("Module:scripts").getByCode("Polyt")
local groups = m_data.groups
local canonical = m_data.canonical
local diacritic_order = m_data.diacritic_order
local diacritical_conversions = m_data.diacritical_conversions
local diacritics = m_data.diacritics
local diacritic = m_data.diacritic
local macron = diacritics.macron
local breve = diacritics.breve
local spacing_macron = diacritics.spacing_macron
local spacing_breve = diacritics.spacing_breve
local rough = diacritics.rough
local smooth = diacritics.smooth
local diaeresis = diacritics.diaeresis
local acute = diacritics.acute
local grave = diacritics.grave
local circumflex = diacritics.circum
local subscript = diacritics.subscript
local combining_diacritic = m_data.combining_diacritic
-- Lua byte pattern for one UTF-8 encoded character: any single byte followed
-- by zero or more bytes in the continuation-byte range 128-191.
local UTF8_char = ".[\128-\191]*"
-- Lua byte pattern for a two-byte UTF-8 sequence whose lead byte is 206 or 207.
local basic_Greek = "[\206-\207][\128-\191]" -- excluding first line of Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ
-- Maps a single character (as a string) to the table describing it.
-- Populated through add_info.
local info = {}
-- The tables are shared among different characters so that they can be checked
-- for equality if needed, and to use less space.
local vowel_t = { vowel = true }
-- iota_t and upsilon_t have the same fields but are distinct tables, so that
-- make_tokens can tell iota from upsilon by comparing table identity.
local iota_t = { vowel = true, offglide = true }
local upsilon_t = { vowel = true, offglide = true }
-- These don't need any contents.
-- breathy_cons_t is only ever compared by identity (see make_tokens).
local breathy_cons_t = {}
-- local consonant_t = {}
local diacritic_t = { diacritic = true }
-- Needed for equality comparisons.
-- Same fields as diacritic_t, but a separate table so breathings can be
-- distinguished from other diacritics by identity.
local breathing_t = { diacritic = true }
-- Register the description table `t` in `info` for every character given.
-- `characters` is either a string, which is split into UTF-8 characters,
-- or an array of single-character strings.
local function add_info(characters, t)
	if type(characters) ~= "string" then
		-- Array form: each element is already one character.
		for _, single in ipairs(characters) do
			info[single] = t
		end
		return
	end
	-- String form: walk the string one UTF-8 character at a time.
	for single in characters:gmatch(UTF8_char) do
		info[single] = t
	end
end
-- Diacritics other than breathings.
add_info({ macron, breve, diaeresis, acute, grave, circumflex, subscript }, diacritic_t)
-- Breathings share their own table so they can be recognised by identity.
add_info({ rough, smooth }, breathing_t)
-- Vowels that are not offglides.
add_info("ΑΕΗΟΩαεηοω", vowel_t)
-- Iota and upsilon each get their own table (vowel + offglide).
add_info("Ιι", iota_t)
add_info("Υυ", upsilon_t)
-- Consonants checked in make_tokens for carrying a breathing.
add_info("ϜϝΡρ", breathy_cons_t)

-- Any character without an entry in `info` resolves to this shared empty
-- table, so field lookups such as `info[c].vowel` yield nil instead of erroring.
local not_recognized = {}
local function unrecognized_info()
	return not_recognized
end
setmetatable(info, { __index = unrecognized_info })
-- Call `func` once for every UTF-8 character of `str`, in order,
-- passing the character as its only argument.
local function forEach(str, func)
	local next_char = str:gmatch(UTF8_char)
	local current = next_char()
	while current do
		func(current)
		current = next_char()
	end
end
-- Tag `term` by delegating to tag_text (from Module:script utilities),
-- passing this module's language object ("grc"), script object ("Polyt")
-- and the caller-supplied `face`. Whatever tag_text returns is returned
-- unchanged.
function export.tag(term, face)
return tag_text(term, lang, sc, face)
end
-- Build a link to `term` by delegating to full_link (from Module:links).
-- `alt` and `tr` are forwarded in the link data together with this module's
-- language ("grc") and script ("Polyt") objects; `face` is passed through
-- as the second argument.
function export.link(term, face, alt, tr)
	local link_data = {
		term = term,
		alt = alt,
		lang = lang,
		sc = sc,
		tr = tr,
	}
	return full_link(link_data, face)
end
-- Convert spacing to combining diacritics, and nonstandard to standard
-- polytonic Greek. The text is decomposed (NFD), each UTF-8 character is
-- looked up in `diacritical_conversions` and replaced when an entry exists,
-- and the result is decomposed once more before being returned.
function export.standardDiacritics(text)
	local decomposed = toNFD(text)
	-- Only the substituted string is kept; gsub's replacement count is dropped.
	local converted = decomposed:gsub(UTF8_char, diacritical_conversions)
	return toNFD(converted)
end
-- Fill in the forward-declared local alias used by the functions below.
standard_diacritics = export.standardDiacritics
-- Convert variant letter forms to the canonical form, and decompose.
function export.canonicalize(text)
	local standardized = standard_diacritics(text)
	-- Compose first, since the characters in `canonical` are in form NFC.
	local composed = toNFC(standardized)
	-- First pass: a character followed by a combining grave (for ϗ̀).
	composed = composed:gsub(UTF8_char .. grave, canonical)
	-- Second pass: single characters.
	composed = composed:gsub(UTF8_char, canonical)
	-- Hand the result back in decomposed form.
	return toNFD(composed)
end
--[=[ Arranges a run of diacritics in the following order:
	1. macron or breve
	2. breathings or diaeresis
	3. acute, circumflex, or grave
	4. iota subscript
Used by [[Module:typing-aids]].
Each diacritic is placed at the position given by `diacritic_order`.
When that position is already occupied, no error is raised: the newcomer
is inserted at that position (one further along for a breve, so that it
follows a macron) and the tracking page
[[Special:WhatLinksHere/Wiktionary:Tracking/grc-utils/too many diacritics]]
is triggered.
]=]
local function reorderDiacriticSequence(diacritics)
	local slots = {}

	local function place(mark)
		local position = diacritic_order[mark]
		if slots[position] then
			-- Position already taken by another diacritic.
			-- Place breve after macron.
			if mark == breve then
				position = position + 1
			end
			-- The following might have odd results when there
			-- are three or more diacritics.
			insert(slots, position, mark)
			require("Module:debug").track("grc-utils/too many diacritics")
		else
			slots[position] = mark
		end
	end

	forEach(diacritics, place)
	return sparseConcat(slots)
end
-- Decompose `text` and pass every run of two or more combining diacritics
-- through reorderDiacriticSequence. Returns only the resulting string.
function export.reorderDiacritics(text)
	local decomposed = toNFD(text)
	local diacritic_run = combining_diacritic .. combining_diacritic .. "+"
	local reordered = gsub(decomposed, diacritic_run, reorderDiacriticSequence)
	return reordered
end
--[=[
This breaks a word into meaningful "tokens", which are
individual letters or diphthongs with their diacritics.
Used by [[Module:grc-accent]] and [[Module:grc-pronunciation]].

`text` is walked one UTF-8 character at a time. export.tokenize passes it in
decomposed (NFD) form, so a diacritic arrives as its own character after the
letter it belongs to. Returns an array of token strings.
--]=]
local function make_tokens(text)
-- prev_info starts as an empty table so its fields can be read before any
-- character has been processed.
local tokens, prev_info = {}, {}
-- token_i: index of the token currently being built.
-- vowel_count: number of vowel letters collected towards a possible diphthong
-- in the current token; reset by diacritics and by non-vowel characters.
local token_i, vowel_count = 1, 0
-- prev: the previous character (nil on the first iteration).
-- prev_vowel_info: info table of the vowel that may start a diphthong.
local prev, prev_vowel_info
for character in text:gmatch(UTF8_char) do
local curr_info = info[character]
-- Split vowels between tokens if not a diphthong.
if curr_info.vowel then
vowel_count = vowel_count + 1
-- A second vowel that can be an offglide (ι or υ) stays in the same token
-- as the vowel before it, except in the combinations listed below.
if vowel_count == 2 and curr_info.offglide and not (
prev_vowel_info == iota_t or -- ιι → ι, ι; ιυ → ι, υ
prev_vowel_info == upsilon_t and curr_info == upsilon_t -- υυ → υ, υ
) then
-- Diphthong complete: a following vowel must start a new token.
vowel_count, prev_vowel_info = 0, nil
elseif prev then
-- Not a diphthong: this vowel begins a new token.
token_i = token_i + 1
vowel_count, prev_vowel_info = 1, curr_info
else
-- Very first character of the text: it goes into token 1.
vowel_count, prev_vowel_info = 1, curr_info
end
tokens[token_i] = (tokens[token_i] or "") .. character
elseif curr_info.diacritic then
-- A diacritic is appended to the current token and ends diphthong tracking.
vowel_count, prev_vowel_info = 0, nil
tokens[token_i] = (tokens[token_i] or "") .. character
if prev_info and (prev_info.diacritic or prev_info.vowel) then
if character == diaeresis or character == subscript then
-- Split the diphthong in the current token if a diaeresis or subscript
-- was found: the first letter, then the second letter plus any diacritics.
local previous_vowel, vowel_with_diaeresis = tokens[token_i]:match("^(" .. basic_Greek .. ")(" .. basic_Greek .. ".+)")
if previous_vowel then
tokens[token_i], tokens[token_i + 1] = previous_vowel, vowel_with_diaeresis
token_i = token_i + 1
else
-- The vowel preceding the vowel with the diaeresis will already be
-- placed in the previous token if it has a diacritic:
-- Περικλῆῐ̈ → Π ε ρ ι κ λ ῆ ῐ̈
--[[
mw.log('Diaeresis was found in ' .. text .. ', but the previous token ' ..
require("Module:Unicode data").add_dotted_circle(tokens[token_i]) ..
' couldn’t be split because it does not consist of two Basic Greek characters followed by other characters.')
--]]
end
-- If there is only a diaeresis, it could still be the first vowel of a
-- diphthong:
-- αὐτοϋιός → αὐ τ ο *ϋι* ό ς
if character == diaeresis and prev_info.vowel then
vowel_count, prev_vowel_info = 1, prev_info
end
end
elseif prev_info == breathy_cons_t then
-- Characters registered with breathy_cons_t (ϜϝΡρ) are only expected to
-- carry a breathing; any other diacritic is logged.
if curr_info ~= breathing_t then
mw.log(("The character %s in %s should not have the accent %s on it."):format(
prev, text, require("Module:grc-utilities/templates").addDottedCircle(character)))
end
elseif prev then
-- prev can be nil when passed a bare diacritic (as in Translingual diacritic entries)
mw.log("The character " .. prev .. " cannot have a diacritic on it.")
end
else
-- Any other character (neither vowel nor diacritic) is a token of its own,
-- unless it is the very first character, which goes into token 1.
vowel_count = 0
if prev then
token_i = token_i + 1
end
tokens[token_i] = (tokens[token_i] or "") .. character
end
-- Remember this character for the next iteration.
prev = character
prev_info = curr_info
end
return tokens
end
-- Token arrays already computed, keyed by the decomposed (NFD) input text.
local cache = {}

-- Return the token array for `text` (see make_tokens). The text is decomposed
-- first; results are memoized, so repeated calls with equivalent text return
-- the same table.
function export.tokenize(text)
	local key = toNFD(text)
	local tokens = cache[key]
	if not tokens then
		tokens = make_tokens(key)
		cache[key] = tokens
	end
	return tokens
end
--[=[ Places diacritics in the following order:
	1. breathings or diaeresis
	2. acute, circumflex, or grave
	3. macron or breve
	4. iota subscript
Used by [[Module:grc-pronunciation]].
Reordering only happens when the text matches `groups[1]`; in that case
combining macrons and breves are afterwards replaced by their spacing
counterparts. The result is always returned in composed (NFC) form. ]=]
function export.pronunciationOrder(text)
	text = standard_diacritics(text)

	-- Nothing matching groups[1]: only composition is needed.
	if not match(text, groups[1]) then
		return toNFC(text)
	end

	-- Rebuild one run of diacritics, taking at most one match per group:
	-- breathing and diaeresis first, then accents, then macron or breve.
	local function rearrange(sequence)
		local second = match(sequence, groups[2]) or ""
		local third = match(sequence, groups[3]) or ""
		local first = match(sequence, groups[1]) or ""
		local fourth = match(sequence, groups[4]) or ""
		return concat{ second, third, first, fourth }
	end

	text = gsub(text, diacritic .. diacritic .. "+", rearrange)
	text = gsub(text, macron, spacing_macron) -- combining to spacing macron
	text = gsub(text, breve, spacing_breve) -- combining to spacing breve
	return toNFC(text)
end
return export