Module:lt-common

From Linguifex
Jump to navigation Jump to search

Documentation for this module may be created at Module:lt-common/doc

local export = {}

local m_str_utils = require("Module:string utilities")
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local ulower = m_str_utils.lower
local uupper = m_str_utils.upper
local ufind = m_str_utils.find
local ulen = m_str_utils.len
local ucodepoint = m_str_utils.codepoint

-- Keep native Unicode normalization functions (no replacement available)
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD

-- =============================================================================
-- Unicode constants
-- =============================================================================

local GRAVE  = u(0x0300)  -- combining grave accent
local ACUTE  = u(0x0301)  -- combining acute accent
local TILDE  = u(0x0303)  -- combining tilde
local MACRON = u(0x0304)  -- combining macron
local DOTABOVE = u(0x0307)  -- combining dot above
local CARON  = u(0x030C)  -- combining caron
local OGONEK = u(0x0328)  -- combining ogonek

-- Character class matching any of the three accent marks (grave, acute,
-- tilde). Only these are stripped/checked by the accent helpers below; the
-- other combining marks (ogonek, macron, dot above, caron) are treated as
-- orthographic and are recomposed rather than removed.
local ANY_ACCENT = "[" .. GRAVE .. ACUTE .. TILDE .. "]"

-- Legacy aliases for backward compatibility
-- (the lowercase names are still referenced by the helper functions below).
local grave = GRAVE
local acute = ACUTE
local tilde = TILDE
local macron = MACRON
local dotabove = DOTABOVE
local caron = CARON
local ogonek = OGONEK
local accents = ANY_ACCENT

-- =============================================================================
-- Internal helper functions
-- =============================================================================

-- Map the dotless soft-dotted letters to their standard dotted forms.
local dotless_to_dotted = { ["ı"] = "i", ["ȷ"] = "j" }

-- Return `base` — converted to its dotted form when it is a dotless i/j —
-- followed by the captured combining mark(s) in `below`.
local function char_to_dotted_form(base, below)
	local dotted = dotless_to_dotted[base]
	if dotted == nil then
		dotted = base
	end
	return dotted .. below
end

local function normalize_dotted_chars(text)
	-- Remove any combining dot above following i/ı/j/ȷ (an optional ogonek
	-- may sit between them), converting dotless forms to dotted ones.
	-- On entry, text must be in NFD form.
	-- Fix: the extra parentheses truncate ugsub's (string, count) multiple
	-- return to just the string, so callers (e.g. stripped_text_form, which
	-- forwards this return value into toNFC) no longer receive a stray
	-- substitution count as a second value.
	return (ugsub(text, "([iıjȷ])(" .. ogonek .. "?)" .. dotabove, char_to_dotted_form))
end

-- gsub callback: rebuild an i/j (plus optional combining mark in `below`)
-- that is followed by an accent, restoring the explicit dot above.
local function char_to_accent_form(base, below)
	-- A dotless letter combining with an accent should not occur here; if it
	-- does, fold it to the dotted form so that it normalizes properly.
	if base ~= "i" and base ~= "j" then
		return char_to_dotted_form(base, below)
	end
	-- Soft-dotted i/j: reinsert the combining dot above after any other
	-- combining mark, so the following accent stacks on top of the dot.
	return base .. below .. dotabove
end

-- Produce the accentless, dot-normalized form of `text`.
-- The result remains decomposed (NFD); callers recompose with toNFC.
local function stripped_text_form(text)
	-- Decompose first so accents are separate combining characters, then
	-- delete every run of accent marks.
	local decomposed = toNFD(text)
	local accentless = ugsub(decomposed, accents .. "+", "")
	-- Finally fold dotless letters / stray dots above to the standard forms.
	return normalize_dotted_chars(accentless)
end

-- =============================================================================
-- Input validation
-- =============================================================================

-- Reject Private Use Area characters (U+E000–U+F8FF).
-- Reject Private Use Area characters (U+E000–U+F8FF).
-- Raises an error naming the first offending codepoint; returns nothing.
-- Improvement: scan with a single character-class ufind instead of calling
-- ucodepoint once per character, which is O(n^2) on long strings because each
-- ustring codepoint lookup must locate the character's byte offset.
function export.reject_pua(s)
	if not s then return end
	-- Character position of the first PUA character, or nil if none.
	local pos = ufind(s, "[" .. u(0xE000) .. "-" .. u(0xF8FF) .. "]")
	if pos then
		local cp = ucodepoint(s, pos)
		error(string.format(
			"lt-common: private use area character U+%04X detected in \"%s\". " ..
			"Please use a standard Unicode character instead.", cp, s))
	end
end

-- =============================================================================
-- Input normalization
-- =============================================================================

-- Detect nonstandard encoding patterns in the input.
-- Returns: dotless_flag (found ı/ȷ), precomp_i_flag (found precomposed í/ì/ĩ)
-- Detect nonstandard encoding patterns in the input.
-- Returns: dotless_flag (found ı/ȷ), precomp_i_flag (found precomposed í/ì/ĩ)
function export.detect_nonstandard(s)
	if not s then return false, false end
	-- Dotless ı/ȷ have no decomposition, so they survive NFD and can be
	-- detected in the decomposed string.
	local nfd_s = toNFD(s)
	local dotless_flag = ufind(nfd_s, "[ıȷ]") ~= nil
	-- Bug fix: precomposed í/ì/ĩ must be looked for in the *raw* input.
	-- toNFD splits them into i + combining accent, so searching the NFD string
	-- for these (NFC-encoded) literals could never match and the flag was
	-- always false.
	local precomp_i_flag = ufind(s, "[íìĩ]") ~= nil
	return dotless_flag, precomp_i_flag
end

-- Normalize input to clean canonical NFC.
-- Handles dotless i/j (ı, ȷ) and stray dot-above combinations.
-- Normalize input to clean canonical NFC.
-- Handles dotless i/j (ı, ȷ) and stray dot-above combinations.
function export.canonicalize_input(s)
	if not s then return s end

	-- Work in NFD so combining marks are separate characters.
	local nfd = toNFD(s)

	-- Drop a stray combining dot above that follows i/ı/j/ȷ (an optional
	-- ogonek may sit between them), mapping dotless letters to dotted ones.
	nfd = ugsub(nfd, "([iıjȷ])(" .. OGONEK .. "?)" .. DOTABOVE, function(letter, marks)
		if letter == "ı" then
			letter = "i"
		elseif letter == "ȷ" then
			letter = "j"
		end
		return letter .. marks
	end)

	-- Fold any remaining bare dotless letters to the standard forms.
	nfd = ugsub(nfd, "ı", "i")
	nfd = ugsub(nfd, "ȷ", "j")

	-- Recompose into canonical NFC.
	return toNFC(nfd)
end

-- =============================================================================
-- Partial NFD conversion (stem_ac representation)
-- =============================================================================

-- Convert canonical NFC to partial NFD (stem_ac).
-- Applies full NFD, then recomposes non-accent diacritics.
-- Only grave/acute/tilde remain as combining characters.
function export.to_stem_ac(s)
	if not s then return s end
	s = toNFD(s)
	
	-- Recompose ogonek vowels
	s = ugsub(s, "a" .. OGONEK, "ą")
	s = ugsub(s, "e" .. OGONEK, "ę")
	s = ugsub(s, "i" .. OGONEK, "į")
	s = ugsub(s, "u" .. OGONEK, "ų")
	
	-- Recompose macron vowel
	s = ugsub(s, "u" .. MACRON, "ū")
	
	-- Recompose dot-above e
	s = ugsub(s, "e" .. DOTABOVE, "ė")
	
	-- Recompose caron consonants
	s = ugsub(s, "c" .. CARON, "č")
	s = ugsub(s, "s" .. CARON, "š")
	s = ugsub(s, "z" .. CARON, "ž")
	
	return s
end

-- =============================================================================
-- Accent manipulation
-- =============================================================================

-- Strip all accent marks (grave/acute/tilde) from partial NFD text.
-- Strip all accent marks (grave/acute/tilde) from partial NFD text.
-- Fix: the extra parentheses truncate ugsub's (string, count) multiple
-- return, so callers no longer receive a stray substitution count as a
-- second value (which would surface inside table constructors or when this
-- call is the last expression in an argument list).
function export.to_stem_bare(stem_ac)
	if not stem_ac then return stem_ac end
	return (ugsub(stem_ac, ANY_ACCENT, ""))
end

-- Check if partial NFD text contains any accent marks.
-- Check if partial NFD text contains any accent marks.
-- Fix: every other export guards against nil input, but this one would raise
-- from ufind(nil, ...); treat nil as "no accent" for consistency.
function export.has_accent(stem_ac)
	if not stem_ac then return false end
	return ufind(stem_ac, ANY_ACCENT) ~= nil
end

-- =============================================================================
-- Complete input pipeline
-- =============================================================================

-- Process raw user input through the complete normalization pipeline.
-- Returns: stem_bare, stem_ac, dotless_flag, precomp_flag
-- Process raw user input through the complete normalization pipeline.
-- Returns: stem_bare, stem_ac, dotless_flag, precomp_flag
function export.process_input(raw)
	if not raw then return raw, raw, false, false end

	-- Validation first: PUA characters abort with an error.
	export.reject_pua(raw)

	-- Record nonstandard-encoding diagnostics before normalizing them away.
	local has_dotless, has_precomp = export.detect_nonstandard(raw)

	-- Canonical NFC → partial-NFD accented stem → accentless stem.
	local accented = export.to_stem_ac(export.canonicalize_input(raw))
	local bare = export.to_stem_bare(accented)

	return bare, accented, has_dotless, has_precomp
end

-- =============================================================================
-- Display and text processing
-- =============================================================================

-- Build the display form of `text`: an accented i/j gets an explicit
-- combining dot above beneath the accent. `lang` and `sc` are accepted for
-- the display-text handler interface but are unused here.
function export.makeDisplayText(text, lang, sc)
	if not text then return text end
	-- Normalize dotless characters and dot-above diacritics (while retaining accents).
	text = normalize_dotted_chars(toNFD(text))
	-- Add a 'dot above' between "i" or "j" and an accent.
	-- `%f` followed by the accent character class is Lua's frontier pattern:
	-- it matches at the boundary where an accent begins without consuming it,
	-- so only an i/j (plus optional ogonek) standing immediately before an
	-- accent is rewritten by char_to_accent_form.
	text = ugsub(text, "([iıjȷ])(" .. ogonek .. "?)%f" .. accents, char_to_accent_form)
	return toNFC(text)
end

-- Called from [[Module:languages]] since [[Module:lt-common]] is set as the stripDiacritics handler in
-- [[Module:languages/data/2]].
-- `lang` and `sc` are part of the handler interface and are unused here.
function export.stripDiacritics(text, lang, sc)
	if not text then return text end
	-- Strip accents / normalize dots, then recompose to canonical NFC.
	local stripped = stripped_text_form(text)
	return toNFC(stripped)
end

-- Replacements applied when building sort keys. Combining diacritics are
-- mapped to Private Use Area characters so a marked letter sorts as its base
-- letter followed by a high-valued tag, i.e. immediately after the unmarked
-- letter rather than colliding with it.
-- "y" becomes "i" plus a PUA tag so it collates with i/į.
-- NOTE(review): the tag ordering implies i < į < y, which looks like the
-- intended Lithuanian alphabet order — confirm against the expected collation.
local sortkey_substitutes = {
	[ogonek] = u(0xF000),
	[caron] = u(0xF001),
	[macron] = u(0xF002),
	[dotabove] = u(0xF003),
	["y"] = "i" .. u(0xF004),
}

-- Build a category sort key for `text`. `lang` and `sc` are accepted for the
-- sortkey handler interface but are unused here.
function export.makeSortKey(text, lang, sc)
	if not text then return text end
	-- Normalize to the stripped-text form and convert diacritics to Private Use 
	-- Area characters so they sort after all other characters.
	-- stripped_text_form leaves the string decomposed (NFD), so the combining
	-- marks keyed in sortkey_substitutes can each be matched on their own.
	-- The byte-level pattern ".[\128-\191]*" (plain string.gsub, not ugsub)
	-- walks the string one UTF-8 sequence at a time — a lead byte followed by
	-- its continuation bytes — and the table replacement substitutes matching
	-- sequences in a single pass, leaving everything else untouched.
	text = stripped_text_form(ulower(text))
		:gsub(".[\128-\191]*", sortkey_substitutes)
	return toNFC(uupper(text))
end

return export