Module:en-utilities
Documentation for this module may be created at Module:en-utilities/doc
local export = {}
local add_suffix -- Defined below.
local find = string.find
local match = string.match
local reverse = string.reverse
local sub = string.sub
local toNFD = mw.ustring.toNFD
local ugsub = mw.ustring.gsub
local ulower = mw.ustring.lower
local umatch = mw.ustring.match
local usub = mw.ustring.sub
local vowels = "aæᴀᴁɐɑɒ@eᴇǝⱻəɛɘɜɞɤiıɪɨᵻoøœᴏɶɔᴐɵuᴜʉᵾɯꟺʊʋʌyʏ"
local hyphens = "%-‐‑‒–—"
--[==[
Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]
-- Cached Lua pattern for diacritics; stays nil until the loader below has run.
local diacritics
-- One-shot loader, used as `diacritics or get_diacritics()`. It reads
-- `page.comb_chars.diacritics_all` from Module:headword/data, appends a "+"
-- quantifier, stores the result in `diacritics`, and then unsets itself, as
-- it is never needed again once `diacritics` holds a value.
local function get_diacritics()
	local headword_data = mw.loadData("Module:headword/data")
	local pattern = headword_data.page.comb_chars.diacritics_all .. "+"
	diacritics = pattern
	get_diacritics = nil
	return pattern
end
-- Normalize a string so that case and diacritics are ignored: the result is
-- the NFD form of `str` with diacritics stripped, in lower case.
--
-- By default, "gu" and "qu" before a vowel are reduced to "g" and "q", because
-- they behave like consonants under certain conditions (e.g. final "y" does
-- not usually have the plural "ies" after a vowel, but it's regular for "quy"
-- to become "quies"). The flag `not_gu` prevents this happening to "gu", and
-- is needed because terms ending "-guy" are almost always compounds of "guy"
-- (→ "guys").
--
-- `followed_by` (optional) is the text that follows `str` in the full term.
-- It is appended only so that a vowel at its start can trigger the "gu"/"qu"
-- reduction at the end of `str`; it is cut off again before returning.
local function normalize(str, followed_by, not_gu)
	followed_by = followed_by or ""
	-- Letters whose following "u" is dropped when a vowel comes next.
	local initials = not_gu and "Qq" or "GgQq"
	local with_context = toNFD(str) .. followed_by
	local reduced = ugsub(with_context, "([" .. initials .. "])u([" .. vowels .. "])", "%1%2")
	-- Only a "u" is ever removed, so the context is still the last
	-- #followed_by bytes.
	local body = sub(reduced, 1, #reduced - #followed_by)
	local bare = ugsub(body, diacritics or get_diacritics(), "")
	return ulower(bare)
end
-- Default epenthetic "e" rule: an "e" is inserted before the suffix unless the
-- stem already ends in "e".
local function epenthetic_e_default(stem)
	local last = sub(stem, -1)
	return not (last == "e")
end
-- Epenthetic "e" rule for "-s": returns a truthy value if "es" should be added
-- rather than plain "s". `stem` is the (possibly "y" → "i" converted) stem and
-- `term` is the unmodified term.
local function epenthetic_e_for_s(stem, term)
	-- If the stem is different, it must be from "y" → "i".
	if stem ~= term then
		return true
	end
	local last
	if match(stem, "^[^\128-\255]*$") then
		-- Pure ASCII: the last byte is the last character.
		last = sub(stem, -1)
	else
		-- Otherwise strip diacritics first and take the last character.
		stem = ugsub(toNFD(stem), diacritics or get_diacritics(), "")
		last = usub(stem, -1)
	end
	-- Epenthetic "e" is added after a sibilant or sibilant-affricate. The vast
	-- majority of these are spelled "s", "x", "z", "ch" and "sh", but "dg"
	-- (→ "dge") and "ß" (→ "ss") can be found in obsolete spellings, "shh" in
	-- onomatopoeia, and "zh", "dj", "jj" (and more) in loanwords.
	if last == "s" or last == "x" or last == "z" or last == "ß" then
		return true
	elseif last == "g" then
		-- Only "dg".
		return sub(stem, -2, -2) == "d"
	elseif last == "h" then
		-- "ch", "sh", "zh", with any number of extra "h"s.
		return match(stem, "[csz]h+$") or false
	elseif last == "j" then
		-- "j" preceded by a non-vowel.
		return umatch(stem, "[^" .. vowels .. "]j$") or false
	end
	return false
end
-- Spelling rules for each supported suffix, keyed by the suffix itself or by
-- "suffix.qualifier" where one spelling has several rule sets. The flags are
-- read by export.add_suffix():
--   final_y_is_i               final "y" may become "i";
--   final_ey_is_i              final "ey" may also become "i";
--   final_y_is_i_after_vowel   final "y" may become "i" even after a vowel;
--   final_guy_is_gui           "gu" counts as a consonant before final "y";
--   final_consonant_is_doubled the final consonant may be doubled;
--   remove_silent_e            a final silent "e" may be dropped;
--   epenthetic_e               function(stem, term) deciding whether an "e"
--                              is inserted before the suffix.
local suffixes = {
	["s.plural"] = {
		final_y_is_i = true,
		epenthetic_e = epenthetic_e_for_s,
	},
	["s.verb"] = {
		final_y_is_i = true,
		final_consonant_is_doubled = true,
		epenthetic_e = epenthetic_e_for_s,
	},
	["ing"] = {
		final_consonant_is_doubled = true,
		remove_silent_e = true,
	},
	["d"] = {
		final_y_is_i = true,
		final_consonant_is_doubled = true,
		epenthetic_e = epenthetic_e_default,
	},
	["n"] = {
		final_y_is_i = true,
		final_y_is_i_after_vowel = true,
		final_guy_is_gui = true,
		final_consonant_is_doubled = true,
		-- No epenthetic "e" after an "e", or an "i", "r" or "w" preceded by a
		-- vowel.
		epenthetic_e = function(stem)
			if sub(stem, -1) == "e" then
				return false
			end
			return not umatch(normalize(stem), "[" .. vowels .. "][irw]$")
		end,
	},
	["r"] = {
		final_y_is_i = true,
		final_ey_is_i = true,
		final_guy_is_gui = true,
		final_consonant_is_doubled = true,
		epenthetic_e = epenthetic_e_default,
	},
}
-- Suffixes that share the "d" rules (same table, not a copy).
for _, alias in ipairs({ "dst", "st.verb", "th" }) do
	suffixes[alias] = suffixes["d"]
end
-- The superlative shares the "r" rules.
suffixes["st.superlative"] = suffixes["r"]
-- Returns the stem used for suffixes that sometimes convert final "y" into "i",
-- such as "-es" ("-ies"), e.g. "penny" → "penni" ("pennies"). If no conversion
-- applies, `str` is returned unchanged.
--
-- `not_gu`: passed on to normalize() (see there for more info); this is true
-- in most cases.
-- `final_ey_is_i`: final "ey" may also be converted, e.g. "plaguey" →
-- "plagui"; this is needed for "-er" ("-ier") and "-est" ("-iest").
-- `final_y_is_i_after_vowel`: final "y" is converted even when it follows a
-- vowel ("slay" → "slain").
local function convert_final_y_to_i(str, not_gu, final_ey_is_i, final_y_is_i_after_vowel)
	-- `str` with its last byte (the final "y") removed.
	local without_y = sub(str, 1, -2)
	local last3 = usub(str, -3)
	-- Special case: treat "eey" as "ee" + "y" (e.g. "treey" → "treeiest").
	-- "oey" and "uey" are usually vowel + "ey", but examples of "oe" + "y" and
	-- "ue" + "y" do also exist: compare "go" → "goey" → "goier" with "doe" →
	-- "doey" → "doeier"; "flu" → "fluey" → "fluiest" and "flue" → "fluey" →
	-- "flueiest" form a theoretically possible minimal pair.
	if last3 == "eey" then
		return without_y .. "i"
	end
	local last2 = usub(str, -2)
	-- If `final_ey_is_i` is true, final "-ey" can also be reduced to "i".
	if final_ey_is_i and last2 == "ey" then
		-- Remove "ey" to get the base stem.
		local root = sub(str, 1, -3)
		-- Special case: hyphen + "ey" ("potato-ey" → "potato-iest").
		if umatch(last3, "[" .. hyphens .. "]ey") then
			return root .. "i"
		end
		-- Final "ey" becomes "i" iff the term is polysyllabic (e.g. not
		-- "grey"). "ey" is common if the base stem ends in a vowel ("echo" →
		-- "echoey"), so the presence of a vowel anywhere in the base stem is
		-- sufficient to deem it polysyllabic ("echoey" → "echo" → "echoiest",
		-- "beigey" → "beig" → "beigiest", but "grey" → "gr" → "greyest"). The
		-- first "y" in "-yey" can be treated as a vowel as long as it's
		-- preceded by something ("clayey" → "clay" → "clayiest", "cryey" →
		-- "cry" → "cryiest", but "*yey" → "*y" → "*yeyest"), so it needs to be
		-- treated as a special case.
		local norm = normalize(root, "ey")
		local polysyllabic
		if sub(norm, -1) == "y" then
			polysyllabic = umatch(norm, "[%w@][yY]$")
		else
			polysyllabic = umatch(norm, "[" .. vowels .. "%d]%w*$")
		end
		if polysyllabic then
			return root .. "i"
		end
		return str
	end
	-- Special case: hyphen + "y" ("bro-y" → "bro-iest"), accounting for hyphen
	-- variation.
	if umatch(last2, "[" .. hyphens .. "]y") then
		return without_y .. "i"
	end
	-- Otherwise, final "y" becomes "i" iff it's not preceded by a vowel
	-- ("shy" → "shiest", "horsy" → "horsies", but "day" → "days", "coy" →
	-- "coyest"). Special cases, handled through normalize() and the pattern:
	-- * final "quy" ("soliloquy" → "soliloquies");
	-- * final "guy" iff `not_gu` is false ("roguy" → "roguiest");
	-- * final "y" after a vowel iff `final_y_is_i_after_vowel` is true
	--   ("slay" → "slain").
	local blocking_vowels = final_y_is_i_after_vowel and "" or vowels
	if umatch(normalize(without_y, "y", not_gu), "[^%s%p" .. blocking_vowels .. "]$") then
		return without_y .. "i"
	end
	return str
end
-- Returns `str` with its final consonant `final` doubled (i.e. `str .. final`)
-- if the last word of the term qualifies, and `str` unchanged otherwise.
--
-- The term minus its final consonant is normalized, and must end in a single
-- vowel preceded by a run of lowercase letters and/or punctuation reaching
-- back to the start of the string, whitespace, a hyphen or "…"; that run is
-- captured as the "onset" and decides the outcome.
local function double_final_consonant(str, final)
	local head = normalize(sub(str, 1, -2), final)
	local onset = umatch(head, "^.*%f[^%z%s" .. hyphens .. "…]([%l%p]*)[" .. vowels .. "]$")
	-- No match: the term does not have the required shape.
	if not onset then
		return str
	end
	-- Nothing, or just "y", before the vowel.
	if onset == "" or onset == "y" then
		return str .. final
	end
	-- Exactly one (possibly multibyte) character, which is not a vowel.
	if match(onset, "^.[\128-\191]*$") and umatch(onset, "[^" .. vowels .. "]") then
		return str .. final
	end
	-- A run free of vowels whose last character is a lowercase letter.
	if umatch(onset, "^[^" .. vowels .. "]*%f[^%l]$") then
		return str .. final
	end
	return str
end
-- Returns the stem of `str` (which is expected to end in "e") for suffixes
-- that drop a silent "e". Always returns exactly one value: the modified stem,
-- or `str` itself if nothing is to be removed.
local function remove_silent_e(str)
	local final2 = sub(str, -2)
	if final2 == "ie" then
		-- Replace "ie" with "y", unless it follows another "y" (e.g.
		-- "spulyie" → "spulyieing"), whitespace or punctuation. The extra
		-- parentheses discard gsub's second result (the substitution count),
		-- so that this branch returns a single value like the others.
		return (ugsub(str, "([^yY%s%p])ie$", "%1y"))
	end
	local base_stem = sub(str, 1, -2)
	-- Silent "e" occurs after "u" or a consonant (cluster) preceded by a vowel.
	if final2 == "ue" then
		return base_stem
	end
	if umatch(normalize(base_stem, "e"), "[" .. vowels .. "][^" .. vowels .. "]+$") then
		return base_stem
	end
	return str
end
-- Adds an inflectional suffix to `term`, applying the spelling rules listed
-- for that suffix in `suffixes`.
--
-- `term`: the term to inflect.
-- `suffix`: a key of `suffixes`, either the bare suffix ("ing") or
-- "suffix.qualifier" ("s.plural"); only the part before the first "." is
-- actually appended.
-- `pos`: optional part of speech; "proper noun" disables "y" → "i".
--
-- Returns the inflected form. Raises an error if `suffix` is not a key of
-- `suffixes`.
function export.add_suffix(term, suffix, pos)
	local data = suffixes[suffix]
	if not data then
		-- Fail with a clear message, reported at the caller's position,
		-- instead of an "attempt to index a nil value" error below.
		error('Unrecognized suffix "' .. tostring(suffix) .. '".', 2)
	end
	-- Drop the qualifier, leaving the letters to append.
	suffix = match(suffix, "^([^.]*)")
	local final = sub(term, -1)
	local stem
	-- Proper nouns don't have a final "y" changed to "i" (e.g. "the Gettys",
	-- "the public Ivys").
	if data.final_y_is_i and final == "y" and pos ~= "proper noun" then
		stem = convert_final_y_to_i(term, not data.final_guy_is_gui, data.final_ey_is_i, data.final_y_is_i_after_vowel)
	elseif data.remove_silent_e and final == "e" then
		stem = remove_silent_e(term)
	else
		stem = term
	end
	-- The epenthetic "e" test sees both the stem and the original term, so
	-- that it can tell whether the stem was altered.
	local epenthetic_e = data.epenthetic_e
	if epenthetic_e and epenthetic_e(stem, term) then
		suffix = "e" .. suffix
	end
	-- Doubling is only considered before a suffix that now starts with a
	-- vowel. `final` is a consonant here, so neither branch above that
	-- modifies the stem can have applied, and `term` is still the stem.
	if (
		data.final_consonant_is_doubled and
		match(final, "^[bcdfgjklmnpqrstvz]$") and -- Only double regular consonants.
		umatch(suffix, "^[" .. vowels .. "]")
	) then
		stem = double_final_consonant(term, final)
	end
	return stem .. suffix
end
add_suffix = export.add_suffix
--[==[
Pluralize a word in a smart fashion, according to normal English rules.
# If the word ends in a consonant or "qu" + "-y", replace "-y" with "-ies".
# If the word ends in "s", "x", "z", "ch", "sh" or "zh", add "-es".
# Otherwise, add "-s".
This handles links correctly:
# If a piped link, change the second part appropriately.
# If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
# If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.
The plural is only placed outside the link when everything that follows the link target consists of lowercase ASCII letters, since other characters would not be rendered as part of the link; in all other cases a piped link is returned.
]==]
function export.pluralize(str)
	-- Treat as a link if a "[[" is present and the string ends with "]]".
	if not (find(str, "[[", 1, true) and sub(str, -2) == "]]") then
		return add_suffix(str, "s.plural")
	end
	-- Find the last "[[" (in case there is more than one) by reversing
	-- the string. The search starts at 3 to skip the reversed final "]]".
	local str_rev = reverse(str)
	local open = find(str_rev, "[[", 3, true)
	-- If the last "[[" is followed by a "]]" which isn't at the end,
	-- then the final "]]" is just plaintext (e.g. "[[foo]]bar]]").
	local bad_close = find(str_rev, "]]", 3, true)
	-- Note: the bad "]]" will have a lower index than the last "[[" in
	-- the reversed string.
	if bad_close and bad_close < open then
		return add_suffix(str, "s.plural")
	end
	-- Convert to the index in `str` of the first character after "[[".
	open = #str - open + 2
	-- Get the target and display text by searching from just after "[[".
	local target, display = match(str, "([^|]*)|?(.*)%]%]$", open)
	display = add_suffix(display ~= "" and display or target, "s.plural")
	local before_target = sub(str, 1, open - 1)
	-- If the display text is the link target followed only by letters that
	-- can form a link trail, then use a trail (e.g. "[[foo]]" → "[[foo]]s",
	-- since "foos" is "foo" plus "s"). Anything else after the target (e.g.
	-- the " bars" left over from "[[foo|foo bar]]") would fall outside the
	-- rendered link, so those cases keep a piped link.
	local index, trail = find(display, target, 1, true)
	if index == 1 then
		local rest = sub(display, trail + 1)
		if not find(rest, "[^a-z]") then
			return before_target .. target .. "]]" .. rest
		end
	end
	-- Otherwise, return a piped link.
	return before_target .. target .. "|" .. display .. "]]"
end
-- Returns true if `plural` is an expected, regular plural of `term`, and false
-- otherwise. `pos` is the optional part of speech, passed on to add_suffix().
function export.is_regular_plural(plural, term, pos)
	-- Ignore any final punctuation that occurs in both forms, which is common
	-- in abbreviations (e.g. "abbr." → "abbrs.").
	local final_punc = umatch(term, "%p*$")
	local final_punc_len = #final_punc
	if final_punc_len > 0 and sub(plural, -final_punc_len) == final_punc then
		term = sub(term, 1, -final_punc_len - 1)
		plural = sub(plural, 1, -final_punc_len - 1)
	end
	if plural == term .. "s" or plural == add_suffix(term, "s.plural", pos) then
		return true
	end
	local final = sub(term, -1)
	if final == "s" then
		-- Doubled final "s", e.g. "busses".
		return plural == term .. "ses"
	elseif final == "z" then
		-- Doubled final "z", e.g. "quizzes".
		return plural == term .. "zes"
	elseif final == "y" then
		-- convert_final_y_to_i() without the `not_gu` flag set, to catch
		-- "-guy" → "-guies", but not "day" → "daies". The stem must actually
		-- have changed: if it comes back untouched, "y" → "i" does not apply
		-- and term .. "es" (e.g. "dayes") is not a regular plural.
		local stem = convert_final_y_to_i(term)
		return stem ~= term and plural == stem .. "es"
	elseif final == "Y" then
		-- Capitalized terms like "$DEITY" → "$DEITIES" (should we treat this
		-- as regular?), compared in lower case under the same condition.
		local lower_term = ulower(term)
		local stem = convert_final_y_to_i(lower_term)
		return stem ~= lower_term and ulower(plural) == stem .. "es"
	end
	return false
end
return export