Module:string utilities: Difference between revisions

From Linguifex
Jump to navigation Jump to search
No edit summary
Tag: Reverted
No edit summary
 
(One intermediate revision by the same user not shown)
Line 1: Line 1:
local mw = mw
local string = string
local table = table
local ustring = mw.ustring
local byte = string.byte
local char = string.char
local concat = table.concat
local find = string.find
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local len = string.len
local load_data = mw.loadData
local lower = string.lower
local match = string.match
local next = next
local reverse = string.reverse
local select = select
local sort = table.sort
local sub = string.sub
local tonumber = tonumber
local tostring = tostring
local type = type
local ucodepoint = ustring.codepoint
local ufind = ustring.find
local ugcodepoint = ustring.gcodepoint
local ugmatch = ustring.gmatch
local ugsub = ustring.gsub
local ulower = ustring.lower
local umatch = ustring.match
local unpack = unpack
local upper = string.upper
local usub = ustring.sub
local uupper = ustring.upper
-- Defined below.
local charset_escape
local codepoint
local explode_utf8
local format_fun
local get_indefinite_article
local pattern_escape
local pattern_simplifier
local php_trim
local replacement_escape
local u
local ulen
local module_name = "string_utilities"
local module_name = "string_utilities"
local export = {}
local export = {}


local format_escapes = {
--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]
    ["op"] = "{",
function export.explode_utf8(str)
    ["cl"] = "}",
local text, i = {}, 0
}
for ch in gmatch(str, ".[\128-\191]*") do
i = i + 1
text[i] = ch
end
return text
end
explode_utf8 = export.explode_utf8


function export.format_fun(str, fun)
--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>. For example, {{code|lua|"^$()%.[]*+-?"}} becomes {{code|lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]
    return (str:gsub("{(\\?)((\\?)[^{}]*)}", function (p1, name, p2)
function export.pattern_escape(str)
        if #p1 + #p2 == 1 then
return (gsub(str, "[$%%()*+%-.?[%]^]", "%%%0"))
            return format_escapes[name] or error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
end
        else
pattern_escape = export.pattern_escape
        if fun(name) and type(fun(name)) ~= "string" then
 
        error(module_name .. ".format: '" .. name .. "' is a " .. type(fun(name)) .. ", not a string")
--[==[Escapes only the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>.]==]
        end
function export.charset_escape(str)
            return fun(name) or error(module_name .. ".format: '" .. name .. "' not found in table")
return (gsub(str, "[%%%-%]^]", "%%%0"))
        end
    end))
end
end
charset_escape = export.charset_escape


--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
--[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]
====Examples====
function export.replacement_escape(str)
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
return (gsub(str, "%%", "%%%%"))
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
    return export.format_fun(str, function (key) return tbl[key] end)
end
end
replacement_escape = export.replacement_escape


-- A helper function which takes a string, position and type ("byte" or "char"), and returns the equivalent position for the other type (e.g. iterate_utf8("字典", 2, "char") returns 4, because character 2 of "字典" begins with byte 4). `pos` can be positive or negative, and the function will iterate over the string forwards or backwards (respectively) until it reaches the input position. Checks byte-by-byte; skipping over trailing bytes, and then calculating the correct byte trail for any leading bytes (i.e. how many trailing bytes should follow); these trailing bytes are then checked together.
do
-- The optional parameters `init_from_type` and `init_to_type` can be used to start part-way through an iteration to improve performance, if multiple values need to be returned from the same string. For example, iterate_utf8("слова́рь", 11, "byte", 5, 3) will begin checking at byte 5/the start of character 3. Note: The function won't check if these values match each other (as the only way to do this would be to run the iteration from the beginning), so mismatched values will return incorrect results.
local function check_sets_equal(set1, set2)
local function iterate_utf8(text, pos, from_type, init_from_type, init_to_type)
local k2
-- Position 0 is always valid and never changes.
for k1, v1 in next, set1 do
if pos == 0 then
local v2 = set2[k1]
return pos
if v1 ~= v2 and (v2 == nil or not check_sets_equal(v1, v2)) then
return false
end
k2 = next(set2, k2)
end
return next(set2, k2) == nil
end
end
local to_type
local function check_sets(bytes)
if from_type == "char" then
local key, set1, set = next(bytes)
to_type = "byte"
if set1 == true then
else
return true
to_type = "char"
elseif not check_sets(set1) then
return false
end
while true do
key, set = next(bytes, key)
if not key then
return true
elseif not check_sets_equal(set, set1) then
return false
end
end
end
end
-- Positive positions iterate forwards; negative positions iterate backwards.
local function make_charset(range)
local iterate_val
if #range == 1 then
if pos > 0 then
return char(range[1])
iterate_val = 1
end
else
sort(range)
iterate_val = -1
local compressed, n, start = {}, 0, range[1]
for i = 1, #range do
local this, nxt = range[i], range[i + 1]
if nxt ~= this + 1 then
n = n + 1
compressed[n] = this == start and char(this) or
char(start) .. "-" .. char(this)
start = nxt
end
end
return "[" .. concat(compressed) .. "]"
end
end
-- Adjust init_from_type and init_to_type to the iteration before, so that matches for the position given by them will work.
local function parse_1_byte_charset(pattern, pos)
local trail, cp, min, b = 0
while true do
local c, leading_byte = {}
local ch, nxt_pos
c[from_type] = init_from_type and init_from_type ~= 0 and init_from_type - iterate_val or 0
pos, ch, nxt_pos = match(pattern, "()([%%%]\194-\244][\128-\191]*)()", pos)
c[to_type] = init_to_type and init_to_type ~= 0 and init_to_type - iterate_val or 0
if not ch then
return false
elseif ch == "%" then
if match(pattern, "^[acdlpsuwxACDLPSUWXZ\128-\255]", nxt_pos) then
return false
end
pos = pos + 2
elseif ch == "]" then
pos = nxt_pos
return pos
else
return false
end
end
end
while true do
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==]
if pos > 0 then
pattern_simplifier = require("Module:fun").memoize(function(pattern)
b = text:byte(c.byte + 1)
if type(pattern) == "number" then
else
return tostring(pattern)
b = text:byte(text:len() + c.byte)
end
local pos, captures, start, n, output = 1, 0, 1, 0
while true do
local ch, nxt_pos
pos, ch, nxt_pos = match(pattern, "()([%%(.[\194-\244][\128-\191]*)()", pos)
if not ch then
break
end
local nxt = sub(pattern, nxt_pos, nxt_pos)
if ch == "%" then
if nxt == "b" then
if not match(pattern, "^()[^\128-\255][^\128-\255]", pos + 2) then
return false
end
pos = pos + 4
elseif nxt == "f" then
pos = pos + 2
if not match(pattern, "^()%[[^^]", pos) then
return false
end
-- Only possible to convert a %f charset which is all
-- ASCII, so use parse_1_byte_charset.
pos = parse_1_byte_charset(pattern, pos)
if not pos then
return false
end
elseif nxt == "Z" then
pos = pos + 2
nxt = sub(pattern, pos, pos)
if nxt == "*" or nxt == "+" or nxt == "-" then
pos = pos + 1
else
output = output or {}
n = n + 1
if nxt == "?" then
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244]?[\128-\191]*"
pos = pos + 1
else
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244][\128-\191]*"
end
start = pos
end
elseif find("acdlpsuwxACDLPSUWX", nxt, 1, true) then
return false
-- Skip the next character if it's ASCII. Otherwise, we will
-- still need to do length checks.
else
pos = pos + (byte(nxt) < 128 and 2 or 1)
end
elseif ch == "(" then
if nxt == ")" or captures == 32 then
return false
end
captures = captures + 1
pos = pos + 1
elseif ch == "." then
if nxt == "*" or nxt == "+" or nxt == "-" then
pos = pos + 2
else
output = output or {}
n = n + 1
if nxt == "?" then
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191]?[\128-\191]*"
pos = pos + 2
else
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191][\128-\191]*"
pos = pos + 1
end
start = pos
end
elseif ch == "[" then
-- Fail negative charsets. TODO: 1-byte charsets should be safe.
if nxt == "^" then
return false
-- If the first character is "%", ch_len is determined by the
-- next one instead.
elseif nxt == "%" then
nxt_pos = nxt_pos + 1
nxt = sub(pattern, nxt_pos, nxt_pos)
end
local ch_len = #match(pattern, "^.[\128-\191]*", nxt_pos)
if ch_len == 1 then -- Single-byte charset.
pos = parse_1_byte_charset(pattern, pos + 1)
if not pos then
return false
end
else -- Multibyte charset.
local charset_pos, bytes = pos
pos = pos + 1
while true do -- TODO: non-ASCII charset ranges.
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
if not ch then
return false
-- If escaped, get the next character. No need to
-- distinguish magic characters or character classes,
-- as they'll all fail for having the wrong length
-- anyway.
elseif ch == "%" then
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
elseif ch == "]" then
pos = nxt_pos
break
end
if ch_len ~= #ch then
return false
end
bytes = bytes or {}
local bytes = bytes
for i = 1, ch_len - 1 do
local b = byte(ch, i, i)
bytes[b] = bytes[b] or {}
bytes = bytes[b]
end
bytes[byte(ch, -1)] = true
pos = nxt_pos
end
if not pos then
return false
end
local nxt = sub(pattern, pos, pos)
if (
(nxt == "?" or nxt == "*" or nxt == "-") or
(nxt == "+" and ch_len > 2) or
not check_sets(bytes)
) then
return false
end
local ranges, b, key, next_byte = {}, 0
repeat
key, next_byte = next(bytes)
local range, n = {key}, 1
-- Loop starts on the second iteration.
for key in next, bytes, key do
n = n + 1
range[n] = key
end
b = b + 1
ranges[b] = range
bytes = next_byte
until next_byte == true
if nxt == "+" then
local range1, range2 = ranges[1], ranges[2]
ranges[1] = make_charset(range1)
ranges[3] = make_charset(range2)
local n = #range2
for i = 1, #range1 do
n = n + 1
range2[n] = range1[i]
end
ranges[2] = make_charset(range2) .. "*"
pos = pos + 1
else
for i = 1, #ranges do
ranges[i] = make_charset(ranges[i])
end
end
output = output or {}
n = n + 1
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges)
start = pos
end
elseif nxt == "+" then
if #ch ~= 2 then
return false
end
output = output or {}
n = n + 1
output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. sub(ch, 2, 2)
pos = nxt_pos + 1
start = pos
elseif nxt == "?" or nxt == "*" or nxt == "-" then
return false
else
pos = nxt_pos
end
end
if start == 1 then
return pattern
end
end
-- Position byte doesn't exist, so iterate the return value and return it.
return concat(output) .. sub(pattern, start)
if not b then
end, true)
return c[to_type] + iterate_val
export.pattern_simplifier = pattern_simplifier -- For testing.
elseif b < 0x80 then
end
-- 1-byte codepoint, 00-7F.
 
trail = 0
function export.len(str)
cp = b
return type(str) == "number" and len(str) or
min = 0
#str - #gsub(str, "[^\128-\191]+", "")
leading_byte = true
end
elseif b < 0xc0 then
ulen = export.len
-- A trailing byte.
 
leading_byte = false
function export.sub(str, i, j)
elseif b < 0xc2 then
str, i = type(str) == "number" and tostring(str) or str, i or 1
-- An overlong encoding for a 1-byte codepoint.
if i < 0 or j and j < 0 then
error("String " .. text .. " is not UTF-8.")
return usub(str, i, j)
elseif b < 0xe0 then
elseif j and i > j or i > #str then
-- 2-byte codepoint, C2-DF.
return ""
trail = 1
end
cp = b - 0xc0
local n, new_i = 0
min = 0x80
for loc1, loc2 in gmatch(str, "()[^\128-\191]+()[\128-\191]*") do
leading_byte = true
n = n + loc2 - loc1
elseif b < 0xf0 then
if not new_i and n >= i then
-- 3-byte codepoint, E0-EF.
new_i = loc2 - (n - i) - 1
trail = 2
if not j then
cp = b - 0xe0
return sub(str, new_i)
min = 0x800
leading_byte = true
elseif b < 0xf4 then
-- 4-byte codepoint, F0-F3.
trail = 3
cp = b - 0xf0
min = 0x10000
leading_byte = true
elseif b == 0xf4 then
-- 4-byte codepoint, F4.
-- Make sure it doesn't decode to over U+10FFFF.
if text:byte(c.byte + 2) > 0x8f then
error("String " .. text .. " is not UTF-8.")
end
end
trail = 3
cp = 4
min = 0x100000
leading_byte = true
else
-- Codepoint over U+10FFFF, or invalid byte.
error("String " .. text .. " is not UTF-8.")
end
end
if j and n > j then
-- Check subsequent bytes for multibyte codepoints.
return sub(str, new_i, loc2 - (n - j) - 1)
if leading_byte then
end
local from, to
end
if pos > 0 then
return new_i and sub(str, new_i) or ""
from, to = c.byte + 2, c.byte + 1 + trail
end
 
do
	-- Converts the byte positions returned by string.find into character
	-- positions. `loc1` and `loc2` are the raw byte positions of the match (or
	-- nil if there was no match); any captures are passed through unchanged.
	-- Strings that contain only ASCII need no conversion.
	local function _find(str, loc1, loc2, ...)
		if loc1 and not match(str, "^()[^\128-\255]*$") then
			local byte_loc1 = loc1
			-- Number of characters up to and including the one which starts
			-- at byte_loc1.
			loc1 = ulen(sub(str, 1, byte_loc1))
			-- An empty match at the very end of the string starts one byte
			-- past the final character. sub() clamps to the end of the
			-- string, so the count above stops at the last real character
			-- and has to be advanced by one.
			if byte_loc1 > #str then
				loc1 = loc1 + 1
			end
			-- Offset the length of the match (in characters) with loc1 to
			-- get loc2. An empty match has length 0, giving loc2 = loc1 - 1.
			loc2 = loc1 + ulen(sub(str, byte_loc1, loc2)) - 1
		end
		return loc1, loc2, ...
	end

	--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find. Returned positions are always character positions. If `init` is anything other than 1 and `str` contains non-ASCII bytes, mw.ustring.find is used directly, since `init` would otherwise have to be converted into a byte position.]==]
	function export.find(str, pattern, init, plain)
		init = init or 1
		if init ~= 1 and not match(str, "^()[^\128-\255]*$") then
			return ufind(str, pattern, init, plain)
		elseif plain then
			return _find(str, find(str, pattern, init, true))
		end
		-- pattern_simplifier returns a falsy value if the pattern can't be
		-- converted into one that is safe for the string library.
		local simple = pattern_simplifier(pattern)
		if simple then
			return _find(str, find(str, simple, init))
		end
		return ufind(str, pattern, init)
	end
end
 
--[==[Matches `pattern` against `str` starting at character `init` (default 1). The string library is used whenever the pattern can be simplified for it; mw.ustring.match is the fallback, and is also used directly when `init` is not 1 and `str` contains non-ASCII bytes.]==]
function export.match(str, pattern, init)
	init = init or 1
	local simple
	-- Only attempt simplification when `init` can be used as a byte position,
	-- i.e. it is 1 or the string is pure ASCII.
	if init == 1 or match(str, "^()[^\128-\255]*$") then
		simple = pattern_simplifier(pattern)
	end
	if not simple then
		return umatch(str, pattern, init)
	end
	return match(str, simple, init)
end
 
--[==[Returns an iterator over the matches of `pattern` in `str`. Uses string.gmatch if the pattern can be simplified for the string library, and mw.ustring.gmatch if not.]==]
function export.gmatch(str, pattern)
	local simple = pattern_simplifier(pattern)
	if not simple then
		return ugmatch(str, pattern)
	end
	return gmatch(str, simple)
end
 
--[==[Substitutes matches of `pattern` in `str` with `repl`, at most `n` times if `n` is given. Uses string.gsub if the pattern can be simplified for the string library, and mw.ustring.gsub if not.]==]
function export.gsub(str, pattern, repl, n)
	local simple = pattern_simplifier(pattern)
	-- Pick the implementation together with the pattern it should receive.
	local impl = simple and gsub or ugsub
	return impl(str, simple or pattern, repl, n)
end
 
--[==[A literal-text version of gsub: every magic character in `pattern` is escaped, and so is `repl` if it is a string. A `repl` of any other type is handed to string.gsub as-is.]==]
function export.plain_gsub(str, pattern, repl, n)
	if type(repl) == "string" then
		repl = replacement_escape(repl)
	end
	return gsub(str, pattern_escape(pattern), repl, n)
end
 
--[==[Reverses a UTF-8 string character by character; the UTF-8-aware counterpart of string.reverse.]==]
function export.reverse(str)
	-- First reverse the bytes inside each multibyte character, so that the
	-- byte-wise reversal of the whole string puts them back in order.
	local flipped = gsub(str, "[\194-\244][\128-\191]*", reverse)
	return reverse(flipped)
end
 
do
	-- Throws the out-of-range error. `cp` is the codepoint already formatted
	-- for display (e.g. "0x110000" or "-0x1").
	local function err(cp)
		error("Codepoint " .. cp .. " is out of range: codepoints must be between 0x0 and 0x10FFFF.", 2)
	end

	-- Encodes a single codepoint (a number or numeric string) as UTF-8.
	-- NOTE: the byte arithmetic below produces fractional values and relies
	-- on string.char discarding the fractional part, which is the Lua 5.1
	-- behaviour this module runs under; math.floor would be needed on Lua
	-- versions where string.char rejects non-integral numbers.
	local function utf8_char(cp)
		local num = tonumber(cp)
		if not num then
			-- Fail with a meaningful message instead of "attempt to compare
			-- nil with number".
			error("Codepoint " .. tostring(cp) .. " is not a number.", 2)
		elseif num < 0 then
			-- Show the magnitude of the negative value as given.
			err("-0x" .. format("%X", -num))
		elseif num < 0x80 then
			return char(num)
		elseif num < 0x800 then
			return char(
				0xC0 + num / 0x40,
				0x80 + num % 0x40
			)
		elseif num < 0x10000 then
			if num >= 0xD800 and num < 0xE000 then
				return "?" -- mw.ustring.char returns "?" for surrogates.
			end
			return char(
				0xE0 + num / 0x1000,
				0x80 + num / 0x40 % 0x40,
				0x80 + num % 0x40
			)
		elseif num < 0x110000 then
			return char(
				0xF0 + num / 0x40000,
				0x80 + num / 0x1000 % 0x40,
				0x80 + num / 0x40 % 0x40,
				0x80 + num % 0x40
			)
		end
		err("0x" .. format("%X", num))
	end

	--[==[Converts one or more codepoints into a UTF-8 string. Each codepoint must be between 0x0 and 0x10FFFF, or an error is thrown; surrogates (0xD800 to 0xDFFF) are converted to "?".]==]
	function export.char(cp, ...)
		-- Fast path for a single codepoint.
		if ... == nil then
			return utf8_char(cp)
		end
		local ret = {cp, ...}
		for i = 1, select("#", cp, ...) do
			ret[i] = utf8_char(ret[i])
		end
		return concat(ret)
	end
	u = export.char
end
 
do
-- Decodes one UTF-8 character from its bytes. `b1` is the leading byte and
-- `b2` to `b4` are the bytes which follow it; only as many as the leading
-- byte calls for are read. Returns the codepoint and the number of bytes
-- used. The bytes are not validated.
local function get_codepoint(b1, b2, b3, b4)
	if b1 >= 240 then
		return (b1 - 0xF0) * 0x40000 + (b2 - 0x80) * 0x1000 + (b3 - 0x80) * 0x40 + (b4 - 0x80), 4
	elseif b1 >= 224 then
		return (b1 - 0xE0) * 0x1000 + (b2 - 0x80) * 0x40 + (b3 - 0x80), 3
	elseif b1 >= 128 then
		return (b1 - 0xC0) * 0x40 + (b2 - 0x80), 2
	end
	return b1, 1
end
 
function export.codepoint(str, i, j)
if type(str) == "number" then
return byte(str, i, j)
end
i, j = i or 1, j == -1 and #str or i or 1
if i == 1 and j == 1 then
return (get_codepoint(byte(str, 1, 4)))
elseif i < 0 or j < 0 then
return ucodepoint(str, i, j) -- FIXME
end
local n, nb, ret, nr = 0, 1, {}, 0
while n < j do
n = n + 1
if n < i then
local b = byte(str, nb)
nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
else
else
from, to = text:len() + c.byte + 1, text:len() + c.byte + trail
local b1, b2, b3, b4 = byte(str, nb, nb + 3)
end
if not b1 then
for trailing_byte = from, to do
break
b = text:byte(trailing_byte)
if not b or b < 0x80 or b > 0xbf then
error("String " .. text .. " is not UTF-8.")
end
end
cp = cp * 0x40 + b - 0x80
nr = nr + 1
local add
ret[nr], add = get_codepoint(b1, b2, b3, b4)
nb = nb + add
end
end
local next_byte = text:byte(to + 1)
end
if next_byte and next_byte >= 0x80 and next_byte <= 0xbf then
return unpack(ret)
-- Too many trailing bytes.
end
error("String " .. text .. " is not UTF-8.")
codepoint = export.codepoint
elseif cp < min then
-- Overlong encoding.
function export.gcodepoint(str, i, j)
error("String " .. text .. " is not UTF-8.")
i, j = i or 1, j ~= -1 and j or nil
if i < 0 or j and j < 0 then
return ugcodepoint(str, i, j) -- FIXME
end
local n, nb = 1, 1
while n < i do
local b = byte(str, nb)
if not b then
break
end
end
nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
n = n + 1
end
end
c.byte = c.byte + iterate_val
return function()
if leading_byte then
if j and n > j then
c.char = c.char + iterate_val
return nil
end
end
n = n + 1
if c[from_type] == pos then
local b1, b2, b3, b4 = byte(str, nb, nb + 3)
return c[to_type]
if not b1 then
return nil
end
local ret, add = get_codepoint(b1, b2, b3, b4)
nb = nb + add
return ret
end
end
end
end
end
end


--[==[Converts a character position to the equivalent byte position.]==]
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
function export.charsToBytes(text, pos)
function export.lower(str)
return iterate_utf8(text, pos, "char")
return (match(str, "^()[^\128-\255]*$") and lower or ulower)(str)
end
end


--[==[Converts a byte position to the equivalent character position.]==]
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
function export.bytesToChars(text, pos)
function export.upper(str)
local byte = text:byte(pos)
return (match(str, "^()[^\128-\255]*$") and upper or uupper)(str)
if byte and byte >= 0x80 and byte <= 0xbf then
error("Byte " .. pos .. " is not a leading byte.")
end
return iterate_utf8(text, pos, "byte")
end
end


-- A helper function which iterates through a pattern, and returns two values: a potentially modified version of the pattern, and a boolean indicating whether the returned pattern is simple (i.e. whether it can be used with the stock string library); if not, then the pattern is complex (i.e. it must be used with the ustring library, which is much more resource-intensive).
do
local function patternSimplifier(text, pattern, plain)
local function add_captures(text, n, ...)
pattern = tostring(pattern)
-- Insert any captures from the splitting pattern.
-- If `plain` is set, then the pattern is treated as literal (so is always simple). Only used by find.
local offset, capture = n - 1, ...
if plain then
while capture do
return pattern, true
n = n + 1
--If none of these are present, then the pattern has to be simple.
text[n] = capture
elseif not (
capture = select(n - offset, ...)
pattern:match("%[.-[\128-\255].-%]") or
end
pattern:match("[\128-\255][%*%+%?%-]") or
return n
pattern:match("%%[abcdlpsuwxACDLPSUWXZ]") or
pattern:match("%[%^[^%]]+%]") or
pattern:match("%.[^%*%+%-]") or
pattern:match("%.$") or
pattern:match("%%b.?[\128-\255]") or
pattern:match("()", 1, true)
) then
return pattern, true
end
end
-- Otherwise, the pattern could go either way.
-- Build up the new pattern in a table, then concatenate at the end. we do it this way, as occasionally entries get modified along the way.
local new_pattern = {}
local len, pos, b = pattern:len(), 0
local char, next_char
-- `escape` and `balanced` are counters, which ensure the effects of % or %b (respectively) are distributed over the following bytes.
local function iterate(str, str_len, text, n, start, _sub, loc1, loc2, ...)
-- `set` is a boolean that states whether the current byte is in a charset.
if not (loc1 and start <= str_len) then
-- `capture` keeps track of how many layers of capture groups the position is in, while `captures` keeps a tally of how many groups have been detected (due to the string library limit of 32).
-- If no match, or there is but we're past the end of the string
local escape, set, balanced, capture, captures = 0, false, 0, 0, 0
-- (which happens when the match is the empty string), then add
-- the final chunk and return.
while pos < len do
n = n + 1
pos = pos + 1
text[n] = _sub(str, start)
b = pattern:byte(pos)
return
if escape > 0 then escape = escape - 1 end
elseif loc2 < loc1 then
if balanced > 0 then balanced = balanced - 1 end
-- Special case: If we match the empty string, then include the
char = next_char or pattern:sub(pos, pos)
-- next character; this avoids an infinite loop, and makes
next_char = pattern:sub(pos + 1, pos + 1)
-- splitting by an empty string work the way mw.text.split() does
if escape == 0 then
-- (including non-adjacent empty string matches with %f). If we
if char == "%" then
-- reach the end of the string this way, return immediately, so we
-- Apply % escape.
-- don't get a final empty string. If using the string library, we
if next_char == "." or next_char == "%" or next_char == "[" or next_char == "]" then
-- need to make sure we advance by one UTF-8 character.
escape = 2
if _sub == sub then
if balanced > 0 then balanced = balanced + 1 end
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
-- These charsets make the pattern complex.
end
elseif next_char:match("[acdlpsuwxACDLPSUWXZ]") then
n = n + 1
return pattern, false
text[n] = _sub(str, start, loc1)
-- This is "%b".
start = loc1 + 1
elseif next_char == "b" then
if start > str_len then
balanced = 4
return ... and add_captures(text, n, ...) or n
end
-- Enter or leave a charset.
elseif char == "[" then
set = true
elseif char == "]" then
set = false
elseif char == "(" then
capture = capture + 1
elseif char == ")" then
if capture > 0 and set == false and balanced == 0 then
captures = captures + 1
capture = capture - 1
end
end
end
else
-- Add chunk up to the current match.
n = n + 1
text[n] = _sub(str, start, loc1 - 1)
start = loc2 + 1
end
end
return (... and add_captures(text, n, ...) or n), start
end
local function _split(str, pattern, str_len, _sub, _find, plain)
local text, n, start = {}, 0, 1
repeat
n, start = iterate(str, str_len, text, n, start, _sub, _find(str, pattern, start, plain))
until not start
-- Multibyte char.
return text
if b > 0x7f then
end
-- If followed by "*", "+" or "-", then 2-byte chars can be converted into charsets. However, this is not possible with 3 or 4-byte chars, as the charset would be too permissive, because if the trailing bytes were in a different order then this could be a different valid character.
if next_char == "*" or next_char == "+" or next_char == "-" then
--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.]==]
local prev_pos = pattern:byte(pos - 1)
function export.split(str, pattern, str_lib, plain)
if prev_pos > 0xc1 and prev_pos < 0xe0 then
if str_lib or plain then
new_pattern[#new_pattern] = "[" .. new_pattern[#new_pattern]
return _split(str, pattern, #str, sub, find, plain)
table.insert(new_pattern, char .. "]")
end
local simple = pattern_simplifier(pattern)
if simple then
return _split(str, simple, #str, sub, find)
end
return _split(str, pattern, ulen(str), usub, ufind)
end
export.capturing_split = export.split -- To be removed.
end
 
do
-- TODO: merge this with export.split. Not clear how to do this while
-- maintaining the same level of performance, as gsplit is slower.
local function _split(str, pattern, str_len, _sub, _find, plain)
local start, final = 1
local function iter(loc1, loc2, ...)
-- If no match, return the final chunk.
if not loc1 then
final = true
return _sub(str, start)
end
-- Special case: If we match the empty string, then eat the
-- next character; this avoids an infinite loop, and makes
-- splitting by the empty string work the way mw.text.gsplit() does
-- (including non-adjacent empty string matches with %f). If we
-- reach the end of the string this way, set `final` to true, so we
-- don't get stuck matching the empty string at the end.
local chunk
if loc2 < loc1 then
-- If using the string library, we need to make sure we advance
-- by one UTF-8 character.
if _sub == sub then
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
end
chunk = _sub(str, start, loc1)
if loc1 >= str_len then
final = true
else
else
return pattern, false
start = loc1 + 1
end
end
-- If in a charset or used in "%b", then the pattern is complex.
-- Eat chunk up to the current match.
-- If followed by "?", add "?" after each byte.
elseif next_char == "?" then
table.insert(new_pattern, char .. "?")
local check_pos, check_b, i = pos, pattern:byte(pos), #new_pattern
while check_b and check_b < 0xc0 do
check_pos = check_pos - 1
check_b = pattern:byte(check_pos)
i = i - 1
new_pattern[i] = new_pattern[i] .. "?"
end
pos = pos + 1
next_char = pattern:sub(pos + 1, pos + 1)
elseif set or balanced > 0 then
return pattern, false
else
else
table.insert(new_pattern, char)
chunk = _sub(str, start, loc1 - 1)
start = loc2 + 1
end
end
elseif char == "." then
return chunk, ...
-- "*", "+", "-" are always okay after ".", as they don't care how many bytes a char has.
end
if set or next_char == "*" or next_char == "+" or next_char == "-" or escape > 0 then
table.insert(new_pattern, char)
return function()
-- If followed by "?", make sure "?" is after the leading byte of the UTF-8 char pattern, then skip forward one.
if not final then
elseif next_char == "?" then
return iter(_find(str, pattern, start, plain))
table.insert(new_pattern, "[%z\1-\127\194-\244]?[\128-\191]*")
pos = pos + 1
next_char = pattern:sub(pos + 1, pos + 1)
-- If used with "%b", pattern is complex.
elseif balanced > 0 then
return pattern, false
-- Otherwise, add the UTF-8 char pattern.
else
table.insert(new_pattern, "[%z\1-\127\194-\244][\128-\191]*")
end
end
-- Negative charsets are always complex, unless the text has no UTF-8 chars.
return nil
elseif char == "[" and next_char == "^" and escape == 0 and text:match("[\128-\255]") then
return pattern, false
-- "()" matches the position unless escaped or used with "%b", so always necessitates ustring (as we need it to match the char position, not the byte one).
elseif char == "(" and next_char == ")" and balanced == 0 and escape == 0 and text:match("[\128-\255]") then
return pattern, false
else
table.insert(new_pattern, char)
end
end
end
end
if captures > 32 then
return pattern, false
function export.gsplit(str, pattern, str_lib, plain)
else
if str_lib or plain then
pattern = table.concat(new_pattern)
return _split(str, pattern, #str, sub, find, plain)
return pattern, true
end
local simple = pattern_simplifier(pattern)
if simple then
return _split(str, simple, #str, sub, find)
end
return _split(str, pattern, ulen(str), usub, ufind)
end
end
end
end


--[==[A version of len which uses string.len, but returns the same result as mw.ustring.len.]==]
function export.trim(str, charset)
function export.len(text)
if not charset then
text = tostring(text)
return match(str, "^()%s*$") and "" or match(str, "^%s*(.*%S)")
local len_bytes = text:len()
elseif match(charset, "^()[^\128-\255]*$") then
if not text:match("[\128-\255]") then
return match(str, "^()[" .. charset .. "]*$") and "" or match(str, "^[" .. charset .. "]*(.*[^" .. charset .. "])")
return len_bytes
else
return iterate_utf8(text, len_bytes, "byte")
end
end
return umatch(str, "^[" .. charset .. "]*(.-)[" .. charset .. "]*$")
end
end


--[==[A version of sub which uses string.sub, but returns the same result as mw.ustring.sub.]==]
do
function export.sub(text, i_char, j_char)
local entities
text = tostring(text)
 
if not text:match("[\128-\255]") then
local function decode_numeric_entity(code, pattern, base)
return text:sub(i_char, j_char)
local cp = match(code, pattern) and tonumber(code, base)
return cp and cp < 0x110000 and u(cp) or nil
end
end
local i_byte, j_byte
 
if j_char then
local function decode_entity(hash, x, code)
if i_char > 0 and j_char > 0 then
if hash == "#" then
if j_char < i_char then return "" end
return x == "" and decode_numeric_entity(code, "^%d+$") or
i_byte = iterate_utf8(text, i_char, "char")
decode_numeric_entity(code, "^%x+$", 16)
j_byte = iterate_utf8(text, j_char + 1, "char", i_char, i_byte) - 1
elseif i_char < 0 and j_char < 0 then
if j_char < i_char then return "" end
j_byte = iterate_utf8(text, j_char + 1, "char") - 1
i_byte = iterate_utf8(text, i_char, "char", j_char, j_byte)
-- For some reason, mw.ustring.sub with i=0, j=0 returns the same result as for i=1, j=1, while string.sub always returns "". However, mw.ustring.sub does return "" with i=1, j=0. As such, we need to adjust j_char to 1 if i_char is either 0, or negative with a magnitude greater than the length of the string.
elseif j_char == 0 then
i_byte = iterate_utf8(text, i_char, "char")
if i_byte == 0 or -i_byte > text:len() then j_char = 1 end
j_byte = iterate_utf8(text, j_char + 1, "char") - 1
else
i_byte = iterate_utf8(text, i_char, "char")
j_byte = iterate_utf8(text, j_char + 1, "char") - 1
end
end
else
entities = entities or load_data("Module:data/entities")
i_byte = iterate_utf8(text, i_char, "char")
return entities[x .. code]
end
end
return text:sub(i_byte, j_byte)
end


--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].
function export.lower(text)
function export.decode_entities(str)
text = tostring(text)
return find(str, "&", 1, true) and
if not text:match("[\128-\255]") then
gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
return text:lower()
else
return mw.ustring.lower(text)
end
end
end
end


--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
do
function export.upper(text)
local html_entities
text = tostring(text)
if not text:match("[\128-\255]") then
local function encode_entity(ch)
return text:upper()
local entity = html_entities[ch]
else
if entity then
return mw.ustring.upper(text)
return entity
end
entity = "&#" .. codepoint(ch) .. ";"
html_entities[ch] = entity
return entity
end
end
end
 
function export.encode_entities(str, charset, str_lib, plain)
--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.]==]
-- Memoized HTML entities (taken from mw.text.lua).
function export.find(text, pattern, init_char, plain)
html_entities = html_entities or {
text = tostring(text)
["\""] = "&quot;",
local simple
["&"] = "&amp;",
pattern, simple = patternSimplifier(text, pattern, plain)
["'"] = "&#039;",
-- If the pattern is simple but multibyte characters are present, then init_char needs to be converted into bytes for string.find to work properly, and the return values need to be converted back into chars.
["<"] = "&lt;",
if simple then
[">"] = "&gt;",
if not text:match("[\128-\255]") then
["\194\160"] = "&nbsp;",
return text:find(pattern, init_char, plain)
}
else
if not charset then
local init_byte = init_char and iterate_utf8(text, init_char, "char")
return (gsub(str, "[\"&'<>\194]\160?", html_entities))
local byte1, byte2, c1, c2, c3, c4, c5, c6, c7, c8, c9 = text:find(pattern, init_byte, plain)
elseif plain then
return (gsub(str, "[" .. charset_escape(charset) .. "]", encode_entity))
-- If string.find returned nil, then return nil.
elseif str_lib then
if not (byte1 and byte2) then
if not match(charset, "^()[^\128-\255]*$") then
return nil
error("Cannot use the string library with a character set that contains a character with a codepoint above U+007F.")
end
end
return (gsub(str, "[" .. charset .. "]", encode_entity))
-- Get first return value. If we have a positive init_char, we can save resources by resuming at that point.
end
local char1, char2
local pattern = charset and "[" .. charset .. "]"
if (not init_char) or init_char > 0 then
local simple = pattern_simplifier(pattern)
char1 = iterate_utf8(text, byte1, "byte", init_byte, init_char)
if simple then
else
return (gsub(str, simple, encode_entity))
char1 = iterate_utf8(text, byte1, "byte")
end
-- If byte1 and byte2 are the same, don't bother running iterate_utf8 twice. Otherwise, resume iterate_utf8 from byte1 to find char2.
if byte1 == byte2 then
char2 = char1
else
char2 = iterate_utf8(text, byte2, "byte", byte1, char1)
end
return unpack{char1, char2, c1, c2, c3, c4, c5, c6, c7, c8, c9}
end
end
else
return (ugsub(str, pattern, encode_entity))
return mw.ustring.find(text, pattern, init_char, plain)
end
end
end
end


--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==]
do
function export.match(text, pattern, init)
local function decode_path(code)
text = tostring(text)
return char(tonumber(code, 16))
local simple
end
pattern, simple = patternSimplifier(text, pattern)
if simple then
local function decode(lead, trail)
if init and text:find("[\128-\255]") then
if lead == "+" or lead == "_" then
init = iterate_utf8(text, init, "char")
return " " .. trail
elseif #trail == 2 then
return decode_path(trail)
end
return lead .. trail
end
function export.decode_uri(str, enctype)
enctype = enctype and upper(enctype) or "QUERY"
if enctype == "PATH" then
return find(str, "%", 1, true) and
gsub(str, "%%(%x%x)", decode_path) or str
elseif enctype == "QUERY" then
return (find(str, "%", 1, true) or find(str, "+", 1, true)) and
gsub(str, "([%%%+])(%x?%x?)", decode) or str
elseif enctype == "WIKI" then
return (find(str, "%", 1, true) or find(str, "_", 1, true)) and
gsub(str, "([%%_])(%x?%x?)", decode) or str
end
end
return text:match(pattern, init)
error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)
else
return mw.ustring.match(text, pattern, init)
end
end
end
end


--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==]
do
function export.gmatch(text, pattern)
local function _remove_comments(str, pre)
text = tostring(text)
local head = find(str, "<!--", 1, true)
local simple
if not head then
pattern, simple = patternSimplifier(text, pattern)
return str
if simple then
end
return text:gmatch(pattern)
local ret, n = {sub(str, 1, head - 1)}, 1
else
while true do
return mw.ustring.gmatch(text, pattern)
local loc = find(str, "-->", head + 4, true)
if not loc then
return pre and concat(ret) or
concat(ret) .. sub(str, head)
end
head = loc + 3
loc = find(str, "<!--", head, true)
if not loc then
return concat(ret) .. sub(str, head)
end
n = n + 1
ret[n] = sub(str, head, loc - 1)
head = loc
end
end
--[==[Removes any HTML comments from the input text. `stage` can be one of three options:
* {{lua|"PRE"}} (default) applies the method used by MediaWiki's preprocessor: all {{code||<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed {{code||<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
* {{lua|"POST"}} applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any {{code||<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. {{code||<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed {{code||<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {{lua|"PRE"}} method will have already been applied by the native parser.
* {{lua|"BOTH"}} applies {{lua|"PRE"}} then {{lua|"POST"}}.]==]
function export.remove_comments(str, stage)
if not stage or stage == "PRE" then
return _remove_comments(str, true)
end
local processed = stage == "POST" and _remove_comments(str) or
stage == "BOTH" and _remove_comments(str, true) or
error("bad argument #2 to \"remove_comments\" (expected PRE, POST, or BOTH)", 2)
while processed ~= str do
str = processed
processed = _remove_comments(str)
end
return str
end
end
end
end


--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==]
--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{code|lua|"\0"}}, {{code|lua|"\t"}}, {{code|lua|"\n"}}, {{code|lua|"\v"}}, {{code|lua|"\r"}} and {{code|lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
function export.gsub(text, pattern, repl, n)
function export.php_trim(str)
text = tostring(text)
return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or ""
local simple
pattern, simple = patternSimplifier(text, pattern)
if simple then
return text:gsub(pattern, repl, n)
else
return mw.ustring.gsub(text, pattern, repl, n)
end
end
end
php_trim = export.php_trim


--[==[
--[==[Takes a parameter name as an input, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{code|lua|frame.args}} table). For example, {{code|lua|"1"}} is normalized to {{code|lua|1}} (a number), and {{code|lua|" foo "}} is normalized to {{code|lua|"foo"}}. If the input is not a string, it is returned unchanged.
-- Reimplementation of mw.ustring.split() that includes any capturing
-- groups in the splitting pattern. This works like Python's re.split()
-- function, except that it has Lua's behavior when the split pattern
-- is empty (i.e. advancing by one character at a time; Python returns the
-- whole remainder of the string).
]==]
function export.capturing_split(str, pattern)
    local ret = {}
    -- (.-) corresponds to (.*?) in Python or Perl; () captures the
    -- current position after matching.
    pattern = "(.-)" .. pattern .. "()"
    local start = 1
    while true do
        -- Did we reach the end of the string?
        if start > #str then
            table.insert(ret, "")
            return ret
        end
        -- match() returns all captures as multiple return values;
        -- we need to insert into a table to get them all.
        local captures = {export.match(str, pattern, start)}
        -- If no match, add the remainder of the string.
        if #captures == 0 then
            table.insert(ret, export.sub(str, start))
            return ret
        end
        local newstart = table.remove(captures)
        -- Special case: If we don't advance by any characters, then advance
        -- by one character; this avoids an infinite loop, and makes splitting
        -- by an empty string work the way mw.ustring.split() does. If we
        -- reach the end of the string this way, return immediately, so we
        -- don't get a final empty string.
        if newstart == start then
            table.insert(ret, export.sub(str, start, start))
            table.remove(captures, 1)
            start = start + 1
            if start > #str then
            return ret
            end
        else
            table.insert(ret, table.remove(captures, 1))
            start = newstart
        end
        -- Insert any captures from the splitting pattern.
        for _, x in ipairs(captures) do
            table.insert(ret, x)
        end
    end
end


local function uclcfirst(text, dolower)
After being trimmed with {{code|lua|export.php_trim}}, strings are converted to numbers if:
local function douclcfirst(text)
# They are integers, with no decimals (2.0) or leading zeroes (02).
-- Actual function to re-case the first letter.
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
local first_letter = export.sub(text, 1, 1)
# For positive values, they do not have a leading {{code|lua|+}} sign.]==]
first_letter = dolower and export.lower(first_letter) or export.upper(first_letter)
function export.scribunto_param_key(key)
return first_letter .. export.sub(text, 2)
if type(key) ~= "string" then
return key
end
end
-- If there's a link at the beginning, re-case the first letter of the
key = php_trim(key)
-- link text. This pattern matches both piped and unpiped links.
if match(key, "^-?[1-9]%d*$") then
-- If the link is not piped, the second capture (linktext) will be empty.
local num = tonumber(key)
local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
-- Lua integers are only accurate to 2^53 - 1, so we have to specifically check for 2^53 and -2^53, since 2^53 == 2^53 + 1 evaluates to true.
if link then
return (
return "[[" .. link .. "|" .. douclcfirst(linktext ~= "" and linktext or link) .. "]]" .. remainder
num <= 9007199254740991 and num >= -9007199254740991 or
key == "9007199254740992" or
key == "-9007199254740992"
) and num or key
elseif key == "0" then
return 0
end
end
return douclcfirst(text)
return key
end
end


function export.ucfirst(text)
do
return uclcfirst(text, false)
local byte_escapes
local function escape_byte(b)
return byte_escapes[b] or format("\\%03d", byte(b))
end
function export.escape_bytes(str)
byte_escapes = byte_escapes or load_data("Module:string utilities/data").byte_escapes
return (gsub(str, ".", escape_byte))
end
end
end


function export.lcfirst(text)
function export.format_fun(str, fun)
return uclcfirst(text, true)
return (gsub(str, "{(\\?)((\\?)[^{}]*)}", function(p1, name, p2)
if #p1 + #p2 == 1 then
return name == "op" and "{" or
name == "cl" and "}" or
error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
elseif fun(name) and type(fun(name)) ~= "string" then
error(module_name .. ".format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")
end
return fun(name) or error(module_name .. ".format: \"" .. name .. "\" not found in table")
end))
end
end
format_fun = export.format_fun


-- Almost identical to mw.text.nowiki, but with minor changes to be identical to the PHP equivalent: ";" always escapes, and colons in certain protocols only escape after regex \b. Also about 2-3 times as fast.
--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
function export.nowiki(text)
====Examples====
return (text
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
:gsub("[\"&'<=>%[%]{|};]", {
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
["\""] = "&#34;", ["&"] = "&#38;", ["'"] = "&#39;",
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
["<"] = "&#60;", ["="] = "&#61;", [">"] = "&#62;",
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
["["] = "&#91;", ["]"] = "&#93;", ["{"] = "&#123;",
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
["|"] = "&#124;", ["}"] = "&#125;", [";"] = "&#59;"
function export.format(str, tbl)
})
return format_fun(str, function(key)
:gsub("%f[^%z\r\n][#*: \n\r\t]", {
return tbl[key]
["#"] = "&#35;", ["*"] = "&#42;", [":"] = "&#58;",
end)
[" "] = "&#32;", ["\n"] = "&#10;", ["\r"] = "&#13;",
["\t"] = "&#9;"
})
:gsub("(%f[^%z\r\n])%-(%-%-%-)", "%1&#45;%2")
:gsub("__", "_&#95;")
:gsub("://", "&#58;//")
:gsub("([IP]?[MRS][BFI][CDN])([\t\n\f\r ])", function(m1, m2)
if m1 == "ISBN" or m1 == "RFC" or m1 == "PMID" then
return m1 .. m2:gsub(".", {
["\t"] = "&#9;", ["\n"] = "&#10;", ["\f"] = "&#12;",
["\r"] = "&#13;", [" "] = "&#32;"
})
end
end)
:gsub("[%w_]+:", {
["bitcoin:"] = "bitcoin&#58;", ["geo:"] = "geo&#58;", ["magnet:"] = "magnet&#58;",
["mailto:"] = "mailto&#58;", ["matrix:"] = "matrix&#58;", ["news:"] = "news&#58;",
["sip:"] = "sip&#58;", ["sips:"] = "sips&#58;", ["sms:"] = "sms&#58;",
["tel:"] = "tel&#58;", ["urn:"] = "urn&#58;", ["xmpp:"] = "xmpp&#58;"
}))
end
end


function export.capitalize(text)
do
if type(text) == "table" then
local function do_uclcfirst(str, case_func)
-- allow calling from a template
-- Actual function to re-case the first letter.
text = text.args[1]
local first_letter = case_func(match(str, "^.[\128-\191]*") or "")
return first_letter .. sub(str, #first_letter + 1)
end
local function uclcfirst(str, case_func)
-- If there's a link at the beginning, re-case the first letter of the
-- link text. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
if link then
return "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
end
return do_uclcfirst(str, case_func)
end
end
-- Capitalize multi-word that is separated by spaces
-- by uppercasing the first letter of each part.
function export.ucfirst(str)
-- I assume nobody will input all CAP text.
return uclcfirst(str, uupper)
w2 = {}
for w in export.gmatch(text, "%S+") do
table.insert(w2, uclcfirst(w, false))
end
end
return table.concat(w2, " ")
end


function export.pluralize(text)
function export.lcfirst(str)
if type(text) == "table" then
return uclcfirst(str, ulower)
-- allow calling from a template
end
text = text.args[1]
local function capitalize(w)
return uclcfirst(w, uupper)
end
end
-- Pluralize a word in a smart fashion, according to normal English rules.
-- 1. If word ends in consonant + -y, replace the -y with -ies.
-- 2. If the word ends in -s, -x, -z, -sh, -ch, add -es.
-- 3. Otherwise, add -s.
-- This handles links correctly:
-- 1. If a piped link, change the second part appropriately.
-- 2. If a non-piped link and rule #1 above applies, convert to a piped link
--    with the second part containing the plural.
-- 3. If a non-piped link and rules #2 or #3 above apply, add the plural
--    outside the link.
local function word_ends_in_consonant_plus_y(text)
--[==[Capitalize each word of a string. WARNING: May be broken in the presence of multiword links.]==]
function export.capitalize(str)
if type(str) == "table" then
-- allow calling from a template
str = str.args[1]
end
-- Capitalize multi-word that is separated by spaces
-- by uppercasing the first letter of each part.
-- I assume nobody will input all CAP text.
return (ugsub(str, "%S+", capitalize))
end
end
 
do
local function word_ends_in_consonant_plus_y(str)
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
-- apply to proper nouns, hence "the Gettys", "the public Ivys".
-- apply to proper nouns, hence "the Gettys", "the public Ivys".
Line 579: Line 964:
-- be important as this function is almost always called on common nouns
-- be important as this function is almost always called on common nouns
-- (e.g. parts of speech, place types).
-- (e.g. parts of speech, place types).
return text:find("[^aeiouAEIOU ]y$")
return find(str, "[^aeiouyAEIOUY ]y$")
end
end
local function word_takes_es_plural(text)
local function word_takes_es_plural(str)
return text:find("[sxz]$") or text:find("[cs]h$")
return find(str, "[sxz]$") or find(str, "[csz]h$")
end
end
local function do_pluralize(text)
local function do_pluralize(str)
if word_ends_in_consonant_plus_y(text) then
if word_ends_in_consonant_plus_y(str) then
-- avoid returning multiple values
-- avoid returning multiple values
local hack_single_retval = text:gsub("y$", "ies")
return (gsub(str, "y$", "ies"))
return hack_single_retval
elseif word_takes_es_plural(str) then
elseif word_takes_es_plural(text) then
return str .. "es"
return text .. "es"
end
else
return str .. "s"
return text .. "s"
end
--[==[
Pluralize a word in a smart fashion, according to normal English rules.
# If word ends in consonant + -y, replace the -y with -ies.
# If the word ends in -s, -x, -z, -ch, -sh, -zh, add -es.
# Otherwise, add -s.
 
This handles links correctly:
# If a piped link, change the second part appropriately.
# If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
# If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.
]==]
function export.pluralize(str)
if type(str) == "table" then
-- allow calling from a template
str = str.args[1]
end
end
end
-- Check for a link. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
-- Check for a link. This pattern matches both piped and unpiped links.
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
-- If the link is not piped, the second capture (linktext) will be empty.
if not link then
local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
return do_pluralize(str)
if link then
elseif linktext ~= "" then
if linktext ~= "" then
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]"
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]"
end
elseif word_ends_in_consonant_plus_y(link) then
if word_ends_in_consonant_plus_y(link) then
return beginning .. "[[" .. link .. "|" .. gsub(link, "y$", "ies") .. "]]"
return beginning .. "[[" .. link .. "|" .. link:gsub("y$", "ies") .. "]]"
end
end
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
end
end
return do_pluralize(text)
end
end


function export.singularize(text)
do
if type(text) == "table" then
local function do_singularize(str)
-- allow calling from a template
local sing = match(str, "^(.-)ies$")
text = text.args[1]
end
-- Singularize a word in a smart fashion, according to normal English rules.
-- Works analogously to pluralize().
-- NOTE: This doesn't always work as well as pluralize(). Beware. It will
-- mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
-- 1. If word ends in -ies, replace -ies with -y.
-- 2. If the word ends in -xes, -shes, -ches, remove -es. [Does not affect
--    -ses, cf. "houses", "impasses".]
-- 3. Otherwise, remove -s.
-- This handles links correctly:
-- 1. If a piped link, change the second part appropriately. Collapse the
--    link to a simple link if both parts end up the same.
-- 2. If a non-piped link, singularize the link.
-- 3. A link like "[[parish]]es" will be handled correctly because the
--    code that checks for -shes etc. allows ] characters between the
--    'sh' etc. and final -es.
local function do_singularize(text)
local sing = text:match("^(.-)ies$")
if sing then
if sing then
return sing .. "y"
return sing .. "y"
end
end
-- Handle cases like "[[parish]]es"
-- Handle cases like "[[parish]]es"
local sing = text:match("^(.-[sc]h%]*)es$")
return match(str, "^(.-[cs]h%]*)es$") or -- not -zhes
if sing then
return sing
end
-- Handle cases like "[[box]]es"
-- Handle cases like "[[box]]es"
local sing = text:match("^(.-x%]*)es$")
match(str, "^(.-x%]*)es$") or -- not -ses or -zes
if sing then
-- Handle regular plurals
return sing
match(str, "^(.-)s$") or
end
-- Otherwise, return input
local sing = text:match("^(.-)s$")
str
if sing then
return sing
end
return text
end
end
 
local function collapse_link(link, linktext)
local function collapse_link(link, linktext)
if link == linktext then
if link == linktext then
return "[[" .. link .. "]]"
return "[[" .. link .. "]]"
else
return "[[" .. link .. "|" .. linktext .. "]]"
end
end
return "[[" .. link .. "|" .. linktext .. "]]"
end
end
--[==[
Singularize a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}.
'''NOTE''': This doesn't always work as well as {pluralize()}. Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
# If word ends in -ies, replace -ies with -y.
# If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".]
# Otherwise, remove -s.


-- Check for a link. This pattern matches both piped and unpiped links.
This handles links correctly:
-- If the link is not piped, the second capture (linktext) will be empty.
# If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up the same.
local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
# If a non-piped link, singularize the link.
if link then
# A link like "[[parish]]es" will be handled correctly because the code that checks for -shes etc. allows ] characters between the
if linktext ~= "" then
  'sh' etc. and final -es.
]==]
function export.singularize(str)
if type(str) == "table" then
-- allow calling from a template
str = str.args[1]
end
-- Check for a link. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
if not link then
return do_singularize(str)
elseif linktext ~= "" then
return beginning .. collapse_link(link, do_singularize(linktext))
return beginning .. collapse_link(link, do_singularize(linktext))
end
end
return beginning .. "[[" .. do_singularize(link) .. "]]"
return beginning .. "[[" .. do_singularize(link) .. "]]"
end
end
return do_singularize(text)
end
end


 
--[==[
function export.add_indefinite_article(text, uppercase)
Return the appropriate indefinite article to prefix to `str`. Correctly handles links and capitalized text.
Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning with
a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.get_indefinite_article(str, ucfirst)
str = str or ""
local is_vowel = false
local is_vowel = false
-- If there's a link at the beginning, examine the first letter of the
-- If there's a link at the beginning, examine the first letter of the
-- link text. This pattern matches both piped and unpiped links.
-- link text. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
-- If the link is not piped, the second capture (linktext) will be empty.
local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
if link then
if link then
is_vowel = export.find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")
is_vowel = find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")
else
else
is_vowel = export.find(text, "^[AEIOUaeiou]")
is_vowel = find(str, "^[AEIOUaeiou]")
end
end
return (is_vowel and (uppercase and "An " or "an ") or (uppercase and "A " or "a ")) .. text
return is_vowel and (ucfirst and "An" or "an") or (ucfirst and "A" or "a")
end
end
get_indefinite_article = export.get_indefinite_article


-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.).
--[==[
function export.escape_risky_characters(text)
Prefix `text` with the appropriate indefinite article. Correctly handles links and capitalized
if text:match("\"'") then
text. Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning
for _, pattern in ipairs(require("Module:languages/data/patterns")) do
with a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
text = text:gsub(pattern, function(m1) return mw.text.encode(m1, "\"'") end)
]==]
end
function export.add_indefinite_article(text, ucfirst)
end
return get_indefinite_article(text, ucfirst) .. " " .. text
-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software.
if not mw.ustring.match(text, "%S") then
return mw.text.encode(text, "%s")
else
return mw.text.encode(text, "!#%%&*+/:;<=>?@[\\%]_{|}")
end
end
end


return export
return export

Latest revision as of 09:42, 31 July 2024



local mw = mw
local string = string
local table = table
local ustring = mw.ustring

local byte = string.byte
local char = string.char
local concat = table.concat
local find = string.find
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local len = string.len
local load_data = mw.loadData
local lower = string.lower
local match = string.match
local next = next
local reverse = string.reverse
local select = select
local sort = table.sort
local sub = string.sub
local tonumber = tonumber
local tostring = tostring
local type = type
local ucodepoint = ustring.codepoint
local ufind = ustring.find
local ugcodepoint = ustring.gcodepoint
local ugmatch = ustring.gmatch
local ugsub = ustring.gsub
local ulower = ustring.lower
local umatch = ustring.match
local unpack = unpack
local upper = string.upper
local usub = ustring.sub
local uupper = ustring.upper
-- Defined below.
local charset_escape
local codepoint
local explode_utf8
local format_fun
local get_indefinite_article
local pattern_escape
local pattern_simplifier
local php_trim
local replacement_escape
local u
local ulen

local module_name = "string_utilities"

local export = {}

--[==[Splits a string into an array holding one UTF-8 character per element, in their original order. '''Warning''': to keep speed and memory use down, no validation is performed, so the result is undefined for inputs containing byte sequences that are not valid UTF-8.]==]
function export.explode_utf8(str)
	local chars, count = {}, 0
	-- Each match is a single byte of any value, plus every continuation byte (0x80-0xBF) that immediately follows it.
	local next_char = gmatch(str, ".[\128-\191]*")
	local ch = next_char()
	while ch do
		count = count + 1
		chars[count] = ch
		ch = next_char()
	end
	return chars
end
explode_utf8 = export.explode_utf8

do
	-- Replacement for every character that needs rewriting. A lookup table is used instead of the replacement string "%%%0" because a NUL byte has to become "%z" rather than "%" followed by a raw NUL; any matched character without an entry is left unchanged by gsub.
	local pattern_escapes = {
		["\0"] = "%z", ["$"] = "%$", ["%"] = "%%", ["("] = "%(",
		[")"] = "%)", ["*"] = "%*", ["+"] = "%+", ["-"] = "%-",
		["."] = "%.", ["?"] = "%?", ["["] = "%[", ["]"] = "%]",
		["^"] = "%^",
	}
	
	--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>. For example, {{code|lua|"^$()%.[]*+-?"}} becomes {{code|lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input). NUL bytes are additionally converted to {{code|lua|"%z"}}, because Lua 5.1 treats a raw NUL byte as the end of a pattern, which would otherwise silently truncate it. Returns the escaped string only (not the substitution count).]==]
	function export.pattern_escape(str)
		return (gsub(str, "[%z$%%()*+%-.?[%]^]", pattern_escapes))
	end
end
pattern_escape = export.pattern_escape

do
	-- Replacement for every character that needs rewriting. A lookup table is used instead of the replacement string "%%%0" because a NUL byte has to become "%z" rather than "%" followed by a raw NUL; any matched character without an entry is left unchanged by gsub.
	local charset_escapes = {
		["\0"] = "%z", ["%"] = "%%", ["-"] = "%-", ["]"] = "%]", ["^"] = "%^",
	}
	
	--[==[Escapes only the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>. NUL bytes are additionally converted to {{code|lua|"%z"}}, because Lua 5.1 treats a raw NUL byte as the end of a pattern, which would otherwise leave the character set unterminated. The result is intended to be placed between <code>[</code> and <code>]</code>. Returns the escaped string only (not the substitution count).]==]
	function export.charset_escape(str)
		return (gsub(str, "[%z%%%-%]^]", charset_escapes))
	end
end
charset_escape = export.charset_escape

--[==[Doubles every <code>%</code> in `str`, so that the result can be used verbatim as a replacement string with string.gsub or mw.ustring.gsub; <code>%</code> is the only character with a special meaning in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]].]==]
function export.replacement_escape(str)
	-- Assigning to a single local discards gsub's second return value (the substitution count).
	local escaped = gsub(str, "%%", "%%%%")
	return escaped
end
replacement_escape = export.replacement_escape

do
	-- Deep comparison of two nested sets (tables whose values are either
	-- `true` or further nested sets). Returns true only if both contain
	-- exactly the same keys at every level.
	local function check_sets_equal(set1, set2)
		-- `cursor` steps through set2 once per key of set1, so that after the
		-- loop we can tell whether set2 has any keys left over.
		local cursor
		for key, value1 in next, set1 do
			local value2 = set2[key]
			if value1 ~= value2 then
				if value2 == nil or not check_sets_equal(value1, value2) then
					return false
				end
			end
			cursor = next(set2, cursor)
		end
		return next(set2, cursor) == nil
	end
	
	-- Checks that every branch of the byte trie `bytes` is identical: all
	-- values at a given level must be equal nested sets, recursively, down to
	-- the leaves (which are `true`).
	local function check_sets(bytes)
		local first_key, first_set = next(bytes)
		if first_set == true then
			-- Leaf level: nothing further to compare.
			return true
		end
		if not check_sets(first_set) then
			return false
		end
		-- Every remaining sibling must match the first one.
		for _, other_set in next, bytes, first_key do
			if not check_sets_equal(other_set, first_set) then
				return false
			end
		end
		return true
	end
	
	-- Converts an array of byte values into pattern text matching any one of
	-- them: a bare character if there is only one value, otherwise a bracketed
	-- set in which runs of consecutive bytes are written as "a-b" ranges. Note
	-- that `range` is sorted in place.
	local function make_charset(range)
		local size = #range
		if size == 1 then
			return char(range[1])
		end
		sort(range)
		local pieces, run_start = {}, range[1]
		for i = 1, size do
			local current, following = range[i], range[i + 1]
			-- A run ends wherever the next value is not the direct successor.
			if following ~= current + 1 then
				if current == run_start then
					pieces[#pieces + 1] = char(current)
				else
					pieces[#pieces + 1] = char(run_start) .. "-" .. char(current)
				end
				run_start = following
			end
		end
		return "[" .. concat(pieces) .. "]"
	end
	
	-- Scans a character set in `pattern` starting at byte offset `pos` (which
	-- should be inside the brackets). Returns the offset just past the closing
	-- "]" if the set contains only literal single-byte characters; returns
	-- false if it is unterminated, or contains a multibyte character, a
	-- %-class listed below, or "%" followed by a non-ASCII byte.
	local function parse_1_byte_charset(pattern, pos)
		local found, after
		repeat
			pos, found, after = match(pattern, "()([%%%]\194-\244][\128-\191]*)()", pos)
			if found == "]" then
				return after
			elseif found ~= "%" or match(pattern, "^[acdlpsuwxACDLPSUWXZ\128-\255]", after) then
				-- Nothing found, a multibyte character, or an unusable escape.
				return false
			end
			-- Step over the "%" and the ASCII character it escapes.
			pos = pos + 2
		until false
	end
	
	--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.
	
	A number is returned as its string form. If no part of the pattern needed rewriting, the original pattern string is returned unchanged. The function is wrapped with the `memoize` function of [[Module:fun]].]==]
	pattern_simplifier = require("Module:fun").memoize(function(pattern)
		if type(pattern) == "number" then
			return tostring(pattern)
		end
		-- pos: current scan offset (in bytes).
		-- captures: number of "(" seen so far.
		-- start: offset of the first pattern byte not yet copied to `output`.
		-- n: number of chunks in `output`.
		-- output: list of rewritten chunks; only created once a rewrite is
		-- actually needed (starts as nil).
		local pos, captures, start, n, output = 1, 0, 1, 0
		while true do
			local ch, nxt_pos
			-- Find the next item of interest: "%", "(", ".", "[" or a
			-- multibyte character (leading byte plus continuation bytes).
			pos, ch, nxt_pos = match(pattern, "()([%%(.[\194-\244][\128-\191]*)()", pos)
			if not ch then
				break
			end
			local nxt = sub(pattern, nxt_pos, nxt_pos)
			if ch == "%" then
				if nxt == "b" then
					-- Both delimiter arguments of %b must be ASCII.
					if not match(pattern, "^()[^\128-\255][^\128-\255]", pos + 2) then
						return false
					end
					pos = pos + 4
				elseif nxt == "f" then
					pos = pos + 2
					-- Must be followed by a charset which is not negated.
					if not match(pattern, "^()%[[^^]", pos) then
						return false
					end
					-- Only possible to convert a %f charset which is all
					-- ASCII, so use parse_1_byte_charset.
					pos = parse_1_byte_charset(pattern, pos)
					if not pos then
						return false
					end
				elseif nxt == "Z" then
					pos = pos + 2
					nxt = sub(pattern, pos, pos)
					-- With a repetition quantifier, %Z is left as it is.
					if nxt == "*" or nxt == "+" or nxt == "-" then
						pos = pos + 1
					else
						-- Otherwise, replace %Z with an explicit match for one
						-- non-NUL UTF-8 character (optional if followed by "?").
						output = output or {}
						n = n + 1
						if nxt == "?" then
							output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244]?[\128-\191]*"
							pos = pos + 1
						else
							output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244][\128-\191]*"
						end
						start = pos
					end
				-- All other character classes cannot be converted.
				elseif find("acdlpsuwxACDLPSUWX", nxt, 1, true) then
					return false
				-- Skip the next character if it's ASCII. Otherwise, we will
				-- still need to do length checks.
				-- NOTE(review): if the pattern ends with a lone "%", `nxt` is
				-- "" and byte(nxt) returns nothing, so this comparison would
				-- raise an error — confirm whether callers can hit this.
				else
					pos = pos + (byte(nxt) < 128 and 2 or 1)
				end
			elseif ch == "(" then
				-- Fail on "()" (a position capture; presumably because the
				-- string library would report a byte offset), or if 32
				-- captures have already been counted.
				if nxt == ")" or captures == 32 then
					return false
				end
				captures = captures + 1
				pos = pos + 1
			elseif ch == "." then
				-- With a repetition quantifier, "." is left as it is.
				if nxt == "*" or nxt == "+" or nxt == "-" then
					pos = pos + 2
				else
					-- Otherwise, replace "." with an explicit match for one
					-- whole UTF-8 character (optional if followed by "?").
					output = output or {}
					n = n + 1
					if nxt == "?" then
						output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191]?[\128-\191]*"
						pos = pos + 2
					else
						output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191][\128-\191]*"
						pos = pos + 1
					end
					start = pos
				end
			elseif ch == "[" then
				-- Fail negative charsets. TODO: 1-byte charsets should be safe.
				if nxt == "^" then
					return false
				-- If the first character is "%", ch_len is determined by the
				-- next one instead.
				elseif nxt == "%" then
					nxt_pos = nxt_pos + 1
					nxt = sub(pattern, nxt_pos, nxt_pos)
				end
				-- Byte length of the first character in the charset; every
				-- other character in it must have the same length.
				local ch_len = #match(pattern, "^.[\128-\191]*", nxt_pos)
				if ch_len == 1 then -- Single-byte charset.
					pos = parse_1_byte_charset(pattern, pos + 1)
					if not pos then
						return false
					end
				else -- Multibyte charset.
					-- `bytes` becomes a trie of the byte sequences of each
					-- character in the charset, with `true` at the leaves.
					local charset_pos, bytes = pos
					pos = pos + 1
					while true do -- TODO: non-ASCII charset ranges.
						pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
						if not ch then
							return false
						-- If escaped, get the next character. No need to
						-- distinguish magic characters or character classes,
						-- as they'll all fail for having the wrong length
						-- anyway.
						elseif ch == "%" then
							pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
						elseif ch == "]" then
							pos = nxt_pos
							break
						end
						if ch_len ~= #ch then
							return false
						end
						bytes = bytes or {}
						-- Shadow `bytes` to walk down the trie without losing
						-- the reference to its root.
						local bytes = bytes
						for i = 1, ch_len - 1 do
							local b = byte(ch, i, i)
							bytes[b] = bytes[b] or {}
							bytes = bytes[b]
						end
						bytes[byte(ch, -1)] = true
						pos = nxt_pos
					end
					if not pos then
						return false
					end
					local nxt = sub(pattern, pos, pos)
					-- Fail if the charset is followed by "?", "*" or "-", by
					-- "+" when the characters are longer than 2 bytes, or if
					-- the trie's branches are not all identical (see
					-- check_sets).
					if (
						(nxt == "?" or nxt == "*" or nxt == "-") or
						(nxt == "+" and ch_len > 2) or
						not check_sets(bytes)
					) then
						return false
					end
					-- Collect the keys found at each level of the trie, giving
					-- one array of byte values per byte position. Only the
					-- first branch is followed down.
					local ranges, b, key, next_byte = {}, 0
					repeat
						key, next_byte = next(bytes)
						local range, n = {key}, 1
						-- Loop starts on the second iteration.
						for key in next, bytes, key do
							n = n + 1
							range[n] = key
						end
						b = b + 1
						ranges[b] = range
						bytes = next_byte
					until next_byte == true
					if nxt == "+" then
						-- 2-byte characters only: emit a first-byte set, then
						-- any number of bytes from either set, then a
						-- second-byte set.
						local range1, range2 = ranges[1], ranges[2]
						ranges[1] = make_charset(range1)
						ranges[3] = make_charset(range2)
						local n = #range2
						for i = 1, #range1 do
							n = n + 1
							range2[n] = range1[i]
						end
						ranges[2] = make_charset(range2) .. "*"
						pos = pos + 1
					else
						for i = 1, #ranges do
							ranges[i] = make_charset(ranges[i])
						end
					end
					output = output or {}
					n = n + 1
					output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges)
					start = pos
				end
			-- From here on, `ch` is a multibyte character outside a charset.
			elseif nxt == "+" then
				-- Only 2-byte characters can be converted.
				if #ch ~= 2 then
					return false
				end
				output = output or {}
				n = n + 1
				-- Emit the first byte, any number of either byte, then the
				-- second byte.
				output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. sub(ch, 2, 2)
				pos = nxt_pos + 1
				start = pos
			elseif nxt == "?" or nxt == "*" or nxt == "-" then
				return false
			else
				pos = nxt_pos
			end
		end
		-- Nothing was rewritten, so the pattern can be used as it is.
		if start == 1 then
			return pattern
		end
		return concat(output) .. sub(pattern, start)
	end, true)
	export.pattern_simplifier = pattern_simplifier -- For testing.
end

--[==[Returns the length of `str` in UTF-8 characters. A number is measured by the length of its string form.]==]
function export.len(str)
	if type(str) == "number" then
		return len(str)
	end
	-- Strip everything except continuation bytes; what is left is exactly
	-- the surplus of bytes over characters.
	local continuation_bytes = gsub(str, "[^\128-\191]+", "")
	return #str - #continuation_bytes
end
ulen = export.len

--[==[Returns the substring of `str` running from character `i` (default 1) to character `j` (default: the end of the string), counted in UTF-8 characters. Uses the string library where possible; if either index is negative, the work is handed to mw.ustring.sub.]==]
function export.sub(str, i, j)
	if type(str) == "number" then
		str = tostring(str)
	end
	i = i or 1
	if i < 0 or (j and j < 0) then
		return usub(str, i, j)
	end
	if (j and i > j) or i > #str then
		return ""
	end
	-- Walk over runs of non-continuation bytes (i.e. character starts), each
	-- followed by the continuation bytes of the run's final character.
	-- `seen` is the number of characters started so far; `first_byte` is the
	-- byte offset of character `i`, once known.
	local seen, first_byte = 0
	for run_start, run_end in gmatch(str, "()[^\128-\191]+()[\128-\191]*") do
		seen = seen + (run_end - run_start)
		if not first_byte and seen >= i then
			first_byte = run_end - (seen - i) - 1
			if not j then
				return sub(str, first_byte)
			end
		end
		if j and seen > j then
			-- Character j + 1 starts in this run, so stop just before it.
			return sub(str, first_byte, run_end - (seen - j) - 1)
		end
	end
	if first_byte then
		return sub(str, first_byte)
	end
	return ""
end

do
	-- Post-processes the results of string.find: if there was a match and
	-- `str` contains non-ASCII bytes, the byte offsets are converted into
	-- character offsets. Captures are passed through untouched.
	local function _find(str, loc1, loc2, ...)
		if not loc1 or match(str, "^()[^\128-\255]*$") then
			return loc1, loc2, ...
		end
		-- Characters up to the start of the match, and characters within it.
		local char_start = ulen(sub(str, 1, loc1))
		local char_count = ulen(sub(str, loc1, loc2))
		return char_start, char_start + char_count - 1, ...
	end
	
	--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.]==]
	function export.find(str, pattern, init, plain)
		init = init or 1
		-- A non-default start offset in a non-ASCII string is a character
		-- offset, so it has to be handled by the ustring library.
		if init ~= 1 and not match(str, "^()[^\128-\255]*$") then
			return ufind(str, pattern, init, plain)
		end
		if plain then
			return _find(str, find(str, pattern, init, true))
		end
		local simple = pattern_simplifier(pattern)
		if not simple then
			return ufind(str, pattern, init)
		end
		return _find(str, find(str, simple, init))
	end
end

--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==]
function export.match(str, pattern, init)
	init = init or 1
	-- The string library is only usable if the start offset is the default,
	-- or the string is pure ASCII (so byte and character offsets coincide).
	if init == 1 or match(str, "^()[^\128-\255]*$") then
		local simple = pattern_simplifier(pattern)
		if simple then
			return match(str, simple, init)
		end
	end
	return umatch(str, pattern, init)
end

--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==]
function export.gmatch(str, pattern)
	local simple = pattern_simplifier(pattern)
	if not simple then
		-- The pattern could not be converted for the string library.
		return ugmatch(str, pattern)
	end
	return gmatch(str, simple)
end

--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==]
function export.gsub(str, pattern, repl, n)
	local simple = pattern_simplifier(pattern)
	if not simple then
		-- The pattern could not be converted for the string library.
		return ugsub(str, pattern, repl, n)
	end
	return gsub(str, simple, repl, n)
end

--[==[Like gsub, but with pattern-matching facilities turned off: `pattern` is treated as literal text, as is `repl` if it is a string.]==]
function export.plain_gsub(str, pattern, repl, n)
	-- Only string replacements need escaping; anything else is passed
	-- through unchanged.
	if type(repl) == "string" then
		repl = replacement_escape(repl)
	end
	return gsub(str, pattern_escape(pattern), repl, n)
end

--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==]
function export.reverse(str)
	-- First reverse the bytes within each multibyte character, so that the
	-- bytewise reversal of the whole string puts them back in order.
	local flipped = gsub(str, "[\194-\244][\128-\191]*", reverse)
	return reverse(flipped)
end

do
	local floor = math.floor

	-- Raises the out-of-range error; `cp` is the already-formatted codepoint.
	local function err(cp)
		error("Codepoint " .. cp .. " is out of range: codepoints must be between 0x0 and 0x10FFFF.", 2)
	end

	-- Encodes a single codepoint as UTF-8. `cp` may be a number or a numeric
	-- string; any fractional part is discarded.
	local function utf8_char(cp)
		local n = tonumber(cp)
		if not n then
			error("bad argument to \"char\" (number expected, got " .. type(cp) .. ")", 2)
		end
		-- Work with whole numbers throughout, so that string.char and
		-- string.format always receive integral values instead of relying on
		-- implicit truncation.
		n = floor(n)
		if n < 0 then
			err("-0x" .. format("%X", -n))
		elseif n < 0x80 then
			return char(n)
		elseif n < 0x800 then
			return char(
				0xC0 + floor(n / 0x40),
				0x80 + n % 0x40
			)
		elseif n < 0x10000 then
			if n >= 0xD800 and n < 0xE000 then
				return "?" -- mw.ustring.char returns "?" for surrogates.
			end
			return char(
				0xE0 + floor(n / 0x1000),
				0x80 + floor(n / 0x40) % 0x40,
				0x80 + n % 0x40
			)
		elseif n < 0x110000 then
			return char(
				0xF0 + floor(n / 0x40000),
				0x80 + floor(n / 0x1000) % 0x40,
				0x80 + floor(n / 0x40) % 0x40,
				0x80 + n % 0x40
			)
		end
		err("0x" .. format("%X", n))
	end

	--[==[Converts one or more codepoints into a UTF-8 string, in the manner of mw.ustring.char. Surrogate codepoints (U+D800 to U+DFFF) give {{code|lua|"?"}}. An error is thrown if a codepoint is not a number, is negative, or is above 0x10FFFF.]==]
	function export.char(cp, ...)
		-- Fast path for the common single-codepoint call.
		if ... == nil then
			return utf8_char(cp)
		end
		local ret = {cp, ...}
		for i = 1, select("#", cp, ...) do
			ret[i] = utf8_char(ret[i])
		end
		return concat(ret)
	end
	u = export.char
end

do
	-- Decodes the UTF-8 character whose bytes are b1..b4 (as many as are
	-- relevant, judging by b1). Returns the codepoint and the number of bytes
	-- the character occupies. The constants subtracted are the combined
	-- marker bits of the leading and continuation bytes.
	local function get_codepoint(b1, b2, b3, b4)
		if b1 < 128 then
			return b1, 1
		elseif b1 < 224 then
			return 0x40 * b1 + b2 - 0x3080, 2
		elseif b1 < 240 then
			return 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080, 3
		end
		return 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080, 4
	end

	--[==[Returns the codepoints of the characters of `str` from character `i` (default 1) to character `j` (default `i`; -1 means the end of the string), in the manner of mw.ustring.codepoint. Nothing is returned for positions beyond the end of the string. If `str` is a number, string.byte is used on it directly. Other negative indices are handed to mw.ustring.codepoint. No validation is performed on the input.]==]
	function export.codepoint(str, i, j)
		if type(str) == "number" then
			return byte(str, i, j)
		end
		i = i or 1
		if j == -1 then
			-- The byte length is a safe upper bound on the character length.
			j = #str
		elseif j == nil then
			j = i
		end
		if i == 1 and j == 1 then
			-- Fast path for the first character.
			local b1, b2, b3, b4 = byte(str, 1, 4)
			if not b1 then
				return
			end
			return (get_codepoint(b1, b2, b3, b4))
		elseif i < 0 or j < 0 then
			return ucodepoint(str, i, j) -- FIXME
		end
		-- n: index of the current character; nb: its byte offset;
		-- ret/nr: the collected codepoints and their count.
		local n, nb, ret, nr = 0, 1, {}, 0
		while n < j do
			n = n + 1
			if n < i then
				-- Not yet in range: just step over the character.
				local b = byte(str, nb)
				if not b then
					break
				end
				nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
			else
				local b1, b2, b3, b4 = byte(str, nb, nb + 3)
				if not b1 then
					break
				end
				nr = nr + 1
				local add
				ret[nr], add = get_codepoint(b1, b2, b3, b4)
				nb = nb + add
			end
		end
		return unpack(ret, 1, nr)
	end
	codepoint = export.codepoint
	
	--[==[Returns an iterator over the codepoints of the characters of `str` from character `i` (default 1) to character `j` (default, or if -1: the end of the string), in the manner of mw.ustring.gcodepoint. Other negative indices are handed to mw.ustring.gcodepoint. No validation is performed on the input.]==]
	function export.gcodepoint(str, i, j)
		i, j = i or 1, j ~= -1 and j or nil
		if i < 0 or j and j < 0 then
			return ugcodepoint(str, i, j) -- FIXME
		end
		-- Step over the characters before `i`. Afterwards, n is the index of
		-- the next character to return, and nb is its byte offset.
		local n, nb = 1, 1
		while n < i do
			local b = byte(str, nb)
			if not b then
				break
			end
			nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
			n = n + 1
		end
		
		return function()
			if j and n > j then
				return nil
			end
			n = n + 1
			local b1, b2, b3, b4 = byte(str, nb, nb + 3)
			if not b1 then
				return nil
			end
			local ret, add = get_codepoint(b1, b2, b3, b4)
			nb = nb + add
			return ret
		end
	end
end

--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
function export.lower(str)
	-- The string library is only safe if every byte is ASCII.
	if match(str, "^()[^\128-\255]*$") then
		return lower(str)
	end
	return ulower(str)
end

--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
function export.upper(str)
	-- The string library is only safe if every byte is ASCII.
	if match(str, "^()[^\128-\255]*$") then
		return upper(str)
	end
	return uupper(str)
end

do
	-- Appends the captures of the splitting pattern (passed as varargs) to
	-- `text`, whose current length is `n`. Stops at the first nil or false
	-- capture. Returns the new length.
	local function add_captures(text, n, ...)
		local index, capture = 1, ...
		while capture do
			n = n + 1
			text[n] = capture
			index = index + 1
			capture = select(index, ...)
		end
		return n
	end
	
	-- Handles one result of `_find`: adds the next chunk (plus any captures)
	-- to `text`. Returns the new length of `text` and the offset at which the
	-- next search should begin; the offset is omitted once splitting is
	-- complete.
	local function iterate(str, str_len, text, n, start, _sub, loc1, loc2, ...)
		n = n + 1
		if not loc1 or start > str_len then
			-- If no match, or there is but we're past the end of the string
			-- (which happens when the match is the empty string), then add
			-- the final chunk and return.
			text[n] = _sub(str, start)
			return
		end
		if loc2 >= loc1 then
			-- Ordinary match: add the chunk up to it.
			text[n] = _sub(str, start, loc1 - 1)
			start = loc2 + 1
		else
			-- Special case: the match is the empty string, so include the
			-- next character. This avoids an infinite loop, and makes
			-- splitting by an empty string work the way mw.text.split() does
			-- (including non-adjacent empty string matches with %f). With the
			-- string library, the whole UTF-8 character has to be stepped
			-- over.
			if _sub == sub then
				loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
			end
			text[n] = _sub(str, start, loc1)
			start = loc1 + 1
			if start > str_len then
				-- The end of the string has been reached: return without an
				-- offset, so that no final empty string is added.
				if ... then
					return add_captures(text, n, ...)
				end
				return n
			end
		end
		if ... then
			return add_captures(text, n, ...), start
		end
		return n, start
	end
	
	-- Splits `str` using the given `_sub` and `_find` functions, which must
	-- belong to the same library; `str_len` is the length of `str` as
	-- measured by that library.
	local function _split(str, pattern, str_len, _sub, _find, plain)
		local text, n, start = {}, 0, 1
		while start do
			n, start = iterate(str, str_len, text, n, start, _sub, _find(str, pattern, start, plain))
		end
		return text
	end
	
	--[==[Reimplementation of mw.text.split() which also includes any capturing groups of the splitting pattern in the output. It behaves like Python's re.split(), except that an empty split pattern is handled the Lua way (advancing one character at a time, whereas Python returns the whole remainder of the string). The string library is used when possible, and the ustring library otherwise. Two optional parameters are available: `str_lib` forces use of the string library, while `plain` turns pattern matching off, so that `pattern` is treated as literal.]==]
	function export.split(str, pattern, str_lib, plain)
		if not (str_lib or plain) then
			local simple = pattern_simplifier(pattern)
			if not simple then
				return _split(str, pattern, ulen(str), usub, ufind)
			end
			return _split(str, simple, #str, sub, find)
		end
		return _split(str, pattern, #str, sub, find, plain)
	end
	export.capturing_split = export.split -- To be removed.
end

do
	-- TODO: merge this with export.split. Not clear how to do this while
	-- maintaining the same level of performance, as gsplit is slower.
	
	-- Returns an iterator yielding the chunks of `str` one by one, each
	-- followed by the captures of the match which ended it. `_sub` and
	-- `_find` must belong to the same library; `str_len` is the length of
	-- `str` as measured by that library.
	local function _split(str, pattern, str_len, _sub, _find, plain)
		local start, done = 1, false
		
		local function consume(loc1, loc2, ...)
			-- If no match, return the final chunk.
			if not loc1 then
				done = true
				return _sub(str, start)
			end
			-- Ordinary match: eat the chunk up to it.
			if loc2 >= loc1 then
				local chunk = _sub(str, start, loc1 - 1)
				start = loc2 + 1
				return chunk, ...
			end
			-- Special case: the match is the empty string, so eat the next
			-- character. This avoids an infinite loop, and makes splitting by
			-- the empty string work the way mw.text.gsplit() does (including
			-- non-adjacent empty string matches with %f). With the string
			-- library, the whole UTF-8 character has to be stepped over.
			if _sub == sub then
				loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
			end
			local chunk = _sub(str, start, loc1)
			if loc1 < str_len then
				start = loc1 + 1
			else
				-- End of the string reached: stop, so that we don't get stuck
				-- matching the empty string at the end.
				done = true
			end
			return chunk, ...
		end
		
		return function()
			if done then
				return nil
			end
			return consume(_find(str, pattern, start, plain))
		end
	end
	
	--[==[Iterator version of {split}: returns a function which yields one chunk per call, together with any captures from the match of the splitting pattern which ended that chunk. The string library is used when possible, and the ustring library otherwise. `str_lib` forces use of the string library, while `plain` turns pattern matching off, so that `pattern` is treated as literal.]==]
	function export.gsplit(str, pattern, str_lib, plain)
		if not (str_lib or plain) then
			local simple = pattern_simplifier(pattern)
			if not simple then
				return _split(str, pattern, ulen(str), usub, ufind)
			end
			return _split(str, simple, #str, sub, find)
		end
		return _split(str, pattern, #str, sub, find, plain)
	end
end

--[==[Trims `str` at both ends. With no `charset`, whitespace (<code>%s</code>) is trimmed; otherwise `charset` gives the contents of the pattern character set to be trimmed (i.e. what goes between the square brackets). The string library is used unless `charset` contains non-ASCII bytes, in which case mw.ustring.match is used.]==]
function export.trim(str, charset)
	if not charset then
		-- A string consisting only of whitespace trims to nothing.
		if match(str, "^()%s*$") then
			return ""
		end
		return match(str, "^%s*(.*%S)")
	end
	if not match(charset, "^()[^\128-\255]*$") then
		return umatch(str, "^[" .. charset .. "]*(.-)[" .. charset .. "]*$")
	end
	-- A string consisting only of charset characters trims to nothing.
	if match(str, "^()[" .. charset .. "]*$") then
		return ""
	end
	return match(str, "^[" .. charset .. "]*(.*[^" .. charset .. "])")
end

do
	local entities

	-- Converts the digits of a numeric character reference into the
	-- character. Returns nil (leaving the original text in place) if `code`
	-- does not fully match `pattern`, or if the codepoint is out of range.
	local function decode_numeric_entity(code, pattern, base)
		local cp = match(code, pattern) and tonumber(code, base)
		return cp and cp < 0x110000 and u(cp) or nil
	end

	-- Replacement callback for decode_entities. `hash` is "#" for numeric
	-- references, `x` is "x" or "X" if present, and `code` is the rest.
	local function decode_entity(hash, x, code)
		if hash == "#" then
			-- Decimal and hexadecimal references are kept strictly apart: an
			-- invalid decimal reference such as "&#ff;" must not be retried
			-- as hexadecimal.
			if x == "" then
				return decode_numeric_entity(code, "^%d+$")
			end
			return decode_numeric_entity(code, "^%x+$", 16)
		end
		-- Named entity; any "x" or "X" captured separately is part of the
		-- name. The data is only loaded when first needed.
		entities = entities or load_data("Module:data/entities")
		return entities[x .. code]
	end

	--[==[Decodes the HTML entities in `str`: decimal ({{code||&amp;#65;}}) and hexadecimal ({{code||&amp;#x41;}}) character references, as well as the named entities in [[Module:data/entities]]. Anything which is not a valid entity is left unchanged.]==]
	-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].
	function export.decode_entities(str)
		-- Cheap check first, as most strings contain no entities at all.
		return find(str, "&", 1, true) and
			gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
	end
end

do
	local html_entities
	
	-- Returns the entity for the character `ch`: either an existing entry in
	-- `html_entities`, or a decimal character reference, which is then added
	-- to `html_entities` for reuse.
	local function encode_entity(ch)
		local known = html_entities[ch]
		if not known then
			known = "&#" .. codepoint(ch) .. ";"
			html_entities[ch] = known
		end
		return known
	end
	
	--[==[Replaces characters in `str` with HTML entities. With no `charset`, the characters <code>"&'<></code> and the non-breaking space are replaced with named entities (except the apostrophe, which becomes {{code||&amp;#039;}}). Otherwise, `charset` gives the contents of the pattern character set to be replaced (i.e. what goes between the square brackets), and characters with no named entity above become decimal character references. `plain` causes `charset` to be treated as a literal list of characters; `str_lib` forces use of the string library, and throws an error if `charset` contains non-ASCII bytes.]==]
	function export.encode_entities(str, charset, str_lib, plain)
		-- Memoized HTML entities (taken from mw.text.lua).
		if not html_entities then
			html_entities = {
				["\""] = "&quot;",
				["&"] = "&amp;",
				["'"] = "&#039;",
				["<"] = "&lt;",
				[">"] = "&gt;",
				["\194\160"] = "&nbsp;",
			}
		end
		local encoded
		if not charset then
			-- Matches with no table entry are left unchanged by gsub.
			encoded = gsub(str, "[\"&'<>\194]\160?", html_entities)
		elseif plain then
			encoded = gsub(str, "[" .. charset_escape(charset) .. "]", encode_entity)
		elseif str_lib then
			if not match(charset, "^()[^\128-\255]*$") then
				error("Cannot use the string library with a character set that contains a character with a codepoint above U+007F.")
			end
			encoded = gsub(str, "[" .. charset .. "]", encode_entity)
		else
			local pattern = "[" .. charset .. "]"
			local simple = pattern_simplifier(pattern)
			if simple then
				encoded = gsub(str, simple, encode_entity)
			else
				encoded = ugsub(str, pattern, encode_entity)
			end
		end
		return encoded
	end
end

do
	-- Converts a pair of hexadecimal digits into the byte they stand for.
	local function decode_path(code)
		return char(tonumber(code, 16))
	end
	
	-- Replacement callback for the QUERY and WIKI types. `lead` is "%" or the
	-- character standing for a space; `trail` is up to two hexadecimal digits
	-- found after it.
	local function decode(lead, trail)
		if lead ~= "%" then
			-- "+" or "_": becomes a space, and the digits are kept as text.
			return " " .. trail
		end
		if #trail == 2 then
			return decode_path(trail)
		end
		-- Incomplete percent-escape: leave as it is.
		return lead .. trail
	end
	
	--[==[Decodes a percent-encoded string. `enctype` (case-insensitive, default {{code|lua|"QUERY"}}) selects the scheme: {{code|lua|"PATH"}} decodes percent-escapes only; {{code|lua|"QUERY"}} also converts <code>+</code> to a space; {{code|lua|"WIKI"}} also converts <code>_</code> to a space. Throws an error for any other value.]==]
	function export.decode_uri(str, enctype)
		enctype = enctype and upper(enctype) or "QUERY"
		if enctype == "PATH" then
			if not find(str, "%", 1, true) then
				return str
			end
			return (gsub(str, "%%(%x%x)", decode_path))
		end
		local space, pattern
		if enctype == "QUERY" then
			space, pattern = "+", "([%%%+])(%x?%x?)"
		elseif enctype == "WIKI" then
			space, pattern = "_", "([%%_])(%x?%x?)"
		else
			error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)
		end
		-- Cheap checks first, so that gsub is only run if there is something
		-- to decode.
		if find(str, "%", 1, true) or find(str, space, 1, true) then
			return (gsub(str, pattern, decode))
		end
		return str
	end
end

do
	-- Performs a single left-to-right pass over `str`, removing each
	-- "<!-- ... -->" pair. If `pre` is set, an unclosed "<!--" is removed
	-- along with everything after it; otherwise it is left in place.
	local function _remove_comments(str, pre)
		local open = find(str, "<!--", 1, true)
		if not open then
			return str
		end
		local kept, count = {sub(str, 1, open - 1)}, 1
		repeat
			local close = find(str, "-->", open + 4, true)
			if not close then
				-- Unclosed comment.
				if pre then
					return concat(kept)
				end
				return concat(kept) .. sub(str, open)
			end
			local resume = close + 3
			open = find(str, "<!--", resume, true)
			if not open then
				-- No further comments: keep the remainder.
				return concat(kept) .. sub(str, resume)
			end
			count = count + 1
			kept[count] = sub(str, resume, open - 1)
		until false
	end
	
	--[==[Removes any HTML comments from the input text. `stage` can be one of three options:
	* {{lua|"PRE"}} (default) follows the method of MediaWiki's preprocessor: every {{code||<nowiki><!-- ... --></nowiki>}} pair is removed, together with any text after an unclosed {{code||<nowiki><!--</nowiki>}}. This is generally the right choice when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the real preprocessor is considerably more complex, and behaves differently under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
	* {{lua|"POST"}} follows the method used to generate the final page output after all templates have been expanded: {{code||<nowiki><!-- ... --></nowiki>}} pairs are removed repeatedly until none are left (e.g. {{code||<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but an unclosed {{code||<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {{lua|"PRE"}} method will already have been applied by the native parser.
	* {{lua|"BOTH"}} applies {{lua|"PRE"}} then {{lua|"POST"}}.]==]
	function export.remove_comments(str, stage)
		if not stage or stage == "PRE" then
			return _remove_comments(str, true)
		end
		local processed
		if stage == "POST" then
			processed = _remove_comments(str)
		elseif stage == "BOTH" then
			processed = _remove_comments(str, true)
		else
			error("bad argument #2 to \"remove_comments\" (expected PRE, POST, or BOTH)", 2)
		end
		-- Keep making passes until a pass changes nothing.
		while processed ~= str do
			str = processed
			processed = _remove_comments(str)
		end
		return str
	end
end

--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which strips {{code|lua|"\0"}}, {{code|lua|"\t"}}, {{code|lua|"\n"}}, {{code|lua|"\v"}}, {{code|lua|"\r"}} and {{code|lua|" "}} from both ends. The native parser trims template parameters in this way, so this is useful when dealing with them.]==]
function export.php_trim(str)
	-- Runs from the first transition into a non-trimmable character to the
	-- last transition out of one; there is no match if `str` has no such
	-- characters at all.
	local trimmed = match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]")
	return trimmed or ""
end
php_trim = export.php_trim

--[==[Converts a parameter name into its Scribunto-normalized form (i.e. the key under which that parameter would be found in a {{code|lua|frame.args}} table). For example, {{code|lua|"1"}} becomes {{code|lua|1}} (a number), and {{code|lua|" foo "}} becomes {{code|lua|"foo"}}. Inputs which are not strings are returned unchanged.

Strings are first trimmed with {{code|lua|export.php_trim}}, and are then converted to numbers if:
# They are integers, with no decimals (2.0) or leading zeroes (02).
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# For positive values, they do not have a leading {{code|lua|+}} sign.]==]
function export.scribunto_param_key(key)
	if type(key) ~= "string" then
		return key
	end
	key = php_trim(key)
	if key == "0" then
		return 0
	end
	if not match(key, "^-?[1-9]%d*$") then
		return key
	end
	local num = tonumber(key)
	if num <= 9007199254740991 and num >= -9007199254740991 then
		return num
	end
	-- Lua integers are only accurate to 2^53 - 1, so we have to specifically check for 2^53 and -2^53, since 2^53 == 2^53 + 1 evaluates to true.
	if key == "9007199254740992" or key == "-9007199254740992" then
		return num
	end
	return key
end

do
	local byte_escapes
	
	-- Returns the escape for a single byte: the entry in `byte_escapes` if
	-- there is one, and otherwise a backslash plus the byte's value as three
	-- decimal digits.
	local function escape_byte(b)
		local known = byte_escapes[b]
		if known then
			return known
		end
		return format("\\%03d", byte(b))
	end
	
	--[==[Replaces every byte of `str` with the escape given for it in the `byte_escapes` table of [[Module:string utilities/data]], or with a three-digit decimal escape (e.g. <code>\255</code>) if it has no entry there.]==]
	function export.escape_bytes(str)
		-- The data is only loaded when first needed.
		if not byte_escapes then
			byte_escapes = load_data("Module:string utilities/data").byte_escapes
		end
		return (gsub(str, ".", escape_byte))
	end
end

--[==[Replaces every instance of {{code|lua|{param_name}}} in `str` with the result of calling `fun` with {{code|lua|"param_name"}}; `fun` is called exactly once per placeholder. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A name beginning with a backslash can be given by doubling the initial backslash. An error is thrown if `fun` returns a false value, or a value which is not a string, or if an unrecognized escape sequence is found.]==]
function export.format_fun(str, fun)
	-- p1 and p2 are the optional first and second backslash; `name` is
	-- everything after the first backslash (so it includes the second).
	return (gsub(str, "{(\\?)((\\?)[^{}]*)}", function(p1, name, p2)
		-- Exactly one backslash: an escape sequence.
		if #p1 + #p2 == 1 then
			return name == "op" and "{" or
				name == "cl" and "}" or
				error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
		end
		-- Look the name up once only, so that the value which is checked is
		-- the value which is returned.
		local value = fun(name)
		if not value then
			error(module_name .. ".format: \"" .. name .. "\" not found in table")
		elseif type(value) ~= "string" then
			error(module_name .. ".format: \"" .. name .. "\" is a " .. type(value) .. ", not a string")
		end
		return value
	end))
end
format_fun = export.format_fun

--[==[Unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, this function takes just two parameters, a format string and a table, and replaces every instance of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
====Examples====
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
	-- Adapt the table into the lookup function taken by format_fun.
	local function lookup(key)
		return tbl[key]
	end
	return format_fun(str, lookup)
end

do
	-- Re-cases the first UTF-8 character of `str` with `case_func`.
	local function do_uclcfirst(str, case_func)
		local first_letter = match(str, "^.[\128-\191]*")
		if not first_letter then
			-- Only possible if `str` is empty.
			return ""
		end
		-- The remainder has to be sliced using the byte length of the
		-- original letter, as the re-cased letter may differ in length.
		return case_func(first_letter) .. sub(str, #first_letter + 1)
	end
	
	local function uclcfirst(str, case_func)
		-- If there's a link at the beginning, re-case the first letter of the
		-- link text. This pattern matches both piped and unpiped links.
		-- If the link is not piped, the second capture (linktext) will be empty.
		local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
		if link then
			return "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
		end
		return do_uclcfirst(str, case_func)
	end
	
	--[==[Uppercases the first character of `str`. If `str` begins with a link, the first character of the link's display text is uppercased instead (which converts an unpiped link into a piped one).]==]
	function export.ucfirst(str)
		return uclcfirst(str, uupper)
	end

	--[==[Lowercases the first character of `str`. If `str` begins with a link, the first character of the link's display text is lowercased instead (which converts an unpiped link into a piped one).]==]
	function export.lcfirst(str)
		return uclcfirst(str, ulower)
	end
	
	local function capitalize(w)
		return uclcfirst(w, uupper)
	end
	
	--[==[Capitalize each word of a string. WARNING: May be broken in the presence of multiword links.]==]
	function export.capitalize(str)
		if type(str) == "table" then
			-- allow calling from a template
			str = str.args[1]
		end
		-- Capitalize multi-word that is separated by spaces
		-- by uppercasing the first letter of each part.
		-- I assume nobody will input all CAP text.
		return (ugsub(str, "%S+", capitalize))
	end
end

do
	local function word_ends_in_consonant_plus_y(str)
		-- FIXME, a subrule of rule #1 below says the -ies ending doesn't
		-- apply to proper nouns, hence "the Gettys", "the public Ivys".
		-- We should maybe consider applying this rule here; but it may not
		-- be important as this function is almost always called on common nouns
		-- (e.g. parts of speech, place types).
		return find(str, "[^aeiouyAEIOUY ]y$")
	end
	
	local function word_takes_es_plural(str)
		return find(str, "[sxz]$") or find(str, "[csz]h$")
	end
	
	-- Pluralizes plain text (i.e. with no link handling).
	local function do_pluralize(str)
		if word_ends_in_consonant_plus_y(str) then
			-- Assign to a local, to avoid returning multiple values.
			local plural = gsub(str, "y$", "ies")
			return plural
		end
		return str .. (word_takes_es_plural(str) and "es" or "s")
	end
	
	--[==[
	Pluralize a word in a smart fashion, according to normal English rules.
	# If the word ends in consonant + -y, replace the -y with -ies.
	# If the word ends in -s, -x, -z, -ch, -sh, -zh, add -es.
	# Otherwise, add -s.

	Links at the end of the input are handled as follows:
	# If a piped link, change the second part appropriately.
	# If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
	# If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.
	]==]
	function export.pluralize(str)
		if type(str) == "table" then
			-- allow calling from a template
			str = str.args[1]
		end
		-- Check for a link. This pattern matches both piped and unpiped links.
		-- If the link is not piped, the second capture (linktext) will be empty.
		local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
		if not link then
			return do_pluralize(str)
		end
		local opening = beginning .. "[[" .. link
		if linktext ~= "" then
			return opening .. "|" .. do_pluralize(linktext) .. "]]"
		end
		if word_ends_in_consonant_plus_y(link) then
			return opening .. "|" .. gsub(link, "y$", "ies") .. "]]"
		end
		return opening .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
	end
end

do
	-- Singularizes plain text. Stray "]" characters are allowed before a
	-- final -es, to cope with input like "[[parish]]es".
	local function do_singularize(str)
		local stem = match(str, "^(.-)ies$")
		if stem then
			return stem .. "y"
		end
		-- Handle cases like "[[parish]]es"
		stem = match(str, "^(.-[cs]h%]*)es$") -- not -zhes
		if stem then
			return stem
		end
		-- Handle cases like "[[box]]es"
		stem = match(str, "^(.-x%]*)es$") -- not -ses or -zes
		if stem then
			return stem
		end
		-- Handle regular plurals; otherwise, return the input.
		stem = match(str, "^(.-)s$")
		return stem or str
	end
	
	-- Builds a link, using the unpiped form if target and text are the same.
	local function collapse_link(link, linktext)
		if link ~= linktext then
			return "[[" .. link .. "|" .. linktext .. "]]"
		end
		return "[[" .. link .. "]]"
	end
	
	--[==[
	Singularize a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}.

	'''NOTE''': This doesn't always work as well as {pluralize()}. Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
	# If the word ends in -ies, replace -ies with -y.
	# If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".]
	# Otherwise, remove -s.

	Links at the end of the input are handled as follows:
	# If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up the same.
	# If a non-piped link, singularize the link.
	# A link like "[[parish]]es" will be handled correctly because the code that checks for -shes etc. allows ] characters between the
	  'sh' etc. and final -es.
	]==]
	function export.singularize(str)
		if type(str) == "table" then
			-- allow calling from a template
			str = str.args[1]
		end
		-- Check for a link. This pattern matches both piped and unpiped links.
		-- If the link is not piped, the second capture (linktext) will be empty.
		local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
		if not link then
			return do_singularize(str)
		end
		if linktext == "" then
			return beginning .. "[[" .. do_singularize(link) .. "]]"
		end
		return beginning .. collapse_link(link, do_singularize(linktext))
	end
end

--[==[
Return the indefinite article ("a" or "an") appropriate for prefixing to `str`, which is "an" if the first letter is one
of a, e, i, o or u (in either case). Correctly handles links and capitalized text. Does not correctly handle words like
[[union]], [[uniform]] and [[university]] that take "a" despite beginning with a 'u'. The returned article will have
its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.get_indefinite_article(str, ucfirst)
	str = str or ""
	-- If there's a link at the beginning, examine the first letter of the
	-- link text. This pattern matches both piped and unpiped links.
	-- If the link is not piped, the second capture (linktext) will be empty.
	local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
	local word = str
	if link then
		word = linktext ~= "" and linktext or link
	end
	if find(word, "^[AEIOUaeiou]") then
		return ucfirst and "An" or "an"
	end
	return ucfirst and "A" or "a"
end
get_indefinite_article = export.get_indefinite_article

--[==[
Prefix `text` with the appropriate indefinite article, followed by a space. Correctly handles links and capitalized
text. Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning
with a 'u'. The article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.add_indefinite_article(text, ucfirst)
	local article = get_indefinite_article(text, ucfirst)
	return article .. " " .. text
end

return export