Module:string utilities: Difference between revisions

no edit summary
No edit summary
No edit summary
Line 55: Line 55:
local php_trim
local php_trim
local replacement_escape
local replacement_escape
local trim
local u
local u
local ulen
local ulen
Line 81: Line 82:
end
end
return pattern, ulen(str), ustring, callable
return pattern, ulen(str), ustring, callable
end
--[==[Returns {nil} if the input value is the empty string, or otherwise the same value.
If the input is a string and `do_trim` is set, the input value will be trimmed before returning; if the trimmed value is the empty string, returns {nil}.
If `quote_delimiters` is set, then any outer pair of quotation marks ({' '} or {" "}) surrounding the rest of the input string will be stripped, if present. The string will not be trimmed again, converted to {nil}, or have further quotation marks stripped, as it exists as a way to embed spaces or the empty string in an input. Genuine quotation marks may also be embedded this way (e.g. {"''foo''"} returns {"'foo'"}).]==]
function export.is_not_empty(str, do_trim, quote_delimiters)
if str == "" then
return nil
elseif not (str and type(str) == "string") then
return str
elseif do_trim then
str = trim(str)
if str == "" then
return nil
end
end
return quote_delimiters and gsub(str, "^(['\"])(.*)%1$", "%2") or str
end
end


Line 105: Line 125:
}, charset_chars)
}, charset_chars)


--[==[Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>, and converts the null character to <code>%z</code>. For example, {{lua|"^$()%.[]*+-?\0"}} becomes {{lua|"%^%$%(%)%%%.%[%]%*%+%-%?%z"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]
--[==[Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's version of regular expressions): {$%()*+-.?[]^}, and converts the null character to {%z}. For example, {"^$()%.[]*+-?\0"} becomes {"%^%$%(%)%%%.%[%]%*%+%-%?%z"}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]
function export.pattern_escape(str)
function export.pattern_escape(str)
return (gsub(str, "[%z$%%()*+%-.?[%]^]", chars))
return (gsub(str, "[%z$%%()*+%-.?[%]^]", chars))
Line 111: Line 131:
pattern_escape = export.pattern_escape
pattern_escape = export.pattern_escape


--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>, and converts the null character to <code>%z</code>.]==]
--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: {%-]^}, and converts the null character to {%z}.]==]
function export.charset_escape(str)
function export.charset_escape(str)
return (gsub(str, "[%z%%%-%]^]", charset_chars))
return (gsub(str, "[%z%%%-%]^]", charset_chars))
Line 117: Line 137:
charset_escape = export.charset_escape
charset_escape = export.charset_escape


--[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]
--[==[Escapes only {%}, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]
function export.replacement_escape(str)
function export.replacement_escape(str)
return (gsub(str, "%%", "%%%%"))
return (gsub(str, "%%", "%%%%"))
Line 505: Line 525:
end
end


--[==[Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring library pattern (e.g. {{lua|"abcd-g"}} becomes {{lua|"[abcd-g]"}}, and {{lua|"[]"}} becomes {{lua|"[[%]]"}}).
--[==[Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring library pattern (e.g. {"abcd-g"} becomes {"[abcd-g]"}, and {"[]"} becomes {"[[%]]"}).


The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used (e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary characters.]==]
The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used (e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary characters.]==]
Line 926: Line 946:
return match(str, pattern)
return match(str, pattern)
end
end
trim = export.trim


do
do
Line 1,048: Line 1,069:
--[==[Removes any HTML comments from the input text. `stage` can be one of three options:
--[==[Removes any HTML comments from the input text. `stage` can be one of three options:
* {{lua|"PRE"}} (default) applies the method used by MediaWiki's preprocessor: all {{code||<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed {{code||<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
* {"PRE"} (default) applies the method used by MediaWiki's preprocessor: all {{code|html|<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed {{code|html|<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
* {{lua|"POST"}} applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any {{code||<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. {{code||<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed {{code||<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {{lua|"PRE"}} method will have already been applied by the native parser.
* {"POST"} applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any {{code|html|<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. {{code|html|<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed {{code|html|<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {"PRE"} method will have already been applied by the native parser.
* {{lua|"BOTH"}} applies {{lua|"PRE"}} then {{lua|"POST"}}.]==]
* {"BOTH"} applies {"PRE"} then {"POST"}.]==]
function export.remove_comments(str, stage)
function export.remove_comments(str, stage)
if not stage or stage == "PRE" then
if not stage or stage == "PRE" then
Line 1,066: Line 1,087:
end
end


--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{lua|"\0"}}, {{lua|"\t"}}, {{lua|"\n"}}, {{lua|"\v"}}, {{lua|"\r"}} and {{lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {"\0"}, {"\t"}, {"\n"}, {"\v"}, {"\r"} and {" "}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
function export.php_trim(str)
function export.php_trim(str)
-- A frontier pattern with a greedy quantifier is faster than the algorithms used by export.trim, but can be only be used if the character set includes \0, since %z matches the start/end of the string, as well as \0. This is also immune to catastrophic backtracking.
-- A frontier pattern with a greedy quantifier is faster than the algorithms used by export.trim, but can be only be used if the character set includes \0, since %z matches the start/end of the string, as well as \0. This is also immune to catastrophic backtracking.
Line 1,073: Line 1,094:
php_trim = export.php_trim
php_trim = export.php_trim


--[==[Takes a parameter name as either a string or number, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{lua|frame.args}} table). For example, {{lua|"1"}} (a string) is normalized to {{lua|1}} (a number), {{lua|" foo "}} is normalized to {{lua|"foo"}}, and {{lua|1.5}} (a number) is normalized to {{lua|"1.5"}} (a string). Inputs which cannot be normalized (e.g. booleans) return {{lua|nil}}. If the `no_trim` flag is set, string parameters are not trimmed, but strings may still be converted to numbers if they do not contain whitespace; this is necessary when normalizing keys into the form received by PHP during callbacks, before any trimming occurs (e.g. in the table of arguments when calling {{lua|frame:expandTemplates()}}).
--[==[Takes a parameter name as either a string or number, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {frame.args} table). For example, {"1"} (a string) is normalized to {1} (a number), {" foo "} is normalized to {"foo"}, and {1.5} (a number) is normalized to {"1.5"} (a string). Inputs which cannot be normalized (e.g. booleans) return {nil}. If the `no_trim` flag is set, string parameters are not trimmed, but strings may still be converted to numbers if they do not contain whitespace; this is necessary when normalizing keys into the form received by PHP during callbacks, before any trimming occurs (e.g. in the table of arguments when calling {frame:expandTemplates()}).


Strings are trimmed with {{lua|export.php_trim}}, unless the `no_trim` flag is set. They are then converted to numbers if '''all''' of the following are true:
Strings are trimmed with {export.php_trim}, unless the `no_trim` flag is set. They are then converted to numbers if '''all''' of the following are true:
# They are integers; i.e. no decimals or leading zeroes (e.g. {{lua|"2"}}, but not {{lua|"2.0"}} or {{lua|"02"}}).
# They are integers; i.e. no decimals or leading zeroes (e.g. {"2"}, but not {"2.0"} or {"02"}).
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# There is no leading sign unless < 0 (e.g. {{lua|"2"}} or {{lua|"-2"}}, but not {{lua|"+2"}} or {{lua|"-0"}}).
# There is no leading sign unless < 0 (e.g. {"2"} or {"-2"}, but not {"+2"} or {"-0"}).
# They contain no leading or trailing whitespace (which may be present when the `no_trim` flag is set).
# They contain no leading or trailing whitespace (which may be present when the `no_trim` flag is set).


Numbers are converted to strings if '''either''':
Numbers are converted to strings if '''either''':
# They are not integers (e.g. {{lua|1.5}}).
# They are not integers (e.g. {1.5}).
# They are > 2{{sup|53}} or < -2{{sup|53}}.
# They are > 2{{sup|53}} or < -2{{sup|53}}.


When converted to strings, integers ≤ 2{{sup|63}} and ≥ -2{{sup|63}} are formatted as integers (i.e. all digits are given), which is the range of PHP's integer precision, though the actual output may be imprecise since Lua's integer precision is > 2{{sup|53}} to < -2{{sup|53}}. All other numbers use the standard formatting output by {{lua|tostring()}}.]==]
When converted to strings, integers ≤ 2{{sup|63}} and ≥ -2{{sup|63}} are formatted as integers (i.e. all digits are given), which is the range of PHP's integer precision, though the actual output may be imprecise since Lua's integer precision is > 2{{sup|53}} to < -2{{sup|53}}. All other numbers use the standard formatting output by {tostring()}.]==]
function export.scribunto_param_key(key, no_trim)
function export.scribunto_param_key(key, no_trim)
local tp = type(key)
local tp = type(key)
Line 1,143: Line 1,164:
format_fun = export.format_fun
format_fun = export.format_fun


--[==[This function, unlike {{lua|string.format}} and {{lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{lua|{param_name}}} in the format string with the table's entry for {{lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
--[==[This function, unlike {string.format} and {mw.ustring.format}, takes just two parameters—a format string and a table—and replaces all instances of { {param_name} } in the format string with the table's entry for {param_name}. The opening and closing brace characters can be escaped with { {\op} } and { {\cl} }, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
 
====Examples====
====Examples====
* {{lua|=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
* {string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"}) }
*: produces: {{lua|"one fish, two fish, red fish, blue fish"}}
*: produces: {"one fish, two fish, red fish, blue fish"}
* {{lua|=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
* {string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}
*: produces: {{lua|"The set {1, 2, 3} contains three elements."}}
*: produces: {"The set {1, 2, 3} contains three elements."}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
function export.format(str, tbl)