<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://linguifex.com/w/index.php?action=history&amp;feed=atom&amp;title=Module%3Astring%2Fcompare</id>
	<title>Module:string/compare - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://linguifex.com/w/index.php?action=history&amp;feed=atom&amp;title=Module%3Astring%2Fcompare"/>
	<link rel="alternate" type="text/html" href="https://linguifex.com/w/index.php?title=Module:string/compare&amp;action=history"/>
	<updated>2026-04-15T16:55:13Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.43.6</generator>
	<entry>
		<id>https://linguifex.com/w/index.php?title=Module:string/compare&amp;diff=493975&amp;oldid=prev</id>
		<title>Sware: Created page with &quot;local byte = string.byte local match = string.match local sub = string.sub  --[==[ A comparison function for strings, which returns {true} if {a} sorts before {b}, or otherwise {false}; it can be used as the sort function with {table.sort}.  This function always sorts using byte-order, which makes it roughly equivalent to the {&lt;} operator, but with fixes for two serious bugs raised in phab:T193096#4161287 and phab:T49137#9167559: * {&lt;} is supposed to compare UTF-...&quot;</title>
		<link rel="alternate" type="text/html" href="https://linguifex.com/w/index.php?title=Module:string/compare&amp;diff=493975&amp;oldid=prev"/>
		<updated>2026-04-14T20:29:18Z</updated>

		<summary type="html">&lt;p&gt;Created page with &amp;quot;local byte = string.byte local match = string.match local sub = string.sub  --[==[ A comparison function for strings, which returns {true} if {a} sorts before {b}, or otherwise {false}; it can be used as the sort function with {table.sort}.  This function always sorts using byte-order, which makes it roughly equivalent to the {&amp;lt;} operator, but with fixes for two serious bugs raised in &lt;a href=&quot;/w/index.php?title=Phab:T193096&amp;amp;action=edit&amp;amp;redlink=1&quot; class=&quot;new&quot; title=&quot;Phab:T193096 (page does not exist)&quot;&gt;phab:T193096#4161287&lt;/a&gt; and &lt;a href=&quot;/w/index.php?title=Phab:T49137&amp;amp;action=edit&amp;amp;redlink=1&quot; class=&quot;new&quot; title=&quot;Phab:T49137 (page does not exist)&quot;&gt;phab:T49137#9167559&lt;/a&gt;: * {&amp;lt;} is supposed to compare UTF-...&amp;quot;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;local byte = string.byte&lt;br /&gt;
local match = string.match&lt;br /&gt;
local sub = string.sub&lt;br /&gt;
&lt;br /&gt;
--[==[&lt;br /&gt;
A comparison function for strings, which returns {true} if {a} sorts before {b}, or otherwise {false}; it can be used as the sort function with {table.sort}.&lt;br /&gt;
&lt;br /&gt;
This function always sorts using byte-order, which makes it roughly equivalent to the {&amp;lt;} operator, but with fixes for two serious bugs raised in [[phab:T193096#4161287]] and [[phab:T49137#9167559]]:&lt;br /&gt;
* {&amp;lt;} is supposed to compare UTF-8 codepoints in the two strings, but when a codepoint that is U+10000 or above is encountered in the left-hand string, {&amp;lt;} always returns {false}, irrespective of the content of the other string.&lt;br /&gt;
* {&amp;lt;} treats unassigned codepoints and non-UTF-8 byte sequences as being higher than {&amp;quot;\0&amp;quot;} but lower than {&amp;quot;\1&amp;quot;}, instead of sorting according to byte order.]==]&lt;br /&gt;
return function(a, b)&lt;br /&gt;
	-- Equality check.&lt;br /&gt;
	if a == b then&lt;br /&gt;
		return false&lt;br /&gt;
	end&lt;br /&gt;
	-- Byte comparison is slow, so only do it when it&amp;#039;s really needed:&lt;br /&gt;
	-- iterate over both strings, grabbing a set of ASCII bytes followed by&lt;br /&gt;
	-- a set of non-ASCII bytes from each (either of which could be empty),&lt;br /&gt;
	-- and compare them with ==. If the ASCII substrings are unequal, just&lt;br /&gt;
	-- use &amp;lt;, since the bug won&amp;#039;t affect it. Otherwise, compare bytes in the&lt;br /&gt;
	-- non-ASCII substrings.&lt;br /&gt;
	local loc, ascii_a, nonascii_a, ascii_b, nonascii_b = 1&lt;br /&gt;
	repeat&lt;br /&gt;
		ascii_a, nonascii_a = match(a, &amp;quot;^([^\128-\255]*)([\128-\255]*)&amp;quot;, loc)&lt;br /&gt;
		ascii_b, nonascii_b, loc = match(b, &amp;quot;^([^\128-\255]*)([\128-\255]*)()&amp;quot;, loc) -- update `loc` on the second call&lt;br /&gt;
		-- When comparing ASCII sets, use &amp;lt;. The lower substring will be&lt;br /&gt;
		-- from the lower string *except* when it comprises the start of the&lt;br /&gt;
		-- other substring and is followed by a non-ASCII character. For&lt;br /&gt;
		-- instance, if `ascii_a` is &amp;quot;pqrs&amp;quot;:&lt;br /&gt;
		-- If `ascii_b` is &amp;quot;abc&amp;quot;, `b` is lower, since &amp;quot;abc&amp;quot; &amp;lt; &amp;quot;pqrs&amp;quot;.&lt;br /&gt;
		-- If `ascii_b` is &amp;quot;pqr&amp;quot; and followed by non-ASCII &amp;quot;ž&amp;quot;, `a` is&lt;br /&gt;
		-- lower, since &amp;quot;pqrs&amp;quot; &amp;lt; &amp;quot;pqrž&amp;quot;.&lt;br /&gt;
		-- If `ascii_b` is &amp;quot;pqr&amp;quot; and at the end of `b`, `b` is lower, since&lt;br /&gt;
		-- &amp;quot;pqr&amp;quot; &amp;lt; &amp;quot;pqrs&amp;quot;.&lt;br /&gt;
		if ascii_a ~= ascii_b then&lt;br /&gt;
			if ascii_a &amp;lt; ascii_b then&lt;br /&gt;
				return nonascii_a == &amp;quot;&amp;quot; or ascii_a ~= sub(ascii_b, 1, #ascii_a)&lt;br /&gt;
			end&lt;br /&gt;
			return not (nonascii_b == &amp;quot;&amp;quot; or ascii_b ~= sub(ascii_a, 1, #ascii_b))&lt;br /&gt;
		end&lt;br /&gt;
	-- If the non-ASCII parts are not equal, terminate the loop.&lt;br /&gt;
	until nonascii_a ~= nonascii_b&lt;br /&gt;
	-- If either one is the empty string, then the end of that string has&lt;br /&gt;
	-- been reached, making it the lower string.&lt;br /&gt;
	if nonascii_a == &amp;quot;&amp;quot; then&lt;br /&gt;
		return true&lt;br /&gt;
	elseif nonascii_b == &amp;quot;&amp;quot; then&lt;br /&gt;
		return false&lt;br /&gt;
	end&lt;br /&gt;
	loc = 1&lt;br /&gt;
	while true do&lt;br /&gt;
		-- 4 bytes at a time is a balance between minimizing the number of&lt;br /&gt;
		-- byte() calls without grabbing unnecessary extra bytes after the&lt;br /&gt;
		-- difference.&lt;br /&gt;
		local b_a1, b_a2, b_a3, b_a4 = byte(nonascii_a, loc, loc + 3)&lt;br /&gt;
		if b_a1 == nil then&lt;br /&gt;
			return true&lt;br /&gt;
		end&lt;br /&gt;
		local b_b1, b_b2, b_b3, b_b4 = byte(nonascii_b, loc, loc + 3)&lt;br /&gt;
		if b_a1 ~= b_b1 then&lt;br /&gt;
			return b_b1 and b_a1 &amp;lt; b_b1&lt;br /&gt;
		elseif b_a2 ~= b_b2 then&lt;br /&gt;
			return b_a2 == nil or b_b2 and b_a2 &amp;lt; b_b2&lt;br /&gt;
		elseif b_a3 ~= b_b3 then&lt;br /&gt;
			return b_a3 == nil or b_b3 and b_a3 &amp;lt; b_b3&lt;br /&gt;
		elseif b_a4 ~= b_b4 then&lt;br /&gt;
			return b_a4 == nil or b_b4 and b_a4 &amp;lt; b_b4&lt;br /&gt;
		end&lt;br /&gt;
		loc = loc + 4&lt;br /&gt;
	end&lt;br /&gt;
end&lt;/div&gt;</summary>
		<author><name>Sware</name></author>
	</entry>
</feed>