-- Module:string/isutf8
-- Documentation for this module may be created at Module:string/isutf8/doc
local byte = string.byte
local match = string.match
--[==[Returns {true} if `str` is a valid UTF-8 string, and {false} otherwise. A string is valid if, for each character, all of the following are true:
* It has the expected number of bytes, which is determined by the value of the leading byte: 1-byte characters are `0x00` to `0x7F`, 2-byte characters start with `0xC2` to `0xDF`, 3-byte characters start with `0xE0` to `0xEF`, and 4-byte characters start with `0xF0` to `0xF4`.
* The leading byte must not fall outside of the above ranges.
* The trailing byte(s) (if any) must all be between `0x80` and `0xBF`.
* The character's codepoint must be between U+0000 (`0x00`) and U+10FFFF (`0xF4 0x8F 0xBF 0xBF`).
* The character cannot have an overlong encoding: for each byte length, the lowest theoretical encoding is equivalent to U+0000 (e.g. `0xE0 0x80 0x80`, the lowest theoretical 3-byte encoding, is exactly equivalent to U+0000). Encodings that use more than the minimum number of bytes are not considered valid, meaning that the first valid 3-byte character is `0xE0 0xA0 0x80` (U+0800), and the first valid 4-byte character is `0xF0 0x90 0x80 0x80` (U+10000). Formally, 2-byte characters have leading bytes ranging from `0xC0` to `0xDF` (rather than `0xC2` to `0xDF`), but `0xC0 0x80` to `0xC1 0xBF` are overlong encodings, so it is simpler to say that the 2-byte range begins at `0xC2`.
If `allow_surrogates` is set, surrogates (U+D800 to U+DFFF) will be treated as valid UTF-8. Surrogates are used in UTF-16, which encodes codepoints U+0000 to U+FFFF with 2 bytes, and codepoints from U+10000 upwards using a pair of surrogates, which are taken together as a 4-byte unit. Since surrogates have no use in UTF-8, as it encodes higher codepoints in a different way, they are not considered valid in UTF-8 text. However, there are limited circumstances where they may be necessary: for instance, JSON escapes characters using the format `\u0000`, which must contain exactly 4 hexadecimal digits; under the scheme, codepoints above U+FFFF must be escaped as the equivalent pair of surrogates, even though the text itself must be encoded in UTF-8 (e.g. U+10000 becomes `\uD800\uDC00`).]==]
return function(str, allow_surrogates)
	local loc, str_len = 1, #str
	while true do
		-- Skipping ASCII bytes with [^\128-\255]* is much faster than searching
		-- for [\128-\255]. The position capture gives the index of the first
		-- non-ASCII byte at or after `loc` (or #str + 1 if there is none).
		loc = match(str, "^[^\128-\255]*()", loc)
		if loc > str_len then
			return true
		end
		-- Grab 5 bytes (i.e. at least one extra), to determine when the loop
		-- should break. Bytes past the end of the string come back as nil.
		local b1, b2, b3, b4, b5 = byte(str, loc, loc + 4)
		-- Inner loop: consume consecutive multi-byte characters without going
		-- back through `match`. On every iteration b1 is a non-ASCII byte at
		-- position `loc`, and b2..b5 are the bytes that follow it.
		while true do
			-- 1 byte can't be valid, as the 1-byte characters \x00-\x7F are
			-- ignored. The leading bytes for 2-byte encodings are formally
			-- [\xC0-\xDF], but [\xC0\xC1] always form overlong encodings.
			-- Every remaining leading byte needs at least one trailing byte,
			-- so b2 must exist and be in [\x80-\xBF] whatever the length is.
			if b1 < 0xC2 or not b2 or b2 < 0x80 or b2 > 0xBF then
				return false
			-- 2 bytes: [\xC2-\xDF]...
			elseif b1 < 0xE0 then
				loc = loc + 2
				if not b3 then
					-- End of string.
					return true
				elseif b3 < 0x80 then
					-- Next byte is ASCII: go back to the fast ASCII skip.
					break
				end
				-- Shift the window by 2 and top it up with 2 new bytes.
				b1, b2, b3, b4, b5 = b3, b4, b5, byte(str, loc + 3, loc + 4)
			-- Trailing byte: [\x80-\xBF].
			elseif not b3 or b3 < 0x80 or b3 > 0xBF then
				return false
			-- 3 bytes: [\xE0-\xEF]...
			elseif b1 < 0xF0 then
				-- If b2 is [\x80-\x9F], exclude \xE0[\x80-\x9F]..., which are
				-- overlong encodings.
				if b2 < 0xA0 then
					if b1 == 0xE0 then
						return false
					end
				-- The remaining b2 values [\xA0-\xBF] can form the surrogates
				-- \xED[\xA0-\xBF]...
				elseif b1 == 0xED and not allow_surrogates then
					return false
				end
				loc = loc + 3
				if not b4 then
					return true
				elseif b4 < 0x80 then
					break
				end
				-- Shift the window by 3 and top it up with 3 new bytes.
				b1, b2, b3, b4, b5 = b4, b5, byte(str, loc + 2, loc + 4)
			-- Trailing byte: [\x80-\xBF].
			elseif not b4 or b4 < 0x80 or b4 > 0xBF then
				return false
			-- 4 bytes: [\xF0-\xF4]...
			else
				-- If b2 is [\x80-\x8F], exclude \xF0[\x80-\x8F]..., which are
				-- overlong encodings, but allow \xF4 as a leading byte, since
				-- \xF4\x8F\xBF\xBF is the highest valid codepoint (U+10FFFF).
				-- Leading bytes above \xF4 are never valid.
				if b2 < 0x90 then
					if b1 == 0xF0 or b1 > 0xF4 then
						return false
					end
				-- If b2 is [\x90-\xBF], \xF4 would exceed U+10FFFF, so the
				-- leading byte can be at most \xF3.
				elseif b1 > 0xF3 then
					return false
				end
				loc = loc + 4
				if not b5 then
					return true
				elseif b5 < 0x80 then
					break
				end
				-- Shift the window by 4 and top it up with 4 new bytes.
				b1, b2, b3, b4, b5 = b5, byte(str, loc + 1, loc + 4)
			end
		end
	end
end