45,646
edits
No edit summary |
No edit summary |
||
Line 60: | Line 60: | ||
(indicating a missing code). If `not_real_lang` is given, this check is suppressed. | (indicating a missing code). If `not_real_lang` is given, this check is suppressed. | ||
]=] | ]=] | ||
-- Temporarily replace various formatting characters with PUA placeholders, so that the substitution process cannot disrupt them.
-- `text` is the string to process; `subbedChars` is the list that receives each replaced substring (it is appended to in place, continuing from its current length).
-- If `keepCarets` is set, carets (and any backslashes immediately preceding them) are also substituted. Unless `noTrim` is set, leading and trailing whitespace is substituted as well.
-- Returns the processed text followed by the same `subbedChars` table.
local function doTempSubstitutions(text, subbedChars, keepCarets, noTrim)
	-- Work on a clone, so that the extra patterns added below never end up in the table held in package.loaded. For some reason, using require seems to keep memory use down; probably because the table is always cloned.
	local patterns = shallowcopy(require("Module:languages/data/patterns"))
	if keepCarets then
		insert(patterns, "((\\+)%^)")
		insert(patterns, "((%^))")
	end
	-- Whitespace at the very start and end is temp substituted so that it cannot be trimmed by accident. Only spaces added during the substitution process (e.g. by a module) should be trimmed, so callers are expected to request this on the first round of temp substitutions only.
	if not noTrim then
		insert(patterns, "^([\128-\191\244]*(%s+))")
		insert(patterns, "((%s+)[\128-\191\244]*)$")
	end
	-- Pre-substitute "[[" and "]]" with single control bytes, which makes the pattern matching below more accurate.
	text = gsub(text, "%f[%[]%[%[", "\1")
	text = text:gsub("%f[%]]%]%]", "\2")
	-- New entries are numbered onwards from the current end of subbedChars.
	local offset = #subbedChars
	local escape = require("Module:string utilities").pattern_escape
	for _, rawPattern in ipairs(patterns) do
		-- A trailing \0 on a pattern is a flag, not part of the pattern: it marks things like "[[" or "]]", whose placeholders should be treated as breaks between terms by modules that scrape info from pages. Strip the flag and remember whether it was present.
		local isTermDivider = false
		local pattern = gsub(rawPattern, "%z$", function(marker)
			isTermDivider = marker == "\0"
			return ""
		end)
		-- Term dividers use a lower range for the second placeholder byte than everything else.
		local secondByteBase = isTermDivider and 128 or 136
		text = gsub(text, pattern, function(...)
			-- The first capture is the whole span being rewritten; every further capture is a piece of it that gets stored and swapped for a placeholder.
			local captures = {...}
			local rewritten = captures[1]
			for idx = 2, #captures do
				local slot = offset + idx - 1
				local piece = captures[idx]
				subbedChars[slot] = piece
				-- The slot number is spread over the three bytes that follow \244, six bits per byte.
				local placeholder = "\244"
					.. char(floor(slot / 4096) % 64 + secondByteBase)
					.. char(floor(slot / 64) % 64 + 128)
					.. char(slot % 64 + 128)
				-- Only the first occurrence of the piece is replaced.
				rewritten = gsub(rewritten, escape(piece), placeholder, 1)
			end
			offset = offset + #captures - 1
			return rewritten
		end)
	end
	-- Undo the pre-substitution of "[[" and "]]".
	text = gsub(text, "\1", "%[%[")
	text = text:gsub("\2", "%]%]")
	return text, subbedChars
end
-- Split the text into sections, based on the presence of temporarily substituted formatting characters, then iterate over each one to apply substitutions. This avoids putting PUA characters through language-specific modules, which may be unequipped for them. | -- Split the text into sections, based on the presence of temporarily substituted formatting characters, then iterate over each one to apply substitutions. This avoids putting PUA characters through language-specific modules, which may be unequipped for them. | ||
Line 127: | Line 171: | ||
return textNoEnc | return textNoEnc | ||
end | end | ||
end | end | ||