48,355
edits
No edit summary |
No edit summary |
||
| Line 37: | Line 37: | ||
local umatch = ustring.match | local umatch = ustring.match | ||
local uupper = m_str_utils.upper | local uupper = m_str_utils.upper | ||
-- Loaded as needed by findBestScript. | |||
local Hans_chars | |||
local Hant_chars | |||
local export = {} | local export = {} | ||
| Line 60: | Line 64: | ||
(indicating a missing code). If `not_real_lang` is given, this check is suppressed. | (indicating a missing code). If `not_real_lang` is given, this check is suppressed. | ||
]=] | ]=] | ||
local function get_script(...) | |||
get_script = require("Module:scripts").getByCode | |||
return get_script(...) | |||
end | |||
-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.). | -- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.). | ||
| Line 556: | Line 565: | ||
--[==[Given some text, this function iterates through the scripts of a given language and tries to find the script that best matches the text. It returns a {{code|lua|Script}} object representing the script. If no match is found at all, it returns the {{code|lua|None}} script object.]==] | --[==[Given some text, this function iterates through the scripts of a given language and tries to find the script that best matches the text. It returns a {{code|lua|Script}} object representing the script. If no match is found at all, it returns the {{code|lua|None}} script object.]==] | ||
function Language:findBestScript(text, forceDetect) | function Language:findBestScript(text, forceDetect) | ||
if not text or text == "" or text == "-" then | if not text or text == "" or text == "-" then | ||
return | return get_script("None") | ||
end | end | ||
-- Differs from table returned by getScriptCodes, as Hants is not normalized into its constituents. | -- Differs from table returned by getScriptCodes, as Hants is not normalized into its constituents. | ||
codes = | local codes = self._bestScriptCodes | ||
if codes == nil then | |||
codes = self._data[4] | |||
codes = codes and split(codes, ",", true, true) or {"None"} | |||
local first_sc = | self._bestScriptCodes = codes | ||
end | |||
local first_sc = codes[1] | |||
if first_sc == "All" then | if first_sc == "All" then | ||
return | return find_best_script_without_lang(text) | ||
end | end | ||
local codes_len = #codes | local codes_len = #codes | ||
if not (forceDetect or first_sc == "Hants" or codes_len > 1) then | if not (forceDetect or first_sc == "Hants" or codes_len > 1) then | ||
first_sc = get_script(first_sc | first_sc = get_script(first_sc) | ||
local charset = first_sc.characters | local charset = first_sc.characters | ||
return charset and umatch(text, "[" .. charset .. "]") and first_sc or | return charset and umatch(text, "[" .. charset .. "]") and first_sc or get_script("None") | ||
end | end | ||
-- Remove all formatting characters. | -- Remove all formatting characters. | ||
text = | text = get_plaintext(text) | ||
-- Remove all spaces and any ASCII punctuation. Some non-ASCII punctuation is script-specific, so can't be removed. | -- Remove all spaces and any ASCII punctuation. Some non-ASCII punctuation is script-specific, so can't be removed. | ||
text = ugsub(text, "[%s!\"#%%&'()*,%-./:;?@[\\%]_{}]+", "") | text = ugsub(text, "[%s!\"#%%&'()*,%-./:;?@[\\%]_{}]+", "") | ||
if #text == 0 then | if #text == 0 then | ||
return get_script("None" | return get_script("None") | ||
end | end | ||
-- Try to match every script against the text, | -- Try to match every script against the text, | ||
-- and return the one with the most matching characters. | -- and return the one with the most matching characters. | ||
| Line 599: | Line 607: | ||
-- Special case for "Hants", which is a special code that represents whichever of "Hant" or "Hans" best matches, or "Hani" if they match equally. This avoids having to list all three. In addition, "Hants" will be treated as the best match if there is at least one matching character, under the assumption that a Han script is desirable in terms that contain a mix of Han and other scripts (not counting those which use Jpan or Kore). | -- Special case for "Hants", which is a special code that represents whichever of "Hant" or "Hans" best matches, or "Hani" if they match equally. This avoids having to list all three. In addition, "Hants" will be treated as the best match if there is at least one matching character, under the assumption that a Han script is desirable in terms that contain a mix of Han and other scripts (not counting those which use Jpan or Kore). | ||
if sc == "Hants" then | if sc == "Hants" then | ||
local Hani = get_script("Hani") | |||
if not Hant_chars then | |||
Hant_chars = load_data("Module:zh/data/ts") | |||
Hans_chars = load_data("Module:zh/data/st") | |||
end | |||
local t, s, found = 0, 0 | |||
-- This is faster than using mw.ustring.gmatch directly. | |||
for ch in gmatch(ugsub(text, "[" .. Hani.characters .. "]", "\255%0"), "\255(.[\128-\191]*)") do | |||
found = true | |||
if Hant_chars[ch] then | |||
t = t + 1 | |||
if Hans_chars[ch] then | |||
s = s + 1 | |||
end | |||
elseif Hans_chars[ch] then | |||
s = s + 1 | |||
else | |||
t, s = t + 1, s + 1 | |||
end | |||
end | |||
if found then | |||
if t == s then | |||
return Hani | |||
end | |||
return get_script(t > s and "Hant" or "Hans") | |||
end | |||
else | else | ||
sc = get_script(sc | sc = get_script(sc) | ||
if not length then | if not length then | ||
length = ulen(text) | length = ulen(text) | ||
end | end | ||
-- Count characters by removing everything in the script's charset and comparing to the original length. | -- Count characters by removing everything in the script's charset and comparing to the original length. | ||
local charset = sc.characters | local charset = sc.characters | ||
local count = charset and length - ulen(ugsub(text, "[" .. charset .. "]+", "")) or 0 | local count = charset and length - ulen(ugsub(text, "[" .. charset .. "]+", "")) or 0 | ||
if count >= length then | if count >= length then | ||
return sc | return sc | ||
| Line 618: | Line 653: | ||
end | end | ||
end | end | ||
-- Return best matching script, or otherwise None. | -- Return best matching script, or otherwise None. | ||
return bestscript or get_script("None" | return bestscript or get_script("None") | ||
end | end | ||