45,646
edits
(Defining getFullCode()) |
No edit summary |
||
Line 298: | Line 298: | ||
end | end | ||
return parent or nil | return parent or nil | ||
end | |||
--[==[Chooses, among the scripts listed for this language, the one that best fits the given text, and returns it as a {{code|lua|Script}} object. The {{code|lua|None}} script object is returned when the text is missing, empty or {{code|lua|"-"}}, when nothing is left after stripping formatting, whitespace and ASCII punctuation, or when no listed script matches any character. If the first listed script code is {{code|lua|"All"}}, detection is delegated to {{code|lua|findBestScriptWithoutLang}} in [[Module:scripts]]. Unless {{code|lua|forceDetect}} is set, a language that does not list several scripts (and whose first code is not {{code|lua|"Hants"}}) takes a shortcut: its script is returned as soon as one character of the text belongs to it.]==]
function Language:findBestScript(text, forceDetect)
	local useRequire = self._useRequire

	-- Missing, empty or placeholder ("-") text has no script to detect.
	if not text or text == "" or text == "-" then
		return require("Module:scripts").getByCode("None", nil, nil, useRequire)
	end

	-- Raw script codes, split once and cached on the object. This is not the same list as the one
	-- returned by getScriptCodes, because "Hants" is kept as-is rather than expanded.
	local sc_codes = self._bestScriptCodes
	if sc_codes == nil then
		local raw = self._rawData[4]
		sc_codes = raw and split(raw, ",", true, true) or {"None"}
		self._bestScriptCodes = sc_codes
	end

	local only = sc_codes[1]
	if only == "All" then
		return require("Module:scripts").findBestScriptWithoutLang(text)
	end

	local get_script = require("Module:scripts").getByCode
	local n_codes = #sc_codes

	-- Shortcut: no forced detection, first code is not "Hants", and no more than one code listed.
	if not forceDetect and only ~= "Hants" and n_codes <= 1 then
		only = get_script(only, nil, nil, useRequire)
		local charset = only.characters
		if charset and umatch(text, "[" .. charset .. "]") then
			return only
		end
		return get_script("None", nil, nil, useRequire)
	end

	-- Strip formatting first.
	text = require("Module:utilities").get_plaintext(text)

	-- Then drop whitespace and ASCII punctuation. Non-ASCII punctuation stays, since some of it is
	-- specific to a script.
	text = ugsub(text, "[%s!\"#%%&'()*,%-./:;?@[\\%]_{}]+", "")
	if text == "" then
		return get_script("None", nil, nil, useRequire)
	end

	-- Score each candidate script by how many characters of the text it covers, and keep the
	-- highest scorer (the earliest one wins ties).
	local top_count, top_script, length = 0
	for idx = 1, n_codes do
		local code = sc_codes[idx]
		-- "Hants" is a special code meant to stand for whichever of "Hant" or "Hans" fits best
		-- (or "Hani" when they fit equally), so that all three need not be listed.
		-- NOTE(review): no handling for "Hants" is visible in this view, so such an entry is simply
		-- skipped here and never becomes a candidate - confirm whether its detection logic is missing.
		if code ~= "Hants" then
			local sc = get_script(code, nil, nil, useRequire)
			-- The text length is only measured once, on first need.
			length = length or ulen(text)
			-- Matching characters = original length minus what remains after deleting every
			-- character in the script's charset.
			local charset = sc.characters
			local matched = 0
			if charset then
				matched = length - ulen(ugsub(text, "[" .. charset .. "]+", ""))
			end
			if matched >= length then
				-- Every character belongs to this script: no need to look further.
				return sc
			end
			if matched > top_count then
				top_count = matched
				top_script = sc
			end
		end
	end

	-- Best candidate found, or None when nothing matched at all.
	return top_script or get_script("None", nil, nil, useRequire)
end