Module:languages: Difference between revisions

no edit summary
No edit summary
No edit summary
Line 37: Line 37:
local umatch = ustring.match
local umatch = ustring.match
local uupper = m_str_utils.upper
local uupper = m_str_utils.upper
-- Loaded as needed by findBestScript.
-- Both start out nil; the "Hants" branch of findBestScript fills them on first
-- use (Hant_chars from "Module:zh/data/ts", Hans_chars from "Module:zh/data/st")
-- and then uses them as per-character lookup tables.
local Hans_chars
local Hant_chars


local export = {}
local export = {}
Line 60: Line 64:
(indicating a missing code). If `not_real_lang` is given, this check is suppressed.
(indicating a missing code). If `not_real_lang` is given, this check is suppressed.
]=]
]=]
-- Lazy accessor for `getByCode` in [[Module:scripts]]. The first call requires
-- the module, rebinds `get_script` to the real function so that later calls skip
-- this stub entirely, and then forwards all arguments (and all return values).
local get_script
get_script = function(...)
	local get_by_code = require("Module:scripts").getByCode
	get_script = get_by_code
	return get_by_code(...)
end


-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.).
-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.).
Line 556: Line 565:
--[==[Given some text, this function iterates through the scripts of a given language and tries to find the script that best matches the text. It returns a {{code|lua|Script}} object representing the script. If no match is found at all, it returns the {{code|lua|None}} script object.]==]
--[==[Given some text, this function iterates through the scripts of a given language and tries to find the script that best matches the text. It returns a {{code|lua|Script}} object representing the script. If no match is found at all, it returns the {{code|lua|None}} script object.]==]
function Language:findBestScript(text, forceDetect)
function Language:findBestScript(text, forceDetect)
local useRequire = self._useRequire
if not text or text == "" or text == "-" then
if not text or text == "" or text == "-" then
return require("Module:scripts").getByCode("None", nil, nil, useRequire)
return get_script("None")
end
end
 
-- Differs from table returned by getScriptCodes, as Hants is not normalized into its constituents.
-- Differs from table returned by getScriptCodes, as Hants is not normalized into its constituents.
codes = table.concat(self._rawData["scripts"],", ")
local codes = self._bestScriptCodes
codes = codes and split(codes, ",", true, true) or {"None"}
if codes == nil then
self._bestScriptCodes = codes
codes = self._data[4]
codes = codes and split(codes, ",", true, true) or {"None"}
local first_sc = self._rawData.scripts[1]
self._bestScriptCodes = codes
end
 
local first_sc = codes[1]
 
if first_sc == "All" then
if first_sc == "All" then
return require("Module:scripts").findBestScriptWithoutLang(text)
return find_best_script_without_lang(text)
end
end
 
local get_script = require("Module:scripts").getByCode
local codes_len = #codes
local codes_len = #codes
 
if not (forceDetect or first_sc == "Hants" or codes_len > 1) then
if not (forceDetect or first_sc == "Hants" or codes_len > 1) then
first_sc = get_script(first_sc, nil, nil, useRequire)
first_sc = get_script(first_sc)
local charset = first_sc.characters
local charset = first_sc.characters
return charset and umatch(text, "[" .. charset .. "]") and first_sc or
return charset and umatch(text, "[" .. charset .. "]") and first_sc or get_script("None")
get_script("None", nil, nil, useRequire)
end
end
 
-- Remove all formatting characters.
-- Remove all formatting characters.
text = require("Module:utilities").get_plaintext(text)
text = get_plaintext(text)
 
-- Remove all spaces and any ASCII punctuation. Some non-ASCII punctuation is script-specific, so can't be removed.
-- Remove all spaces and any ASCII punctuation. Some non-ASCII punctuation is script-specific, so can't be removed.
text = ugsub(text, "[%s!\"#%%&'()*,%-./:;?@[\\%]_{}]+", "")
text = ugsub(text, "[%s!\"#%%&'()*,%-./:;?@[\\%]_{}]+", "")
if #text == 0 then
if #text == 0 then
return get_script("None", nil, nil, useRequire)
return get_script("None")
end
end
 
-- Try to match every script against the text,
-- Try to match every script against the text,
-- and return the one with the most matching characters.
-- and return the one with the most matching characters.
Line 599: Line 607:
-- Special case for "Hants", which is a special code that represents whichever of "Hant" or "Hans" best matches, or "Hani" if they match equally. This avoids having to list all three. In addition, "Hants" will be treated as the best match if there is at least one matching character, under the assumption that a Han script is desirable in terms that contain a mix of Han and other scripts (not counting those which use Jpan or Kore).
-- Special case for "Hants", which is a special code that represents whichever of "Hant" or "Hans" best matches, or "Hani" if they match equally. This avoids having to list all three. In addition, "Hants" will be treated as the best match if there is at least one matching character, under the assumption that a Han script is desirable in terms that contain a mix of Han and other scripts (not counting those which use Jpan or Kore).
if sc == "Hants" then
if sc == "Hants" then
local Hani = get_script("Hani")
if not Hant_chars then
Hant_chars = load_data("Module:zh/data/ts")
Hans_chars = load_data("Module:zh/data/st")
end
local t, s, found = 0, 0
-- This is faster than using mw.ustring.gmatch directly.
for ch in gmatch(ugsub(text, "[" .. Hani.characters .. "]", "\255%0"), "\255(.[\128-\191]*)") do
found = true
if Hant_chars[ch] then
t = t + 1
if Hans_chars[ch] then
s = s + 1
end
elseif Hans_chars[ch] then
s = s + 1
else
t, s = t + 1, s + 1
end
end
if found then
if t == s then
return Hani
end
return get_script(t > s and "Hant" or "Hans")
end
else
else
sc = get_script(sc, nil, nil, useRequire)
sc = get_script(sc)
 
if not length then
if not length then
length = ulen(text)
length = ulen(text)
end
end
 
-- Count characters by removing everything in the script's charset and comparing to the original length.
-- Count characters by removing everything in the script's charset and comparing to the original length.
local charset = sc.characters
local charset = sc.characters
local count = charset and length - ulen(ugsub(text, "[" .. charset .. "]+", "")) or 0
local count = charset and length - ulen(ugsub(text, "[" .. charset .. "]+", "")) or 0
 
if count >= length then
if count >= length then
return sc
return sc
Line 618: Line 653:
end
end
end
end
 
-- Return best matching script, or otherwise None.
-- Return best matching script, or otherwise None.
return bestscript or get_script("None", nil, nil, useRequire)
return bestscript or get_script("None")
end
end