Module:languages: Difference between revisions
Jump to navigation
Jump to search
No edit summary |
No edit summary |
||
(35 intermediate revisions by 2 users not shown) | |||
Line 1: | Line 1: | ||
local require = require | |||
local m_str_utils = require("Module:string utilities") | |||
local m_table = require("Module:table") | |||
local mw = mw | |||
local string = string | |||
local table = table | |||
local ustring = mw.ustring | |||
local char = string.char | |||
local check_object = require("Module:utilities").check_object | |||
local concat = table.concat | |||
local decode_entities = m_str_utils.decode_entities | |||
local decode_uri = m_str_utils.decode_uri | |||
local find = string.find | |||
local floor = math.floor | |||
local gmatch = string.gmatch | |||
local gsub = string.gsub | |||
local insert = table.insert | |||
local ipairs = ipairs | |||
local list_to_set = m_table.listToSet | |||
local load_data = mw.loadData | |||
local match = string.match | |||
local next = next | |||
local pairs = pairs | |||
local remove = table.remove | |||
local remove_duplicates = m_table.removeDuplicates | |||
local select = select | |||
local setmetatable = setmetatable | |||
local shallowcopy = m_table.shallowcopy | |||
local split = m_str_utils.split | |||
local sub = string.sub | |||
local type = type | |||
local ugsub = ustring.gsub | |||
local ulen = m_str_utils.len | |||
local ulower = m_str_utils.lower | |||
local umatch = ustring.match | |||
local uupper = m_str_utils.upper | |||
local export = {} | local export = {} | ||
Line 21: | Line 60: | ||
(indicating a missing code). If `not_real_lang` is given, this check is suppressed. | (indicating a missing code). If `not_real_lang` is given, this check is suppressed. | ||
]=] | ]=] | ||
-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.). | |||
local function escape_risky_characters(text) | |||
local encode_entities = require("Module:string/encode entities") | |||
-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software. | |||
if umatch(text, "^%s*$") then | |||
return encode_entities(text, text) | |||
else | |||
return encode_entities(text, "!#%&*+/:;<=>?@[\\]_{|}") | |||
end | |||
end | |||
-- Process carets (and any escapes). Default to simple removal, if no pattern/replacement is given. | |||
local function processCarets(text, pattern, repl) | |||
local rep | |||
repeat | |||
text, rep = gsub(text, "\\\\(\\*^)", "\3%1") | |||
until rep == 0 | |||
return text:gsub("\\^", "\4") | |||
:gsub(pattern or "%^", repl or "") | |||
:gsub("\3", "\\") | |||
:gsub("\4", "^") | |||
end | |||
-- Remove carets if they are used to capitalize parts of transliterations (unless they have been escaped). | |||
local function removeCarets(text, sc) | |||
if not sc:hasCapitalization() and sc:isTransliterated() and text:find("^", 1, true) then | |||
return processCarets(text) | |||
else | |||
return text | |||
end | |||
end | |||
-- Temporarily convert various formatting characters to PUA to prevent them from being disrupted by the substitution process. | |||
local function doTempSubstitutions(text, subbedChars, keepCarets, noTrim) | |||
-- Clone so that we don't insert any extra patterns into the table in package.loaded. For some reason, using require seems to keep memory use down; probably because the table is always cloned. | |||
local patterns = shallowcopy(require("Module:languages/data/patterns")) | |||
if keepCarets then | |||
insert(patterns, "((\\+)%^)") | |||
insert(patterns, "((%^))") | |||
end | |||
-- Ensure any whitespace at the beginning and end is temp substituted, to prevent it from being accidentally trimmed. We only want to trim any final spaces added during the substitution process (e.g. by a module), which means we only do this during the first round of temp substitutions. | |||
if not noTrim then | |||
insert(patterns, "^([\128-\191\244]*(%s+))") | |||
insert(patterns, "((%s+)[\128-\191\244]*)$") | |||
end | |||
-- Pre-substitution, of "[[" and "]]", which makes pattern matching more accurate. | |||
text = gsub(text, "%f[%[]%[%[", "\1") | |||
:gsub("%f[%]]%]%]", "\2") | |||
local i, pe = #subbedChars, require("Module:string utilities").pattern_escape | |||
for _, pattern in ipairs(patterns) do | |||
-- Patterns ending in \0 stand are for things like "[[" or "]]"), so the inserted PUA are treated as breaks between terms by modules that scrape info from pages. | |||
local term_divider | |||
pattern = gsub(pattern, "%z$", function(divider) | |||
term_divider = divider == "\0" | |||
return "" | |||
end) | |||
text = gsub(text, pattern, function(...) | |||
local m = {...} | |||
local m1New = m[1] | |||
for k = 2, #m do | |||
local n = i + k - 1 | |||
subbedChars[n] = m[k] | |||
local byte2 = floor(n / 4096) % 64 + (term_divider and 128 or 136) | |||
local byte3 = floor(n / 64) % 64 + 128 | |||
local byte4 = n % 64 + 128 | |||
m1New = gsub(m1New, pe(m[k]), "\244" .. char(byte2) .. char(byte3) .. char(byte4), 1) | |||
end | |||
i = i + #m - 1 | |||
return m1New | |||
end) | |||
end | |||
text = gsub(text, "\1", "%[%[") | |||
:gsub("\2", "%]%]") | |||
return text, subbedChars | |||
end | |||
-- Split the text into sections, based on the presence of temporarily substituted formatting characters, then iterate over each one to apply substitutions. This avoids putting PUA characters through language-specific modules, which may be unequipped for them. | |||
local function iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, substitution_data, function_name) | |||
local pe = require("Module:string utilities").pattern_escape | |||
local fail, cats, sections = nil, {} | |||
-- See [[Module:languages/data]]. | |||
if not find(text, "\244") or self:loadData("Module:languages/data").contiguous_substitution[self._code] then | |||
sections = {text} | |||
else | |||
sections = split(text, "\244[\128-\143][\128-\191]*", true) | |||
end | |||
for _, section in ipairs(sections) do | |||
-- Don't bother processing empty strings or whitespace (which may also not be handled well by dedicated modules). | |||
if gsub(section, "%s+", "") ~= "" then | |||
local sub, sub_fail, sub_cats = require("Module:languages/doSubstitutions")(section, self, sc, substitution_data, function_name) | |||
-- Second round of temporary substitutions, in case any formatting was added by the main substitution process. However, don't do this if the section contains formatting already (as it would have had to have been escaped to reach this stage, and therefore should be given as raw text). | |||
if sub and subbedChars then | |||
local noSub | |||
for _, pattern in ipairs(require("Module:languages/data/patterns")) do | |||
if match(section, pattern .. "%z?") then | |||
noSub = true | |||
end | |||
end | |||
if not noSub then | |||
sub, subbedChars = doTempSubstitutions(sub, subbedChars, keepCarets, true) | |||
end | |||
end | |||
if (not sub) or sub_fail then | |||
text = sub | |||
fail = sub_fail | |||
cats = sub_cats or {} | |||
break | |||
end | |||
text = sub and gsub(text, pe(section), pe(sub), 1) or text | |||
if type(sub_cats) == "table" then | |||
for _, cat in ipairs(sub_cats) do | |||
insert(cats, cat) | |||
end | |||
end | |||
end | |||
end | |||
-- Trim, unless there are only spacing characters, while ignoring any final formatting characters. | |||
text = text and text:gsub("^([\128-\191\244]*)%s+(%S)", "%1%2") | |||
:gsub("(%S)%s+([\128-\191\244]*)$", "%1%2") | |||
-- Remove duplicate categories. | |||
if #cats > 1 then | |||
cats = remove_duplicates(cats) | |||
end | |||
return text, fail, cats, subbedChars | |||
end | |||
local function normalize(text, sc) | |||
text = sc:fixDiscouragedSequences(text) | |||
return sc:toFixedNFD(text) | |||
end | |||
-- Check if the raw text is an unsupported title, and if so return that. Otherwise, remove HTML entities. We do the pre-conversion to avoid loading the unsupported title list unnecessarily. | |||
local function checkNoEntities(self, text) | |||
local textNoEnc = decode_entities(text) | |||
if textNoEnc ~= text and self:loadData("Module:links/data").unsupported_titles[text] then | |||
return text | |||
else | |||
return textNoEnc | |||
end | |||
end | |||
-- Reinsert any formatting that was temporarily substituted. | |||
local function undoTempSubstitutions(text, subbedChars) | |||
local pe = require("Module:string utilities").pattern_escape | |||
for i = 1, #subbedChars do | |||
local byte2 = floor(i / 4096) % 64 + 128 | |||
local byte3 = floor(i / 64) % 64 + 128 | |||
local byte4 = i % 64 + 128 | |||
text = gsub(text, "\244[" .. char(byte2) .. char(byte2+8) .. "]" .. char(byte3) .. char(byte4), pe(subbedChars[i])) | |||
end | |||
text = gsub(text, "\1", "%[%[") | |||
:gsub("\2", "%]%]") | |||
return text | |||
end | |||
-- If no script object is provided (or if it's invalid or None), get one. | |||
local function checkScript(text, self, sc) | |||
if not check_object("script", true, sc) or sc:getCode() == "None" then | |||
return self:findBestScript(text) | |||
else | |||
return sc | |||
end | |||
end | |||
--[==[Create the form used as as a basis for display text and transliteration.]==] | |||
local function processDisplayText(text, self, sc, keepCarets, keepPrefixes) | |||
local subbedChars = {} | |||
text, subbedChars = doTempSubstitutions(text, subbedChars, keepCarets) | |||
text = decode_uri(text, "PATH") | |||
text = checkNoEntities(self, text) | |||
sc = checkScript(text, self, sc) | |||
local fail, cats | |||
text = normalize(text, sc) | |||
text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, self._rawData.display_text, "makeDisplayText") | |||
text = removeCarets(text, sc) | |||
-- Remove any interwiki link prefixes (unless they have been escaped or this has been disabled). | |||
if find(text, ":") and not keepPrefixes then | |||
local rep | |||
repeat | |||
text, rep = gsub(text, "\\\\(\\*:)", "\3%1") | |||
until rep == 0 | |||
text = gsub(text, "\\:", "\4") | |||
while true do | |||
local prefix = gsub(text, "^(.-):.+", function(m1) | |||
return gsub(m1, "\244[\128-\191]*", "") | |||
end) | |||
if not prefix or prefix == text then | |||
break | |||
end | |||
local lower_prefix = ulower(prefix) | |||
if not (self:loadData("Module:data/interwikis")[lower_prefix] or prefix == "") then | |||
break | |||
end | |||
text = gsub(text, "^(.-):(.*)", function(m1, m2) | |||
local ret = {} | |||
for subbedChar in gmatch(m1, "\244[\128-\191]*") do | |||
insert(ret, subbedChar) | |||
end | |||
return concat(ret) .. m2 | |||
end) | |||
end | |||
text = gsub(text, "\3", "\\") | |||
:gsub("\4", ":") | |||
end | |||
return text, fail, cats, subbedChars | |||
end | |||
function export.err(lang_code, param, code_desc, template_tag, not_real_lang) | function export.err(lang_code, param, code_desc, template_tag, not_real_lang) | ||
Line 114: | Line 368: | ||
end | end | ||
function Language:getMainCategoryName() | |||
return self._rawData["main_category"] or "lemma" | |||
end | |||
function Language:getOtherNames(onlyOtherNames) | function Language:getOtherNames(onlyOtherNames) | ||
Line 209: | Line 466: | ||
return self._familyObject | return self._familyObject | ||
end | end | ||
--[==[Returns the family code in the language's data file.]==] | |||
function Language:getFamilyCode() | |||
local family = self._familyCode | |||
if family == nil then | |||
-- If the value is nil, it's cached as false. | |||
family = self._rawData[3] or false | |||
self._familyCode = family | |||
end | |||
return family or nil | |||
end | |||
function Language:getFamilyName() | |||
local family = self._familyName | |||
if family == nil then | |||
family = self:getFamily() | |||
-- If the value is nil, it's cached as false. | |||
family = family and family:getCanonicalName() or false | |||
self._familyName = family | |||
end | |||
return family or nil | |||
end | |||
--[==[Check whether the language belongs to `family` (which can be a family code or object). A list of objects can be given in place of `family`; in that case, return true if the language belongs to any of the specified families. Note that some languages (in particular, certain creoles) can have multiple immediate ancestors potentially belonging to different families; in that case, return true if the language belongs to any of the specified families.]==] | |||
function Language:inFamily(...) | |||
--check_object("family", nil, ...) | |||
for _, family in ipairs{...} do | |||
if type(family) == "table" then | |||
family = family:getCode() | |||
end | |||
local self_family_code = self:getFamilyCode() | |||
if not self_family_code then | |||
return false | |||
elseif self_family_code == family then | |||
return true | |||
end | |||
local self_family = self:getFamily() | |||
if self_family:inFamily(family) then | |||
return true | |||
-- If the family isn't a real family (e.g. creoles) check any ancestors. | |||
elseif self_family:getFamilyCode() == "qfa-not" then | |||
local ancestors = self:getAncestors() | |||
for _, ancestor in ipairs(ancestors) do | |||
if ancestor:inFamily(family) then | |||
return true | |||
end | |||
end | |||
end | |||
end | |||
return false | |||
end | |||
function Language:getParent() | |||
local parent = self._parentObject | |||
if parent == nil then | |||
parent = self:getParentCode() | |||
-- If the value is nil, it's cached as false. | |||
parent = parent and export.getByCode(parent, nil, true, true, self._useRequire) or false | |||
self._parentObject = parent | |||
end | |||
return parent or nil | |||
end | |||
function Language:getParentCode() | |||
local parent = self._parentCode | |||
if parent == nil then | |||
-- If the value is nil, it's cached as false. | |||
parent = self._rawData[5] or false | |||
self._parentCode = parent | |||
end | |||
return parent or nil | |||
end | |||
function Language:getParentName() | |||
local parent = self._parentName | |||
if parent == nil then | |||
parent = self:getParent() | |||
-- If the value is nil, it's cached as false. | |||
parent = parent and parent:getCanonicalName() or false | |||
self._parentName = parent | |||
end | |||
return parent or nil | |||
end | |||
--[==[Given some text, this function iterates through the scripts of a given language and tries to find the script that best matches the text. It returns a {{code|lua|Script}} object representing the script. If no match is found at all, it returns the {{code|lua|None}} script object.]==] | |||
function Language:findBestScript(text, forceDetect) | |||
local useRequire = self._useRequire | |||
if not text or text == "" or text == "-" then | |||
return require("Module:scripts").getByCode("None", nil, nil, useRequire) | |||
end | |||
-- Differs from table returned by getScriptCodes, as Hants is not normalized into its constituents. | |||
codes = table.concat(self._rawData["scripts"],", ") | |||
codes = codes and split(codes, ",", true, true) or {"None"} | |||
self._bestScriptCodes = codes | |||
local first_sc = self._rawData.scripts[1] | |||
if first_sc == "All" then | |||
return require("Module:scripts").findBestScriptWithoutLang(text) | |||
end | |||
local get_script = require("Module:scripts").getByCode | |||
local codes_len = #codes | |||
if not (forceDetect or first_sc == "Hants" or codes_len > 1) then | |||
first_sc = get_script(first_sc, nil, nil, useRequire) | |||
local charset = first_sc.characters | |||
return charset and umatch(text, "[" .. charset .. "]") and first_sc or | |||
get_script("None", nil, nil, useRequire) | |||
end | |||
-- Remove all formatting characters. | |||
text = require("Module:utilities").get_plaintext(text) | |||
-- Remove all spaces and any ASCII punctuation. Some non-ASCII punctuation is script-specific, so can't be removed. | |||
text = ugsub(text, "[%s!\"#%%&'()*,%-./:;?@[\\%]_{}]+", "") | |||
if #text == 0 then | |||
return get_script("None", nil, nil, useRequire) | |||
end | |||
-- Try to match every script against the text, | |||
-- and return the one with the most matching characters. | |||
local bestcount, bestscript, length = 0 | |||
for i = 1, codes_len do | |||
local sc = codes[i] | |||
-- Special case for "Hants", which is a special code that represents whichever of "Hant" or "Hans" best matches, or "Hani" if they match equally. This avoids having to list all three. In addition, "Hants" will be treated as the best match if there is at least one matching character, under the assumption that a Han script is desirable in terms that contain a mix of Han and other scripts (not counting those which use Jpan or Kore). | |||
if sc == "Hants" then | |||
else | |||
sc = get_script(sc, nil, nil, useRequire) | |||
if not length then | |||
length = ulen(text) | |||
end | |||
-- Count characters by removing everything in the script's charset and comparing to the original length. | |||
local charset = sc.characters | |||
local count = charset and length - ulen(ugsub(text, "[" .. charset .. "]+", "")) or 0 | |||
if count >= length then | |||
return sc | |||
elseif count > bestcount then | |||
bestcount = count | |||
bestscript = sc | |||
end | |||
end | |||
end | |||
-- Return best matching script, or otherwise None. | |||
return bestscript or get_script("None", nil, nil, useRequire) | |||
end | |||
function Language:getParentChain() | |||
local chain = self._parentChain | |||
if chain == nil then | |||
chain = {} | |||
local parent, n = self:getParent(), 0 | |||
while parent do | |||
n = n + 1 | |||
chain[n] = parent | |||
parent = parent:getParent() | |||
end | |||
self._parentChain = chain | |||
end | |||
return chain | |||
end | |||
function Language:hasParent(...) | |||
--check_object("language", nil, ...) | |||
for _, otherlang in ipairs{...} do | |||
for _, parent in ipairs(self:getParentChain()) do | |||
if type(otherlang) == "string" then | |||
if otherlang == parent:getCode() then return true end | |||
else | |||
if otherlang:getCode() == parent:getCode() then return true end | |||
end | |||
end | |||
end | |||
return false | |||
end | |||
--[==[ | |||
If the language is etymology-only, this iterates through parents until a full language or family is found, and the | |||
corresponding object is returned. If the language is a full language, then it simply returns itself. | |||
]==] | |||
function Language:getFull() | |||
local full = self._fullObject | |||
if full == nil then | |||
full = self:getFullCode() | |||
full = full == self._code and self or | |||
export.getByCode(full, nil, nil, nil, self._useRequire) | |||
self._fullObject = full | |||
end | |||
return full | |||
end | |||
--[==[ | |||
If the language is an etymology-only language, this iterates through parents until a full language or family is | |||
found, and the corresponding code is returned. If the language is a full language, then it simply returns the | |||
language code. | |||
]==] | |||
function Language:getFullCode() | |||
return self._fullCode or self._code | |||
end | |||
--[==[ | |||
If the language is an etymology-only language, this iterates through parents until a full language or family is | |||
found, and the corresponding canonical name is returned. If the language is a full language, then it simply returns | |||
the canonical name of the language. | |||
]==] | |||
function Language:getFullName() | |||
local full = self._fullName | |||
if full == nil then | |||
full = self:getFull():getCanonicalName() | |||
self._fullName = full | |||
end | |||
return full | |||
end | |||
Line 250: | Line 727: | ||
end | end | ||
end | end | ||
end | |||
end | |||
--[==[Generates alternative forms using a specified method, and returns them as a table. If no method is specified, returns a table containing only the input term.]==] | |||
function Language:generateForms(text, sc) | |||
if self._rawData.generate_forms then | |||
sc = checkScript(text, self, sc) | |||
return require("Module:" .. self._rawData.generate_forms).generateForms(text, self._code, sc:getCode()) | |||
else | |||
return {text} | |||
end | end | ||
end | end | ||
Line 330: | Line 817: | ||
end | end | ||
function Language:getTypes() | |||
local types = self._types | |||
if types == nil then | |||
types = {language = true} | |||
if self:getCode() == self._code then | |||
types.full = true | |||
else | |||
types["etymology-only"] = true | |||
end | |||
local type = self._rawData.type or "regular" | |||
for t in gmatch(type, "[^,]+") do | |||
types[t] = true | |||
end | |||
self._types = types | |||
end | |||
return types | |||
end | |||
--[==[Given a list of types as strings, returns true if the language has all of them.]==] | |||
function Language:hasType(...) | |||
local args, types = {...}, self:getTypes() | |||
for i = 1, #args do | |||
if not types[args[i]] then | |||
return false | |||
end | |||
end | |||
return true | |||
end | |||
-- Apply display-text replacements to `text`, if any. | -- Apply display-text replacements to `text`, if any. | ||
Line 391: | Line 906: | ||
function Language:transliterate(text, sc, module_override) | function Language:transliterate(text, sc, module_override) | ||
-- If there is no text, or the language doesn't have transliteration data and there's no override, return nil. | |||
if not ((module_override or | if not (self._rawData.translit_module or module_override) then | ||
return nil | return nil, false, {} | ||
elseif (not text) or text == "" or text == "-" then | |||
return text, false, {} | |||
end | |||
-- If the script is not transliteratable (and no override is given), return nil. | |||
sc = checkScript(text, self, sc) | |||
if not (sc:isTransliterated() or module_override) then | |||
return nil, true, {} | |||
end | |||
-- Remove any strip markers. | |||
text = mw.text.unstrip(text) | |||
-- Get the display text with the keepCarets flag set. | |||
local fail, cats, subbedChars | |||
text, fail, cats, subbedChars = processDisplayText(text, self, sc, true) | |||
-- Transliterate (using the module override if applicable). | |||
text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, true, self, sc, module_override or self._rawData.translit_module, "tr") | |||
if not text then | |||
return nil, true, cats | |||
end | end | ||
return require("Module:" .. ( | -- Incomplete transliterations return nil. | ||
local charset = sc.characters | |||
if charset and umatch(text, "[" .. charset .. "]") then | |||
-- Remove any characters in Latin, which includes Latin characters also included in other scripts (as these are false positives), as well as any PUA substitutions. Anything remaining should only be script code "None" (e.g. numerals). | |||
local check_text = ugsub(text, "[" .. require("Module:scripts").getByCode("Latn").characters .. "-]+", "") | |||
-- Set none_is_last_resort_only flag, so that any non-None chars will cause a script other than "None" to be returned. | |||
if require("Module:scripts").findBestScriptWithoutLang(check_text, true):getCode() ~= "None" then | |||
return nil, true, cats | |||
end | |||
end | |||
text = escape_risky_characters(text) | |||
text = undoTempSubstitutions(text, subbedChars) | |||
-- If the script does not use capitalization, then capitalize any letters of the transliteration which are immediately preceded by a caret (and remove the caret). | |||
if text and not sc:hasCapitalization() and text:find("^", 1, true) then | |||
text = processCarets(text, "%^([\128-\191\244]*%*?)([^\128-\191\244][\128-\191]*)", function(m1, m2) | |||
return m1 .. uupper(m2) | |||
end) | |||
end | |||
-- Track module overrides. | |||
if module_override ~= nil then | |||
track("module_override") | |||
end | |||
fail = text == nil and (not not fail) or false | |||
return text, fail, cats | |||
end | end | ||
Latest revision as of 23:25, 7 August 2024
- The following documentation is located at Module:languages/doc.[edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
local require = require
local m_str_utils = require("Module:string utilities")
local m_table = require("Module:table")
local mw = mw
local string = string
local table = table
local ustring = mw.ustring
local char = string.char
local check_object = require("Module:utilities").check_object
local concat = table.concat
local decode_entities = m_str_utils.decode_entities
local decode_uri = m_str_utils.decode_uri
local find = string.find
local floor = math.floor
local gmatch = string.gmatch
local gsub = string.gsub
local insert = table.insert
local ipairs = ipairs
local list_to_set = m_table.listToSet
local load_data = mw.loadData
local match = string.match
local next = next
local pairs = pairs
local remove = table.remove
local remove_duplicates = m_table.removeDuplicates
local select = select
local setmetatable = setmetatable
local shallowcopy = m_table.shallowcopy
local split = m_str_utils.split
local sub = string.sub
local type = type
local ugsub = ustring.gsub
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local umatch = ustring.match
local uupper = m_str_utils.upper
local export = {}
--[=[
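Throw an error for an invalid language code or script code. (This comment documents `export.err`, which is defined
further down in this module, not the function immediately below.)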
`lang_code` (required) is the bad code and can be nil or a non-string.
`param` (required) is the name of the parameter in which the code was contained. It can be a string, a number
(for a numeric param, in which case the param will show up in the error message as an ordinal such as
"first" or "second"), or `true` if no parameter can be clearly identified.
`code_desc` (optional) is text describing what the code is; by default, "language code".
`template_tag` (optional) is a string specifying the template that generated the error, or a function
to generate this string. If given, it will be displayed in the error message.
`not_real_lang` (optional), if given, indicates that the code is not in the form of a language code
(e.g. it's a script code). Normally, this function checks for things that could plausibly be a language code:
two or three lowercase letters, two or three groups of three lowercase letters with hyphens between them.
If such a pattern is found, a different error message is displayed (indicating an invalid code) than otherwise
(indicating a missing code). If `not_real_lang` is given, this check is suppressed.
]=]
-- Encode potentially troublesome characters as HTML entities, to minimize interference once the text has been returned (e.g. for "sms:a", "<!-- -->" etc.).
-- `text` is the string to encode; the result of the entity encoder is returned unchanged.
local function escape_risky_characters(text)
	local encode_entities = require("Module:string/encode entities")
	-- Default set of characters to encode.
	local charset = "!#%&*+/:;<=>?@[\\]_{|}"
	-- Text consisting solely of spacing characters (or nothing at all) is used as its own character set, since spacing characters in isolation generally need to be escaped to be processed properly by the MediaWiki software.
	if umatch(text, "^%s*$") then
		charset = text
	end
	return encode_entities(text, charset)
end
-- Handle carets (and their backslash escapes) in `text`.
-- `pattern` and `repl` are passed to gsub to process the unescaped carets; when omitted, every unescaped caret is simply removed.
-- Returns the processed text, followed by the count from the final gsub call.
local function processCarets(text, pattern, repl)
	-- Shield escaped backslashes that lead up to a caret by turning each "\\" pair into \3. One pair is consumed per pass, so repeat until nothing changes.
	local count
	repeat
		text, count = gsub(text, "\\\\(\\*^)", "\3%1")
	until count == 0
	-- Shield escaped carets as \4, so that the caret pattern cannot see them.
	text = gsub(text, "\\^", "\4")
	-- Process the remaining (unescaped) carets.
	text = gsub(text, pattern or "%^", repl or "")
	-- Restore the shielded characters as their literal forms.
	text = gsub(text, "\3", "\\")
	return gsub(text, "\4", "^")
end
-- Strip capitalization carets from `text` (escaped carets are kept; see processCarets).
-- This is only done when the script object `sc` reports no capitalization of its own, is transliterated, and the text actually contains a caret; in every other case the text is returned untouched.
local function removeCarets(text, sc)
	if sc:hasCapitalization() or not sc:isTransliterated() or not text:find("^", 1, true) then
		return text
	end
	return processCarets(text)
end
-- Temporarily convert various formatting characters to PUA to prevent them from being disrupted by the substitution process.
-- Parameters:
--   text        - the string to process.
--   subbedChars - list of strings that have already been substituted out; new entries are appended to it in place.
--   keepCarets  - if truthy, carets (and any backslashes directly preceding them) are substituted out as well.
--   noTrim      - if truthy, leading/trailing whitespace is NOT substituted out.
-- Returns the text with placeholders in place of the matched formatting, plus the (same) subbedChars table.
local function doTempSubstitutions(text, subbedChars, keepCarets, noTrim)
-- Clone so that we don't insert any extra patterns into the table in package.loaded. For some reason, using require seems to keep memory use down; probably because the table is always cloned.
local patterns = shallowcopy(require("Module:languages/data/patterns"))
-- In every pattern, capture 1 is the whole span to rewrite, and captures 2 onwards are the pieces within it that get swapped for placeholders.
if keepCarets then
insert(patterns, "((\\+)%^)")
insert(patterns, "((%^))")
end
-- Ensure any whitespace at the beginning and end is temp substituted, to prevent it from being accidentally trimmed. We only want to trim any final spaces added during the substitution process (e.g. by a module), which means we only do this during the first round of temp substitutions.
if not noTrim then
insert(patterns, "^([\128-\191\244]*(%s+))")
insert(patterns, "((%s+)[\128-\191\244]*)$")
end
-- Pre-substitution, of "[[" and "]]", which makes pattern matching more accurate.
-- The frontier patterns make each match begin at the first bracket in a run of brackets.
text = gsub(text, "%f[%[]%[%[", "\1")
:gsub("%f[%]]%]%]", "\2")
-- `i` is the number of substitutions recorded so far; new ones are numbered from i + 1.
local i, pe = #subbedChars, require("Module:string utilities").pattern_escape
for _, pattern in ipairs(patterns) do
-- Patterns ending in \0 are for things like "[[" or "]]", so the inserted PUA are treated as breaks between terms by modules that scrape info from pages.
local term_divider
-- Strip the trailing \0 marker (if any) from the pattern, remembering whether it was there.
pattern = gsub(pattern, "%z$", function(divider)
term_divider = divider == "\0"
return ""
end)
text = gsub(text, pattern, function(...)
local m = {...}
local m1New = m[1]
for k = 2, #m do
-- Index under which this captured piece is stored in subbedChars.
local n = i + k - 1
subbedChars[n] = m[k]
-- Encode the index as three trailing bytes (6 bits each) after a leading \244. The second byte is offset by 128 for term dividers and by 136 otherwise, which lets the two kinds be told apart.
local byte2 = floor(n / 4096) % 64 + (term_divider and 128 or 136)
local byte3 = floor(n / 64) % 64 + 128
local byte4 = n % 64 + 128
-- Swap only the first occurrence of the captured piece inside capture 1 for its placeholder.
m1New = gsub(m1New, pe(m[k]), "\244" .. char(byte2) .. char(byte3) .. char(byte4), 1)
end
-- Advance the counter by the number of pieces substituted in this match.
i = i + #m - 1
return m1New
end)
end
-- Restore any "[[" and "]]" that were not consumed by a pattern.
text = gsub(text, "\1", "%[%[")
:gsub("\2", "%]%]")
return text, subbedChars
end
-- Split the text into sections, based on the presence of temporarily substituted formatting characters, then iterate over each one to apply substitutions. This avoids putting PUA characters through language-specific modules, which may be unequipped for them.
-- Parameters:
--   text              - the string to process (may already contain \244-prefixed placeholders).
--   subbedChars       - list of substituted-out strings, or a falsy value to skip the second round of temp substitutions.
--   keepCarets        - passed through to doTempSubstitutions.
--   self              - the language object; its loadData method and _code field are used here.
--   sc                - the script object, passed through to the substitution module.
--   substitution_data - substitution data, passed through to the substitution module.
--   function_name     - function name, passed through to the substitution module.
-- Returns: text (nil if a substitution returned nothing), fail, cats (list of categories), subbedChars.
local function iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, substitution_data, function_name)
local pe = require("Module:string utilities").pattern_escape
-- `cats` starts out as an empty list; `fail` and `sections` start out as nil.
local fail, cats, sections = nil, {}
-- See [[Module:languages/data]].
-- The text is processed as a single section if it contains no placeholders, or if the language is flagged in contiguous_substitution.
if not find(text, "\244") or self:loadData("Module:languages/data").contiguous_substitution[self._code] then
sections = {text}
else
sections = split(text, "\244[\128-\143][\128-\191]*", true)
end
for _, section in ipairs(sections) do
-- Don't bother processing empty strings or whitespace (which may also not be handled well by dedicated modules).
if gsub(section, "%s+", "") ~= "" then
local sub, sub_fail, sub_cats = require("Module:languages/doSubstitutions")(section, self, sc, substitution_data, function_name)
-- Second round of temporary substitutions, in case any formatting was added by the main substitution process. However, don't do this if the section contains formatting already (as it would have had to have been escaped to reach this stage, and therefore should be given as raw text).
if sub and subbedChars then
local noSub
-- Note that the original section (not the substituted result) is what gets checked against the formatting patterns.
for _, pattern in ipairs(require("Module:languages/data/patterns")) do
if match(section, pattern .. "%z?") then
noSub = true
end
end
if not noSub then
sub, subbedChars = doTempSubstitutions(sub, subbedChars, keepCarets, true)
end
end
-- On a missing result or a reported failure, stop immediately: the whole text becomes this section's result, and the categories collected so far are replaced by this section's categories.
if (not sub) or sub_fail then
text = sub
fail = sub_fail
cats = sub_cats or {}
break
end
-- Swap the first occurrence of the section in the full text for its substituted form. (`sub` is always truthy here, as the falsy case breaks out above.)
text = sub and gsub(text, pe(section), pe(sub), 1) or text
if type(sub_cats) == "table" then
for _, cat in ipairs(sub_cats) do
insert(cats, cat)
end
end
end
end
-- Trim, unless there are only spacing characters, while ignoring any final formatting characters.
-- `text` may be nil at this point (see the failure branch above), hence the guard.
text = text and text:gsub("^([\128-\191\244]*)%s+(%S)", "%1%2")
:gsub("(%S)%s+([\128-\191\244]*)$", "%1%2")
-- Remove duplicate categories.
if #cats > 1 then
cats = remove_duplicates(cats)
end
return text, fail, cats, subbedChars
end
-- Normalize `text` using the script object `sc`: first fix any discouraged sequences, then convert the result to the script's fixed NFD form.
local function normalize(text, sc)
	-- The extra parentheses keep only the first return value of fixDiscouragedSequences, so that nothing else is forwarded to toFixedNFD.
	return sc:toFixedNFD((sc:fixDiscouragedSequences(text)))
end
-- Return `text` with its HTML entities decoded, unless the raw text is listed as an unsupported title, in which case the raw text is returned as-is.
-- The decoding is done first so that the unsupported title list (loaded via `self:loadData`) is only consulted when decoding actually changed something.
local function checkNoEntities(self, text)
	local decoded = decode_entities(text)
	-- Nothing to decode, so there is no need to load the unsupported title list.
	if decoded == text then
		return decoded
	end
	if self:loadData("Module:links/data").unsupported_titles[text] then
		return text
	end
	return decoded
end
-- Put back the formatting that was temporarily swapped out for placeholders.
-- `subbedChars` is the list of original strings; entry number `idx` replaces the placeholder that encodes `idx`.
local function undoTempSubstitutions(text, subbedChars)
	local escape = require("Module:string utilities").pattern_escape
	for idx = 1, #subbedChars do
		-- Rebuild the three index bytes which follow the leading \244 (6 bits of the index each).
		local high = floor(idx / 4096) % 64 + 128
		local mid = char(floor(idx / 64) % 64 + 128)
		local low = char(idx % 64 + 128)
		-- The second byte may carry either of two offsets (plain, or plain + 8), so accept both.
		local placeholder = "\244[" .. char(high) .. char(high + 8) .. "]" .. mid .. low
		text = gsub(text, placeholder, escape(subbedChars[idx]))
	end
	-- Restore any remaining "[[" and "]]" stand-ins.
	text = gsub(text, "\1", "%[%[")
	text = gsub(text, "\2", "%]%]")
	return text
end
-- Return a usable script object for `text`: the supplied `sc` if it is a valid script object other than "None", and otherwise whatever the language object `self` detects as the best script for the text.
local function checkScript(text, self, sc)
	if check_object("script", true, sc) and sc:getCode() ~= "None" then
		return sc
	end
	return self:findBestScript(text)
end
--[==[
Create the form used as a basis for display text and transliteration.
Parameters:
* `text`: the input string.
* `self`: the language object (its `loadData` method and `_rawData.display_text` field are used here).
* `sc`: a script object; if missing, invalid or "None", one is chosen via `checkScript`.
* `keepCarets`: passed through to the temporary substitution steps.
* `keepPrefixes`: if truthy, interwiki link prefixes are left in place.
Returns the processed text, the `fail` flag and category list from the substitution step, and the table of temporarily substituted strings.
]==]
local function processDisplayText(text, self, sc, keepCarets, keepPrefixes)
local subbedChars = {}
-- Swap formatting for placeholders first, so that the steps below leave it alone.
text, subbedChars = doTempSubstitutions(text, subbedChars, keepCarets)
text = decode_uri(text, "PATH")
text = checkNoEntities(self, text)
sc = checkScript(text, self, sc)
local fail, cats
text = normalize(text, sc)
text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, self._rawData.display_text, "makeDisplayText")
text = removeCarets(text, sc)
-- Remove any interwiki link prefixes (unless they have been escaped or this has been disabled).
if find(text, ":") and not keepPrefixes then
-- Shield escaped backslashes preceding a colon as \3, one pair per pass, until nothing changes.
local rep
repeat
text, rep = gsub(text, "\\\\(\\*:)", "\3%1")
until rep == 0
-- Shield escaped colons as \4, so that they are not treated as prefix separators.
text = gsub(text, "\\:", "\4")
while true do
-- Extract everything before the first colon, minus any placeholders. This only matches if something follows the colon; otherwise the text comes back unchanged.
local prefix = gsub(text, "^(.-):.+", function(m1)
return gsub(m1, "\244[\128-\191]*", "")
end)
if not prefix or prefix == text then
break
end
local lower_prefix = ulower(prefix)
-- Stop at the first prefix which is neither a known interwiki prefix nor empty.
if not (self:loadData("Module:data/interwikis")[lower_prefix] or prefix == "") then
break
end
-- Drop the prefix and its colon, but keep any placeholders that were inside the prefix.
text = gsub(text, "^(.-):(.*)", function(m1, m2)
local ret = {}
for subbedChar in gmatch(m1, "\244[\128-\191]*") do
insert(ret, subbedChar)
end
return concat(ret) .. m2
end)
end
-- Restore the shielded backslashes and colons as literal characters.
text = gsub(text, "\3", "\\")
:gsub("\4", ":")
end
return text, fail, cats, subbedChars
end
--[==[
Raises an error describing a missing or invalid code. Never returns normally.
* `lang_code`: the offending value (nil/"" is reported as "missing").
* `param`: the parameter the code came from: a number (reported as an ordinal, e.g. "first parameter"), a string
  (reported as `parameter "name"`), or `true` to omit any mention of the parameter.
* `code_desc`: description of the kind of code; defaults to "language code".
* `template_tag`: optional string, or function returning a string, appended as " (Original template: ...)".
* `not_real_lang`: if truthy, a non-empty string code is always reported as "not valid".
The error is raised at level 3, i.e. attributed to the caller of this function.
]==]
function export.err(lang_code, param, code_desc, template_tag, not_real_lang)
	local ordinals = {
		"first", "second", "third", "fourth", "fifth", "sixth",
		"seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth",
		"thirteenth", "fourteenth", "fifteenth", "sixteenth", "seventeenth",
		"eighteenth", "nineteenth", "twentieth"
	}
	code_desc = code_desc or "language code"
	if not template_tag then
		template_tag = ""
	else
		if type(template_tag) ~= "string" then
			template_tag = template_tag()
		end
		template_tag = " (Original template: " .. template_tag .. ")"
	end
	local function err(msg)
		error(msg .. template_tag, 3)
	end
	local param_type = type(param)
	local in_the_param
	if param == true then
		-- handled specially below
		in_the_param = ""
	else
		if param_type == "number" then
			-- Numbers without a listed ordinal (0, 21+, non-integers) fall back to "parameter N"
			-- rather than failing on a nil concatenation.
			local ordinal = ordinals[param]
			param = ordinal and (ordinal .. " parameter") or ("parameter " .. param)
		elseif param_type == "string" then
			param = 'parameter "' .. param .. '"'
		else
			err("The parameter name is "
				.. (param_type == "table" and "a table" or tostring(param))
				.. ", but it should be a number or a string.")
		end
		in_the_param = " in the " .. param
	end
	if not lang_code or lang_code == "" then
		if param == true then
			err("The " .. code_desc .. " is missing.")
		else
			err("The " .. param .. " (" .. code_desc .. ") is missing.")
		end
	elseif type(lang_code) ~= "string" then
		err("The " .. code_desc .. in_the_param .. " is supposed to be a string but is a " .. type(lang_code) .. ".")
	-- Can use string.find because language codes only contain ASCII.
	elseif not_real_lang or lang_code:find("^%l%l%l?$")
		or lang_code:find("^%l%l%l%-%l%l%l$")
		or lang_code:find("^%l%l%l%-%l%l%l%-%l%l%l$") then
		-- The value has the shape of a code (or shape checking is suppressed), so it is simply unknown.
		err("The " .. code_desc .. " \"" .. lang_code .. "\"" .. in_the_param .. " is not valid.")
	else
		-- The value does not even look like a code.
		err("Please specify a " .. code_desc .. in_the_param .. ". The value \"" .. lang_code .. "\" is not valid.")
	end
end
-- Applies a replacements spec to `text` and returns the result.
-- `replacements.from`/`replacements.to` are parallel lists of ustring patterns and
-- replacements (a missing `to` entry means "delete"); `replacements.remove_diacritics`
-- is a string of characters stripped from the NFD form before recomposing to NFC.
local function do_entry_name_or_sort_key_replacements(text, replacements)
	local patterns = replacements.from
	if patterns then
		for index, pattern in ipairs(patterns) do
			local replacement = replacements.to[index] or ""
			text = mw.ustring.gsub(text, pattern, replacement)
		end
	end
	local diacritics = replacements.remove_diacritics
	if diacritics then
		local decomposed = mw.ustring.toNFD(text)
		local stripped = mw.ustring.gsub(decomposed, '[' .. diacritics .. ']', '')
		text = mw.ustring.toNFC(stripped)
	end
	return text
end
-- Method table for language objects.
local Language = {}

--[==[Returns the code stored on the object (`_code`).]==]
function Language:getCode()
	local code = self._code
	return code
end
--[==[Returns the canonical name: item 1 of the raw data, falling back to its `canonicalName` field.]==]
function Language:getCanonicalName()
	local data = self._rawData
	local name = data[1]
	if not name then
		name = data.canonicalName
	end
	return name
end
--[==[Returns the display form of the language, which here is simply its canonical name.]==]
function Language:getDisplayForm()
	local display_form = self:getCanonicalName()
	return display_form
end
--[==[Returns the raw data's `main_category` value, or "lemma" when none is set.]==]
function Language:getMainCategoryName()
	local category = self._rawData.main_category
	if not category then
		category = "lemma"
	end
	return category
end
--[==[Loads the extra data, then delegates to `getOtherNames` in [[Module:language-like]], passing `onlyOtherNames` through.]==]
function Language:getOtherNames(onlyOtherNames)
	self:loadInExtraData()
	local language_like = require("Module:language-like")
	return language_like.getOtherNames(self, onlyOtherNames)
end
--[==[Loads the extra data and returns its `aliases` value, or a new empty table if there is none.]==]
function Language:getAliases()
	self:loadInExtraData()
	local aliases = self._extraData.aliases
	if aliases then
		return aliases
	end
	return {}
end
--[==[Loads the extra data, then delegates to `getVarieties` in [[Module:language-like]], passing `flatten` through.]==]
function Language:getVarieties(flatten)
	self:loadInExtraData()
	local language_like = require("Module:language-like")
	return language_like.getVarieties(self, flatten)
end
--[==[Returns the raw data's `type` value, or "regular" when none is set.]==]
function Language:getType()
	local lang_type = self._rawData.type
	if not lang_type then
		lang_type = "regular"
	end
	return lang_type
end
--[==[Returns a list of Wikimedia language objects for this language, built on first use and cached on the object.
The codes come from the raw data's `wikimedia_codes`, defaulting to the language's own code.]==]
function Language:getWikimediaLanguages()
	if self._wikimediaLanguageObjects then
		return self._wikimediaLanguageObjects
	end
	local m_wikimedia_languages = require("Module:wikimedia languages")
	local objects = {}
	-- Cache before filling, as the original did.
	self._wikimediaLanguageObjects = objects
	local codes = self._rawData.wikimedia_codes or { self._code }
	for _, wiki_code in ipairs(codes) do
		table.insert(objects, m_wikimedia_languages.getByCode(wiki_code))
	end
	return objects
end
--[==[Returns the title of the English Wikipedia article for the language. Lookup order: the raw data's
`wikipedia_article`; the value cached on the object; the 'enwiki' sitelink of the Wikidata item (when
`mw.wikibase` is available); finally the category name with "Creole language" shortened to "Creole".
The computed value is cached on the object.]==]
function Language:getWikipediaArticle()
	local from_data = self._rawData.wikipedia_article
	if from_data then
		return from_data
	end
	local cached = self._wikipedia_article
	if cached then
		return cached
	end
	local article
	local item = self:getWikidataItem()
	if item and mw.wikibase then
		article = mw.wikibase.sitelink(item, 'enwiki')
	end
	if not article then
		-- Only the first gsub result (the string) is kept.
		article = mw.ustring.gsub(self:getCategoryName(), "Creole language", "Creole")
	end
	self._wikipedia_article = article
	return article
end
--[==[Returns wikitext for a link to the language's Wikipedia article, labelled with the canonical name.]==]
function Language:makeWikipediaLink()
	local article = self:getWikipediaArticle()
	local label = self:getCanonicalName()
	return "[[w:" .. article .. "|" .. label .. "]]"
end
--[==[Returns item 2 of the raw data as a Wikidata ID: a number is prefixed with "Q", anything else is returned unchanged.]==]
function Language:getWikidataItem()
	local item = self._rawData[2]
	if type(item) ~= "number" then
		return item
	end
	return "Q" .. item
end
--[==[Returns a list of script objects, one per code from `getScriptCodes`, built on first use and cached on the object.]==]
function Language:getScripts()
	if self._scriptObjects then
		return self._scriptObjects
	end
	local m_scripts = require("Module:scripts")
	local objects = {}
	-- Cache before filling, as the original did.
	self._scriptObjects = objects
	for _, script_code in ipairs(self:getScriptCodes()) do
		table.insert(objects, m_scripts.getByCode(script_code))
	end
	return objects
end
--[==[Returns the script codes from the raw data: the `scripts` field, else item 4, else a new list containing only "None".]==]
function Language:getScriptCodes()
	local data = self._rawData
	local codes = data.scripts
	if not codes then
		codes = data[4]
	end
	if not codes then
		codes = { "None" }
	end
	return codes
end
--[==[Returns the family object for the language, looked up in [[Module:families]] by the raw data's item 3
(or its `family` field) and cached on the object. Returns the unset cache value when no family code is present.]==]
function Language:getFamily()
	local cached = self._familyObject
	if cached then
		return cached
	end
	local data = self._rawData
	local family_code = data[3] or data.family
	if family_code then
		self._familyObject = require("Module:families").getByCode(family_code)
	end
	return self._familyObject
end
--[==[Returns the family code in the language's data file, or nil if there is none. The code is read from item 3
of the raw data, falling back to the `family` field so that this agrees with `getFamily` and `toJSON`. The result
is cached on the object.]==]
function Language:getFamilyCode()
	local family = self._familyCode
	if family == nil then
		local data = self._rawData
		-- If the value is nil, it's cached as false.
		family = data[3] or data.family or false
		self._familyCode = family
	end
	return family or nil
end
--[==[Returns the canonical name of the language's family, or nil if `getFamily` returns nothing. Cached on the object.]==]
function Language:getFamilyName()
	local name = self._familyName
	if name ~= nil then
		-- A cached false means "no family".
		return name or nil
	end
	local family = self:getFamily()
	name = family and family:getCanonicalName() or false
	self._familyName = name
	return name or nil
end
--[==[Check whether the language belongs to any of the given families. Each argument may be a family code or a
family object (its code is taken with `getCode`). For each argument in turn: a match with the language's own
family code, or the language's family being in that family, yields true. If the language's family has the family
code "qfa-not" (not a real family, e.g. creoles), the language's ancestors are checked instead. Returns false if
the language has no family code or nothing matches.]==]
function Language:inFamily(...)
	--check_object("family", nil, ...)
	local candidates = {...}
	for _, candidate in ipairs(candidates) do
		local wanted = candidate
		if type(wanted) == "table" then
			wanted = wanted:getCode()
		end
		local own_code = self:getFamilyCode()
		if not own_code then
			return false
		end
		if own_code == wanted then
			return true
		end
		local own_family = self:getFamily()
		if own_family:inFamily(wanted) then
			return true
		end
		-- If the family isn't a real family (e.g. creoles) check any ancestors.
		if own_family:getFamilyCode() == "qfa-not" then
			for _, ancestor in ipairs(self:getAncestors()) do
				if ancestor:inFamily(wanted) then
					return true
				end
			end
		end
	end
	return false
end
--[==[Returns the parent object, or nil if there is no parent code. The parent is fetched with `export.getByCode`
(etymology languages and families allowed) and cached on the object.]==]
function Language:getParent()
	local parent = self._parentObject
	if parent ~= nil then
		-- A cached false means "no parent".
		return parent or nil
	end
	local parent_code = self:getParentCode()
	parent = parent_code and export.getByCode(parent_code, nil, true, true, self._useRequire) or false
	self._parentObject = parent
	return parent or nil
end
--[==[Returns the parent code (item 5 of the raw data), or nil if there is none. Cached on the object.]==]
function Language:getParentCode()
	local code = self._parentCode
	if code ~= nil then
		-- A cached false means "no parent code".
		return code or nil
	end
	code = self._rawData[5] or false
	self._parentCode = code
	return code or nil
end
--[==[Returns the canonical name of the parent, or nil if `getParent` returns nothing. Cached on the object.]==]
function Language:getParentName()
	local name = self._parentName
	if name ~= nil then
		-- A cached false means "no parent".
		return name or nil
	end
	local parent = self:getParent()
	name = parent and parent:getCanonicalName() or false
	self._parentName = name
	return name or nil
end
--[==[Given some text, this function iterates through the scripts of a given language and tries to find the script that best matches the text. It returns a {{code|lua|Script}} object representing the script. If no match is found at all, it returns the {{code|lua|None}} script object.
* `text`: the text to examine; nil, "" and "-" immediately yield the None script.
* `forceDetect`: if truthy, character counting is performed even when the language has a single script.]==]
function Language:findBestScript(text, forceDetect)
	local useRequire = self._useRequire
	if not text or text == "" or text == "-" then
		return require("Module:scripts").getByCode("None", nil, nil, useRequire)
	end
	-- Differs from table returned by getScriptCodes, as Hants is not normalized into its constituents.
	-- `codes` is a local (it previously leaked into the global environment), and a missing `scripts`
	-- field now takes the {"None"} fallback instead of failing inside table.concat.
	local raw_scripts = self._rawData["scripts"]
	local codes = raw_scripts and table.concat(raw_scripts, ", ") or nil
	codes = codes and split(codes, ",", true, true) or {"None"}
	self._bestScriptCodes = codes
	local first_sc = raw_scripts and raw_scripts[1] or codes[1]
	if first_sc == "All" then
		return require("Module:scripts").findBestScriptWithoutLang(text)
	end
	local get_script = require("Module:scripts").getByCode
	local codes_len = #codes
	-- Single-script shortcut: accept the script if any character of the text is in its charset.
	if not (forceDetect or first_sc == "Hants" or codes_len > 1) then
		first_sc = get_script(first_sc, nil, nil, useRequire)
		local charset = first_sc.characters
		return charset and umatch(text, "[" .. charset .. "]") and first_sc or
			get_script("None", nil, nil, useRequire)
	end
	-- Remove all formatting characters.
	text = require("Module:utilities").get_plaintext(text)
	-- Remove all spaces and any ASCII punctuation. Some non-ASCII punctuation is script-specific, so can't be removed.
	text = ugsub(text, "[%s!\"#%%&'()*,%-./:;?@[\\%]_{}]+", "")
	if #text == 0 then
		return get_script("None", nil, nil, useRequire)
	end
	-- Try to match every script against the text,
	-- and return the one with the most matching characters.
	local bestcount, bestscript, length = 0
	for i = 1, codes_len do
		local sc = codes[i]
		-- "Hants" is a special code meant to stand for whichever of "Hant" or "Hans" best matches (or "Hani").
		-- NOTE(review): this block only skips "Hants"; no Hant/Hans comparison is performed here. Confirm that is intended.
		if sc ~= "Hants" then
			sc = get_script(sc, nil, nil, useRequire)
			if not length then
				length = ulen(text)
			end
			-- Count characters by removing everything in the script's charset and comparing to the original length.
			local charset = sc.characters
			local count = charset and length - ulen(ugsub(text, "[" .. charset .. "]+", "")) or 0
			if count >= length then
				-- Every character matched; no other script can do better.
				return sc
			elseif count > bestcount then
				bestcount = count
				bestscript = sc
			end
		end
	end
	-- Return best matching script, or otherwise None.
	return bestscript or get_script("None", nil, nil, useRequire)
end
--[==[Returns the list of successive parents (nearest first), obtained by following `getParent` until it
returns nothing. Cached on the object.]==]
function Language:getParentChain()
	local chain = self._parentChain
	if chain ~= nil then
		return chain
	end
	chain = {}
	local count = 0
	local current = self:getParent()
	while current do
		count = count + 1
		chain[count] = current
		current = current:getParent()
	end
	self._parentChain = chain
	return chain
end
--[==[Returns true if any of the arguments (codes as strings, or objects with `getCode`) matches the code of
any member of the language's parent chain; false otherwise.]==]
function Language:hasParent(...)
	--check_object("language", nil, ...)
	for _, otherlang in ipairs{...} do
		for _, parent in ipairs(self:getParentChain()) do
			local wanted = otherlang
			if type(wanted) ~= "string" then
				wanted = wanted:getCode()
			end
			if wanted == parent:getCode() then
				return true
			end
		end
	end
	return false
end
--[==[
Returns the object for the language's full code (see `getFullCode`). If the full code equals the language's own
code, the language itself is returned; otherwise the object is fetched with `export.getByCode`. The result is
cached on the object.
]==]
function Language:getFull()
	local full = self._fullObject
	if full ~= nil then
		return full
	end
	local full_code = self:getFullCode()
	if full_code == self._code then
		full = self
	else
		full = export.getByCode(full_code, nil, nil, nil, self._useRequire)
	end
	self._fullObject = full
	return full
end
--[==[
Returns the code of the corresponding full language: the `_fullCode` stored on the object if there is one,
otherwise the language's own code.
]==]
function Language:getFullCode()
	local full_code = self._fullCode
	if not full_code then
		full_code = self._code
	end
	return full_code
end
--[==[
Returns the canonical name of the object returned by `getFull`. For a full language this is its own canonical
name. The result is cached on the object.
]==]
function Language:getFullName()
	local name = self._fullName
	if name ~= nil then
		return name
	end
	name = self:getFull():getCanonicalName()
	self._fullName = name
	return name
end
--[==[Returns the list of ancestor objects, built on first use and cached on the object. If the raw data lists
`ancestors`, each code is resolved via `export.getByCode`, then [[Module:etymology languages]]. Otherwise the
single ancestor is the proto-language of the nearest enclosing family that has one; the search stops at a
missing family or at family "qfa-not", in which case the list stays empty.]==]
function Language:getAncestors()
	if self._ancestorObjects then
		return self._ancestorObjects
	end
	local ancestors = {}
	self._ancestorObjects = ancestors
	local listed = self._rawData.ancestors
	if listed then
		for _, ancestor_code in ipairs(listed) do
			table.insert(ancestors, export.getByCode(ancestor_code) or require("Module:etymology languages").getByCode(ancestor_code))
		end
		return ancestors
	end
	-- Proto-language of a family, or nil when there is no family.
	local function proto_of(family)
		return family and family:getProtoLanguage() or nil
	end
	local fam = self:getFamily()
	local protoLang = proto_of(fam)
	-- For the case where the current language is the proto-language
	-- of its family, we need to step up a level higher right from the start.
	if protoLang and protoLang:getCode() == self:getCode() then
		fam = fam:getFamily()
		protoLang = proto_of(fam)
	end
	while not protoLang and fam and fam:getCode() ~= "qfa-not" do
		fam = fam:getFamily()
		protoLang = proto_of(fam)
	end
	table.insert(ancestors, protoLang)
	return ancestors
end
-- Depth-first walk over the ancestors of `node` (as returned by node:getAncestors()).
-- Calls `func` on each ancestor before descending into it, and returns the first truthy
-- value produced by `func`; returns nothing if no call produced one.
local function iterateOverAncestorTree(node, func)
	for _, ancestor in ipairs(node:getAncestors()) do
		if ancestor then
			local found = func(ancestor)
			if not found then
				found = iterateOverAncestorTree(ancestor, func)
			end
			if found then
				return found
			end
		end
	end
end
--[==[Generates alternative forms of `text` and returns them as a table. If the raw data names a
`generate_forms` module, that module's `generateForms` is called with the text, the language code and the
script code; otherwise a table containing only the input text is returned.]==]
function Language:generateForms(text, sc)
	local module_name = self._rawData.generate_forms
	if not module_name then
		return {text}
	end
	sc = checkScript(text, self, sc)
	return require("Module:" .. module_name).generateForms(text, self._code, sc:getCode())
end
--[==[Returns the chain of ancestors, oldest first, built by repeatedly stepping to the sole ancestor.
The chain stops at the first object that has zero or more than one ancestor. Cached on the object.]==]
function Language:getAncestorChain()
	if self._ancestorChain then
		return self._ancestorChain
	end
	local chain = {}
	self._ancestorChain = chain
	local current = self
	while true do
		local step = #current:getAncestors() == 1 and current:getAncestors()[1] or nil
		if not step then
			break
		end
		-- Prepend, so that the most distant ancestor ends up first.
		table.insert(chain, 1, step)
		current = step
	end
	return chain
end
--[==[Returns true if some object in the language's ancestor tree has the same code as `otherlang`, else false.]==]
function Language:hasAncestor(otherlang)
	local found = iterateOverAncestorTree(self, function(ancestor)
		return ancestor:getCode() == otherlang:getCode()
	end)
	return found or false
end
--[==[Returns the name used for the language's category: the canonical name with " language" appended unless
it already ends in "language"/"Language". The first letter is upper-cased unless `nocap` is truthy.]==]
function Language:getCategoryName(nocap)
	local name = self:getCanonicalName()
	-- If the name already has "language" in it, don't add it.
	if name:find("[Ll]anguage$") == nil then
		name = name .. " language"
	end
	if nocap then
		return name
	end
	name = mw.getContentLanguage():ucfirst(name)
	return name
end
--[==[Returns wikitext for a link to the language's category, labelled with its display form.]==]
function Language:makeCategoryLink()
	local category = self:getCategoryName()
	local label = self:getDisplayForm()
	return "[[:Category:" .. category .. "|" .. label .. "]]"
end
--[==[Returns the raw data's `standardChars` value (nil if unset).]==]
function Language:getStandardCharacters()
	local standard_chars = self._rawData.standardChars
	return standard_chars
end
--[==[Converts `text` into an entry (page) name. Leading "¿"/"¡" and a single trailing punctuation mark from a
fixed set are stripped, provided something other than whitespace/punctuation remains. For language code "ar", a
lone wasla or a lone diacritic (optionally preceded by a tatweel) is returned as is. Finally, if the raw data's
`entry_name` is a table, its replacements are applied.]==]
function Language:makeEntryName(text)
	local ustr = mw.ustring
	text = ustr.match(text, "^[¿¡]?(.-[^%s%p].-)%s*[؟?!;՛՜ ՞ ՟?!︖︕।॥။၊་།]?$") or text
	if self:getCode() == "ar" then
		local U = ustr.char
		local taTwiil = U(0x640)
		local waSla = U(0x671)
		-- diacritics ordinarily removed by entry_name replacements
		local Arabic_diacritics = U(0x64B, 0x64C, 0x64D, 0x64E, 0x64F, 0x650, 0x651, 0x652, 0x670)
		local lone_diacritic = "^" .. taTwiil .. "?[" .. Arabic_diacritics .. "]" .. "$"
		if text == waSla or ustr.find(text, lone_diacritic) then
			return text
		end
	end
	local entry_name = self._rawData.entry_name
	if type(entry_name) == "table" then
		text = do_entry_name_or_sort_key_replacements(text, entry_name)
	end
	return text
end
-- Returns true if the raw data has a truthy `display` value, i.e. lang:makeDisplayText()
-- may do non-trivial processing; false otherwise.
function Language:hasDisplayProcessing()
	if self._rawData.display then
		return true
	end
	return false
end
--[==[Returns a set (a table of `name = true` entries) of the types the language has: always `language`; `full`
when the language's full code is its own code, otherwise `etymology-only`; plus every comma-separated item of
the raw data's `type` field (default "regular"). The set is cached on the object.]==]
function Language:getTypes()
	local types = self._types
	if types == nil then
		types = {language = true}
		-- Compare against the full code. The previous comparison with getCode() was always true
		-- (getCode returns _code), which made the "etymology-only" branch unreachable.
		if self:getFullCode() == self._code then
			types.full = true
		else
			types["etymology-only"] = true
		end
		-- Named raw_type rather than `type` so the built-in type() is not shadowed.
		local raw_type = self._rawData.type or "regular"
		for t in gmatch(raw_type, "[^,]+") do
			types[t] = true
		end
		self._types = types
	end
	return types
end
--[==[Given any number of type names as strings, returns true if the language has every one of them
(according to `getTypes`), and false as soon as one is missing. With no arguments the result is true.]==]
function Language:hasType(...)
	local wanted, types = {...}, self:getTypes()
	local index, total = 1, #wanted
	while index <= total do
		if not types[wanted[index]] then
			return false
		end
		index = index + 1
	end
	return true
end
-- Returns `text` with the raw data's `display` replacements applied when `display` is a table;
-- otherwise returns `text` unchanged.
function Language:makeDisplayText(text)
	local display = self._rawData.display
	if type(display) ~= "table" then
		return text
	end
	text = do_entry_name_or_sort_key_replacements(text, display)
	return text
end
-- Set of language codes that get special dotted/dotless "i" handling in makeSortKey.
-- TODO: consider moving this into the data tables.
local has_dotted_undotted_i = {
	az = true,
	crh = true,
	gag = true,
	kaa = true,
	tt = true,
	tr = true,
	zza = true,
}
--[==[Builds a sort key from `name`. Steps: for languages listed in `has_dotted_undotted_i`, "I" is first
turned into "ı"; the name is lower-cased; leading hyphens/asterisks are removed; language-specific rules from
the raw data's `sort_key` are applied (a table of replacements, or the name of a module whose `makeSortKey` is
called with the name, the language code and the script code, if `sc` is given); parentheses adjacent to other
characters are removed; for the listed languages "i" becomes "İ"; the result is upper-cased.]==]
function Language:makeSortKey(name, sc)
	local lang_code = self:getCode()
	local dotted = has_dotted_undotted_i[lang_code]
	if dotted then
		name = name:gsub("I", "ı")
	end
	name = mw.ustring.lower(name)
	-- Remove initial hyphens and *
	name = mw.ustring.gsub(name, "^[-־ـ*]+(.)", "%1")
	-- If there are language-specific rules to generate the key, use those
	local sort_key = self._rawData.sort_key
	local sort_key_kind = type(sort_key)
	if sort_key_kind == "table" then
		name = do_entry_name_or_sort_key_replacements(name, sort_key)
	elseif sort_key_kind == "string" then
		name = require("Module:" .. sort_key).makeSortKey(name, lang_code, sc and sc:getCode())
	end
	-- Remove parentheses, as long as they are either preceded or followed by something
	name = mw.ustring.gsub(name, "(.)[()]+", "%1")
	name = mw.ustring.gsub(name, "[()]+(.)", "%1")
	if dotted then
		name = name:gsub("i", "İ")
	end
	return mw.ustring.upper(name)
end
--[==[Returns true if the raw data's `override_translit` is truthy, false otherwise.]==]
function Language:overrideManualTranslit()
	return not not self._rawData.override_translit
end
--[==[Transliterates `text`.
* `text`: the text to transliterate; nil, "" and "-" are returned unchanged.
* `sc`: a script object; replaced via checkScript if missing/invalid/"None".
* `module_override`: optional module to use instead of the raw data's `translit_module`.
Returns three values: the transliteration (or nil), a `fail` flag, and a `cats` value (an empty table on the
early-return paths, otherwise whatever the substitution helpers returned).]==]
function Language:transliterate(text, sc, module_override)
-- If there is no text, or the language doesn't have transliteration data and there's no override, return nil.
if not (self._rawData.translit_module or module_override) then
return nil, false, {}
elseif (not text) or text == "" or text == "-" then
return text, false, {}
end
-- If the script is not transliteratable (and no override is given), return nil.
sc = checkScript(text, self, sc)
if not (sc:isTransliterated() or module_override) then
return nil, true, {}
end
-- Remove any strip markers.
text = mw.text.unstrip(text)
-- Get the display text with the keepCarets flag set.
local fail, cats, subbedChars
text, fail, cats, subbedChars = processDisplayText(text, self, sc, true)
-- Transliterate (using the module override if applicable).
text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, true, self, sc, module_override or self._rawData.translit_module, "tr")
if not text then
return nil, true, cats
end
-- Incomplete transliterations return nil.
local charset = sc.characters
-- Only inspect further if the result still contains a character from the source script's charset.
if charset and umatch(text, "[" .. charset .. "]") then
-- Remove any characters in Latin, which includes Latin characters also included in other scripts (as these are false positives), as well as any PUA substitutions. Anything remaining should only be script code "None" (e.g. numerals).
local check_text = ugsub(text, "[" .. require("Module:scripts").getByCode("Latn").characters .. "-]+", "")
-- Set none_is_last_resort_only flag, so that any non-None chars will cause a script other than "None" to be returned.
if require("Module:scripts").findBestScriptWithoutLang(check_text, true):getCode() ~= "None" then
return nil, true, cats
end
end
text = escape_risky_characters(text)
-- NOTE(review): presumably restores the placeholders recorded in subbedChars; confirm against undoTempSubstitutions.
text = undoTempSubstitutions(text, subbedChars)
-- If the script does not use capitalization, then capitalize any letters of the transliteration which are immediately preceded by a caret (and remove the caret).
if text and not sc:hasCapitalization() and text:find("^", 1, true) then
text = processCarets(text, "%^([\128-\191\244]*%*?)([^\128-\191\244][\128-\191]*)", function(m1, m2)
return m1 .. uupper(m2)
end)
end
-- Track module overrides.
if module_override ~= nil then
track("module_override")
end
-- `fail` is only reported as true when no text was produced; otherwise it is forced to false.
fail = text == nil and (not not fail) or false
return text, fail, cats
end
--[==[Returns true if the raw data has a truthy `translit_module`, false otherwise.]==]
function Language:hasTranslit()
	return not not self._rawData.translit_module
end
--[==[Returns true if the raw data has a truthy `link_tr`, false otherwise.]==]
function Language:link_tr()
	return not not self._rawData.link_tr
end
--[==[Returns a JSON string (via [[Module:JSON]]) describing the language: ancestors, names, code, entry-name
replacement rules, family, other names, aliases, varieties, scripts, type, and Wikimedia/Wikidata identifiers.]==]
function Language:toJSON()
	local raw = self._rawData
	local entry_name = raw.entry_name
	local patterns, remove_diacritics
	if entry_name then
		remove_diacritics = entry_name.remove_diacritics
		local from_list = entry_name.from
		if from_list then
			-- Pair each `from` pattern with its `to` replacement ("" when absent).
			patterns = {}
			for index, from in ipairs(from_list) do
				patterns[#patterns + 1] = { from = from, to = entry_name.to[index] or "" }
			end
		end
	end
	local ret = {
		ancestors = raw.ancestors,
		canonicalName = self:getCanonicalName(),
		categoryName = self:getCategoryName("nocap"),
		code = self._code,
		entryNamePatterns = patterns,
		entryNameRemoveDiacritics = remove_diacritics,
		family = raw[3] or raw.family,
		otherNames = self:getOtherNames(true),
		aliases = self:getAliases(),
		varieties = self:getVarieties(),
		scripts = raw.scripts or raw[4],
		type = self:getType(),
		wikimediaLanguages = raw.wikimedia_codes,
		wikidataItem = self:getWikidataItem(),
	}
	return require("Module:JSON").toJSON(ret)
end
-- Do NOT use the raw-data accessors (getRawData, getRawExtraData)!
-- All uses should be pre-approved on the talk page!
-- Returns the language's raw data table.
function Language:getRawData()
	local raw_data = self._rawData
	return raw_data
end
-- Loads the extra data if necessary and returns the raw extra data table.
function Language:getRawExtraData()
	self:loadInExtraData()
	local extra_data = self._extraData
	return extra_data
end
Language.__index = Language
--[==[Returns the name (without the "Module:" prefix) of the data module that would hold `code`:
"languages/data2" for two lowercase letters, "languages/data3/<first letter>" for three, "languages/datax" for
any other string of lowercase letters and hyphens, and nil for anything else.]==]
function export.getDataModuleName(code)
	if code:find("^%l%l$") then
		return "languages/data2"
	end
	if code:find("^%l%l%l$") then
		local first_letter = code:sub(1, 1)
		return "languages/data3/" .. first_letter
	end
	if code:find("^[%l-]+$") then
		return "languages/datax"
	end
	return nil
end
--[==[Returns the name (without the "Module:" prefix) of the extra-data module that would hold `code`:
"languages/extradata2" for two lowercase letters, "languages/extradata3/<first letter>" for three,
"languages/extradatax" for any other string of lowercase letters and hyphens, and nil for anything else.]==]
function export.getExtraDataModuleName(code)
	if code:find("^%l%l$") then
		return "languages/extradata2"
	end
	if code:find("^%l%l%l$") then
		local first_letter = code:sub(1, 1)
		return "languages/extradata3/" .. first_letter
	end
	if code:find("^[%l-]+$") then
		return "languages/extradatax"
	end
	return nil
end
-- Returns the raw data entry for `code` from its data module, or nil if the code
-- maps to no data module or has no entry there.
local function getRawLanguageData(code)
	local modulename = export.getDataModuleName(code)
	if not modulename then
		return nil
	end
	return mw.loadData("Module:" .. modulename)[code] or nil
end
-- Returns the raw extra-data entry for `code` from its extra-data module, or nil if the
-- code maps to no extra-data module or has no entry there.
local function getRawExtraLanguageData(code)
	local modulename = export.getExtraDataModuleName(code)
	if not modulename then
		return nil
	end
	return mw.loadData("Module:" .. modulename)[code] or nil
end
--[==[Ensures `self._extraData` is populated with the language's extra data (an empty table if there is none).
The data is stored on the object itself. It was previously written to the object's metatable, which for objects
built by `makeObject` is the shared `Language` table, so the first language to load its extra data made that
data visible to (and prevented loading for) every other language object.]==]
function Language:loadInExtraData()
	-- rawget, so that a value inherited through __index (e.g. one left on the shared metatable
	-- by older code) is not mistaken for this object's own data.
	if not rawget(self, "_extraData") then
		-- use empty table as a fallback if extra data is nil
		self._extraData = getRawExtraLanguageData(self._code) or {}
	end
end
--[==[Wraps raw `data` for `code` in a language object (a table with `_rawData` and `_code`, using `Language`
as its metatable). Returns nil when `data` is nil or false.]==]
function export.makeObject(code, data)
	if not data then
		return nil
	end
	return setmetatable({ _rawData = data, _code = code }, Language)
end
--[==[Finds the object for `code`. Regular language data is tried first; if nothing is found, etymology
languages are tried when `allowEtymLang` is truthy, then families when `allowFamily` is truthy. If nothing is
found and `paramForError` is given, `export.err` is called with it (raising an error); otherwise the (empty)
result is returned. Raises an error if `code` is not a string.]==]
function export.getByCode(code, paramForError, allowEtymLang, allowFamily)
	if type(code) ~= "string" then
		error("The function getByCode expects a string as its first argument, but received " .. (code == nil and "nil" or "a " .. type(code)) .. ".")
	end
	local found = export.makeObject(code, getRawLanguageData(code))
	if found then
		return found
	end
	if allowEtymLang then
		found = require("Module:etymology languages").getByCode(code)
	end
	if not found and allowFamily then
		found = require("Module:families").getByCode(code)
	end
	if found or not paramForError then
		return found
	end
	-- Describe the kinds of code that were acceptable, for the error message.
	local codetext
	if allowEtymLang then
		if allowFamily then
			codetext = "language, etymology language or family code"
		else
			codetext = "language or etymology language code"
		end
	elseif allowFamily then
		codetext = "language or family code"
	else
		codetext = "language code"
	end
	export.err(code, paramForError, codetext)
	return found
end
--[==[Finds the language object for `name` using [[Module:languages/by name]] (its `all` sub-table when
present, otherwise the top-level table). If the name is unknown, raises an error when `errorIfInvalid` is
truthy and returns nil otherwise.]==]
function export.getByName(name, errorIfInvalid)
	local by_name = mw.loadData("Module:languages/by name")
	local code = by_name.all and by_name.all[name] or by_name[name]
	if code then
		return export.makeObject(code, getRawLanguageData(code))
	end
	if errorIfInvalid then
		error("The language name \"" .. name .. "\" is not valid.")
	end
	return nil
end
--[==[Finds the object whose canonical name is `name`. Regular languages are looked up in
[[Module:languages/canonical names]]; if nothing is found, etymology languages are tried when `allowEtymLang`
is truthy, then families when `allowFamily` is truthy (a trailing " languages" is dropped from the name for the
family lookup). If nothing is found, raises an error when `errorIfInvalid` is truthy; otherwise returns the
(empty) result.]==]
function export.getByCanonicalName(name, errorIfInvalid, allowEtymLang, allowFamily)
	local canonical_names = mw.loadData("Module:languages/canonical names")
	local code = canonical_names and canonical_names[name]
	local found = code and export.makeObject(code, getRawLanguageData(code)) or nil
	if not found and allowEtymLang then
		found = require("Module:etymology languages").getByCanonicalName(name)
	end
	if not found and allowFamily then
		local family_name = name:match("^(.*) languages$") or name
		found = require("Module:families").getByCanonicalName(family_name)
	end
	if found or not errorIfInvalid then
		return found
	end
	-- Describe the kinds of name that were acceptable, for the error message.
	local description
	if allowEtymLang then
		if allowFamily then
			description = "language, etymology language or family name"
		else
			description = "language or etymology language name"
		end
	elseif allowFamily then
		description = "language or family name"
	else
		description = "language name"
	end
	error("The " .. description .. " \"" .. name .. "\" is not valid.")
end
--[==[Returns an iterator function over every entry of [[Module:languages/alldata]]. Each call returns the
next language object, and nil once all entries have been visited, so it can be used as
`for lang in export.iterateAll() do ... end`. Increments the expensive function count.]==]
function export.iterateAll()
	mw.incrementExpensiveFunctionCount()
	local m_data = mw.loadData("Module:languages/alldata")
	local func, t, var = pairs(m_data)
	return function()
		local code, data = func(t, var)
		-- Advance the control variable. Without this, every call restarted from the initial
		-- state and returned the first entry forever.
		var = code
		return export.makeObject(code, data)
	end
end
--[[ If language is an etymology language, iterates through parent languages
	until it finds a non-etymology language. Each parent code is resolved as a
	language, then an etymology language, then a family.
	Returns nil if `lang` is nil or if the chain is broken (an etymology language
	with no parent code, or a parent code that cannot be resolved), instead of
	failing with a nil-index error. ]]
function export.getNonEtymological(lang)
	while lang and lang:getType() == "etymology language" do
		local parentCode = lang:getParentCode()
		if not parentCode then
			-- getByCode would raise on a nil code; treat a missing parent as "not found".
			return nil
		end
		lang = export.getByCode(parentCode)
			or require("Module:etymology languages").getByCode(parentCode)
			or require("Module:families").getByCode(parentCode)
	end
	return lang
end
return export