Module:languages: Difference between revisions

From Linguifex
Jump to navigation Jump to search
(Undo revision 320509 by Sware (talk))
Tags: Undo Reverted
No edit summary
 
(28 intermediate revisions by 2 users not shown)
Line 1: Line 1:
local require = require
local m_str_utils = require("Module:string utilities")
local m_table = require("Module:table")
local mw = mw
local string = string
local table = table
local ustring = mw.ustring
local char = string.char
local check_object = require("Module:utilities").check_object
local concat = table.concat
local decode_entities = m_str_utils.decode_entities
local decode_uri = m_str_utils.decode_uri
local find = string.find
local floor = math.floor
local gmatch = string.gmatch
local gsub = string.gsub
local insert = table.insert
local ipairs = ipairs
local list_to_set = m_table.listToSet
local load_data = mw.loadData
local match = string.match
local next = next
local pairs = pairs
local remove = table.remove
local remove_duplicates = m_table.removeDuplicates
local select = select
local setmetatable = setmetatable
local shallowcopy = m_table.shallowcopy
local split = m_str_utils.split
local sub = string.sub
local type = type
local ugsub = ustring.gsub
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local umatch = ustring.match
local uupper = m_str_utils.upper
local export = {}
local export = {}


local function track(page, code)
--[=[
local tracking_page = "languages/" .. page
Throw an error for an invalid language code or script code.
if code then
 
require("Module:debug/track"){tracking_page, tracking_page .. "/" .. code}
`lang_code` (required) is the bad code and can be nil or a non-string.
 
`param` (required) is the name of the parameter in which the code was contained. It can be a string, a number
(for a numeric param, in which case the param will show up in the error message as an ordinal such as
"first" or "second"), or `true` if no parameter can be clearly identified.
 
`code_desc` (optional) is text describing what the code is; by default, "language code".
 
`template_text` (optional) is a string specifying the template that generated the error, or a function
to generate this string. If given, it will be displayed in the error message.
 
`not_real_lang` (optional), if given, indicates that the code is not in the form of a language code
(e.g. it's a script code). Normally, this function checks for things that could plausibly be a language code:
two or three lowercase letters, two or three groups of three lowercase letters with hyphens between them.
If such a pattern is found, a different error message is displayed (indicating an invalid code) than otherwise
(indicating a missing code). If `not_real_lang` is given, this check is suppressed.
]=]
 
-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.).
local function escape_risky_characters(text)
local encode_entities = require("Module:string/encode entities")
-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software.
if umatch(text, "^%s*$") then
return encode_entities(text, text)
else
else
require("Module:debug/track")(tracking_page)
return encode_entities(text, "!#%&*+/:;<=>?@[\\]_{|}")
end
end
return true
end
end


local checkObject = require("Module:utilities").check_object
-- Process carets (and any escapes). Default to simple removal, if no pattern/replacement is given.
local function processCarets(text, pattern, repl)
local rep
repeat
text, rep = gsub(text, "\\\\(\\*^)", "\3%1")
until rep == 0
return text:gsub("\\^", "\4")
:gsub(pattern or "%^", repl or "")
:gsub("\3", "\\")
:gsub("\4", "^")
end


local function make_language(code, data, useRequire)
-- Remove carets if they are used to capitalize parts of transliterations (unless they have been escaped).
local function conditionalRequire(modulename)
local function removeCarets(text, sc)
if useRequire then
if not sc:hasCapitalization() and sc:isTransliterated() and text:find("^", 1, true) then
return require(modulename)
return processCarets(text)
else
else
return mw.loadData(modulename)
end
end
-- Temporarily convert various formatting characters to PUA to prevent them from being disrupted by the substitution process.
local function doTempSubstitutions(text, subbedChars, keepCarets, noTrim)
-- Clone so that we don't insert any extra patterns into the table in package.loaded. For some reason, using require seems to keep memory use down; probably because the table is always cloned.
local patterns = require("Module:table").shallowcopy(require("Module:languages/data/patterns"))
if keepCarets then
table.insert(patterns, "((\\+)%^)")
table.insert(patterns, "((%^))")
end
-- Ensure any whitespace at the beginning and end is temp substituted, to prevent it from being accidentally trimmed. We only want to trim any final spaces added during the substitution process (e.g. by a module), which means we only do this during the first round of temp substitutions.
if not noTrim then
table.insert(patterns, "^([\128-\191\244]*(%s+))")
table.insert(patterns, "((%s+)[\128-\191\244]*)$")
end
-- Pre-substitution, of "[[" and "]]", which makes pattern matching more accurate.
text = text
:gsub("%f[%[]%[%[", "\1")
:gsub("%f[%]]%]%]", "\2")
local i, pe = #subbedChars, require("Module:utilities").pattern_escape
for j, pattern in ipairs(patterns) do
-- Patterns ending in \0 stand are for things like "[[" or "]]"), so the inserted PUA are treated as breaks between terms by modules that scrape info from pages.
local term_divider
pattern = pattern:gsub("%z$", function(divider)
term_divider = divider == "\0"
return ""
end)
text = text:gsub(pattern, function(...)
local m = {...}
local m1New = m[1]
for k = 2, #m do
local n = i + k - 1
subbedChars[n] = m[k]
local byte2 = math.floor(n / 4096) % 64 + (term_divider and 128 or 136)
local byte3 = math.floor(n / 64) % 64 + 128
local byte4 = n % 64 + 128
m1New = m1New:gsub(pe(m[k]), "\244" .. string.char(byte2) .. string.char(byte3) .. string.char(byte4), 1)
end
i = i + #m - 1
return m1New
end)
end
text = text
:gsub("\1", "%[%[")
:gsub("\2", "%]%]")
return text, subbedChars
end
-- Reinsert any formatting that was temporarily substituted.
local function undoTempSubstitutions(text, subbedChars)
local pe = require("Module:utilities").pattern_escape
for i = 1, #subbedChars do
local byte2 = math.floor(i / 4096) % 64 + 128
local byte3 = math.floor(i / 64) % 64 + 128
local byte4 = i % 64 + 128
text = text:gsub("\244[" .. string.char(byte2) .. string.char(byte2+8) .. "]" .. string.char(byte3) .. string.char(byte4), pe(subbedChars[i]))
end
text = text
:gsub("\1", "%[%[")
:gsub("\2", "%]%]")
return text
return text
end
end
end
-- Convert any HTML entities.
 
local function noEntities(text)
-- Temporarily convert various formatting characters to PUA to prevent them from being disrupted by the substitution process.
if text:match("&[^;]+;") then
local function doTempSubstitutions(text, subbedChars, keepCarets, noTrim)
return require("Module:utilities").get_entities(text)
-- Clone so that we don't insert any extra patterns into the table in package.loaded. For some reason, using require seems to keep memory use down; probably because the table is always cloned.
else
local patterns = shallowcopy(require("Module:languages/data/patterns"))
return text
if keepCarets then
end
insert(patterns, "((\\+)%^)")
insert(patterns, "((%^))")
end
end
-- Ensure any whitespace at the beginning and end is temp substituted, to prevent it from being accidentally trimmed. We only want to trim any final spaces added during the substitution process (e.g. by a module), which means we only do this during the first round of temp substitutions.
-- Check if the raw text is an unsupported title, and if so return that. Otherwise, remove HTML entities. We do the pre-conversion to avoid loading the unsupported title list unnecessarily.
if not noTrim then
local function checkNoEntities(text)
insert(patterns, "^([\128-\191\244]*(%s+))")
local textNoEnc = noEntities(text)
insert(patterns, "((%s+)[\128-\191\244]*)$")
if textNoEnc ~= text and conditionalRequire("Module:links/data").unsupported_titles[text] then
return text
else
return textNoEnc
end
end
end
-- Pre-substitution, of "[[" and "]]", which makes pattern matching more accurate.
-- If no script object is provided (or if it's invalid or None), get one.
text = gsub(text, "%f[%[]%[%[", "\1")
local function checkScript(text, self, sc)
:gsub("%f[%]]%]%]", "\2")
if not checkObject("script", true, sc) or sc:getCode() == "None" then
local i, pe = #subbedChars, require("Module:string utilities").pattern_escape
return self:findBestScript(text)
for _, pattern in ipairs(patterns) do
else
-- Patterns ending in \0 stand are for things like "[[" or "]]"), so the inserted PUA are treated as breaks between terms by modules that scrape info from pages.
return sc
local term_divider
end
pattern = gsub(pattern, "%z$", function(divider)
term_divider = divider == "\0"
return ""
end)
text = gsub(text, pattern, function(...)
local m = {...}
local m1New = m[1]
for k = 2, #m do
local n = i + k - 1
subbedChars[n] = m[k]
local byte2 = floor(n / 4096) % 64 + (term_divider and 128 or 136)
local byte3 = floor(n / 64) % 64 + 128
local byte4 = n % 64 + 128
m1New = gsub(m1New, pe(m[k]), "\244" .. char(byte2) .. char(byte3) .. char(byte4), 1)
end
i = i + #m - 1
return m1New
end)
end
end
text = gsub(text, "\1", "%[%[")
local function normalize(text, sc)
:gsub("\2", "%]%]")
text = sc:fixDiscouragedSequences(text)
return text, subbedChars
return sc:toFixedNFD(text)
end
 
-- Split the text into sections, based on the presence of temporarily substituted formatting characters, then iterate over each one to apply substitutions. This avoids putting PUA characters through language-specific modules, which may be unequipped for them.
local function iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, substitution_data, function_name)
local pe = require("Module:string utilities").pattern_escape
local fail, cats, sections = nil, {}
-- See [[Module:languages/data]].
if not find(text, "\244") or self:loadData("Module:languages/data").contiguous_substitution[self._code] then
sections = {text}
else
sections = split(text, "\244[\128-\143][\128-\191]*", true)
end
end
for _, section in ipairs(sections) do
-- Split the text into sections, based on the presence of temporarily substituted formatting characters, then iterate over each one to apply substitutions. This avoids putting PUA characters through language-specific modules, which may be unequipped for them.
-- Don't bother processing empty strings or whitespace (which may also not be handled well by dedicated modules).
local function iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, substitution_data, function_name)
if gsub(section, "%s+", "") ~= "" then
local pe = require("Module:utilities").pattern_escape
local sub, sub_fail, sub_cats = require("Module:languages/doSubstitutions")(section, self, sc, substitution_data, function_name)
local fail, cats, sections = nil, {}
-- Second round of temporary substitutions, in case any formatting was added by the main substitution process. However, don't do this if the section contains formatting already (as it would have had to have been escaped to reach this stage, and therefore should be given as raw text).
-- See [[Module:languages/data]].
if sub and subbedChars then
if not text:match("\244") or conditionalRequire("Module:languages/data").contiguous_substitution[self:getCode()] then
local noSub
sections = {text}
for _, pattern in ipairs(require("Module:languages/data/patterns")) do
else
if match(section, pattern .. "%z?") then
sections = mw.text.split(text, "[􀀀-􏿽]")
noSub = true
end
for i, section in ipairs(sections) do
-- Don't bother processing empty strings or whitespace (which may also not be handled well by dedicated modules).
if section:gsub("%s", "") ~= "" then
local sub, sub_fail, sub_cats = require("Module:languages/doSubstitutions")(section, self, sc, substitution_data, function_name)
-- Second round of temporary substitutions, in case any formatting was added by the main substitution process. However, don't do this if the section contains formatting already (as it would have had to have been escaped to reach this stage, and therefore should be given as raw text).
if sub and subbedChars then
local noSub
for _, pattern in ipairs(require("Module:languages/data/patterns")) do
if section:match(pattern .. "%z?") then
noSub = true
end
end
if not noSub then
sub, subbedChars = doTempSubstitutions(sub, subbedChars, keepCarets, true)
end
end
end
end
if (not sub) or sub_fail then
if not noSub then
text = sub
sub, subbedChars = doTempSubstitutions(sub, subbedChars, keepCarets, true)
fail = sub_fail
cats = sub_cats or {}
break
end
end
text = sub and text:gsub(pe(section), pe(sub), 1) or text
end
if type(sub_cats) == "table" then
if (not sub) or sub_fail then
for _, cat in ipairs(sub_cats) do
text = sub
table.insert(cats, cat)
fail = sub_fail
end
cats = sub_cats or {}
break
end
text = sub and gsub(text, pe(section), pe(sub), 1) or text
if type(sub_cats) == "table" then
for _, cat in ipairs(sub_cats) do
insert(cats, cat)
end
end
end
end
end
end
-- Trim, unless there are only spacing characters, while ignoring any final formatting characters.
text = text and text
:gsub("^([\128-\191\244]*)%s+(%S)", "%1%2")
:gsub("(%S)%s+([\128-\191\244]*)$", "%1%2")
-- Remove duplicate categories.
if #cats > 1 then
cats = require("Module:table").removeDuplicates(cats)
end
return text, fail, cats, subbedChars
end
end
 
-- Process carets (and any escapes). Default to simple removal, if no pattern/replacement is given.
-- Trim, unless there are only spacing characters, while ignoring any final formatting characters.
local function processCarets(text, pattern, repl)
text = text and text:gsub("^([\128-\191\244]*)%s+(%S)", "%1%2")
:gsub("(%S)%s+([\128-\191\244]*)$", "%1%2")
 
-- Remove duplicate categories.
if #cats > 1 then
cats = remove_duplicates(cats)
end
 
return text, fail, cats, subbedChars
end
 
local function normalize(text, sc)
text = sc:fixDiscouragedSequences(text)
return sc:toFixedNFD(text)
end
 
-- Check if the raw text is an unsupported title, and if so return that. Otherwise, remove HTML entities. We do the pre-conversion to avoid loading the unsupported title list unnecessarily.
local function checkNoEntities(self, text)
local textNoEnc = decode_entities(text)
if textNoEnc ~= text and self:loadData("Module:links/data").unsupported_titles[text] then
return text
else
return textNoEnc
end
end
 
-- Reinsert any formatting that was temporarily substituted.
local function undoTempSubstitutions(text, subbedChars)
local pe = require("Module:string utilities").pattern_escape
for i = 1, #subbedChars do
local byte2 = floor(i / 4096) % 64 + 128
local byte3 = floor(i / 64) % 64 + 128
local byte4 = i % 64 + 128
text = gsub(text, "\244[" .. char(byte2) .. char(byte2+8) .. "]" .. char(byte3) .. char(byte4), pe(subbedChars[i]))
end
text = gsub(text, "\1", "%[%[")
:gsub("\2", "%]%]")
return text
end
 
-- If no script object is provided (or if it's invalid or None), get one.
local function checkScript(text, self, sc)
if not check_object("script", true, sc) or sc:getCode() == "None" then
return self:findBestScript(text)
else
return sc
end
end
 
--[==[Create the form used as as a basis for display text and transliteration.]==]
local function processDisplayText(text, self, sc, keepCarets, keepPrefixes)
local subbedChars = {}
text, subbedChars = doTempSubstitutions(text, subbedChars, keepCarets)
 
text = decode_uri(text, "PATH")
text = checkNoEntities(self, text)
 
sc = checkScript(text, self, sc)
local fail, cats
text = normalize(text, sc)
text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, self._rawData.display_text, "makeDisplayText")
 
text = removeCarets(text, sc)
 
-- Remove any interwiki link prefixes (unless they have been escaped or this has been disabled).
if find(text, ":") and not keepPrefixes then
local rep
local rep
repeat
repeat
text, rep = text:gsub("\\\\(\\*^)", "\3%1")
text, rep = gsub(text, "\\\\(\\*:)", "\3%1")
until rep == 0
until rep == 0
return text
text = gsub(text, "\\:", "\4")
:gsub("\\^", "\4")
while true do
:gsub(pattern or "%^", repl or "")
local prefix = gsub(text, "^(.-):.+", function(m1)
:gsub("\3", "\\")
return gsub(m1, "\244[\128-\191]*", "")
:gsub("\4", "^")
end)
end
if not prefix or prefix == text then
break
-- Remove carets if they are used to capitalize parts of transliterations (unless they have been escaped).
end
local function removeCarets(text, sc)
local lower_prefix = ulower(prefix)
if not sc:hasCapitalization() and sc:isTransliterated() and text:match("%^") then
if not (self:loadData("Module:data/interwikis")[lower_prefix] or prefix == "") then
return processCarets(text)
break
else
end
return text
text = gsub(text, "^(.-):(.*)", function(m1, m2)
local ret = {}
for subbedChar in gmatch(m1, "\244[\128-\191]*") do
insert(ret, subbedChar)
end
return concat(ret) .. m2
end)
end
end
text = gsub(text, "\3", "\\")
:gsub("\4", ":")
end
end
return text, fail, cats, subbedChars
end
function export.err(lang_code, param, code_desc, template_tag, not_real_lang)
local ordinals = {
"first", "second", "third", "fourth", "fifth", "sixth",
"seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth",
"thirteenth", "fourteenth", "fifteenth", "sixteenth", "seventeenth",
"eighteenth", "nineteenth", "twentieth"
}
local Language = {}
code_desc = code_desc or "language code"
--[==[Returns the language code of the language. Example: {{code|lua|"fr"}} for French.]==]
if not template_tag then
function Language:getCode()
template_tag = ""
return self._code
else
if type(template_tag) ~= "string" then
template_tag = template_tag()
end
template_tag = " (Original template: " .. template_tag .. ")"
end
end
local function err(msg)
--[==[Returns the canonical name of the language. This is the name used to represent that language on Wiktionary, and is guaranteed to be unique to that language alone. Example: {{code|lua|"French"}} for French.]==]
error(msg .. template_tag, 3)
function Language:getCanonicalName()
return self._rawData[1]
end
end
local param_type = type(param)
--[==[Returns the display form of the language. The display form of a language, family or script is the form it takes when appearing as the ''SOURCE'' in categories such as <code>English terms derived from ''SOURCE''</code> or <code>English given names from ''SOURCE''</code>, and is also the displayed text in <code>:makeCategoryLink</code> links. For regular and etymology languages, this is the same as the canonical name, but for families, it reads "NAME languages" (e.g. {{code|lua|"Indo-Iranian languages"}}), and for scripts, it reads "NAME script" (e.g. {{code|lua|"Arabic script"}}).]==]
local in_the_param
function Language:getDisplayForm()
if param == true then
if not self._displayForm then
-- handled specially below
local form = self:getCanonicalName()
in_the_param = ""
-- Add article and " substrate" if a substrate that lacks them.
else
if self:getFamilyCode() == "qfa-sub" then
if param_type == "number" then
if not (form:find("^[Tt]he ") or form:find("^[Aa] ")) then
param = ordinals[param] .. " parameter"
form = "a " .. form
elseif param_type == "string" then
end
param = 'parameter "' .. param .. '"'
if not form:find("[Ss]ubstrate") then
else
form = form .. " substrate"
err("The parameter name is "
end
.. (param_type == "table" and "a table" or tostring(param))
end
.. ", but it should be a number or a string.")
self._displayForm = form
end
end
return self._displayForm
in_the_param = " in the " .. param
end
end
--[==[Returns a table of the "other names" that the language is known by, excluding the canonical name. The names are not guaranteed to be unique, in that sometimes more than one language is known by the same name. Example: {{code|lua|{"Manx Gaelic", "Northern Manx", "Southern Manx"} }} for [[:Category:Manx language|Manx]]. If <code>onlyOtherNames</code> is given and is non-{{code|lua|nil}}, only names explicitly listed in the <code>otherNames</code> field are returned; otherwise, names listed under <code>otherNames</code>, <code>aliases</code> and <code>varieties</code> are combined together and returned. For example, for Manx, Manx Gaelic is listed as an alias, while Northern Manx and Southern Manx are listed as varieties. It should be noted that the <code>otherNames</code> field itself is deprecated, and entries listed there should eventually be moved to either <code>aliases</code> or <code>varieties</code>.]==]
if not lang_code or lang_code == "" then
function Language:getOtherNames(onlyOtherNames)
if param == true then
if #self._stack == 1 then
err("The " .. code_desc .. " is missing.")
self:loadInExtraData()
else
err("The " .. param .. " (" .. code_desc .. ") is missing.")
end
end
return require("Module:language-like").getOtherNames(self, onlyOtherNames)
elseif type(lang_code) ~= "string" then
err("The " .. code_desc .. in_the_param .. " is supposed to be a string but is a " .. type(lang_code) .. ".")
-- Can use string.find because language codes only contain ASCII.
elseif not_real_lang or lang_code:find("^%l%l%l?$")
or lang_code:find("^%l%l%l%-%l%l%l$")
or lang_code:find("^%l%l%l%-%l%l%l%-%l%l%l$") then
err("The " .. code_desc .. " \"" .. lang_code .. "\"" .. in_the_param .. " is not valid.")
else
err("Please specify a " .. code_desc .. in_the_param .. ". The value \"" .. lang_code .. "\" is not valid.")
end
end
end
--[==[Returns a table of the aliases that the language is known by, excluding the canonical name. Aliases are synonyms for the language in question. The names are not guaranteed to be unique, in that sometimes more than one language is known by the same name. Example: {{code|lua|{"High German", "New High German", "Deutsch"} }} for [[:Category:German language|German]].]==]
 
function Language:getAliases()
local function do_entry_name_or_sort_key_replacements(text, replacements)
if #self._stack == 1 then
if replacements.from then
self:loadInExtraData()
for i, from in ipairs(replacements.from) do
local to = replacements.to[i] or ""
text = mw.ustring.gsub(text, from, to)
end
end
return self._rawData.aliases or (self._extraData and self._extraData.aliases) or {}
end
end
--[==[Returns a table of the known subvarieties of a given language, excluding subvarieties that have been given explicit etymology language codes. The names are not guaranteed to be unique, in that sometimes a given name refers to a subvariety of more than one language. Example: {{code|lua|{"Southern Aymara", "Central Aymara"} }} for [[:Category:Aymara language|Aymara]]. Note that the returned value can have nested tables in it, when a subvariety goes by more than one name. Example: {{code|lua|{"North Azerbaijani", "South Azerbaijani", {"Afshar", "Afshari", "Afshar Azerbaijani", "Afchar"}, {"Qashqa'i", "Qashqai", "Kashkay"}, "Sonqor"} }} for [[:Category:Azerbaijani language|Azerbaijani]]. Here, for example, Afshar, Afshari, Afshar Azerbaijani and Afchar all refer to the same subvariety, whose preferred name is Afshar (the one listed first). To avoid a return value with nested tables in it, specify a non-{{code|lua|nil}} value for the <code>flatten</code> parameter; in that case, the return value would be {{code|lua|{"North Azerbaijani", "South Azerbaijani", "Afshar", "Afshari", "Afshar Azerbaijani", "Afchar", "Qashqa'i", "Qashqai", "Kashkay", "Sonqor"} }}.]==]
if replacements.remove_diacritics then
function Language:getVarieties(flatten)
text = mw.ustring.toNFD(text)
if #self._stack == 1 then
text = mw.ustring.gsub(text,
self:loadInExtraData()
'[' .. replacements.remove_diacritics .. ']',
end
'')
return require("Module:language-like").getVarieties(self, flatten)
text = mw.ustring.toNFC(text)
end
end
--[==[Given a list of types as strings, returns true if the language has all of them. Possible types are explained in [[Module:languages/data/2]] and [[Module:etymology languages/data]].]==]
return text
function Language:hasType(...)
end
if not self._type then
 
self._type = {language = true}
local Language = {}
if self:getNonEtymologicalCode() == self:getCode() then
 
self._type.full = true
function Language:getCode()
else
return self._code
self._type["etymology-only"] = true
end
end
 
for _, type in ipairs(mw.text.split(self._rawData.type, "%s*,%s*")) do
 
self._type[type] = true
function Language:getCanonicalName()
end
return self._rawData[1] or self._rawData.canonicalName
end
 
 
function Language:getDisplayForm()
return self:getCanonicalName()
end
 
function Language:getMainCategoryName()
return self._rawData["main_category"] or "lemma"
end
 
function Language:getOtherNames(onlyOtherNames)
self:loadInExtraData()
return require("Module:language-like").getOtherNames(self, onlyOtherNames)
end
 
 
function Language:getAliases()
self:loadInExtraData()
return self._extraData.aliases or {}
end
 
 
function Language:getVarieties(flatten)
self:loadInExtraData()
return require("Module:language-like").getVarieties(self, flatten)
end
 
 
function Language:getType()
return self._rawData.type or "regular"
end
 
 
function Language:getWikimediaLanguages()
if not self._wikimediaLanguageObjects then
local m_wikimedia_languages = require("Module:wikimedia languages")
self._wikimediaLanguageObjects = {}
local wikimedia_codes = self._rawData.wikimedia_codes or { self._code }
for _, wlangcode in ipairs(wikimedia_codes) do
table.insert(self._wikimediaLanguageObjects, m_wikimedia_languages.getByCode(wlangcode))
end
end
for _, type in ipairs{...} do
if not self._type[type] then
return false
end
end
return true
end
end
--[==[Returns a table containing <code>WikimediaLanguage</code> objects (see [[Module:wikimedia languages]]), which represent languages and their codes as they are used in Wikimedia projects for interwiki linking and such. More than one object may be returned, as a single Wiktionary language may correspond to multiple Wikimedia languages. For example, Wiktionary's single code <code>sh</code> (Serbo-Croatian) maps to four Wikimedia codes: <code>sh</code> (Serbo-Croatian), <code>bs</code> (Bosnian), <code>hr</code> (Croatian) and <code>sr</code> (Serbian).
return self._wikimediaLanguageObjects
The code for the Wikimedia language is retrieved from the <code>wikimedia_codes</code> property in the data modules. If that property is not present, the code of the current language is used. If none of the available codes is actually a valid Wikimedia code, an empty table is returned.]==]
end
function Language:getWikimediaLanguages()
 
if not self._wikimediaLanguageObjects then
function Language:getWikipediaArticle()
local m_wikimedia_languages = require("Module:wikimedia languages")
if self._rawData.wikipedia_article then
self._wikimediaLanguageObjects = {}
return self._rawData.wikipedia_article
local wikimedia_codes = self:getWikimediaLanguageCodes()
elseif self._wikipedia_article then
for _, wlangcode in ipairs(wikimedia_codes) do
return self._wikipedia_article
table.insert(self._wikimediaLanguageObjects, m_wikimedia_languages.getByCode(wlangcode))
elseif self:getWikidataItem() and mw.wikibase then
end
self._wikipedia_article = mw.wikibase.sitelink(self:getWikidataItem(), 'enwiki')
end
end
return self._wikimediaLanguageObjects
if not self._wikipedia_article then
self._wikipedia_article = mw.ustring.gsub(self:getCategoryName(), "Creole language", "Creole")
end
end
return self._wikipedia_article
end
function Language:makeWikipediaLink()
return "[[w:" .. self:getWikipediaArticle() .. "|" .. self:getCanonicalName() .. "]]"
end
function Language:getWikidataItem()
local item = self._rawData[2]
function Language:getWikimediaLanguageCodes()
if type(item) == "number" then
if not self._wikimediaLanguageCodes then
return "Q" .. item
self._wikimediaLanguageCodes = self._rawData.wikimedia_codes or {self:getCode()}
else
end
return item
return self._wikimediaLanguageCodes
end
end
end
--[==[Returns the name of the Wikipedia article for the language. If the property <code>wikipedia_article</code> is present in the data module it will be used first, otherwise a sitelink will be generated from <code>:getWikidataItem</code> (if set). Otherwise <code>:getCategoryName</code> is used as fallback.]==]
 
function Language:getWikipediaArticle()
function Language:getScripts()
if not self._wikipedia_article then
if not self._scriptObjects then
if self._rawData.wikipedia_article then
local m_scripts = require("Module:scripts")
self._wikipedia_article = self._rawData.wikipedia_article
self._scriptObjects = {}
elseif self:getWikidataItem() and mw.wikibase then
self._wikipedia_article = mw.wikibase.sitelink(self:getWikidataItem(), 'enwiki')
for _, sc in ipairs(self:getScriptCodes()) do
end
table.insert(self._scriptObjects, m_scripts.getByCode(sc))
if not self._wikipedia_article then
self._wikipedia_article = self:getCategoryName():gsub("Creole language", "Creole")
end
end
end
return self._wikipedia_article
end
end
function Language:makeWikipediaLink()
return self._scriptObjects
return "[[w:" .. self:getWikipediaArticle() .. "|" .. self:getCanonicalName() .. "]]"
end
 
function Language:getScriptCodes()
return self._rawData.scripts or self._rawData[4] or { "None" }
end
 
function Language:getFamily()
if self._familyObject then
return self._familyObject
end
end
--[==[Returns the Wikidata item id for the language or <code>nil</code>. This corresponds to the the second field in the data modules.]==]
local family = self._rawData[3] or self._rawData.family
function Language:getWikidataItem()
if family then
if not self._WikidataItem then
self._familyObject = require("Module:families").getByCode(family)
local item = self._rawData[2]
if type(item) == "number" then
self._WikidataItem = "Q" .. item
else
self._WikidataItem = item
end
end
return self._WikidataItem
end
end
--[==[Returns a table of <code>Script</code> objects for all scripts that the language is written in. See [[Module:scripts]].]==]
return self._familyObject
function Language:getScripts()
end
if not self._scriptObjects then
 
self._scriptObjects = {}
--[==[Returns the family code in the language's data file.]==]
if self:getScriptCodes()[1] == "All" then
function Language:getFamilyCode()
self._scriptObjects = conditionalRequire("Module:scripts/data")
local family = self._familyCode
else
if family == nil then
for _, sc in ipairs(self:getScriptCodes()) do
-- If the value is nil, it's cached as false.
table.insert(self._scriptObjects, require("Module:scripts").getByCode(sc, nil, nil, useRequire))
family = self._rawData[3] or false
end
self._familyCode = family
end
end
return self._scriptObjects
end
end
return family or nil
--[==[Returns the table of script codes in the language's data file.]==]
end
function Language:getScriptCodes()
 
if not self._scriptCodes then
function Language:getFamilyName()
self._scriptCodes = self._rawData[4] or {"None"}
local family = self._familyName
end
if family == nil then
return self._scriptCodes
family = self:getFamily()
-- If the value is nil, it's cached as false.
family = family and family:getCanonicalName() or false
self._familyName = family
end
end
return family or nil
--[==[Given some text, this function iterates through the scripts of a given language and tries to find the script that best matches the text. It returns a {{code|lua|Script}} object representing the script. If no match is found at all, it returns the {{code|lua|None}} script object.]==]
end
function Language:findBestScript(text, forceDetect)
 
if (not text) or text == "" or text == "-" then
--[==[Check whether the language belongs to `family` (which can be a family code or object). A list of objects can be given in place of `family`; in that case, return true if the language belongs to any of the specified families. Note that some languages (in particular, certain creoles) can have multiple immediate ancestors potentially belonging to different families; in that case, return true if the language belongs to any of the specified families.]==]
return require("Module:scripts").getByCode("None", nil, nil, useRequire)
function Language:inFamily(...)
--check_object("family", nil, ...)
for _, family in ipairs{...} do
if type(family) == "table" then
family = family:getCode()
end
end
local self_family_code = self:getFamilyCode()
if table.concat(self:getScriptCodes()) == "All" then
if not self_family_code then
return require("Module:scripts").findBestScriptWithoutLang(text)
return false
elseif self_family_code == family then
return true
end
end
local self_family = self:getFamily()
local scripts = self:getScripts()
if self_family:inFamily(family) then
return true
if not scripts[2] and not forceDetect then
-- If the family isn't a real family (e.g. creoles) check any ancestors.
-- Necessary, because Hani covers the entire Han range (while the Hant & Hans lists don't list shared characters).
elseif self_family:getFamilyCode() == "qfa-not" then
if scripts[1]:countCharacters(text) > 0 then
local ancestors = self:getAncestors()
return scripts[1]
for _, ancestor in ipairs(ancestors) do
else
if ancestor:inFamily(family) then
return require("Module:scripts").getByCode("None", nil, nil, useRequire)
return true
end
end
end
end
end
return require("Module:languages/findBestScript")(export, self, text, scripts, forceDetect, useRequire)
end
end
return false
end
function Language:getParent()
local parent = self._parentObject
if parent == nil then
parent = self:getParentCode()
-- If the value is nil, it's cached as false.
parent = parent and export.getByCode(parent, nil, true, true, self._useRequire) or false
self._parentObject = parent
end
return parent or nil
end
function Language:getParentCode()
local parent = self._parentCode
if parent == nil then
-- If the value is nil, it's cached as false.
parent = self._rawData[5] or false
self._parentCode = parent
end
return parent or nil
end
function Language:getParentName()
local parent = self._parentName
if parent == nil then
parent = self:getParent()
-- If the value is nil, it's cached as false.
parent = parent and parent:getCanonicalName() or false
self._parentName = parent
end
return parent or nil
end
--[==[Given some text, this function iterates through the scripts of a given language and tries to find the script that best matches the text. It returns a {{code|lua|Script}} object representing the script. If no match is found at all, it returns the {{code|lua|None}} script object.]==]
function Language:findBestScript(text, forceDetect)
local useRequire = self._useRequire
--[==[Returns a <code>Family</code> object for the language family that the language belongs to. See [[Module:families]].]==]
if not text or text == "" or text == "-" then
function Language:getFamily()
return require("Module:scripts").getByCode("None", nil, nil, useRequire)
if self._familyObject == nil then
local familyCode = self:getFamilyCode()
if familyCode then
self._familyObject = require("Module:families").getByCode(familyCode, useRequire)
-- Still memoize a nil result.
else
self._familyObject = false
end
end
return self._familyObject or nil
end
end
--[==[Returns the family code in the language's data file.]==]
-- Differs from table returned by getScriptCodes, as Hants is not normalized into its constituents.
function Language:getFamilyCode()
codes = table.concat(self._rawData["scripts"],", ")
if not self._familyCode then
codes = codes and split(codes, ",", true, true) or {"None"}
self._familyCode = self._rawData[3]
self._bestScriptCodes = codes
end
return self._familyCode
local first_sc = self._rawData.scripts[1]
end
function Language:getFamilyName()
if first_sc == "All" then
if self._familyName == nil then
return require("Module:scripts").findBestScriptWithoutLang(text)
local family = self:getFamily()
if family then
self._familyName = family:getCanonicalName()
else
self._familyName = false
end
end
return self._familyName or nil
end
end
--[==[Check whether the language belongs to `family` (which can be a family code or object). A list of objects can be given in place of `family`; in that case, return true if the language belongs to any of the specified families. Note that some languages (in particular, certain creoles) can have multiple immediate ancestors potentially belonging to different families; in that case, return true if the language belongs to any of the specified families.]==]
local get_script = require("Module:scripts").getByCode
function Language:inFamily(...)
local codes_len = #codes
--checkObject("family", nil, ...)
for _, family in ipairs{...} do
if type(family) == "table" then
family = family:getCode()
end
if not self:getFamilyCode() then
return false
elseif self:getFamilyCode() == family or self:getFamily():inFamily(family) then
return true
else
local ancestors = self:getAncestors()
for _, ancestor in ipairs(ancestors) do
if ancestor:inFamily(family) then
return true
end
end
end
end
return false
end
function Language:getParent()
if not (forceDetect or first_sc == "Hants" or codes_len > 1) then
if self._parentObject == nil then
first_sc = get_script(first_sc, nil, nil, useRequire)
local parentCode = self:getParentCode()
local charset = first_sc.characters
if parentCode then
return charset and umatch(text, "[" .. charset .. "]") and first_sc or
self._parentObject = export.getByCode(parentCode, nil, true, true, useRequire)
get_script("None", nil, nil, useRequire)
else
self._parentObject = false
end
end
return self._parentObject or nil
end
end
function Language:getParentCode()
-- Remove all formatting characters.
if not self._parentCode then
text = require("Module:utilities").get_plaintext(text)
self._parentCode = self._rawData[5]
end
return self._parentCode
end
function Language:getParentName()
-- Remove all spaces and any ASCII punctuation. Some non-ASCII punctuation is script-specific, so can't be removed.
if self._parentName == nil then
text = ugsub(text, "[%s!\"#%%&'()*,%-./:;?@[\\%]_{}]+", "")
local parent = self:getParent()
if #text == 0 then
if parent then
return get_script("None", nil, nil, useRequire)
self._parentName = parent:getCanonicalName()
else
self._parentName = false
end
end
return self._parentName or nil
end
end
function Language:getParentChain()
-- Try to match every script against the text,
if not self._parentChain then
-- and return the one with the most matching characters.
self._parentChain = {}
local bestcount, bestscript, length = 0
local parent = self:getParent()
for i = 1, codes_len do
while parent do
local sc = codes[i]
table.insert(self._parentChain, parent)
-- Special case for "Hants", which is a special code that represents whichever of "Hant" or "Hans" best matches, or "Hani" if they match equally. This avoids having to list all three. In addition, "Hants" will be treated as the best match if there is at least one matching character, under the assumption that a Han script is desirable in terms that contain a mix of Han and other scripts (not counting those which use Jpan or Kore).
parent = parent:getParent()
if sc == "Hants" then
else
sc = get_script(sc, nil, nil, useRequire)
if not length then
length = ulen(text)
end
end
end
return self._parentChain
-- Count characters by removing everything in the script's charset and comparing to the original length.
end
local charset = sc.characters
local count = charset and length - ulen(ugsub(text, "[" .. charset .. "]+", "")) or 0
function Language:hasParent(...)
--checkObject("language", nil, ...)
if count >= length then
for _, otherlang in ipairs{...} do
return sc
for _, parent in ipairs(self:getParentChain()) do
elseif count > bestcount then
if type(otherlang) == "string" then
bestcount = count
if otherlang == parent:getCode() then return true end
bestscript = sc
else
if otherlang:getCode() == parent:getCode() then return true end
end
end
end
end
end
return false
end
end
--[==[If the language is an etymology language, this iterates through parents until a regular language or family is found, and the corresponding object is returned. If the language is a regular language, then it simply returns the language.]==]
-- Return best matching script, or otherwise None.
function Language:getNonEtymological()
return bestscript or get_script("None", nil, nil, useRequire)
if not self._nonEtymologicalObject then
end
local nonEtymologicalCode = self:getNonEtymologicalCode()
 
if nonEtymologicalCode ~= self:getCode() then
function Language:getParentChain()
self._nonEtymologicalObject = export.getByCode(nonEtymologicalCode, nil, nil, nil, useRequire)
local chain = self._parentChain
else
if chain == nil then
self._nonEtymologicalObject = self
chain = {}
end
local parent, n = self:getParent(), 0
while parent do
n = n + 1
chain[n] = parent
parent = parent:getParent()
end
end
return self._nonEtymologicalObject
self._parentChain = chain
end
end
return chain
function Language:getNonEtymologicalCode()
end
return self._nonEtymologicalCode or self:getCode()
 
end
function Language:hasParent(...)
--check_object("language", nil, ...)
function Language:getNonEtymologicalName()
for _, otherlang in ipairs{...} do
if self._nonEtymologicalName == nil then
for _, parent in ipairs(self:getParentChain()) do
local nonEtymological = self:getNonEtymological()
if type(otherlang) == "string" then
if nonEtymological then
if otherlang == parent:getCode() then return true end
self._nonEtymologicalName = nonEtymological:getCanonicalName()
else
else
self._nonEtymologicalName = false
if otherlang:getCode() == parent:getCode() then return true end
end
end
end
end
return self._nonEtymologicalName or nil
end
end
return false
--[==[Returns a table of <code class="nf">Language</code> objects for all languages that this language is directly descended from. Generally this is only a single language, but creoles, pidgins and mixed languages can have multiple ancestors.]==]
end
function Language:getAncestors()
 
if not self._ancestorObjects then
--[==[
self._ancestorObjects = {}
If the language is etymology-only, this iterates through parents until a full language or family is found, and the
local ancestors = require("Module:table").shallowcopy(self:getAncestorCodes())
corresponding object is returned. If the language is a full language, then it simply returns itself.
if #ancestors > 0 then
]==]
for _, ancestor in ipairs(ancestors) do
function Language:getFull()
table.insert(self._ancestorObjects, export.getByCode(ancestor, nil, true, nil, useRequire))
local full = self._fullObject
end
if full == nil then
else
full = self:getFullCode()
local fam = self:getFamily()
full = full == self._code and self or
local protoLang = fam and fam:getProtoLanguage() or nil
export.getByCode(full, nil, nil, nil, self._useRequire)
-- For the cases where the current language is the proto-language
self._fullObject = full
-- of its family, or an etymology language that is ancestral to that
-- proto-language, we need to step up a level higher right from the
-- start.
if protoLang and (
protoLang:getCode() == self:getCode() or
(self:hasType("etymology-only") and protoLang:hasAncestor(self))
) then
fam = fam:getFamily()
protoLang = fam and fam:getProtoLanguage() or nil
end
while not protoLang and not (not fam or fam:getCode() == "qfa-not") do
fam = fam:getFamily()
protoLang = fam and fam:getProtoLanguage() or nil
end
table.insert(self._ancestorObjects, protoLang)
end
end
return self._ancestorObjects
end
end
return full
function Language:getAncestorCodes()
end
if not self._ancestorCodes then
 
local function get_codes(lang)
--[==[
return lang._rawData.ancestors or {}
If the language is an etymology-only language, this iterates through parents until a full language or family is
end
found, and the corresponding code is returned. If the language is a full language, then it simply returns the
local codes = get_codes(self)
language code.
-- Avoid a language being its own ancestor via class inheritance. We only need to check for this if the language has inherited an ancestor table from its parent, because we never want to drop ancestors that have been explicitly set in the data.
]==]
-- Recursively iterate over ancestors until we find a loop/run out. If a loop is found that involves the language, drop that ancestor.
function Language:getFullCode()
if #codes > 0 and #self._stack > 1 and not self._stack[#self._stack].ancestors then
return self._fullCode or self._code
local function check_ancestor(i, code, seen)
end
if seen[code] then
 
if code == self:getCode() then
--[==[
table.remove(codes, i)
If the language is an etymology-only language, this iterates through parents until a full language or family is
end
found, and the corresponding canonical name is returned. If the language is a full language, then it simply returns
else
the canonical name of the language.
seen[code] = true
]==]
local ancestor = export.getByCode(code, nil, true, nil, useRequire)
function Language:getFullName()
for _, ancestorCode in ipairs(get_codes(ancestor)) do
local full = self._fullName
check_ancestor(i, ancestorCode, seen)
if full == nil then
end
full = self:getFull():getCanonicalName()
end
self._fullName = full
end
for i, ancestorCode in ipairs(codes) do
local seen = {[self:getCode()] = true}
check_ancestor(i, ancestorCode, seen)
end
end
self._ancestorCodes = codes
end
return self._ancestorCodes
end
end
return full
--[==[Given a list of language objects or codes, returns true if at least one of them is an ancestor. This includes any etymology-only children of that ancestor. If the language's ancestor(s) are etymology-only languages, it will also return true for those language parent(s) (e.g. if Vulgar Latin is the ancestor, it will also return true for its parent, Latin). However, a parent is excluded from this if the ancestor is also ancestral to that parent (e.g. if Classical Persian is the ancestor, Persian would return false, because Classical Persian is also ancestral to Persian).]==]
end
function Language:hasAncestor(...)
 
--checkObject("language", nil, ...)
 
 
function Language:getAncestors()
if not self._ancestorObjects then
self._ancestorObjects = {}
local function iterateOverAncestorTree(node, func, parent_check)
if self._rawData.ancestors then
local ancestors = node:getAncestors()
for _, ancestor in ipairs(self._rawData.ancestors) do
local ancestorsParents = {}
table.insert(self._ancestorObjects, export.getByCode(ancestor) or require("Module:etymology languages").getByCode(ancestor))
for _, ancestor in ipairs(ancestors) do
local ret = func(ancestor) or iterateOverAncestorTree(ancestor, func, parent_check)
if ret then return ret end
end
end
-- Check the parents of any ancestors. We don't do this if checking the parents of the other language, so that we exclude any etymology-only children of those parents that are not directly related (e.g. if the ancestor is Vulgar Latin and we are checking New Latin, we want it to return false because they are on different ancestral branches. As such, if we're already checking the parent of New Latin (Latin) we don't want to compare it to the parent of the ancestor (Latin), as this would be a false positive; it should be one or the other).
else
if not parent_check then
local fam = self:getFamily()
return nil
local protoLang = fam and fam:getProtoLanguage() or nil
-- For the case where the current language is the proto-language
-- of its family, we need to step up a level higher right from the start.
if protoLang and protoLang:getCode() == self:getCode() then
fam = fam:getFamily()
protoLang = fam and fam:getProtoLanguage() or nil
end
end
for _, ancestor in ipairs(ancestors) do
local ancestorParents = ancestor:getParentChain()
while not protoLang and not (not fam or fam:getCode() == "qfa-not") do
for _, ancestorParent in ipairs(ancestorParents) do
fam = fam:getFamily()
if ancestorParent:getCode() == self:getCode() or ancestorParent:hasAncestor(ancestor) then
protoLang = fam and fam:getProtoLanguage() or nil
break
else
table.insert(ancestorsParents, ancestorParent)
end
end
end
for _, ancestorParent in ipairs(ancestorsParents) do
local ret = func(ancestorParent)
if ret then return ret end
end
end
table.insert(self._ancestorObjects, protoLang)
end
end
local parent_check = true
for _, otherlang in ipairs{...} do
repeat
if iterateOverAncestorTree(
self,
function(ancestor)
if type(otherlang) == "string" then
return ancestor:getCode() == otherlang
else
return ancestor:getCode() == otherlang:getCode()
end
end,
parent_check
) then
return true
elseif type(otherlang) == "string" then
otherlang = export.getByCode(otherlang, nil, true, nil, useRequire)
end
otherlang = otherlang:getParent()
parent_check = false
until not otherlang
end
return false
end
end
function Language:getAncestorChain()
return self._ancestorObjects
if not self._ancestorChain then
end
self._ancestorChain = {}
 
local step = self
local function iterateOverAncestorTree(node, func)
while true do
for _, ancestor in ipairs(node:getAncestors()) do
local ancestors = step:getAncestors()
if ancestor then
step = #ancestors == 1 and ancestors[1] or nil
local ret = func(ancestor) or iterateOverAncestorTree(ancestor, func)
if not step then break end
if ret then
table.insert(self._ancestorChain, 1, step)
return ret
end
end
end
end
return self._ancestorChain
end
end
end
local function fetch_descendants(self, format)
 
local languages = require("Module:languages/code to canonical name")
--[==[Generates alternative forms using a specified method, and returns them as a table. If no method is specified, returns a table containing only the input term.]==]
local etymology_languages = require("Module:etymology languages/code to canonical name")
function Language:generateForms(text, sc)
local families = require("Module:families/code to canonical name")
if self._rawData.generate_forms then
local descendants = {}
sc = checkScript(text, self, sc)
local family = self:getFamily()
return require("Module:" .. self._rawData.generate_forms).generateForms(text, self._code, sc:getCode())
-- Iterate over all three datasets.
else
for _, data in ipairs{languages, etymology_languages, families} do
return {text}
for code in pairs(data) do
end
local lang = export.getByCode(code, nil, true, true, useRequire)
end
-- Test for a descendant. Earlier tests weed out most candidates, while the more intensive tests are only used sparingly.
 
if (
function Language:getAncestorChain()
( -- Not an alias code.
if not self._ancestorChain then
(not lang._rawData.main_code) or
self._ancestorChain = {}
lang._rawData.main_code == code
local step = #self:getAncestors() == 1 and self:getAncestors()[1] or nil
) and
code ~= self:getCode() and -- Not self.
while step do
lang:inFamily(family) and -- In the same family.
table.insert(self._ancestorChain, 1, step)
(
step = #step:getAncestors() == 1 and step:getAncestors()[1] or nil
family:getProtoLanguageCode() == self:getCode() or -- Self is the protolanguage.
self:hasDescendant(lang) or -- Full hasDescendant check.
(lang:getNonEtymologicalCode() == self:getCode() and not self:hasAncestor(lang)) -- Etymology-only child which isn't an ancestor.
)
) then
if format == "object" then
table.insert(descendants, lang)
elseif format == "code" then
table.insert(descendants, code)
elseif format == "name" then
table.insert(descendants, lang:getCanonicalName())
end
end
end
end
end
return descendants
end
end
function Language:getDescendants()
return self._ancestorChain
if not self._descendantObjects then
end
self._descendantObjects = fetch_descendants(self, "object")
 
end
 
return self._descendantObjects
function Language:hasAncestor(otherlang)
local function compare(ancestor)
return ancestor:getCode() == otherlang:getCode()
end
end
function Language:getDescendantCodes()
return iterateOverAncestorTree(self, compare) or false
if not self._descendantCodes then
end
self._descendantCodes = fetch_descendants(self, "code")
 
end
 
return self._descendantCodes
function Language:getCategoryName(nocap)
end
local name = self:getCanonicalName()
function Language:getDescendantNames()
-- If the name already has "language" in it, don't add it.
if not self._descendantNames then
if not name:find("[Ll]anguage$") then
self._descendantNames = fetch_descendants(self, "name")
name = name .. " language"
end
return self._descendantNames
end
end
if not nocap then
function Language:hasDescendant(...)
name = mw.getContentLanguage():ucfirst(name)
for _, lang in ipairs{...} do
if type(lang) == "string" then
lang = export.getByCode(lang, nil, true, nil, useRequire)
end
if lang:hasAncestor(self) then
return true
end
end
return false
end
end
return name
end
function Language:makeCategoryLink()
return "[[:Category:" .. self:getCategoryName() .. "|" .. self:getDisplayForm() .. "]]"
end
function Language:getStandardCharacters()
return self._rawData.standardChars
end
function Language:makeEntryName(text)
text = mw.ustring.match(text, "^[¿¡]?(.-[^%s%p].-)%s*[؟?!;՛՜ ՞ ՟?!︖︕।॥။၊་།]?$") or text
--[==[Returns the name of the main category of that language. Example: {{code|lua|"French language"}} for French, whose category is at [[:Category:French language]]. Unless optional argument <code>nocap</code> is given, the language name at the beginning of the returned value will be capitalized. This capitalization is correct for category names, but not if the language name is lowercase and the returned value of this function is used in the middle of a sentence.]==]
if self:getCode() == "ar" then
function Language:getCategoryName(nocap)
local U = mw.ustring.char
if not self._categoryName then
local taTwiil = U(0x640)
local name = self:getCanonicalName()
local waSla = U(0x671)
-- Only add " language" if a regular language.
-- diacritics ordinarily removed by entry_name replacements
if #self._stack == 1 then
local Arabic_diacritics = U(0x64B, 0x64C, 0x64D, 0x64E, 0x64F, 0x650, 0x651, 0x652, 0x670)
-- If the name already has "language" in it, don't add it.
if not name:match("[Ll]anguage$") then
if text == waSla or mw.ustring.find(text, "^" .. taTwiil .. "?[" .. Arabic_diacritics .. "]" .. "$") then
name = name .. " language"
return text
end
end
self._categoryName = name
end
if nocap then
return self._categoryName
else
return mw.getContentLanguage():ucfirst(self._categoryName)
end
end
end
end
--[==[Creates a link to the category; the link text is the canonical name.]==]
if type(self._rawData.entry_name) == "table" then
function Language:makeCategoryLink()
text = do_entry_name_or_sort_key_replacements(text, self._rawData.entry_name)
return "[[:Category:" .. self:getCategoryName() .. "|" .. self:getDisplayForm() .. "]]"
end
end
function Language:getStandardCharacters(sc)
return text
if type(self._rawData.standardChars) ~= "table" then
end
return self._rawData.standardChars
 
 
-- Return true if the language has display processing enabled, i.e. lang:makeDisplayText()
-- does non-trivial processing.
function Language:hasDisplayProcessing()
return not not self._rawData.display
end
 
function Language:getTypes()
local types = self._types
if types == nil then
types = {language = true}
if self:getCode() == self._code then
types.full = true
else
else
if sc and type(sc) ~= "string" then
types["etymology-only"] = true
checkObject("script", nil, sc)
end
sc = sc:getCode()
local type = self._rawData.type or "regular"
end
for t in gmatch(type, "[^,]+") do
if (not sc) or sc == "None" then
types[t] = true
local scripts = {}
for _, script in pairs(self._rawData.standardChars) do
table.insert(scripts, script)
end
return table.concat(scripts)
end
if self._rawData.standardChars[sc] then
return self._rawData.standardChars[sc] .. (self._rawData.standardChars[1] or "")
end
end
end
self._types = types
end
end
return types
--[==[Make the entry name (i.e. the correct page name).]==]
end
function Language:makeEntryName(text, sc)
 
if (not text) or text == "" then
--[==[Given a list of types as strings, returns true if the language has all of them.]==]
return text, nil, {}
function Language:hasType(...)
local args, types = {...}, self:getTypes()
for i = 1, #args do
if not types[args[i]] then
return false
end
end
-- Set `unsupported` as true if certain conditions are met.
local unsupported
-- If there's an underscore.
if text:find("_") then
track("underscore")
unsupported = true
-- If it looks like an interwiki link.
elseif text:find(":") and text ~= ":" then
local m_utildata = conditionalRequire("Module:utilities/data")
local prefix = text:gsub("^:*(.-):.*", string.ulower)
if m_utildata.interwikis[prefix] or m_utildata.namespaces[prefix] then
unsupported = true
end
end
-- Check if the text is a listed unsupported title.
local unsupportedTitles = conditionalRequire("Module:links/data").unsupported_titles
if unsupportedTitles[text] then
return "Unsupported titles/" .. unsupportedTitles[text], nil, {}
end
sc = checkScript(text, self, sc)
local fail, cats
text = normalize(text, sc)
text, fail, cats = iterateSectionSubstitutions(text, nil, nil, self, sc, self._rawData.entry_name, "makeEntryName")
text = mw.ustring.gsub(text, "^[¿¡]?([^%s%p]+)%s*[؟?!;՛՜ ՞ ՟?!︖︕।॥။၊་།]?$", "%1") or text
text = unsupported and "Unsupported titles/" .. text or text
return text, fail, cats
end
end
return true
--[==[Generates alternative forms using a specified method, and returns them as a table. If no method is specified, returns a table containing only the input term.]==]
end
function Language:generateForms(text, sc)
 
if self._rawData.generate_forms then
-- Apply display-text replacements to `text`, if any.
sc = checkScript(text, self, sc)
function Language:makeDisplayText(text)
return require("Module:" .. self._rawData.generate_forms).generateForms(text, self:getCode(), sc:getCode())
if type(self._rawData.display) == "table" then
else
text = do_entry_name_or_sort_key_replacements(text, self._rawData.display)
return {text}
end
end
end
--[==[Creates a sort key for the given entry name, following the rules appropriate for the language. This removes diacritical marks from the entry name if they are not considered significant for sorting, and may perform some other changes. Any initial hyphen is also removed, and anything parentheses is removed as well.
return text
The <code>sort_key</code> setting for each language in the data modules defines the replacements made by this function, or it gives the name of the module that takes the entry name and returns a sortkey.]==]
end
function Language:makeSortKey(text, sc)
 
if (not text) or text == "" then
 
return text, nil, {}
-- Add to data tables?
end
local has_dotted_undotted_i = {
if text:find("<[^<>]+>") then
["az"] = true,
track("track HTML tag")
["crh"] = true,
end
["gag"] = true,
-- Remove soft hyphens, strip markers and HTML tags.
["kaa"] = true,
text = text:gsub("­", "")
["tt"] = true,
text = mw.text.unstrip(text)
["tr"] = true,
:gsub("<[^<>]+>", "")
["zza"] = true,
}
text = mw.uri.decode(text, "PATH")
 
text = checkNoEntities(text)
function Language:makeSortKey(name, sc)
if has_dotted_undotted_i[self:getCode()] then
-- Remove initial hyphens and * unless the term only consists of spacing + punctuation characters.
name = name:gsub("I", "ı")
text = mw.ustring.gsub(text, "^([􀀀-􏿽]*)[-־ـ᠊*]+([􀀀-􏿽]*)(.*[^%s%p].*)", "%1%2%3")
sc = checkScript(text, self, sc)
text = normalize(text, sc)
text = removeCarets(text, sc)
-- For languages with dotted dotless i, ensure that "İ" is sorted as "i", and "I" is sorted as "ı".
if self:hasDottedDotlessI() then
text = text
:gsub(mw.ustring.toNFD("İ"), "i")
:gsub("I", "ı")
text = sc:toFixedNFD(text)
end
-- Convert to lowercase, make the sortkey, then convert to uppercase. Where the language has dotted dotless i, it is usually not necessary to convert "i" to "İ" and "ı" to "I" first, because "I" will always be interpreted as conventional "I" (not dotless "İ") by any sorting algorithms, which will have been taken into account by the sortkey substitutions themselves. However, if no sortkey substitutions have been specified, then conversion is necessary so as to prevent "i" and "ı" both being sorted as "I".
-- An exception is made for scripts that (sometimes) sort by scraping page content, as that means they are sensitive to changes in capitalization (as it changes the target page).
local fail, cats
if not sc:sortByScraping() then
text = text:ulower()
end
text, fail, cats = iterateSectionSubstitutions(text, nil, nil, self, sc, self._rawData.sort_key, "makeSortKey")
if not sc:sortByScraping() then
if self:hasDottedDotlessI() and not self._rawData.sort_key then
text = text
:gsub("ı", "I")
:gsub("i", "İ")
text = sc:toFixedNFC(text)
end
text = text:uupper()
end
-- Remove parentheses, as long as they are either preceded or followed by something.
text = text
:gsub("(.)[()]+", "%1")
:gsub("[()]+(.)", "%1")
text = require("Module:string utilities").escape_risky_characters(text)
return text, fail, cats
end
end
--[==[Create the form used as as a basis for display text and transliteration.]==]
name = mw.ustring.lower(name)
local function processDisplayText(text, self, sc, keepCarets, keepPrefixes)
local subbedChars = {}
text, subbedChars = doTempSubstitutions(text, subbedChars, keepCarets)
text = mw.uri.decode(text, "PATH")
text = checkNoEntities(text)
sc = checkScript(text, self, sc)
local fail, cats
text = normalize(text, sc)
text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, self._rawData.display_text, "makeDisplayText")
text = removeCarets(text, sc)
-- Remove any interwiki link prefixes (unless they have been escaped or this has been disabled).
if text:match(":") and not keepPrefixes then
local m_utildata, rep = conditionalRequire("Module:utilities/data")
repeat
text, rep = text:gsub("\\\\(\\*:)", "\3%1")
until rep == 0
text = text
:gsub("\\:", "\4")
while true do
local prefix = text:gsub("^(.-):.+", function(m1)
return m1:gsub("\244[\128-\191]*", "")
end)
if not prefix or prefix == text then
break
end
local lower_prefix = prefix:ulower()
if not (m_utildata.interwikis[lower_prefix] or prefix == "") then
break
end
text = text:gsub("^(.-):(.*)", function(m1, m2)
local ret = {}
for subbedChar in m1:gmatch("\244[\128-\191]*") do
table.insert(ret, subbedChar)
end
return table.concat(ret) .. m2
end)
end
text = text
:gsub("\3", "\\")
:gsub("\4", ":")
end
return text, fail, cats, subbedChars
end
--[==[Make the display text (i.e. what is displayed on the page).]==]
-- Remove initial hyphens and *
function Language:makeDisplayText(text, sc, keepPrefixes)
local hyphens_regex = "^[-־ـ*]+(.)"
if (not text) or text == "" then
name = mw.ustring.gsub(name, hyphens_regex, "%1")
return text, nil, {}
end
local fail, cats, subbedChars
text, fail, cats, subbedChars = processDisplayText(text, self, sc, nil, keepPrefixes)
text = require("Module:string utilities").escape_risky_characters(text)
return undoTempSubstitutions(text, subbedChars), fail, cats
end
--[==[Transliterates the text from the given script into the Latin script (see [[Wiktionary:Transliteration and romanization]]). The language must have the <code>translit</code> property for this to work; if it is not present, {{code|lua|nil}} is returned.
-- If there are language-specific rules to generate the key, use those
Returns three values:
if type(self._rawData.sort_key) == "table" then
# The transliteration.
name = do_entry_name_or_sort_key_replacements(name, self._rawData.sort_key)
# A boolean which indicates whether the transliteration failed for an unexpected reason. If {{code|lua|false}}, then the transliteration either succeeded, or the module is returning nothing in a controlled way (e.g. the input was {{code|lua|"-"}}). Generally, this means that no maintenance action is required. If {{code|lua|true}}, then the transliteration is {{code|lua|nil}} because either the input or output was defective in some way (e.g. [[Module:ar-translit]] will not transliterate non-vocalised inputs, and this module will fail partially-completed transliterations in all languages). Note that this value can be manually set by the transliteration module, so make sure to cross-check to ensure it is accurate.
elseif type(self._rawData.sort_key) == "string" then
# A table of categories selected by the transliteration module, which should be in the format expected by {{code|lua|format_categories}} in [[Module:utilities]].
name = require("Module:" .. self._rawData.sort_key).makeSortKey(name, self:getCode(), sc and sc:getCode())
The <code>sc</code> parameter is handled by the transliteration module, and how it is handled is specific to that module. Some transliteration modules may tolerate {{code|lua|nil}} as the script, others require it to be one of the possible scripts that the module can transliterate, and will show an error if it's not one of them. For this reason, the <code>sc</code> parameter should always be provided when writing non-language-specific code.
The <code>module_override</code> parameter is used to override the default module that is used to provide the transliteration. This is useful in cases where you need to demonstrate a particular module in use, but there is no default module yet, or you want to demonstrate an alternative version of a transliteration module before making it official. It should not be used in real modules or templates, only for testing. All uses of this parameter are tracked by [[Template:tracking/module_override]].
'''Known bugs''':
* This function assumes {tr(s1) .. tr(s2) == tr(s1 .. s2)}. When this assertion fails, wikitext markups like <nowiki>'''</nowiki> can cause wrong transliterations.
* HTML entities like <code>&amp;apos;</code>, often used to escape wikitext markups, do not work.]==]
function Language:transliterate(text, sc, module_override)
-- If there is no text, or the language doesn't have transliteration data and there's no override, return nil.
if not (self._rawData.translit or module_override) then
return nil, false, {}
elseif (not text) or text == "" or text == "-" then
return text, false, {}
end
-- If the script is not transliteratable (and no override is given), return nil.
sc = checkScript(text, self, sc)
if not (sc:isTransliterated() or module_override) then
return nil, true, {}
end
-- Remove any strip markers.
text = mw.text.unstrip(text)
-- Get the display text with the keepCarets flag set.
local fail, cats, subbedChars
text, fail, cats, subbedChars = processDisplayText(text, self, sc, true)
-- Transliterate (using the module override if applicable).
text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, true, self, sc, module_override or self._rawData.translit, "tr")
-- Incomplete transliterations return nil.
if text then
if sc:countCharacters(text) > 0 then
-- Remove any characters in (extended) Latin, which includes Latin characters also included in other scripts (as these are false positives). Anything remaining should only be script code "None" (e.g. numerals).
local check_text = mw.ustring.gsub(text, "[" .. require("Module:scripts").getByCode("Latnx"):getCharacters() .. "]", "")
if require("Module:scripts").findBestScriptWithoutLang(check_text) ~= "None" then
return nil, true, cats
end
end
else
return nil, true, cats
end
text = require("Module:string utilities").escape_risky_characters(text)
text = undoTempSubstitutions(text, subbedChars)
-- If the script does not use capitalization, then capitalize any letters of the transliteration which are immediately preceded by a caret (and remove the caret).
if text and not sc:hasCapitalization() and text:match("%^") then
text = processCarets(text, "%^([\128-\191\244]*%*?)([^\128-\191\244][\128-\191]*)", function(m1, m2)
return m1 .. m2:uupper()
end)
end
-- Track module overrides.
if module_override ~= nil then
track("module_override")
end
fail = text == nil and (not not fail) or false
return text, fail, cats
end
end
function Language:overrideManualTranslit()
-- Remove parentheses, as long as they are either preceded or followed by something
return not not self._rawData.override_translit
name = mw.ustring.gsub(name, "(.)[()]+", "%1")
end
name = mw.ustring.gsub(name, "[()]+(.)", "%1")
--[==[Returns {{code|lua|true}} if the language has a transliteration module, or {{code|lua|false}} if it doesn't.]==]
if has_dotted_undotted_i[self:getCode()] then
function Language:hasTranslit()
name = name:gsub("i", "İ")
return not not self._rawData.translit
end
end
function Language:link_tr()
return mw.ustring.upper(name)
return not not self._rawData.link_tr
end
 
function Language:overrideManualTranslit()
if self._rawData.override_translit then
return true
else
return false
end
end
end
--[==[Returns {{code|lua|true}} if the language uses the letters I/ı and İ/i, or {{code|lua|false}} if it doesn't.]==]
 
function Language:hasDottedDotlessI()
 
return not not self._rawData.dotted_dotless_i
function Language:transliterate(text, sc, module_override)
-- If there is no text, or the language doesn't have transliteration data and there's no override, return nil.
if not (self._rawData.translit_module or module_override) then
return nil, false, {}
elseif (not text) or text == "" or text == "-" then
return text, false, {}
end
end
-- If the script is not transliteratable (and no override is given), return nil.
function Language:toJSON(returnTable)
sc = checkScript(text, self, sc)
local entryNamePatterns = nil
if not (sc:isTransliterated() or module_override) then
local entryNameRemoveDiacritics = nil
return nil, true, {}
if self._rawData.entry_name then
entryNameRemoveDiacritics = self._rawData.entry_name.remove_diacritics
if self._rawData.entry_name.from then
entryNamePatterns = {}
for i, from in ipairs(self._rawData.entry_name.from) do
table.insert(entryNamePatterns, {from = from, to = self._rawData.entry_name.to[i] or ""})
end
end
end
if not self._type then
self:hasType()
end
local types = {}
for type in pairs(self._type) do
table.insert(types, type)
end
local ret = {
ancestors = self:getAncestorCodes(),
canonicalName = self:getCanonicalName(),
categoryName = self:getCategoryName("nocap"),
code = self:getCode(),
entryNamePatterns = entryNamePatterns,
entryNameRemoveDiacritics = entryNameRemoveDiacritics,
family = self:getFamilyCode(),
otherNames = self:getOtherNames(true),
aliases = self:getAliases(),
varieties = self:getVarieties(),
scripts = self:getScriptCodes(),
parent = self._parentCode or nil,
nonEtymological = self._nonEtymologicalCode or nil,
type = types,
wikimediaLanguages = self:getWikimediaLanguageCodes(),
wikidataItem = self:getWikidataItem(),
}
ret = require("Module:table").deepcopy(ret)
if returnTable then
return ret
else
return require("Module:JSON").toJSON(ret)
end
end
end
-- Remove any strip markers.
text = mw.text.unstrip(text)
-- Get the display text with the keepCarets flag set.
local fail, cats, subbedChars
text, fail, cats, subbedChars = processDisplayText(text, self, sc, true)
-- Transliterate (using the module override if applicable).
text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, true, self, sc, module_override or self._rawData.translit_module, "tr")
--[==[
if not text then
<span style="color: #BA0000">This function is not for use in entries or other content pages.</span>
return nil, true, cats
Returns a blob of data about the language. The format of this blob is undocumented, and perhaps unstable; it's intended for things like the module's own unit-tests, which are "close friends" with the module and will be kept up-to-date as the format changes.
-- Do NOT use these methods!
-- All uses should be pre-approved on the talk page!
]==]
function Language:getRawData()
local rawData = {}
for _, element in ipairs(self._stack) do
for k, v in pairs(element) do
rawData[k] = v
end
end
return rawData
end
end
--[==[<span style="color: #BA0000">This function is not for use in entries or other content pages.</span>
-- Incomplete transliterations return nil.
Returns a blob of data about the language that contains the "extra data". Much like with getRawData, the format of this blob is undocumented, and perhaps unstable; it's intended for things like the module's own unit-tests, which are "close friends" with the module and will be kept up-to-date as the format changes.]==]
local charset = sc.characters
function Language:getRawExtraData()
if charset and umatch(text, "[" .. charset .. "]") then
if #self._stack == 1 then
-- Remove any characters in Latin, which includes Latin characters also included in other scripts (as these are false positives), as well as any PUA substitutions. Anything remaining should only be script code "None" (e.g. numerals).
self:loadInExtraData()
local check_text = ugsub(text, "[" .. require("Module:scripts").getByCode("Latn").characters .. "􀀀-􏿽]+", "")
-- Set none_is_last_resort_only flag, so that any non-None chars will cause a script other than "None" to be returned.
if require("Module:scripts").findBestScriptWithoutLang(check_text, true):getCode() ~= "None" then
return nil, true, cats
end
end
return self._extraData
end
end
 
local function getRawExtraLanguageData(code)
text = escape_risky_characters(text)
local modulename = export.getExtraDataModuleName(code)
text = undoTempSubstitutions(text, subbedChars)
return modulename and conditionalRequire("Module:" .. modulename)[code] or nil
 
-- If the script does not use capitalization, then capitalize any letters of the transliteration which are immediately preceded by a caret (and remove the caret).
if text and not sc:hasCapitalization() and text:find("^", 1, true) then
text = processCarets(text, "%^([\128-\191\244]*%*?)([^\128-\191\244][\128-\191]*)", function(m1, m2)
return m1 .. uupper(m2)
end)
end
end


function Language:loadInExtraData()
-- Track module overrides.
if not self._extraData then
if module_override ~= nil then
-- load extra data from module and assign to _extraData field
track("module_override")
-- use empty table as a fallback if extra data is nil
self._extraData = getRawExtraLanguageData(self:getCode()) or {}
end
end
end
 
return Language
fail = text == nil and (not not fail) or false
 
return text, fail, cats
end
 
function Language:hasTranslit()
return self._rawData.translit_module and true or false
end
 
 
function Language:link_tr()
return self._rawData.link_tr and true or false
end
end


local function make_stack(code, data, parent, useRequire)
 
parent.__index = parent
function Language:toJSON()
local entryNamePatterns = nil
local lang = {_code = code}
local entryNameRemoveDiacritics = nil
-- Full language.
if self._rawData.entry_name then
if not parent._stack then
entryNameRemoveDiacritics = self._rawData.entry_name.remove_diacritics
-- Create stack, accessed with rawData metamethod.
if self._rawData.entry_name.from then
lang._stack = parent._rawData and {parent._rawData, data} or {data}
entryNamePatterns = {}
lang._rawData = setmetatable({}, {
for i, from in ipairs(self._rawData.entry_name.from) do
__index = function(t, k)
local to = self._rawData.entry_name.to[i] or ""
-- Data that isn't inherited from the parent.
table.insert(entryNamePatterns, { from = from, to = to })
local function no_inherit(lang, t, k)
if (
k == "aliases" or
k == "varieties" or
k == "otherNames"
) then
return lang._stack[#lang._stack][k]
end
end
-- Data that is appended by each generation.
local function append_data(lang, t, k)
if k == "type" then
local parts = {}
for i = 1, #lang._stack do
table.insert(parts, lang._stack[i][k])
end
if type(parts[1]) == "string" then
return table.concat(parts, ", ")
end
end
end
-- Otherwise, iterate down the stack, looking for a match.
local function inherit_data(lang, t, k)
local i = #lang._stack
while not lang._stack[i][k] and i > 1 do
i = i - 1
end
return lang._stack[i][k]
end
local ret = no_inherit(lang, t, k) or
append_data(lang, t, k) or
inherit_data(lang, t, k)
if (
k == 4 or
k == "ancestors" or
k == "wikimedia_codes"
) then
if type(ret) == "table" then
return ret
elseif type(ret) == "string" then
return mw.text.split(ret, "%s*,%s*")
end
else
return ret
end
end,
-- Retain immutability (as writing to rawData will break functionality).
__newindex = function()
error("not allowed to edit rawData")
end
end
})
end
-- Non-etymological code is the parent code.
lang._nonEtymologicalCode = parent._code or code
-- Etymology-only.
else
-- Copy over rawData and stack to the new object, and add new layer to stack.
lang._rawData = parent._rawData
lang._stack = parent._stack
table.insert(lang._stack, data)
-- Copy non-etymological code.
lang._nonEtymologicalCode = parent._nonEtymologicalCode
end
end
return setmetatable(lang, parent)
local ret = {
ancestors = self._rawData.ancestors,
canonicalName = self:getCanonicalName(),
categoryName = self:getCategoryName("nocap"),
code = self._code,
entryNamePatterns = entryNamePatterns,
entryNameRemoveDiacritics = entryNameRemoveDiacritics,
family = self._rawData[3] or self._rawData.family,
otherNames = self:getOtherNames(true),
aliases = self:getAliases(),
varieties = self:getVarieties(),
scripts = self._rawData.scripts or self._rawData[4],
type = self:getType(),
wikimediaLanguages = self._rawData.wikimedia_codes,
wikidataItem = self:getWikidataItem(),
}
return require("Module:JSON").toJSON(ret)
end
end
-- Do NOT use these methods!
-- All uses should be pre-approved on the talk page!
function Language:getRawData()
return self._rawData
end
function Language:getRawExtraData()
self:loadInExtraData()
return self._extraData
end
Language.__index = Language


function export.getDataModuleName(code)
function export.getDataModuleName(code)
if code:match("^%l%l$") then
if code:find("^%l%l$") then
return "languages/data/2"
return "languages/data2"
elseif code:match("^%l%l%l$") then
elseif code:find("^%l%l%l$") then
local prefix = code:sub(1, 1)
local prefix = code:sub(1, 1)
return "languages/data/3/" .. prefix
return "languages/data3/" .. prefix
elseif code:match("^[%l-]+$") then
elseif code:find("^[%l-]+$") then
return "languages/data/exceptional"
return "languages/datax"
else
else
return nil
return nil
end
end
end
end


function export.getExtraDataModuleName(code)
function export.getExtraDataModuleName(code)
local dataModule = export.getDataModuleName(code)
if code:find("^%l%l$") then
return dataModule and dataModule .. "/extra" or nil
return "languages/extradata2"
elseif code:find("^%l%l%l$") then
local prefix = code:sub(1, 1)
return "languages/extradata3/" .. prefix
elseif code:find("^[%l-]+$") then
return "languages/extradatax"
else
return nil
end
end
 
 
local function getRawLanguageData(code)
local modulename = export.getDataModuleName(code)
return modulename and mw.loadData("Module:" .. modulename)[code] or nil
end
end


function export.makeObject(code, data, useRequire)
 
if not data then
local function getRawExtraLanguageData(code)
return nil
local modulename = export.getExtraDataModuleName(code)
end
return modulename and mw.loadData("Module:" .. modulename)[code] or nil
end
code = data.main_code or code
 
 
if data.type:find("family") and not data[5] then
function Language:loadInExtraData()
return require("Module:families").makeObject(code, data, useRequire)
if not self._extraData then
else
-- load extra data from module and assign to meta table
local parent
-- use empty table as a fallback if extra data is nil
if data[5] then
local meta = getmetatable(self)
parent = export.getByCode(data[5], nil, true, true, useRequire)
meta._extraData = getRawExtraLanguageData(self._code) or {}
else
setmetatable(self, meta)
parent = make_language(code, data, useRequire)
end
return make_stack(code, data, parent, useRequire)
end
end
end
end


--[==[Finds the language whose code matches the one provided. If it exists, it returns a <code class="nf">Language</code> object representing the language. Otherwise, it returns {{code|lua|nil}}, unless <code class="n">paramForError</code> is given, in which case an error is generated. If <code class="n">paramForError</code> is {{code|lua|true}}, a generic error message mentioning the bad code is generated; otherwise <code class="n">paramForError</code> should be a string or number specifying the parameter that the code came from, and this parameter will be mentioned in the error message along with the bad code. If <code class="n">allowEtymLang</code> is specified, etymology language codes are allowed and looked up along with normal language codes. If <code class="n">allowFamily</code> is specified, language family codes are allowed and looked up along with normal language codes.]==]
 
function export.getByCode(code, paramForError, allowEtymLang, allowFamily, useRequire)
function export.makeObject(code, data)
return data and setmetatable({ _rawData = data, _code = code }, Language) or nil
end
 
 
function export.getByCode(code, paramForError, allowEtymLang, allowFamily)
if type(code) ~= "string" then
if type(code) ~= "string" then
local typ
error("The function getByCode expects a string as its first argument, but received " .. (code == nil and "nil" or "a " .. type(code)) .. ".")
if not code then
typ = "nil"
elseif checkObject("language", true, code) then
typ = "a language object"
elseif checkObject("family", true, code) then
typ = "a family object"
else
typ = "a " .. type(code)
end
error("The function getByCode expects a string as its first argument, but received " .. typ .. ".")
end
end
local function conditionalRequire(modulename)
local retval = export.makeObject(code, getRawLanguageData(code))
if useRequire then
if not retval and allowEtymLang then
return require(modulename)
retval = require("Module:etymology languages").getByCode(code)
end
if not retval and allowFamily then
retval = require("Module:families").getByCode(code)
end
if not retval and paramForError then
local codetext = nil
if allowEtymLang and allowFamily then
codetext = "language, etymology language or family code"
elseif allowEtymLang then
codetext = "language or etymology language code"
elseif allowFamily then
codetext = "language or family code"
else
else
return mw.loadData(modulename)
codetext = "language code"
end
end
export.err(code, paramForError, codetext)
end
end
local modulename = export.getDataModuleName(code)
local data = modulename and
conditionalRequire("Module:" .. modulename)[code] or
(allowEtymLang and require("Module:etymology languages/track-bad-etym-code")(code) and conditionalRequire("Module:etymology languages/data")[code]) or
(allowFamily and conditionalRequire("Module:families/data")[code]) or
(allowEtymLang and allowFamily and conditionalRequire("Module:families/data/etymology")[code])
local retval = code and data and export.makeObject(code, data, useRequire)
if not retval and paramForError then
require("Module:languages/errorGetBy").code(code, paramForError, allowEtymLang, allowFamily)
end
return retval
return retval
end
end


--[==[Like {{code|lua|getByCanonicalName()}}, except it also looks at the <code class="n">otherNames</code> listed in the non-etymology language data modules, and does not (currently) have options to look up etymology languages and families.]==]
 
function export.getByName(name, errorIfInvalid)
function export.getByName(name, errorIfInvalid)
local byName = mw.loadData("Module:languages/by name")
local byName = mw.loadData("Module:languages/by name")
Line 1,271: Line 1,114:
if not code then
if not code then
if errorIfInvalid then
if errorIfInvalid then
error("The language name \"" .. name .. "\" is not valid. See [[Wiktionary:List of languages]].")
error("The language name \"" .. name .. "\" is not valid.")
else
else
return nil
return nil
Line 1,277: Line 1,120:
end
end
return export.getByCode(code)
return export.makeObject(code, getRawLanguageData(code))
end
end


--[==[Finds the language whose canonical name (the name used to represent that language on Wiktionary) or other name matches the one provided. If it exists, it returns a <code class="nf">Language</code> object representing the language. Otherwise, it returns {{code|lua|nil}}, unless <code class="n">paramForError</code> is given, in which case an error is generated. If <code class="n">allowEtymLang</code> is specified, etymology language codes are allowed and looked up along with normal language codes. If <code class="n">allowFamily</code> is specified, language family codes are allowed and looked up along with normal language codes.
function export.getByCanonicalName(name, errorIfInvalid, allowEtymLang, allowFamily)
The canonical name of languages should always be unique (it is an error for two languages on Wiktionary to share the same canonical name), so this is guaranteed to give at most one result.
local byName = mw.loadData("Module:languages/canonical names")
This function is powered by [[Module:languages/canonical names]], which contains a pre-generated mapping of non-etymology-language canonical names to codes. It is generated by going through the [[:Category:Language data modules]] for non-etymology languages. When <code class="n">allowEtymLang</code> is specified for the above function, [[Module:etymology languages/canonical names]] may also be used, and when <code class="n">allowFamily</code> is specified for the above function, [[Module:families/canonical names]] may also be used.]==]
function export.getByCanonicalName(name, errorIfInvalid, allowEtymLang, allowFamily, useRequire)
local function conditionalRequire(modulename)
if useRequire then
return require(modulename)
else
return mw.loadData(modulename)
end
end
local byName = conditionalRequire("Module:languages/canonical names")
local code = byName and byName[name]
local code = byName and byName[name]
 
if not code and allowEtymLang then
local retval = code and export.makeObject(code, getRawLanguageData(code)) or nil
byName = conditionalRequire("Module:etymology languages/canonical names")
if not retval and allowEtymLang then
code = byName and byName[name] or
retval = require("Module:etymology languages").getByCanonicalName(name)
byName[name:gsub(" [Ss]ubstrate$", "")] or
byName[name:gsub("^a ", "")] or
byName[name:gsub("^a ", ""):gsub(" [Ss]ubstrate$", "")] or
-- For etymology families like "ira-pro".
-- FIXME: This is not ideal, as it allows " languages" to be appended to any etymology-only language, too.
byName[name:match("^(.*) languages$")]
end
end
if not retval and allowFamily then
if not code and allowFamily then
local famname = name:match("^(.*) languages$")
byName = conditionalRequire("Module:families/canonical names")
famname = famname or name
code = byName and byName[name] or
retval = require("Module:families").getByCanonicalName(famname)
byName[name:match("^(.*) languages$")]
end
end
local retval = code and export.getByCode(code, errorIfInvalid, allowEtymLang, allowFamily, useRequire)
if not retval and errorIfInvalid then
if not retval and errorIfInvalid then
require("Module:languages/errorGetBy").canonicalName(name, allowEtymLang, allowFamily)
local text
if allowEtymLang and allowFamily then
text = "language, etymology language or family name"
elseif allowEtymLang then
text = "language or etymology language name"
elseif allowFamily then
text = "language or family name"
else
text = "language name"
end
error("The " .. text .. " \"" .. name .. "\" is not valid.")
end
end
return retval
return retval
end
end


--[==[Used by [[Module:languages/data/2]] (et al.) to add default types to the entities returned.]==]
function export.iterateAll()
function export.addDefaultTypes(data, regular, ...)
mw.incrementExpensiveFunctionCount()
for _, entity in pairs(data) do
local m_data = mw.loadData("Module:languages/alldata")
-- "regular" encompasses everything that doesn't have another type already assigned.
local func, t, var = pairs(m_data)
if regular then
entity.type = entity.type or "regular"
return function()
end
local code, data = func(t, var)
local types = table.concat({...}, ", ")
return export.makeObject(code, data)
if #types > 0 then
entity.type =  types .. (entity.type and (", " .. entity.type) or "")
end
end
end
return data
end
end


--[==[Used by [[Module:etymology languages/data]] and [[Module:families/data/etymology]] to finalize the data into the format that is actually returned.]==]
--[[ If language is an etymology language, iterates through parent languages
function export.finalizeEtymologyData(data)
until it finds a non-etymology language. ]]
local aliases = {}
function export.getNonEtymological(lang)
for code, entity in pairs(data) do
while lang:getType() == "etymology language" do
-- Move parent to 5 and family to 3.
local parentCode = lang:getParentCode()
data[code][5] = data[code][3]
lang = export.getByCode(parentCode)
data[code][3] = data[code].family
or require("Module:etymology languages").getByCode(parentCode)
data[code].family = nil
or require("Module:families").getByCode(parentCode)
-- Assign any alias codes listed in alias_codes. The main_code field is used to make sure objects always use that to identify themselves, which means all aliases are fungible with their counterparts.
if entity.alias_codes then
entity.main_code = code
for _, alias in ipairs(entity.alias_codes) do
aliases[alias] = entity
end
entity.alias_codes = nil
end
end
end
for code, alias in pairs(aliases) do
data[code] = alias
return lang
end
return data
end
 
--[==[For backwards compatibility only; modules should require the /error themselves.]==]
function export.err(lang_code, param, code_desc, template_tag, not_real_lang)
return require("Module:languages/error")(lang_code, param, code_desc, template_tag, not_real_lang)
end
end


return export
return export

Latest revision as of 23:25, 7 August 2024



local require = require

local m_str_utils = require("Module:string utilities")
local m_table = require("Module:table")
local mw = mw
local string = string
local table = table
local ustring = mw.ustring

local char = string.char
local check_object = require("Module:utilities").check_object
local concat = table.concat
local decode_entities = m_str_utils.decode_entities
local decode_uri = m_str_utils.decode_uri
local find = string.find
local floor = math.floor
local gmatch = string.gmatch
local gsub = string.gsub
local insert = table.insert
local ipairs = ipairs
local list_to_set = m_table.listToSet
local load_data = mw.loadData
local match = string.match
local next = next
local pairs = pairs
local remove = table.remove
local remove_duplicates = m_table.removeDuplicates
local select = select
local setmetatable = setmetatable
local shallowcopy = m_table.shallowcopy
local split = m_str_utils.split
local sub = string.sub
local type = type
local ugsub = ustring.gsub
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local umatch = ustring.match
local uupper = m_str_utils.upper

local export = {}

--[=[
Throw an error for an invalid language code or script code.

`lang_code` (required) is the bad code and can be nil or a non-string.

`param` (required) is the name of the parameter in which the code was contained. It can be a string, a number
	(for a numeric param, in which case the param will show up in the error message as an ordinal such as
	"first" or "second"), or `true` if no parameter can be clearly identified.

`code_desc` (optional) is text describing what the code is; by default, "language code".

`template_text` (optional) is a string specifying the template that generated the error, or a function
	to generate this string. If given, it will be displayed in the error message.

`not_real_lang` (optional), if given, indicates that the code is not in the form of a language code
	(e.g. it's a script code). Normally, this function checks for things that could plausibly be a language code:
	two or three lowercase letters, two or three groups of three lowercase letters with hyphens between them.
	If such a pattern is found, a different error message is displayed (indicating an invalid code) than otherwise
	(indicating a missing code). If `not_real_lang` is given, this check is suppressed.
]=]

-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.).
local function escape_risky_characters(text)
	local encode_entities = require("Module:string/encode entities")
	-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software.
	if umatch(text, "^%s*$") then
		return encode_entities(text, text)
	else
		return encode_entities(text, "!#%&*+/:;<=>?@[\\]_{|}")
	end
end

-- Process carets (and any escapes). Default to simple removal, if no pattern/replacement is given.
local function processCarets(text, pattern, repl)
	local rep
	repeat
		text, rep = gsub(text, "\\\\(\\*^)", "\3%1")
	until rep == 0
	return text:gsub("\\^", "\4")
		:gsub(pattern or "%^", repl or "")
		:gsub("\3", "\\")
		:gsub("\4", "^")
end

-- Remove carets if they are used to capitalize parts of transliterations (unless they have been escaped).
local function removeCarets(text, sc)
	if not sc:hasCapitalization() and sc:isTransliterated() and text:find("^", 1, true) then
		return processCarets(text)
	else
		return text
	end
end

-- Temporarily convert various formatting characters to PUA to prevent them from being disrupted by the substitution process.
local function doTempSubstitutions(text, subbedChars, keepCarets, noTrim)
	-- Clone so that we don't insert any extra patterns into the table in package.loaded. For some reason, using require seems to keep memory use down; probably because the table is always cloned.
	local patterns = shallowcopy(require("Module:languages/data/patterns"))
	if keepCarets then
		insert(patterns, "((\\+)%^)")
		insert(patterns, "((%^))")
	end
	-- Ensure any whitespace at the beginning and end is temp substituted, to prevent it from being accidentally trimmed. We only want to trim any final spaces added during the substitution process (e.g. by a module), which means we only do this during the first round of temp substitutions.
	if not noTrim then
		insert(patterns, "^([\128-\191\244]*(%s+))")
		insert(patterns, "((%s+)[\128-\191\244]*)$")
	end
	-- Pre-substitution, of "[[" and "]]", which makes pattern matching more accurate.
	text = gsub(text, "%f[%[]%[%[", "\1")
		:gsub("%f[%]]%]%]", "\2")
	local i, pe = #subbedChars, require("Module:string utilities").pattern_escape
	for _, pattern in ipairs(patterns) do
		-- Patterns ending in \0 stand are for things like "[[" or "]]"), so the inserted PUA are treated as breaks between terms by modules that scrape info from pages.
		local term_divider
		pattern = gsub(pattern, "%z$", function(divider)
			term_divider = divider == "\0"
			return ""
		end)
		text = gsub(text, pattern, function(...)
			local m = {...}
			local m1New = m[1]
			for k = 2, #m do
				local n = i + k - 1
				subbedChars[n] = m[k]
				local byte2 = floor(n / 4096) % 64 + (term_divider and 128 or 136)
				local byte3 = floor(n / 64) % 64 + 128
				local byte4 = n % 64 + 128
				m1New = gsub(m1New, pe(m[k]), "\244" .. char(byte2) .. char(byte3) .. char(byte4), 1)
			end
			i = i + #m - 1
			return m1New
		end)
	end
	text = gsub(text, "\1", "%[%[")
		:gsub("\2", "%]%]")
	return text, subbedChars
end

-- Split the text into sections, based on the presence of temporarily substituted formatting characters, then iterate over each one to apply substitutions. This avoids putting PUA characters through language-specific modules, which may be unequipped for them.
local function iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, substitution_data, function_name)
	local pe = require("Module:string utilities").pattern_escape
	local fail, cats, sections = nil, {}
	-- See [[Module:languages/data]].
	if not find(text, "\244") or self:loadData("Module:languages/data").contiguous_substitution[self._code] then
		sections = {text}
	else
		sections = split(text, "\244[\128-\143][\128-\191]*", true)
	end
	for _, section in ipairs(sections) do
		-- Don't bother processing empty strings or whitespace (which may also not be handled well by dedicated modules).
		if gsub(section, "%s+", "") ~= "" then
			local sub, sub_fail, sub_cats = require("Module:languages/doSubstitutions")(section, self, sc, substitution_data, function_name)
			-- Second round of temporary substitutions, in case any formatting was added by the main substitution process. However, don't do this if the section contains formatting already (as it would have had to have been escaped to reach this stage, and therefore should be given as raw text).
			if sub and subbedChars then
				local noSub
				for _, pattern in ipairs(require("Module:languages/data/patterns")) do
					if match(section, pattern .. "%z?") then
						noSub = true
					end
				end
				if not noSub then
					sub, subbedChars = doTempSubstitutions(sub, subbedChars, keepCarets, true)
				end
			end
			if (not sub) or sub_fail then
				text = sub
				fail = sub_fail
				cats = sub_cats or {}
				break
			end
			text = sub and gsub(text, pe(section), pe(sub), 1) or text
			if type(sub_cats) == "table" then
				for _, cat in ipairs(sub_cats) do
					insert(cats, cat)
				end
			end
		end
	end

	-- Trim, unless there are only spacing characters, while ignoring any final formatting characters.
	text = text and text:gsub("^([\128-\191\244]*)%s+(%S)", "%1%2")
		:gsub("(%S)%s+([\128-\191\244]*)$", "%1%2")

	-- Remove duplicate categories.
	if #cats > 1 then
		cats = remove_duplicates(cats)
	end

	return text, fail, cats, subbedChars
end

local function normalize(text, sc)
	text = sc:fixDiscouragedSequences(text)
	return sc:toFixedNFD(text)
end

-- Check if the raw text is an unsupported title, and if so return that. Otherwise, remove HTML entities. We do the pre-conversion to avoid loading the unsupported title list unnecessarily.
local function checkNoEntities(self, text)
	local textNoEnc = decode_entities(text)
	if textNoEnc ~= text and self:loadData("Module:links/data").unsupported_titles[text] then
		return text
	else
		return textNoEnc
	end
end

-- Reinsert any formatting that was temporarily substituted.
local function undoTempSubstitutions(text, subbedChars)
	local pe = require("Module:string utilities").pattern_escape
	for i = 1, #subbedChars do
		local byte2 = floor(i / 4096) % 64 + 128
		local byte3 = floor(i / 64) % 64 + 128
		local byte4 = i % 64 + 128
		text = gsub(text, "\244[" .. char(byte2) .. char(byte2+8) .. "]" .. char(byte3) .. char(byte4), pe(subbedChars[i]))
	end
	text = gsub(text, "\1", "%[%[")
		:gsub("\2", "%]%]")
	return text
end

-- If no script object is provided (or if it's invalid or None), get one.
local function checkScript(text, self, sc)
	if not check_object("script", true, sc) or sc:getCode() == "None" then
		return self:findBestScript(text)
	else
		return sc
	end
end

--[==[Create the form used as as a basis for display text and transliteration.]==]
local function processDisplayText(text, self, sc, keepCarets, keepPrefixes)
	local subbedChars = {}
	text, subbedChars = doTempSubstitutions(text, subbedChars, keepCarets)

	text = decode_uri(text, "PATH")
	text = checkNoEntities(self, text)

	sc = checkScript(text, self, sc)
	local fail, cats
	text = normalize(text, sc)
	text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, self._rawData.display_text, "makeDisplayText")

	text = removeCarets(text, sc)

	-- Remove any interwiki link prefixes (unless they have been escaped or this has been disabled).
	if find(text, ":") and not keepPrefixes then
		local rep
		repeat
			text, rep = gsub(text, "\\\\(\\*:)", "\3%1")
		until rep == 0
		text = gsub(text, "\\:", "\4")
		while true do
			local prefix = gsub(text, "^(.-):.+", function(m1)
				return gsub(m1, "\244[\128-\191]*", "")
			end)
			if not prefix or prefix == text then
				break
			end
			local lower_prefix = ulower(prefix)
			if not (self:loadData("Module:data/interwikis")[lower_prefix] or prefix == "") then
				break
			end
			text = gsub(text, "^(.-):(.*)", function(m1, m2)
				local ret = {}
				for subbedChar in gmatch(m1, "\244[\128-\191]*") do
					insert(ret, subbedChar)
				end
				return concat(ret) .. m2
			end)
		end
		text = gsub(text, "\3", "\\")
			:gsub("\4", ":")
	end

	return text, fail, cats, subbedChars
end

function export.err(lang_code, param, code_desc, template_tag, not_real_lang)
	local ordinals = {
		"first", "second", "third", "fourth", "fifth", "sixth",
		"seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth",
		"thirteenth", "fourteenth", "fifteenth", "sixteenth", "seventeenth",
		"eighteenth", "nineteenth", "twentieth"
	}
	
	code_desc = code_desc or "language code"
	
	if not template_tag then
		template_tag = ""
	else
		if type(template_tag) ~= "string" then
			template_tag = template_tag()
		end
		template_tag = " (Original template: " .. template_tag .. ")"
	end
	local function err(msg)
		error(msg .. template_tag, 3)
	end
	local param_type = type(param)
	local in_the_param
	if param == true then
		-- handled specially below
		in_the_param = ""
	else
		if param_type == "number" then
			param = ordinals[param] .. " parameter"
		elseif param_type == "string" then
			param = 'parameter "' .. param .. '"'
		else
			err("The parameter name is "
					.. (param_type == "table" and "a table" or tostring(param))
					.. ", but it should be a number or a string.")
		end
		in_the_param = " in the " .. param
	end
	
	if not lang_code or lang_code == "" then
		if param == true then
			err("The " .. code_desc .. " is missing.")
		else
			err("The " .. param .. " (" .. code_desc .. ") is missing.")
		end
	elseif type(lang_code) ~= "string" then
		err("The " .. code_desc .. in_the_param .. " is supposed to be a string but is a " .. type(lang_code) .. ".")
	-- Can use string.find because language codes only contain ASCII.
	elseif not_real_lang or lang_code:find("^%l%l%l?$")
			or lang_code:find("^%l%l%l%-%l%l%l$")
			or lang_code:find("^%l%l%l%-%l%l%l%-%l%l%l$") then
		err("The " .. code_desc .. " \"" .. lang_code .. "\"" .. in_the_param .. " is not valid.")
	else
		err("Please specify a " .. code_desc .. in_the_param .. ". The value \"" .. lang_code .. "\" is not valid.")
	end
end

local function do_entry_name_or_sort_key_replacements(text, replacements)
	if replacements.from then
		for i, from in ipairs(replacements.from) do
			local to = replacements.to[i] or ""
			text = mw.ustring.gsub(text, from, to)
		end
	end
	
	if replacements.remove_diacritics then
		text = mw.ustring.toNFD(text)
		text = mw.ustring.gsub(text,
			'[' .. replacements.remove_diacritics .. ']',
			'')
		text = mw.ustring.toNFC(text)
	end
	
	return text
end

local Language = {}

function Language:getCode()
	return self._code
end


function Language:getCanonicalName()
	return self._rawData[1] or self._rawData.canonicalName
end


function Language:getDisplayForm()
	return self:getCanonicalName()
end

function Language:getMainCategoryName()
	return self._rawData["main_category"] or "lemma"
end

function Language:getOtherNames(onlyOtherNames)
	self:loadInExtraData()
	return require("Module:language-like").getOtherNames(self, onlyOtherNames)
end


function Language:getAliases()
	self:loadInExtraData()
	return self._extraData.aliases or {}
end


function Language:getVarieties(flatten)
	self:loadInExtraData()
	return require("Module:language-like").getVarieties(self, flatten)
end


function Language:getType()
	return self._rawData.type or "regular"
end


function Language:getWikimediaLanguages()
	if not self._wikimediaLanguageObjects then
		local m_wikimedia_languages = require("Module:wikimedia languages")
		self._wikimediaLanguageObjects = {}
		local wikimedia_codes = self._rawData.wikimedia_codes or { self._code }
		
		for _, wlangcode in ipairs(wikimedia_codes) do
			table.insert(self._wikimediaLanguageObjects, m_wikimedia_languages.getByCode(wlangcode))
		end
	end
	
	return self._wikimediaLanguageObjects
end

function Language:getWikipediaArticle()
	if self._rawData.wikipedia_article then
		return self._rawData.wikipedia_article 
	elseif self._wikipedia_article then
		return self._wikipedia_article
	elseif self:getWikidataItem() and mw.wikibase then
		self._wikipedia_article = mw.wikibase.sitelink(self:getWikidataItem(), 'enwiki')
	end
	if not self._wikipedia_article then
		self._wikipedia_article = mw.ustring.gsub(self:getCategoryName(), "Creole language", "Creole")
	end
	return self._wikipedia_article
end

function Language:makeWikipediaLink()
	return "[[w:" .. self:getWikipediaArticle() .. "|" .. self:getCanonicalName() .. "]]"
end

function Language:getWikidataItem()
	local item = self._rawData[2]
	
	if type(item) == "number" then
		return "Q" .. item
	else
		return item
	end
end

function Language:getScripts()
	if not self._scriptObjects then
		local m_scripts = require("Module:scripts")
		self._scriptObjects = {}
		
		for _, sc in ipairs(self:getScriptCodes()) do
			table.insert(self._scriptObjects, m_scripts.getByCode(sc))
		end
	end
	
	return self._scriptObjects
end

function Language:getScriptCodes()
	return self._rawData.scripts or self._rawData[4] or { "None" }
end

function Language:getFamily()
	if self._familyObject then
		return self._familyObject
	end
		
	local family = self._rawData[3] or self._rawData.family 
	if family then
		self._familyObject = require("Module:families").getByCode(family)
	end
	
	return self._familyObject
end

--[==[Returns the family code in the language's data file.]==]
function Language:getFamilyCode()
	local family = self._familyCode
	if family == nil then
		-- If the value is nil, it's cached as false.
		family = self._rawData[3] or false
		self._familyCode = family
	end
	return family or nil
end

function Language:getFamilyName()
	local family = self._familyName
	if family == nil then
		family = self:getFamily()
		-- If the value is nil, it's cached as false.
		family = family and family:getCanonicalName() or false
		self._familyName = family
	end
	return family or nil
end

--[==[Check whether the language belongs to `family` (which can be a family code or object). A list of objects can be given in place of `family`; in that case, return true if the language belongs to any of the specified families. Note that some languages (in particular, certain creoles) can have multiple immediate ancestors potentially belonging to different families; in that case, return true if the language belongs to any of the specified families.]==]
function Language:inFamily(...)
	--check_object("family", nil, ...)
	for _, family in ipairs{...} do
		if type(family) == "table" then
			family = family:getCode()
		end
		local self_family_code = self:getFamilyCode()
		if not self_family_code then
			return false
		elseif self_family_code == family then
			return true
		end
		local self_family = self:getFamily()
		if self_family:inFamily(family) then
			return true
		-- If the family isn't a real family (e.g. creoles) check any ancestors.
		elseif self_family:getFamilyCode() == "qfa-not" then
			local ancestors = self:getAncestors()
			for _, ancestor in ipairs(ancestors) do
				if ancestor:inFamily(family) then
					return true
				end
			end
		end
	end
	return false
end

function Language:getParent()
	local parent = self._parentObject
	if parent == nil then
		parent = self:getParentCode()
		-- If the value is nil, it's cached as false.
		parent = parent and export.getByCode(parent, nil, true, true, self._useRequire) or false
		self._parentObject = parent
	end
	return parent or nil
end

function Language:getParentCode()
	local parent = self._parentCode
	if parent == nil then
		-- If the value is nil, it's cached as false.
		parent = self._rawData[5] or false
		self._parentCode = parent
	end
	return parent or nil
end

function Language:getParentName()
	local parent = self._parentName
	if parent == nil then
		parent = self:getParent()
		-- If the value is nil, it's cached as false.
		parent = parent and parent:getCanonicalName() or false
		self._parentName = parent
	end
	return parent or nil
end

--[==[Given some text, this function iterates through the scripts of a given language and tries to find the script that best matches the text. It returns a {{code|lua|Script}} object representing the script. If no match is found at all, it returns the {{code|lua|None}} script object.]==]
function Language:findBestScript(text, forceDetect)
	local useRequire = self._useRequire
	
	if not text or text == "" or text == "-" then
		return require("Module:scripts").getByCode("None", nil, nil, useRequire)
	end
	
	-- Differs from table returned by getScriptCodes, as Hants is not normalized into its constituents.
	codes = table.concat(self._rawData["scripts"],", ")
	codes = codes and split(codes, ",", true, true) or {"None"}
	self._bestScriptCodes = codes
	
	local first_sc = self._rawData.scripts[1]
	
	if first_sc == "All" then
		return require("Module:scripts").findBestScriptWithoutLang(text)
	end
	
	local get_script = require("Module:scripts").getByCode
	local codes_len = #codes
	
	if not (forceDetect or first_sc == "Hants" or codes_len > 1) then
		first_sc = get_script(first_sc, nil, nil, useRequire)
		local charset = first_sc.characters
		return charset and umatch(text, "[" .. charset .. "]") and first_sc or
			get_script("None", nil, nil, useRequire)
	end
	
	-- Remove all formatting characters.
	text = require("Module:utilities").get_plaintext(text)
	
	-- Remove all spaces and any ASCII punctuation. Some non-ASCII punctuation is script-specific, so can't be removed.
	text = ugsub(text, "[%s!\"#%%&'()*,%-./:;?@[\\%]_{}]+", "")
	if #text == 0 then
		return get_script("None", nil, nil, useRequire)
	end
	
	-- Try to match every script against the text,
	-- and return the one with the most matching characters.
	local bestcount, bestscript, length = 0
	for i = 1, codes_len do
		local sc = codes[i]
		-- Special case for "Hants", which is a special code that represents whichever of "Hant" or "Hans" best matches, or "Hani" if they match equally. This avoids having to list all three. In addition, "Hants" will be treated as the best match if there is at least one matching character, under the assumption that a Han script is desirable in terms that contain a mix of Han and other scripts (not counting those which use Jpan or Kore).
		if sc == "Hants" then
		else
			sc = get_script(sc, nil, nil, useRequire)
			
			if not length then
				length = ulen(text)
			end
			
			-- Count characters by removing everything in the script's charset and comparing to the original length.
			local charset = sc.characters
			local count = charset and length - ulen(ugsub(text, "[" .. charset .. "]+", "")) or 0
			
			if count >= length then
				return sc
			elseif count > bestcount then
				bestcount = count
				bestscript = sc
			end
		end
	end
	
	-- Return best matching script, or otherwise None.
	return bestscript or get_script("None", nil, nil, useRequire)
end

function Language:getParentChain()
	local chain = self._parentChain
	if chain == nil then
		chain = {}
		local parent, n = self:getParent(), 0
		while parent do
			n = n + 1
			chain[n] = parent
			parent = parent:getParent()
		end
		self._parentChain = chain
	end
	return chain
end

function Language:hasParent(...)
	--check_object("language", nil, ...)
	for _, otherlang in ipairs{...} do
		for _, parent in ipairs(self:getParentChain()) do
			if type(otherlang) == "string" then
				if otherlang == parent:getCode() then return true end
			else
				if otherlang:getCode() == parent:getCode() then return true end
			end
		end
	end
	return false
end

--[==[
If the language is etymology-only, this iterates through parents until a full language or family is found, and the
corresponding object is returned. If the language is a full language, then it simply returns itself.
]==]
function Language:getFull()
	local full = self._fullObject
	if full == nil then
		full = self:getFullCode()
		full = full == self._code and self or
			export.getByCode(full, nil, nil, nil, self._useRequire)
		self._fullObject = full
	end
	return full
end

--[==[
If the language is an etymology-only language, this iterates through parents until a full language or family is
found, and the corresponding code is returned. If the language is a full language, then it simply returns the
language code.
]==]
function Language:getFullCode()
	return self._fullCode or self._code
end

--[==[
If the language is an etymology-only language, this iterates through parents until a full language or family is
found, and the corresponding canonical name is returned. If the language is a full language, then it simply returns
the canonical name of the language.
]==]
function Language:getFullName()
	local full = self._fullName
	if full == nil then
		full = self:getFull():getCanonicalName()
		self._fullName = full
	end
	return full
end



function Language:getAncestors()
	if not self._ancestorObjects then
		self._ancestorObjects = {}
		
		if self._rawData.ancestors then
			for _, ancestor in ipairs(self._rawData.ancestors) do
				table.insert(self._ancestorObjects, export.getByCode(ancestor) or require("Module:etymology languages").getByCode(ancestor))
			end
		else
			local fam = self:getFamily()
			local protoLang = fam and fam:getProtoLanguage() or nil
			
			-- For the case where the current language is the proto-language
			-- of its family, we need to step up a level higher right from the start.
			if protoLang and protoLang:getCode() == self:getCode() then
				fam = fam:getFamily()
				protoLang = fam and fam:getProtoLanguage() or nil
			end
			
			while not protoLang and not (not fam or fam:getCode() == "qfa-not") do
				fam = fam:getFamily()
				protoLang = fam and fam:getProtoLanguage() or nil
			end
			
			table.insert(self._ancestorObjects, protoLang)
		end
	end
	
	return self._ancestorObjects
end

local function iterateOverAncestorTree(node, func)
	for _, ancestor in ipairs(node:getAncestors()) do
		if ancestor then
			local ret = func(ancestor) or iterateOverAncestorTree(ancestor, func)
			if ret then
				return ret
			end
		end
	end
end

--[==[Generates alternative forms using a specified method, and returns them as a table. If no method is specified, returns a table containing only the input term.]==]
function Language:generateForms(text, sc)
	if self._rawData.generate_forms then
		sc = checkScript(text, self, sc)
		return require("Module:" .. self._rawData.generate_forms).generateForms(text, self._code, sc:getCode())
	else
		return {text}
	end
end

function Language:getAncestorChain()
	if not self._ancestorChain then
		self._ancestorChain = {}
		local step = #self:getAncestors() == 1 and self:getAncestors()[1] or nil
		
		while step do
			table.insert(self._ancestorChain, 1, step)
			step = #step:getAncestors() == 1 and step:getAncestors()[1] or nil
		end
	end
	
	return self._ancestorChain
end


function Language:hasAncestor(otherlang)
	local function compare(ancestor)
		return ancestor:getCode() == otherlang:getCode()
	end
	
	return iterateOverAncestorTree(self, compare) or false
end


function Language:getCategoryName(nocap)
	local name = self:getCanonicalName()
	
	-- If the name already has "language" in it, don't add it.
	if not name:find("[Ll]anguage$") then
		name = name .. " language"
	end
	if not nocap then
		name = mw.getContentLanguage():ucfirst(name)
	end
	return name
end


function Language:makeCategoryLink()
	return "[[:Category:" .. self:getCategoryName() .. "|" .. self:getDisplayForm() .. "]]"
end


function Language:getStandardCharacters()
	return self._rawData.standardChars
end


function Language:makeEntryName(text)
	text = mw.ustring.match(text, "^[¿¡]?(.-[^%s%p].-)%s*[؟?!;՛՜ ՞ ՟?!︖︕।॥။၊་།]?$") or text
	
	if self:getCode() == "ar" then
		local U = mw.ustring.char
		local taTwiil = U(0x640)
		local waSla = U(0x671)
		-- diacritics ordinarily removed by entry_name replacements
		local Arabic_diacritics = U(0x64B, 0x64C, 0x64D, 0x64E, 0x64F, 0x650, 0x651, 0x652, 0x670)
		
		if text == waSla or mw.ustring.find(text, "^" .. taTwiil .. "?[" .. Arabic_diacritics .. "]" .. "$") then
			return text
		end
	end
	
	if type(self._rawData.entry_name) == "table" then
		text = do_entry_name_or_sort_key_replacements(text, self._rawData.entry_name)
	end
	
	return text
end


-- Return true if the language has display processing enabled, i.e. lang:makeDisplayText()
-- does non-trivial processing.
function Language:hasDisplayProcessing()
	return not not self._rawData.display
end

function Language:getTypes()
	local types = self._types
	if types == nil then
		types = {language = true}
		if self:getCode() == self._code then
			types.full = true
		else
			types["etymology-only"] = true
		end
		local type = self._rawData.type or "regular"
		for t in gmatch(type, "[^,]+") do
			types[t] = true
		end
		self._types = types
	end
	return types
end

--[==[Given a list of types as strings, returns true if the language has all of them.]==]
function Language:hasType(...)
	local args, types = {...}, self:getTypes()
	for i = 1, #args do
		if not types[args[i]] then
			return false
		end
	end
	return true
end

-- Apply display-text replacements to `text`, if any.
function Language:makeDisplayText(text)
	if type(self._rawData.display) == "table" then
		text = do_entry_name_or_sort_key_replacements(text, self._rawData.display)
	end
	
	return text
end


-- Add to data tables?
local has_dotted_undotted_i = {
	["az"] = true,
	["crh"] = true,
	["gag"] = true,
	["kaa"] = true,
	["tt"] = true,
	["tr"] = true,
	["zza"] = true,
}

function Language:makeSortKey(name, sc)
	if has_dotted_undotted_i[self:getCode()] then
		name = name:gsub("I", "ı")
	end
	
	name = mw.ustring.lower(name)
	
	-- Remove initial hyphens and *
	local hyphens_regex = "^[-־ـ*]+(.)"
	name = mw.ustring.gsub(name, hyphens_regex, "%1")
	
	-- If there are language-specific rules to generate the key, use those
	if type(self._rawData.sort_key) == "table" then
		name = do_entry_name_or_sort_key_replacements(name, self._rawData.sort_key)
	elseif type(self._rawData.sort_key) == "string" then
		name = require("Module:" .. self._rawData.sort_key).makeSortKey(name, self:getCode(), sc and sc:getCode())
	end
	
	-- Remove parentheses, as long as they are either preceded or followed by something
	name = mw.ustring.gsub(name, "(.)[()]+", "%1")
	name = mw.ustring.gsub(name, "[()]+(.)", "%1")
	
	if has_dotted_undotted_i[self:getCode()] then
		name = name:gsub("i", "İ")
	end
	
	return mw.ustring.upper(name)
end

function Language:overrideManualTranslit()
	if self._rawData.override_translit then
		return true
	else
		return false
	end
end


function Language:transliterate(text, sc, module_override)
	-- If there is no text, or the language doesn't have transliteration data and there's no override, return nil.
	if not (self._rawData.translit_module or module_override) then
		return nil, false, {}
	elseif (not text) or text == "" or text == "-" then
		return text, false, {}
	end
	-- If the script is not transliteratable (and no override is given), return nil.
	sc = checkScript(text, self, sc)
	if not (sc:isTransliterated() or module_override) then
		return nil, true, {}
	end

	-- Remove any strip markers.
	text = mw.text.unstrip(text)

	-- Get the display text with the keepCarets flag set.
	local fail, cats, subbedChars
	text, fail, cats, subbedChars = processDisplayText(text, self, sc, true)

	-- Transliterate (using the module override if applicable).
	text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, true, self, sc, module_override or self._rawData.translit_module, "tr")
	
	if not text then
		return nil, true, cats
	end
	
	-- Incomplete transliterations return nil.
	local charset = sc.characters
	if charset and umatch(text, "[" .. charset .. "]") then
		-- Remove any characters in Latin, which includes Latin characters also included in other scripts (as these are false positives), as well as any PUA substitutions. Anything remaining should only be script code "None" (e.g. numerals).
		local check_text = ugsub(text, "[" .. require("Module:scripts").getByCode("Latn").characters .. "􀀀-􏿽]+", "")
		-- Set none_is_last_resort_only flag, so that any non-None chars will cause a script other than "None" to be returned.
		if require("Module:scripts").findBestScriptWithoutLang(check_text, true):getCode() ~= "None" then
			return nil, true, cats
		end
	end

	text = escape_risky_characters(text)
	text = undoTempSubstitutions(text, subbedChars)

	-- If the script does not use capitalization, then capitalize any letters of the transliteration which are immediately preceded by a caret (and remove the caret).
	if text and not sc:hasCapitalization() and text:find("^", 1, true) then
		text = processCarets(text, "%^([\128-\191\244]*%*?)([^\128-\191\244][\128-\191]*)", function(m1, m2)
			return m1 .. uupper(m2)
		end)
	end

	-- Track module overrides.
	if module_override ~= nil then
		track("module_override")
	end

	fail = text == nil and (not not fail) or false

	return text, fail, cats
end

function Language:hasTranslit()
	return self._rawData.translit_module and true or false
end


function Language:link_tr()
	return self._rawData.link_tr and true or false
end


function Language:toJSON()
	local entryNamePatterns = nil
	local entryNameRemoveDiacritics = nil
	
	if self._rawData.entry_name then
		entryNameRemoveDiacritics = self._rawData.entry_name.remove_diacritics
		if self._rawData.entry_name.from then
			entryNamePatterns = {}
			for i, from in ipairs(self._rawData.entry_name.from) do
				local to = self._rawData.entry_name.to[i] or ""
				table.insert(entryNamePatterns, { from = from, to = to })
			end
		end
	end
	
	local ret = {
		ancestors = self._rawData.ancestors,
		canonicalName = self:getCanonicalName(),
		categoryName = self:getCategoryName("nocap"),
		code = self._code,
		entryNamePatterns = entryNamePatterns,
		entryNameRemoveDiacritics = entryNameRemoveDiacritics,
		family = self._rawData[3] or self._rawData.family,
		otherNames = self:getOtherNames(true),
		aliases = self:getAliases(),
		varieties = self:getVarieties(),
		scripts = self._rawData.scripts or self._rawData[4],
		type = self:getType(),
		wikimediaLanguages = self._rawData.wikimedia_codes,
		wikidataItem = self:getWikidataItem(),
	}
	
	return require("Module:JSON").toJSON(ret)
end


-- Do NOT use these methods!
-- All uses should be pre-approved on the talk page!
function Language:getRawData()
	return self._rawData
end

function Language:getRawExtraData()
	self:loadInExtraData()
	return self._extraData
end

Language.__index = Language


function export.getDataModuleName(code)
	if code:find("^%l%l$") then
		return "languages/data2"
	elseif code:find("^%l%l%l$") then
		local prefix = code:sub(1, 1)
		return "languages/data3/" .. prefix
	elseif code:find("^[%l-]+$") then
		return "languages/datax"
	else
		return nil
	end
end


function export.getExtraDataModuleName(code)
	if code:find("^%l%l$") then
		return "languages/extradata2"
	elseif code:find("^%l%l%l$") then
		local prefix = code:sub(1, 1)
		return "languages/extradata3/" .. prefix
	elseif code:find("^[%l-]+$") then
		return "languages/extradatax"
	else
		return nil
	end
end


local function getRawLanguageData(code)
	local modulename = export.getDataModuleName(code)
	return modulename and mw.loadData("Module:" .. modulename)[code] or nil
end


local function getRawExtraLanguageData(code)
	local modulename = export.getExtraDataModuleName(code)
	return modulename and mw.loadData("Module:" .. modulename)[code] or nil
end


function Language:loadInExtraData()
	if not self._extraData then
		-- load extra data from module and assign to meta table
		-- use empty table as a fallback if extra data is nil
		local meta = getmetatable(self)
		meta._extraData = getRawExtraLanguageData(self._code) or {}
		setmetatable(self, meta)
	end
end


function export.makeObject(code, data)
	return data and setmetatable({ _rawData = data, _code = code }, Language) or nil
end


function export.getByCode(code, paramForError, allowEtymLang, allowFamily)
	if type(code) ~= "string" then
		error("The function getByCode expects a string as its first argument, but received " .. (code == nil and "nil" or "a " .. type(code)) .. ".")
	end
	
	local retval = export.makeObject(code, getRawLanguageData(code))
	if not retval and allowEtymLang then
		retval = require("Module:etymology languages").getByCode(code)
	end
	if not retval and allowFamily then
		retval = require("Module:families").getByCode(code)
	end
	if not retval and paramForError then
		local codetext = nil
		if allowEtymLang and allowFamily then
			codetext = "language, etymology language or family code"
		elseif allowEtymLang then
			codetext = "language or etymology language code"
		elseif allowFamily then
			codetext = "language or family code"
		else
			codetext = "language code"
		end
		export.err(code, paramForError, codetext)
	end
	return retval
end


function export.getByName(name, errorIfInvalid)
	local byName = mw.loadData("Module:languages/by name")
	local code = byName.all and byName.all[name] or byName[name]
	
	if not code then
		if errorIfInvalid then
			error("The language name \"" .. name .. "\" is not valid.")
		else
			return nil
		end
	end
	
	return export.makeObject(code, getRawLanguageData(code))
end

function export.getByCanonicalName(name, errorIfInvalid, allowEtymLang, allowFamily)
	local byName = mw.loadData("Module:languages/canonical names")
	local code = byName and byName[name]

	local retval = code and export.makeObject(code, getRawLanguageData(code)) or nil
	if not retval and allowEtymLang then
		retval = require("Module:etymology languages").getByCanonicalName(name)
	end
	if not retval and allowFamily then
		local famname = name:match("^(.*) languages$")
		famname = famname or name
		retval = require("Module:families").getByCanonicalName(famname)
	end
	if not retval and errorIfInvalid then
		local text
		if allowEtymLang and allowFamily then
			text = "language, etymology language or family name"
		elseif allowEtymLang then
			text = "language or etymology language name"
		elseif allowFamily then
			text = "language or family name"
		else
			text = "language name"
		end
		error("The " .. text .. " \"" .. name .. "\" is not valid.")
	end
	return retval
end

function export.iterateAll()
	mw.incrementExpensiveFunctionCount()
	local m_data = mw.loadData("Module:languages/alldata")
	local func, t, var = pairs(m_data)
	
	return function()
		local code, data = func(t, var)
		return export.makeObject(code, data)
	end
end

--[[	If language is an etymology language, iterates through parent languages
		until it finds a non-etymology language. ]]
function export.getNonEtymological(lang)
	while lang:getType() == "etymology language" do
		local parentCode = lang:getParentCode()
		lang = export.getByCode(parentCode)
			or require("Module:etymology languages").getByCode(parentCode)
			or require("Module:families").getByCode(parentCode)
	end
	
	return lang
end

return export