Module:languages

From Linguifex
Revision as of 11:03, 7 August 2024 by Sware (talk | contribs)
Jump to navigation Jump to search


local require = require

local m_str_utils = require("Module:string utilities")
local m_table = require("Module:table")
local mw = mw
local string = string
local table = table
local ustring = mw.ustring

local char = string.char
local check_object = require("Module:utilities").check_object
local concat = table.concat
local decode_entities = m_str_utils.decode_entities
local decode_uri = m_str_utils.decode_uri
local find = string.find
local floor = math.floor
local gmatch = string.gmatch
local gsub = string.gsub
local insert = table.insert
local ipairs = ipairs
local list_to_set = m_table.listToSet
local load_data = mw.loadData
local match = string.match
local next = next
local pairs = pairs
local remove = table.remove
local remove_duplicates = m_table.removeDuplicates
local select = select
local setmetatable = setmetatable
local shallowcopy = m_table.shallowcopy
local split = m_str_utils.split
local sub = string.sub
local type = type
local ugsub = ustring.gsub
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local umatch = ustring.match
local uupper = m_str_utils.upper

local export = {}

--[=[
Throw an error for an invalid language code or script code.

`lang_code` (required) is the bad code and can be nil or a non-string.

`param` (required) is the name of the parameter in which the code was contained. It can be a string, a number
	(for a numeric param, in which case the param will show up in the error message as an ordinal such as
	"first" or "second"), or `true` if no parameter can be clearly identified.

`code_desc` (optional) is text describing what the code is; by default, "language code".

`template_text` (optional) is a string specifying the template that generated the error, or a function
	to generate this string. If given, it will be displayed in the error message.

`not_real_lang` (optional), if given, indicates that the code is not in the form of a language code
	(e.g. it's a script code). Normally, this function checks for things that could plausibly be a language code:
	two or three lowercase letters, two or three groups of three lowercase letters with hyphens between them.
	If such a pattern is found, a different error message is displayed (indicating an invalid code) than otherwise
	(indicating a missing code). If `not_real_lang` is given, this check is suppressed.
]=]

-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.).
local function escape_risky_characters(text)
	local encode_entities = require("Module:string/encode entities")
	-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software.
	if umatch(text, "^%s*$") then
		return encode_entities(text, text)
	else
		return encode_entities(text, "!#%&*+/:;<=>?@[\\]_{|}")
	end
end

-- Process carets (and any escapes). Default to simple removal, if no pattern/replacement is given.
local function processCarets(text, pattern, repl)
	local rep
	repeat
		text, rep = gsub(text, "\\\\(\\*^)", "\3%1")
	until rep == 0
	return text:gsub("\\^", "\4")
		:gsub(pattern or "%^", repl or "")
		:gsub("\3", "\\")
		:gsub("\4", "^")
end

-- Remove carets if they are used to capitalize parts of transliterations (unless they have been escaped).
local function removeCarets(text, sc)
	if not sc:hasCapitalization() and sc:isTransliterated() and text:find("^", 1, true) then
		return processCarets(text)
	else
		return text
	end
end

-- Temporarily convert various formatting characters to PUA to prevent them from being disrupted by the substitution process.
local function doTempSubstitutions(text, subbedChars, keepCarets, noTrim)
	-- Clone so that we don't insert any extra patterns into the table in package.loaded. For some reason, using require seems to keep memory use down; probably because the table is always cloned.
	local patterns = shallowcopy(require("Module:languages/data/patterns"))
	if keepCarets then
		insert(patterns, "((\\+)%^)")
		insert(patterns, "((%^))")
	end
	-- Ensure any whitespace at the beginning and end is temp substituted, to prevent it from being accidentally trimmed. We only want to trim any final spaces added during the substitution process (e.g. by a module), which means we only do this during the first round of temp substitutions.
	if not noTrim then
		insert(patterns, "^([\128-\191\244]*(%s+))")
		insert(patterns, "((%s+)[\128-\191\244]*)$")
	end
	-- Pre-substitution, of "[[" and "]]", which makes pattern matching more accurate.
	text = gsub(text, "%f[%[]%[%[", "\1")
		:gsub("%f[%]]%]%]", "\2")
	local i, pe = #subbedChars, require("Module:string utilities").pattern_escape
	for _, pattern in ipairs(patterns) do
		-- Patterns ending in \0 stand are for things like "[[" or "]]"), so the inserted PUA are treated as breaks between terms by modules that scrape info from pages.
		local term_divider
		pattern = gsub(pattern, "%z$", function(divider)
			term_divider = divider == "\0"
			return ""
		end)
		text = gsub(text, pattern, function(...)
			local m = {...}
			local m1New = m[1]
			for k = 2, #m do
				local n = i + k - 1
				subbedChars[n] = m[k]
				local byte2 = floor(n / 4096) % 64 + (term_divider and 128 or 136)
				local byte3 = floor(n / 64) % 64 + 128
				local byte4 = n % 64 + 128
				m1New = gsub(m1New, pe(m[k]), "\244" .. char(byte2) .. char(byte3) .. char(byte4), 1)
			end
			i = i + #m - 1
			return m1New
		end)
	end
	text = gsub(text, "\1", "%[%[")
		:gsub("\2", "%]%]")
	return text, subbedChars
end

-- Split the text into sections, based on the presence of temporarily substituted formatting characters, then iterate over each one to apply substitutions. This avoids putting PUA characters through language-specific modules, which may be unequipped for them.
local function iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, substitution_data, function_name)
	local pe = require("Module:string utilities").pattern_escape
	local fail, cats, sections = nil, {}
	-- See [[Module:languages/data]].
	if not find(text, "\244") or self:loadData("Module:languages/data").contiguous_substitution[self._code] then
		sections = {text}
	else
		sections = split(text, "\244[\128-\143][\128-\191]*", true)
	end
	for _, section in ipairs(sections) do
		-- Don't bother processing empty strings or whitespace (which may also not be handled well by dedicated modules).
		if gsub(section, "%s+", "") ~= "" then
			local sub, sub_fail, sub_cats = require("Module:languages/doSubstitutions")(section, self, sc, substitution_data, function_name)
			-- Second round of temporary substitutions, in case any formatting was added by the main substitution process. However, don't do this if the section contains formatting already (as it would have had to have been escaped to reach this stage, and therefore should be given as raw text).
			if sub and subbedChars then
				local noSub
				for _, pattern in ipairs(require("Module:languages/data/patterns")) do
					if match(section, pattern .. "%z?") then
						noSub = true
					end
				end
				if not noSub then
					sub, subbedChars = doTempSubstitutions(sub, subbedChars, keepCarets, true)
				end
			end
			if (not sub) or sub_fail then
				text = sub
				fail = sub_fail
				cats = sub_cats or {}
				break
			end
			text = sub and gsub(text, pe(section), pe(sub), 1) or text
			if type(sub_cats) == "table" then
				for _, cat in ipairs(sub_cats) do
					insert(cats, cat)
				end
			end
		end
	end

	-- Trim, unless there are only spacing characters, while ignoring any final formatting characters.
	text = text and text:gsub("^([\128-\191\244]*)%s+(%S)", "%1%2")
		:gsub("(%S)%s+([\128-\191\244]*)$", "%1%2")

	-- Remove duplicate categories.
	if #cats > 1 then
		cats = remove_duplicates(cats)
	end

	return text, fail, cats, subbedChars
end

local function normalize(text, sc)
	text = sc:fixDiscouragedSequences(text)
	return sc:toFixedNFD(text)
end

-- Check if the raw text is an unsupported title, and if so return that. Otherwise, remove HTML entities. We do the pre-conversion to avoid loading the unsupported title list unnecessarily.
local function checkNoEntities(self, text)
	local textNoEnc = decode_entities(text)
	if textNoEnc ~= text and self:loadData("Module:links/data").unsupported_titles[text] then
		return text
	else
		return textNoEnc
	end
end

-- Reinsert any formatting that was temporarily substituted.
local function undoTempSubstitutions(text, subbedChars)
	local pe = require("Module:string utilities").pattern_escape
	for i = 1, #subbedChars do
		local byte2 = floor(i / 4096) % 64 + 128
		local byte3 = floor(i / 64) % 64 + 128
		local byte4 = i % 64 + 128
		text = gsub(text, "\244[" .. char(byte2) .. char(byte2+8) .. "]" .. char(byte3) .. char(byte4), pe(subbedChars[i]))
	end
	text = gsub(text, "\1", "%[%[")
		:gsub("\2", "%]%]")
	return text
end

-- If no script object is provided (or if it's invalid or None), get one.
local function checkScript(text, self, sc)
	if not check_object("script", true, sc) or sc:getCode() == "None" then
		return self:findBestScript(text)
	else
		return sc
	end
end

--[==[Create the form used as as a basis for display text and transliteration.]==]
local function processDisplayText(text, self, sc, keepCarets, keepPrefixes)
	local subbedChars = {}
	text, subbedChars = doTempSubstitutions(text, subbedChars, keepCarets)

	text = decode_uri(text, "PATH")
	text = checkNoEntities(self, text)

	sc = checkScript(text, self, sc)
	local fail, cats
	text = normalize(text, sc)
	text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, self._rawData.display_text, "makeDisplayText")

	text = removeCarets(text, sc)

	-- Remove any interwiki link prefixes (unless they have been escaped or this has been disabled).
	if find(text, ":") and not keepPrefixes then
		local rep
		repeat
			text, rep = gsub(text, "\\\\(\\*:)", "\3%1")
		until rep == 0
		text = gsub(text, "\\:", "\4")
		while true do
			local prefix = gsub(text, "^(.-):.+", function(m1)
				return gsub(m1, "\244[\128-\191]*", "")
			end)
			if not prefix or prefix == text then
				break
			end
			local lower_prefix = ulower(prefix)
			if not (self:loadData("Module:data/interwikis")[lower_prefix] or prefix == "") then
				break
			end
			text = gsub(text, "^(.-):(.*)", function(m1, m2)
				local ret = {}
				for subbedChar in gmatch(m1, "\244[\128-\191]*") do
					insert(ret, subbedChar)
				end
				return concat(ret) .. m2
			end)
		end
		text = gsub(text, "\3", "\\")
			:gsub("\4", ":")
	end

	return text, fail, cats, subbedChars
end

function export.err(lang_code, param, code_desc, template_tag, not_real_lang)
	local ordinals = {
		"first", "second", "third", "fourth", "fifth", "sixth",
		"seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth",
		"thirteenth", "fourteenth", "fifteenth", "sixteenth", "seventeenth",
		"eighteenth", "nineteenth", "twentieth"
	}
	
	code_desc = code_desc or "language code"
	
	if not template_tag then
		template_tag = ""
	else
		if type(template_tag) ~= "string" then
			template_tag = template_tag()
		end
		template_tag = " (Original template: " .. template_tag .. ")"
	end
	local function err(msg)
		error(msg .. template_tag, 3)
	end
	local param_type = type(param)
	local in_the_param
	if param == true then
		-- handled specially below
		in_the_param = ""
	else
		if param_type == "number" then
			param = ordinals[param] .. " parameter"
		elseif param_type == "string" then
			param = 'parameter "' .. param .. '"'
		else
			err("The parameter name is "
					.. (param_type == "table" and "a table" or tostring(param))
					.. ", but it should be a number or a string.")
		end
		in_the_param = " in the " .. param
	end
	
	if not lang_code or lang_code == "" then
		if param == true then
			err("The " .. code_desc .. " is missing.")
		else
			err("The " .. param .. " (" .. code_desc .. ") is missing.")
		end
	elseif type(lang_code) ~= "string" then
		err("The " .. code_desc .. in_the_param .. " is supposed to be a string but is a " .. type(lang_code) .. ".")
	-- Can use string.find because language codes only contain ASCII.
	elseif not_real_lang or lang_code:find("^%l%l%l?$")
			or lang_code:find("^%l%l%l%-%l%l%l$")
			or lang_code:find("^%l%l%l%-%l%l%l%-%l%l%l$") then
		err("The " .. code_desc .. " \"" .. lang_code .. "\"" .. in_the_param .. " is not valid.")
	else
		err("Please specify a " .. code_desc .. in_the_param .. ". The value \"" .. lang_code .. "\" is not valid.")
	end
end

local function do_entry_name_or_sort_key_replacements(text, replacements)
	if replacements.from then
		for i, from in ipairs(replacements.from) do
			local to = replacements.to[i] or ""
			text = mw.ustring.gsub(text, from, to)
		end
	end
	
	if replacements.remove_diacritics then
		text = mw.ustring.toNFD(text)
		text = mw.ustring.gsub(text,
			'[' .. replacements.remove_diacritics .. ']',
			'')
		text = mw.ustring.toNFC(text)
	end
	
	return text
end

local Language = {}

function Language:getCode()
	return self._code
end


function Language:getCanonicalName()
	return self._rawData[1] or self._rawData.canonicalName
end


function Language:getDisplayForm()
	return self:getCanonicalName()
end

function Language:getMainCategoryName()
	return self._rawData["main_category"] or "lemma"
end

function Language:getOtherNames(onlyOtherNames)
	self:loadInExtraData()
	return require("Module:language-like").getOtherNames(self, onlyOtherNames)
end


function Language:getAliases()
	self:loadInExtraData()
	return self._extraData.aliases or {}
end


function Language:getVarieties(flatten)
	self:loadInExtraData()
	return require("Module:language-like").getVarieties(self, flatten)
end


function Language:getType()
	return self._rawData.type or "regular"
end


function Language:getWikimediaLanguages()
	if not self._wikimediaLanguageObjects then
		local m_wikimedia_languages = require("Module:wikimedia languages")
		self._wikimediaLanguageObjects = {}
		local wikimedia_codes = self._rawData.wikimedia_codes or { self._code }
		
		for _, wlangcode in ipairs(wikimedia_codes) do
			table.insert(self._wikimediaLanguageObjects, m_wikimedia_languages.getByCode(wlangcode))
		end
	end
	
	return self._wikimediaLanguageObjects
end

function Language:getWikipediaArticle()
	if self._rawData.wikipedia_article then
		return self._rawData.wikipedia_article 
	elseif self._wikipedia_article then
		return self._wikipedia_article
	elseif self:getWikidataItem() and mw.wikibase then
		self._wikipedia_article = mw.wikibase.sitelink(self:getWikidataItem(), 'enwiki')
	end
	if not self._wikipedia_article then
		self._wikipedia_article = mw.ustring.gsub(self:getCategoryName(), "Creole language", "Creole")
	end
	return self._wikipedia_article
end

function Language:makeWikipediaLink()
	return "[[w:" .. self:getWikipediaArticle() .. "|" .. self:getCanonicalName() .. "]]"
end

function Language:getWikidataItem()
	local item = self._rawData[2]
	
	if type(item) == "number" then
		return "Q" .. item
	else
		return item
	end
end

function Language:getScripts()
	if not self._scriptObjects then
		local m_scripts = require("Module:scripts")
		self._scriptObjects = {}
		
		for _, sc in ipairs(self:getScriptCodes()) do
			table.insert(self._scriptObjects, m_scripts.getByCode(sc))
		end
	end
	
	return self._scriptObjects
end

function Language:getScriptCodes()
	return self._rawData.scripts or self._rawData[4] or { "None" }
end

function Language:getFamily()
	if self._familyObject then
		return self._familyObject
	end
		
	local family = self._rawData[3] or self._rawData.family 
	if family then
		self._familyObject = require("Module:families").getByCode(family)
	end
	
	return self._familyObject
end

--[==[Returns the family code in the language's data file.]==]
function Language:getFamilyCode()
	local family = self._familyCode
	if family == nil then
		-- If the value is nil, it's cached as false.
		family = self._rawData[3] or false
		self._familyCode = family
	end
	return family or nil
end

function Language:getFamilyName()
	local family = self._familyName
	if family == nil then
		family = self:getFamily()
		-- If the value is nil, it's cached as false.
		family = family and family:getCanonicalName() or false
		self._familyName = family
	end
	return family or nil
end

--[==[Check whether the language belongs to `family` (which can be a family code or object). A list of objects can be given in place of `family`; in that case, return true if the language belongs to any of the specified families. Note that some languages (in particular, certain creoles) can have multiple immediate ancestors potentially belonging to different families; in that case, return true if the language belongs to any of the specified families.]==]
function Language:inFamily(...)
	--check_object("family", nil, ...)
	for _, family in ipairs{...} do
		if type(family) == "table" then
			family = family:getCode()
		end
		local self_family_code = self:getFamilyCode()
		if not self_family_code then
			return false
		elseif self_family_code == family then
			return true
		end
		local self_family = self:getFamily()
		if self_family:inFamily(family) then
			return true
		-- If the family isn't a real family (e.g. creoles) check any ancestors.
		elseif self_family:getFamilyCode() == "qfa-not" then
			local ancestors = self:getAncestors()
			for _, ancestor in ipairs(ancestors) do
				if ancestor:inFamily(family) then
					return true
				end
			end
		end
	end
	return false
end

function Language:getParent()
	local parent = self._parentObject
	if parent == nil then
		parent = self:getParentCode()
		-- If the value is nil, it's cached as false.
		parent = parent and export.getByCode(parent, nil, true, true, self._useRequire) or false
		self._parentObject = parent
	end
	return parent or nil
end

function Language:getParentCode()
	local parent = self._parentCode
	if parent == nil then
		-- If the value is nil, it's cached as false.
		parent = self._rawData[5] or false
		self._parentCode = parent
	end
	return parent or nil
end

function Language:getParentName()
	local parent = self._parentName
	if parent == nil then
		parent = self:getParent()
		-- If the value is nil, it's cached as false.
		parent = parent and parent:getCanonicalName() or false
		self._parentName = parent
	end
	return parent or nil
end

--[==[Given some text, this function iterates through the scripts of a given language and tries to find the script that best matches the text. It returns a {{code|lua|Script}} object representing the script. If no match is found at all, it returns the {{code|lua|None}} script object.]==]
function Language:findBestScript(text, forceDetect)
	local useRequire = self._useRequire
	
	if not text or text == "" or text == "-" then
		return require("Module:scripts").getByCode("None", nil, nil, useRequire)
	end
	
	-- Differs from table returned by getScriptCodes, as Hants is not normalized into its constituents.
	codes = table.concat(self._rawData["scripts"],", ")
	codes = codes and split(codes, ",", true, true) or {"None"}
	self._bestScriptCodes = codes
	
	local first_sc = self._rawData.scripts[1]
	
	if first_sc == "All" then
		return require("Module:scripts").findBestScriptWithoutLang(text)
	end
	
	local get_script = require("Module:scripts").getByCode
	local codes_len = #codes
	
	if not (forceDetect or first_sc == "Hants" or codes_len > 1) then
		first_sc = get_script(first_sc, nil, nil, useRequire)
		local charset = first_sc.characters
		return charset and umatch(text, "[" .. charset .. "]") and first_sc or
			get_script("None", nil, nil, useRequire)
	end
	
	-- Remove all formatting characters.
	text = require("Module:utilities").get_plaintext(text)
	
	-- Remove all spaces and any ASCII punctuation. Some non-ASCII punctuation is script-specific, so can't be removed.
	text = ugsub(text, "[%s!\"#%%&'()*,%-./:;?@[\\%]_{}]+", "")
	if #text == 0 then
		return get_script("None", nil, nil, useRequire)
	end
	
	-- Try to match every script against the text,
	-- and return the one with the most matching characters.
	local bestcount, bestscript, length = 0
	for i = 1, codes_len do
		local sc = codes[i]
		-- Special case for "Hants", which is a special code that represents whichever of "Hant" or "Hans" best matches, or "Hani" if they match equally. This avoids having to list all three. In addition, "Hants" will be treated as the best match if there is at least one matching character, under the assumption that a Han script is desirable in terms that contain a mix of Han and other scripts (not counting those which use Jpan or Kore).
		if sc == "Hants" then
		else
			sc = get_script(sc, nil, nil, useRequire)
			
			if not length then
				length = ulen(text)
			end
			
			-- Count characters by removing everything in the script's charset and comparing to the original length.
			local charset = sc.characters
			local count = charset and length - ulen(ugsub(text, "[" .. charset .. "]+", "")) or 0
			
			if count >= length then
				return sc
			elseif count > bestcount then
				bestcount = count
				bestscript = sc
			end
		end
	end
	
	-- Return best matching script, or otherwise None.
	return bestscript or get_script("None", nil, nil, useRequire)
end

function Language:getParentChain()
	local chain = self._parentChain
	if chain == nil then
		chain = {}
		local parent, n = self:getParent(), 0
		while parent do
			n = n + 1
			chain[n] = parent
			parent = parent:getParent()
		end
		self._parentChain = chain
	end
	return chain
end

function Language:hasParent(...)
	--check_object("language", nil, ...)
	for _, otherlang in ipairs{...} do
		for _, parent in ipairs(self:getParentChain()) do
			if type(otherlang) == "string" then
				if otherlang == parent:getCode() then return true end
			else
				if otherlang:getCode() == parent:getCode() then return true end
			end
		end
	end
	return false
end

--[==[
If the language is etymology-only, this iterates through parents until a full language or family is found, and the
corresponding object is returned. If the language is a full language, then it simply returns itself.
]==]
function Language:getFull()
	local full = self._fullObject
	if full == nil then
		full = self:getFullCode()
		full = full == self._code and self or
			export.getByCode(full, nil, nil, nil, self._useRequire)
		self._fullObject = full
	end
	return full
end

--[==[
If the language is an etymology-only language, this iterates through parents until a full language or family is
found, and the corresponding code is returned. If the language is a full language, then it simply returns the
language code.
]==]
function Language:getFullCode()
	return self._fullCode or self._code
end

--[==[
If the language is an etymology-only language, this iterates through parents until a full language or family is
found, and the corresponding canonical name is returned. If the language is a full language, then it simply returns
the canonical name of the language.
]==]
function Language:getFullName()
	local full = self._fullName
	if full == nil then
		full = self:getFull():getCanonicalName()
		self._fullName = full
	end
	return full
end



function Language:getAncestors()
	if not self._ancestorObjects then
		self._ancestorObjects = {}
		
		if self._rawData.ancestors then
			for _, ancestor in ipairs(self._rawData.ancestors) do
				table.insert(self._ancestorObjects, export.getByCode(ancestor) or require("Module:etymology languages").getByCode(ancestor))
			end
		else
			local fam = self:getFamily()
			local protoLang = fam and fam:getProtoLanguage() or nil
			
			-- For the case where the current language is the proto-language
			-- of its family, we need to step up a level higher right from the start.
			if protoLang and protoLang:getCode() == self:getCode() then
				fam = fam:getFamily()
				protoLang = fam and fam:getProtoLanguage() or nil
			end
			
			while not protoLang and not (not fam or fam:getCode() == "qfa-not") do
				fam = fam:getFamily()
				protoLang = fam and fam:getProtoLanguage() or nil
			end
			
			table.insert(self._ancestorObjects, protoLang)
		end
	end
	
	return self._ancestorObjects
end

local function iterateOverAncestorTree(node, func)
	for _, ancestor in ipairs(node:getAncestors()) do
		if ancestor then
			local ret = func(ancestor) or iterateOverAncestorTree(ancestor, func)
			if ret then
				return ret
			end
		end
	end
end

--[==[Generates alternative forms using a specified method, and returns them as a table. If no method is specified, returns a table containing only the input term.]==]
function Language:generateForms(text, sc)
	if self._rawData.generate_forms then
		sc = checkScript(text, self, sc)
		return require("Module:" .. self._rawData.generate_forms).generateForms(text, self._code, sc:getCode())
	else
		return {text}
	end
end

function Language:getAncestorChain()
	if not self._ancestorChain then
		self._ancestorChain = {}
		local step = #self:getAncestors() == 1 and self:getAncestors()[1] or nil
		
		while step do
			table.insert(self._ancestorChain, 1, step)
			step = #step:getAncestors() == 1 and step:getAncestors()[1] or nil
		end
	end
	
	return self._ancestorChain
end


function Language:hasAncestor(otherlang)
	local function compare(ancestor)
		return ancestor:getCode() == otherlang:getCode()
	end
	
	return iterateOverAncestorTree(self, compare) or false
end


function Language:getCategoryName(nocap)
	local name = self:getCanonicalName()
	
	-- If the name already has "language" in it, don't add it.
	if not name:find("[Ll]anguage$") then
		name = name .. " language"
	end
	if not nocap then
		name = mw.getContentLanguage():ucfirst(name)
	end
	return name
end


function Language:makeCategoryLink()
	return "[[:Category:" .. self:getCategoryName() .. "|" .. self:getDisplayForm() .. "]]"
end


function Language:getStandardCharacters()
	return self._rawData.standardChars
end


function Language:makeEntryName(text)
	text = mw.ustring.match(text, "^[¿¡]?(.-[^%s%p].-)%s*[؟?!;՛՜ ՞ ՟?!︖︕।॥။၊་།]?$") or text
	
	if self:getCode() == "ar" then
		local U = mw.ustring.char
		local taTwiil = U(0x640)
		local waSla = U(0x671)
		-- diacritics ordinarily removed by entry_name replacements
		local Arabic_diacritics = U(0x64B, 0x64C, 0x64D, 0x64E, 0x64F, 0x650, 0x651, 0x652, 0x670)
		
		if text == waSla or mw.ustring.find(text, "^" .. taTwiil .. "?[" .. Arabic_diacritics .. "]" .. "$") then
			return text
		end
	end
	
	if type(self._rawData.entry_name) == "table" then
		text = do_entry_name_or_sort_key_replacements(text, self._rawData.entry_name)
	end
	
	return text
end


-- Return true if the language has display processing enabled, i.e. lang:makeDisplayText()
-- does non-trivial processing.
function Language:hasDisplayProcessing()
	return not not self._rawData.display
end

function Language:hasType()
	return self._rawData.type
end


-- Apply display-text replacements to `text`, if any.
function Language:makeDisplayText(text)
	if type(self._rawData.display) == "table" then
		text = do_entry_name_or_sort_key_replacements(text, self._rawData.display)
	end
	
	return text
end


-- Add to data tables?
local has_dotted_undotted_i = {
	["az"] = true,
	["crh"] = true,
	["gag"] = true,
	["kaa"] = true,
	["tt"] = true,
	["tr"] = true,
	["zza"] = true,
}

function Language:makeSortKey(name, sc)
	if has_dotted_undotted_i[self:getCode()] then
		name = name:gsub("I", "ı")
	end
	
	name = mw.ustring.lower(name)
	
	-- Remove initial hyphens and *
	local hyphens_regex = "^[-־ـ*]+(.)"
	name = mw.ustring.gsub(name, hyphens_regex, "%1")
	
	-- If there are language-specific rules to generate the key, use those
	if type(self._rawData.sort_key) == "table" then
		name = do_entry_name_or_sort_key_replacements(name, self._rawData.sort_key)
	elseif type(self._rawData.sort_key) == "string" then
		name = require("Module:" .. self._rawData.sort_key).makeSortKey(name, self:getCode(), sc and sc:getCode())
	end
	
	-- Remove parentheses, as long as they are either preceded or followed by something
	name = mw.ustring.gsub(name, "(.)[()]+", "%1")
	name = mw.ustring.gsub(name, "[()]+(.)", "%1")
	
	if has_dotted_undotted_i[self:getCode()] then
		name = name:gsub("i", "İ")
	end
	
	return mw.ustring.upper(name)
end

function Language:overrideManualTranslit()
	if self._rawData.override_translit then
		return true
	else
		return false
	end
end


function Language:transliterate(text, sc, module_override)
	-- If there is no text, or the language doesn't have transliteration data and there's no override, return nil.
	if not (self._rawData.translit_module or module_override) then
		return nil, false, {}
	elseif (not text) or text == "" or text == "-" then
		return text, false, {}
	end
	-- If the script is not transliteratable (and no override is given), return nil.
	sc = checkScript(text, self, sc)
	if not (sc:isTransliterated() or module_override) then
		return nil, true, {}
	end

	-- Remove any strip markers.
	text = mw.text.unstrip(text)

	-- Get the display text with the keepCarets flag set.
	local fail, cats, subbedChars
	text, fail, cats, subbedChars = processDisplayText(text, self, sc, true)

	-- Transliterate (using the module override if applicable).
	text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, true, self, sc, module_override or self._rawData.translit_module, "tr")
	
	if not text then
		return nil, true, cats
	end
	
	-- Incomplete transliterations return nil.
	local charset = sc.characters
	if charset and umatch(text, "[" .. charset .. "]") then
		-- Remove any characters in Latin, which includes Latin characters also included in other scripts (as these are false positives), as well as any PUA substitutions. Anything remaining should only be script code "None" (e.g. numerals).
		local check_text = ugsub(text, "[" .. require("Module:scripts").getByCode("Latn").characters .. "􀀀-􏿽]+", "")
		-- Set none_is_last_resort_only flag, so that any non-None chars will cause a script other than "None" to be returned.
		if require("Module:scripts").findBestScriptWithoutLang(check_text, true):getCode() ~= "None" then
			return nil, true, cats
		end
	end

	text = escape_risky_characters(text)
	text = undoTempSubstitutions(text, subbedChars)

	-- If the script does not use capitalization, then capitalize any letters of the transliteration which are immediately preceded by a caret (and remove the caret).
	if text and not sc:hasCapitalization() and text:find("^", 1, true) then
		text = processCarets(text, "%^([\128-\191\244]*%*?)([^\128-\191\244][\128-\191]*)", function(m1, m2)
			return m1 .. uupper(m2)
		end)
	end

	-- Track module overrides.
	if module_override ~= nil then
		track("module_override")
	end

	fail = text == nil and (not not fail) or false

	return text, fail, cats
end

function Language:hasTranslit()
	return self._rawData.translit_module and true or false
end


function Language:link_tr()
	return self._rawData.link_tr and true or false
end


function Language:toJSON()
	local entryNamePatterns = nil
	local entryNameRemoveDiacritics = nil
	
	if self._rawData.entry_name then
		entryNameRemoveDiacritics = self._rawData.entry_name.remove_diacritics
		if self._rawData.entry_name.from then
			entryNamePatterns = {}
			for i, from in ipairs(self._rawData.entry_name.from) do
				local to = self._rawData.entry_name.to[i] or ""
				table.insert(entryNamePatterns, { from = from, to = to })
			end
		end
	end
	
	local ret = {
		ancestors = self._rawData.ancestors,
		canonicalName = self:getCanonicalName(),
		categoryName = self:getCategoryName("nocap"),
		code = self._code,
		entryNamePatterns = entryNamePatterns,
		entryNameRemoveDiacritics = entryNameRemoveDiacritics,
		family = self._rawData[3] or self._rawData.family,
		otherNames = self:getOtherNames(true),
		aliases = self:getAliases(),
		varieties = self:getVarieties(),
		scripts = self._rawData.scripts or self._rawData[4],
		type = self:getType(),
		wikimediaLanguages = self._rawData.wikimedia_codes,
		wikidataItem = self:getWikidataItem(),
	}
	
	return require("Module:JSON").toJSON(ret)
end


-- Do NOT use these methods!
-- All uses should be pre-approved on the talk page!
function Language:getRawData()
	return self._rawData
end

function Language:getRawExtraData()
	self:loadInExtraData()
	return self._extraData
end

Language.__index = Language


function export.getDataModuleName(code)
	if code:find("^%l%l$") then
		return "languages/data2"
	elseif code:find("^%l%l%l$") then
		local prefix = code:sub(1, 1)
		return "languages/data3/" .. prefix
	elseif code:find("^[%l-]+$") then
		return "languages/datax"
	else
		return nil
	end
end


function export.getExtraDataModuleName(code)
	if code:find("^%l%l$") then
		return "languages/extradata2"
	elseif code:find("^%l%l%l$") then
		local prefix = code:sub(1, 1)
		return "languages/extradata3/" .. prefix
	elseif code:find("^[%l-]+$") then
		return "languages/extradatax"
	else
		return nil
	end
end


local function getRawLanguageData(code)
	local modulename = export.getDataModuleName(code)
	return modulename and mw.loadData("Module:" .. modulename)[code] or nil
end


local function getRawExtraLanguageData(code)
	local modulename = export.getExtraDataModuleName(code)
	return modulename and mw.loadData("Module:" .. modulename)[code] or nil
end


function Language:loadInExtraData()
	if not self._extraData then
		-- load extra data from module and assign to meta table
		-- use empty table as a fallback if extra data is nil
		local meta = getmetatable(self)
		meta._extraData = getRawExtraLanguageData(self._code) or {}
		setmetatable(self, meta)
	end
end


function export.makeObject(code, data)
	return data and setmetatable({ _rawData = data, _code = code }, Language) or nil
end


function export.getByCode(code, paramForError, allowEtymLang, allowFamily)
	if type(code) ~= "string" then
		error("The function getByCode expects a string as its first argument, but received " .. (code == nil and "nil" or "a " .. type(code)) .. ".")
	end
	
	local retval = export.makeObject(code, getRawLanguageData(code))
	if not retval and allowEtymLang then
		retval = require("Module:etymology languages").getByCode(code)
	end
	if not retval and allowFamily then
		retval = require("Module:families").getByCode(code)
	end
	if not retval and paramForError then
		local codetext = nil
		if allowEtymLang and allowFamily then
			codetext = "language, etymology language or family code"
		elseif allowEtymLang then
			codetext = "language or etymology language code"
		elseif allowFamily then
			codetext = "language or family code"
		else
			codetext = "language code"
		end
		export.err(code, paramForError, codetext)
	end
	return retval
end


function export.getByName(name, errorIfInvalid)
	local byName = mw.loadData("Module:languages/by name")
	local code = byName.all and byName.all[name] or byName[name]
	
	if not code then
		if errorIfInvalid then
			error("The language name \"" .. name .. "\" is not valid.")
		else
			return nil
		end
	end
	
	return export.makeObject(code, getRawLanguageData(code))
end

function export.getByCanonicalName(name, errorIfInvalid, allowEtymLang, allowFamily)
	local byName = mw.loadData("Module:languages/canonical names")
	local code = byName and byName[name]

	local retval = code and export.makeObject(code, getRawLanguageData(code)) or nil
	if not retval and allowEtymLang then
		retval = require("Module:etymology languages").getByCanonicalName(name)
	end
	if not retval and allowFamily then
		local famname = name:match("^(.*) languages$")
		famname = famname or name
		retval = require("Module:families").getByCanonicalName(famname)
	end
	if not retval and errorIfInvalid then
		local text
		if allowEtymLang and allowFamily then
			text = "language, etymology language or family name"
		elseif allowEtymLang then
			text = "language or etymology language name"
		elseif allowFamily then
			text = "language or family name"
		else
			text = "language name"
		end
		error("The " .. text .. " \"" .. name .. "\" is not valid.")
	end
	return retval
end

function export.iterateAll()
	mw.incrementExpensiveFunctionCount()
	local m_data = mw.loadData("Module:languages/alldata")
	local func, t, var = pairs(m_data)
	
	return function()
		local code, data = func(t, var)
		return export.makeObject(code, data)
	end
end

--[[	If language is an etymology language, iterates through parent languages
		until it finds a non-etymology language. ]]
function export.getNonEtymological(lang)
	while lang:getType() == "etymology language" do
		local parentCode = lang:getParentCode()
		lang = export.getByCode(parentCode)
			or require("Module:etymology languages").getByCode(parentCode)
			or require("Module:families").getByCode(parentCode)
	end
	
	return lang
end

return export