Module:family tree/nested data

From Linguifex
< Module:family tree
Revision as of 17:38, 8 August 2021 by Sware (talk | contribs) (Created page with "--[=[ Authors: User:kc_kennylau, User:JohnC5, User:Erutuon, User:Suzukaze-c --]=] local regular_languages = require("Module:languages/alldata") local famili...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

Documentation for this module may be created at Module:family tree/nested data/doc

--[=[

Authors: [[User:kc_kennylau]], [[User:JohnC5]], [[User:Erutuon]], [[User:Suzukaze-c]]

--]=]

local regular_languages = require("Module:languages/alldata")
local families = require("Module:families/data")

-- Version of [[Module:etymology languages/data]] that chooses the language-
-- codiest code of several codes that have the same data. For instance,
-- it chooses "de-AT" over "Austrian German".
local etymology_languages = require("Module:family tree/etymology languages")

local supplementary_language_and_family_data = require("Module:family tree/data")

local memoize = require("Module:fun").memoize

local make_auto_subtabler = require("Module:auto-subtable")

local function get_data(code)
	return regular_languages[code] or etymology_languages[code] or families[code]
		or error("language code " .. tostring(code) .. " not recognized")
end

local get_sort_value = memoize(function (code)
	local data = get_data(code)
	return (data[1] or data.canonicalName):gsub("Proto%-", "")
end)

-- used in deep_sort
local function compare(code1, code2)
	return get_sort_value(code1) < get_sort_value(code2)
end

local function deep_sort(current)
	local result = {}
	local is_table = {}
	for key, val in pairs(current) do
		if type(key) == "number" then
			table.insert(result, val)
		else
			is_table[key] = true
			table.insert(result, key)
		end
	end
	
	table.sort(result, compare)
	
	for i = 1, #result do
		if is_table[result[i]] then
			local name = result[i]
			result[i] = deep_sort(current[result[i]])
			result[i].name = name
		else
			result[i] = { name = result[i] }
		end
	end
	
	return result
end

-- Reliably get family for etymology language.
local function get_family(code)
	while code do
		local data = get_data(code)
		local family = data[3] or data.family
		if family then -- This is a regular language or family code.
			return family
		end
		-- This is an etymology language code. Go up in the chain of parenthood.
		code = data.parent
	end
end

local function all_in_family(family_code, codes)
	for _, code in ipairs(codes) do
		if get_family(code) ~= family_code then
			return false
		end
	end
	return true
end

local protolanguage_of = memoize(function(fam_code)
	local data = get_data(fam_code)
	if data then
		local proto = data.protoLanguage
		if proto then
			return proto
		end
	end
	
	local proto = fam_code .. "-pro"
	if regular_languages[proto] or etymology_languages[proto] then
		return proto
	end
end)

-- Key: old parent code
-- Value: new parent code (move all regular languages or families here)
local new_parent_code = {}

-- Key: ancestor of proto-language
-- Value: table, containing
--    Key: proto-language
--    Value: family of proto-language
local protolanguage_to_put_under_family = make_auto_subtabler()

local function find_ancestors(code, val)
	-- First look for the special family tree–only ancestor.
	if supplementary_language_and_family_data[code] and supplementary_language_and_family_data[code].parent then
		return { supplementary_language_and_family_data[code].parent }
	
	-- Handle etymology languages.
	elseif etymology_languages[code] then
		local parent = val.parent
		if parent then
			return { parent }
		end
	
	-- Handle regular languages
	-- Return ancestors value if the ancestors belong to the same language
	-- family.
	elseif val.ancestors and (#val.ancestors == 1
	or all_in_family(get_family(code), val.ancestors)) then
		return val.ancestors
	
	-- If the proto-language of a family does not belong to the family (i.e.,
	-- it belongs to the same family as the family does), nest the family
	-- under the proto-language instead of the other way around.
	elseif families[code] then
		local proto = protolanguage_of(code)
		if proto then
			if get_family(proto) == get_family(code) then
				new_parent_code[proto] = code
				return { proto }
			end
			
			-- Normally a family is the child of a proto-language or a family,
			-- but if the family's proto-language has an ancestors field, use
			-- that as parent instead.
			local proto_data = get_data(proto)
			if proto_data.ancestors then
				for _, ancestor in ipairs(proto_data.ancestors) do
					protolanguage_to_put_under_family[ancestor][proto] = code
				end
				
				return proto_data.ancestors
			end
		end
	end
	
	-- Handle regular languages and language families.
	-- Return a proto-language if possible, else a language family.
	if get_family(code) then
		-- Go up through the language families that the language or family
		-- belongs to.
		local origin = code
		while true do
			code = get_family(code)
			val = families[code]
			
			-- no family, "not a family", undetermined family
			if not (code and val and code ~= "qfa-not" and code ~= "qfa-und") then
				if code and not val then
					-- Error in data table!
					mw.log("no such family code: " .. code)
				end
				return nil
			end
			
			local proto = protolanguage_of(code)
			if proto and proto ~= origin then
				return { proto }
			else
				return { code }
			end
		end
	end
end

local function make_parent_to_children_map()
	local children = make_auto_subtabler{}
	
	for _, data_module in ipairs { families, regular_languages, etymology_languages } do
		for code, data in pairs(data_module) do
			local ancestors = find_ancestors(code, data)
			if ancestors then
				for _, ancestor in ipairs(ancestors) do
					if ancestor ~= code then
						table.insert(children[ancestor], code)
					end
				end
			end
		end
	end
	
	-- No more auto subtabling needed.
	return children:un_auto_subtable()
end

-- For instance, Latin is the proto-language of the Romance languages, but
-- belongs to the Italic family. All the children of Latin that are languages
-- or families were initially placed under Latin, but must be placed under the
-- Romance languages.
local function move_children(parent_to_children_map)
	make_auto_subtabler(parent_to_children_map)
	
	for old_code, new_code in pairs(new_parent_code) do
		local old_children = parent_to_children_map[old_code]
		local new_children = parent_to_children_map[new_code]
		
		local i = 1
		while old_children[i] do
			local child = old_children[i]
			if child ~= new_code and (regular_languages[child] or families[child]) then
				table.insert(new_children, table.remove(old_children, i))
			else
				i = i + 1
			end
		end
		
		if #old_children == 0 then
			parent_to_children_map[old_code] = nil
		end
	end
	
	for old_parent, child_to_new_parent in pairs(protolanguage_to_put_under_family) do
		local old_children = parent_to_children_map[old_parent]
		if type(old_children) == "table" then
			local i = 1
			while old_children[i] do
				local child = old_children[i]
				
				local new_parent = child_to_new_parent[child]
				if new_parent then
					table.insert(parent_to_children_map[new_parent], table.remove(old_children, i))
				else
					i = i + 1
				end
			end
		end
	end
	
	parent_to_children_map:un_auto_subtable()
end

local function reverse_comp(a, b)
	return a > b
end

local function make_nested(data, children)
	local make_nil = {}
	for key, val in pairs(data) do
		if type(key) == "number" then
			if children[val] then
				data[val] = make_nested(children[val], children)
				table.insert(make_nil, key)
			end
		else
			data[key] = make_nested(val, children)
		end
	end
	if make_nil[2] then -- Make sure larger keys are removed first.
		table.sort(make_nil, reverse_comp)
	end
	for _, key in ipairs(make_nil) do
		table.remove(data, key)
	end
	return data
end

local function main()
	local parent_to_children_map = make_parent_to_children_map()
	
	move_children(parent_to_children_map)
	
	local nested = make_nested(parent_to_children_map, parent_to_children_map)
	
	nested = deep_sort(nested)
	
	return { nested = nested, protolanguage_of = protolanguage_of }
end

return main()