Module:headword/data: Difference between revisions

From Linguifex
Jump to navigation Jump to search
No edit summary
No edit summary
 
(One intermediate revision by the same user not shown)
Line 1: Line 1:
local headword_page_module = "Module:headword/page"
local list_to_set = require("Module:table").listToSet
local data = {}
local data = {}


data.invariable = {
------ 1. Lists which are converted into sets. ------
 
-- Zero-plurals (i.e. invariable plurals).
local irregular_plurals = list_to_set({
"cmavo",
"cmavo",
"cmene",
"cmene",
Line 7: Line 14:
"gismu",
"gismu",
"Han tu",
"Han tu",
"hanja",
"hanzi",
"hanzi",
"hanja",
"jyutping",
"jyutping",
"kana",
"kanji",
"kanji",
"lujvo",
"lujvo",
Line 16: Line 24:
"rafsi",
"rafsi",
"romaji",
"romaji",
}
}, function(item)
return item
end)
 
-- Irregular non-zero plurals AND any regular plurals where the singular ends in "s",
-- because the module assumes that inputs ending in "s" are plurals.
for k, v in next, {
mora = "morae"
} do
irregular_plurals[k] = v
irregular_plurals[v] = v -- Ensures singular and plural inputs work as expected.
end
 
data.invariable = irregular_plurals -- To be removed.
data.irregular_plurals = irregular_plurals


data.lemmas = {
data.lemmas = list_to_set{
"abbreviations",
"abbreviations",
"acronyms",
"acronyms",
Line 31: Line 53:
"circumpositions",
"circumpositions",
"classifiers",
"classifiers",
"clitics",
"cmavo",
"cmavo",
"cmavo clusters",
"cmavo clusters",
Line 40: Line 61:
"determiners",
"determiners",
"diacritical marks",
"diacritical marks",
"digraphs",
"equative adjectives",
"equative adjectives",
"fu'ivla",
"fu'ivla",
Line 45: Line 67:
"Han characters",
"Han characters",
"Han tu",
"Han tu",
"hanja",
"hanzi",
"hanzi",
"hanja",
"ideophones",
"ideophones",
"idioms",
"idioms",
"infixes",
"infixes",
"initialisms",
"iteration marks",
"interfixes",
"interfixes",
"initialisms",
"interjections",
"interjections",
"kana",
"kanji",
"kanji",
"letters",
"letters",
"ligatures",
"ligatures",
"logograms",
"lujvo",
"lujvo",
"morae",
"morphemes",
"morphemes",
"non-constituents",
"non-constituents",
Line 69: Line 95:
"predicatives",
"predicatives",
"prefixes",
"prefixes",
"prepositional phrases",
"prepositions",
"prepositions",
"prepositional phrases",
"preverbs",
"preverbs",
"pronominal adverbs",
"pronominal adverbs",
"pronouns",
"pronouns",
"proper nouns",
"proverbs",
"proverbs",
"proper nouns",
"punctuation marks",
"punctuation marks",
"relatives",
"relatives",
Line 86: Line 112:
}
}


data.nonlemmas = {
data.nonlemmas = list_to_set{
"active participle forms",
"active participles",
"active participles",
"adjectival participles",
"adjectival participles",
    "adjective case forms",
"adjective forms",
"adjective forms",
"adjective feminine forms",
"adjective feminine forms",
Line 102: Line 130:
"comparative adverb forms",
"comparative adverb forms",
"comparative adverbs",
"comparative adverbs",
"conjunction forms",
"contractions",
"contractions",
"converbs",
"converbs",
Line 108: Line 137:
"determiner superlative forms",
"determiner superlative forms",
"diminutive nouns",
"diminutive nouns",
"elative adjectives",
"equative adjective forms",
"equative adjective forms",
"equative adjectives",
"equative adjectives",
Line 116: Line 146:
"interjection forms",
"interjection forms",
"jyutping",
"jyutping",
"kanji readings",
"misspellings",
"misspellings",
"negative participles",
"negative participles",
Line 123: Line 152:
"noun dual forms",
"noun dual forms",
"noun forms",
"noun forms",
"noun paucal forms",
"noun plural forms",
"noun plural forms",
"noun possessive forms",
"noun possessive forms",
Line 163: Line 193:
"verb forms",
"verb forms",
"verbal nouns",
"verbal nouns",
}
-- These langauges will not have links to separate parts of the headword.
data.no_multiword_links = list_to_set{
"zh",
}
}


-- These languages will not have "LANG multiword terms" categories added.
-- These languages will not have "LANG multiword terms" categories added.
data.no_multiword_cat = {
data.no_multiword_cat = list_to_set{
-------- Languages without spaces between words (sometimes spaces between phrases) --------
-------- Languages without spaces between words (sometimes spaces between phrases) --------
"aho", -- Ahom
"blt", -- Tai Dam
"blt", -- Tai Dam
"ja", -- Japanese
"ja", -- Japanese
Line 177: Line 211:
"my", -- Burmese
"my", -- Burmese
"nan", -- Min Nan (some words in Latin script; hyphens between syllables)
"nan", -- Min Nan (some words in Latin script; hyphens between syllables)
"nan-hbl", -- Hokkien (some words in Latin script; hyphens between syllables)
"nod", -- Northern Thai
"nod", -- Northern Thai
"ojp", -- Old Japanese
"ojp", -- Old Japanese
"shn", -- Shan
"sou", -- Southern Thai
"tdd", -- Tai Nüa
"tdd", -- Tai Nüa
"th", -- Thai
"th", -- Thai
"tts", -- Isan
"tts", -- Isan
"twh", -- Tai Dón
"twh", -- Tai Dón
"shn", -- Shan
"txg", -- Tangut
"sou", -- Southern Thai
"zh", -- Chinese (all varieties with Chinese characters)
"zh", -- Chinese (all varieties with Chinese characters)
"zkt", -- Khitan


-------- Languages with spaces between syllables --------
-------- Languages with spaces between syllables --------
Line 192: Line 229:
"atb", -- Zaiwa
"atb", -- Zaiwa
"byk", -- Biao
"byk", -- Biao
"cdy", -- Chadong
--"duu", -- Drung; not sure
--"duu", -- Drung; not sure
--"hmx-pro", -- Proto-Hmong-Mien
--"hmx-pro", -- Proto-Hmong-Mien
Line 200: Line 238:
"mtq", -- Muong
"mtq", -- Muong
--"mww", -- White Hmong; not sure
--"mww", -- White Hmong; not sure
"onb", -- Lingao
--"sit-gkh", -- Gokhy; not sure
--"sit-gkh", -- Gokhy; not sure
--"swi", -- Sui; not sure
--"swi", -- Sui; not sure
"tbq-lol-pro", -- Proto-Loloish
"tbq-lol-pro", -- Proto-Loloish
"tdh", -- Thulung
"tdh", -- Thulung
"ukk", -- Muak Sa-aak
"vi", -- Vietnamese
"vi", -- Vietnamese
"yig", -- Wusa Nasu
"yig", -- Wusa Nasu
Line 211: Line 251:
"mkh-ban-pro", -- Proto-Bahnaric
"mkh-ban-pro", -- Proto-Bahnaric
"sit-pro", -- Proto-Sino-Tibetan; listed above
"sit-pro", -- Proto-Sino-Tibetan; listed above
 
-------- Other weirdnesses --------
-------- Other weirdnesses --------
"mul", -- Translingual; gestures, Morse code, etc.
"mul", -- Translingual; gestures, Morse code, etc.
Line 362: Line 402:


-- In these languages, the hyphen is not considered a word separator for the "multiword terms" category.
-- In these languages, the hyphen is not considered a word separator for the "multiword terms" category.
data.hyphen_not_multiword_sep = {
data.hyphen_not_multiword_sep = list_to_set{
"akk", -- Akkadian; hyphens between syllables
"akk", -- Akkadian; hyphens between syllables
"akl", -- Aklanon; hyphens for mid-word glottal stops
"ber-pro", -- Proto-Berber; morphemes separated by hyphens
"ceb", -- Cebuano; hyphens for mid-word glottal stops
"cnk", -- Khumi Chin; hyphens used in single words
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
"de", -- too many false positives
"de", -- too many false positives
"esx-esk-pro", -- hyphen used to separate morphemes
"esx-esk-pro", -- hyphen used to separate morphemes
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
"hil", -- Hiligaynon; hyphens for mid-word glottal stops
"ilo", -- Ilocano; hyphens for mid-word glottal stops
"kne", -- Kankanaey; hyphens for mid-word glottal stops
"lcp", -- Western Lawa; dash as syllable joiner
"lcp", -- Western Lawa; dash as syllable joiner
"lwl", -- Eastern Lawa; dash as syllable joiner
"lwl", -- Eastern Lawa; dash as syllable joiner
"mfa", -- Pattani Malay in Thai script; dash as syllable joiner
"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
"msb", -- Masbatenyo; too many false positives
"tl", -- Tagalog; too many false positives
"war", -- Waray-Waray; too many false positives
"yo", -- Yoruba; hyphens used to show lengthened nasal vowels
}
}


-- These languages will not have "LANG masculine nouns" and similar categories added.
-- These languages will not have "LANG masculine nouns" and similar categories added.
data.no_gender_cat = {
data.no_gender_cat = list_to_set{
-- Languages without gender but which use the gender field for other purposes
-- Languages without gender but which use the gender field for other purposes
"ja",
"ja",
Line 380: Line 432:
}
}


data.notranslit = {
data.notranslit = list_to_set{
"ams",
"ams",
"az",
"az",
Line 388: Line 440:
"cjm",
"cjm",
"cmn",
"cmn",
"cpi",
"hak",
"hak",
"ja",
"ja",
Line 397: Line 450:
"mvi",
"mvi",
"nan",
"nan",
"nan-hbl",
"nan-hnm",
"nan-luh",
"nan-tws",
"oj",
"oj",
"okn",
"okn",
"pi",
"ro",
"ryn",
"ryn",
"rys",
"rys",
Line 409: Line 464:
"tkn",
"tkn",
"tly",
"tly",
"txg",
"und",
"und",
"vi",
"vi",
"xug",
"xug",
"yue",
"yoi",
"yoi",
"yox",
"yox",
"yue",
"za",
"za",
"zh",
"zh",
}
}


-- Script codes for which a script-tagged display title will be added.
-- Script codes for which a script-tagged display title will be added.
data.toBeTagged = {
data.toBeTagged = list_to_set{
"Ahom",
"Ahom",
"Arab",
"Arab",
"fa-Arab",
"glk-Arab",
"kk-Arab",
"ks-Arab",
"ku-Arab",
"mzn-Arab",
"ms-Arab",
"ota-Arab",
"pa-Arab",
"ps-Arab",
"sd-Arab",
"tt-Arab",
"ug-Arab",
"ur-Arab",
"Armi",
"Armn",
"Avst",
"Avst",
"Bali",
"Bali",
"Bamu",
"Batk",
"Beng",
"as-Beng",
"Bopo",
"Brah",
"Brai",
"Bugi",
"Buhd",
"Cakm",
"Cans",
"Cari",
"Cham",
"Cham",
"Cher",
"Copt",
"Copt",
"Kali",
"Cprt",
"Cyrl",
"Cyrs",
"Deva",
"Dsrt",
"Egyd",
"Egyp",
"Ethi",
"Geok",
"Geor",
"Glag",
"Goth",
"Grek",
"Polyt",
"polytonic",
"Gujr",
"Guru",
"Hang",
"Hani",
"Hani",
"Hano",
"Hebr",
"Hebr",
"Hira",
"Hluw",
"Ital",
"Java",
"Kali",
"Kana",
"Khar",
"Khmr",
"Knda",
"Kthi",
"Lana",
"Lana",
"Laoo",
"Latn",
"Latf",
"Latg",
"Latnx",
"Latinx",
"pjt-Latn",
"Lepc",
"Limb",
"Linb",
"Linb",
"Lisu",
"Lyci",
"Lydi",
"Mand",
"Mand",
"Mani",
"Marc",
"Merc",
"Mero",
"Mlym",
"Mong",
"Mong",
"polytonic",
"mnc-Mong",
"sjo-Mong",
"xwo-Mong",
"Mtei",
"Mymr",
"Narb",
"Nkoo",
"Nshu",
"Ogam",
"Olck",
"Orkh",
"Orya",
"Osma",
"Ougr",
"Palm",
"Phag",
"Phli",
"Phlv",
"Phnx",
"Plrd",
"Prti",
"Rjng",
"Rjng",
"Runr",
"Samr",
"Samr",
"Sarb",
"Saur",
"Sgnw",
"Shaw",
"Shrd",
"Sinh",
"Sora",
"Sund",
"Sund",
"Sylo",
"Sylo",
"Syrc",
"Tagb",
"Tale",
"Talu",
"Taml",
"Tang",
"Tang",
"Tavt",
"Tavt",
"Telu",
"Tfng",
"Tglg",
"Thaa",
"Thai",
"Tibt",
"Ugar",
"Vaii",
"Xpeo",
"Xsux",
"Xsux",
"Yiii",
"Zmth",
"Zsym",
"Ipach",
"Music",
"Rumin",
}
-- Parts of speech which will not be categorised in categories like "English terms spelled with É" if
-- the term is the character in question (e.g. the letter entry for English [[é]]). This contrasts with
-- entries like the French adjective [[m̂]], which is a one-letter word spelled with the letter.
data.pos_not_spelled_with_self = list_to_set{
"diacritical marks",
"Han characters",
"Han tu",
"hanja",
"hanzi",
"iteration marks",
"kana",
"kanji",
"letters",
"ligatures",
"logograms",
"morae",
"numeral symbols",
"numerals",
"punctuation marks",
"syllables",
"symbols",
}
}


for key, list in pairs(data) do
------ 2. Lists not converted into sets. ------
data[key] = require("Module:utils").list_to_set(list)
 
end
-- Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not
-- pluralized) form. It is singular so that the same table can be used in [[Module:form of]] for the p=/POS= param
-- and [[Module:links]] for the pos= param.
data.pos_aliases = {
a = "adjective",
adj = "adjective",
adv = "adverb",
art = "article",
det = "determiner",
cnum = "cardinal number",
conj = "conjunction",
conv = "converb",
int = "interjection",
interj = "interjection",
intj = "interjection",
n = "noun",
num = "numeral",
part = "participle",
pcl = "particle",
phr = "phrase",
pn = "proper noun",
postp = "postposition",
pre = "preposition",
prep = "preposition",
pro = "pronoun",
pron = "pronoun",
prop = "proper noun",
proper = "proper noun",
onum = "ordinal number",
v = "verb",
vb = "verb",
vi = "intransitive verb",
vt = "transitive verb",
vti = "transitive and intransitive verb",
}


-- Parts of speech for which categories like "German masculine nouns" or "Russian imperfective verbs"
-- Parts of speech for which categories like "German masculine nouns" or "Russian imperfective verbs"
-- will be generated if the headword is of the appropriate gender/number. We put this at the bottom
-- will be generated if the headword is of the appropriate gender/number.
-- because it's a map, not a list.
data.pos_for_gender_number_cat = {
data.pos_for_gender_number_cat = {
["nouns"] = "nouns",
["nouns"] = "nouns",
["proper nouns"] = "nouns",
["proper nouns"] = "nouns",
["suffixes"] = "suffixes",
-- We include verbs because impf and pf are valid "genders".
-- We include verbs because impf and pf are valid "genders".
["verbs"] = "verbs",
["verbs"] = "verbs",
}
}
------ 3. Page-wide processing (so that it only needs to be done once per page). ------
data.page = require(headword_page_module).process_page()
-- Fuckme, random references to data.pagename and data.encoded_pagename are scattered throughout the codebase. FIXME!
data.pagename = data.page.pagename
data.encoded_pagename = data.page.encoded_pagename


return data
return data

Latest revision as of 09:39, 31 July 2024



local headword_page_module = "Module:headword/page"

local list_to_set = require("Module:table").listToSet

-- Export table for this module: every section below adds fields to it, and it is returned at the end of the file.
local data = {}

------ 1. Lists which are converted into sets. ------

-- Terms whose plural is identical to the singular (zero-plurals, i.e. invariable plurals).
-- The callback hands each term back unchanged; presumably list_to_set stores that return value as the
-- entry's value, so that every term maps to itself -- confirm against [[Module:table]].
local function plural_is_same(term)
	return term
end

local irregular_plurals = list_to_set({
	"cmavo",
	"cmene",
	"fu'ivla",
	"gismu",
	"Han tu",
	"hanja",
	"hanzi",
	"jyutping",
	"kana",
	"kanji",
	"lujvo",
	"phrasebook",
	"pinyin",
	"rafsi",
	"romaji",
}, plural_is_same)

-- Plurals that are irregular without being identical to the singular, plus any regular plurals whose
-- singular ends in "s" (the module assumes that inputs ending in "s" are already plural).
local non_zero_plurals = {
	mora = "morae",
}
for singular, plural in pairs(non_zero_plurals) do
	irregular_plurals[singular] = plural
	-- Map the plural to itself as well, so that singular and plural inputs both work as expected.
	irregular_plurals[plural] = plural
end

-- The same table is exported under two names; `invariable` is slated for removal.
data.irregular_plurals = irregular_plurals
data.invariable = irregular_plurals

-- Part-of-speech category names (plural form) in the `lemmas` set.
-- NOTE(review): the consumer is not visible in this file; presumably a headword whose part of speech
-- is listed here is classified as a lemma -- confirm against [[Module:headword]].
data.lemmas = list_to_set({
	"abbreviations",
	"acronyms",
	"adjectives",
	"adnominals",
	"adpositions",
	"adverbs",
	"affixes",
	"ambipositions",
	"articles",
	"circumfixes",
	"circumpositions",
	"classifiers",
	"cmavo",
	"cmavo clusters",
	"cmene",
	"combining forms",
	"conjunctions",
	"counters",
	"determiners",
	"diacritical marks",
	"digraphs",
	"equative adjectives",
	"fu'ivla",
	"gismu",
	"Han characters",
	"Han tu",
	"hanja",
	"hanzi",
	"ideophones",
	"idioms",
	"infixes",
	"initialisms",
	"iteration marks",
	"interfixes",
	"interjections",
	"kana",
	"kanji",
	"letters",
	"ligatures",
	"logograms",
	"lujvo",
	"morae",
	"morphemes",
	"non-constituents",
	"nouns",
	"numbers",
	"numeral symbols",
	"numerals",
	"particles",
	"phrases",
	"postpositions",
	"postpositional phrases",
	"predicatives",
	"prefixes",
	"prepositional phrases",
	"prepositions",
	"preverbs",
	"pronominal adverbs",
	"pronouns",
	"proper nouns",
	"proverbs",
	"punctuation marks",
	"relatives",
	"roots",
	"stems",
	"suffixes",
	"syllables",
	"symbols",
	"verbs",
})

-- Part-of-speech category names (plural form) in the `nonlemmas` set.
-- NOTE(review): the consumer is not visible in this file; presumably a headword whose part of speech
-- is listed here is classified as a non-lemma form -- confirm against [[Module:headword]].
data.nonlemmas = list_to_set({
	"active participle forms",
	"active participles",
	"adjectival participles",
	"adjective case forms",
	"adjective forms",
	"adjective feminine forms",
	"adjective plural forms",
	"adverb forms",
	"adverbial participles",
	"agent participles",
	"article forms",
	"circumfix forms",
	"combined forms",
	"comparative adjective forms",
	"comparative adjectives",
	"comparative adverb forms",
	"comparative adverbs",
	"conjunction forms",
	"contractions",
	"converbs",
	"determiner comparative forms",
	"determiner forms",
	"determiner superlative forms",
	"diminutive nouns",
	"elative adjectives",
	"equative adjective forms",
	"equative adjectives", -- NOTE(review): also listed in data.lemmas; confirm which classification is intended.
	"future participles",
	"gerunds",
	"infinitive forms",
	"infinitives",
	"interjection forms",
	"jyutping",
	"misspellings",
	"negative participles",
	"nominal participles",
	"noun case forms",
	"noun dual forms",
	"noun forms",
	"noun paucal forms",
	"noun plural forms",
	"noun possessive forms",
	"noun singulative forms",
	"numeral forms",
	"participles",
	"participle forms",
	"particle forms",
	"passive participles",
	"past active participles",
	"past participles",
	"past participle forms",
	"past passive participles",
	"perfect active participles",
	"perfect participles",
	"perfect passive participles",
	"pinyin",
	"plurals",
	"postposition forms",
	"prefix forms",
	"preposition contractions",
	"preposition forms",
	"prepositional pronouns",
	"present active participles",
	"present participles",
	"present passive participles",
	"pronoun forms",
	"pronoun possessive forms",
	"proper noun forms",
	"proper noun plural forms",
	"rafsi",
	"romanizations",
	"root forms",
	"singulatives",
	"suffix forms",
	"superlative adjective forms",
	"superlative adjectives",
	"superlative adverb forms",
	"superlative adverbs",
	"verb forms",
	"verbal nouns",
})

-- Language codes for which the individual parts of a headword are not linked separately.
data.no_multiword_links = list_to_set({
	"zh",
})

-- Language codes for which no "LANG multiword terms" category is added.
data.no_multiword_cat = list_to_set({
	-------- Languages without spaces between words (sometimes spaces between phrases) --------
	"blt", -- Tai Dam
	"ja", -- Japanese
	"khb", -- Lü
	"km", -- Khmer
	"lo", -- Lao
	"mnw", -- Mon
	"my", -- Burmese
	"nan", -- Min Nan (some words in Latin script; hyphens between syllables)
	"nan-hbl", -- Hokkien (some words in Latin script; hyphens between syllables)
	"nod", -- Northern Thai
	"ojp", -- Old Japanese
	"shn", -- Shan
	"sou", -- Southern Thai
	"tdd", -- Tai Nüa
	"th", -- Thai
	"tts", -- Isan
	"twh", -- Tai Dón
	"txg", -- Tangut
	"zh", -- Chinese (all varieties with Chinese characters)
	"zkt", -- Khitan

	-------- Languages with spaces between syllables --------
	"ahk", -- Akha
	"aou", -- A'ou
	"atb", -- Zaiwa
	"byk", -- Biao
	"cdy", -- Chadong
	--"duu", -- Drung; not sure
	--"hmx-pro", -- Proto-Hmong-Mien
	--"hnj", -- Green Hmong; not sure
	"huq", -- Tsat
	"ium", -- Iu Mien
	--"lis", -- Lisu; not sure
	"mtq", -- Muong
	--"mww", -- White Hmong; not sure
	"onb", -- Lingao
	--"sit-gkh", -- Gokhy; not sure
	--"swi", -- Sui; not sure
	"tbq-lol-pro", -- Proto-Loloish
	"tdh", -- Thulung
	"ukk", -- Muak Sa-aak
	"vi", -- Vietnamese
	"yig", -- Wusa Nasu
	"zng", -- Mang

	-------- Languages with ~ with surrounding spaces used to separate variants --------
	"mkh-ban-pro", -- Proto-Bahnaric
	-- NOTE(review): the earlier comment on the next entry said "listed above", but no earlier
	-- "sit-pro" entry exists in this table.
	"sit-pro", -- Proto-Sino-Tibetan

	-------- Other weirdnesses --------
	"mul", -- Translingual; gestures, Morse code, etc.
	"aot", -- Atong (India); bullet is a letter

	-------- All sign languages --------
	"ads",
	"aed",
	"aen",
	"afg",
	"ase",
	"asf",
	"asp",
	"asq",
	"asw",
	"bfi",
	"bfk",
	"bog",
	"bqn",
	"bqy",
	"bvl",
	"bzs",
	"cds",
	"csc",
	"csd",
	"cse",
	"csf",
	"csg",
	"csl",
	"csn",
	"csq",
	"csr",
	"doq",
	"dse",
	"dsl",
	"ecs",
	"esl",
	"esn",
	"eso",
	"eth",
	"fcs",
	"fse",
	"fsl",
	"fss",
	"gds",
	"gse",
	"gsg",
	"gsm",
	"gss",
	"gus",
	"hab",
	"haf",
	"hds",
	"hks",
	"hos",
	"hps",
	"hsh",
	"hsl",
	"icl",
	"iks",
	"ils",
	"inl",
	"ins",
	"ise",
	"isg",
	"isr",
	"jcs",
	"jhs",
	"jls",
	"jos",
	"jsl",
	"jus",
	"kgi",
	"kvk",
	"lbs",
	"lls",
	"lsl",
	"lso",
	"lsp",
	"lst",
	"lsy",
	"lws",
	"mdl",
	"mfs",
	"mre",
	"msd",
	"msr",
	"mzc",
	"mzg",
	"mzy",
	"nbs",
	"ncs",
	"nsi",
	"nsl",
	"nsp",
	"nsr",
	"nzs",
	"okl",
	"pgz",
	"pks",
	"prl",
	"prz",
	"psc",
	"psd",
	"psg",
	"psl",
	"pso",
	"psp",
	"psr",
	"pys",
	"rms",
	"rsl",
	"rsm",
	"sdl",
	"sfb",
	"sfs",
	"sgg",
	"sgx",
	"slf",
	"sls",
	"sqk",
	"sqs",
	"ssp",
	"ssr",
	"svk",
	"swl",
	"syy",
	"tse",
	"tsm",
	"tsq",
	"tss",
	"tsy",
	"tza",
	"ugn",
	"ugy",
	"ukl",
	"uks",
	"vgt",
	"vsi",
	"vsl",
	"vsv",
	"xki",
	"xml",
	"xms",
	"ygs",
	"ysl",
	"zib",
	"zsl",
})

-- Language codes in which a hyphen does not count as a word separator when deciding on the
-- "multiword terms" category.
data.hyphen_not_multiword_sep = list_to_set({
	"akk", -- Akkadian; hyphens between syllables
	"akl", -- Aklanon; hyphens for mid-word glottal stops
	"ber-pro", -- Proto-Berber; morphemes separated by hyphens
	"ceb", -- Cebuano; hyphens for mid-word glottal stops
	"cnk", -- Khumi Chin; hyphens used in single words
	"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
	"de", -- too many false positives
	"esx-esk-pro", -- hyphen used to separate morphemes
	"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
	"hil", -- Hiligaynon; hyphens for mid-word glottal stops
	"ilo", -- Ilocano; hyphens for mid-word glottal stops
	"kne", -- Kankanaey; hyphens for mid-word glottal stops
	"lcp", -- Western Lawa; dash as syllable joiner
	"lwl", -- Eastern Lawa; dash as syllable joiner
	"mfa", -- Pattani Malay in Thai script; dash as syllable joiner
	"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
	"msb", -- Masbatenyo; too many false positives
	"tl", -- Tagalog; too many false positives
	"war", -- Waray-Waray; too many false positives
	"yo", -- Yoruba; hyphens used to show lengthened nasal vowels
})

-- Language codes for which "LANG masculine nouns" and similar gender categories are not added.
data.no_gender_cat = list_to_set({
	-- Languages that have no gender but use the gender field for other purposes
	"ja",
	"th",
})

-- Language codes in the `notranslit` set.
-- NOTE(review): the consumer is not visible in this file; presumably these are languages for which
-- the headword module does not expect or request a transliteration -- confirm against the caller.
data.notranslit = list_to_set({
	"ams",
	"az",
	"bbc",
	"bug",
	"cia",
	"cjm",
	"cmn",
	"cpi",
	"hak",
	"ja",
	"kzg",
	"lad",
	"lzh",
	"ms",
	"mul",
	"mvi",
	"nan",
	"nan-hbl",
	"nan-hnm",
	"nan-luh",
	"nan-tws",
	"oj",
	"okn",
	"ryn",
	"rys",
	"ryu",
	"sh",
	"tgt",
	"th",
	"tkn",
	"tly",
	"txg",
	"und",
	"vi",
	"xug",
	"yoi",
	"yox",
	"yue",
	"za",
	"zh",
})

-- Script codes that get a script-tagged display title.
-- Entries indented one level further are variant codes grouped under the script code above them;
-- the grouping is purely visual, every entry is an ordinary member of the set.
data.toBeTagged = list_to_set({
	"Ahom",
	"Arab",
		"fa-Arab",
		"glk-Arab",
		"kk-Arab",
		"ks-Arab",
		"ku-Arab",
		"mzn-Arab",
		"ms-Arab",
		"ota-Arab",
		"pa-Arab",
		"ps-Arab",
		"sd-Arab",
		"tt-Arab",
		"ug-Arab",
		"ur-Arab",
	"Armi",
	"Armn",
	"Avst",
	"Bali",
	"Bamu",
	"Batk",
	"Beng",
		"as-Beng",
	"Bopo",
	"Brah",
	"Brai",
	"Bugi",
	"Buhd",
	"Cakm",
	"Cans",
	"Cari",
	"Cham",
	"Cher",
	"Copt",
	"Cprt",
	"Cyrl",
	"Cyrs",
	"Deva",
	"Dsrt",
	"Egyd",
	"Egyp",
	"Ethi",
	"Geok",
	"Geor",
	"Glag",
	"Goth",
	"Grek",
		"Polyt",
		"polytonic",
	"Gujr",
	"Guru",
	"Hang",
	"Hani",
	"Hano",
	"Hebr",
	"Hira",
	"Hluw",
	"Ital",
	"Java",
	"Kali",
	"Kana",
	"Khar",
	"Khmr",
	"Knda",
	"Kthi",
	"Lana",
	"Laoo",
	"Latn",
		"Latf",
		"Latg",
		"Latnx",
		"Latinx",
		"pjt-Latn",
	"Lepc",
	"Limb",
	"Linb",
	"Lisu",
	"Lyci",
	"Lydi",
	"Mand",
	"Mani",
	"Marc",
	"Merc",
	"Mero",
	"Mlym",
	"Mong",
		"mnc-Mong",
		"sjo-Mong",
		"xwo-Mong",
	"Mtei",
	"Mymr",
	"Narb",
	"Nkoo",
	"Nshu",
	"Ogam",
	"Olck",
	"Orkh",
	"Orya",
	"Osma",
	"Ougr",
	"Palm",
	"Phag",
	"Phli",
	"Phlv",
	"Phnx",
	"Plrd",
	"Prti",
	"Rjng",
	"Runr",
	"Samr",
	"Sarb",
	"Saur",
	"Sgnw",
	"Shaw",
	"Shrd",
	"Sinh",
	"Sora",
	"Sund",
	"Sylo",
	"Syrc",
	"Tagb",
	"Tale",
	"Talu",
	"Taml",
	"Tang",
	"Tavt",
	"Telu",
	"Tfng",
	"Tglg",
	"Thaa",
	"Thai",
	"Tibt",
	"Ugar",
	"Vaii",
	"Xpeo",
	"Xsux",
	"Yiii",
	"Zmth",
	"Zsym",

	"Ipach",
	"Music",
	"Rumin",
})

-- Parts of speech that are kept out of categories like "English terms spelled with É" when the term
-- is itself the character in question (e.g. the letter entry for English [[é]]). Entries such as the
-- French adjective [[m̂]] are different: that is a one-letter word spelled with the letter.
data.pos_not_spelled_with_self = list_to_set({
	"diacritical marks",
	"Han characters",
	"Han tu",
	"hanja",
	"hanzi",
	"iteration marks",
	"kana",
	"kanji",
	"letters",
	"ligatures",
	"logograms",
	"morae",
	"numeral symbols",
	"numerals",
	"punctuation marks",
	"syllables",
	"symbols",
})

------ 2. Lists not converted into sets. ------

-- Short aliases accepted for parts of speech (param 2=), kept in alphabetical order of the alias.
-- Each alias maps to the canonical form in the singular (not pluralized), so that the same table can
-- also serve [[Module:form of]] (p=/POS= param) and [[Module:links]] (pos= param).
data.pos_aliases = {
	a = "adjective",
	adj = "adjective",
	adv = "adverb",
	art = "article",
	cnum = "cardinal number",
	conj = "conjunction",
	conv = "converb",
	det = "determiner",
	int = "interjection",
	interj = "interjection",
	intj = "interjection",
	n = "noun",
	num = "numeral",
	onum = "ordinal number",
	part = "participle",
	pcl = "particle",
	phr = "phrase",
	pn = "proper noun",
	postp = "postposition",
	pre = "preposition",
	prep = "preposition",
	pro = "pronoun",
	pron = "pronoun",
	prop = "proper noun",
	proper = "proper noun",
	v = "verb",
	vb = "verb",
	vi = "intransitive verb",
	vt = "transitive verb",
	vti = "transitive and intransitive verb",
}

-- Maps a part of speech to the noun-like or verb-like name used in categories such as
-- "German masculine nouns" or "Russian imperfective verbs", which are generated when the headword
-- has the appropriate gender/number.
data.pos_for_gender_number_cat = {
	nouns = "nouns",
	["proper nouns"] = "nouns",
	suffixes = "suffixes",
	-- Verbs are included because impf and pf are valid "genders".
	verbs = "verbs",
}

------ 3. Page-wide processing (so that it only needs to be done once per page). ------
local page = require(headword_page_module).process_page()
data.page = page
-- FIXME: direct references to data.pagename and data.encoded_pagename are scattered throughout the
-- codebase; these two fields duplicate the corresponding data.page fields for those references.
data.pagename = page.pagename
data.encoded_pagename = page.encoded_pagename

return data