45,647
edits
No edit summary |
No edit summary |
||
Line 1: | Line 1: | ||
local headword_page_module = "Module:headword/page" | |||
local list_to_set = require("Module:table").listToSet | |||
local data = {} | local data = {} | ||
------ 1. Lists which are converted into sets. ------ | |||
-- Zero-plurals (i.e. invariable plurals). | |||
local irregular_plurals = list_to_set({ | |||
"cmavo", | "cmavo", | ||
"cmene", | "cmene", | ||
Line 7: | Line 14: | ||
"gismu", | "gismu", | ||
"Han tu", | "Han tu", | ||
"hanja", | |||
"hanzi", | "hanzi", | ||
"jyutping", | "jyutping", | ||
"kana", | |||
"kanji", | "kanji", | ||
"lujvo", | "lujvo", | ||
Line 16: | Line 24: | ||
"rafsi", | "rafsi", | ||
"romaji", | "romaji", | ||
} | }, function(item) | ||
return item | |||
end) | |||
-- Irregular non-zero plurals AND any regular plurals where the singular ends in "s", | |||
-- because the module assumes that inputs ending in "s" are plurals. | |||
for k, v in next, { | |||
mora = "morae" | |||
} do | |||
irregular_plurals[k] = v | |||
irregular_plurals[v] = v -- Ensures singular and plural inputs work as expected. | |||
end | |||
data.invariable = irregular_plurals -- To be removed. | |||
data.irregular_plurals = irregular_plurals | |||
data.lemmas = { | data.lemmas = list_to_set{ | ||
"abbreviations", | "abbreviations", | ||
"acronyms", | "acronyms", | ||
Line 31: | Line 53: | ||
"circumpositions", | "circumpositions", | ||
"classifiers", | "classifiers", | ||
"cmavo", | "cmavo", | ||
"cmavo clusters", | "cmavo clusters", | ||
Line 40: | Line 61: | ||
"determiners", | "determiners", | ||
"diacritical marks", | "diacritical marks", | ||
"digraphs", | |||
"equative adjectives", | "equative adjectives", | ||
"fu'ivla", | "fu'ivla", | ||
Line 45: | Line 67: | ||
"Han characters", | "Han characters", | ||
"Han tu", | "Han tu", | ||
"hanja", | |||
"hanzi", | "hanzi", | ||
"ideophones", | "ideophones", | ||
"idioms", | "idioms", | ||
"infixes", | "infixes", | ||
"initialisms", | |||
"iteration marks", | |||
"interfixes", | "interfixes", | ||
"interjections", | "interjections", | ||
"kana", | |||
"kanji", | "kanji", | ||
"letters", | "letters", | ||
"ligatures", | "ligatures", | ||
"logograms", | |||
"lujvo", | "lujvo", | ||
"morae", | |||
"morphemes", | "morphemes", | ||
"non-constituents", | "non-constituents", | ||
Line 69: | Line 95: | ||
"predicatives", | "predicatives", | ||
"prefixes", | "prefixes", | ||
"prepositional phrases", | |||
"prepositions", | "prepositions", | ||
"preverbs", | "preverbs", | ||
"pronominal adverbs", | "pronominal adverbs", | ||
"pronouns", | "pronouns", | ||
"proper nouns", | |||
"proverbs", | "proverbs", | ||
"punctuation marks", | "punctuation marks", | ||
"relatives", | "relatives", | ||
"roots", | "roots", | ||
Line 87: | Line 112: | ||
} | } | ||
data.nonlemmas = { | data.nonlemmas = list_to_set{ | ||
"active participle forms", | |||
"active participles", | "active participles", | ||
"adjectival participles", | "adjectival participles", | ||
"adjective case forms", | |||
"adjective forms", | "adjective forms", | ||
"adjective feminine forms", | "adjective feminine forms", | ||
Line 103: | Line 130: | ||
"comparative adverb forms", | "comparative adverb forms", | ||
"comparative adverbs", | "comparative adverbs", | ||
"conjunction forms", | |||
"contractions", | "contractions", | ||
"converbs", | "converbs", | ||
Line 109: | Line 137: | ||
"determiner superlative forms", | "determiner superlative forms", | ||
"diminutive nouns", | "diminutive nouns", | ||
"elative adjectives", | |||
"equative adjective forms", | "equative adjective forms", | ||
"equative adjectives", | "equative adjectives", | ||
Line 117: | Line 146: | ||
"interjection forms", | "interjection forms", | ||
"jyutping", | "jyutping", | ||
"misspellings", | "misspellings", | ||
"negative participles", | "negative participles", | ||
Line 124: | Line 152: | ||
"noun dual forms", | "noun dual forms", | ||
"noun forms", | "noun forms", | ||
"noun paucal forms", | |||
"noun plural forms", | "noun plural forms", | ||
"noun possessive forms", | "noun possessive forms", | ||
Line 164: | Line 193: | ||
"verb forms", | "verb forms", | ||
"verbal nouns", | "verbal nouns", | ||
} | |||
-- These languages will not have links to separate parts of the headword. | |||
data.no_multiword_links = list_to_set{ | |||
"zh", | |||
} | } | ||
-- These languages will not have "LANG multiword terms" categories added. | -- These languages will not have "LANG multiword terms" categories added. | ||
data.no_multiword_cat = { | data.no_multiword_cat = list_to_set{ | ||
-------- Languages without spaces between words (sometimes spaces between phrases) -------- | -------- Languages without spaces between words (sometimes spaces between phrases) -------- | ||
"blt", -- Tai Dam | "blt", -- Tai Dam | ||
"ja", -- Japanese | "ja", -- Japanese | ||
Line 178: | Line 211: | ||
"my", -- Burmese | "my", -- Burmese | ||
"nan", -- Min Nan (some words in Latin script; hyphens between syllables) | "nan", -- Min Nan (some words in Latin script; hyphens between syllables) | ||
"nan-hbl", -- Hokkien (some words in Latin script; hyphens between syllables) | |||
"nod", -- Northern Thai | "nod", -- Northern Thai | ||
"ojp", -- Old Japanese | "ojp", -- Old Japanese | ||
"shn", -- Shan | |||
"sou", -- Southern Thai | |||
"tdd", -- Tai Nüa | "tdd", -- Tai Nüa | ||
"th", -- Thai | "th", -- Thai | ||
"tts", -- Isan | "tts", -- Isan | ||
"twh", -- Tai Dón | "twh", -- Tai Dón | ||
" | "txg", -- Tangut | ||
"zh", -- Chinese (all varieties with Chinese characters) | "zh", -- Chinese (all varieties with Chinese characters) | ||
"zkt", -- Khitan | |||
-------- Languages with spaces between syllables -------- | -------- Languages with spaces between syllables -------- | ||
Line 193: | Line 229: | ||
"atb", -- Zaiwa | "atb", -- Zaiwa | ||
"byk", -- Biao | "byk", -- Biao | ||
"cdy", -- Chadong | |||
--"duu", -- Drung; not sure | --"duu", -- Drung; not sure | ||
--"hmx-pro", -- Proto-Hmong-Mien | --"hmx-pro", -- Proto-Hmong-Mien | ||
Line 201: | Line 238: | ||
"mtq", -- Muong | "mtq", -- Muong | ||
--"mww", -- White Hmong; not sure | --"mww", -- White Hmong; not sure | ||
"onb", -- Lingao | |||
--"sit-gkh", -- Gokhy; not sure | --"sit-gkh", -- Gokhy; not sure | ||
--"swi", -- Sui; not sure | --"swi", -- Sui; not sure | ||
"tbq-lol-pro", -- Proto-Loloish | "tbq-lol-pro", -- Proto-Loloish | ||
"tdh", -- Thulung | "tdh", -- Thulung | ||
"ukk", -- Muak Sa-aak | |||
"vi", -- Vietnamese | "vi", -- Vietnamese | ||
"yig", -- Wusa Nasu | "yig", -- Wusa Nasu | ||
Line 212: | Line 251: | ||
"mkh-ban-pro", -- Proto-Bahnaric | "mkh-ban-pro", -- Proto-Bahnaric | ||
"sit-pro", -- Proto-Sino-Tibetan; listed above | "sit-pro", -- Proto-Sino-Tibetan; listed above | ||
-------- Other weirdnesses -------- | -------- Other weirdnesses -------- | ||
"mul", -- Translingual; gestures, Morse code, etc. | "mul", -- Translingual; gestures, Morse code, etc. | ||
Line 363: | Line 402: | ||
-- In these languages, the hyphen is not considered a word separator for the "multiword terms" category. | -- In these languages, the hyphen is not considered a word separator for the "multiword terms" category. | ||
data.hyphen_not_multiword_sep = { | data.hyphen_not_multiword_sep = list_to_set{ | ||
"akk", -- Akkadian; hyphens between syllables | "akk", -- Akkadian; hyphens between syllables | ||
"akl", -- Aklanon; hyphens for mid-word glottal stops | |||
"ber-pro", -- Proto-Berber; morphemes separated by hyphens | |||
"ceb", -- Cebuano; hyphens for mid-word glottal stops | |||
"cnk", -- Khumi Chin; hyphens used in single words | |||
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables | "cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables | ||
"de", -- too many false positives | "de", -- too many false positives | ||
"esx-esk-pro", -- hyphen used to separate morphemes | "esx-esk-pro", -- hyphen used to separate morphemes | ||
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively | "fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively | ||
"hil", -- Hiligaynon; hyphens for mid-word glottal stops | |||
"ilo", -- Ilocano; hyphens for mid-word glottal stops | |||
"kne", -- Kankanaey; hyphens for mid-word glottal stops | |||
"lcp", -- Western Lawa; dash as syllable joiner | "lcp", -- Western Lawa; dash as syllable joiner | ||
"lwl", -- Eastern Lawa; dash as syllable joiner | "lwl", -- Eastern Lawa; dash as syllable joiner | ||
"mfa", -- Pattani Malay in Thai script; dash as syllable joiner | |||
"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens | "mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens | ||
"msb", -- Masbatenyo; too many false positives | |||
"tl", -- Tagalog; too many false positives | |||
"war", -- Waray-Waray; too many false positives | |||
"yo", -- Yoruba; hyphens used to show lengthened nasal vowels | |||
} | } | ||
-- These languages will not have "LANG masculine nouns" and similar categories added. | -- These languages will not have "LANG masculine nouns" and similar categories added. | ||
data.no_gender_cat = { | data.no_gender_cat = list_to_set{ | ||
-- Languages without gender but which use the gender field for other purposes | -- Languages without gender but which use the gender field for other purposes | ||
"ja", | "ja", | ||
Line 381: | Line 432: | ||
} | } | ||
data.notranslit = { | data.notranslit = list_to_set{ | ||
"ams", | "ams", | ||
"az", | "az", | ||
Line 389: | Line 440: | ||
"cjm", | "cjm", | ||
"cmn", | "cmn", | ||
"cpi", | |||
"hak", | "hak", | ||
"ja", | "ja", | ||
Line 398: | Line 450: | ||
"mvi", | "mvi", | ||
"nan", | "nan", | ||
"nan-hbl", | |||
"nan-hnm", | |||
"nan-luh", | |||
"nan-tws", | |||
"oj", | "oj", | ||
"okn", | "okn", | ||
"ryn", | "ryn", | ||
"rys", | "rys", | ||
Line 410: | Line 464: | ||
"tkn", | "tkn", | ||
"tly", | "tly", | ||
"txg", | |||
"und", | "und", | ||
"vi", | "vi", | ||
"xug", | "xug", | ||
"yoi", | "yoi", | ||
"yox", | "yox", | ||
"yue", | |||
"za", | "za", | ||
"zh", | "zh", | ||
} | } | ||
-- Script codes for which a script-tagged display title will be added. | -- Script codes for which a script-tagged display title will be added. | ||
data.toBeTagged = { | data.toBeTagged = list_to_set{ | ||
"Ahom", | "Ahom", | ||
"Arab", | "Arab", | ||
"fa-Arab", | |||
"glk-Arab", | |||
"kk-Arab", | |||
"ks-Arab", | |||
"ku-Arab", | |||
"mzn-Arab", | |||
"ms-Arab", | |||
"ota-Arab", | |||
"pa-Arab", | |||
"ps-Arab", | |||
"sd-Arab", | |||
"tt-Arab", | |||
"ug-Arab", | |||
"ur-Arab", | |||
"Armi", | |||
"Armn", | |||
"Avst", | "Avst", | ||
"Bali", | "Bali", | ||
"Bamu", | |||
"Batk", | |||
"Beng", | |||
"as-Beng", | |||
"Bopo", | |||
"Brah", | |||
"Brai", | |||
"Bugi", | |||
"Buhd", | |||
"Cakm", | |||
"Cans", | |||
"Cari", | |||
"Cham", | "Cham", | ||
"Cher", | |||
"Copt", | "Copt", | ||
" | "Cprt", | ||
"Cyrl", | |||
"Cyrs", | |||
"Deva", | |||
"Dsrt", | |||
"Egyd", | |||
"Egyp", | |||
"Ethi", | |||
"Geok", | |||
"Geor", | |||
"Glag", | |||
"Goth", | |||
"Grek", | |||
"Polyt", | |||
"polytonic", | |||
"Gujr", | |||
"Guru", | |||
"Hang", | |||
"Hani", | "Hani", | ||
"Hano", | |||
"Hebr", | "Hebr", | ||
"Hira", | |||
"Hluw", | |||
"Ital", | |||
"Java", | |||
"Kali", | |||
"Kana", | |||
"Khar", | |||
"Khmr", | |||
"Knda", | |||
"Kthi", | |||
"Lana", | "Lana", | ||
"Laoo", | |||
"Latn", | |||
"Latf", | |||
"Latg", | |||
"Latnx", | |||
"Latinx", | |||
"pjt-Latn", | |||
"Lepc", | |||
"Limb", | |||
"Linb", | "Linb", | ||
"Lisu", | |||
"Lyci", | |||
"Lydi", | |||
"Mand", | "Mand", | ||
"Mani", | |||
"Marc", | |||
"Merc", | |||
"Mero", | |||
"Mlym", | |||
"Mong", | "Mong", | ||
" | "mnc-Mong", | ||
"sjo-Mong", | |||
"xwo-Mong", | |||
"Mtei", | |||
"Mymr", | |||
"Narb", | |||
"Nkoo", | |||
"Nshu", | |||
"Ogam", | |||
"Olck", | |||
"Orkh", | |||
"Orya", | |||
"Osma", | |||
"Ougr", | |||
"Palm", | |||
"Phag", | |||
"Phli", | |||
"Phlv", | |||
"Phnx", | |||
"Plrd", | |||
"Prti", | |||
"Rjng", | "Rjng", | ||
"Runr", | |||
"Samr", | "Samr", | ||
"Sarb", | |||
"Saur", | |||
"Sgnw", | |||
"Shaw", | |||
"Shrd", | |||
"Sinh", | |||
"Sora", | |||
"Sund", | "Sund", | ||
"Sylo", | "Sylo", | ||
"Syrc", | |||
"Tagb", | |||
"Tale", | |||
"Talu", | |||
"Taml", | |||
"Tang", | "Tang", | ||
"Tavt", | "Tavt", | ||
"Telu", | |||
"Tfng", | |||
"Tglg", | |||
"Thaa", | |||
"Thai", | |||
"Tibt", | |||
"Ugar", | |||
"Vaii", | |||
"Xpeo", | |||
"Xsux", | "Xsux", | ||
"Yiii", | |||
"Zmth", | |||
"Zsym", | |||
"Ipach", | |||
"Music", | |||
"Rumin", | |||
} | |||
-- Parts of speech which will not be categorised in categories like "English terms spelled with É" if | |||
-- the term is the character in question (e.g. the letter entry for English [[é]]). This contrasts with | |||
-- entries like the French adjective [[m̂]], which is a one-letter word spelled with the letter. | |||
data.pos_not_spelled_with_self = list_to_set{ | |||
"diacritical marks", | |||
"Han characters", | |||
"Han tu", | |||
"hanja", | |||
"hanzi", | |||
"iteration marks", | |||
"kana", | |||
"kanji", | |||
"letters", | |||
"ligatures", | |||
"logograms", | |||
"morae", | |||
"numeral symbols", | |||
"numerals", | |||
"punctuation marks", | |||
"syllables", | |||
"symbols", | |||
} | } | ||
for | ------ 2. Lists not converted into sets. ------ | ||
-- Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not | |||
-- pluralized) form. It is singular so that the same table can be used in [[Module:form of]] for the p=/POS= param | |||
-- and [[Module:links]] for the pos= param. | |||
data.pos_aliases = { | |||
a = "adjective", | |||
adj = "adjective", | |||
adv = "adverb", | |||
art = "article", | |||
det = "determiner", | |||
cnum = "cardinal number", | |||
conj = "conjunction", | |||
conv = "converb", | |||
int = "interjection", | |||
interj = "interjection", | |||
intj = "interjection", | |||
n = "noun", | |||
num = "numeral", | |||
part = "participle", | |||
pcl = "particle", | |||
phr = "phrase", | |||
pn = "proper noun", | |||
postp = "postposition", | |||
pre = "preposition", | |||
prep = "preposition", | |||
pro = "pronoun", | |||
pron = "pronoun", | |||
prop = "proper noun", | |||
proper = "proper noun", | |||
onum = "ordinal number", | |||
v = "verb", | |||
vb = "verb", | |||
vi = "intransitive verb", | |||
vt = "transitive verb", | |||
vti = "transitive and intransitive verb", | |||
} | |||
-- Parts of speech for which categories like "German masculine nouns" or "Russian imperfective verbs" | -- Parts of speech for which categories like "German masculine nouns" or "Russian imperfective verbs" | ||
-- will be generated if the headword is of the appropriate gender/number | -- will be generated if the headword is of the appropriate gender/number. | ||
data.pos_for_gender_number_cat = { | data.pos_for_gender_number_cat = { | ||
["nouns"] = "nouns", | ["nouns"] = "nouns", | ||
["proper nouns"] = "nouns", | ["proper nouns"] = "nouns", | ||
["suffixes"] = "suffixes", | |||
-- We include verbs because impf and pf are valid "genders". | -- We include verbs because impf and pf are valid "genders". | ||
["verbs"] = "verbs", | ["verbs"] = "verbs", | ||
} | } | ||
------ 3. Page-wide processing (so that it only needs to be done once per page). ------ | |||
data.page = require(headword_page_module).process_page() | |||
-- FIXME: random references to data.pagename and data.encoded_pagename are scattered throughout the codebase. | |||
data.pagename = data.page.pagename | |||
data.encoded_pagename = data.page.encoded_pagename | |||
return data | return data |