Module:headword/data: Difference between revisions

No edit summary
No edit summary
 
(4 intermediate revisions by the same user not shown)
Line 1: Line 1:
local headword_page_module = "Module:headword/page"
local list_to_set = require("Module:table").listToSet
local data = {}
local data = {}


data.invariable = {
------ 1. Lists which are converted into sets. ------
 
--[==[ var:
Set of large pages for which label tracking, red link checking and similar processing is disabled.
]==]
data.large_pages = list_to_set({
	-- Pages that hit timeouts consistently.
	"a",
	-- Pages that hit timeouts only some of the time.
	"A",
	"baba",
	"de",
	"e",
	"i",
	"lima",
	"o",
	"u",
	"и",
	"山",
	"子",
	"月",
	"一",
	"人",
})
 
--[==[ var:
Map from singular to plural, and from plural to itself, for recognized parts of speech with irregular plurals. Most of
these are invariable plurals, e.g. `kanji` is its own plural; but we also have `mora` plural `morae`.
]==]
data.irregular_plurals = list_to_set({
"cmavo",
"cmavo",
"cmene",
"cmene",
Line 7: Line 40:
"gismu",
"gismu",
"Han tu",
"Han tu",
"hanja",
"hanzi",
"hanzi",
"hanja",
"jyutping",
"jyutping",
"kana",
"kanji",
"kanji",
"lujvo",
"lujvo",
Line 15: Line 49:
"pinyin",
"pinyin",
"rafsi",
"rafsi",
"romaji",
}, function(_, item)
}
return item
end)
 
local irregular_plurals = data.irregular_plurals

-- Plurals that differ from their singular: irregular non-zero plurals, plus any regular plurals whose
-- singular ends in "s" (the module assumes that an input ending in "s" is already a plural). Both the
-- singular and the plural are registered as keys, because the module generates a default plural for any
-- input that does not match a key in this table.
local nonzero_plurals = {
	mora = "morae",
}
for singular, plural in next, nonzero_plurals do
	irregular_plurals[singular] = plural
	irregular_plurals[plural] = plural
end


data.lemmas = {
--[==[ var:
Recognized lemmas. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the category
'LANG lemmas' will automatically be added. If the part of speech is not a singular or plural lemma or non-lemma form and
is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added to various tracking
categories:
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos]]
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/LANG]]
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS]]
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS/LANG]]
]==]
data.lemmas = list_to_set{
"abbreviations",
"abbreviations",
"acronyms",
"acronyms",
Line 31: Line 88:
"circumpositions",
"circumpositions",
"classifiers",
"classifiers",
"clitics",
"cmavo",
"cmavo",
"cmavo clusters",
"cmavo clusters",
Line 40: Line 96:
"determiners",
"determiners",
"diacritical marks",
"diacritical marks",
"digraphs",
"equative adjectives",
"equative adjectives",
"fu'ivla",
"fu'ivla",
Line 45: Line 102:
"Han characters",
"Han characters",
"Han tu",
"Han tu",
"hanja",
"hanzi",
"hanzi",
"hanja",
"ideophones",
"ideophones",
"idioms",
"idioms",
"infixes",
"infixes",
"initialisms",
"iteration marks",
"interfixes",
"interfixes",
"initialisms",
"interjections",
"interjections",
"kana",
"kanji",
"kanji",
"letters",
"letters",
"ligatures",
"ligatures",
"logograms",
"lujvo",
"lujvo",
"morae",
"morphemes",
"morphemes",
"non-constituents",
"non-constituents",
Line 69: Line 130:
"predicatives",
"predicatives",
"prefixes",
"prefixes",
"prepositional phrases",
"prepositions",
"prepositions",
"prepositional phrases",
"preverbs",
"preverbs",
"pronominal adverbs",
"pronominal adverbs",
"pronouns",
"pronouns",
"proper nouns",
"proverbs",
"proverbs",
"proper nouns",
"punctuation marks",
"punctuation marks",
"relatives",
"relatives",
Line 86: Line 147:
}
}


data.nonlemmas = {
--[==[ var:
Recognized non-lemma forms. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the
category 'LANG non-lemma forms' will automatically be added. If the part of speech is not a singular or plural lemma or
non-lemma form and is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added
to various tracking categories; see the documentation of `data.lemmas`.
]==]
data.nonlemmas = list_to_set{
"active participle forms",
"active participles",
"active participles",
"adjectival participles",
"adjectival participles",
    "adjective case forms",
"adjective forms",
"adjective forms",
"adjective feminine forms",
"adjective feminine forms",
Line 102: Line 171:
"comparative adverb forms",
"comparative adverb forms",
"comparative adverbs",
"comparative adverbs",
"conjunction forms",
"contractions",
"contractions",
"converbs",
"converbs",
Line 108: Line 178:
"determiner superlative forms",
"determiner superlative forms",
"diminutive nouns",
"diminutive nouns",
"elative adjectives",
"equative adjective forms",
"equative adjective forms",
"equative adjectives",
"equative adjectives",
Line 116: Line 187:
"interjection forms",
"interjection forms",
"jyutping",
"jyutping",
"kanji readings",
"misspellings",
"misspellings",
"negative participles",
"negative participles",
"nominal participles",
"nominal participles",
"noun case forms",
"noun case forms",
"noun construct forms",
"noun dual forms",
"noun dual forms",
"noun forms",
"noun forms",
"noun paucal forms",
"noun plural forms",
"noun plural forms",
"noun possessive forms",
"noun possessive forms",
Line 132: Line 204:
"passive participles",
"passive participles",
"past active participles",
"past active participles",
"past adverbial participles",
"past participles",
"past participles",
"past participle forms",
"past participle forms",
Line 146: Line 219:
"prepositional pronouns",
"prepositional pronouns",
"present active participles",
"present active participles",
"present adverbial participles",
"present participles",
"present participles",
"present passive participles",
"present passive participles",
"preverb forms",
"pronoun forms",
"pronoun forms",
"pronoun possessive forms",
"pronoun possessive forms",
Line 165: Line 240:
}
}


-- These languages will not have "LANG multiword terms" categories added.
--[==[ var:
data.no_multiword_cat = {
List of languages that will not have links to separate parts of the headword.
]==]
data.no_multiword_links = list_to_set({
	"zh", -- Chinese
})
 
--[==[ var:
List of languages that will not have `LANG multiword terms` categories added. There are various reasons why languages
are in this list: (a) words are written without spaces between them; (b) syllables are written with spaces between them;
(c) variant reconstructions are notated with a tilde surrounded by spaces; (d) the language is a sign language, where
pagenames are multiword descriptions of the gesture(s) required to make an individual sign; (e) some other weirdnesses.
]==]
data.no_multiword_cat = list_to_set{
-------- Languages without spaces between words (sometimes spaces between phrases) --------
-------- Languages without spaces between words (sometimes spaces between phrases) --------
"aho", -- Ahom
"blt", -- Tai Dam
"blt", -- Tai Dam
"ja", -- Japanese
"ja", -- Japanese
Line 177: Line 263:
"my", -- Burmese
"my", -- Burmese
"nan", -- Min Nan (some words in Latin script; hyphens between syllables)
"nan", -- Min Nan (some words in Latin script; hyphens between syllables)
"nan-hbl", -- Hokkien (some words in Latin script; hyphens between syllables)
"nod", -- Northern Thai
"nod", -- Northern Thai
"ojp", -- Old Japanese
"ojp", -- Old Japanese
"shn", -- Shan
"sou", -- Southern Thai
"tdd", -- Tai Nüa
"tdd", -- Tai Nüa
"th", -- Thai
"th", -- Thai
"tts", -- Isan
"tts", -- Isan
"twh", -- Tai Dón
"twh", -- Tai Dón
"shn", -- Shan
"txg", -- Tangut
"sou", -- Southern Thai
"zh", -- Chinese (all varieties with Chinese characters)
"zh", -- Chinese (all varieties with Chinese characters)
"zkt", -- Khitan


-------- Languages with spaces between syllables --------
-------- Languages with spaces between syllables --------
Line 192: Line 281:
"atb", -- Zaiwa
"atb", -- Zaiwa
"byk", -- Biao
"byk", -- Biao
"cdy", -- Chadong
--"duu", -- Drung; not sure
--"duu", -- Drung; not sure
--"hmx-pro", -- Proto-Hmong-Mien
--"hmx-pro", -- Proto-Hmong-Mien
Line 200: Line 290:
"mtq", -- Muong
"mtq", -- Muong
--"mww", -- White Hmong; not sure
--"mww", -- White Hmong; not sure
"onb", -- Lingao
--"sit-gkh", -- Gokhy; not sure
--"sit-gkh", -- Gokhy; not sure
--"swi", -- Sui; not sure
--"swi", -- Sui; not sure
"tbq-lol-pro", -- Proto-Loloish
"tbq-lol-pro", -- Proto-Loloish
"tdh", -- Thulung
"tdh", -- Thulung
"ukk", -- Muak Sa-aak
"vi", -- Vietnamese
"vi", -- Vietnamese
"yig", -- Wusa Nasu
"yig", -- Wusa Nasu
Line 211: Line 303:
"mkh-ban-pro", -- Proto-Bahnaric
"mkh-ban-pro", -- Proto-Bahnaric
"sit-pro", -- Proto-Sino-Tibetan; listed above
"sit-pro", -- Proto-Sino-Tibetan; listed above
 
-------- Other weirdnesses --------
-------- Other weirdnesses --------
"mul", -- Translingual; gestures, Morse code, etc.
"mul", -- Translingual; gestures, Morse code, etc.
Line 361: Line 453:
}
}


-- In these languages, the hyphen is not considered a word separator for the "multiword terms" category.
--[==[ var:
data.hyphen_not_multiword_sep = {
List of languages where a hyphen is not considered a word separator for the `LANG multiword terms` category. There are
numerous reasons why languages are in this list; the reason for inclusion should be noted next to each language.
]==]
data.hyphen_not_multiword_sep = list_to_set{
"akk", -- Akkadian; hyphens between syllables
"akk", -- Akkadian; hyphens between syllables
"akl", -- Aklanon; hyphens for mid-word glottal stops
"ber-pro", -- Proto-Berber; morphemes separated by hyphens
"ceb", -- Cebuano; hyphens for mid-word glottal stops
"cnk", -- Khumi Chin; hyphens used in single words
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
"de", -- too many false positives
"de", -- German; too many false positives
"esx-esk-pro", -- hyphen used to separate morphemes
"esx-esk-pro", -- hyphen used to separate morphemes
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
"gd", -- Scottish Gaelic; too many false positives like [[a-chianaibh]], [[a-nìos]], [[an-dè]] and other adverbs in a- and an-
"hil", -- Hiligaynon; hyphens for mid-word glottal stops
"hnn", -- Hanunoo; too many false positives
"ilo", -- Ilocano; hyphens for mid-word glottal stops
"kne", -- Kankanaey; hyphens for mid-word glottal stops
"lcp", -- Western Lawa; dash as syllable joiner
"lcp", -- Western Lawa; dash as syllable joiner
"lwl", -- Eastern Lawa; dash as syllable joiner
"lwl", -- Eastern Lawa; dash as syllable joiner
"mfa", -- Pattani Malay in Thai script; dash as syllable joiner
"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
"msb", -- Masbatenyo; too many false positives
"tl", -- Tagalog; too many false positives
"war", -- Waray-Waray; too many false positives
"yo", -- Yoruba; hyphens used to show lengthened nasal vowels
}
}


-- These languages will not have "LANG masculine nouns" and similar categories added.
--[==[ var:
data.no_gender_cat = {
List of languages that will not have `LANG masculine nouns` and similar categories added. Generally, these languages are
lacking gender but use the gender field for other purposes. (This is a massive hack and should be changed.)
]==]
data.no_gender_cat = list_to_set{
-- Languages without gender but which use the gender field for other purposes
-- Languages without gender but which use the gender field for other purposes
"ja",
"ja",
Line 380: Line 492:
}
}


data.notranslit = {
--[==[ var:
List of languages where [[Module:headword]] should not attempt to generate a transliteration even if the term is written
in a non-Latin script. FIXME: Notate reasons why each language is in this list.
]==]
data.notranslit = list_to_set{
"ams",
"ams",
"az",
"az",
"bbc",
"bbc",
"bug",
"bug",
"cdo",
"cia",
"cia",
"cjm",
"cjm",
"cjy",
"cmn",
"cmn",
"cnp",
"cpi",
"cpx",
"csp",
"czh",
"czo",
"gan",
"hak",
"hak",
"hnm",
"hsn",
"ja",
"ja",
"kzg",
"kzg",
"lad",
"lad",
"ltc",
"luh",
"lzh",
"lzh",
"mnp",
"ms",
"ms",
"mul",
"mul",
"mvi",
"mvi",
"nan",
"nan",
"nan-dat",
"nan-hbl",
"nan-hlh",
"nan-lnx",
"nan-tws",
"nan-zhe",
"nan-zsh",
"och",
"oj",
"oj",
"okn",
"okn",
"pi",
"ro",
"ryn",
"ryn",
"rys",
"rys",
"ryu",
"ryu",
"sh",
"sh",
"sjc",
"tgt",
"tgt",
"th",
"th",
"tkn",
"tkn",
"tly",
"tly",
"txg",
"und",
"und",
"vi",
"vi",
"wuu",
"xug",
"xug",
"yue",
"yoi",
"yoi",
"yox",
"yox",
"yue",
"za",
"za",
"zh",
"zh",
"zhx-sic",
"zhx-tai",
}
--[==[ var:
Set of languages for which `sccat` defaults to true, so that categories like `LANG POS in SCRIPT script` are generated
automatically. The default can be overridden with {{para|sccat|0}} in {{tl|head}}, or by setting `sccat` to `false`
in Lua.
]==]
data.default_sccat = list_to_set({
	"inc-apa",
	"inc-ash",
	"kfr",
	"ks",
	"mr",
	"mwr",
	"inc-oaw",
	"inc-ohi",
	"omr",
	"inc-opa",
	"phr",
	"pi",
	"pra",
	"sa",
	"skr",
	"sd",
})
}


-- Script codes for which a script-tagged display title will be added.
--[==[ var:
data.toBeTagged = {
List of script codes for which a script-tagged display title will be added.
]==]
data.toBeTagged = list_to_set{
"Ahom",
"Ahom",
"Arab",
"Arab",
"fa-Arab",
"glk-Arab",
"kk-Arab",
"ks-Arab",
"ku-Arab",
"mzn-Arab",
"ms-Arab",
"ota-Arab",
"pa-Arab",
"ps-Arab",
"sd-Arab",
"tt-Arab",
"ug-Arab",
"ur-Arab",
"Armi",
"Armn",
"Avst",
"Avst",
"Bali",
"Bali",
"Bamu",
"Batk",
"Beng",
"as-Beng",
"Bopo",
"Brah",
"Brai",
"Bugi",
"Buhd",
"Cakm",
"Cans",
"Cari",
"Cham",
"Cham",
"Cher",
"Copt",
"Copt",
"Kali",
"Cprt",
"Cyrl",
"Cyrs",
"Deva",
"Dsrt",
"Egyd",
"Egyp",
"Ethi",
"Geok",
"Geor",
"Glag",
"Goth",
"Grek",
"Polyt",
"polytonic",
"Gujr",
"Guru",
"Hang",
"Hani",
"Hani",
"Hano",
"Hebr",
"Hebr",
"Hira",
"Hluw",
"Ital",
"Java",
"Kali",
"Kana",
"Khar",
"Khmr",
"Knda",
"Kthi",
"Lana",
"Lana",
"Laoo",
"Latn",
"Latf",
"Latg",
"Latnx",
"Latinx",
"pjt-Latn",
"Lepc",
"Limb",
"Linb",
"Linb",
"Lisu",
"Lyci",
"Lydi",
"Mand",
"Mand",
"Mani",
"Marc",
"Merc",
"Mero",
"Mlym",
"Mong",
"Mong",
"polytonic",
"mnc-Mong",
"sjo-Mong",
"xwo-Mong",
"Mtei",
"Mymr",
"Narb",
"Nkoo",
"Nshu",
"Ogam",
"Olck",
"Orkh",
"Orya",
"Osma",
"Ougr",
"Palm",
"Phag",
"Phli",
"Phlv",
"Phnx",
"Plrd",
"Prti",
"Rjng",
"Rjng",
"Runr",
"Samr",
"Samr",
"Sarb",
"Saur",
"Sgnw",
"Shaw",
"Shrd",
"Sinh",
"Sora",
"Sund",
"Sund",
"Sylo",
"Sylo",
"Syrc",
"Tagb",
"Tale",
"Talu",
"Taml",
"Tang",
"Tang",
"Tavt",
"Tavt",
"Telu",
"Tfng",
"Tglg",
"Thaa",
"Thai",
"Tibt",
"Ugar",
"Vaii",
"Xpeo",
"Xsux",
"Xsux",
"Yiii",
"Zmth",
"Zsym",
"Ipach",
"Music",
"Rumin",
}
--[==[ var:
Set of parts of speech that are excluded from categories like `English terms spelled with É` when the term is itself
the character in question (e.g. the letter entry for English [[é]]). Entries such as the French adjective [[m̂]] are a
different case: that is a one-letter word spelled with the letter.
]==]
data.pos_not_spelled_with_self = list_to_set({
	"diacritical marks",
	"Han characters",
	"Han tu",
	"hanja",
	"hanzi",
	"iteration marks",
	"kana",
	"kanji",
	"letters",
	"ligatures",
	"logograms",
	"morae",
	"numeral symbols",
	"numerals",
	"punctuation marks",
	"syllables",
	"symbols",
})
}


for key, list in pairs(data) do
------ 2. Lists not converted into sets. ------
data[key] = require("Module:utils").list_to_set(list)
 
end
--[==[ var:
Aliases accepted for parts of speech (param 2=). Each key is a short form and each value is the canonical singular
(non-pluralized) name; the singular is used so that the same table can serve [[Module:form of]] for the
{{para|p}}/{{para|POS}} param and [[Module:links]] for the pos= param. Any part of speech, abbreviated or not, may
also be suffixed with `f` to produce the corresponding non-lemma form part of speech, e.g. `adjf`, `af` or
`adjectivef` for `adjective form`, and `nounf` or `nf` for `noun form`. The expansion is applied even where it makes
no sense for the part of speech in question (`pclf` becomes `particle form`, `symf` becomes `symbol form`) and
currently also, at least in [[Module:headword]] (but not [[Module:links]]), when the part before the `f` is not a
recognized part of speech or abbreviation (so `nerf` becomes `ner form`).
]==]
data.pos_aliases = {
	-- Adjectives, adverbs, determiners, pronouns and articles.
	a = "adjective",
	adj = "adjective",
	compadj = "comparative adjective",
	supadj = "superlative adjective",
	adv = "adverb",
	compadv = "comparative adverb",
	supadv = "superlative adverb",
	det = "determiner",
	compdet = "comparative determiner",
	supdet = "superlative determiner",
	pron = "pronoun",
	comppron = "comparative pronoun",
	suppron = "superlative pronoun",
	art = "article",

	-- Nouns and proper nouns.
	n = "noun",
	pn = "proper noun",
	prop = "proper noun",
	proper = "proper noun",
	propn = "proper noun",
	-- These two support Algonquian languages; see also vii/vai/vti/vta further down.
	na = "animate noun",
	ni = "inanimate noun",

	-- Augmentatives and diminutives.
	aug = "augmentative",
	dim = "diminutive",

	-- Classifiers and numerals.
	cls = "classifier",
	num = "numeral",

	-- Verbs.
	v = "verb",
	vb = "verb",
	vi = "intransitive verb",
	vm = "modal verb",
	vt = "transitive verb",
	-- These four support Algonquian languages.
	vii = "inanimate intransitive verb",
	vai = "animate intransitive verb",
	vti = "transitive inanimate verb",
	vta = "transitive animate verb",

	-- Participles and converbs.
	part = "participle",
	pastpart = "past participle",
	prespart = "present participle",
	conv = "converb",

	-- Conjunctions, contractions, interjections, particles, phrases and adpositions.
	conj = "conjunction",
	contr = "contraction",
	int = "interjection",
	interj = "interjection",
	intj = "interjection",
	pcl = "particle",
	phr = "phrase",
	postp = "postposition",
	prep = "preposition",
	prepphr = "prepositional phrase",

	-- Affixes.
	pref = "prefix",
	suf = "suffix",

	-- Romanizations and symbols.
	rom = "romanization",
	roman = "romanization",
	romanisation = "romanization",
	romanisations = "romanization",
	sym = "symbol",
}


-- Parts of speech for which categories like "German masculine nouns" or "Russian imperfective verbs"
--[==[ var:
-- will be generated if the headword is of the appropriate gender/number. We put this at the bottom
Map of parts of speech for which categories like `German masculine nouns` or `Russian imperfective verbs` will be
-- because it's a map, not a list.
generated if the headword is of the appropriate gender/number. The map is used to canonicalize parts of speech for
categorization purposes; specifically, proper nouns are categorized like nouns.
]==]
data.pos_for_gender_number_cat = {
data.pos_for_gender_number_cat = {
["nouns"] = "nouns",
["nouns"] = "nouns",
["proper nouns"] = "nouns",
["proper nouns"] = "nouns",
["suffixes"] = "suffixes",
-- We include verbs because impf and pf are valid "genders".
-- We include verbs because impf and pf are valid "genders".
["verbs"] = "verbs",
["verbs"] = "verbs",
}
}
--[==[ var:
Per-language lower limit for a word to count as "long". Used to categorize terms automatically into e.g.
[[:Category:Long English words]]. A language with no entry in this map is not categorized.
]==]
data.long_word_thresholds = {
	af = 20,
	bg = 20,
	cy = 25,
	de = 20,
	en = 25,
	es = 20,
	fr = 20,
	ka = 20,
	sv = 20,
	tl = 25,
}
------ 3. Page-wide processing (so that it only needs to be done once per page). ------
local page = require(headword_page_module).process_page()
data.page = page
-- Expose a couple of page properties directly on `data` for ease of use.
data.pagename = page.pagename
data.encoded_pagename = page.encoded_pagename


return data
return data