Home
Random

Log in

Settings

About Linguifex
Disclaimers

Module:headword/data: Difference between revisions

Language
Watch
View history
View source

@@ Line 1: / Line 1: @@
+local headword_page_module = "Module:headword/page"
+local list_to_set = require("Module:table").listToSet
 local data = {}
-data.invariable = {
+------ 1. Lists which are converted into sets. ------
+--[==[ var:
+Large pages where we disable label tracking, red link checking and similar.
+]==]
+data.large_pages = list_to_set {
+	-- pages that consistently hit timeouts
+	"a",
+	-- pages that sometimes hit timeouts
+	"A",
+	"baba",
+	"de",
+	"e",
+	"i",
+	"lima",
+	"o",
+	"u",
+	"и",
+	"山",
+	"子",
+	"月",
+	"一",
+	"人",
+}
+--[==[ var:
+Map from singular to plural, and from plural to itself, for recognized parts of speech with irregular plurals. Most of
+these are invariable plurals, e.g. `kanji` is its own plural; but we also have `mora` plural `morae`.
+]==]
+data.irregular_plurals = list_to_set({
 	"cmavo",
 	"cmene",
@@ Line 7: / Line 40: @@
 	"gismu",
 	"Han tu",
+	"hanja",
 	"hanzi",
-	"hanja",
 	"jyutping",
+	"kana",
 	"kanji",
 	"lujvo",
@@ Line 15: / Line 49: @@
 	"pinyin",
 	"rafsi",
-	"romaji",
+}, function(_, item)
-}
+	return item
+end)
+local irregular_plurals = data.irregular_plurals
+-- Irregular non-zero plurals AND any regular plurals where the singular ends in "s",
+-- because the module assumes that inputs ending in "s" are plurals. The singular and
+-- plural both need to be added, as the module will generate a default plural if
+-- the input doesn't match a key in this table.
+for sg, pl in next, {
+	mora = "morae"
+} do
+	irregular_plurals[sg], irregular_plurals[pl] = pl, pl
+end
-data.lemmas = {
+--[==[ var:
+Recognized lemmas. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the category
+'LANG lemmas' will automatically be added. If the part of speech is not a singular or plural lemma or non-lemma form and
+is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added to various tracking
+categories:
+* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos]]
+* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/LANG]]
+* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS]]
+* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS/LANG]]
+]==]
+data.lemmas = list_to_set{
 	"abbreviations",
 	"acronyms",
@@ Line 31: / Line 88: @@
 	"circumpositions",
 	"classifiers",
-	"clitics",
 	"cmavo",
 	"cmavo clusters",
@@ Line 40: / Line 96: @@
 	"determiners",
 	"diacritical marks",
+	"digraphs",
 	"equative adjectives",
 	"fu'ivla",
@@ Line 45: / Line 102: @@
 	"Han characters",
 	"Han tu",
+	"hanja",
 	"hanzi",
-	"hanja",
 	"ideophones",
 	"idioms",
 	"infixes",
+	"initialisms",
+	"iteration marks",
 	"interfixes",
-	"initialisms",
 	"interjections",
+	"kana",
 	"kanji",
 	"letters",
 	"ligatures",
+	"logograms",
 	"lujvo",
+	"morae",
 	"morphemes",
 	"non-constituents",
@@ Line 69: / Line 130: @@
 	"predicatives",
 	"prefixes",
+	"prepositional phrases",
 	"prepositions",
-	"prepositional phrases",
 	"preverbs",
 	"pronominal adverbs",
 	"pronouns",
+	"proper nouns",
 	"proverbs",
-	"proper nouns",
 	"punctuation marks",
 	"relatives",
@@ Line 86: / Line 147: @@
 }
-data.nonlemmas = {
+--[==[ var:
+Recognized non-lemma forms. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the
+category 'LANG non-lemma forms' will automatically be added. If the part of speech is not a singular or plural lemma or
+non-lemma form and is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added
+to various tracking categories; see the documentation of `data.lemmas`.
+]==]
+data.nonlemmas = list_to_set{
+	"active participle forms",
 	"active participles",
 	"adjectival participles",
+    "adjective case forms",
 	"adjective forms",
 	"adjective feminine forms",
@@ Line 102: / Line 171: @@
 	"comparative adverb forms",
 	"comparative adverbs",
+	"conjunction forms",
 	"contractions",
 	"converbs",
@@ Line 108: / Line 178: @@
 	"determiner superlative forms",
 	"diminutive nouns",
+	"elative adjectives",
 	"equative adjective forms",
 	"equative adjectives",
@@ Line 116: / Line 187: @@
 	"interjection forms",
 	"jyutping",
-	"kanji readings",
 	"misspellings",
 	"negative participles",
 	"nominal participles",
 	"noun case forms",
+	"noun construct forms",
 	"noun dual forms",
 	"noun forms",
+	"noun paucal forms",
 	"noun plural forms",
 	"noun possessive forms",
@@ Line 132: / Line 204: @@
 	"passive participles",
 	"past active participles",
+	"past adverbial participles",
 	"past participles",
 	"past participle forms",
@@ Line 146: / Line 219: @@
 	"prepositional pronouns",
 	"present active participles",
+	"present adverbial participles",
 	"present participles",
 	"present passive participles",
+	"preverb forms",
 	"pronoun forms",
 	"pronoun possessive forms",
@@ Line 165: / Line 240: @@
 }
--- These languages will not have "LANG multiword terms" categories added.
+--[==[ var:
-data.no_multiword_cat = {
+List of languages that will not have links to separate parts of the headword.
+]==]
+data.no_multiword_links = list_to_set{
+	"zh",
+}
+--[==[ var:
+List of languages that will not have `LANG multiword terms` categories added. There are various reasons why languages
+are in this list: (a) words are written without spaces between them; (b) syllables are written with spaces between them;
+(c) variant reconstructions are notated with a tilde surrounded by spaces; (d) the language is a sign language, where
+pagenames are multiword descriptions of the gesture(s) required to make an individual sign; (e) some other weirdnesses.
+]==]
+data.no_multiword_cat = list_to_set{
 	-------- Languages without spaces between words (sometimes spaces between phrases) --------
-	"aho", -- Ahom
 	"blt", -- Tai Dam
 	"ja", -- Japanese
@@ Line 177: / Line 263: @@
 	"my", -- Burmese
 	"nan", -- Min Nan (some words in Latin script; hyphens between syllables)
+	"nan-hbl", -- Hokkien (some words in Latin script; hyphens between syllables)
 	"nod", -- Northern Thai
 	"ojp", -- Old Japanese
+	"shn", -- Shan
+	"sou", -- Southern Thai
 	"tdd", -- Tai Nüa
 	"th", -- Thai
 	"tts", -- Isan
 	"twh", -- Tai Dón
-	"shn", -- Shan
+	"txg", -- Tangut
-	"sou", -- Southern Thai
 	"zh", -- Chinese (all varieties with Chinese characters)
+	"zkt", -- Khitan
 	-------- Languages with spaces between syllables --------
@@ Line 192: / Line 281: @@
 	"atb", -- Zaiwa
 	"byk", -- Biao
+	"cdy", -- Chadong
 	--"duu", -- Drung; not sure
 	--"hmx-pro", -- Proto-Hmong-Mien
@@ Line 200: / Line 290: @@
 	"mtq", -- Muong
 	--"mww", -- White Hmong; not sure
+	"onb", -- Lingao
 	--"sit-gkh", -- Gokhy; not sure
 	--"swi", -- Sui; not sure
 	"tbq-lol-pro", -- Proto-Loloish
 	"tdh", -- Thulung
+	"ukk", -- Muak Sa-aak
 	"vi", -- Vietnamese
 	"yig", -- Wusa Nasu
@@ Line 211: / Line 303: @@
 	"mkh-ban-pro", -- Proto-Bahnaric
 	"sit-pro", -- Proto-Sino-Tibetan; listed above
 	-------- Other weirdnesses --------
 	"mul", -- Translingual; gestures, Morse code, etc.
@@ Line 361: / Line 453: @@
 }
--- In these languages, the hyphen is not considered a word separator for the "multiword terms" category.
+--[==[ var:
-data.hyphen_not_multiword_sep = {
+List of languages where a hyphen is not considered a word separator for the `LANG multiword terms` category. There are
+numerous reasons why languages are in this list; the reason for inclusion should be noted next to each language.
+]==]
+data.hyphen_not_multiword_sep = list_to_set{
 	"akk", -- Akkadian; hyphens between syllables
+	"akl", -- Aklanon; hyphens for mid-word glottal stops
+	"ber-pro", -- Proto-Berber; morphemes separated by hyphens
+	"ceb", -- Cebuano; hyphens for mid-word glottal stops
+	"cnk", -- Khumi Chin; hyphens used in single words
 	"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
-	"de", -- too many false positives
+	"de", -- German; too many false positives
 	"esx-esk-pro", -- hyphen used to separate morphemes
 	"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
+	"gd", -- Scottish Gaelic; too many false positives like [[a-chianaibh]], [[a-nìos]], [[an-dè]] and other adverbs in a- and an-
+	"hil", -- Hiligaynon; hyphens for mid-word glottal stops
+	"hnn", -- Hanunoo; too many false positives
+	"ilo", -- Ilocano; hyphens for mid-word glottal stops
+	"kne", -- Kankanaey; hyphens for mid-word glottal stops
 	"lcp", -- Western Lawa; dash as syllable joiner
 	"lwl", -- Eastern Lawa; dash as syllable joiner
+	"mfa", -- Pattani Malay in Thai script; dash as syllable joiner
 	"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
+	"msb", -- Masbatenyo; too many false positives
+	"tl", -- Tagalog; too many false positives
+	"war", -- Waray-Waray; too many false positives
+	"yo", -- Yoruba; hyphens used to show lengthened nasal vowels
 }
--- These languages will not have "LANG masculine nouns" and similar categories added.
+--[==[ var:
-data.no_gender_cat = {
+List of languages that will not have `LANG masculine nouns` and similar categories added. Generally, these languages are
+lacking gender but use the gender field for other purposes. (This is a massive hack and should be changed.)
+]==]
+data.no_gender_cat = list_to_set{
 	-- Languages without gender but which use the gender field for other purposes
 	"ja",
@@ Line 380: / Line 492: @@
 }
-data.notranslit = {
+--[==[ var:
+List of languages where [[Module:headword]] should not attempt to generate a transliteration even if the term is written
+in a non-Latin script. FIXME: Notate reasons why each language is in this list.
+]==]
+data.notranslit = list_to_set{
 	"ams",
 	"az",
 	"bbc",
 	"bug",
+	"cdo",
 	"cia",
 	"cjm",
+	"cjy",
 	"cmn",
+	"cnp",
+	"cpi",
+	"cpx",
+	"csp",
+	"czh",
+	"czo",
+	"gan",
 	"hak",
+	"hnm",
+	"hsn",
 	"ja",
 	"kzg",
 	"lad",
+	"ltc",
+	"luh",
 	"lzh",
+	"mnp",
 	"ms",
 	"mul",
 	"mvi",
 	"nan",
+	"nan-dat",
+	"nan-hbl",
+	"nan-hlh",
+	"nan-lnx",
+	"nan-tws",
+	"nan-zhe",
+	"nan-zsh",
+	"och",
 	"oj",
 	"okn",
-	"pi",
-	"ro",
 	"ryn",
 	"rys",
 	"ryu",
 	"sh",
+	"sjc",
 	"tgt",
 	"th",
 	"tkn",
 	"tly",
+	"txg",
 	"und",
 	"vi",
+	"wuu",
 	"xug",
-	"yue",
 	"yoi",
 	"yox",
+	"yue",
 	"za",
 	"zh",
+	"zhx-sic",
+	"zhx-tai",
+}
+--[==[ var:
+List of languages that will default to `sccat` being true, i.e. categories like `LANG POS in SCRIPT script` will
+automatically be generated. This can be overridden using {{para|sccat|0}} in {{tl|head}} or setting `sccat` to
+`false` in Lua.
+]==]
+data.default_sccat = list_to_set{
+	"inc-apa",
+	"inc-ash",
+	"kfr",
+	"ks",
+	"mr",
+	"mwr",
+	"inc-oaw",
+	"inc-ohi",
+	"omr",
+	"inc-opa",
+	"phr",
+	"pi",
+	"pra",
+	"sa",
+	"skr",
+	"sd",
 }
--- Script codes for which a script-tagged display title will be added.
+--[==[ var:
-data.toBeTagged = {
+List of script codes for which a script-tagged display title will be added.
+]==]
+data.toBeTagged = list_to_set{
 	"Ahom",
 	"Arab",
+		"fa-Arab",
+		"glk-Arab",
+		"kk-Arab",
+		"ks-Arab",
+		"ku-Arab",
+		"mzn-Arab",
+		"ms-Arab",
+		"ota-Arab",
+		"pa-Arab",
+		"ps-Arab",
+		"sd-Arab",
+		"tt-Arab",
+		"ug-Arab",
+		"ur-Arab",
+	"Armi",
+	"Armn",
 	"Avst",
 	"Bali",
+	"Bamu",
+	"Batk",
+	"Beng",
+		"as-Beng",
+	"Bopo",
+	"Brah",
+	"Brai",
+	"Bugi",
+	"Buhd",
+	"Cakm",
+	"Cans",
+	"Cari",
 	"Cham",
+	"Cher",
 	"Copt",
-	"Kali",
+	"Cprt",
+	"Cyrl",
+	"Cyrs",
+	"Deva",
+	"Dsrt",
+	"Egyd",
+	"Egyp",
+	"Ethi",
+	"Geok",
+	"Geor",
+	"Glag",
+	"Goth",
+	"Grek",
+		"Polyt",
+		"polytonic",
+	"Gujr",
+	"Guru",
+	"Hang",
 	"Hani",
+	"Hano",
 	"Hebr",
+	"Hira",
+	"Hluw",
+	"Ital",
+	"Java",
+	"Kali",
+	"Kana",
+	"Khar",
+	"Khmr",
+	"Knda",
+	"Kthi",
 	"Lana",
+	"Laoo",
+	"Latn",
+		"Latf",
+		"Latg",
+		"Latnx",
+		"Latinx",
+		"pjt-Latn",
+	"Lepc",
+	"Limb",
 	"Linb",
+	"Lisu",
+	"Lyci",
+	"Lydi",
 	"Mand",
+	"Mani",
+	"Marc",
+	"Merc",
+	"Mero",
+	"Mlym",
 	"Mong",
-	"polytonic",
+		"mnc-Mong",
+		"sjo-Mong",
+		"xwo-Mong",
+	"Mtei",
+	"Mymr",
+	"Narb",
+	"Nkoo",
+	"Nshu",
+	"Ogam",
+	"Olck",
+	"Orkh",
+	"Orya",
+	"Osma",
+	"Ougr",
+	"Palm",
+	"Phag",
+	"Phli",
+	"Phlv",
+	"Phnx",
+	"Plrd",
+	"Prti",
 	"Rjng",
+	"Runr",
 	"Samr",
+	"Sarb",
+	"Saur",
+	"Sgnw",
+	"Shaw",
+	"Shrd",
+	"Sinh",
+	"Sora",
 	"Sund",
 	"Sylo",
+	"Syrc",
+	"Tagb",
+	"Tale",
+	"Talu",
+	"Taml",
 	"Tang",
 	"Tavt",
+	"Telu",
+	"Tfng",
+	"Tglg",
+	"Thaa",
+	"Thai",
+	"Tibt",
+	"Ugar",
+	"Vaii",
+	"Xpeo",
 	"Xsux",
+	"Yiii",
+	"Zmth",
+	"Zsym",
+	"Ipach",
+	"Music",
+	"Rumin",
+}
+--[==[ var:
+Parts of speech which will not be categorised in categories like `English terms spelled with É` if the term is the
+character in question (e.g. the letter entry for English [[é]]). This contrasts with entries like the French adjective
+[[m̂]], which is a one-letter word spelled with the letter.
+]==]
+data.pos_not_spelled_with_self = list_to_set{
+	"diacritical marks",
+	"Han characters",
+	"Han tu",
+	"hanja",
+	"hanzi",
+	"iteration marks",
+	"kana",
+	"kanji",
+	"letters",
+	"ligatures",
+	"logograms",
+	"morae",
+	"numeral symbols",
+	"numerals",
+	"punctuation marks",
+	"syllables",
+	"symbols",
 }
-for key, list in pairs(data) do
+------ 2. Lists not converted into sets. ------
-	data[key] = require("Module:utils").list_to_set(list)
-end
+--[==[ var:
+Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not
+pluralized) form. It is singular so the same table can be used in [[Module:form of]] for the {{para|p}}/{{para|POS}}
+param and [[Module:links]] for the pos= param. Note that any part of speech, abbreviated or not, can be suffixed with
+`f` to generate the corresponding non-lemma form part of speech, such as `adjf`, `af` or `adjectivef` for
+`adjective form`, and `nounf` or `nf` for `noun form`. This expansion happens even when it does not make sense for the
+given part of speech (e.g. `pclf` expands to `particle form` and `symf` expands to `symbol form`), and currently also,
+at least in [[Module:headword]] (but not [[Module:links]]), even if the part before the `f` is not a recognized part of
+speech or abbreviation (hence `nerf` expands to `ner form`).
+]==]
+data.pos_aliases = {
+	a = "adjective",
+	adj = "adjective",
+	adv = "adverb",
+	art = "article",
+	aug = "augmentative",
+	cls = "classifier",
+	compadj = "comparative adjective",
+	compadv = "comparative adverb",
+	compdet = "comparative determiner",
+	comppron = "comparative pronoun",
+	conj = "conjunction",
+	contr = "contraction",
+	conv = "converb",
+	det = "determiner",
+	dim = "diminutive",
+	int = "interjection",
+	interj = "interjection",
+	intj = "interjection",
+	n = "noun",
+	-- the next two support Algonquian languages; see also vii/vai/vti/vta below
+	na = "animate noun",
+	ni = "inanimate noun",
+	num = "numeral",
+	pastpart = "past participle",
+	part = "participle",
+	pcl = "particle",
+	phr = "phrase",
+	pn = "proper noun",
+	postp = "postposition",
+	pref = "prefix",
+	prep = "preposition",
+	prepphr = "prepositional phrase",
+	prespart = "present participle",
+	pron = "pronoun",
+	prop = "proper noun",
+	proper = "proper noun",
+	propn = "proper noun",
+	rom = "romanization",
+	roman = "romanization",
+	romanisation = "romanization",
+	romanisations = "romanization",
+	suf = "suffix",
+	supadj = "superlative adjective",
+	supadv = "superlative adverb",
+	supdet = "superlative determiner",
+	suppron = "superlative pronoun",
+	sym = "symbol",
+	v = "verb",
+	vb = "verb",
+	vi = "intransitive verb",
+	vm = "modal verb",
+	vt = "transitive verb",
+	-- the next four support Algonquian languages
+	vii = "inanimate intransitive verb",
+	vai = "animate intransitive verb",
+	vti = "transitive inanimate verb",
+	vta = "transitive animate verb",
+}
--- Parts of speech for which categories like "German masculine nouns" or "Russian imperfective verbs"
+--[==[ var:
--- will be generated if the headword is of the appropriate gender/number. We put this at the bottom
+Map of parts of speech for which categories like `German masculine nouns` or `Russian imperfective verbs` will be
--- because it's a map, not a list.
+generated if the headword is of the appropriate gender/number. The map is used to canonicalize parts of speech for
+categorization purposes; specifically, proper nouns are categorized like nouns.
+]==]
 data.pos_for_gender_number_cat = {
 	["nouns"] = "nouns",
 	["proper nouns"] = "nouns",
+	["suffixes"] = "suffixes",
 	-- We include verbs because impf and pf are valid "genders".
 	["verbs"] = "verbs",
 }
+--[==[ var:
+Lower limit for a "long" word in a particular language. Used to categorize terms into e.g.
+[[:Category:Long English words]] automatically. Languages with no mapping here do not get categorized.
+]==]
+data.long_word_thresholds = {
+    ["af"] = 20,
+    ["bg"] = 20,
+    ["cy"] = 25,
+    ["de"] = 20,
+    ["en"] = 25,
+    ["es"] = 20,
+    ["fr"] = 20,
+    ["ka"] = 20,
+    ["sv"] = 20,
+    ["tl"] = 25,
+}
+------ 3. Page-wide processing (so that it only needs to be done once per page). ------
+data.page = require(headword_page_module).process_page()
+-- Set some page properties directly on `data` for ease of use.
+data.pagename = data.page.pagename
+data.encoded_pagename = data.page.encoded_pagename
 return data

Retrieved from "https://linguifex.com/wiki/Module:headword/data"

Languages

This page is not available in other languages.

Linguifex

Privacy policy
About Linguifex
Disclaimers
Desktop