Module:headword/data: Difference between revisions
No edit summary |
No edit summary |
||
| (4 intermediate revisions by the same user not shown) | |||
| Line 1: | Line 1: | ||
local headword_page_module = "Module:headword/page" | |||
local list_to_set = require("Module:table").listToSet | |||
local data = {} | local data = {} | ||
data.invariable = { | ------ 1. Lists which are converted into sets. ------ | ||
--[==[ var: | |||
Large pages where we disable label tracking, red link checking and similar. | |||
]==] | |||
data.large_pages = list_to_set { | |||
-- pages that consistently hit timeouts | |||
"a", | |||
-- pages that sometimes hit timeouts | |||
"A", | |||
"baba", | |||
"de", | |||
"e", | |||
"i", | |||
"lima", | |||
"o", | |||
"u", | |||
"и", | |||
"山", | |||
"子", | |||
"月", | |||
"一", | |||
"人", | |||
} | |||
--[==[ var: | |||
Map from singular to plural, and from plural to itself, for recognized parts of speech with irregular plurals. Most of | |||
these are invariable plurals, e.g. `kanji` is its own plural; but we also have `mora` plural `morae`. | |||
]==] | |||
data.irregular_plurals = list_to_set({ | |||
"cmavo", | "cmavo", | ||
"cmene", | "cmene", | ||
| Line 7: | Line 40: | ||
"gismu", | "gismu", | ||
"Han tu", | "Han tu", | ||
"hanja", | |||
"hanzi", | "hanzi", | ||
"jyutping", | "jyutping", | ||
"kana", | |||
"kanji", | "kanji", | ||
"lujvo", | "lujvo", | ||
| Line 15: | Line 49: | ||
"pinyin", | "pinyin", | ||
"rafsi", | "rafsi", | ||
" | }, function(_, item) | ||
} | return item | ||
end) | |||
local irregular_plurals = data.irregular_plurals | |||
-- Irregular non-zero plurals AND any regular plurals where the singular ends in "s", | |||
-- because the module assumes that inputs ending in "s" are plurals. The singular and | |||
-- plural both need to be added, as the module will generate a default plural if | |||
-- the input doesn't match a key in this table. | |||
for sg, pl in next, { | |||
mora = "morae" | |||
} do | |||
irregular_plurals[sg], irregular_plurals[pl] = pl, pl | |||
end | |||
data.lemmas = { | --[==[ var: | ||
Recognized lemmas. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the category | |||
'LANG lemmas' will automatically be added. If the part of speech is not a singular or plural lemma or non-lemma form and | |||
is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added to various tracking | |||
categories: | |||
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos]] | |||
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/LANG]] | |||
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS]] | |||
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS/LANG]] | |||
]==] | |||
data.lemmas = list_to_set{ | |||
"abbreviations", | "abbreviations", | ||
"acronyms", | "acronyms", | ||
| Line 31: | Line 88: | ||
"circumpositions", | "circumpositions", | ||
"classifiers", | "classifiers", | ||
"cmavo", | "cmavo", | ||
"cmavo clusters", | "cmavo clusters", | ||
| Line 40: | Line 96: | ||
"determiners", | "determiners", | ||
"diacritical marks", | "diacritical marks", | ||
"digraphs", | |||
"equative adjectives", | "equative adjectives", | ||
"fu'ivla", | "fu'ivla", | ||
| Line 45: | Line 102: | ||
"Han characters", | "Han characters", | ||
"Han tu", | "Han tu", | ||
"hanja", | |||
"hanzi", | "hanzi", | ||
"ideophones", | "ideophones", | ||
"idioms", | "idioms", | ||
"infixes", | "infixes", | ||
"initialisms", | |||
"iteration marks", | |||
"interfixes", | "interfixes", | ||
"interjections", | "interjections", | ||
"kana", | |||
"kanji", | "kanji", | ||
"letters", | "letters", | ||
"ligatures", | "ligatures", | ||
"logograms", | |||
"lujvo", | "lujvo", | ||
"morae", | |||
"morphemes", | "morphemes", | ||
"non-constituents", | "non-constituents", | ||
| Line 69: | Line 130: | ||
"predicatives", | "predicatives", | ||
"prefixes", | "prefixes", | ||
"prepositional phrases", | |||
"prepositions", | "prepositions", | ||
"preverbs", | "preverbs", | ||
"pronominal adverbs", | "pronominal adverbs", | ||
"pronouns", | "pronouns", | ||
"proper nouns", | |||
"proverbs", | "proverbs", | ||
"punctuation marks", | "punctuation marks", | ||
"relatives", | "relatives", | ||
| Line 86: | Line 147: | ||
} | } | ||
data.nonlemmas = { | --[==[ var: | ||
Recognized non-lemma forms. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the | |||
category 'LANG non-lemma forms' will automatically be added. If the part of speech is not a singular or plural lemma or | |||
non-lemma form and is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added | |||
to various tracking categories; see the documentation of `data.lemmas`. | |||
]==] | |||
data.nonlemmas = list_to_set{ | |||
"active participle forms", | |||
"active participles", | "active participles", | ||
"adjectival participles", | "adjectival participles", | ||
"adjective case forms", | |||
"adjective forms", | "adjective forms", | ||
"adjective feminine forms", | "adjective feminine forms", | ||
| Line 102: | Line 171: | ||
"comparative adverb forms", | "comparative adverb forms", | ||
"comparative adverbs", | "comparative adverbs", | ||
"conjunction forms", | |||
"contractions", | "contractions", | ||
"converbs", | "converbs", | ||
| Line 108: | Line 178: | ||
"determiner superlative forms", | "determiner superlative forms", | ||
"diminutive nouns", | "diminutive nouns", | ||
"elative adjectives", | |||
"equative adjective forms", | "equative adjective forms", | ||
"equative adjectives", | "equative adjectives", | ||
| Line 116: | Line 187: | ||
"interjection forms", | "interjection forms", | ||
"jyutping", | "jyutping", | ||
"misspellings", | "misspellings", | ||
"negative participles", | "negative participles", | ||
"nominal participles", | "nominal participles", | ||
"noun case forms", | "noun case forms", | ||
"noun construct forms", | |||
"noun dual forms", | "noun dual forms", | ||
"noun forms", | "noun forms", | ||
"noun paucal forms", | |||
"noun plural forms", | "noun plural forms", | ||
"noun possessive forms", | "noun possessive forms", | ||
| Line 132: | Line 204: | ||
"passive participles", | "passive participles", | ||
"past active participles", | "past active participles", | ||
"past adverbial participles", | |||
"past participles", | "past participles", | ||
"past participle forms", | "past participle forms", | ||
| Line 146: | Line 219: | ||
"prepositional pronouns", | "prepositional pronouns", | ||
"present active participles", | "present active participles", | ||
"present adverbial participles", | |||
"present participles", | "present participles", | ||
"present passive participles", | "present passive participles", | ||
"preverb forms", | |||
"pronoun forms", | "pronoun forms", | ||
"pronoun possessive forms", | "pronoun possessive forms", | ||
| Line 165: | Line 240: | ||
} | } | ||
-- | --[==[ var: | ||
data.no_multiword_cat = { | List of languages that will not have links to separate parts of the headword. | ||
]==] | |||
data.no_multiword_links = list_to_set{ | |||
"zh", | |||
} | |||
--[==[ var: | |||
List of languages that will not have `LANG multiword terms` categories added. There are various reasons why languages | |||
are in this list: (a) words are written without spaces between them; (b) syllables are written with spaces between them; | |||
(c) variant reconstructions are notated with a tilde surrounded by spaces; (d) the language is a sign language, where | |||
pagenames are multiword descriptions of the gesture(s) required to make an individual sign; (e) some other weirdnesses. | |||
]==] | |||
data.no_multiword_cat = list_to_set{ | |||
-------- Languages without spaces between words (sometimes spaces between phrases) -------- | -------- Languages without spaces between words (sometimes spaces between phrases) -------- | ||
"blt", -- Tai Dam | "blt", -- Tai Dam | ||
"ja", -- Japanese | "ja", -- Japanese | ||
| Line 177: | Line 263: | ||
"my", -- Burmese | "my", -- Burmese | ||
"nan", -- Min Nan (some words in Latin script; hyphens between syllables) | "nan", -- Min Nan (some words in Latin script; hyphens between syllables) | ||
"nan-hbl", -- Hokkien (some words in Latin script; hyphens between syllables) | |||
"nod", -- Northern Thai | "nod", -- Northern Thai | ||
"ojp", -- Old Japanese | "ojp", -- Old Japanese | ||
"shn", -- Shan | |||
"sou", -- Southern Thai | |||
"tdd", -- Tai Nüa | "tdd", -- Tai Nüa | ||
"th", -- Thai | "th", -- Thai | ||
"tts", -- Isan | "tts", -- Isan | ||
"twh", -- Tai Dón | "twh", -- Tai Dón | ||
" | "txg", -- Tangut | ||
"zh", -- Chinese (all varieties with Chinese characters) | "zh", -- Chinese (all varieties with Chinese characters) | ||
"zkt", -- Khitan | |||
-------- Languages with spaces between syllables -------- | -------- Languages with spaces between syllables -------- | ||
| Line 192: | Line 281: | ||
"atb", -- Zaiwa | "atb", -- Zaiwa | ||
"byk", -- Biao | "byk", -- Biao | ||
"cdy", -- Chadong | |||
--"duu", -- Drung; not sure | --"duu", -- Drung; not sure | ||
--"hmx-pro", -- Proto-Hmong-Mien | --"hmx-pro", -- Proto-Hmong-Mien | ||
| Line 200: | Line 290: | ||
"mtq", -- Muong | "mtq", -- Muong | ||
--"mww", -- White Hmong; not sure | --"mww", -- White Hmong; not sure | ||
"onb", -- Lingao | |||
--"sit-gkh", -- Gokhy; not sure | --"sit-gkh", -- Gokhy; not sure | ||
--"swi", -- Sui; not sure | --"swi", -- Sui; not sure | ||
"tbq-lol-pro", -- Proto-Loloish | "tbq-lol-pro", -- Proto-Loloish | ||
"tdh", -- Thulung | "tdh", -- Thulung | ||
"ukk", -- Muak Sa-aak | |||
"vi", -- Vietnamese | "vi", -- Vietnamese | ||
"yig", -- Wusa Nasu | "yig", -- Wusa Nasu | ||
| Line 211: | Line 303: | ||
"mkh-ban-pro", -- Proto-Bahnaric | "mkh-ban-pro", -- Proto-Bahnaric | ||
"sit-pro", -- Proto-Sino-Tibetan; listed above | "sit-pro", -- Proto-Sino-Tibetan; listed above | ||
-------- Other weirdnesses -------- | -------- Other weirdnesses -------- | ||
"mul", -- Translingual; gestures, Morse code, etc. | "mul", -- Translingual; gestures, Morse code, etc. | ||
| Line 361: | Line 453: | ||
} | } | ||
-- | --[==[ var: | ||
data.hyphen_not_multiword_sep = { | List of languages where a hyphen is not considered a word separator for the `LANG multiword terms` category. There are | ||
numerous reasons why languages are in this list; the reason for inclusion should be listed next to each language. | |||
]==] | |||
data.hyphen_not_multiword_sep = list_to_set{ | |||
"akk", -- Akkadian; hyphens between syllables | "akk", -- Akkadian; hyphens between syllables | ||
"akl", -- Aklanon; hyphens for mid-word glottal stops | |||
"ber-pro", -- Proto-Berber; morphemes separated by hyphens | |||
"ceb", -- Cebuano; hyphens for mid-word glottal stops | |||
"cnk", -- Khumi Chin; hyphens used in single words | |||
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables | "cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables | ||
"de", -- too many false positives | "de", -- German; too many false positives | ||
"esx-esk-pro", -- hyphen used to separate morphemes | "esx-esk-pro", -- hyphen used to separate morphemes | ||
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively | "fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively | ||
"gd", -- Scottish Gaelic; too many false positives like [[a-chianaibh]], [[a-nìos]], [[an-dè]] and other adverbs in a- and an- | |||
"hil", -- Hiligaynon; hyphens for mid-word glottal stops | |||
"hnn", -- Hanunoo; too many false positives | |||
"ilo", -- Ilocano; hyphens for mid-word glottal stops | |||
"kne", -- Kankanaey; hyphens for mid-word glottal stops | |||
"lcp", -- Western Lawa; dash as syllable joiner | "lcp", -- Western Lawa; dash as syllable joiner | ||
"lwl", -- Eastern Lawa; dash as syllable joiner | "lwl", -- Eastern Lawa; dash as syllable joiner | ||
"mfa", -- Pattani Malay in Thai script; dash as syllable joiner | |||
"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens | "mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens | ||
"msb", -- Masbatenyo; too many false positives | |||
"tl", -- Tagalog; too many false positives | |||
"war", -- Waray-Waray; too many false positives | |||
"yo", -- Yoruba; hyphens used to show lengthened nasal vowels | |||
} | } | ||
-- | --[==[ var: | ||
data.no_gender_cat = { | List of languages that will not have `LANG masculine nouns` and similar categories added. Generally, these languages are | ||
lacking gender but use the gender field for other purposes. (This is a massive hack and should be changed.) | |||
]==] | |||
data.no_gender_cat = list_to_set{ | |||
-- Languages without gender but which use the gender field for other purposes | -- Languages without gender but which use the gender field for other purposes | ||
"ja", | "ja", | ||
| Line 380: | Line 492: | ||
} | } | ||
data.notranslit = { | --[==[ var: | ||
List of languages where [[Module:headword]] should not attempt to generate a transliteration even if the term is written | |||
in a non-Latin script. FIXME: Notate reasons why each language is in this list. | |||
]==] | |||
data.notranslit = list_to_set{ | |||
"ams", | "ams", | ||
"az", | "az", | ||
"bbc", | "bbc", | ||
"bug", | "bug", | ||
"cdo", | |||
"cia", | "cia", | ||
"cjm", | "cjm", | ||
"cjy", | |||
"cmn", | "cmn", | ||
"cnp", | |||
"cpi", | |||
"cpx", | |||
"csp", | |||
"czh", | |||
"czo", | |||
"gan", | |||
"hak", | "hak", | ||
"hnm", | |||
"hsn", | |||
"ja", | "ja", | ||
"kzg", | "kzg", | ||
"lad", | "lad", | ||
"ltc", | |||
"luh", | |||
"lzh", | "lzh", | ||
"mnp", | |||
"ms", | "ms", | ||
"mul", | "mul", | ||
"mvi", | "mvi", | ||
"nan", | "nan", | ||
"nan-dat", | |||
"nan-hbl", | |||
"nan-hlh", | |||
"nan-lnx", | |||
"nan-tws", | |||
"nan-zhe", | |||
"nan-zsh", | |||
"och", | |||
"oj", | "oj", | ||
"okn", | "okn", | ||
"ryn", | "ryn", | ||
"rys", | "rys", | ||
"ryu", | "ryu", | ||
"sh", | "sh", | ||
"sjc", | |||
"tgt", | "tgt", | ||
"th", | "th", | ||
"tkn", | "tkn", | ||
"tly", | "tly", | ||
"txg", | |||
"und", | "und", | ||
"vi", | "vi", | ||
"wuu", | |||
"xug", | "xug", | ||
"yoi", | "yoi", | ||
"yox", | "yox", | ||
"yue", | |||
"za", | "za", | ||
"zh", | "zh", | ||
"zhx-sic", | |||
"zhx-tai", | |||
} | |||
--[==[ var: | |||
List of languages that will default to `sccat` being true, i.e. categories like `LANG POS in SCRIPT script` will | |||
automatically be generated. This can be overridden using {{para|sccat|0}} in {{tl|head}} or setting `sccat` to | |||
`false` in Lua. | |||
]==] | |||
data.default_sccat = list_to_set{ | |||
"inc-apa", | |||
"inc-ash", | |||
"kfr", | |||
"ks", | |||
"mr", | |||
"mwr", | |||
"inc-oaw", | |||
"inc-ohi", | |||
"omr", | |||
"inc-opa", | |||
"phr", | |||
"pi", | |||
"pra", | |||
"sa", | |||
"skr", | |||
"sd", | |||
} | } | ||
-- | --[==[ var: | ||
data.toBeTagged = { | List of script codes for which a script-tagged display title will be added. | ||
]==] | |||
data.toBeTagged = list_to_set{ | |||
"Ahom", | "Ahom", | ||
"Arab", | "Arab", | ||
"fa-Arab", | |||
"glk-Arab", | |||
"kk-Arab", | |||
"ks-Arab", | |||
"ku-Arab", | |||
"mzn-Arab", | |||
"ms-Arab", | |||
"ota-Arab", | |||
"pa-Arab", | |||
"ps-Arab", | |||
"sd-Arab", | |||
"tt-Arab", | |||
"ug-Arab", | |||
"ur-Arab", | |||
"Armi", | |||
"Armn", | |||
"Avst", | "Avst", | ||
"Bali", | "Bali", | ||
"Bamu", | |||
"Batk", | |||
"Beng", | |||
"as-Beng", | |||
"Bopo", | |||
"Brah", | |||
"Brai", | |||
"Bugi", | |||
"Buhd", | |||
"Cakm", | |||
"Cans", | |||
"Cari", | |||
"Cham", | "Cham", | ||
"Cher", | |||
"Copt", | "Copt", | ||
" | "Cprt", | ||
"Cyrl", | |||
"Cyrs", | |||
"Deva", | |||
"Dsrt", | |||
"Egyd", | |||
"Egyp", | |||
"Ethi", | |||
"Geok", | |||
"Geor", | |||
"Glag", | |||
"Goth", | |||
"Grek", | |||
"Polyt", | |||
"polytonic", | |||
"Gujr", | |||
"Guru", | |||
"Hang", | |||
"Hani", | "Hani", | ||
"Hano", | |||
"Hebr", | "Hebr", | ||
"Hira", | |||
"Hluw", | |||
"Ital", | |||
"Java", | |||
"Kali", | |||
"Kana", | |||
"Khar", | |||
"Khmr", | |||
"Knda", | |||
"Kthi", | |||
"Lana", | "Lana", | ||
"Laoo", | |||
"Latn", | |||
"Latf", | |||
"Latg", | |||
"Latnx", | |||
"Latinx", | |||
"pjt-Latn", | |||
"Lepc", | |||
"Limb", | |||
"Linb", | "Linb", | ||
"Lisu", | |||
"Lyci", | |||
"Lydi", | |||
"Mand", | "Mand", | ||
"Mani", | |||
"Marc", | |||
"Merc", | |||
"Mero", | |||
"Mlym", | |||
"Mong", | "Mong", | ||
" | "mnc-Mong", | ||
"sjo-Mong", | |||
"xwo-Mong", | |||
"Mtei", | |||
"Mymr", | |||
"Narb", | |||
"Nkoo", | |||
"Nshu", | |||
"Ogam", | |||
"Olck", | |||
"Orkh", | |||
"Orya", | |||
"Osma", | |||
"Ougr", | |||
"Palm", | |||
"Phag", | |||
"Phli", | |||
"Phlv", | |||
"Phnx", | |||
"Plrd", | |||
"Prti", | |||
"Rjng", | "Rjng", | ||
"Runr", | |||
"Samr", | "Samr", | ||
"Sarb", | |||
"Saur", | |||
"Sgnw", | |||
"Shaw", | |||
"Shrd", | |||
"Sinh", | |||
"Sora", | |||
"Sund", | "Sund", | ||
"Sylo", | "Sylo", | ||
"Syrc", | |||
"Tagb", | |||
"Tale", | |||
"Talu", | |||
"Taml", | |||
"Tang", | "Tang", | ||
"Tavt", | "Tavt", | ||
"Telu", | |||
"Tfng", | |||
"Tglg", | |||
"Thaa", | |||
"Thai", | |||
"Tibt", | |||
"Ugar", | |||
"Vaii", | |||
"Xpeo", | |||
"Xsux", | "Xsux", | ||
"Yiii", | |||
"Zmth", | |||
"Zsym", | |||
"Ipach", | |||
"Music", | |||
"Rumin", | |||
} | |||
--[==[ var: | |||
Parts of speech which will not be categorised in categories like `English terms spelled with É` if the term is the | |||
character in question (e.g. the letter entry for English [[é]]). This contrasts with entries like the French adjective | |||
[[m̂]], which is a one-letter word spelled with the letter. | |||
]==] | |||
data.pos_not_spelled_with_self = list_to_set{ | |||
"diacritical marks", | |||
"Han characters", | |||
"Han tu", | |||
"hanja", | |||
"hanzi", | |||
"iteration marks", | |||
"kana", | |||
"kanji", | |||
"letters", | |||
"ligatures", | |||
"logograms", | |||
"morae", | |||
"numeral symbols", | |||
"numerals", | |||
"punctuation marks", | |||
"syllables", | |||
"symbols", | |||
} | } | ||
for | ------ 2. Lists not converted into sets. ------ | ||
--[==[ var: | |||
Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not | |||
pluralized) form. It is singular so the same table can be used in [[Module:form of]] for the {{para|p}}/{{para|POS}} | |||
param and [[Module:links]] for the pos= param. Note that any part of speech, abbreviated or not, can be suffixed with | |||
`f` to generate the corresponding non-lemma form part of speech, such as `adjf`, `af` or `adjectivef` for | |||
`adjective form`, and `nounf` or `nf` for `noun form`. This expansion happens even when it does not make sense for the | |||
given part of speech (e.g. `pclf` expands to `particle form` and `symf` expands to `symbol form`), and currently also, | |||
at least in [[Module:headword]] (but not [[Module:links]]), even if the part before the `f` is not a recognized part of | |||
speech or abbreviation (hence `nerf` expands to `ner form`). | |||
]==] | |||
data.pos_aliases = { | |||
a = "adjective", | |||
adj = "adjective", | |||
adv = "adverb", | |||
art = "article", | |||
aug = "augmentative", | |||
cls = "classifier", | |||
compadj = "comparative adjective", | |||
compadv = "comparative adverb", | |||
compdet = "comparative determiner", | |||
comppron = "comparative pronoun", | |||
conj = "conjunction", | |||
contr = "contraction", | |||
conv = "converb", | |||
det = "determiner", | |||
dim = "diminutive", | |||
int = "interjection", | |||
interj = "interjection", | |||
intj = "interjection", | |||
n = "noun", | |||
-- the next two support Algonquian languages; see also vii/vai/vti/vta below | |||
na = "animate noun", | |||
ni = "inanimate noun", | |||
num = "numeral", | |||
pastpart = "past participle", | |||
part = "participle", | |||
pcl = "particle", | |||
phr = "phrase", | |||
pn = "proper noun", | |||
postp = "postposition", | |||
pref = "prefix", | |||
prep = "preposition", | |||
prepphr = "prepositional phrase", | |||
prespart = "present participle", | |||
pron = "pronoun", | |||
prop = "proper noun", | |||
proper = "proper noun", | |||
propn = "proper noun", | |||
rom = "romanization", | |||
roman = "romanization", | |||
romanisation = "romanization", | |||
romanisations = "romanization", | |||
suf = "suffix", | |||
supadj = "superlative adjective", | |||
supadv = "superlative adverb", | |||
supdet = "superlative determiner", | |||
suppron = "superlative pronoun", | |||
sym = "symbol", | |||
v = "verb", | |||
vb = "verb", | |||
vi = "intransitive verb", | |||
vm = "modal verb", | |||
vt = "transitive verb", | |||
-- the next four support Algonquian languages | |||
vii = "inanimate intransitive verb", | |||
vai = "animate intransitive verb", | |||
vti = "transitive inanimate verb", | |||
vta = "transitive animate verb", | |||
} | |||
-- | --[==[ var: | ||
Map of parts of speech for which categories like `German masculine nouns` or `Russian imperfective verbs` will be | |||
generated if the headword is of the appropriate gender/number. The map is used to canonicalize parts of speech for | |||
categorization purposes; specifically, proper nouns are categorized like nouns. | |||
]==] | |||
data.pos_for_gender_number_cat = { | data.pos_for_gender_number_cat = { | ||
["nouns"] = "nouns", | ["nouns"] = "nouns", | ||
["proper nouns"] = "nouns", | ["proper nouns"] = "nouns", | ||
["suffixes"] = "suffixes", | |||
-- We include verbs because impf and pf are valid "genders". | -- We include verbs because impf and pf are valid "genders". | ||
["verbs"] = "verbs", | ["verbs"] = "verbs", | ||
} | } | ||
--[==[ var: | |||
Lower limit for a "long" word in a particular language. Used to categorize terms into e.g. | |||
[[:Category:Long English words]] automatically. Languages with no mapping here do not get categorized. | |||
]==] | |||
data.long_word_thresholds = { | |||
["af"] = 20, | |||
["bg"] = 20, | |||
["cy"] = 25, | |||
["de"] = 20, | |||
["en"] = 25, | |||
["es"] = 20, | |||
["fr"] = 20, | |||
["ka"] = 20, | |||
["sv"] = 20, | |||
["tl"] = 25, | |||
} | |||
------ 3. Page-wide processing (so that it only needs to be done once per page). ------ | |||
data.page = require(headword_page_module).process_page() | |||
-- Set some page properties directly on `data` for ease of use. | |||
data.pagename = data.page.pagename | |||
data.encoded_pagename = data.page.encoded_pagename | |||
return data | return data | ||