Module:headword/data: Difference between revisions

No edit summary
m 1 revision imported
 
(3 intermediate revisions by 2 users not shown)
Line 7: Line 7:
------ 1. Lists which are converted into sets. ------
------ 1. Lists which are converted into sets. ------


-- Zero-plurals (i.e. invariable plurals).
--[==[ var:
local irregular_plurals = list_to_set({
Large pages where we disable label tracking, red link checking and similar.
]==]
-- Built by passing a list of page titles to list_to_set; the entries are grouped below by how
-- reliably the page hits timeouts.
data.large_pages = list_to_set {
-- pages that consistently hit timeouts
"a",
-- pages that sometimes hit timeouts
"A",
"baba",
"de",
"e",
"i",
"lima",
"o",
"u",
"и",
"山",
"子",
"月",
"一",
"人",
}
 
--[==[ var:
Map from singular to plural, and from plural to itself, for recognized parts of speech with irregular plurals. Most of
these are invariable plurals, e.g. `kanji` is its own plural; but we also have `mora` plural `morae`.
]==]
data.irregular_plurals = list_to_set({
"cmavo",
"cmavo",
"cmene",
"cmene",
Line 26: Line 52:
return item
return item
end)
end)
local irregular_plurals = data.irregular_plurals


-- Irregular non-zero plurals AND any regular plurals where the singular ends in "s",
-- Irregular non-zero plurals AND any regular plurals where the singular ends in "s",
Line 37: Line 65:
end
end


data.irregular_plurals = irregular_plurals
--[==[ var:
 
Recognized lemmas. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the category
'LANG lemmas' will automatically be added. If the part of speech is not a singular or plural lemma or non-lemma form and
is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added to various tracking
categories:
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos]]
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/LANG]]
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS]]
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS/LANG]]
]==]
data.lemmas = list_to_set{
data.lemmas = list_to_set{
"abbreviations",
"abbreviations",
Line 111: Line 147:
}
}


--[==[ var:
Recognized non-lemma forms. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the
category 'LANG non-lemma forms' will automatically be added. If the part of speech is not a singular or plural lemma or
non-lemma form and is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added
to various tracking categories; see the documentation of `data.lemmas`.
]==]
data.nonlemmas = list_to_set{
data.nonlemmas = list_to_set{
"active participle forms",
"active participle forms",
Line 149: Line 191:
"nominal participles",
"nominal participles",
"noun case forms",
"noun case forms",
"noun construct forms",
"noun dual forms",
"noun dual forms",
"noun forms",
"noun forms",
Line 197: Line 240:
}
}


-- These languages will not have links to separate parts of the headword.
--[==[ var:
List of languages that will not have links to separate parts of the headword.
]==]
data.no_multiword_links = list_to_set{
data.no_multiword_links = list_to_set{
"zh",
"zh",
}
}


-- These languages will not have "LANG multiword terms" categories added.
--[==[ var:
List of languages that will not have `LANG multiword terms` categories added. There are various reasons why languages
are in this list: (a) words are written without spaces between them; (b) syllables are written with spaces between them;
(c) variant reconstructions are notated with a tilde surrounded by spaces; (d) the language is a sign language, where
pagenames are multiword descriptions of the gesture(s) required to make an individual sign; (e) some other weirdnesses.
]==]
data.no_multiword_cat = list_to_set{
data.no_multiword_cat = list_to_set{
-------- Languages without spaces between words (sometimes spaces between phrases) --------
-------- Languages without spaces between words (sometimes spaces between phrases) --------
Line 403: Line 453:
}
}


-- In these languages, the hyphen is not considered a word separator for the "multiword terms" category.
--[==[ var:
List of languages where a hyphen is not considered a word separator for the `LANG multiword terms` category. There are
numerous reasons why languages are in this list; the reason for inclusion should be listed next to each language.
]==]
data.hyphen_not_multiword_sep = list_to_set{
data.hyphen_not_multiword_sep = list_to_set{
"akk", -- Akkadian; hyphens between syllables
"akk", -- Akkadian; hyphens between syllables
Line 411: Line 464:
"cnk", -- Khumi Chin; hyphens used in single words
"cnk", -- Khumi Chin; hyphens used in single words
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
"de", -- too many false positives
"de", -- German; too many false positives
"esx-esk-pro", -- hyphen used to separate morphemes
"esx-esk-pro", -- hyphen used to separate morphemes
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
"gd", -- Scottish Gaelic; too many false positives like [[a-chianaibh]], [[a-nìos]], [[an-dè]] and other adverbs in a- and an-
"hil", -- Hiligaynon; hyphens for mid-word glottal stops
"hil", -- Hiligaynon; hyphens for mid-word glottal stops
"hnn", -- Hanunoo; too many false positives
"hnn", -- Hanunoo; too many false positives
Line 428: Line 482:
}
}


-- These languages will not have "LANG masculine nouns" and similar categories added.
--[==[ var:
List of languages that will not have `LANG masculine nouns` and similar categories added. Generally, these languages are
lacking gender but use the gender field for other purposes. (This is a massive hack and should be changed.)
]==]
data.no_gender_cat = list_to_set{
data.no_gender_cat = list_to_set{
-- Languages without gender but which use the gender field for other purposes
-- Languages without gender but which use the gender field for other purposes
Line 435: Line 492:
}
}


--[==[ var:
List of languages where [[Module:headword]] should not attempt to generate a transliteration even if the term is written
in a non-Latin script. FIXME: Notate reasons why each language is in this list.
]==]
data.notranslit = list_to_set{
data.notranslit = list_to_set{
"ams",
"ams",
Line 499: Line 560:
}
}


-- Script codes for which a script-tagged display title will be added.
--[==[ var:
List of languages that will default to `sccat` being true, i.e. categories like `LANG POS in SCRIPT script` will
automatically be generated. This can be overridden using {{para|sccat|0}} in {{tl|head}} or setting `sccat` to
`false` in Lua.
]==]
-- NOTE(review): the entries are language codes; unlike data.hyphen_not_multiword_sep, they carry no
-- per-entry comment naming the language. Consider annotating each code.
data.default_sccat = list_to_set{
"inc-apa",
"inc-ash",
"kfr",
"ks",
"mr",
"mwr",
"inc-oaw",
"inc-ohi",
"omr",
"inc-opa",
"phr",
"pi",
"pra",
"sa",
"skr",
"sd",
}
 
--[==[ var:
List of script codes for which a script-tagged display title will be added.
]==]
data.toBeTagged = list_to_set{
data.toBeTagged = list_to_set{
"Ahom",
"Ahom",
Line 647: Line 734:
}
}


-- Parts of speech which will not be categorised in categories like "English terms spelled with É" if
--[==[ var:
-- the term is the character in question (e.g. the letter entry for English [[é]]). This contrasts with
Parts of speech which will not be categorised in categories like `English terms spelled with É` if the term is the
-- entries like the French adjective [[m̂]], which is a one-letter word spelled with the letter.
character in question (e.g. the letter entry for English [[é]]). This contrasts with entries like the French adjective
[[m̂]], which is a one-letter word spelled with the letter.
]==]
data.pos_not_spelled_with_self = list_to_set{
data.pos_not_spelled_with_self = list_to_set{
"diacritical marks",
"diacritical marks",
Line 672: Line 761:
------ 2. Lists not converted into sets. ------
------ 2. Lists not converted into sets. ------


-- Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not
--[==[ var:
-- pluralized) form. It is singular so that the same table can be used in [[Module:form of]] for the p=/POS= param
Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not
-- and [[Module:links]] for the pos= param.
pluralized) form. It is singular so the same table can be used in [[Module:form of]] for the {{para|p}}/{{para|POS}}
param and [[Module:links]] for the pos= param. Note that any part of speech, abbreviated or not, can be suffixed with
`f` to generate the corresponding non-lemma form part of speech, such as `adjf`, `af` or `adjectivef` for
`adjective form`, and `nounf` or `nf` for `noun form`. This expansion happens even when it does not make sense for the
given part of speech (e.g. `pclf` expands to `particle form` and `symf` expands to `symbol form`), and currently also,
at least in [[Module:headword]] (but not [[Module:links]]), even if the part before the `f` is not a recognized part of
speech or abbreviation (hence `nerf` expands to `ner form`).
]==]
data.pos_aliases = {
data.pos_aliases = {
a = "adjective",
a = "adjective",
Line 680: Line 776:
adv = "adverb",
adv = "adverb",
art = "article",
art = "article",
det = "determiner",
aug = "augmentative",
cnum = "cardinal number",
cls = "classifier",
compadj = "comparative adjective",
compadj = "comparative adjective",
compadv = "comparative adverb",
compadv = "comparative adverb",
compdet = "comparative determiner",
comppron = "comparative pronoun",
conj = "conjunction",
conj = "conjunction",
contr = "contraction",
conv = "converb",
conv = "converb",
det = "determiner",
dim = "diminutive",
int = "interjection",
int = "interjection",
interj = "interjection",
interj = "interjection",
intj = "interjection",
intj = "interjection",
n = "noun",
n = "noun",
-- the next two support Algonquian languages; see also vii/vai/vti/vta below
na = "animate noun",
ni = "inanimate noun",
num = "numeral",
num = "numeral",
pastpart = "past participle",
part = "participle",
part = "participle",
pcl = "particle",
pcl = "particle",
Line 698: Line 803:
pref = "prefix",
pref = "prefix",
prep = "preposition",
prep = "preposition",
prepphr = "prepositional phrase",
prespart = "present participle",
pron = "pronoun",
pron = "pronoun",
prop = "proper noun",
prop = "proper noun",
proper = "proper noun",
proper = "proper noun",
propn = "proper noun",
propn = "proper noun",
onum = "ordinal number",
rom = "romanization",
rom = "romanization",
roman = "romanization",
romanisation = "romanization",
romanisations = "romanization",
suf = "suffix",
suf = "suffix",
supadj = "superlative adjective",
supadj = "superlative adjective",
supadv = "superlative adverb",
supadv = "superlative adverb",
supdet = "superlative determiner",
suppron = "superlative pronoun",
sym = "symbol",
v = "verb",
v = "verb",
vb = "verb",
vb = "verb",
vi = "intransitive verb",
vi = "intransitive verb",
vm = "modal verb",
vt = "transitive verb",
vt = "transitive verb",
vti = "transitive and intransitive verb",
-- the next four support Algonquian languages
vii = "inanimate intransitive verb",
vai = "animate intransitive verb",
vti = "transitive inanimate verb",
vta = "transitive animate verb",
}
}


-- Parts of speech for which categories like "German masculine nouns" or "Russian imperfective verbs"
--[==[ var:
-- will be generated if the headword is of the appropriate gender/number.
Map of parts of speech for which categories like `German masculine nouns` or `Russian imperfective verbs` will be
generated if the headword is of the appropriate gender/number. The map is used to canonicalize parts of speech for
categorization purposes; specifically, proper nouns are categorized like nouns.
]==]
data.pos_for_gender_number_cat = {
data.pos_for_gender_number_cat = {
["nouns"] = "nouns",
["nouns"] = "nouns",
Line 722: Line 842:
-- We include verbs because impf and pf are valid "genders".
-- We include verbs because impf and pf are valid "genders".
["verbs"] = "verbs",
["verbs"] = "verbs",
}
--[==[ var:
Lower limit for a "long" word in a particular language. Used to categorize terms into e.g.
[[:Category:Long English words]] automatically. Languages with no mapping here do not get categorized.
]==]
-- Keys are language codes; values are the per-language lower limit for a "long" word.
-- NOTE(review): the limit is presumably a length in characters — confirm against the code that
-- reads this table. Language names below follow the usual meaning of each two-letter code.
data.long_word_thresholds = {
    ["af"] = 20, -- Afrikaans
    ["bg"] = 20, -- Bulgarian
    ["cy"] = 25, -- Welsh
    ["de"] = 20, -- German
    ["en"] = 25, -- English
    ["es"] = 20, -- Spanish
    ["fr"] = 20, -- French
    ["ka"] = 20, -- Georgian
    ["sv"] = 20, -- Swedish
    ["tl"] = 25, -- Tagalog
}
}


------ 3. Page-wide processing (so that it only needs to be done once per page). ------
------ 3. Page-wide processing (so that it only needs to be done once per page). ------
data.page = require(headword_page_module).process_page()
data.page = require(headword_page_module).process_page()
-- FIXME: Random references to data.pagename and data.encoded_pagename are scattered throughout the codebase.
-- Set some page properties directly on `data` for ease of use.
data.pagename = data.page.pagename
data.pagename = data.page.pagename
data.encoded_pagename = data.page.encoded_pagename
data.encoded_pagename = data.page.encoded_pagename


return data
return data