Module:headword/data: Difference between revisions

No edit summary
No edit summary
 
Line 7: Line 7:
------ 1. Lists which are converted into sets. ------
------ 1. Lists which are converted into sets. ------


-- Zero-plurals (i.e. invariable plurals).
--[==[ var:
local irregular_plurals = list_to_set({
Large pages where we disable label tracking, red link checking and similar.
]==]
-- Page titles, given as a literal list and passed to list_to_set.
-- NOTE(review): list_to_set is defined outside this view; per the section header ("Lists which are converted into
-- sets") it presumably returns a set keyed by these titles -- confirm against its definition.
data.large_pages = list_to_set {
-- pages that consistently hit timeouts
"a",
-- pages that sometimes hit timeouts
"A",
"baba",
"de",
"e",
"i",
"lima",
"o",
"u",
-- the remaining entries are non-Latin-script titles (one Cyrillic letter, then Han characters)
"и",
"山",
"子",
"月",
"一",
"人",
}
 
--[==[ var:
Map from singular to plural, and from plural to itself, for recognized parts of speech with irregular plurals. Most of
these are invariable plurals, e.g. `kanji` is its own plural; but we also have `mora` plural `morae`.
]==]
data.irregular_plurals = list_to_set({
"cmavo",
"cmavo",
"cmene",
"cmene",
Line 26: Line 52:
return item
return item
end)
end)
local irregular_plurals = data.irregular_plurals


-- Irregular non-zero plurals AND any regular plurals where the singular ends in "s",
-- Irregular non-zero plurals AND any regular plurals where the singular ends in "s",
Line 37: Line 65:
end
end


data.irregular_plurals = irregular_plurals
--[==[ var:
 
Recognized lemmas. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the category
'LANG lemmas' will automatically be added. If the part of speech is not a singular or plural lemma or non-lemma form and
is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added to various tracking
categories:
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos]]
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/LANG]]
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS]]
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS/LANG]]
]==]
data.lemmas = list_to_set{
data.lemmas = list_to_set{
"abbreviations",
"abbreviations",
Line 111: Line 147:
}
}


--[==[ var:
Recognized non-lemma forms. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the
category 'LANG non-lemma forms' will automatically be added. If the part of speech is not a singular or plural lemma or
non-lemma form and is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added
to various tracking categories; see the documentation of `data.lemmas`.
]==]
data.nonlemmas = list_to_set{
data.nonlemmas = list_to_set{
"active participle forms",
"active participle forms",
Line 198: Line 240:
}
}


-- These languages will not have links to separate parts of the headword.
--[==[ var:
List of languages that will not have links to separate parts of the headword.
]==]
data.no_multiword_links = list_to_set{
data.no_multiword_links = list_to_set{
"zh",
"zh",
}
}


-- These languages will not have "LANG multiword terms" categories added.
--[==[ var:
List of languages that will not have `LANG multiword terms` categories added. There are various reasons why languages
are in this list: (a) words are written without spaces between them; (b) syllables are written with spaces between them;
(c) variant reconstructions are notated with a tilde surrounded by spaces; (d) the language is a sign language, where
pagenames are multiword descriptions of the gesture(s) required to make an individual sign; (e) some other weirdnesses.
]==]
data.no_multiword_cat = list_to_set{
data.no_multiword_cat = list_to_set{
-------- Languages without spaces between words (sometimes spaces between phrases) --------
-------- Languages without spaces between words (sometimes spaces between phrases) --------
Line 404: Line 453:
}
}


-- In these languages, the hyphen is not considered a word separator for the "multiword terms" category.
--[==[ var:
List of languages where a hyphen is not considered a word separator for the `LANG multiword terms` category. There are
numerous reasons why languages are in this list; the reason for inclusion should be listed next to each language.
]==]
data.hyphen_not_multiword_sep = list_to_set{
data.hyphen_not_multiword_sep = list_to_set{
"akk", -- Akkadian; hyphens between syllables
"akk", -- Akkadian; hyphens between syllables
Line 412: Line 464:
"cnk", -- Khumi Chin; hyphens used in single words
"cnk", -- Khumi Chin; hyphens used in single words
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
"de", -- too many false positives
"de", -- German; too many false positives
"esx-esk-pro", -- hyphen used to separate morphemes
"esx-esk-pro", -- hyphen used to separate morphemes
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
"gd", -- Scottish Gaelic; too many false positives like [[a-chianaibh]], [[a-nìos]], [[an-dè]] and other adverbs in a- and an-
"hil", -- Hiligaynon; hyphens for mid-word glottal stops
"hil", -- Hiligaynon; hyphens for mid-word glottal stops
"hnn", -- Hanunoo; too many false positives
"hnn", -- Hanunoo; too many false positives
Line 429: Line 482:
}
}


-- These languages will not have "LANG masculine nouns" and similar categories added.
--[==[ var:
List of languages that will not have `LANG masculine nouns` and similar categories added. Generally, these languages are
lacking gender but use the gender field for other purposes. (This is a massive hack and should be changed.)
]==]
data.no_gender_cat = list_to_set{
data.no_gender_cat = list_to_set{
-- Languages without gender but which use the gender field for other purposes
-- Languages without gender but which use the gender field for other purposes
Line 436: Line 492:
}
}


--[==[ var:
List of languages where [[Module:headword]] should not attempt to generate a transliteration even if the term is written
in a non-Latin script. FIXME: Notate reasons why each language is in this list.
]==]
data.notranslit = list_to_set{
data.notranslit = list_to_set{
"ams",
"ams",
Line 500: Line 560:
}
}


-- Script codes for which a script-tagged display title will be added.
--[==[ var:
List of languages that will default to `sccat` being true, i.e. categories like `LANG POS in SCRIPT script` will
automatically be generated. This can be overridden using {{para|sccat|0}} in {{tl|head}} or setting `sccat` to
`false` in Lua.
]==]
-- Language codes, given as a literal list and passed to list_to_set (defined outside this view).
-- NOTE(review): the codes are not in alphabetical order (e.g. "inc-oaw" follows "mwr", "skr" precedes "sd");
-- presumably they are ordered by language name rather than by code -- confirm before re-sorting.
data.default_sccat = list_to_set{
"inc-apa",
"inc-ash",
"kfr",
"ks",
"mr",
"mwr",
"inc-oaw",
"inc-ohi",
"omr",
"inc-opa",
"phr",
"pi",
"pra",
"sa",
"skr",
"sd",
}
 
--[==[ var:
List of script codes for which a script-tagged display title will be added.
]==]
data.toBeTagged = list_to_set{
data.toBeTagged = list_to_set{
"Ahom",
"Ahom",
Line 648: Line 734:
}
}


-- Parts of speech which will not be categorised in categories like "English terms spelled with É" if
--[==[ var:
-- the term is the character in question (e.g. the letter entry for English [[é]]). This contrasts with
Parts of speech which will not be categorised in categories like `English terms spelled with É` if the term is the
-- entries like the French adjective [[m̂]], which is a one-letter word spelled with the letter.
character in question (e.g. the letter entry for English [[é]]). This contrasts with entries like the French adjective
[[m̂]], which is a one-letter word spelled with the letter.
]==]
data.pos_not_spelled_with_self = list_to_set{
data.pos_not_spelled_with_self = list_to_set{
"diacritical marks",
"diacritical marks",
Line 673: Line 761:
------ 2. Lists not converted into sets. ------
------ 2. Lists not converted into sets. ------


-- Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not
--[==[ var:
-- pluralized) form. It is singular so that the same table can be used in [[Module:form of]] for the p=/POS= param
Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not
-- and [[Module:links]] for the pos= param.
pluralized) form. It is singular so the same table can be used in [[Module:form of]] for the {{para|p}}/{{para|POS}}
param and [[Module:links]] for the pos= param. Note that any part of speech, abbreviated or not, can be suffixed with
`f` to generate the corresponding non-lemma form part of speech, such as `adjf`, `af` or `adjectivef` for
`adjective form`, and `nounf` or `nf` for `noun form`. This expansion happens even when it does not make sense for the
given part of speech (e.g. `pclf` expands to `particle form` and `symf` expands to `symbol form`), and currently also,
at least in [[Module:headword]] (but not [[Module:links]]), even if the part before the `f` is not a recognized part of
speech or abbreviation (hence `nerf` expands to `ner form`).
]==]
data.pos_aliases = {
data.pos_aliases = {
a = "adjective",
a = "adjective",
Line 682: Line 777:
art = "article",
art = "article",
aug = "augmentative",
aug = "augmentative",
det = "determiner",
cls = "classifier",
dim = "diminutive",
compadj = "comparative adjective",
compadj = "comparative adjective",
compadv = "comparative adverb",
compadv = "comparative adverb",
compdet = "comparative determiner",
comppron = "comparative pronoun",
conj = "conjunction",
conj = "conjunction",
contr = "contraction",
contr = "contraction",
conv = "converb",
conv = "converb",
det = "determiner",
dim = "diminutive",
int = "interjection",
int = "interjection",
interj = "interjection",
interj = "interjection",
Line 697: Line 795:
ni = "inanimate noun",
ni = "inanimate noun",
num = "numeral",
num = "numeral",
pastpart = "past participle",
part = "participle",
part = "participle",
pcl = "particle",
pcl = "particle",
Line 705: Line 804:
prep = "preposition",
prep = "preposition",
prepphr = "prepositional phrase",
prepphr = "prepositional phrase",
prespart = "present participle",
pron = "pronoun",
pron = "pronoun",
prop = "proper noun",
prop = "proper noun",
Line 710: Line 810:
propn = "proper noun",
propn = "proper noun",
rom = "romanization",
rom = "romanization",
roman = "romanization",
romanisation = "romanization",
romanisations = "romanization",
suf = "suffix",
suf = "suffix",
supadj = "superlative adjective",
supadj = "superlative adjective",
supadv = "superlative adverb",
supadv = "superlative adverb",
supdet = "superlative determiner",
suppron = "superlative pronoun",
sym = "symbol",
sym = "symbol",
v = "verb",
v = "verb",
vb = "verb",
vb = "verb",
vi = "intransitive verb",
vi = "intransitive verb",
vm = "modal verb",
vt = "transitive verb",
vt = "transitive verb",
-- the next four support Algonquian languages
-- the next four support Algonquian languages
Line 725: Line 831:
}
}


-- Parts of speech for which categories like "German masculine nouns" or "Russian imperfective verbs"
--[==[ var:
-- will be generated if the headword is of the appropriate gender/number.
Map of parts of speech for which categories like `German masculine nouns` or `Russian imperfective verbs` will be
generated if the headword is of the appropriate gender/number. The map is used to canonicalize parts of speech for
categorization purposes; specifically, proper nouns are categorized like nouns.
]==]
data.pos_for_gender_number_cat = {
data.pos_for_gender_number_cat = {
["nouns"] = "nouns",
["nouns"] = "nouns",
Line 735: Line 844:
}
}


-- Lower limit for a "long" word in a particular language.
--[==[ var:
-- Used to categorize terms into e.g. [[:Category:Long English words]] automatically.
Lower limit for a "long" word in a particular language. Used to categorize terms into e.g.
-- Languages with no mapping here do not get categorized.
[[:Category:Long English words]] automatically. Languages with no mapping here do not get categorized.
]==]
data.long_word_thresholds = {
data.long_word_thresholds = {
     ["af"] = 20,
     ["af"] = 20,
Line 753: Line 863:
------ 3. Page-wide processing (so that it only needs to be done once per page). ------
------ 3. Page-wide processing (so that it only needs to be done once per page). ------
data.page = require(headword_page_module).process_page()
data.page = require(headword_page_module).process_page()
-- FIXME: Random references to data.pagename and data.encoded_pagename are scattered throughout the codebase.
-- Set some page properties directly on `data` for ease of use.
data.pagename = data.page.pagename
data.pagename = data.page.pagename
data.encoded_pagename = data.page.encoded_pagename
data.encoded_pagename = data.page.encoded_pagename


return data
return data