Module:headword/data: Difference between revisions
No edit summary |
No edit summary |
||
| Line 7: | Line 7: | ||
------ 1. Lists which are converted into sets. ------ | ------ 1. Lists which are converted into sets. ------ | ||
-- | --[==[ var: | ||
Large pages where we disable label tracking, red link checking and similar. | |||
]==] | |||
data.large_pages = list_to_set { | |||
-- pages that consistently hit timeouts | |||
"a", | |||
-- pages that sometimes hit timeouts | |||
"A", | |||
"baba", | |||
"de", | |||
"e", | |||
"i", | |||
"lima", | |||
"o", | |||
"u", | |||
"и", | |||
"山", | |||
"子", | |||
"月", | |||
"一", | |||
"人", | |||
} | |||
--[==[ var: | |||
Map from singular to plural, and from plural to itself, for recognized parts of speech with irregular plurals. Most of | |||
these are invariable plurals, e.g. `kanji` is its own plural; but we also have `mora` plural `morae`. | |||
]==] | |||
data.irregular_plurals = list_to_set({ | |||
"cmavo", | "cmavo", | ||
"cmene", | "cmene", | ||
| Line 26: | Line 52: | ||
return item | return item | ||
end) | end) | ||
local irregular_plurals = data.irregular_plurals | |||
-- Irregular non-zero plurals AND any regular plurals where the singular ends in "s", | -- Irregular non-zero plurals AND any regular plurals where the singular ends in "s", | ||
| Line 37: | Line 65: | ||
end | end | ||
--[==[ var: | |||
Recognized lemmas. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the category | |||
'LANG lemmas' will automatically be added. If the part of speech is not a singular or plural lemma or non-lemma form and | |||
is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added to various tracking | |||
categories: | |||
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos]] | |||
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/LANG]] | |||
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS]] | |||
* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS/LANG]] | |||
]==] | |||
data.lemmas = list_to_set{ | data.lemmas = list_to_set{ | ||
"abbreviations", | "abbreviations", | ||
| Line 111: | Line 147: | ||
} | } | ||
--[==[ var: | |||
Recognized non-lemma forms. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the | |||
category 'LANG non-lemma forms' will automatically be added. If the part of speech is not a singular or plural lemma or | |||
non-lemma form and is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added | |||
to various tracking categories; see the documentation of `data.lemmas`. | |||
]==] | |||
data.nonlemmas = list_to_set{ | data.nonlemmas = list_to_set{ | ||
"active participle forms", | "active participle forms", | ||
| Line 198: | Line 240: | ||
} | } | ||
-- | --[==[ var: | ||
List of languages that will not have links to separate parts of the headword. | |||
]==] | |||
data.no_multiword_links = list_to_set{ | data.no_multiword_links = list_to_set{ | ||
"zh", | "zh", | ||
} | } | ||
-- | --[==[ var: | ||
List of languages that will not have `LANG multiword terms` categories added. There are various reasons why languages | |||
are in this list: (a) words are written without spaces between them; (b) syllables are written with spaces between them; | |||
(c) variant reconstructions are notated with a tilde surrounded by spaces; (d) the language is a sign language, where | |||
pagenames are multiword descriptions of the gesture(s) required to make an individual sign; (e) other language-specific quirks. | |||
]==] | |||
data.no_multiword_cat = list_to_set{ | data.no_multiword_cat = list_to_set{ | ||
-------- Languages without spaces between words (sometimes spaces between phrases) -------- | -------- Languages without spaces between words (sometimes spaces between phrases) -------- | ||
| Line 404: | Line 453: | ||
} | } | ||
-- | --[==[ var: | ||
List of languages where a hyphen is not considered a word separator for the `LANG multiword terms` category. Languages | |||
are in this list for various reasons; the reason for inclusion should be given in a comment next to each language. | |||
]==] | |||
data.hyphen_not_multiword_sep = list_to_set{ | data.hyphen_not_multiword_sep = list_to_set{ | ||
"akk", -- Akkadian; hyphens between syllables | "akk", -- Akkadian; hyphens between syllables | ||
| Line 412: | Line 464: | ||
"cnk", -- Khumi Chin; hyphens used in single words | "cnk", -- Khumi Chin; hyphens used in single words | ||
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables | "cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables | ||
"de", -- too many false positives | "de", -- German; too many false positives | ||
"esx-esk-pro", -- hyphen used to separate morphemes | "esx-esk-pro", -- hyphen used to separate morphemes | ||
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively | "fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively | ||
"gd", -- Scottish Gaelic; too many false positives like [[a-chianaibh]], [[a-nìos]], [[an-dè]] and other adverbs in a- and an- | |||
"hil", -- Hiligaynon; hyphens for mid-word glottal stops | "hil", -- Hiligaynon; hyphens for mid-word glottal stops | ||
"hnn", -- Hanunoo; too many false positives | "hnn", -- Hanunoo; too many false positives | ||
| Line 429: | Line 482: | ||
} | } | ||
-- | --[==[ var: | ||
List of languages that will not have `LANG masculine nouns` and similar categories added. Generally, these languages are | |||
lacking gender but use the gender field for other purposes. (This is a massive hack and should be changed.) | |||
]==] | |||
data.no_gender_cat = list_to_set{ | data.no_gender_cat = list_to_set{ | ||
-- Languages without gender but which use the gender field for other purposes | -- Languages without gender but which use the gender field for other purposes | ||
| Line 436: | Line 492: | ||
} | } | ||
--[==[ var: | |||
List of languages where [[Module:headword]] should not attempt to generate a transliteration even if the term is written | |||
in a non-Latin script. FIXME: Document the reason why each language is in this list. | |||
]==] | |||
data.notranslit = list_to_set{ | data.notranslit = list_to_set{ | ||
"ams", | "ams", | ||
| Line 500: | Line 560: | ||
} | } | ||
-- | --[==[ var: | ||
List of languages that will default to `sccat` being true, i.e. categories like `LANG POS in SCRIPT script` will | |||
automatically be generated. This can be overridden using {{para|sccat|0}} in {{tl|head}} or setting `sccat` to | |||
`false` in Lua. | |||
]==] | |||
data.default_sccat = list_to_set{ | |||
"inc-apa", | |||
"inc-ash", | |||
"kfr", | |||
"ks", | |||
"mr", | |||
"mwr", | |||
"inc-oaw", | |||
"inc-ohi", | |||
"omr", | |||
"inc-opa", | |||
"phr", | |||
"pi", | |||
"pra", | |||
"sa", | |||
"skr", | |||
"sd", | |||
} | |||
--[==[ var: | |||
List of script codes for which a script-tagged display title will be added. | |||
]==] | |||
data.toBeTagged = list_to_set{ | data.toBeTagged = list_to_set{ | ||
"Ahom", | "Ahom", | ||
| Line 648: | Line 734: | ||
} | } | ||
-- Parts of speech which will not be categorised in categories like | --[==[ var: | ||
Parts of speech which will not be categorised in categories like `English terms spelled with É` if the term is the | |||
character in question (e.g. the letter entry for English [[é]]). This contrasts with entries like the French adjective | |||
[[m̂]], which is a one-letter word spelled with the letter. | |||
]==] | |||
data.pos_not_spelled_with_self = list_to_set{ | data.pos_not_spelled_with_self = list_to_set{ | ||
"diacritical marks", | "diacritical marks", | ||
| Line 673: | Line 761: | ||
------ 2. Lists not converted into sets. ------ | ------ 2. Lists not converted into sets. ------ | ||
-- Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not | --[==[ var: | ||
Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not | |||
pluralized) form. It is singular so the same table can be used in [[Module:form of]] for the {{para|p}}/{{para|POS}} | |||
param and [[Module:links]] for the pos= param. Note that any part of speech, abbreviated or not, can be suffixed with | |||
`f` to generate the corresponding non-lemma form part of speech, such as `adjf`, `af` or `adjectivef` for | |||
`adjective form`, and `nounf` or `nf` for `noun form`. This expansion happens even when it does not make sense for the | |||
given part of speech (e.g. `pclf` expands to `particle form` and `symf` expands to `symbol form`), and currently also, | |||
at least in [[Module:headword]] (but not [[Module:links]]), even if the part before the `f` is not a recognized part of | |||
speech or abbreviation (hence `nerf` expands to `ner form`). | |||
]==] | |||
data.pos_aliases = { | data.pos_aliases = { | ||
a = "adjective", | a = "adjective", | ||
| Line 682: | Line 777: | ||
art = "article", | art = "article", | ||
aug = "augmentative", | aug = "augmentative", | ||
cls = "classifier", | |||
compadj = "comparative adjective", | compadj = "comparative adjective", | ||
compadv = "comparative adverb", | compadv = "comparative adverb", | ||
compdet = "comparative determiner", | |||
comppron = "comparative pronoun", | |||
conj = "conjunction", | conj = "conjunction", | ||
contr = "contraction", | contr = "contraction", | ||
conv = "converb", | conv = "converb", | ||
det = "determiner", | |||
dim = "diminutive", | |||
int = "interjection", | int = "interjection", | ||
interj = "interjection", | interj = "interjection", | ||
| Line 697: | Line 795: | ||
ni = "inanimate noun", | ni = "inanimate noun", | ||
num = "numeral", | num = "numeral", | ||
pastpart = "past participle", | |||
part = "participle", | part = "participle", | ||
pcl = "particle", | pcl = "particle", | ||
| Line 705: | Line 804: | ||
prep = "preposition", | prep = "preposition", | ||
prepphr = "prepositional phrase", | prepphr = "prepositional phrase", | ||
prespart = "present participle", | |||
pron = "pronoun", | pron = "pronoun", | ||
prop = "proper noun", | prop = "proper noun", | ||
| Line 710: | Line 810: | ||
propn = "proper noun", | propn = "proper noun", | ||
rom = "romanization", | rom = "romanization", | ||
roman = "romanization", | |||
romanisation = "romanization", | |||
romanisations = "romanization", | |||
suf = "suffix", | suf = "suffix", | ||
supadj = "superlative adjective", | supadj = "superlative adjective", | ||
supadv = "superlative adverb", | supadv = "superlative adverb", | ||
supdet = "superlative determiner", | |||
suppron = "superlative pronoun", | |||
sym = "symbol", | sym = "symbol", | ||
v = "verb", | v = "verb", | ||
vb = "verb", | vb = "verb", | ||
vi = "intransitive verb", | vi = "intransitive verb", | ||
vm = "modal verb", | |||
vt = "transitive verb", | vt = "transitive verb", | ||
-- the next four support Algonquian languages | -- the next four support Algonquian languages | ||
| Line 725: | Line 831: | ||
} | } | ||
-- | --[==[ var: | ||
Map of parts of speech for which categories like `German masculine nouns` or `Russian imperfective verbs` will be | |||
generated if the headword is of the appropriate gender/number. The map is used to canonicalize parts of speech for | |||
categorization purposes; specifically, proper nouns are categorized like nouns. | |||
]==] | |||
data.pos_for_gender_number_cat = { | data.pos_for_gender_number_cat = { | ||
["nouns"] = "nouns", | ["nouns"] = "nouns", | ||
| Line 735: | Line 844: | ||
} | } | ||
-- Lower limit for a "long" word in a particular language. | --[==[ var: | ||
Lower limit for a "long" word in a particular language. Used to categorize terms into e.g. | |||
[[:Category:Long English words]] automatically. Languages with no mapping here do not get categorized. | |||
]==] | |||
data.long_word_thresholds = { | data.long_word_thresholds = { | ||
["af"] = 20, | ["af"] = 20, | ||
| Line 753: | Line 863: | ||
------ 3. Page-wide processing (so that it only needs to be done once per page). ------ | ------ 3. Page-wide processing (so that it only needs to be done once per page). ------ | ||
data.page = require(headword_page_module).process_page() | data.page = require(headword_page_module).process_page() | ||
-- | -- Set some page properties directly on `data` for ease of use. | ||
data.pagename = data.page.pagename | data.pagename = data.page.pagename | ||
data.encoded_pagename = data.page.encoded_pagename | data.encoded_pagename = data.page.encoded_pagename | ||
return data | return data | ||