Home
Random

Log in

Settings

About Linguifex
Disclaimers

Module:headword/data: Difference between revisions

Language
Watch
View history
View source

@@ Line 7: / Line 7: @@
 ------ 1. Lists which are converted into sets. ------
--- Zero-plurals (i.e. invariable plurals).
+--[==[ var:
-local irregular_plurals = list_to_set({
+Large pages where we disable label tracking, red link checking and similar.
+]==]
+data.large_pages = list_to_set {
+	-- pages that consistently hit timeouts
+	"a",
+	-- pages that sometimes hit timeouts
+	"A",
+	"baba",
+	"de",
+	"e",
+	"i",
+	"lima",
+	"o",
+	"u",
+	"и",
+	"山",
+	"子",
+	"月",
+	"一",
+	"人",
+}
+--[==[ var:
+Map from singular to plural, and from plural to itself, for recognized parts of speech with irregular plurals. Most of
+these are invariable plurals, e.g. `kanji` is its own plural; but we also have `mora` plural `morae`.
+]==]
+data.irregular_plurals = list_to_set({
 	"cmavo",
 	"cmene",
@@ Line 26: / Line 52: @@
 	return item
 end)
+local irregular_plurals = data.irregular_plurals
 -- Irregular non-zero plurals AND any regular plurals where the singular ends in "s",
@@ Line 37: / Line 65: @@
 end
-data.irregular_plurals = irregular_plurals
+--[==[ var:
+Recognized lemmas. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the category
+'LANG lemmas' will automatically be added. If the part of speech is not a singular or plural lemma or non-lemma form and
+is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added to various tracking
+categories:
+* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos]]
+* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/LANG]]
+* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS]]
+* [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/pos/POS/LANG]]
+]==]
 data.lemmas = list_to_set{
 	"abbreviations",
@@ Line 111: / Line 147: @@
 }
+--[==[ var:
+Recognized non-lemma forms. If the part of speech in {{tl|head}} is set to one of these or its singular equivalent, the
+category 'LANG non-lemma forms' will automatically be added. If the part of speech is not a singular or plural lemma or
+non-lemma form and is not an abbreviation that expands to a recognized lemma or non-lemma form, the page will be added
+to various tracking categories; see the documentation of `data.lemmas`.
+]==]
 data.nonlemmas = list_to_set{
 	"active participle forms",
@@ Line 198: / Line 240: @@
 }
--- These langauges will not have links to separate parts of the headword.
+--[==[ var:
+List of languages that will not have links to separate parts of the headword.
+]==]
 data.no_multiword_links = list_to_set{
 	"zh",
 }
--- These languages will not have "LANG multiword terms" categories added.
+--[==[ var:
+List of languages that will not have `LANG multiword terms` categories added. There are various reasons why languages
+are in this list: (a) words are written without spaces between them; (b) syllables are written with spaces between them;
+(c) variant reconstructions are notated with a tilde surrounded by spaces; (d) the language is a sign language, where
+pagenames are multiword descriptions of the gesture(s) required to make an individual sign; (e) other idiosyncrasies.
+]==]
 data.no_multiword_cat = list_to_set{
 	-------- Languages without spaces between words (sometimes spaces between phrases) --------
@@ Line 404: / Line 453: @@
 }
--- In these languages, the hyphen is not considered a word separator for the "multiword terms" category.
+--[==[ var:
+List of languages where a hyphen is not considered a word separator for the `LANG multiword terms` category. There are
+numerous reasons why languages are in this list; the reason for inclusion should be listed beside each language.
+]==]
 data.hyphen_not_multiword_sep = list_to_set{
 	"akk", -- Akkadian; hyphens between syllables
@@ Line 412: / Line 464: @@
 	"cnk", -- Khumi Chin; hyphens used in single words
 	"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
-	"de", -- too many false positives
+	"de", -- German; too many false positives
 	"esx-esk-pro", -- hyphen used to separate morphemes
 	"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
+	"gd", -- Scottish Gaelic; too many false positives like [[a-chianaibh]], [[a-nìos]], [[an-dè]] and other adverbs in a- and an-
 	"hil", -- Hiligaynon; hyphens for mid-word glottal stops
 	"hnn", -- Hanunoo; too many false positives
@@ Line 429: / Line 482: @@
 }
--- These languages will not have "LANG masculine nouns" and similar categories added.
+--[==[ var:
+List of languages that will not have `LANG masculine nouns` and similar categories added. Generally, these languages are
+lacking gender but use the gender field for other purposes. (This is a massive hack and should be changed.)
+]==]
 data.no_gender_cat = list_to_set{
 	-- Languages without gender but which use the gender field for other purposes
@@ Line 436: / Line 492: @@
 }
+--[==[ var:
+List of languages where [[Module:headword]] should not attempt to generate a transliteration even if the term is written
+in a non-Latin script. FIXME: Notate reasons why each language is in this list.
+]==]
 data.notranslit = list_to_set{
 	"ams",
@@ Line 500: / Line 560: @@
 }
--- Script codes for which a script-tagged display title will be added.
+--[==[ var:
+List of languages that will default to `sccat` being true, i.e. categories like `LANG POS in SCRIPT script` will
+automatically be generated. This can be overridden using {{para|sccat|0}} in {{tl|head}} or setting `sccat` to
+`false` in Lua.
+]==]
+data.default_sccat = list_to_set{
+	"inc-apa",
+	"inc-ash",
+	"kfr",
+	"ks",
+	"mr",
+	"mwr",
+	"inc-oaw",
+	"inc-ohi",
+	"omr",
+	"inc-opa",
+	"phr",
+	"pi",
+	"pra",
+	"sa",
+	"skr",
+	"sd",
+}
+--[==[ var:
+List of script codes for which a script-tagged display title will be added.
+]==]
 data.toBeTagged = list_to_set{
 	"Ahom",
@@ Line 648: / Line 734: @@
 }
--- Parts of speech which will not be categorised in categories like "English terms spelled with É" if
+--[==[ var:
--- the term is the character in question (e.g. the letter entry for English [[é]]). This contrasts with
+Parts of speech which will not be categorised in categories like `English terms spelled with É` if the term is the
--- entries like the French adjective [[m̂]], which is a one-letter word spelled with the letter.
+character in question (e.g. the letter entry for English [[é]]). This contrasts with entries like the French adjective
+[[m̂]], which is a one-letter word spelled with the letter.
+]==]
 data.pos_not_spelled_with_self = list_to_set{
 	"diacritical marks",
@@ Line 673: / Line 761: @@
 ------ 2. Lists not converted into sets. ------
--- Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not
+--[==[ var:
--- pluralized) form. It is singular so that the same table can be used in [[Module:form of]] for the p=/POS= param
+Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not
--- and [[Module:links]] for the pos= param.
+pluralized) form. It is singular so the same table can be used in [[Module:form of]] for the {{para|p}}/{{para|POS}}
+param and [[Module:links]] for the pos= param. Note that any part of speech, abbreviated or not, can be suffixed with
+`f` to generate the corresponding non-lemma form part of speech, such as `adjf`, `af` or `adjectivef` for
+`adjective form`, and `nounf` or `nf` for `noun form`. This expansion happens even when it does not make sense for the
+given part of speech (e.g. `pclf` expands to `particle form` and `symf` expands to `symbol form`), and currently also,
+at least in [[Module:headword]] (but not [[Module:links]]), even if the part before the `f` is not a recognized part of
+speech or abbreviation (hence `nerf` expands to `ner form`).
+]==]
 data.pos_aliases = {
 	a = "adjective",
@@ Line 682: / Line 777: @@
 	art = "article",
 	aug = "augmentative",
-	det = "determiner",
+	cls = "classifier",
-	dim = "diminutive",
 	compadj = "comparative adjective",
 	compadv = "comparative adverb",
+	compdet = "comparative determiner",
+	comppron = "comparative pronoun",
 	conj = "conjunction",
 	contr = "contraction",
 	conv = "converb",
+	det = "determiner",
+	dim = "diminutive",
 	int = "interjection",
 	interj = "interjection",
@@ Line 697: / Line 795: @@
 	ni = "inanimate noun",
 	num = "numeral",
+	pastpart = "past participle",
 	part = "participle",
 	pcl = "particle",
@@ Line 705: / Line 804: @@
 	prep = "preposition",
 	prepphr = "prepositional phrase",
+	prespart = "present participle",
 	pron = "pronoun",
 	prop = "proper noun",
@@ Line 710: / Line 810: @@
 	propn = "proper noun",
 	rom = "romanization",
+	roman = "romanization",
+	romanisation = "romanization",
+	romanisations = "romanization",
 	suf = "suffix",
 	supadj = "superlative adjective",
 	supadv = "superlative adverb",
+	supdet = "superlative determiner",
+	suppron = "superlative pronoun",
 	sym = "symbol",
 	v = "verb",
 	vb = "verb",
 	vi = "intransitive verb",
+	vm = "modal verb",
 	vt = "transitive verb",
 	-- the next four support Algonquian languages
@@ Line 725: / Line 831: @@
 }
--- Parts of speech for which categories like "German masculine nouns" or "Russian imperfective verbs"
+--[==[ var:
--- will be generated if the headword is of the appropriate gender/number.
+Map of parts of speech for which categories like `German masculine nouns` or `Russian imperfective verbs` will be
+generated if the headword is of the appropriate gender/number. The map is used to canonicalize parts of speech for
+categorization purposes; specifically, proper nouns categorizes like nouns.
+]==]
 data.pos_for_gender_number_cat = {
 	["nouns"] = "nouns",
@@ Line 735: / Line 844: @@
 }
--- Lower limit for a "long" word in a particular language.
+--[==[ var:
--- Used to categorize terms into e.g. [[:Category:Long English words]] automatically.
+Lower limit for a "long" word in a particular language. Used to categorize terms into e.g.
--- Languages with no mapping here do not get categorized.
+[[:Category:Long English words]] automatically. Languages with no mapping here do not get categorized.
+]==]
 data.long_word_thresholds = {
      ["af"] = 20,
@@ Line 753: / Line 863: @@
 ------ 3. Page-wide processing (so that it only needs to be done once per page). ------
 data.page = require(headword_page_module).process_page()
--- Fuckme, random references to data.pagename and data.encoded_pagename are scattered throughout the codebase. FIXME!
+-- Set some page properties directly on `data` for ease of use.
 data.pagename = data.page.pagename
 data.encoded_pagename = data.page.encoded_pagename
 return data

Retrieved from "https://linguifex.com/wiki/Module:headword/data"

Languages

This page is not available in other languages.

Linguifex

Privacy policy
About Linguifex
Disclaimers
Desktop