Module:links/data: Difference between revisions

From Linguifex
Jump to navigation Jump to search
(Undo revision 320507 by Sware (talk))
Tag: Undo
No edit summary
 
Line 1: Line 1:
local u = require("Module:string utilities").char
local data = {}
local data = {}


data.high_memory_entries = {
data.phonetic_extraction = {
"a",
["th"] = "Module:th",
"animal",
["km"] = "Module:km",
"book",
}
"coffee",
 
"do",
data.ignored_prefixes = {
"e",
["cat"] = true,
"language",
["category"] = true,
"night",
["file"] = true,
"smoke",
["image"] = true
"son",
"sun",
"water",
"wind",
}
}


data.pos_tags = {
["a"] = "adjective",
["adv"] = "adverb",
["int"] = "interjection",
["n"] = "noun",
["pron"] = "pronoun",
["v"] = "verb",
["vi"] = "intransitive verb",
["vt"] = "transitive verb",
["vti"] = "transitive and intransitive verb",
}


local U = mw.ustring.char
-- Scheme for using unsupported characters in titles.
local soft_hyphen = U(0xAD)
data.unsupported_characters = {
["#"] = "`num`",
["%"] = "`percnt`", -- only escaped in percent encoding
["&"] = "`amp`", -- only escaped in HTML entities
["."] = "`period`", -- only escaped in dot-slash notation
["<"] = "`lt`",
[">"] = "`gt`",
["["] = "`lsqb`",
["]"] = "`rsqb`",
["_"] = "`lowbar`",
["`"] = "`grave`", -- used to enclose unsupported characters in the scheme, so a raw use in an unsupported title must be escaped to prevent interference
["{"] = "`lcub`",
["|"] = "`vert`",
["}"] = "`rcub`",
["~"] = "`tilde`", -- only escaped when 3 or more are consecutive
["\239\191\189"] = "`repl`" -- replacement character U+FFFD, which can't be typed directly here due to an abuse filter
}


--[[ The "actual title" is the page name with the prefix "Unsupported titles/" removed.
-- Manually specified unsupported titles. Only put titles here if there is a different reason why they are unsupported, and not just because they contain one of the unsupported characters above.
["displayed_title"] = "actual title" ]]
data.unsupported_titles = {
data.unsupported_titles = {
[" "] = "Space",
[" "] = "Space",
["{"] = "Left curly bracket",
["&amp;"] = "`amp`amp;",
["}"] = "Right curly bracket",
["λοπαδοτεμαχοσελαχογαλεοκρανιολειψανοδριμυποτριμματοσιλφιοκαραβομελιτοκατακεχυμενοκιχλεπικοσσυφοφαττοπεριστεραλεκτρυονοπτοκεφαλλιοκιγκλοπελειολαγῳοσιραιοβαφητραγανοπτερύγων"] = "Ancient Greek dish",
["["] = "Left square bracket",
["]"] = "Right square bracket",
["<"] = "Less than",
[">"] = "Greater than",
["=<"] = "Equal less than",
["=>"] = "Equal greater than",
[">="] = "Greater than equal",
["<="] = "Less than equal",
["->"] = "Hyphen greater than",
["<-"] = "Less than hyphen",
[">_<"] = "Greater than low line less than",
["::"] = "Double colon",
[": :"] = "Enclosing colons",
[":="] = "Colon equals",
[":Þ"] = "Colon capital thorn",
[":("] = "Colon left paren",
[":)"] = "Colon right paren",
["<>"] = "Less than greater than",
["<3"] = "Less than three",
["</3"] = "Less than slash three",
["< >"] = "Enclosing less than greater than",
["< />"] = "Less than trailing slash greater than",
["< > </ >"] = "HTML start tag end tag",
["<!-- -->"] = "HTML comment",
["<g>"] = "g tag",
[":-("] = "Colon hyphen left paren",
[":-)"] = "Colon hyphen right paren",
["|"] = "Vertical line",
["||"] = "Vertical line vertical line",
["| |"] = "Enclosing vertical lines",
["C#"] = "C sharp",
["#"] = "Number sign",
["# #"] = "Enclosing number signs",
[":"] = "Colon",
[".."] = "Double period",
["."] = "Full stop",
["_"] = "Low line",
["-_-"] = "Low line interfix",
[U(0xFFFD)] = "Replacement character",
[U(0x1680)] = "Ogham space",
["[ ]"] = "Square brackets",
["{ }"] = "Curly brackets",
["[…]"] = "Square bracketed ellipsis",
["_ _"] = "Enclosing low lines",
["C|N>K"] = "C through N to K",
["#MeToo"] = "MeToo",
["о/."] = "о slash dot",
["กรุงเทพมหานคร อมรรัตนโกสินทร์ มหินทรายุธยา มหาดิลกภพ นพรัตนราชธานีบูรีรมย์ อุดมราชนิเวศน์มหาสถาน อมรพิมานอวตารสถิต สักกะทัตติยวิษณุกรรมประสิทธิ์"] = "Thai name of Bangkok",
["กรุงเทพมหานคร อมรรัตนโกสินทร์ มหินทรายุธยา มหาดิลกภพ นพรัตนราชธานีบูรีรมย์ อุดมราชนิเวศน์มหาสถาน อมรพิมานอวตารสถิต สักกะทัตติยวิษณุกรรมประสิทธิ์"] = "Thai name of Bangkok",
["λοπαδοτεμαχοσελαχογαλεοκρανιολειψανοδριμυποτριμματοσιλφιοκαραβομελιτοκατακεχυμενοκιχλ" .. soft_hyphen .. "επικοσσυφοφαττοπεριστεραλεκτρυονοπτοκεφαλλιοκιγκλοπελειολαγῳοσιραιοβαφητραγανοπτερύγων"] = "Ancient Greek dish",
[u(0x1680)] = "Ogham space",
[":≠"] = ":≠",
[u(0x3000)] = "Ideographic space"
["S:t"] = "S:t",
["S:ta"] = "S:ta",
["c:a"] = "c:a",
["n:a"] = "n:a",
["n:o"] = "n:o",
["n:r"] = "n:r",
["s:a"] = "s:a",
["st:a"] = "st:a",
["v:a"] = "v:a",
}
}
for i, item in ipairs(data.high_memory_entries) do
data.high_memory_entries[i] = nil
data.high_memory_entries[item] = true
end


return data
return data

Latest revision as of 09:44, 31 July 2024



local u = require("Module:string utilities").char

-- Table returned by this module; each section below attaches one field to it.
local data = {}

-- Keys appear to be language codes; each value is the name of a module
-- associated with that code.
data.phonetic_extraction = {
	th = "Module:th",
	km = "Module:km",
}

-- Set of prefixes, stored as prefix -> true so membership is a plain lookup.
data.ignored_prefixes = {
	cat = true,
	category = true,
	file = true,
	image = true,
}

-- Short part-of-speech tag -> spelled-out part-of-speech name.
data.pos_tags = {
	a = "adjective",
	adv = "adverb",
	int = "interjection",
	n = "noun",
	pron = "pronoun",
	v = "verb",
	vi = "intransitive verb",
	vt = "transitive verb",
	vti = "transitive and intransitive verb",
}

-- Scheme for using unsupported characters in titles.
-- Each key is one character and each value is its replacement: a short name
-- enclosed in backticks. The trailing notes on individual entries record the
-- situations in which that character actually needs escaping.
data.unsupported_characters = {
	["#"] = "`num`",
	["%"] = "`percnt`", -- only escaped in percent encoding
	["&"] = "`amp`", -- only escaped in HTML entities
	["."] = "`period`", -- only escaped in dot-slash notation
	["<"] = "`lt`",
	[">"] = "`gt`",
	["["] = "`lsqb`",
	["]"] = "`rsqb`",
	["_"] = "`lowbar`",
	["`"] = "`grave`", -- used to enclose unsupported characters in the scheme, so a raw use in an unsupported title must be escaped to prevent interference
	["{"] = "`lcub`",
	["|"] = "`vert`",
	["}"] = "`rcub`",
	["~"] = "`tilde`", -- only escaped when 3 or more are consecutive
	-- The key below is written as decimal byte escapes (the UTF-8 encoding of U+FFFD) rather than as a literal character.
	["\239\191\189"] = "`repl`" -- replacement character U+FFFD, which can't be typed directly here due to an abuse filter
}

-- Manually specified unsupported titles. Only put titles here if there is a different reason why they are unsupported, and not just because they contain one of the unsupported characters above.
-- Each entry maps one title string to another string; presumably the key is
-- the title as displayed and the value is the name it is stored under --
-- TODO confirm against the module that consumes this table.
-- NOTE(review): some keys are very long non-ASCII literals. Verify them
-- byte-for-byte (including any invisible characters) before editing.
data.unsupported_titles = {
	[" "] = "Space",
	["&amp;"] = "`amp`amp;",
	["λοπαδοτεμαχοσελαχογαλεοκρανιολειψανοδριμυποτριμματοσιλφιοκαραβομελιτοκατακεχυμενοκιχλεπικοσσυφοφαττοπεριστεραλεκτρυονοπτοκεφαλλιοκιγκλοπελειολαγῳοσιραιοβαφητραγανοπτερύγων"] = "Ancient Greek dish",
	["กรุงเทพมหานคร อมรรัตนโกสินทร์ มหินทรายุธยา มหาดิลกภพ นพรัตนราชธานีบูรีรมย์ อุดมราชนิเวศน์มหาสถาน อมรพิมานอวตารสถิต สักกะทัตติยวิษณุกรรมประสิทธิ์"] = "Thai name of Bangkok",
	-- The next two keys are built by calling `u` (the `char` function taken
	-- from "Module:string utilities" at the top of the file) with a numeric
	-- code; presumably this yields the character at that code point, so the
	-- space-like characters do not have to be typed literally.
	[u(0x1680)] = "Ogham space",
	[u(0x3000)] = "Ideographic space"
}

-- Hand the populated table back to whoever loads this module.
return data