|
|
Line 1: |
Line 1: |
| local encode = mw.text.encode
| |
| local u = mw.ustring.char
| |
| local data = {} | | local data = {} |
|
| |
|
| data.ignore_cap = { | | data.high_memory_entries = { |
| ["ko"] = true, | | "a", |
| | "animal", |
| | "book", |
| | "coffee", |
| | "do", |
| | "e", |
| | "language", |
| | "night", |
| | "smoke", |
| | "son", |
| | "sun", |
| | "water", |
| | "wind", |
| } | | } |
|
| |
|
| data.phonetic_extraction = {
| |
| ["th"] = "Module:th",
| |
| ["km"] = "Module:km",
| |
| }
| |
|
| |
|
| data.pos_tags = {
| | local U = mw.ustring.char |
| ["a"] = "adjective",
| | local soft_hyphen = U(0xAD) |
| ["adv"] = "adverb",
| |
| ["int"] = "interjection",
| |
| ["n"] = "noun",
| |
| ["pron"] = "pronoun",
| |
| ["v"] = "verb",
| |
| ["vi"] = "intransitive verb",
| |
| ["vt"] = "transitive verb",
| |
| ["vti"] = "transitive and intransitive verb",
| |
| }
| |
|
| |
|
| --[[ The "actual title" is the page name with the prefix "Unsupported titles/" removed. | | --[[ The "actual title" is the page name with the prefix "Unsupported titles/" removed. |
Line 28: |
Line 25: |
| data.unsupported_titles = { | | data.unsupported_titles = { |
| [" "] = "Space", | | [" "] = "Space", |
| ["# #"] = "Enclosing number signs", | | ["{"] = "Left curly bracket", |
| ["#"] = "Number sign",
| | ["}"] = "Right curly bracket", |
| ["#MeToo"] = "MeToo",
| |
| ["#MeTooed"] = "MeTooed",
| |
| ["#MeTooing"] = "MeTooing",
| |
| ["#MeToos"] = "MeToos",
| |
| ["&"] = "Amp",
| |
| ["¯\\_(ツ)_/¯"] = "¯\\ (ツ) /¯",
| |
| ["¯_(ツ)_/¯"] = "¯ (ツ) /¯",
| |
| ["(^_^)"] = "(^ ^)",
| |
| ["*_*"] = "* *",
| |
| ["."] = "Full stop",
| |
| [".."] = "Double period",
| |
| ["./."] = "Period slash period",
| |
| [": :"] = ": :",
| |
| [":"] = ":",
| |
| [":-{"] = "Colon hyphen left curly bracket",
| |
| [":{"] = "Colon left curly bracket", | |
| [":|"] = "Colon vertical line",
| |
| ["=_="] = "= =",
| |
| ["[ ]"] = "Square brackets",
| |
| ["["] = "Left square bracket", | | ["["] = "Left square bracket", |
| ["[…]"] = "Square bracketed ellipsis",
| |
| ["[...]"] = "Left square bracket ... right square bracket",
| |
| ["[-0-]"] = "Australian Aboriginal Flag emoticon alternative",
| |
| ["[citation needed]"] = "Square bracketed citation needed",
| |
| ["[-o-]"] = "Australian Aboriginal Flag emoticon",
| |
| ["]"] = "Right square bracket", | | ["]"] = "Right square bracket", |
| ["^_^"] = "^ ^",
| |
| ["_ _"] = "Underscore space underscore",
| |
| ["-_-"] = "- -",
| |
| ["_"] = "Underscore",
| |
| ["{ }"] = "Curly brackets",
| |
| ["{"] = "Left curly bracket",
| |
| ["| |"] = "Enclosing vertical lines",
| |
| ["|"] = "Vertical line",
| |
| ["-||-"] = "Hyphen vertical line vertical line hyphen",
| |
| ["||"] = "Vertical line vertical line",
| |
| ["}"] = "Right curly bracket",
| |
| ["</s>"] = "End s tag",
| |
| ["< />"] = "Less than trailing slash greater than",
| |
| ["< > </ >"] = "HTML start tag end tag",
| |
| ["< >"] = "Enclosing less than greater than",
| |
| ["<!-- -->"] = "HTML comment",
| |
| ["<-"] = "Less than hyphen",
| |
| ["<"] = "Less than", | | ["<"] = "Less than", |
| ["</3"] = "Less than slash three", | | [">"] = "Greater than", |
| ["<\\3"] = "Less than backslash three", | | ["=<"] = "Equal less than", |
| ["<<"] = "Double less than", | | ["=>"] = "Equal greater than", |
| ["<<<"] = "Triple less than", | | [">="] = "Greater than equal", |
| ["<="] = "Less than equal", | | ["<="] = "Less than equal", |
| | ["->"] = "Hyphen greater than", |
| | ["<-"] = "Less than hyphen", |
| | [">_<"] = "Greater than low line less than", |
| | ["::"] = "Double colon", |
| | [": :"] = "Enclosing colons", |
| | [":="] = "Colon equals", |
| | [":Þ"] = "Colon capital thorn", |
| | [":("] = "Colon left paren", |
| | [":)"] = "Colon right paren", |
| ["<>"] = "Less than greater than", | | ["<>"] = "Less than greater than", |
| ["<3"] = "Less than three", | | ["<3"] = "Less than three", |
| | ["</3"] = "Less than slash three", |
| | ["< >"] = "Enclosing less than greater than", |
| | ["< />"] = "Less than trailing slash greater than", |
| | ["< > </ >"] = "HTML start tag end tag", |
| | ["<!-- -->"] = "HTML comment", |
| ["<g>"] = "g tag", | | ["<g>"] = "g tag", |
| ["=<"] = "Equal less than", | | [":-("] = "Colon hyphen left paren", |
| ["=>"] = "Equal greater than",
| | [":-)"] = "Colon hyphen right paren", |
| [">"] = "Greater than",
| | ["|"] = "Vertical line", |
| ["->"] = "Hyphen greater than", | | ["||"] = "Vertical line vertical line", |
| [">_<"] = "Greater than low line less than", | | ["| |"] = "Enclosing vertical lines", |
| [">="] = "Greater than equal", | |
| [">>"] = "Double greater than",
| |
| [">>>"] = "Triple greater than",
| |
| ["×_×"] = "× ×",
| |
| ["9_9"] = "9 9", | |
| ["C#"] = "C sharp", | | ["C#"] = "C sharp", |
| | ["#"] = "Number sign", |
| | ["# #"] = "Enclosing number signs", |
| | [":"] = "Colon", |
| | [".."] = "Double period", |
| | ["."] = "Full stop", |
| | ["_"] = "Low line", |
| | ["-_-"] = "Low line interfix", |
| | [U(0xFFFD)] = "Replacement character", |
| | [U(0x1680)] = "Ogham space", |
| | ["[ ]"] = "Square brackets", |
| | ["{ }"] = "Curly brackets", |
| | ["[…]"] = "Square bracketed ellipsis", |
| | ["_ _"] = "Enclosing low lines", |
| ["C|N>K"] = "C through N to K", | | ["C|N>K"] = "C through N to K", |
| ["eq #"] = "eq number sign", | | ["#MeToo"] = "MeToo", |
| ["f##k"] = "f double number sign k",
| |
| ["f##ked"] = "f double number sign ked",
| |
| ["f##king"] = "f double number sign king",
| |
| ["f##ks"] = "f double number sign ks",
| |
| ["hr #"] = "hr number sign",
| |
| ["n_n"] = "n n",
| |
| ["O_O"] = "O O",
| |
| ["O_o"] = "O o",
| |
| ["o_O"] = "o O",
| |
| ["o_o"] = "o o",
| |
| ["snake_case"] = "snake case",
| |
| ["T_T"] = "T T",
| |
| ["u_u"] = "u u",
| |
| ["X_X"] = "X X",
| |
| ["x_x"] = "x x",
| |
| ["x86_64"] = "x86 64",
| |
| ["λοπαδοτεμαχοσελαχογαλεοκρανιολειψανοδριμυποτριμματοσιλφιοκαραβομελιτοκατακεχυμενοκιχλεπικοσσυφοφαττοπεριστεραλεκτρυονοπτοκεφαλλιοκιγκλοπελειολαγῳοσιραιοβαφητραγανοπτερύγων"] = "Ancient Greek dish",
| |
| ["о/."] = "о slash dot", | | ["о/."] = "о slash dot", |
| ["ಠ_ಠ"] = "ಠ ಠ",
| |
| ["ಥ_ಥ"] = "ಥ ಥ",
| |
| ["┬─┬ノ( º _ ºノ)"] = "┬─┬ノ( º ºノ)",
| |
| ["กรุงเทพมหานคร อมรรัตนโกสินทร์ มหินทรายุธยา มหาดิลกภพ นพรัตนราชธานีบูรีรมย์ อุดมราชนิเวศน์มหาสถาน อมรพิมานอวตารสถิต สักกะทัตติยวิษณุกรรมประสิทธิ์"] = "Thai name of Bangkok", | | ["กรุงเทพมหานคร อมรรัตนโกสินทร์ มหินทรายุธยา มหาดิลกภพ นพรัตนราชธานีบูรีรมย์ อุดมราชนิเวศน์มหาสถาน อมรพิมานอวตารสถิต สักกะทัตติยวิษณุกรรมประสิทธิ์"] = "Thai name of Bangkok", |
| [u(0x1680)] = "Ogham space", | | ["λοπαδοτεμαχοσελαχογαλεοκρανιολειψανοδριμυποτριμματοσιλφιοκαραβομελιτοκατακεχυμενοκιχλ" .. soft_hyphen .. "επικοσσυφοφαττοπεριστεραλεκτρυονοπτοκεφαλλιοκιγκλοπελειολαγῳοσιραιοβαφητραγανοπτερύγων"] = "Ancient Greek dish", |
| [u(0x3000)] = "Ideographic space", | | [":≠"] = ":≠", |
| [u(0xFFFD)] = "Replacement character",
| | ["S:t"] = "S:t", |
| }
| | ["S:ta"] = "S:ta", |
| | | ["c:a"] = "c:a", |
| data.display_change = {
| | ["n:a"] = "n:a", |
| [" "] = "] [", -- Space | | ["n:o"] = "n:o", |
| [u(0x00A0)] = "]" .. u(0x00A0) .. "[", -- No-break space | | ["n:r"] = "n:r", |
| [u(0x180E)] = "]" .. u(0x180E) .. "[", -- Mongolian vowel separator
| | ["s:a"] = "s:a", |
| [u(0x2000)] = "]" .. u(0x2000) .. "[", -- En quad | | ["st:a"] = "st:a", |
| [u(0x2001)] = "]" .. u(0x2001) .. "[", -- Em quad
| | ["v:a"] = "v:a", |
| [u(0x2002)] = "]" .. u(0x2002) .. "[", -- En space | |
| [u(0x2003)] = "]" .. u(0x2003) .. "[", -- Em space
| |
| [u(0x2004)] = "]" .. u(0x2004) .. "[", -- Three-per-em space | |
| [u(0x2005)] = "]" .. u(0x2005) .. "[", -- Four-per-em space
| |
| [u(0x2006)] = "]" .. u(0x2006) .. "[", -- Six-per-em space | |
| [u(0x2007)] = "]" .. u(0x2007) .. "[", -- Figure space
| |
| [u(0x2008)] = "]" .. u(0x2008) .. "[", -- Punctuation space | |
| [u(0x2009)] = "]" .. u(0x2009) .. "[", -- Thin space
| |
| [u(0x200A)] = "]" .. u(0x200A) .. "[", -- Hair space | |
| [u(0x202F)] = "]" .. u(0x202F) .. "[", -- Narrow no-break space
| |
| [u(0x205F)] = "]" .. u(0x205F) .. "[", -- Medium mathematical space | |
| [u(0x3000)] = "]" .. u(0x3000) .. "[", -- Ideographic space
| |
| } | | } |
|
| |
|
| -- Valid URI schemes in external links, which therefore have to be escaped if used in entry names (e.g. [[sms:a]]).
| | for i, item in ipairs(data.high_memory_entries) do |
| local uri_schemes = {
| | data.high_memory_entries[i] = nil |
| "bitcoin:",
| | data.high_memory_entries[item] = true |
| "ftp://",
| |
| "ftps://",
| |
| "geo:",
| |
| "git://",
| |
| "gopher://",
| |
| "http://",
| |
| "https://",
| |
| "irc:",
| |
| "ircs:",
| |
| "magnet:",
| |
| "mailto:",
| |
| "mms://",
| |
| "news:",
| |
| "nntp://",
| |
| "redis://",
| |
| "sftp://",
| |
| "sip:",
| |
| "sips:",
| |
| "sms:",
| |
| "ssh://",
| |
| "svn://",
| |
| "tel:",
| |
| "telnet://",
| |
| "urn:",
| |
| "worldwind://",
| |
| "xmpp:", | |
| }
| |
| -- Convert into lookup table.
| |
| local uri_lookup = {}
| |
| for _, scheme in ipairs(uri_schemes) do
| |
| uri_lookup[scheme] = encode(scheme, ":")
| |
| end | | end |
| data.uri_schemes = uri_lookup
| |
|
| |
|
| return data | | return data |