Module:links/data: Difference between revisions

No edit summary
support different splits for different mammoth pages
Line 1: Line 1:
local data = {}
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
local u = require("Module:string utilities").char
local u = require("Module:string utilities").char
local data = {}


data.phonetic_extraction = {
data.phonetic_extraction = {
Line 13: Line 14:
["file"] = true,
["file"] = true,
["image"] = true
["image"] = true
}
-- Lookup table from a short part-of-speech tag to the spelled-out
-- part-of-speech name.
data.pos_tags = {
	a = "adjective",
	adv = "adverb",
	int = "interjection",
	n = "noun",
	pron = "pronoun",
	v = "verb",
	vi = "intransitive verb",
	vt = "transitive verb",
	vti = "transitive and intransitive verb"
}
}


Line 54: Line 43:
[u(0x1680)] = "Ogham space",
[u(0x1680)] = "Ogham space",
[u(0x3000)] = "Ideographic space"
[u(0x3000)] = "Ideographic space"
}
-- Mammoth pages contain only Translingual and English entries, if present. The remaining L2s are placed on subpages.
-- The same subpage titles are used across all mammoth pages for the convenience of bot and script operators.
-- Assuming that most mammoth pages will be Latin-script terms, the subpage groupings are determined by dividing the
-- list of Latin-script languages known to Wiktionary into two (three, ...) roughly equal alphabetic divisions. This is
-- easily done by looking at Petscan's output:
-- https://petscan.wmcloud.org/?sortby=title&language=en&ns%5B14%5D=1&categories=Latin+script+languages&project=wiktionary&doit=
-- This data structure contains types of splits, each of which is a list of names of splits and Lua patterns applied to
-- the decomposed L2 name (with apostrophes and double quotes removed and certain other transformations applied; see
-- get_L2_sort_key() in [[Module:headword/page]]), or "true" for the final catch-all subpage (which includes anything
-- not beginning with a Latin letter after the transformations are applied; this includes e.g. ǃKung but not 'Are'are,
-- which sorts with A, and not Àhàn, which likewise sorts with A). The patterns must be suitable for use with plain
-- string functions, not their mw.ustring equivalents.
-- Each split type is an ordered list of {subpage name, matcher} pairs, where
-- the matcher is either a Lua pattern string or `true` for the catch-all
-- entry that closes the list.
data.mammoth_page_subpage_types = {
	-- Two subpages.
	["twos"] = {
		{"languages A to L", "^[A-L]"},
		{"languages M to Z", true}
	},
	-- Three subpages.
	["threes"] = {
		{"languages A to I", "^[A-I]"},
		{"languages J to Q", "^[J-Q]"},
		{"languages R to Z", true}
	},
	-- Two subpages, divided after C instead of after L.
	["CJK"] = {
		-- Translingual and Chinese on one page
		{"languages A to C", "^[A-C]"},
		-- all the remainder (mostly Japanese, Korean, Vietnamese) on the other
		{"languages D to Z", true}
	}
}
-- "Mammoth pages" are pages whose entries cannot be housed on a single page because of MediaWiki limits. The key is
-- the page and the value is the subpage type, as defined above in `mammoth_page_subpage_types`.
-- Page title -> name of the split type used for that page's subpages.
data.mammoth_pages = {
	-- FIXME: change to threes
	a = "twos",
	-- required for testing purposes - please leave here
	["mammoth page test"] = "twos"
}
}


return data
return data