Module:links/data: Difference between revisions
No edit summary |
support different splits for different mammoth pages |
||
| Line 1: | Line 1: | ||
local data = {} | |||
local unpack = unpack or table.unpack -- Lua 5.2 compatibility | |||
local u = require("Module:string utilities").char | local u = require("Module:string utilities").char | ||
data.phonetic_extraction = { | data.phonetic_extraction = { | ||
| Line 13: | Line 14: | ||
["file"] = true, | ["file"] = true, | ||
["image"] = true | ["image"] = true | ||
} | } | ||
| Line 54: | Line 43: | ||
[u(0x1680)] = "Ogham space", | [u(0x1680)] = "Ogham space", | ||
[u(0x3000)] = "Ideographic space" | [u(0x3000)] = "Ideographic space" | ||
} | |||
-- Mammoth pages contain only Translingual and English entries, if present. The remaining L2s are placed on subpages. | |||
-- The same subpage titles are used across all mammoth pages for the convenience of bot and script operators. | |||
-- Assuming that most mammoth pages will be Latin-script terms, the subpage groupings are determined by dividing the | |||
-- list of Latin-script languages known to Wiktionary into two (three, ...) roughly equal alphabetic divisions. This is | |||
-- easily done by looking at Petscan's output: | |||
-- https://petscan.wmcloud.org/?sortby=title&language=en&ns%5B14%5D=1&categories=Latin+script+languages&project=wiktionary&doit= | |||
-- This data structure contains types of splits. Each type is a list of {subpage name, pattern} entries, where the | |||
-- pattern is either a Lua pattern applied to the decomposed L2 name (with apostrophes and double quotes removed and | |||
-- certain other transformations applied; see get_L2_sort_key() in [[Module:headword/page]]) or the boolean `true`, | |||
-- which marks the final catch-all subpage. The catch-all includes anything not beginning with a Latin letter after | |||
-- the transformations are applied; this includes e.g. ǃKung but not 'Are'are, which sorts with A, and not Àhàn, | |||
-- which likewise sorts with A. The patterns must be suitable for use with plain string functions, not their | |||
-- mw.ustring equivalents. | |||
-- Maps a split-type name to a list of {subpage name, pattern} entries; the last entry of each list uses the boolean | |||
-- `true` in place of a pattern. | |||
data.mammoth_page_subpage_types = { | |||
-- Two subpages, divided between L and M. | |||
twos = { | |||
{"languages A to L", "^[A-L]"}, | |||
{"languages M to Z", true}, | |||
}, | |||
-- Three subpages, divided between I and J and between Q and R. | |||
threes = { | |||
{"languages A to I", "^[A-I]"}, | |||
{"languages J to Q", "^[J-Q]"}, | |||
{"languages R to Z", true}, | |||
}, | |||
-- Two subpages, divided between C and D. | |||
CJK = { | |||
{"languages A to C", "^[A-C]"}, -- Translingual and Chinese on one page | |||
{"languages D to Z", true}, -- all the remainder (mostly Japanese, Korean, Vietnamese) on the other | |||
}, | |||
} | |||
-- "Mammoth pages" are pages whose entries cannot be housed on a single page because of MediaWiki limits. The key is | |||
-- the page and the value is the subpage type, as defined above in `mammoth_page_subpage_types`. | |||
data.mammoth_pages = { | |||
["a"] = "twos", -- FIXME: switch this page to the "threes" split type | |||
["mammoth page test"] = "twos", -- required for testing purposes - please leave here | |||
} | } | ||
return data | return data | ||