Module:headword/page: Difference between revisions
export get_L2_sort_key() Tag: Reverted |
No edit summary Tag: Manual revert |
||
| (One intermediate revision by the same user not shown) | |||
| Line 1: | Line 1: | ||
local export = {} | local export = {} | ||
local collation_module = "Module:collation" | |||
local languages_module = "Module:languages" | local languages_module = "Module:languages" | ||
local maintenance_category_module = "Module:maintenance category" | local maintenance_category_module = "Module:maintenance category" | ||
local string_utilities_module = "Module:string utilities" | local string_utilities_module = "Module:string utilities" | ||
local table_module = "Module:table" | local table_module = "Module:table" | ||
| Line 31: | Line 28: | ||
local ugsub = ustring.gsub | local ugsub = ustring.gsub | ||
local function class_else_type(...) | --[==[ | ||
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==] | |||
local function class_else_type(...) | |||
end | class_else_type = require(template_parser_module).class_else_type | ||
return class_else_type(...) | |||
local function decode_entities(...) | end | ||
local function decode_entities(...) | |||
end | decode_entities = require(string_utilities_module).decode_entities | ||
return decode_entities(...) | |||
local function encode_entities(...) | end | ||
local function encode_entities(...) | |||
end | encode_entities = require(string_utilities_module).encode_entities | ||
return encode_entities(...) | |||
local function get_category(...) | end | ||
local function get_category(...) | |||
end | get_category = require(maintenance_category_module).get_category | ||
return get_category(...) | |||
local function get_lang(...) | end | ||
local function get_lang(...) | |||
end | get_lang = require(languages_module).getByCode | ||
return get_lang(...) | |||
local function list_to_set(...) | end | ||
local function list_to_set(...) | |||
end | list_to_set = require(table_module).listToSet | ||
return list_to_set(...) | |||
local function parse(...) | end | ||
local function parse(...) | |||
end | parse = require(template_parser_module).parse | ||
return parse(...) | |||
local function remove_comments(...) | end | ||
local function remove_comments(...) | |||
end | remove_comments = require(string_utilities_module).remove_comments | ||
return remove_comments(...) | |||
end | |||
local function split(...) | |||
split = require(string_utilities_module).split | |||
return split(...) | |||
local function split(...) | end | ||
local function string_sort(...) | |||
end | string_sort = require(collation_module).string_sort | ||
return string_sort(...) | |||
local function | end | ||
local function uupper(...) | |||
end | uupper = require(string_utilities_module).upper | ||
return uupper(...) | |||
local function uupper(...) | end | ||
end | |||
--[==[ | --[==[ | ||
| Line 656: | Line 650: | ||
end | end | ||
return unsupported_titles | return unsupported_titles | ||
end | end | ||
| Line 703: | Line 676: | ||
* `ns`: Namespace table for the page from mw.site.namespaces (TODO: merge with `namespace` above). | * `ns`: Namespace table for the page from mw.site.namespaces (TODO: merge with `namespace` above). | ||
* `full_raw_pagename`: Full version of the '''RAW''' pagename (i.e. unsupported-title pages aren't canonicalized); | * `full_raw_pagename`: Full version of the '''RAW''' pagename (i.e. unsupported-title pages aren't canonicalized); | ||
including the namespace and the | including the namespace and the root (portion before the slash). | ||
* `pagename`: Canonicalized subpage portion of the pagename (unsupported-title pages are canonicalized). | * `pagename`: Canonicalized subpage portion of the pagename (unsupported-title pages are canonicalized). | ||
* `decompose_pagename`: Equivalent of `pagename` in NFD decomposition. | * `decompose_pagename`: Equivalent of `pagename` in NFD decomposition. | ||
* `pagename_len`: Length of `pagename` in Unicode chars, where combinations of spacing character + decomposed diacritic | * `pagename_len`: Length of `pagename` in Unicode chars, where combinations of spacing character + decomposed diacritic | ||
| Line 716: | Line 688: | ||
* `wikitext_topic_cat`: FIXME: Document me. | * `wikitext_topic_cat`: FIXME: Document me. | ||
* `wikitext_langname_cat`: FIXME: Document me. | * `wikitext_langname_cat`: FIXME: Document me. | ||
]==] | ]==] | ||
function export.process_page(pagename | function export.process_page(pagename) | ||
local data = { | local data = { | ||
comb_chars = comb_chars, | comb_chars = comb_chars, | ||
| Line 737: | Line 704: | ||
local function bad_pagename() | local function bad_pagename() | ||
if not pagename then | if not pagename then | ||
error("Internal error: Something wrong, `data.pagename` not specified but current title | error("Internal error: Something wrong, `data.pagename` not specified but current title containg illegal characters") | ||
else | else | ||
error(format("Bad value for `data.pagename`: '%s', which must not contain illegal characters", pagename)) | error(format("Bad value for `data.pagename`: '%s', which must not contain illegal characters", pagename)) | ||
| Line 750: | Line 717: | ||
raw_title = mw.title.getCurrentTitle() | raw_title = mw.title.getCurrentTitle() | ||
end | end | ||
data.namespace = raw_title.nsText | |||
data.namespace = nsText | |||
data.ns = mw.site.namespaces[raw_title.namespace] | data.ns = mw.site.namespaces[raw_title.namespace] | ||
data.full_raw_pagename = raw_title.fullText | |||
local frame = mw.getCurrentFrame() | local frame = mw.getCurrentFrame() | ||
| Line 764: | Line 727: | ||
-- and substing on a nonexistent page is totally legit, and we don't actually need to be able to access the | -- and substing on a nonexistent page is totally legit, and we don't actually need to be able to access the | ||
-- content of the page. | -- content of the page. | ||
local content = | local content = raw_title:getContent() | ||
-- Get the pagename. | -- Get the pagename. | ||
pagename = | pagename = gsub(raw_title.subpageText, "^Unsupported titles/(.+)", function(m) | ||
local title = (unsupported_titles or get_unsupported_titles())[m] | local title = (unsupported_titles or get_unsupported_titles())[m] | ||
if title then | if title then | ||
| Line 814: | Line 775: | ||
end) | end) | ||
-- Save pagename, as | -- Save pagename, as local variable will be destructively modified. | ||
data.pagename = pagename | data.pagename = pagename | ||
-- Decompose the pagename in Unicode normalization form D. | -- Decompose the pagename in Unicode normalization form D. | ||
data.decompose_pagename = toNFD(pagename) | data.decompose_pagename = toNFD(pagename) | ||
| Line 840: | Line 796: | ||
data.encoded_pagename = encode_entities(data.pagename) | data.encoded_pagename = encode_entities(data.pagename) | ||
data.pagename_defaultsort = get_lang("mul"):makeSortKey(data.encoded_pagename) | data.pagename_defaultsort = get_lang("mul"):makeSortKey(data.encoded_pagename) | ||
frame:callParserFunction("DEFAULTSORT", data.pagename_defaultsort) | |||
data.raw_defaultsort = uupper(raw_title.text) | data.raw_defaultsort = uupper(raw_title.text) | ||
| Line 848: | Line 802: | ||
-- Note: HTML comments shouldn't be removed from `content` until after this step, as they can affect the result. | -- Note: HTML comments shouldn't be removed from `content` until after this step, as they can affect the result. | ||
do | do | ||
local L2_list, L2_list_len, L2_sections = {}, 0, {} | local L2_list, L2_list_len, L2_sections, sort_cache, prev = {}, 0, {}, {} | ||
local new_cats, L2_wrong_order = {} | local new_cats, L2_wrong_order = {} | ||
-- Returns the string used as the comparison weight for the L2 heading name `L2`.
--   * "Translingual" and "English" get the fixed one-byte weights "\1" and "\2"
--     (presumably so they order ahead of every other name - TODO confirm against
--     the comparison done by the caller).
--   * A name made up solely of characters accepted by the pattern below is
--     returned unchanged and is never cached.
--   * Any other name is normalized once and memoized in `sort_cache`.
local function get_weight(L2)
	if L2 == "Translingual" then
		return "\1"
	end
	if L2 == "English" then
		return "\2"
	end
	-- The character set accepts bytes 0-8, 14-33, 35-38, 40-44 and 46-127, i.e. it
	-- rejects tab through carriage return, `"`, `'`, `-` and anything above 127
	-- (assuming `match` is the byte-based string.match - TODO confirm).
	if match(L2, "^[%z\1-\b\14-!#-&(-,.-\127]+$") then
		return L2
	end
	local key = sort_cache[L2]
	if key then
		return key
	end
	-- Decompose, then delete runs of the characters listed in `comb_chars_all`
	-- together with the quote/apostrophe characters ' " ʻ ʼ.
	local stripped = ugsub(toNFD(L2), "[" .. comb_chars_all .. "'\"ʻʼ]+", "")
	-- Collapse each run of whitespace and/or hyphens into one space, then recompose.
	key = toNFC(ugsub(stripped, "[%s%-]+", " "))
	sort_cache[L2] = key
	return key
end
local function handle_heading(heading) | local function handle_heading(heading) | ||
| Line 865: | Line 835: | ||
L2_list[L2_list_len] = name | L2_list[L2_list_len] = name | ||
L2_sections[heading.section] = name | L2_sections[heading.section] = name | ||
-- Check the heading is in the right order. | -- Check the heading is in the right order. | ||
-- FIXME: we need a more sophisticated sorting method which handles non-diacritic special characters (e.g. Magɨ). | -- FIXME: we need a more sophisticated sorting method which handles non-diacritic special characters (e.g. Magɨ). | ||
if prev and not ( | if prev and not ( | ||
L2_wrong_order or | L2_wrong_order or | ||
string_sort(get_weight(prev), get_weight(name)) | |||
) then | ) then | ||
L2_wrong_order = true | L2_wrong_order = true | ||
end | end | ||
prev = name | prev = name | ||
| Line 886: | Line 848: | ||
local function handle_template(template) | local function handle_template(template) | ||
local name = template:get_name() | |||
if name == "DISPLAYTITLE:" then | |||
local name = template:get_name( | |||
if | |||
new_cats["Pages with DISPLAYTITLE conflicts"] = true | new_cats["Pages with DISPLAYTITLE conflicts"] = true | ||
end | end | ||
end | end | ||
| Line 906: | Line 861: | ||
elseif node_class == "template" then | elseif node_class == "template" then | ||
handle_template(node) | handle_template(node) | ||
end | end | ||
end | end | ||
| Line 915: | Line 868: | ||
data.L2_list = L2_list | data.L2_list = L2_list | ||
data.L2_sections = L2_sections | data.L2_sections = L2_sections | ||
for cat in pairs(new_cats) do | for cat in pairs(new_cats) do | ||
insert(cats, get_category(cat)) | insert(cats, get_category(cat)) | ||
end | end | ||
end | end | ||
| Line 932: | Line 876: | ||
------ 4. Parse page for maintenance categories. ------ | ------ 4. Parse page for maintenance categories. ------ | ||
-- Use of tab characters. | -- Use of tab characters. | ||
-- Unencoded character(s) in title. | -- Unencoded character(s) in title. | ||
local IDS = list_to_set{"⿰", "⿱", "⿲", "⿳", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻", "", "", "", "", ""} | local IDS = list_to_set{"⿰", "⿱", "⿲", "⿳", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻", "", "", "", "", ""} | ||
-- Raw wikitext use of a topic or langname category. Also check if any raw sortkeys have been used. | -- Raw wikitext use of a topic or langname category. Also check if any raw sortkeys have been used. | ||
| Line 1,041: | Line 976: | ||
data.wikitext_topic_cat = wikitext_topic_cat | data.wikitext_topic_cat = wikitext_topic_cat | ||
data.wikitext_langname_cat = wikitext_langname_cat | data.wikitext_langname_cat = wikitext_langname_cat | ||
end | end | ||