Module:headword/page: Difference between revisions
Tags: Undo Reverted |
Tag: Undo |
||
| Line 1: | Line 1: | ||
local export = {} | local export = {} | ||
local languages_module = "Module:languages" | local languages_module = "Module:languages" | ||
local maintenance_category_module = "Module:maintenance category" | local maintenance_category_module = "Module:maintenance category" | ||
local pages_module = "Module:pages" | |||
local string_compare_module = "Module:string/compare" | |||
local string_decode_entities_module = "Module:string/decodeEntities" | |||
local string_remove_comments_module = "Module:string/removeComments" | |||
local string_utilities_module = "Module:string utilities" | local string_utilities_module = "Module:string utilities" | ||
local table_module = "Module:table" | local table_module = "Module:table" | ||
| Line 28: | Line 31: | ||
local ugsub = ustring.gsub | local ugsub = ustring.gsub | ||
local function class_else_type(...) | |||
class_else_type = require(template_parser_module).class_else_type | |||
return class_else_type(...) | |||
end | |||
local function decode_entities(...) | |||
decode_entities = require(string_decode_entities_module) | |||
return decode_entities(...) | |||
end | |||
local function encode_entities(...) | |||
encode_entities = require(string_utilities_module).encode_entities | |||
return encode_entities(...) | |||
end | |||
local function get_category(...) | |||
get_category = require(maintenance_category_module).get_category | |||
return get_category(...) | |||
end | |||
local function get_lang(...) | |||
get_lang = require(languages_module).getByCode | |||
return get_lang(...) | |||
end | |||
local function list_to_set(...) | |||
list_to_set = require(table_module).listToSet | |||
return list_to_set(...) | |||
end | |||
local function parse(...) | |||
parse = require(template_parser_module).parse | |||
return parse(...) | |||
end | |||
local function remove_comments(...) | |||
remove_comments = require(string_remove_comments_module) | |||
return remove_comments(...) | |||
end | |||
local function physical_to_logical_pagename_if_mammoth(...) | |||
physical_to_logical_pagename_if_mammoth = require(pages_module).physical_to_logical_pagename_if_mammoth | |||
local function split(...) | return physical_to_logical_pagename_if_mammoth(...) | ||
end | |||
local function split(...) | |||
split = require(string_utilities_module).split | |||
return split(...) | |||
end | |||
local function string_compare(...) | |||
string_compare = require(string_compare_module) | |||
return string_compare(...) | |||
end | |||
local function uupper(...) | |||
uupper = require(string_utilities_module).upper | |||
return uupper(...) | |||
end | |||
--[==[ | --[==[ | ||
| Line 650: | Line 656: | ||
end | end | ||
return unsupported_titles | return unsupported_titles | ||
end | |||
-- To save on memory, we only cache names that contain either non-ASCII characters or ASCII characters that get removed | |||
-- or transformed (apostrophe, double quote, hyphen, and whitespace control characters such as tab or newline). | |||
local L2_sort_key_cache = {} | |||
function export.get_L2_sort_key(L2) | |||
if L2 == "Translingual" then | |||
return "\1" | |||
elseif L2 == "English" then | |||
return "\2" | |||
elseif match(L2, "^[%z\1-\b\14-!#-&(-,.-\127]+$") then | |||
return L2 | |||
end | |||
local sort_key = L2_sort_key_cache[L2] | |||
if sort_key then | |||
return sort_key | |||
end | |||
sort_key = toNFC(ugsub(ugsub(toNFD(L2), "[" .. comb_chars_all .. "'\"ʻʼ]+", ""), "[%s%-]+", " ")) | |||
L2_sort_key_cache[L2] = sort_key | |||
return sort_key | |||
end | end | ||
| Line 676: | Line 703: | ||
* `ns`: Namespace table for the page from mw.site.namespaces (TODO: merge with `namespace` above). | * `ns`: Namespace table for the page from mw.site.namespaces (TODO: merge with `namespace` above). | ||
* `full_raw_pagename`: Full version of the '''RAW''' pagename (i.e. unsupported-title pages aren't canonicalized); | * `full_raw_pagename`: Full version of the '''RAW''' pagename (i.e. unsupported-title pages aren't canonicalized); | ||
including the namespace and the | including the namespace and the base (portion before the slash). | ||
* `pagename`: Canonicalized subpage portion of the pagename (unsupported-title pages are canonicalized). | * `pagename`: Canonicalized subpage portion of the pagename (unsupported-title pages are canonicalized). | ||
* `pagename_with_base`: Same as `pagename` in the main namespace; otherwise, the whole pagename without the namespace. | |||
* `decompose_pagename`: Equivalent of `pagename` in NFD decomposition. | * `decompose_pagename`: Equivalent of `pagename` in NFD decomposition. | ||
* `pagename_len`: Length of `pagename` in Unicode chars, where combinations of spacing character + decomposed diacritic | * `pagename_len`: Length of `pagename` in Unicode chars, where combinations of spacing character + decomposed diacritic | ||
| Line 688: | Line 716: | ||
* `wikitext_topic_cat`: FIXME: Document me. | * `wikitext_topic_cat`: FIXME: Document me. | ||
* `wikitext_langname_cat`: FIXME: Document me. | * `wikitext_langname_cat`: FIXME: Document me. | ||
If `no_fetch_content` is set, the page content is not fetched or parsed and no DEFAULTSORT sort key is set. This saves | |||
time on test and documentation pages that have many template invocations setting `|pagename=`. Nearly all of this | |||
function's running time is spent in the call `frame:callParserFunction("DEFAULTSORT", data.pagename_defaultsort)`, | |||
so we skip that call on test and documentation pages, where it accomplishes nothing in any case. | |||
]==] | ]==] | ||
function export.process_page(pagename) | function export.process_page(pagename, no_fetch_content) | ||
local data = { | local data = { | ||
comb_chars = comb_chars, | comb_chars = comb_chars, | ||
| Line 704: | Line 737: | ||
local function bad_pagename() | local function bad_pagename() | ||
if not pagename then | if not pagename then | ||
error("Internal error: Something wrong, `data.pagename` not specified but current title | error("Internal error: Something wrong, `data.pagename` not specified but current title contains illegal characters") | ||
else | else | ||
error(format("Bad value for `data.pagename`: '%s', which must not contain illegal characters", pagename)) | error(format("Bad value for `data.pagename`: '%s', which must not contain illegal characters", pagename)) | ||
| Line 717: | Line 750: | ||
raw_title = mw.title.getCurrentTitle() | raw_title = mw.title.getCurrentTitle() | ||
end | end | ||
data.namespace = | |||
local nsText = raw_title.nsText | |||
local namespace_is_reconstruction = nsText == "Reconstruction" | |||
data.namespace = nsText | |||
data.ns = mw.site.namespaces[raw_title.namespace] | data.ns = mw.site.namespaces[raw_title.namespace] | ||
local full_raw_pagename = raw_title.fullText | |||
data.full_raw_pagename = full_raw_pagename | |||
local frame = mw.getCurrentFrame() | local frame = mw.getCurrentFrame() | ||
| Line 727: | Line 764: | ||
-- and substing on a nonexistent page is totally legit, and we don't actually need to be able to access the | -- and substing on a nonexistent page is totally legit, and we don't actually need to be able to access the | ||
-- content of the page. | -- content of the page. | ||
local content = raw_title:getContent() | local content = not no_fetch_content and raw_title:getContent() or nil | ||
-- Get the pagename. | -- Get the pagename. | ||
pagename = gsub( | pagename = physical_to_logical_pagename_if_mammoth(raw_title) | ||
pagename = gsub(pagename, "^Unsupported titles/(.+)", function(m) | |||
insert(cats, "Unsupported titles") | |||
local title = (unsupported_titles or get_unsupported_titles())[m] | local title = (unsupported_titles or get_unsupported_titles())[m] | ||
if title then | if title then | ||
| Line 775: | Line 814: | ||
end) | end) | ||
-- Save pagename, as local variable will be destructively modified. | -- Save pagename, as the local variable will be destructively modified. | ||
data.pagename = pagename | data.pagename = pagename | ||
if nsText == "" then | |||
data.pagename_with_base = pagename | |||
else | |||
data.pagename_with_base = raw_title.text | |||
end | |||
-- Decompose the pagename in Unicode normalization form D. | -- Decompose the pagename in Unicode normalization form D. | ||
data.decompose_pagename = toNFD(pagename) | data.decompose_pagename = toNFD(pagename) | ||
| Line 796: | Line 840: | ||
data.encoded_pagename = encode_entities(data.pagename) | data.encoded_pagename = encode_entities(data.pagename) | ||
data.pagename_defaultsort = get_lang("mul"):makeSortKey(data.encoded_pagename) | data.pagename_defaultsort = get_lang("mul"):makeSortKey(data.encoded_pagename) | ||
frame:callParserFunction("DEFAULTSORT", data.pagename_defaultsort) | if not no_fetch_content then | ||
frame:callParserFunction("DEFAULTSORT", data.pagename_defaultsort) | |||
end | |||
data.raw_defaultsort = uupper(raw_title.text) | data.raw_defaultsort = uupper(raw_title.text) | ||
| Line 802: | Line 848: | ||
-- Note: HTML comments shouldn't be removed from `content` until after this step, as they can affect the result. | -- Note: HTML comments shouldn't be removed from `content` until after this step, as they can affect the result. | ||
do | do | ||
local L2_list, L2_list_len, L2_sections | local L2_list, L2_list_len, L2_sections = {}, 0, {} | ||
local prev, rc | |||
local new_cats, L2_wrong_order = {} | local new_cats, L2_wrong_order = {} | ||
local function handle_heading(heading) | local function handle_heading(heading) | ||
| Line 835: | Line 865: | ||
L2_list[L2_list_len] = name | L2_list[L2_list_len] = name | ||
L2_sections[heading.section] = name | L2_sections[heading.section] = name | ||
-- Also add any L1s, since they terminate the preceding L2, but add a maintenance category since it's probably a mistake. | |||
if level == 1 then | |||
new_cats["Pages with unwanted L1 headings"] = true | |||
end | |||
-- Check the heading is in the right order. | -- Check the heading is in the right order. | ||
-- FIXME: we need a more sophisticated sorting method which handles non-diacritic special characters (e.g. Magɨ). | -- FIXME: we need a more sophisticated sorting method which handles non-diacritic special characters (e.g. Magɨ). | ||
if prev and not ( | if prev and not ( | ||
L2_wrong_order or | L2_wrong_order or | ||
string_compare(export.get_L2_sort_key(prev), export.get_L2_sort_key(name)) | |||
) then | ) then | ||
new_cats["Pages with language headings in the wrong order"] = true | |||
L2_wrong_order = true | L2_wrong_order = true | ||
end | |||
-- Check it's a canonical language name. | |||
if not (langnames or get_langnames())[name] then | |||
new_cats["Pages with nonstandard language headings"] = true | |||
end | end | ||
prev = name | prev = name | ||
| Line 848: | Line 886: | ||
local function handle_template(template) | local function handle_template(template) | ||
local name = template:get_name() | -- Turn off redirect checking except in the Reconstruction namespace because the rc flag is only | ||
if name == "DISPLAYTITLE:" then | -- used in the Reconstruction namespace and the other names are parser functions, which AFAIK can't | ||
-- be redirected to. | |||
local name = template:get_name(nil, not namespace_is_reconstruction and "no_redirect" or nil) | |||
if name == "DEFAULTSORT:" then | |||
new_cats["Pages with DEFAULTSORT conflicts"] = true | |||
elseif name == "DISPLAYTITLE:" then | |||
new_cats["Pages with DISPLAYTITLE conflicts"] = true | new_cats["Pages with DISPLAYTITLE conflicts"] = true | ||
elseif name == "reconstructed" then | |||
rc = true | |||
end | end | ||
end | end | ||
| Line 861: | Line 906: | ||
elseif node_class == "template" then | elseif node_class == "template" then | ||
handle_template(node) | handle_template(node) | ||
elseif node_class == "parameter" then | |||
new_cats["Pages with raw triple-brace template parameters"] = true | |||
end | end | ||
end | end | ||
| Line 868: | Line 915: | ||
data.L2_list = L2_list | data.L2_list = L2_list | ||
data.L2_sections = L2_sections | data.L2_sections = L2_sections | ||
insert(cats, get_category("Pages with entries")) | |||
insert(cats, get_category(format("Pages with %s entr%s", L2_list_len, L2_list_len == 1 and "y" or "ies"))) | |||
for cat in pairs(new_cats) do | for cat in pairs(new_cats) do | ||
insert(cats, get_category(cat)) | insert(cats, get_category(cat)) | ||
end | |||
if namespace_is_reconstruction and not rc then | |||
local langname = match(full_raw_pagename, "^Reconstruction:([^/]+)/.") | |||
if langname then | |||
insert(cats, get_category(langname .. " entries missing Template:reconstructed")) | |||
end | |||
end | end | ||
end | end | ||
| Line 876: | Line 932: | ||
------ 4. Parse page for maintenance categories. ------ | ------ 4. Parse page for maintenance categories. ------ | ||
-- Use of tab characters. | -- Use of tab characters. | ||
if content and find(content, "\t", 1, true) then | |||
insert(cats, get_category("Pages with tab characters")) | |||
end | |||
-- Unencoded character(s) in title. | -- Unencoded character(s) in title. | ||
local IDS = list_to_set{"⿰", "⿱", "⿲", "⿳", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻", "⿼", "⿽", "⿾", "⿿", "㇯"} | local IDS = list_to_set{"⿰", "⿱", "⿲", "⿳", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻", "⿼", "⿽", "⿾", "⿿", "㇯"} | ||
for char in pairs(explode_pagename) do | |||
if IDS[char] and char ~= data.pagename then | |||
insert(cats, "Terms containing unencoded characters") | |||
break | |||
end | |||
end | |||
-- Raw wikitext use of a topic or langname category. Also check if any raw sortkeys have been used. | -- Raw wikitext use of a topic or langname category. Also check if any raw sortkeys have been used. | ||
| Line 976: | Line 1,041: | ||
data.wikitext_topic_cat = wikitext_topic_cat | data.wikitext_topic_cat = wikitext_topic_cat | ||
data.wikitext_langname_cat = wikitext_langname_cat | data.wikitext_langname_cat = wikitext_langname_cat | ||
if raw_sortkey then | |||
insert(cats, get_category("Pages with raw sortkeys")) | |||
end | |||
end | end | ||