Module:headword/page: Difference between revisions

m 1 revision imported
No edit summary
Tag: Manual revert
 
Line 1: Line 1:
local export = {}
local export = {}


local collation_module = "Module:collation"
local languages_module = "Module:languages"
local languages_module = "Module:languages"
local maintenance_category_module = "Module:maintenance category"
local maintenance_category_module = "Module:maintenance category"
local pages_module = "Module:pages"
local string_compare_module = "Module:string/compare"
local string_decode_entities_module = "Module:string/decodeEntities"
local string_remove_comments_module = "Module:string/removeComments"
local string_utilities_module = "Module:string utilities"
local string_utilities_module = "Module:string utilities"
local table_module = "Module:table"
local table_module = "Module:table"
Line 31: Line 28:
local ugsub = ustring.gsub
local ugsub = ustring.gsub


local function class_else_type(...)
--[==[
class_else_type = require(template_parser_module).class_else_type
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
return class_else_type(...)
local function class_else_type(...)
end
class_else_type = require(template_parser_module).class_else_type
 
return class_else_type(...)
local function decode_entities(...)
end
decode_entities = require(string_decode_entities_module)
return decode_entities(...)
local function decode_entities(...)
end
decode_entities = require(string_utilities_module).decode_entities
 
return decode_entities(...)
local function encode_entities(...)
end
encode_entities = require(string_utilities_module).encode_entities
return encode_entities(...)
local function encode_entities(...)
end
encode_entities = require(string_utilities_module).encode_entities
 
return encode_entities(...)
local function get_category(...)
end
get_category = require(maintenance_category_module).get_category
return get_category(...)
local function get_category(...)
end
get_category = require(maintenance_category_module).get_category
 
return get_category(...)
local function get_lang(...)
end
get_lang = require(languages_module).getByCode
return get_lang(...)
local function get_lang(...)
end
get_lang = require(languages_module).getByCode
 
return get_lang(...)
local function list_to_set(...)
end
list_to_set = require(table_module).listToSet
return list_to_set(...)
local function list_to_set(...)
end
list_to_set = require(table_module).listToSet
 
return list_to_set(...)
local function parse(...)
end
parse = require(template_parser_module).parse
return parse(...)
local function parse(...)
end
parse = require(template_parser_module).parse
 
return parse(...)
local function remove_comments(...)
end
remove_comments = require(string_remove_comments_module)
return remove_comments(...)
local function remove_comments(...)
end
remove_comments = require(string_utilities_module).remove_comments
 
return remove_comments(...)
local function physical_to_logical_pagename_if_mammoth(...)
end
physical_to_logical_pagename_if_mammoth = require(pages_module).physical_to_logical_pagename_if_mammoth
return physical_to_logical_pagename_if_mammoth(...)
local function split(...)
end
split = require(string_utilities_module).split
 
return split(...)
local function split(...)
end
split = require(string_utilities_module).split
return split(...)
local function string_sort(...)
end
string_sort = require(collation_module).string_sort
 
return string_sort(...)
local function string_compare(...)
end
string_compare = require(string_compare_module)
return string_compare(...)
local function uupper(...)
end
uupper = require(string_utilities_module).upper
 
return uupper(...)
local function uupper(...)
end
uupper = require(string_utilities_module).upper
return uupper(...)
end


--[==[
--[==[
Line 656: Line 650:
end
end
return unsupported_titles
return unsupported_titles
end
-- Memo table for computed sort keys, indexed by L2 heading name. To keep it small, only names that actually need
-- rewriting end up here: those with non-ASCII characters, or with ASCII characters that get removed or transformed
-- (apostrophe, double quote, hyphen, and the whitespace control characters tab through carriage return).
local L2_sort_key_cache = {}

--[==[
Return the key used to order the L2 (language) heading named `L2`.
* "Translingual" and "English" get the fixed one-byte keys "\1" and "\2" (presumably so they sort ahead of every other
  name; the comparison itself happens in the caller).
* A name made up solely of ASCII bytes other than tab through carriage return, `"`, `'` and `-` is its own key and is
  returned unchanged, without touching the cache.
* Any other name is decomposed to NFD, stripped of runs of the characters in `comb_chars_all` plus `'`, `"`, `ʻ` and
  `ʼ`, has each run of whitespace and/or hyphens collapsed to a single space, and is recomposed to NFC. The result is
  memoized in `L2_sort_key_cache`.]==]
function export.get_L2_sort_key(L2)
	-- Fixed keys for the two special-cased headings.
	if L2 == "Translingual" then
		return "\1"
	end
	if L2 == "English" then
		return "\2"
	end
	-- Plain ASCII with nothing to strip or fold: the name already is its own key.
	if match(L2, "^[%z\1-\b\14-!#-&(-,.-\127]+$") then
		return L2
	end
	local key = L2_sort_key_cache[L2]
	if not key then
		-- Work on the decomposed form so the characters in `comb_chars_all` can be deleted individually.
		local stripped = ugsub(toNFD(L2), "[" .. comb_chars_all .. "'\"ʻʼ]+", "")
		key = toNFC(ugsub(stripped, "[%s%-]+", " "))
		L2_sort_key_cache[L2] = key
	end
	return key
end
end


Line 703: Line 676:
* `ns`: Namespace table for the page from mw.site.namespaces (TODO: merge with `namespace` above).
* `ns`: Namespace table for the page from mw.site.namespaces (TODO: merge with `namespace` above).
* `full_raw_pagename`: Full version of the '''RAW''' pagename (i.e. unsupported-title pages aren't canonicalized);
* `full_raw_pagename`: Full version of the '''RAW''' pagename (i.e. unsupported-title pages aren't canonicalized);
   including the namespace and the base (portion before the slash).
   including the namespace and the root (portion before the slash).
* `pagename`: Canonicalized subpage portion of the pagename (unsupported-title pages are canonicalized).
* `pagename`: Canonicalized subpage portion of the pagename (unsupported-title pages are canonicalized).
* `pagename_with_base`: Same as `pagename` in the main namespace; otherwise, the whole pagename without the namespace.
* `decompose_pagename`: Equivalent of `pagename` in NFD decomposition.
* `decompose_pagename`: Equivalent of `pagename` in NFD decomposition.
* `pagename_len`: Length of `pagename` in Unicode chars, where combinations of spacing character + decomposed diacritic
* `pagename_len`: Length of `pagename` in Unicode chars, where combinations of spacing character + decomposed diacritic
Line 716: Line 688:
* `wikitext_topic_cat`: FIXME: Document me.
* `wikitext_topic_cat`: FIXME: Document me.
* `wikitext_langname_cat`: FIXME: Document me.
* `wikitext_langname_cat`: FIXME: Document me.
`no_fetch_content` says to not fetch and parse the content or set a DEFAULTSORT sort key, in order to save time on
test and documentation pages that have lots of template invocations that set `|pagename=`. It turns out nearly all the
time of this function is contained in the line `frame:callParserFunction("DEFAULTSORT", data.pagename_defaultsort)`,
so we skip it on test and documentation pages where it accomplishes nothing in any case.
]==]
]==]


function export.process_page(pagename, no_fetch_content)
function export.process_page(pagename)
local data = {
local data = {
comb_chars = comb_chars,
comb_chars = comb_chars,
Line 737: Line 704:
local function bad_pagename()
local function bad_pagename()
if not pagename then
if not pagename then
error("Internal error: Something wrong, `data.pagename` not specified but current title contains illegal characters")
error("Internal error: Something wrong, `data.pagename` not specified but current title containg illegal characters")
else
else
error(format("Bad value for `data.pagename`: '%s', which must not contain illegal characters", pagename))
error(format("Bad value for `data.pagename`: '%s', which must not contain illegal characters", pagename))
Line 750: Line 717:
raw_title = mw.title.getCurrentTitle()
raw_title = mw.title.getCurrentTitle()
end
end
 
data.namespace = raw_title.nsText
local nsText = raw_title.nsText
local namespace_is_reconstruction = nsText == "Reconstruction"
data.namespace = nsText
data.ns = mw.site.namespaces[raw_title.namespace]
data.ns = mw.site.namespaces[raw_title.namespace]
local full_raw_pagename = raw_title.fullText
data.full_raw_pagename = raw_title.fullText
data.full_raw_pagename = full_raw_pagename


local frame = mw.getCurrentFrame()
local frame = mw.getCurrentFrame()
Line 764: Line 727:
-- and substing on a nonexistent page is totally legit, and we don't actually need to be able to access the
-- and substing on a nonexistent page is totally legit, and we don't actually need to be able to access the
-- content of the page.
-- content of the page.
local content = not no_fetch_content and raw_title:getContent() or nil
local content = raw_title:getContent()


-- Get the pagename.
-- Get the pagename.
pagename = physical_to_logical_pagename_if_mammoth(raw_title)
pagename = gsub(raw_title.subpageText, "^Unsupported titles/(.+)", function(m)
pagename = gsub(pagename, "^Unsupported titles/(.+)", function(m)
insert(cats, "Unsupported titles")
local title = (unsupported_titles or get_unsupported_titles())[m]
local title = (unsupported_titles or get_unsupported_titles())[m]
if title then
if title then
Line 814: Line 775:
end)
end)
-- Save pagename, as the local variable will be destructively modified.
-- Save pagename, as the local variable will be destructively modified.
data.pagename = pagename
data.pagename = pagename
if nsText == "" then
data.pagename_with_base = pagename
else
data.pagename_with_base = raw_title.text
end
-- Decompose the pagename in Unicode normalization form D.
-- Decompose the pagename in Unicode normalization form D.
data.decompose_pagename = toNFD(pagename)
data.decompose_pagename = toNFD(pagename)
Line 840: Line 796:
data.encoded_pagename = encode_entities(data.pagename)
data.encoded_pagename = encode_entities(data.pagename)
data.pagename_defaultsort = get_lang("mul"):makeSortKey(data.encoded_pagename)
data.pagename_defaultsort = get_lang("mul"):makeSortKey(data.encoded_pagename)
if not no_fetch_content then
frame:callParserFunction("DEFAULTSORT", data.pagename_defaultsort)
frame:callParserFunction("DEFAULTSORT", data.pagename_defaultsort)
end
data.raw_defaultsort = uupper(raw_title.text)
data.raw_defaultsort = uupper(raw_title.text)
Line 848: Line 802:
-- Note: HTML comments shouldn't be removed from `content` until after this step, as they can affect the result.
-- Note: HTML comments shouldn't be removed from `content` until after this step, as they can affect the result.
do
do
local L2_list, L2_list_len, L2_sections = {}, 0, {}
local L2_list, L2_list_len, L2_sections, sort_cache, prev = {}, 0, {}, {}
local prev, rc
local new_cats, L2_wrong_order = {}
local new_cats, L2_wrong_order = {}
-- Return the sorting weight for the L2 heading name `L2`.
-- "Translingual" and "English" map to the fixed weights "\1" and "\2". A name consisting only of ASCII bytes other
-- than tab through carriage return, double quote, apostrophe and hyphen is used as its own weight. Anything else is
-- NFD-decomposed, stripped of the characters in `comb_chars_all` and of ' " ʻ ʼ, has runs of whitespace/hyphens folded
-- to one space, is recomposed to NFC, and the result is memoized in `sort_cache`.
local function get_weight(L2)
	if L2 == "Translingual" then
		return "\1"
	end
	if L2 == "English" then
		return "\2"
	end
	-- Nothing to strip or fold: the name is its own weight and is not cached.
	if match(L2, "^[%z\1-\b\14-!#-&(-,.-\127]+$") then
		return L2
	end
	local cached = sort_cache[L2]
	if cached then
		return cached
	end
	-- Only the first result of the inner substitution (the string) is carried forward.
	local folded = ugsub(toNFD(L2), "[" .. comb_chars_all .. "'\"ʻʼ]+", "")
	cached = toNFC(ugsub(folded, "[%s%-]+", " "))
	sort_cache[L2] = cached
	return cached
end
local function handle_heading(heading)
local function handle_heading(heading)
Line 865: Line 835:
L2_list[L2_list_len] = name
L2_list[L2_list_len] = name
L2_sections[heading.section] = name
L2_sections[heading.section] = name
-- Also add any L1s, since they terminate the preceding L2, but add a maintenance category since it's probably a mistake.
 
if level == 1 then
new_cats["Pages with unwanted L1 headings"] = true
end
-- Check the heading is in the right order.
-- Check the heading is in the right order.
-- FIXME: we need a more sophisticated sorting method which handles non-diacritic special characters (e.g. Magɨ).
-- FIXME: we need a more sophisticated sorting method which handles non-diacritic special characters (e.g. Magɨ).
if prev and not (
if prev and not (
L2_wrong_order or
L2_wrong_order or
string_compare(export.get_L2_sort_key(prev), export.get_L2_sort_key(name))
string_sort(get_weight(prev), get_weight(name))
) then
) then
new_cats["Pages with language headings in the wrong order"] = true
L2_wrong_order = true
L2_wrong_order = true
end
-- Check it's a canonical language name.
if not (langnames or get_langnames())[name] then
new_cats["Pages with nonstandard language headings"] = true
end
end
prev = name
prev = name
Line 886: Line 848:
local function handle_template(template)
local function handle_template(template)
-- Turn off redirect checking except in the Reconstruction namespace because the rc flag is only
local name = template:get_name()
-- used in the Reconstruction namespace and the other names are parser functions, which AFAIK can't
if name == "DISPLAYTITLE:" then
-- be redirected to.
local name = template:get_name(nil, not namespace_is_reconstruction and "no_redirect" or nil)
if name == "DEFAULTSORT:" then
new_cats["Pages with DEFAULTSORT conflicts"] = true
elseif name == "DISPLAYTITLE:" then
new_cats["Pages with DISPLAYTITLE conflicts"] = true
new_cats["Pages with DISPLAYTITLE conflicts"] = true
elseif name == "reconstructed" then
rc = true
end
end
end
end
Line 906: Line 861:
elseif node_class == "template" then
elseif node_class == "template" then
handle_template(node)
handle_template(node)
elseif node_class == "parameter" then
new_cats["Pages with raw triple-brace template parameters"] = true
end
end
end
end
Line 915: Line 868:
data.L2_list = L2_list
data.L2_list = L2_list
data.L2_sections = L2_sections
data.L2_sections = L2_sections
insert(cats, get_category("Pages with entries"))
insert(cats, get_category(format("Pages with %s entr%s", L2_list_len, L2_list_len == 1 and "y" or "ies")))
for cat in pairs(new_cats) do
for cat in pairs(new_cats) do
insert(cats, get_category(cat))
insert(cats, get_category(cat))
end
if namespace_is_reconstruction and not rc then
local langname = match(full_raw_pagename, "^Reconstruction:([^/]+)/.")
if langname then
insert(cats, get_category(langname .. " entries missing Template:reconstructed"))
end
end
end
end
end
Line 932: Line 876:
------ 4. Parse page for maintenance categories. ------
------ 4. Parse page for maintenance categories. ------
-- Use of tab characters.
-- Use of tab characters.
if content and find(content, "\t", 1, true) then
insert(cats, get_category("Pages with tab characters"))
end
-- Unencoded character(s) in title.
-- Unencoded character(s) in title.
local IDS = list_to_set{"⿰", "⿱", "⿲", "⿳", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻", "⿼", "⿽", "⿾", "⿿", "㇯"}
local IDS = list_to_set{"⿰", "⿱", "⿲", "⿳", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻", "⿼", "⿽", "⿾", "⿿", "㇯"}
for char in pairs(explode_pagename) do
if IDS[char] and char ~= data.pagename then
insert(cats, "Terms containing unencoded characters")
break
end
end


-- Raw wikitext use of a topic or langname category. Also check if any raw sortkeys have been used.
-- Raw wikitext use of a topic or langname category. Also check if any raw sortkeys have been used.
Line 1,041: Line 976:
data.wikitext_topic_cat = wikitext_topic_cat
data.wikitext_topic_cat = wikitext_topic_cat
data.wikitext_langname_cat = wikitext_langname_cat
data.wikitext_langname_cat = wikitext_langname_cat
if raw_sortkey then
 
insert(cats, get_category("Pages with raw sortkeys"))
end
end
end