Module:script utilities: Difference between revisions

Module:script utilities (view source)

Revision as of 19:29, 30 November 2022

3,409 bytes added, 30 November 2022

no edit summary

Sware

Bureaucrats, Administrators

48,407

edits

@@ Line 9: / Line 9: @@
 	[[Module:languages]]
 	[[Module:parameters]]
-	[[Module:utilities]]
+	[[Module:utilities/format_categories]]
 ]=]
 function export.is_Latin_script(sc)
 	-- Latn, Latf, Latinx, pjt-Latn
-	return nil and true or mw.ustring.find(require("Module:scripts").getByCode(sc)["_code"], "Lat") and true or false
+	return sc:getCode():find("Lat") and true or false
 end
@@ Line 43: / Line 43: @@
 end
--- Apply a function to `text`, but not to the target of wikilinks or to HTML tags.
+-- Ustring turns on the codepoint-aware string matching. The basic string function
-local function munge_text(text, fn)
+-- should be used for simple sequences of characters, Ustring function for
-	local has_html = text:find("<")
+-- sets – [].
-	local has_two_part_link = text:find("%[%[.*|")
+local function trackPattern(text, pattern, tracking, ustring)
-	if not has_html and not has_two_part_link then
+	local find = ustring and mw.ustring.find or string.find
-		return fn(text)
+end
-	end
+local function track(text, lang, sc)
+	local U = mw.ustring.char
+	if lang and text then
+		local langCode = lang:getCode()
+		-- [[Special:WhatLinksHere/Template:tracking/script/ang/acute]]
+		if langCode == "ang" then
+			local decomposed = mw.ustring.toNFD(text)
+			local acute = U(0x301)
+			trackPattern(decomposed, acute, "ang/acute")
+		--[=[
+		[[Special:WhatLinksHere/Template:tracking/script/Greek/wrong-phi]]
+		[[Special:WhatLinksHere/Template:tracking/script/Greek/wrong-theta]]
+		[[Special:WhatLinksHere/Template:tracking/script/Greek/wrong-kappa]]
+		[[Special:WhatLinksHere/Template:tracking/script/Greek/wrong-rho]]
+			ϑ, ϰ, ϱ, ϕ should generally be replaced with θ, κ, ρ, φ.
+		]=]
+		elseif langCode == "el" or langCode == "grc" then
+			trackPattern(text, "ϑ", "Greek/wrong-theta")
+			trackPattern(text, "ϰ", "Greek/wrong-kappa")
+			trackPattern(text, "ϱ", "Greek/wrong-rho")
+			trackPattern(text, "ϕ", "Greek/wrong-phi")
+			--[=[
+			[[Special:WhatLinksHere/Template:tracking/script/Ancient Greek/spacing-coronis]]
+			[[Special:WhatLinksHere/Template:tracking/script/Ancient Greek/spacing-smooth-breathing]]
+			[[Special:WhatLinksHere/Template:tracking/script/Ancient Greek/wrong-apostrophe]]
+				When spacing coronis and spacing smooth breathing are used as apostrophes,
+				they should be replaced with right single quotation marks (’).
+			]=]
+			if langCode == "grc" then
+				trackPattern(text, U(0x1FBD), "Ancient Greek/spacing-coronis")
+				trackPattern(text, U(0x1FBF), "Ancient Greek/spacing-smooth-breathing")
+				trackPattern(text, "[" .. U(0x1FBD) .. U(0x1FBF) .. "]", "Ancient Greek/wrong-apostrophe", true)
+			end
+		-- [[Special:WhatLinksHere/Template:tracking/script/Russian/grave-accent]]
+		elseif langCode == "ru" then
+			local decomposed = mw.ustring.toNFD(text)
+			trackPattern(decomposed, U(0x300), "Russian/grave-accent")
+		-- [[Special:WhatLinksHere/Template:tracking/script/Tibetan/trailing-punctuation]]
+		elseif langCode == "bo" then
+			trackPattern(text, "[་།]$", "Tibetan/trailing-punctuation", true)
+			trackPattern(text, "[་།]%]%]$", "Tibetan/trailing-punctuation", true)
-	local strutils = require("Module:string utilities")
+		--[=[
+		[[Special:WhatLinksHere/Template:tracking/script/Thai/broken-ae]]
+		[[Special:WhatLinksHere/Template:tracking/script/Thai/broken-am]]
+		[[Special:WhatLinksHere/Template:tracking/script/Thai/wrong-rue-lue]]
+		]=]
+		elseif langCode == "th" then
+			trackPattern(text, "เ".."เ", "Thai/broken-ae")
+			trackPattern(text, "ํ[่้๊๋]?า", "Thai/broken-am", true)
+			trackPattern(text, "[ฤฦ]า", "Thai/wrong-rue-lue", true)
-	local function munge_text_with_html(txt)
+		--[=[
-		local parts = strutils.capturing_split(txt, "(<[^>]->)")
+		[[Special:WhatLinksHere/Template:tracking/script/Lao/broken-ae]]
-		for i = 1, #parts, 2 do
+		[[Special:WhatLinksHere/Template:tracking/script/Lao/broken-am]]
-			parts[i] = fn(parts[i])
+		[[Special:WhatLinksHere/Template:tracking/script/Lao/possible-broken-ho-no]]
-		end
+		[[Special:WhatLinksHere/Template:tracking/script/Lao/possible-broken-ho-mo]]
-		return table.concat(parts)
+		[[Special:WhatLinksHere/Template:tracking/script/Lao/possible-broken-ho-lo]]
-	end
+		]=]
+		elseif langCode == "lo" then
+			trackPattern(text, "ເ".."ເ", "Lao/broken-ae")
+			trackPattern(text, "ໍ[່້໊໋]?າ", "Lao/broken-am", true)
+			trackPattern(text, "ຫນ", "Lao/possible-broken-ho-no")
+			trackPattern(text, "ຫມ", "Lao/possible-broken-ho-mo")
+			trackPattern(text, "ຫລ", "Lao/possible-broken-ho-lo")
-	if has_two_part_link then
+		--[=[
-		-- The hard case is when both two-part links and HTML tags occur, because crippled Lua patterns
+		[[Special:WhatLinksHere/Template:tracking/script/Lü/broken-ae]]
-		-- don't support alternation. We need to first split on two-part links (which seem more likely
+		[[Special:WhatLinksHere/Template:tracking/script/Lü/possible-wrong-sequence]]
-		-- to occur), then split odd-numbered fragments on HTML tags, then apply the function to
+		]=]
-		-- odd-numbered subfragments. This is unlikely to be very efficient, but should occur rarely.
+		elseif langCode == "khb" then
-		local parts = strutils.capturing_split(text, "(%[%[[^%[%]|]-|)")
+			trackPattern(text, "ᦵ".."ᦵ", "Lü/broken-ae")
-		for i = 1, #parts, 2 do
+			trackPattern(text, "[ᦀ-ᦫ][ᦵᦶᦷᦺ]", "Lü/possible-wrong-sequence", true)
-			if has_html then
-				parts[i] = munge_text_with_html(parts[i])
-			else
-				parts[i] = fn(parts[i])
-			end
 		end
-		return table.concat(parts)
-	else -- HTML tags only
-		return munge_text_with_html(text)
 	end
 end
@@ Line 84: / Line 139: @@
 	if not sc then
 		sc = require("Module:scripts").findBestScript(text, lang)
+	end
+	track(text, lang, sc)
+	-- Replace runs of spaces with <br> line breaks in text whose script is written top-to-bottom (e.g. Mongolian script).
+	if sc:getDirection() == "down" and text:find(" ") then
+		text = require("Module:munge_text")(text, function(txt)
+			-- having extra parentheses makes sure only the first return value gets through
+			return (txt:gsub(" +", "<br>"))
+		end)
 	end
-	-- Hack Korean text to remove hyphens. This should be handled in a more general fashion, but needs to
+	-- Hack Korean script text to remove hyphens.
+	-- XXX: This should be handled in a more general fashion, but needs to
 	-- be efficient by not doing anything if no hyphens are present, and currently this is the only
 	-- language needing such processing.
-	if lang:getCode() == "ko" and text:find("%-") then
+	-- 20220221: Also convert 漢字(한자) to ruby, instead of needing [[Template:Ruby]].
-		text = munge_text(text, function(txt)
+	if sc:getCode() == "Kore" and (text:find("%-") or text:find("[()]")) then
-			-- having extra parentheses makes sure only the first return value gets through
+		text = require("Module:munge_text")(text, function(txt)
-			return (txt:gsub("%-", ""))
+			-- Hani/Hang regex is a reasonable subset of Hani/Hang from [[Module:scripts/data]],
+			-- last checked on 20220221
+			txt = txt:gsub("%-", "")
+			txt = mw.ustring.gsub(txt, "([一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]+)%(([가-힣ᄀ-ᇿꥠ-ꥼힰ-ퟻ]+)%)", "<ruby>%1<rp>(</rp><rt>%2</rt><rp>)</rp></ruby>")
+			return txt
 		end)
 	end
@@ Line 249: / Line 319: @@
 	return "<small>[" .. disp_script .. " needed]</small>" ..
-		(nocat and "" or require("Module:utilities").format_categories({category}, lang, sort_key))
+		(nocat and "" or require("Module:utilities/format_categories")({category}, lang, sort_key))
 end

Module:script utilities: Difference between revisions

Module:script utilities (view source)

Revision as of 19:29, 30 November 2022

Navigation menu

Search