Home
Random

Log in

Settings

About Linguifex
Disclaimers

Module:script utilities: Difference between revisions

Language
Watch
View history
View source

@@ Line 2: / Line 2: @@
 local anchors_module = "Module:anchors"
+local debug_track_module = "Module:debug/track"
 local links_module = "Module:links"
 local munge_text_module = "Module:munge text"
@@ Line 13: / Line 14: @@
 local require = require
 local toNFD = mw.ustring.toNFD
+local dump = mw.dumpObject
 --[==[
 Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
-	local function embedded_language_links(...)
+local function embedded_language_links(...)
-		embedded_language_links = require(links_module).embedded_language_links
+	embedded_language_links = require(links_module).embedded_language_links
-		return embedded_language_links(...)
+	return embedded_language_links(...)
-	end
+end
-	local function format_categories(...)
+local function find_best_script_without_lang(...)
-		format_categories = require(utilities_module).format_categories
+	find_best_script_without_lang = require(scripts_module).findBestScriptWithoutLang
-		return format_categories(...)
+	return find_best_script_without_lang(...)
-	end
+end
-	local function get_script(...)
+local function format_categories(...)
-		get_script = require(scripts_module).getByCode
+	format_categories = require(utilities_module).format_categories
-		return get_script(...)
+	return format_categories(...)
-	end
+end
-	local function language_anchor(...)
+local function get_script(...)
-		language_anchor = require(anchors_module).language_anchor
+	get_script = require(scripts_module).getByCode
-		return language_anchor(...)
+	return get_script(...)
-	end
+end
-	local function munge_text(...)
+local function language_anchor(...)
-		munge_text = require(munge_text_module)
+	language_anchor = require(anchors_module).language_anchor
-		return munge_text(...)
+	return language_anchor(...)
-	end
+end
-	local function process_params(...)
+local function munge_text(...)
-		process_params = require(parameters_module).process
+	munge_text = require(munge_text_module)
-		return process_params(...)
+	return munge_text(...)
-	end
+end
-	local function u(...)
+local function process_params(...)
-		u = require(string_utilities_module).char
+	process_params = require(parameters_module).process
-		return u(...)
+	return process_params(...)
-	end
+end
-	local function ugsub(...)
+local function track(...)
-		ugsub = require(string_utilities_module).gsub
+	track = require(debug_track_module)
-		return ugsub(...)
+	return track(...)
-	end
+end
-	local function umatch(...)
+local function u(...)
-		umatch = require(string_utilities_module).match
+	u = require(string_utilities_module).char
-		return umatch(...)
+	return u(...)
-	end
+end
+local function ugsub(...)
+	ugsub = require(string_utilities_module).gsub
+	return ugsub(...)
+end
+local function umatch(...)
+	umatch = require(string_utilities_module).match
+	return umatch(...)
+end
 --[==[
 Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]
-	local m_data
+local m_data
-	local function get_data()
+local function get_data()
-		m_data, get_data = mw.loadData("Module:script utilities/data"), nil
+	m_data, get_data = mw.loadData("Module:script utilities/data"), nil
-		return m_data
+	return m_data
-	end
+end
 --[=[
@@ Line 78: / Line 90: @@
 	[[Module:parameters]]
 	[[Module:utilities]]
+	[[Module:debug/track]]
 ]=]
@@ Line 107: / Line 120: @@
 		if sc then
+			-- Track uses of sc parameter.
+			if sc:getCode() == lang:findBestScript(text):getCode() then
+				insert(cats, lang:getFullName() .. " terms with redundant script codes")
+			else
+				insert(cats, lang:getFullName() .. " terms with non-redundant manual script codes")
+			end
 		else
 			sc = lang:findBestScript(text)
@@ Line 124: / Line 143: @@
 		return export.tag_text(text, lang, sc, face, class) .. cats
 	end
+end
+-- Ustring functions perform codepoint-aware string matching. The basic string functions
+-- should be used for simple sequences of characters, and the Ustring functions for
+-- character sets – [].
+local function trackPattern(text, pattern, tracking)
+	if pattern and umatch(text, pattern) then
+		track("script/" .. tracking)
+	end
+end
+local function track_text(text, lang, sc)
+	if lang and text then
+		local langCode = lang:getFullCode()
+		-- [[Special:WhatLinksHere/Wiktionary:Tracking/script/ang/acute]]
+		if langCode == "ang" then
+			local decomposed = toNFD(text)
+			local acute = u(0x301)
+			trackPattern(decomposed, acute, "ang/acute")
+		--[=[
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Greek/wrong-phi]]
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Greek/wrong-theta]]
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Greek/wrong-kappa]]
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Greek/wrong-rho]]
+			ϑ, ϰ, ϱ, ϕ should generally be replaced with θ, κ, ρ, φ.
+		]=]
+		elseif langCode == "el" or langCode == "grc" then
+			trackPattern(text, "ϑ", "Greek/wrong-theta")
+			trackPattern(text, "ϰ", "Greek/wrong-kappa")
+			trackPattern(text, "ϱ", "Greek/wrong-rho")
+			trackPattern(text, "ϕ", "Greek/wrong-phi")
+			--[=[
+			[[Special:WhatLinksHere/Wiktionary:Tracking/script/Ancient Greek/spacing-coronis]]
+			[[Special:WhatLinksHere/Wiktionary:Tracking/script/Ancient Greek/spacing-smooth-breathing]]
+			[[Special:WhatLinksHere/Wiktionary:Tracking/script/Ancient Greek/wrong-apostrophe]]
+				When spacing coronis and spacing smooth breathing are used as apostrophes,
+				they should be replaced with right single quotation marks (’).
+			]=]
+			if langCode == "grc" then
+				trackPattern(text, u(0x1FBD), "Ancient Greek/spacing-coronis")
+				trackPattern(text, u(0x1FBF), "Ancient Greek/spacing-smooth-breathing")
+				trackPattern(text, "[" .. u(0x1FBD) .. u(0x1FBF) .. "]", "Ancient Greek/wrong-apostrophe", true)
+			end
+		-- [[Special:WhatLinksHere/Wiktionary:Tracking/script/Russian/grave-accent]]
+		elseif langCode == "ru" then
+			local decomposed = toNFD(text)
+			trackPattern(decomposed, u(0x300), "Russian/grave-accent")
+		-- [[Special:WhatLinksHere/Wiktionary:Tracking/script/Chuvash/latin-homoglyph]]
+		elseif langCode == "cv" then
+			trackPattern(text, "[ĂăĔĕÇçŸÿ]", "Chuvash/latin-homoglyph")
+		-- [[Special:WhatLinksHere/Wiktionary:Tracking/script/Tibetan/trailing-punctuation]]
+		elseif langCode == "bo" then
+			trackPattern(text, "[་།]$", "Tibetan/trailing-punctuation")
+			trackPattern(text, "[་།]%]%]$", "Tibetan/trailing-punctuation")
+		--[=[
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Thai/broken-ae]]
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Thai/broken-am]]
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Thai/wrong-rue-lue]]
+		]=]
+		elseif langCode == "th" then
+			trackPattern(text, "เ".."เ", "Thai/broken-ae")
+			trackPattern(text, "ํ[่้๊๋]?า", "Thai/broken-am")
+			trackPattern(text, "[ฤฦ]า", "Thai/wrong-rue-lue")
+		--[=[
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Lao/broken-ae]]
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Lao/broken-am]]
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Lao/possible-broken-ho-no]]
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Lao/possible-broken-ho-mo]]
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Lao/possible-broken-ho-lo]]
+		]=]
+		elseif langCode == "lo" then
+			trackPattern(text, "ເ".."ເ", "Lao/broken-ae")
+			trackPattern(text, "ໍ[່້໊໋]?າ", "Lao/broken-am")
+			trackPattern(text, "ຫນ", "Lao/possible-broken-ho-no")
+			trackPattern(text, "ຫມ", "Lao/possible-broken-ho-mo")
+			trackPattern(text, "ຫລ", "Lao/possible-broken-ho-lo")
+		--[=[
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Lü/broken-ae]]
+		[[Special:WhatLinksHere/Wiktionary:Tracking/script/Lü/possible-wrong-sequence]]
+		]=]
+		elseif langCode == "khb" then
+			trackPattern(text, "ᦵ".."ᦵ", "Lü/broken-ae")
+			trackPattern(text, "[ᦀ-ᦫ][ᦵᦶᦷᦺ]", "Lü/possible-wrong-sequence")
+		end
+	end
+end
+local function Kore_ruby(...)
+	-- Cache character sets on the first call.
+	local Hang_chars = get_script("Hang"):getCharacters()
+	local Hani_chars = get_script("Hani"):getCharacters()
+	-- Overwrite with the actual function, which is called directly on subsequent calls.
+	function Kore_ruby(txt)
+		return (ugsub(txt, "([%-".. Hani_chars .. "]+)%(([%-" .. Hang_chars .. "]+)%)", "<ruby>%1<rp>(</rp><rt>%2</rt><rp>)</rp></ruby>"))
+	end
+	return Kore_ruby(...)
 end
@@ Line 141: / Line 270: @@
 function export.tag_text(text, lang, sc, face, class, id)
 	if not sc then
-		sc = lang:findBestScript(text)
+		if lang then
+			sc = lang:findBestScript(text)
+		else
+			sc = find_best_script_without_lang(text)
+		end
 	end
+	track_text(text, lang, sc)
 	-- Replace space characters with newlines in Mongolian-script text, which is written top-to-bottom.
-	if sc:getDirection():match("vertical") and text:find(" ") then
+	if sc:getDirection():find("vertical", nil, true) and text:find(" ", nil, true) then
 		text = munge_text(text, function(txt)
 			-- having extra parentheses makes sure only the first return value gets through
@@ Line 157: / Line 292: @@
 	-- language needing such processing.
 	-- 20220221: Also convert 漢字(한자) to ruby, instead of needing [[Template:Ruby]].
-	if sc:getCode() == "Kore" and (text:find("-", 1, true) or text:find("[()]")) then
+	if sc:getCode() == "Kore" and text:match("[%-()g]") then
-		text = munge_text(text, function(txt)
+		local title, display = require("Module:links").get_wikilink_parts(text, true)
-			txt = txt:gsub("%-(%-?)", "%1")
+		if title ~= nil then -- special case that the text is a single link, do not munge and preserve affix hyphens
-			txt = ugsub(txt, "([%-".. get_script("Hani"):getCharacters() .. "]+)%(([%-" .. get_script("Hang"):getCharacters() .. "]+)%)", "<ruby>%1<rp>(</rp><rt>%2</rt><rp>)</rp></ruby>")
+			if lang and lang:getCode() == "okm" then -- Middle Korean code from [[User:Chom.kwoy]]
-			return txt
+				-- Comment from [[User:Lunabunn]]:
-		end)
+				-- In Middle Korean orthography, syllable formation is phonemic as opposed to morpheme-boundary-based a la
+				-- modern Korean. As such, for example, if you were to write nam-i, it would be rendered as na.mi so if you
+				-- then put na-mi to indicate particle boundaries as in modern Korean, the hyphen would be misplaced.
+				-- Previously, this was alleviated by specialcasing na--mi but [[User:Theknightwho]] made that resolve to -
+				-- in the Hangul (previously we used to just delete all -s in Hangul processing), so it broke.
+				-- [[User:Chom.kwoy]] implemented a different solution, which is writing -> instead using however many >s to
+				-- shift the hyphen by that number of letters in the romanization.
+				-- By the time we are called, > signs have been converted to &gt; by a call to encode_entities() in
+				-- make_link() in [[Module:links]] (near the bottom of the function).
+				-- 'g' in Middle Korean is a special sign to treat the following ㅇ sign as /G/ instead of null.
+				display = display:gsub("&gt;", ""):gsub("g", "")
+			end
+			if display:find("<") then
+				display = munge_text(display, function(txt)
+					txt = txt:gsub("(.)%-(%-?)(.)", "%1%2%3")
+					return Kore_ruby(txt)
+				end)
+			else
+				display = display:gsub("(.)%-(%-?)(.)", "%1%2%3")
+				display = Kore_ruby(display)
+			end
+			text = "[[" .. title .. "|" .. display .. "]]"
+		else
+			text = munge_text(text, function(txt)
+				if lang and lang:getCode() == "okm" then
+					txt = txt:gsub("&gt;", ""):gsub("g", "")
+				end
+				if txt == text then -- special case for the entire text being plain
+					txt = txt:gsub("(.)%-(%-?)(.)", "%1%2%3")
+				else
+					txt = txt:gsub("%-(%-?)", "%1")
+				end
+				return Kore_ruby(txt)
+			end)
+		end
 	end
 	if sc:getCode() == "Image" then
 		face = nil
 	end
-	local function class_attr(classes)
+	if face == "hypothetical" then
-		-- if the script code is hyphenated (i.e. language code-script code, add the last component as a class as well)
+	-- [[Special:WhatLinksHere/Wiktionary:Tracking/script-utilities/face/hypothetical]]
-		-- e.g. ota-Arab adds both Arab and ota-Arab as classes
+		track("script-utilities/face/hypothetical")
-		if sc:getCode():find("-", 1, true) then
+	end
-			insert(classes, 1, (ugsub(sc:getCode(), ".+%-", "")))
-			insert(classes, 2, sc:getCode())
+	local data = (m_data or get_data()).faces[face or "plain"]
-		else
+	if data == nil then
-			insert(classes, 1, sc:getCode())
+		error('Invalid script face "' .. face .. '".')
-		end
-		if class and class ~= '' then
-			insert(classes, class)
-		end
-		return 'class="' .. concat(classes, ' ') .. '"'
 	end
-	local function tag_attr(...)
+	local tag = data.tag
-		local output = {}
+	local opening_tag = {tag}
-		if id then
-			insert(output, 'id="' .. language_anchor(lang, id) .. '"')
+	if lang and id then
-		end
+		insert(opening_tag, 'id="' .. language_anchor(lang, id) .. '"')
-		insert(output, class_attr({...}) )
-		if lang then
-			-- FIXME: Is it OK to insert the etymology-only lang code and have it fall back to the first part of the
-			-- lang code (by chopping off the '-...' part)? It seems the :lang() selector does this; not sure about
-			-- [lang=...] attributes.
-			insert(output, 'lang="' .. lang:getFullCode() .. '"')
-		end
-		return concat(output, " ")
 	end
-	local data = (m_data or get_data()).faces[face or "plain"]
+	local classes = {data.class}
+	-- if the script code is hyphenated (i.e. language code-script code), add the last component as a class as well
-	-- Add a script wrapper
+	-- e.g. ota-Arab adds both Arab and ota-Arab as classes
-	if data then
+	if sc:getCode():find("-", nil, true) then
-		return ( data.prefix or "" ) .. '<' .. data.tag .. ' ' .. tag_attr(data.class) .. '>' .. text .. '</' .. data.tag .. '>'
+		insert(classes, 1, (ugsub(sc:getCode(), ".+%-", "")))
+		insert(classes, 2, sc:getCode())
 	else
-		error('Invalid script face "' .. face .. '".')
+		insert(classes, 1, sc:getCode())
+	end
+	if class and class ~= '' then
+		insert(classes, class)
+	end
+	insert(opening_tag, 'class="' .. concat(classes, ' ') .. '"')
+	-- FIXME: Is it OK to insert the etymology-only lang code and have it fall back to the first part of the
+	-- lang code (by chopping off the '-...' part)? It seems the :lang() selector does this; not sure about
+	-- [lang=...] attributes.
+	if lang then
+		insert(opening_tag, 'lang="' .. lang:getFullCode() .. '"')
 	end
+	-- Add a script wrapper
+	return (data.prefix or "") .. "<" .. concat(opening_tag, " ") .. ">" .. text .. "</" .. tag .. ">"
 end
@@ Line 229: / Line 398: @@
 			or error("Second argument to tag_translit should be a language code or language object.")
 	end
 	local data = (m_data or get_data()).translit[kind or "default"]
-	local opening_tag = {}
+	local tag = data.tag
+	local opening_tag = {tag}
-	insert(opening_tag, data.tag)
+	local class = data.class
 	if lang == "ja" then
-		insert(opening_tag, 'class="' .. (data.classes and data.classes .. " " or "") .. (is_manual and "manual-tr " or "") .. 'tr"')
+		insert(opening_tag, 'class="' .. (class and (class .. " ") or "") .. (is_manual and "manual-tr " or "") .. 'tr"')
 	else
 		insert(opening_tag, 'lang="' .. lang .. '-Latn"')
-		insert(opening_tag, 'class="' .. (data.classes and data.classes .. " " or "") .. (is_manual and "manual-tr " or "") .. 'tr Latn"')
+		insert(opening_tag, 'class="' .. (class and (class .. " ") or "") .. (is_manual and "manual-tr " or "") .. 'tr Latn"')
 	end
-	if data.dir then
+	local dir = data.dir
-		insert(opening_tag, 'dir="' .. data.dir .. '"')
+	if dir then
+		insert(opening_tag, 'dir="' .. dir .. '"')
+	end
+	if attributes then
+		track("tag_translit/attributes")
+		insert(opening_tag, attributes)
 	end
-	insert(opening_tag, attributes)
+	return "<" .. concat(opening_tag, " ") .. ">" .. translit .. "</" .. tag .. ">"
-	return "<" .. concat(opening_tag, " ") .. ">" .. translit .. "</" .. data.tag .. ">"
 end
@@ Line 257: / Line 431: @@
 			or error("Second argument to tag_transcription should be a language code or language object.")
 	end
 	local data = (m_data or get_data()).transcription[kind or "default"]
-	local opening_tag = {}
+	local tag = data.tag
+	local opening_tag = {tag}
-	insert(opening_tag, data.tag)
+	local class = data.class
 	if lang == "ja" then
-		insert(opening_tag, 'class="' .. (data.classes and data.classes .. " " or "") .. 'ts"')
+		insert(opening_tag, 'class="' .. (class and (class .. " ") or "") .. 'ts"')
 	else
 		insert(opening_tag, 'lang="' .. lang .. '-Latn"')
-		insert(opening_tag, 'class="' .. (data.classes and data.classes .. " " or "") .. 'ts Latn"')
+		insert(opening_tag, 'class="' .. (class and (class .. " ") or "") .. 'ts Latn"')
+	end
+	local dir = data.dir
+	if dir then
+		insert(opening_tag, 'dir="' .. dir .. '"')
+	end
+	if attributes then
+		track("tag_transcription/attributes")
+		insert(opening_tag, attributes)
+	end
+	return "<" .. concat(opening_tag, " ") .. ">" .. transcription .. "</" .. tag .. ">"
+end
+--[==[Tags {def} as a definition.
+The <code>def</code> parameter must be one of the following:
+; {{code|lua|"gloss"}}
+: The text is wrapped in {{code|html|2=<span class="mention-gloss">...</span>}}.
+; {{code|lua|"non-gloss"}}
+: The text is wrapped in {{code|html|2=<span class="use-with-mention">...</span>}}.
+The optional <code>attributes</code> parameter is used to specify additional HTML attributes for the tag.]==]
+function export.tag_definition(def, kind, attributes)
+	local data = (m_data or get_data()).definition[kind]
+	if data == nil then
+		error("Second argument to tag_definition should specify the kind of definition from the list in [[Module:script utilities/data]].")
+	end
+	local tag = data.tag
+	local opening_tag = {tag}
+	local class = data.class
+	if class then
+		insert(opening_tag, 'class="' .. class .. '"')
 	end
-	if data.dir then
+	if attributes then
-		insert(opening_tag, 'dir="' .. data.dir .. '"')
+		insert(opening_tag, attributes)
 	end
-	insert(opening_tag, attributes)
+	return "<" .. concat(opening_tag, " ") .. ">" .. def .. "</" .. tag .. ">"
-	return "<" .. concat(opening_tag, " ") .. ">" .. transcription .. "</" .. data.tag .. ">"
 end
@@ Line 323: / Line 530: @@
 		-- If there are no non-Latin scripts, return nothing.
-		if not has_nonlatin then
+		if not has_nonlatin and lang:getCode() ~= "und" then
 			return ""
 		end
@@ Line 338: / Line 545: @@
 --[==[This is used by {{temp|rfscript}}. See there for more information.]==]
-do
+function export.template_rfscript(frame)
-	local function get_args(frame)
+	local boolean = {type = "boolean"}
-		local boolean = {type = "boolean"}
+	local args = process_params(frame:getParent().args, {
-		return process_params(frame:getParent().args, {
+		[1] = {required = true, type = "language", default = "und"},
-			[1] = {required = true, type = "language", default = "und"},
+		["sc"] = {type = "script"},
-			["sc"] = {type = "script"},
+		["usex"] = boolean,
-			["usex"] = boolean,
+		["quote"] = boolean,
-			["quote"] = boolean,
+		["nocat"] = boolean,
-			["nocat"] = boolean,
+		["sort"] = true,
-			["sort"] = true,
+	})
-		})
-	end
+	local ret = export.request_script(args[1], args["sc"], args.quote and "quote" or args.usex, args.nocat, args.sort)
-	function export.template_rfscript(frame)
+	if ret == "" then
-		local args = get_args(frame)
+		error("This language is written in the Latin alphabet. It does not need a native script.")
-		local ret = export.request_script(args[1], args["sc"], args.quote and "quote" or args.usex, args.nocat, args.sort)
-		if ret == "" then
-			error("This language is written in the Latin alphabet. It does not need a native script.")
-		else
-			return ret
-		end
 	end
+	return ret
 end

Retrieved from "https://linguifex.com/wiki/Module:script_utilities"

Languages

This page is not available in other languages.

Linguifex

Privacy policy
About Linguifex
Disclaimers
Desktop