Module:IPA: Difference between revisions

3,873 bytes added ,  16 November 2025
no edit summary
No edit summary
No edit summary
Line 3: Line 3:
local force_cat = false -- for testing
local force_cat = false -- for testing


local pages_module = "Module:pages"
local pron_qualifier_module = "Module:pron qualifier"
local pron_qualifier_module = "Module:pron qualifier"
local qualifier_module = "Module:qualifier"
local qualifier_module = "Module:qualifier"
local references_module = "Module:references"
local references_module = "Module:references"
local string_utilities_module = "Module:string utilities"
local string_utilities_module = "Module:string utilities"
local syllables_module = "Module:syllables"
local utilities_module = "Module:utilities"
local utilities_module = "Module:utilities"


local m_data = mw.loadData("Module:IPA/data")
local m_data = mw.loadData("Module:IPA/data")
local m_str_utils = require(string_utilities_module)
local m_str_utils = require(string_utilities_module)
local m_syllables -- [[Module:syllables]]; loaded below if needed
local m_symbols = mw.loadData("Module:IPA/data/symbols")
local m_symbols = mw.loadData("Module:IPA/data/symbols")


Line 19: Line 22:
local gsub = string.gsub
local gsub = string.gsub
local insert = table.insert
local insert = table.insert
local is_preview = require(pages_module).is_preview
local len = m_str_utils.len
local len = m_str_utils.len
local listToText = mw.text.listToText
local listToText = mw.text.listToText
local match = string.match
local match = string.match
local pattern_escape = m_str_utils.pattern_escape
local sub = string.sub
local sub = string.sub
local u = m_str_utils.char
local u = m_str_utils.char
Line 52: Line 57:
--[==[
--[==[
Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by
Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by
the word {"key"} linking to a page describing the language's phonology, and with an added category
the word {"key"} linking to an Appendix page describing the language's phonology, and with an added category
{{cd|<var>lang</var> terms with IPA pronunciation}}. Other than the extra preceding text and category, this is identical
` ``lang`` terms with IPA pronunciation`. Other than the extra preceding text and category, this is identical
to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a
to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a
single parameter `data`, an object with the following fields:
single parameter `data`, an object with the following fields:
Line 59: Line 64:
   pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to
   pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to
   add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category
   add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category
   {{cd|<var>lang</var> terms with IPA pronunciation}}; and for determining the proper sort keys for categories. Unlike
   ` ``lang`` terms with IPA pronunciation`; and for determining the proper sort keys for categories. Unlike
   for {format_IPA_multiple()}, `lang` may not be {nil}.
   for {format_IPA_multiple()}, `lang` may not be {nil}.
* `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}.
* `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}.
Line 113: Line 118:
prefix_text = '<span class="error">' .. err .. '</span>'
prefix_text = '<span class="error">' .. err .. '</span>'
else
else
--[[if hasKey[lang:getCode()] then
if hasKey[lang:getCode()] then
prefix_text = "IPA for " .. langname
prefix_text = "IPA for " .. langname
else
else
prefix_text = "wikipedia:" .. langname .. " phonology"
prefix_text = langname
end]]
end
prefix_text = "IPA for " .. langname
prefix_text = "[[" .. prefix_text .. "|key]]"
prefix_text = "[[" .. prefix_text .. "|key]]"
end
end


local prefix = "[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>:&#32;"
local prefix = "[[wikt:Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>:&#32;"


local IPAs, categories = export.format_IPA_multiple(lang, items, separator, no_count, "raw")
local IPAs, categories = export.format_IPA_multiple(lang, items, separator, no_count, "raw")
Line 162: Line 165:
local function determine_repr(pron)
local function determine_repr(pron)
local reconstructed
local reconstructed
 
-- remove initial asterisk before representation marks, used on some Reconstruction pages
-- Temporarily remove any initial asterisk before representation marks,
-- which avoids having to account for it in the data, but set the
-- `reconstructed` flag.
if sub(pron, 1, 1) == "*" then
if sub(pron, 1, 1) == "*" then
reconstructed = true
reconstructed = true
pron = sub(pron, 2)
pron = sub(pron, 2)
end
end
 
local opening = match(pron, "^.[\128-\191]*")
-- Some representation types have aliases for convenience (e.g. "// //" is
local data = m_data.representation_types[opening]
-- an alias for "⫽ ⫽"), and these need to be substituted in before checking
-- for other data.
if data then
local opening, n = match(pron, "^.[\128-\191]*")
local closing = data[2]
local subs_data = m_data.representation_subs[opening]
if data and match(pron, closing .. "$", #opening + 1) then
if subs_data then
return data[1], opening, closing, reconstructed
pron, n = ugsub(pron, subs_data[1], subs_data[2])
-- If the substitution was made, `opening` needs to be changed to the
-- new opening character.
if n ~= 0 then
opening = subs_data[3]
end
end
 
-- Get the type data based on the opening character (if any), and set the
-- representation type if the closing character matches.
local type_data, repr, closing = m_data.representation_types[opening]
if type_data then
closing = type_data[2]
if type_data and match(pron, pattern_escape(closing) .. "$", #opening + 1) then
repr = type_data[1]
end
end
end
end
 
return nil, "", "", reconstructed
-- Default to the empty string.
if not repr then
opening, closing = "", ""
end
 
-- Reattach the asterisk if reconstructed.
if reconstructed then
pron = "*" .. pron
end
 
return pron, repr, opening, closing, reconstructed
end
end


local function hasInvalidSeparators(transcription)
local function hasInvalidSeparators(transcription)
if umatch(transcription, "%.[ˈˌ]") or umatch(transcription, "[ˈˌ][ .]") then
-- Escape certain characters as well as pauses, which have the format "(...)" (with any number of dots), to avoid false positives.
return true
transcription = transcription:gsub(".[\128-\191]*", m_symbols.separator_escapes)
else
:gsub("%(%.+%)", "\3")
return false
:gsub("[()]+", "")
end
return (
transcription:find("..", nil, true) or
transcription:match("%.%f[%z \1\2\3,:;]") or
transcription:match("\1%f[%z \2\3,:;]") or
transcription:match("\2%f[%z \1\3,:;]") or
transcription:match("\3[:;]") or
transcription:match("%f[^%z \1\2\3,]%.")
) and true or false
end
end


--[==[
--[==[
Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a
Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a
category {{cd|<var>lang</var> terms with IPA pronunciation}}). Individual pronunciations are formatted using
category ` ``lang`` terms with IPA pronunciation`). Individual pronunciations are formatted using
{format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations.
{format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations.
Parameters accepted are:
Parameters accepted are:
Line 325: Line 361:


if lang then
if lang then
-- The nature of hasInvalidSeparators() is such that we don't have to split a combined '/.../ [...]' spec
-- Add syllable count if the language's diphthongs are listed in [[Module:syllables]].
-- into its parts in order to process.
-- Don't do this if the term has spaces, a liaison mark (‿) or isn't in mainspace.
if lang:getCode() == "en" and hasInvalidSeparators(item.pron) then
if not no_count and namespace == 0 then
insert(categories, "English IPA pronunciations with invalid separators")
m_syllables = m_syllables or require(syllables_module)
local langcode = lang:getCode()
if m_data.langs_to_generate_syllable_count_categories[langcode] then
local raw_phonemic, phonetic, use_it = split_phonemic_phonetic(item.pron)
local phonemic, repr = determine_repr(raw_phonemic)
if not phonetic then -- not a '/.../ [...]' combined pronunciation
if m_data.langs_to_use_phonetic_notation[langcode] then
use_it = repr == "phonetic" and phonemic or nil
else
use_it = repr == "phonemic" and phonemic or nil
end
elseif repr == "phonetic" then
use_it = phonetic
elseif repr == "phonemic" then
use_it = phonemic
end
-- Note: two uses of find with plain patterns is much faster than umatch with [ ‿].
if use_it and not (find(use_it, " ") or find(use_it, "")) then
local syllable_count = m_syllables.getVowels(use_it, lang)
if syllable_count then
insert(categories, lang:getCanonicalName() .. " " .. syllable_count ..
"-syllable words")
end
end
end
end
end
end
end
Line 343: Line 403:
may have HTML added surrounding invalid characters so they appear in red.
may have HTML added surrounding invalid characters so they appear in red.
]=]
]=]
local function format_one_IPA(lang, pron, err, categories)
local function format_one_IPA(lang, raw_pron, err, categories)
-- Disallow wikilinks.
-- Disallow wikilinks.
if match(pron, "%[%[.-%]%]") then
if match(raw_pron, "%[%[.-%]%]") then
error("IPA input must not contain wikilinks.")
error("IPA input must not contain wikilinks.")
end
end
pron = decode_entities(pron)
raw_pron = decode_entities(raw_pron)


-- Detect the type of transcription.
-- Detect the type of transcription.
local repr, opening, closing, reconstructed = determine_repr(pron)
local pron, repr, opening, closing, reconstructed = determine_repr(raw_pron)
-- Strip any reconstruction asterisk and representation marks.
-- Strip any reconstruction asterisk and representation marks.
pron = sub(pron, #opening + 1 + (reconstructed and 1 or 0), -#closing - 1)
pron = sub(pron, #opening + 1 + (reconstructed and 1 or 0), -#closing - 1)
if not repr then
insert(categories, "IPA pronunciations with invalid representation marks")
-- insert(err, "invalid representation marks")
-- Removed because it's annoying when previewing pronunciation pages.
end
if repr ~= "orthographic" and lang and lang:getCode() == "en" and hasInvalidSeparators(pron) then
insert(categories, "English IPA pronunciations with invalid separators")
end


if pron == "" then
if pron == "" then
Line 411: Line 480:


if result ~= "" then
if result ~= "" then
local suggestions = {}
if lang then
-- Get the per_lang_valid data, and convert any per-language valid sequences to spaces.
local per_lang_valid = m_symbols.per_lang_valid[lang:getCode()]
if per_lang_valid then
if type(per_lang_valid) == "table" then
for _, pattern in pairs(per_lang_valid) do
result = ugsub(result, pattern, " ")
end
else -- Should be a string.
result = ugsub(result, per_lang_valid, " ")
end
end
end
local suggestions
-- Check for any invalid sequences, excluding anything in the per-language lookup table.
for k, v in pairs(m_symbols.invalid) do
for k, v in pairs(m_symbols.invalid) do
if find(result, k, 1, true) then
if find(result, k, nil, true) then
pron = pron:gsub(k, v)
if not suggestions then
suggestions = {}
end
insert(suggestions, k .. " with " .. v)
end
end
if suggestions and suggestions[1] then
suggestions = listToText(suggestions)
if is_content_page then
error("Invalid IPA: replace " .. suggestions)
end
end
insert(err, "replace " .. suggestions)
end
-- Convert any valid character sequences to spaces
for _, pattern in pairs(m_symbols.valid) do
result = ugsub(result, pattern, " ")
end
end
if not match(result, "^ *$") then
result = ugsub(result, "⁽[".. m_symbols.superscripts .. "]+⁾", "")
local category = "IPA pronunciations with invalid IPA characters"
local per_lang_valid
if not is_content_page then
if lang then
category = category .. "/non_mainspace"
per_lang_valid = m_symbols.per_lang_valid[lang:getCode()]
end
insert(categories, category)
insert(err, "invalid IPA characters (" .. result .. ")")
end
end
per_lang_valid = per_lang_valid or ""
result = ugsub(result, "[" .. m_symbols.valid .. per_lang_valid .. "]", "")
end
end


Line 458: Line 555:
else
else
local phoneme = usub(rest, 1, 1)
local phoneme = usub(rest, 1, 1)
insert(phonemes, "<span style=\"color: red\">" .. phoneme .. "</span>")
insert(phonemes, "<span style=\"color: var(--wikt-palette-red,red)\">" .. phoneme .. "</span>")
rest = usub(rest, 2)
rest = usub(rest, 2)
insert(categories, "IPA pronunciations with invalid phonemes/" .. lang:getCode())
insert(categories, "IPA pronunciations with invalid phonemes/" .. lang:getCode())
Line 501: Line 598:
end
end


if err[1] then
if err[1] and is_preview() then
err = '<span class="previewonly error" style="font-size: small;>&#32;' .. concat(err, ", ") .. "</span>"
err = '<span class="error" style="font-size: small;>&#32;' .. concat(err, ", ") .. "</span>"
else
else
err = ""
err = ""
end
end


return process_maybe_split_categories(split_output, categories, '<span class="IPA">' .. pron .. "</span>", lang,
return process_maybe_split_categories(split_output, categories, '<span class="IPA nowrap">' .. pron .. "</span>", lang,
err)
err)
end
end
Line 514: Line 611:
Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to
Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to
[[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The
[[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The
pronunciations are formatted by wrapping them in the {{cd|AHD}} and {{cd|enPR}} CSS classes and adding any left and
pronunciations are formatted by wrapping them in the `AHD` and `enPR` CSS classes and adding any left and
right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular
right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular
and accent qualifiers. There is a single parameter `data`, an object with the following fields:
and accent qualifiers. There is a single parameter `data`, an object with the following fields:
Line 531: Line 628:
]==]
]==]
function export.format_enPR_full(data)
function export.format_enPR_full(data)
local prefix = "[[Appendix:English pronunciation|enPR]]: "
local prefix = "[[wikt:Appendix:English pronunciation|enPR]]: "
local lang = require("Module:languages").getByCode("en")
local lang = require("Module:languages").getByCode("en")
local parts = {}
local parts = {}