Module:IPA: Difference between revisions

Jump to navigation Jump to search
587 bytes removed ,  18 November 2025
no edit summary
No edit summary
No edit summary
 
(15 intermediate revisions by the same user not shown)
Line 3: Line 3:
local force_cat = false -- for testing
local force_cat = false -- for testing


local m_data = mw.loadData("Module:IPA/data")
local pages_module = "Module:pages"
local m_str_utils = require("Module:string utilities")
local m_symbols = mw.loadData("Module:IPA/data/symbols")
local pron_qualifier_module = "Module:pron qualifier"
local pron_qualifier_module = "Module:pron qualifier"
local qualifier_module = "Module:qualifier"
local qualifier_module = "Module:qualifier"
local references_module = "Module:references"
local references_module = "Module:references"
local string_utilities_module = "Module:string utilities"
local syllables_module = "Module:syllables"
local syllables_module = "Module:syllables"
local utilities_module = "Module:utilities"
local utilities_module = "Module:utilities"
local m_data = mw.loadData("Module:IPA/data")
local m_str_utils = require(string_utilities_module)
local m_syllables -- [[Module:syllables]]; loaded below if needed
local m_syllables -- [[Module:syllables]]; loaded below if needed
local m_symbols = mw.loadData("Module:IPA/data/symbols")


local concat = table.concat
local concat = table.concat
local decode_entities = m_str_utils.decode_entities
local find = string.find
local find = string.find
local gmatch = m_str_utils.gmatch
local gmatch = m_str_utils.gmatch
local gsub = string.gsub
local gsub = string.gsub
local insert = table.insert
local insert = table.insert
local is_preview = require(pages_module).is_preview
local len = m_str_utils.len
local len = m_str_utils.len
local listToText = mw.text.listToText
local listToText = mw.text.listToText
local match = string.match
local match = string.match
local pattern_escape = m_str_utils.pattern_escape
local sub = string.sub
local sub = string.sub
local u = m_str_utils.char
local u = m_str_utils.char
local ufind = m_str_utils.find
local ugsub = m_str_utils.gsub
local ugsub = m_str_utils.gsub
local umatch = m_str_utils.match
local umatch = m_str_utils.match
Line 29: Line 34:


local namespace = mw.title.getCurrentTitle().namespace
local namespace = mw.title.getCurrentTitle().namespace
local is_content_page = namespace == 0 or namespace == 118
local is_content_page = namespace == 0 or namespace == 120


local function process_maybe_split_categories(split_output, categories, prontext, lang, errtext)
local function process_maybe_split_categories(split_output, categories, prontext, lang, errtext)
Line 53: Line 58:
Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by
Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by
the word {"key"} linking to an Appendix page describing the language's phonology, and with an added category
the word {"key"} linking to an Appendix page describing the language's phonology, and with an added category
{{cd|<var>lang</var> terms with IPA pronunciation}}. Other than the extra preceding text and category, this is identical
` ``lang`` terms with IPA pronunciation`. Other than the extra preceding text and category, this is identical
to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a
to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a
single parameter `data`, an object with the following fields:
single parameter `data`, an object with the following fields:
Line 59: Line 64:
   pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to
   pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to
   add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category
   add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category
   {{cd|<var>lang</var> terms with IPA pronunciation}}; and for determining the proper sort keys for categories. Unlike
   ` ``lang`` terms with IPA pronunciation`; and for determining the proper sort keys for categories. Unlike
   for {format_IPA_multiple()}, `lang` may not be {nil}.
   for {format_IPA_multiple()}, `lang` may not be {nil}.
* `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}.
* `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}.
Line 113: Line 118:
prefix_text = '<span class="error">' .. err .. '</span>'
prefix_text = '<span class="error">' .. err .. '</span>'
else
else
if hasKey[lang:getCode()] then
prefix_text = "IPA for " .. langname
prefix_text = "Appendix:" .. langname .. " pronunciation"
else
prefix_text = "wikipedia:" .. langname .. " phonology"
end
prefix_text = "[[" .. prefix_text .. "|key]]"
prefix_text = "[[" .. prefix_text .. "|key]]"
end
end


local prefix = "[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>:&#32;"
local prefix = "[[wikt:Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>:&#32;"


local IPAs, categories = export.format_IPA_multiple(lang, items, separator, no_count, "raw")
local IPAs, categories = export.format_IPA_multiple(lang, items, separator, no_count, "raw")
if is_content_page then
insert(categories, {
cat = langname .. " terms with IPA pronunciation",
sort_key = sort_key
})
end


local prontext = prefix .. IPAs
local prontext = prefix .. IPAs
Line 159: Line 153:


local function determine_repr(pron)
local function determine_repr(pron)
local repr_mark = {}
local reconstructed
local repr, reconstructed


-- remove initial asterisk before representation marks, used on some Reconstruction pages
-- Temporarily remove any initial asterisk before representation marks,
-- which avoids having to account for it in the data, but set the
-- `reconstructed` flag.
if sub(pron, 1, 1) == "*" then
if sub(pron, 1, 1) == "*" then
reconstructed = true
reconstructed = true
Line 168: Line 163:
end
end


local representation_types = {
-- Some representation types have aliases for convenience (e.g. "// //" is
['/'] = { right = '/', type = 'phonemic', },
-- an alias for "⫽ ⫽"), and these need to be substituted in before checking
['['] = { right = ']', type = 'phonetic', },
-- for other data.
['⟨'] = { right = '⟩', type = 'orthographic', },
local opening, n = match(pron, "^.[\128-\191]*")
['-'] = { type = 'rhyme' },
local subs_data = m_data.representation_subs[opening]
}
if subs_data then
pron, n = ugsub(pron, subs_data[1], subs_data[2])
-- If the substitution was made, `opening` needs to be changed to the
-- new opening character.
if n ~= 0 then
opening = subs_data[3]
end
end


repr_mark.i, repr_mark.f, repr_mark.left, repr_mark.right = ufind(pron, '^(.).-(.)$')
-- Get the type data based on the opening character (if any), and set the
-- representation type if the closing character matches.
local type_data, repr, closing = m_data.representation_types[opening]
if type_data then
closing = type_data[2]
if type_data and match(pron, pattern_escape(closing) .. "$", #opening + 1) then
repr = type_data[1]
end
end


local representation_type = representation_types[repr_mark.left]
-- Default to the empty string.
if not repr then
opening, closing = "", ""
end


if representation_type then
-- Reattach the asterisk if reconstructed.
if representation_type.right then
if reconstructed then
if repr_mark.right == representation_type.right then
pron = "*" .. pron
repr = representation_type.type
end
else
repr = representation_type.type
end
else
repr = nil
end
end


return repr, reconstructed
return pron, repr, opening, closing, reconstructed
end
end


local function hasInvalidSeparators(transcription)
local function hasInvalidSeparators(transcription)
if match(transcription, "%.\203[\136\140]") then -- [ˈˌ]
-- Escape certain characters as well as pauses, which have the format "(...)" (with any number of dots), to avoid false positives.
return true
transcription = transcription:gsub(".[\128-\191]*", m_symbols.separator_escapes)
else
:gsub("%(%.+%)", "\3")
return false
:gsub("[()]+", "")
end
return (
transcription:find("..", nil, true) or
transcription:match("%.%f[%z \1\2\3,:;]") or
transcription:match("\1%f[%z \2\3,:;]") or
transcription:match("\2%f[%z \1\3,:;]") or
transcription:match("\3[:;]") or
transcription:match("%f[^%z \1\2\3,]%.")
) and true or false
end
end


--[==[
--[==[
Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a
Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a
category {{cd|<var>lang</var> terms with IPA pronunciation}}). Individual pronunciations are formatted using
category ` ``lang`` terms with IPA pronunciation`). Individual pronunciations are formatted using
{format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations.
{format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations.
Parameters accepted are:
Parameters accepted are:
Line 258: Line 271:
if namespace == 10 then -- Template
if namespace == 10 then -- Template
insert(items, {pron = "/aɪ piː ˈeɪ/"})
insert(items, {pron = "/aɪ piː ˈeɪ/"})
else
insert(categories, "Pronunciation templates without a pronunciation")
end
end
end
end
Line 343: Line 354:
local langcode = lang:getCode()
local langcode = lang:getCode()
if m_data.langs_to_generate_syllable_count_categories[langcode] then
if m_data.langs_to_generate_syllable_count_categories[langcode] then
local phonemic, phonetic = split_phonemic_phonetic(item.pron)
local raw_phonemic, phonetic, use_it = split_phonemic_phonetic(item.pron)
local use_it
local phonemic, repr = determine_repr(raw_phonemic)
if not phonetic then -- not a '/.../ [...]' combined pronunciation
if not phonetic then -- not a '/.../ [...]' combined pronunciation
local repr = determine_repr(phonemic)
if m_data.langs_to_use_phonetic_notation[langcode] then
if m_data.langs_to_use_phonetic_notation[langcode] then
use_it = repr == "phonetic" and phonemic or nil
use_it = repr == "phonetic" and phonemic or nil
Line 366: Line 376:
end
end
end
end
end
-- The nature of hasInvalidSeparators() is such that we don't have to split a combined '/.../ [...]' spec
-- into its parts in order to process it.
if lang:getCode() == "en" and hasInvalidSeparators(item.pron) then
insert(categories, "IPA for English using .ˈ or .ˌ")
end
end
end
end
Line 386: Line 390:
may have HTML added surrounding invalid characters so they appear in red.
may have HTML added surrounding invalid characters so they appear in red.
]=]
]=]
local function format_one_IPA(lang, pron, err, categories)
local function format_one_IPA(lang, raw_pron, err, categories)
-- Remove wikilinks, so that wikilink brackets are not misinterpreted as indicating phonetic transcription
-- Disallow wikilinks.
local without_links = gsub(pron, "%[%[[^|%]]+|([^%]]+)%]%]", "%1")
if match(raw_pron, "%[%[.-%]%]") then
without_links = gsub(without_links, "%[%[[^%]]+%]%]", "%1")
error("IPA input must not contain wikilinks.")
 
-- Detect whether this is a phonemic or phonetic transcription
local repr, reconstructed = determine_repr(without_links)
 
if reconstructed then
pron = sub(pron, 2)
without_links = sub(without_links, 2)
end
end
raw_pron = decode_entities(raw_pron)


-- If valid, strip the representation marks
-- Detect the type of transcription.
if repr == "phonemic" then
local pron, repr, opening, closing, reconstructed = determine_repr(raw_pron)
pron = usub(pron, 2, -2)
without_links = usub(without_links, 2, -2)
-- Strip any reconstruction asterisk and representation marks.
elseif repr == "phonetic" then
pron = sub(pron, #opening + 1 + (reconstructed and 1 or 0), -#closing - 1)
pron = usub(pron, 2, -2)
without_links = usub(without_links, 2, -2)
elseif repr == "orthographic" then
pron = usub(pron, 2, -2)
without_links = usub(without_links, 2, -2)
elseif repr == "rhyme" then
pron = usub(pron, 2)
without_links = usub(without_links, 2)
else
insert(categories, "IPA pronunciations with invalid representation marks")
-- insert(err, "invalid representation marks")
-- Removed because it's annoying when previewing pronunciation pages.
end


if pron == "" then
if repr ~= "orthographic" and lang and lang:getCode() == "en" and hasInvalidSeparators(pron) then
insert(categories, "IPA pronunciations with no pronunciation present")
insert(categories, "English IPA pronunciations with invalid separators")
end
end


-- Check for obsolete and nonstandard symbols
-- Check for obsolete and nonstandard symbols
for i, symbol in ipairs(m_data.nonstandard) do
for _, symbol in ipairs(m_data.nonstandard) do
local result
local result
for nonstandard in gmatch(pron, symbol) do
for nonstandard in gmatch(pron, symbol) do
Line 430: Line 416:
end
end
insert(result, nonstandard)
insert(result, nonstandard)
insert(categories,
{cat = "IPA pronunciations with obsolete or nonstandard characters", sort_key = nonstandard}
)
end
end


Line 446: Line 429:
3. bolding
3. bolding
4. italics
4. italics
5. HTML entity for space
5. asterisk at beginning of transcription
6. asterisk at beginning of transcription
6. comma followed by spacing characters
7. comma followed by spacing characters
7. superscripts enclosed in superscript parentheses ]]
8. superscripts enclosed in superscript parentheses ]]
local found_HTML
local found_HTML
local result = gsub(without_links, "<(%a+)[^>]*>([^<]+)</%1>",
local result = gsub(pron, "<(%a+)[^>]*>([^<]+)</%1>",
function(tagName, content)
function(tagName, content)
found_HTML = true
found_HTML = true
Line 458: Line 440:
result = gsub(result, "'''([^']*)'''", "%1")
result = gsub(result, "'''([^']*)'''", "%1")
result = gsub(result, "''([^']*)''", "%1")
result = gsub(result, "''([^']*)''", "%1")
result = gsub(result, "&[^;]+;", "") -- This may catch things that are not valid character entities.
result = gsub(result, "^%*", "")
result = gsub(result, "^%*", "")
result = ugsub(result, ",%s+", "")
result = ugsub(result, ",%s+", "")
Line 470: Line 451:
pron = gsub(pron, vs15, "")
pron = gsub(pron, vs15, "")
end
end
pron = ugsub(pron, "(" .. vs15_class .. ")", "%1" .. vs15)
pron = ugsub(pron, vs15_class, "%0" .. vs15)
end
end


if result ~= "" then
if result ~= "" then
local suggestions = {}
if lang then
-- Get the per_lang_valid data, and convert any per-language valid sequences to spaces.
local per_lang_valid = m_symbols.per_lang_valid[lang:getCode()]
if per_lang_valid then
if type(per_lang_valid) == "table" then
for _, pattern in pairs(per_lang_valid) do
result = ugsub(result, pattern, " ")
end
else -- Should be a string.
result = ugsub(result, per_lang_valid, " ")
end
end
end
local suggestions
-- Check for any invalid sequences, excluding anything in the per-language lookup table.
for k, v in pairs(m_symbols.invalid) do
for k, v in pairs(m_symbols.invalid) do
if find(result, k, 1, true) then
if find(result, k, nil, true) then
if not suggestions then
suggestions = {}
end
insert(suggestions, k .. " with " .. v)
insert(suggestions, k .. " with " .. v)
end
end
end
end
if suggestions[1] then
if suggestions and suggestions[1] then
suggestions = listToText(suggestions)
suggestions = listToText(suggestions)
if is_content_page then
if is_content_page then
error("Invalid IPA: replace " .. suggestions)
error("Invalid IPA: replace " .. suggestions)
else
insert(err, "replace " .. suggestions)
end
end
insert(err, "replace " .. suggestions)
end
end
result = ugsub(result, "⁽[".. m_symbols.superscripts .. "]+⁾", "")
-- Convert any valid character sequences to spaces
local per_lang_valid
for _, pattern in pairs(m_symbols.valid) do
if lang then
result = ugsub(result, pattern, " ")
per_lang_valid = m_symbols.per_lang_valid[lang:getCode()]
end
per_lang_valid = per_lang_valid or ""
result = ugsub(result, "[" .. m_symbols.valid .. per_lang_valid .. "]", "")
if result ~= "" then
local category = "IPA pronunciations with invalid IPA characters"
if not is_content_page then
category = category .. "/non_mainspace"
end
insert(categories, category)
insert(err, "invalid IPA characters (" .. result .. ")")
end
end
end
end


if found_HTML then
if (repr == "phonemic" or repr == "rhyme") and lang and m_data.phonemes[lang:getCode()] then
insert(categories, "IPA pronunciations with paired HTML tags")
local valid_phonemes = m_data.phonemes[lang:getCode()]
end
local rest = pron
local phonemes = {}


if repr == "phonemic" or repr == "rhyme" then
while #rest > 0 do
if lang and m_data.phonemes[lang:getCode()] then
local longestmatch, longestmatch_len = "", 0
local valid_phonemes = m_data.phonemes[lang:getCode()]
local rest = pron
local phonemes = {}


while #rest > 0 do
local rest_init = sub(rest, 1, 1)
local longestmatch, longestmatch_len = "", 0
if rest_init == "(" or rest_init == ")" then
 
longestmatch = rest_init
local rest_init = sub(rest, 1, 1)
longestmatch_len = 1
if rest_init == "(" or rest_init == ")" then
else
longestmatch = rest_init
for _, phoneme in ipairs(valid_phonemes) do
longestmatch_len = 1
local phoneme_len = len(phoneme)
else
if phoneme_len > longestmatch_len and usub(rest, 1, phoneme_len) == phoneme then
for _, phoneme in ipairs(valid_phonemes) do
longestmatch = phoneme
local phoneme_len = len(phoneme)
longestmatch_len = len(longestmatch)
if phoneme_len > longestmatch_len and usub(rest, 1, phoneme_len) == phoneme then
longestmatch = phoneme
longestmatch_len = len(longestmatch)
end
end
end
end
end
end


if longestmatch_len > 0 then
if longestmatch_len > 0 then
insert(phonemes, longestmatch)
insert(phonemes, longestmatch)
rest = usub(rest, longestmatch_len + 1)
rest = usub(rest, longestmatch_len + 1)
else
else
local phoneme = usub(rest, 1, 1)
local phoneme = usub(rest, 1, 1)
insert(phonemes, "<span style=\"color: red\">" .. phoneme .. "</span>")
insert(phonemes, "<span style=\"color: var(--wikt-palette-red,red)\">" .. phoneme .. "</span>")
rest = usub(rest, 2)
rest = usub(rest, 2)
insert(categories, "IPA pronunciations with invalid phonemes/" .. lang:getCode())
end
end
end
pron = concat(phonemes)
end
end


if repr == "phonemic" then
pron = concat(phonemes)
pron = "/" .. pron .. "/"
else
pron = "-" .. pron
end
elseif repr == "phonetic" then
pron = "[" .. pron .. "]"
elseif repr == "orthographic" then
pron = "⟨" .. pron .. "⟩"
end
 
if reconstructed then
pron = "*" .. pron
end
end


return pron
return (reconstructed and "*" or "") .. opening .. pron .. closing
end
end


Line 595: Line 560:
end
end


if err[1] then
if err[1] and is_preview() then
err = '<span class="previewonly error" style="font-size: small;>&#32;' .. concat(err, ", ") .. "</span>"
err = '<span class="error" style="font-size: small;>&#32;' .. concat(err, ", ") .. "</span>"
else
else
err = ""
err = ""
end
end


return process_maybe_split_categories(split_output, categories, '<span class="IPA">' .. pron .. "</span>", lang,
return process_maybe_split_categories(split_output, categories, '<span class="IPA nowrap">' .. pron .. "</span>", lang,
err)
err)
end
end
Line 608: Line 573:
Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to
Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to
[[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The
[[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The
pronunciations are formatted by wrapping them in the {{cd|AHD}} and {{cd|enPR}} CSS classes and adding any left and
pronunciations are formatted by wrapping them in the `AHD` and `enPR` CSS classes and adding any left and
right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular
right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular
and accent qualifiers. There is a single parameter `data`, an object with the following fields:
and accent qualifiers. There is a single parameter `data`, an object with the following fields:
Line 625: Line 590:
]==]
]==]
function export.format_enPR_full(data)
function export.format_enPR_full(data)
local prefix = "[[Appendix:English pronunciation|enPR]]: "
local prefix = "[[wikt:Appendix:English pronunciation|enPR]]: "
local lang = require("Module:languages").getByCode("en")
local lang = require("Module:languages").getByCode("en")
local parts = {}
local parts = {}

Navigation menu