Module:IPA: Difference between revisions

From Linguifex
Jump to navigation Jump to search
No edit summary
No edit summary
 
(14 intermediate revisions by the same user not shown)
Line 1: Line 1:
local export = {}
local export = {}
-- [[Module:IPA/data]]


local m_data = mw.loadData('Module:IPA/data') -- [[Module:IPA/data]]
local force_cat = false -- for testing
local m_symbols = mw.loadData('Module:IPA/data/symbols') -- [[Module:IPA/data/symbols]]
local m_syllables -- [[Module:syllables]]; loaded below if needed


local sub = mw.ustring.sub
local pron_qualifier_module = "Module:pron qualifier"
local find = mw.ustring.find
local qualifier_module = "Module:qualifier"
local gsub = mw.ustring.gsub
local references_module = "Module:references"
local match = mw.ustring.match
local string_utilities_module = "Module:string utilities"
local gmatch = mw.ustring.gmatch
local utilities_module = "Module:utilities"
local U = mw.ustring.char
 
local m_data = mw.loadData("Module:IPA/data")
local m_str_utils = require(string_utilities_module)
local m_symbols = mw.loadData("Module:IPA/data/symbols")
 
local concat = table.concat
local decode_entities = m_str_utils.decode_entities
local find = string.find
local gmatch = m_str_utils.gmatch
local gsub = string.gsub
local insert = table.insert
local len = m_str_utils.len
local listToText = mw.text.listToText
local match = string.match
local sub = string.sub
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local umatch = m_str_utils.match
local usub = m_str_utils.sub
 
local namespace = mw.title.getCurrentTitle().namespace
local is_content_page = namespace == 0 or namespace == 118
 
--[==[
Common tail used by the formatting functions to decide how the pronunciation text, the error text and the
categories are handed back to the caller.
* `split_output`: if {"raw"}, `categories` is passed through untouched; for any other value (including {nil}) the
  list is turned into a string, either via {format_categories()} in [[Module:utilities]] or the empty string when the
  list has no first element.
* `categories`: list of categories (must be indexable).
* `prontext`: the already-formatted pronunciation text.
* `lang`: forwarded to {format_categories()}; not otherwise inspected here.
* `errtext`: optional error text.
If `split_output` is truthy, returns `prontext`, the categories and (only when truthy) `errtext` as separate values;
otherwise returns a single string: `prontext`, then `errtext` (if any), then the formatted categories.
]==]
local function process_maybe_split_categories(split_output, categories, prontext, lang, errtext)
	-- Everything except "raw" mode wants the categories collapsed into a string.
	if split_output ~= "raw" then
		if not categories[1] then
			categories = ""
		else
			categories = require(utilities_module).format_categories(categories, lang, nil, nil, force_cat)
		end
	end

	-- Unsplit mode: glue the pieces together into one string.
	if not split_output then
		return prontext .. (errtext or "") .. categories
	end

	-- Split mode (for use of IPA in links, etc.): only return the error text slot when there is one.
	if errtext then
		return prontext, categories, errtext
	end
	return prontext, categories
end
 
--[==[
Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by
the word {"key"} linking to a page describing the language's phonology, and with an added category
{{cd|<var>lang</var> terms with IPA pronunciation}}. Other than the extra preceding text and category, this is identical
to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a
single parameter `data`, an object with the following fields:
* `lang`: Object representing the language of the pronunciations, which is used when adding cleanup categories for
  pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to
  add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category
  {{cd|<var>lang</var> terms with IPA pronunciation}}; and for determining the proper sort keys for categories. Unlike
  for {format_IPA_multiple()}, `lang` may not be {nil}.
* `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}.
* `err`: If not {nil}, a string containing an error message to use in place of the link to the language's phonology.
* `separator`: The default separator to use when separating formatted items. Defaults to {", "}. Does not apply to the
  first item, where the default separator is always the empty string. Overridden by the per-item `separator` field in
  `items`.
* `sort_key`: Explicit sort key used for categories.
* `no_count`: Suppress adding a {#-syllable words} category such as [[:Category:Italian 2-syllable words]]. Note that
  only certain languages add such categories to begin with, because it depends on knowing how to count syllables in a
  given language, which depends on the phonology of the language. Also, this does not suppress the addition of cleanup
  or other categories. If you need them suppressed, use `split_output` to return the categories separately and ignore
  them.
* `split_output`: If not given, the return value is a concatenation of the formatted pronunciation and formatted
  categories. Otherwise, two values are returned: the formatted pronunciation and the categories. If `split_output` is
  the value {"raw"}, the categories are returned in list form, where the list elements are a combination of category
  strings and category objects of the form suitable for passing to {format_categories()} in [[Module:utilities]]. If
  `split_output` is any other value besides {nil}, the categories are returned as a pre-formatted concatenated string.
* `include_langname`: If specified, prefix the result with the language name, followed by a colon.
* `q`: {nil} or a list of left qualifiers (as in {{tl|q}}) to display at the beginning, before the formatted
  pronunciations and preceding {"IPA:"}.
* `qq`: {nil} or a list of right qualifiers to display after all formatted pronunciations.
* `a`: {nil} or a list of left accent qualifiers (as in {{tl|a}}) to display at the beginning, before the formatted
  pronunciations and preceding {"IPA:"}.
* `aa`: {nil} or a list of right accent qualifiers to display after all formatted pronunciations.
]==]
function export.format_IPA_full(data)
if type(data) ~= "table" or data.getCode then
error("Must now supply a table of arguments to format_IPA_full(); first argument should be that table, not a language object")
end
local lang = data.lang
local items = data.items
local err = data.err
local separator = data.separator
local sort_key = data.sort_key
local no_count = data.no_count
local split_output = data.split_output
local q = data.q
local qq = data.qq
local a = data.a
local aa = data.aa
local include_langname = data.include_langname


function export.format_IPA_full(lang, items, err, separator, sortKey, no_count)
local IPA_key, key_link, err_text, prefix, IPAs, category
local hasKey = m_data.langs_with_infopages
local hasKey = m_data.langs_with_infopages
local namespace = mw.title.getCurrentTitle().nsText
 
if not lang or not lang.getCode then
error("Must specify language to format_IPA_full()")
end
local langname = lang:getCanonicalName()
 
local prefix_text
if err then
if err then
err_text = '<span class="error">' .. err .. '</span>'
prefix_text = '<span class="error">' .. err .. '</span>'
else
else
key_link = "[[IPA for " .. lang:getCanonicalName() .. "|key]]"
--[[if hasKey[lang:getCode()] then
prefix_text = "IPA for " .. langname
else
prefix_text = "wikipedia:" .. langname .. " phonology"
end]]
prefix_text = "IPA for " .. langname
prefix_text = "[[" .. prefix_text .. "|key]]"
end
end


local prefix = "[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>:&#32;"
local prefix = "[[wikt:Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. ( key_link or err_text ) .. ")</sup>:&#32;"
 
local IPAs, categories = export.format_IPA_multiple(lang, items, separator, no_count, "raw")
IPAs = export.format_IPA_multiple(lang, items, separator, no_count)
 
if is_content_page then
insert(categories, {
cat = langname .. " terms with IPA pronunciation",
sort_key = sort_key
})
end
 
local prontext = prefix .. IPAs
if q and q[1] or qq and qq[1] or a and a[1] or aa and aa[1] then
prontext = require(pron_qualifier_module).format_qualifiers {
lang = lang,
text = prontext,
q = q,
qq = qq,
a = a,
aa = aa,
}
end
if include_langname then
prontext = langname .. ": " .. prontext
end
return process_maybe_split_categories(split_output, categories, prontext, lang)
end


return prefix .. IPAs
--[==[
Split a combined phonemic/phonetic spec of the form {/.../ [...]} (optionally preceded by a single asterisk) into its
two halves. When the whole string matches that shape, two values are returned: the phonemic part and the phonetic part,
each prefixed with the asterisk if one was present. Otherwise `pron` is returned unchanged, followed by {nil}.
]==]
local function split_phonemic_phonetic(pron)
	-- The first capture is "" (not nil) when there is no asterisk, so it doubles as the "did it match" flag.
	local asterisk, slashed, bracketed = match(pron, "^(%*?)(/.-/)%s+(%[.-%])$")
	if not asterisk then
		return pron, nil
	end
	return asterisk .. slashed, asterisk .. bracketed
end
end


local function determine_repr(pron)
local function determine_repr(pron)
local repr_mark = {}
local reconstructed
local repr, reconstructed
-- remove initial asterisk before representation marks, used on some Reconstruction pages
-- remove initial asterisk before representation marks, used on some Reconstruction pages
if find(pron, "^%*") then
if sub(pron, 1, 1) == "*" then
reconstructed = true
reconstructed = true
pron = sub(pron, 2)
pron = sub(pron, 2)
end
end
local representation_types = {
local opening = match(pron, "^.[\128-\191]*")
['/'] = { right = '/', type = 'phonemic', },
local data = m_data.representation_types[opening]
['['] = { right = ']', type = 'phonetic', },
['⟨'] = { right = '⟩', type = 'orthographic', },
['-'] = { type = 'rhyme' },
}
repr_mark.i, repr_mark.f, repr_mark.left, repr_mark.right = find(pron, '^(.).-(.)$')
if data then
local closing = data[2]
local representation_type = representation_types[repr_mark.left]
if data and match(pron, closing .. "$", #opening + 1) then
return data[1], opening, closing, reconstructed
if representation_type then
if representation_type.right then
if repr_mark.right == representation_type.right then
repr = representation_type.type
end
else
repr = representation_type.type
end
end
else
repr = nil
end
end
return repr, reconstructed
return nil, "", "", reconstructed
end
end


local function hasInvalidSeparators(transcription)
local function hasInvalidSeparators(transcription)
if find(transcription, "%.[ˈˌ]") then
if umatch(transcription, "%.[ˈˌ]") or umatch(transcription, "[ˈˌ][ .]") then
return true
return true
else
else
Line 76: Line 190:
end
end


function export.format_IPA_multiple(lang, items, separator, no_count)
--[==[
Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a
category {{cd|<var>lang</var> terms with IPA pronunciation}}). Individual pronunciations are formatted using
{format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations.
Parameters accepted are:
* `lang` is an object representing the language of the pronunciations, which is used when adding cleanup categories for
  pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to
  add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); and for computing the
  proper sort keys for categories. `lang` may be {nil}.
* `items` is a list of pronunciations, each of which is an object with the following properties:
** `pron`: the pronunciation, in the same format as is accepted by {format_IPA()}, i.e. it should be either phonemic
    (surrounded by {/.../}), phonetic (surrounded by {[...]}), orthographic (surrounded by {⟨...⟩}) or a rhyme
(beginning with a hyphen);
** `pretext`: text to display directly before the formatted pronunciation, inside of any qualifiers or accent
    qualifiers;
** `posttext`: text to display directly after the formatted pronunciation, inside of any qualifiers or accent
    qualifiers;
** `q` or `qualifiers`: {nil} or a list of left qualifiers (as in {{tl|q}}) to display before the formatted
    pronunciation; note that `qualifiers` is deprecated;
** `qq`: {nil} or a list of right qualifiers to display after the formatted pronunciation;
** `a`: {nil} or a list of left accent qualifiers (as in {{tl|a}}) to display before the formatted pronunciation;
** `aa`: {nil} or a list of right accent qualifiers to display after the formatted pronunciation;
** `refs`: {nil} or a list of references or reference specs to add after the pronunciation and any posttext and
    qualifiers; the value of a list item is either a string containing the reference text (typically a call to a
citation template such as {{tl|cite-book}}, or a template wrapping such a call), or an object with fields `text`
(the reference text), `name` (the name of the reference, as in {{cd|<nowiki><ref name="foo">...</ref></nowiki>}}
or {{cd|<nowiki><ref name="foo" /></nowiki>}}) and/or `group` (the group of the reference, as in
{{cd|<nowiki><ref name="foo" group="bar">...</ref></nowiki>}} or
{{cd|<nowiki><ref name="foo" group="bar"/></nowiki>}}); this uses a parser function to format the reference
appropriately and insert a footnote number that hyperlinks to the actual reference, located in the
{{cd|<nowiki><references /></nowiki>}} section;
** `gloss`: {nil} or a gloss (definition) for this item, if different definitions have different pronunciations;
** `pos`: {nil} or a part of speech for this item, if different parts of speech have different pronunciations;
** `separator`: the separator text to insert directly before the formatted pronunciation and all qualifiers, accent
  qualifiers and pre-text; defaults to the outer `separator` parameter.
* `separator`: The default separator to use when separating formatted items. Defaults to {", "}. Does not apply to the
  first item, where the default separator is always the empty string. Overridden by the per-item `separator` field in
  `items`.
* `no_count`: Suppress adding a {#-syllable words} category such as [[:Category:Italian 2-syllable words]]. Note that
  only certain languages add such categories to begin with, because it depends on knowing how to count syllables in a
  given language, which depends on the phonology of the language. Also, this does not suppress the addition of cleanup
  categories. If you need them suppressed, use `split_output` to return the categories separately and ignore them.
* `split_output`: If not given, the return value is a concatenation of the formatted pronunciation and formatted
  categories. Otherwise, two values are returned: the formatted pronunciation and the categories. If `split_output` is
  the value {"raw"}, the categories are returned in list form, where the list elements are a combination of category
  strings and category objects of the form suitable for passing to {format_categories()} in [[Module:utilities]]. If
  `split_output` is any other value besides {nil}, the categories are returned as a pre-formatted concatenated string.
]==]
function export.format_IPA_multiple(lang, items, separator, no_count, split_output)
local categories = {}
local categories = {}
separator = separator or ', '
separator = separator or ", "
 
-- Format
-- Format
if not items[1] then
if not items[1] then
if mw.title.getCurrentTitle().nsText == "Template" then
if namespace == 10 then -- Template
table.insert(items, {pron = "/aɪ piː ˈeɪ/"})
insert(items, {pron = "/aɪ piː ˈeɪ/"})
else
else
table.insert(categories, "[[Category:Pronunciation templates without a pronunciation]]")
insert(categories, "Pronunciation templates without a pronunciation")
end
end
end
end
 
local bits = {}
local bits = {}
 
for _, item in ipairs(items) do
for i, item in ipairs(items) do
local bit = export.format_IPA(lang, item.pron)
local bit
 
-- If the pronunciation is entirely empty, allow this and don't do anything, so that e.g. the pretext and/or
-- posttext can be specified to force something like ''unknown'' to appear in place of the pronunciation
-- (as happens e.g. when ? is used as a respelling in [[Module:ca-IPA]]; see [[guèiser]] for an example).
if item.pron == "" then
bit = ""
else
local item_categories, errtext
bit, item_categories, errtext = export.format_IPA(lang, item.pron, "raw")
bit = bit .. errtext
for _, cat in ipairs(item_categories) do
insert(categories, cat)
end
end
 
if item.pretext then
if item.pretext then
bit = item.pretext .. bit
bit = item.pretext .. bit
end
end
 
if item.posttext then
if item.posttext then
bit = bit .. item.posttext
bit = bit .. item.posttext
end
end
 
if item.qualifiers and item.qualifiers[1] then
local has_qualifiers = item.q and item.q[1] or item.qq and item.qq[1] or item.qualifiers and item.qualifiers[1]
bit = require("Module:qualifier").format_qualifier(item.qualifiers) .. " " .. bit
or item.a and item.a[1] or item.aa and item.aa[1]
local has_gloss_or_pos = item.gloss or item.pos
if has_qualifiers or has_gloss_or_pos then
-- FIXME: Currently we tack the gloss and POS (in that order) onto the end of the regular left qualifiers.
-- Should we do something different?
local q = item.q
if has_gloss_or_pos then
q = mw.clone(item.q) or {}
if item.gloss then
local m_qualifier = require(qualifier_module)
insert(q, m_qualifier.wrap_qualifier_css("“", "quote") .. item.gloss ..
m_qualifier.wrap_qualifier_css("”", "quote"))
end
if item.pos then
-- FIXME: Consider expanding aliases as found in [[Module:headword/data]] or similar.
insert(q, item.pos)
end
end
 
bit = require("Module:pron qualifier").format_qualifiers {
lang = lang,
text = bit,
q = q,
qq = item.qq,
qualifiers = item.qualifiers,
a = item.a,
aa = item.aa,
}
end
end


if item.refs or item.note then
if item.note then
local refspecs
-- Support removed on 2024-06-15.
if item.note then
error("Support for `.note` has been removed; switch to `.refs` (which must be a list)")
-- FIXME: eliminate item.note in favor of item.refs. Use tracking to find places
end
-- that use item.note.
if item.refs then
refspecs = {item.note}
local refspecs = item.refs
else
refspecs = item.refs
end
local refs = {}
if #refspecs > 0 then
if #refspecs > 0 then
for _, refspec in ipairs(refspecs) do
bit = bit .. require(references_module).format_references(refspecs)
if type(refspec) ~= "table" then
refspec = {text = refspec}
end
local refargs
if refspec.name or refspec.group then
refargs = {name = refspec.name, group = refspec.group}
end
table.insert(refs, mw.getCurrentFrame():extensionTag("ref", refspec.text, refargs))
end
bit = bit .. table.concat(refs)
end
end
end
end


table.insert(bits, bit)
bit = (item.separator or (i == 1 and "" or separator)) .. bit
 
insert(bits, bit)
 
if lang then
if lang then
-- Add syllable count if the language's diphthongs are listed in [[Module:syllables]].
-- The nature of hasInvalidSeparators() is such that we don't have to split a combined '/.../ [...]' spec
-- Don't do this if the term has spaces or a liaison mark (‿).
-- into its parts in order to process.
if not no_count and mw.title.getCurrentTitle().namespace == 0 then
m_syllables = m_syllables or require('Module:syllables')
local langcode = lang:getCode()
if m_data.langs_to_generate_syllable_count_categories[langcode] then
local repr = determine_repr(item.pron)
local use_it
if m_data.langs_to_use_phonetic_notation[langcode] then
use_it = repr == "phonetic"
else
use_it = repr == "phonemic"
end
if use_it and not find(item.pron, "[ ‿]") then
local syllable_count = m_syllables.getVowels(item.pron, lang)
if syllable_count then
table.insert(categories, "[[Category:" .. lang:getCanonicalName() .. " " .. syllable_count .. "-syllable words]]")
end
end
end
end
 
if lang:getCode() == "en" and hasInvalidSeparators(item.pron) then
if lang:getCode() == "en" and hasInvalidSeparators(item.pron) then
table.insert(categories, "[[Category:IPA for English using .ˈ or .ˌ]]")
insert(categories, "English IPA pronunciations with invalid separators")
end
end
end
end
end
end


return table.concat(bits, separator) .. table.concat(categories)
return process_maybe_split_categories(split_output, categories, concat(bits), lang)
end
end


-- Takes an IPA pronunciation and formats it and adds cleanup categories.
--[=[
function export.format_IPA(lang, pron, split_output)
Format a single IPA pronunciation, which cannot be a combined spec (such as {/.../ [...]}). This has been extracted from
local err = {}
{format_IPA()} to allow the latter to handle such combined specs. This works like {format_IPA()} but requires that
local categories = {}
pre-created {err} (for error messages) and {categories} lists be passed in, and adds any generated error messages and
categories to those lists. A single value is returned, the pronunciation, which is usually the same as passed in, but
-- Remove wikilinks, so that wikilink brackets are not misinterpreted as
may have HTML added surrounding invalid characters so they appear in red.
-- indicating phonemic transcription
]=]
local str_gsub = string.gsub
local function format_one_IPA(lang, pron, err, categories)
local without_links = str_gsub(pron, '%[%[[^|%]]+|([^%]]+)%]%]', '%1')
-- Disallow wikilinks.
without_links = str_gsub(without_links, '%[%[[^%]]+%]%]', '%1')
if match(pron, "%[%[.-%]%]") then
error("IPA input must not contain wikilinks.")
-- Detect whether this is a phonemic or phonetic transcription
local repr, reconstructed = determine_repr(without_links)
if reconstructed then
pron = sub(pron, 2)
end
end
-- If valid, strip the representation marks
pron = decode_entities(pron)
if repr == "phonemic" then
 
pron = sub(pron, 2, -2)
-- Detect the type of transcription.
without_links = sub(without_links, 2, -2)
local repr, opening, closing, reconstructed = determine_repr(pron)
elseif repr == "phonetic" then
pron = sub(pron, 2, -2)
without_links = sub(without_links, 2, -2)
elseif repr == "orthographic" then
pron = sub(pron, 2, -2)
without_links = sub(without_links, 2, -2)
elseif repr == "rhyme" then
pron = sub(pron, 2)
without_links = sub(without_links, 2)
else
table.insert(categories, "[[Category:IPA pronunciations with invalid representation marks]]")
-- table.insert(err, "invalid representation marks")
-- Removed because it's annoying when previewing pronunciation pages.
end
-- Strip any reconstruction asterisk and representation marks.
pron = sub(pron, #opening + 1 + (reconstructed and 1 or 0), -#closing - 1)
if pron == "" then
if pron == "" then
table.insert(categories, "[[Category:IPA pronunciations with no pronunciation present]]")
insert(categories, "IPA pronunciations with no pronunciation present")
end
end
 
-- Check for obsolete and nonstandard symbols
-- Check for obsolete and nonstandard symbols
for i, symbol in ipairs(m_data.nonstandard) do
for _, symbol in ipairs(m_data.nonstandard) do
local result
local result
for nonstandard in gmatch(pron, symbol) do
for nonstandard in gmatch(pron, symbol) do
Line 213: Line 368:
result = {}
result = {}
end
end
table.insert(result, nonstandard)
insert(result, nonstandard)
table.insert(categories, "[[Category:IPA pronunciations with obsolete or nonstandard characters|" .. nonstandard .. "]]")
insert(categories,
{cat = "IPA pronunciations with obsolete or nonstandard characters", sort_key = nonstandard}
)
end
end
 
if result then
if result then
table.insert(err, "obsolete or nonstandard characters (" .. table.concat(result) .. ")")
insert(err, "obsolete or nonstandard characters (" .. concat(result) .. ")")
break
break
end
end
end
end
 
--[[ Check for invalid symbols after removing the following:
--[[ Check for invalid symbols after removing the following:
1. wikilinks (handled above)
1. wikilinks (handled above)
Line 228: Line 385:
3. bolding
3. bolding
4. italics
4. italics
5. HTML entity for space
5. asterisk at beginning of transcription
6. asterisk at beginning of transcription
6. comma followed by spacing characters
7. comma followed by spacing characters
7. superscripts enclosed in superscript parentheses ]]
8. superscripts enclosed in superscript parentheses ]]
local found_HTML
local found_HTML
local result = str_gsub(without_links, "<(%a+)[^>]*>([^<]+)</%1>",
local result = gsub(pron, "<(%a+)[^>]*>([^<]+)</%1>",
function(tagName, content)
function(tagName, content)
found_HTML = true
found_HTML = true
return content
return content
end)
end)
result = str_gsub(result, "'''([^']*)'''", "%1")
result = gsub(result, "'''([^']*)'''", "%1")
result = str_gsub(result, "''([^']*)''", "%1")
result = gsub(result, "''([^']*)''", "%1")
result = str_gsub(result, "&[^;]+;", "") -- This may catch things that are not valid character entities.
result = gsub(result, "^%*", "")
result = str_gsub(result, "^%*", "")
result = ugsub(result, ",%s+", "")
result = gsub(result, ",%s+", "")
 
result = gsub(result, "⁽[".. m_symbols.superscripts .. "]+⁾", "")
result = gsub(result, '[' .. m_symbols.valid .. ']', '')
-- VS15
-- VS15
local vs15_class = "[" .. m_symbols.add_vs15 .. "]"
local vs15_class = "[" .. m_symbols.add_vs15 .. "]"
if mw.ustring.find(pron, vs15_class) then
if umatch(pron, vs15_class) then
local vs15 = U(0xFE0E)
local vs15 = u(0xFE0E)
if mw.ustring.find(result, vs15) then
if find(result, vs15) then
result = gsub(result, vs15, "")
result = gsub(result, vs15, "")
pron = mw.ustring.gsub(pron, vs15, "")
pron = gsub(pron, vs15, "")
end
end
pron = mw.ustring.gsub(pron, "(" .. vs15_class .. ")", "%1" .. vs15)
pron = ugsub(pron, vs15_class, "%0" .. vs15)
end
end


if result ~= '' then
if result ~= "" then
local suggestions = {}
local suggestions = {}
mw.log(pron, result)
for k, v in pairs(m_symbols.invalid) do
local namespace = mw.title.getCurrentTitle().namespace
if find(result, k, 1, true) then
local category
pron = pron:gsub(k, v)
if namespace == 0 then
-- main namespace
category = "IPA pronunciations with invalid IPA characters"
elseif namespace == 118 then
-- reconstruction namespace
category = "IPA pronunciations with invalid IPA characters/reconstruction"
else
category = "IPA pronunciations with invalid IPA characters/non_mainspace"
end
for character in gmatch(result, ".") do
local suggestion = m_symbols.suggestions[character]
if suggestion then
table.insert(suggestions, character .. " with " .. suggestion)
end
end
table.insert(categories, "[[Category:" .. category .. "|" .. character .. "]]")
end
end
table.insert(err, "invalid IPA characters (" .. result .. ")")
if suggestions[1] then
result = ugsub(result, "⁽[".. m_symbols.superscripts .. "]+⁾", "")
table.insert(err, "replace " .. table.concat(suggestions, ", "))
local per_lang_valid
if lang then
per_lang_valid = m_symbols.per_lang_valid[lang:getCode()]
end
end
per_lang_valid = per_lang_valid or ""
result = ugsub(result, "[" .. m_symbols.valid .. per_lang_valid .. "]", "")
end
end
 
if found_HTML then
if found_HTML then
table.insert(categories, "[[Category:IPA pronunciations with paired HTML tags]]")
insert(categories, "IPA pronunciations with paired HTML tags")
end
end
 
-- Reference inside IPA template usage
if (repr == "phonemic" or repr == "rhyme") and lang and m_data.phonemes[lang:getCode()] then
-- FIXME: Doesn't work; you can't put HTML in module output.
local valid_phonemes = m_data.phonemes[lang:getCode()]
--if mw.ustring.find(pron, '</ref>') then
local rest = pron
-- table.insert(categories, "[[Category:IPA pronunciations with reference]]")
local phonemes = {}
--end
 
while #rest > 0 do
if repr == "phonemic" or repr == "rhyme" then
local longestmatch, longestmatch_len = "", 0
if lang and m_data.phonemes[lang:getCode()] then
 
local valid_phonemes = m_data.phonemes[lang:getCode()]
local rest_init = sub(rest, 1, 1)
local rest = pron
if rest_init == "(" or rest_init == ")" then
local phonemes = {}
longestmatch = rest_init
longestmatch_len = 1
while mw.ustring.len(rest) > 0 do
else
local longestmatch = ""
for _, phoneme in ipairs(valid_phonemes) do
local phoneme_len = len(phoneme)
if sub(rest, 1, 1) == "(" or sub(rest, 1, 1) == ")" then
if phoneme_len > longestmatch_len and usub(rest, 1, phoneme_len) == phoneme then
longestmatch = sub(rest, 1, 1)
longestmatch = phoneme
else
longestmatch_len = len(longestmatch)
for _, phoneme in ipairs(valid_phonemes) do
if mw.ustring.len(phoneme) > mw.ustring.len(longestmatch) and sub(rest, 1, mw.ustring.len(phoneme)) == phoneme then
longestmatch = phoneme
end
end
end
end
end
if mw.ustring.len(longestmatch) > 0 then
table.insert(phonemes, longestmatch)
rest = sub(rest, mw.ustring.len(longestmatch) + 1)
else
local phoneme = sub(rest, 1, 1)
table.insert(phonemes, "<span style=\"color: red\">" .. phoneme .. "</span>")
rest = sub(rest, 2)
table.insert(categories, "[[Category:IPA pronunciations with invalid phonemes/" .. lang:getCode() .. "]]")
end
end
end
 
pron = table.concat(phonemes)
if longestmatch_len > 0 then
insert(phonemes, longestmatch)
rest = usub(rest, longestmatch_len + 1)
else
local phoneme = usub(rest, 1, 1)
insert(phonemes, "<span style=\"color: red\">" .. phoneme .. "</span>")
rest = usub(rest, 2)
insert(categories, "IPA pronunciations with invalid phonemes/" .. lang:getCode())
end
end
end
 
if repr == "phonemic" then
pron = concat(phonemes)
pron = "/" .. pron .. "/"
end
else
 
pron = "-" .. pron
return (reconstructed and "*" or "") .. opening .. pron .. closing
end
end
elseif repr == "phonetic" then
 
pron = "[" .. pron .. "]"
--[==[
elseif repr == "orthographic" then
Format an IPA pronunciation. This wraps the pronunciation in appropriate CSS classes and adds cleanup categories and
pron = "⟨" .. pron .. "⟩"
error messages as needed. The pronunciation `pron` should be either phonemic (surrounded by {/.../}), phonetic
(surrounded by {[...]}), orthographic (surrounded by {⟨...⟩}), a rhyme (beginning with a hyphen) or a combined
phonemic/phonetic spec (of the form {/.../ [...]}). `lang` indicates the language of the pronunciation and can be {nil}.
If not {nil}, and the specified language has data in [[Module:IPA/data]] indicating the allowed phonemes, then the page
will be added to a cleanup category and an error message displayed next to the outputted pronunciation. Note that {lang}
also determines sort key processing in the added cleanup categories. If `split_output` is not given, the return value is
a concatenation of the formatted pronunciation, error messages and formatted cleanup categories. Otherwise, three values
are returned: the formatted pronunciation, the cleanup categories and the concatenated error messages. If `split_output`
is the value {"raw"}, the cleanup categories are returned in list form, where the list elements are a combination of
category strings and category objects of the form suitable for passing to {format_categories()} in [[Module:utilities]].
If `split_output` is any other value besides {nil}, the cleanup categories are returned as a pre-formatted concatenated
string.
]==]
function export.format_IPA(lang, pron, split_output)
local err = {}
local categories = {}
 
-- `pron` shouldn't contain ref tags.
if match(pron, "\127'\"`UNIQ%-%-ref%-[%dA-F]+%-QINU`\"'\127") then
error("<ref> tags found inside pronunciation parameter.")
end
end
 
if reconstructed then
local phonemic, phonetic = split_phonemic_phonetic(pron)
pron = "*" .. pron
pron = format_one_IPA(lang, phonemic, err, categories)
if phonetic then
phonetic = format_one_IPA(lang, phonetic, err, categories)
pron = pron .. " " .. phonetic
end
end
 
if err[1] then
if err[1] then
err = '<span class="previewonly error" style="font-size: small;>&#32;' .. table.concat(err, ', ') .. '</span>'
err = '<span class="previewonly error" style="font-size: small;>&#32;' .. concat(err, ", ") .. "</span>"
else
else
err = ""
err = ""
end
end
 
if split_output then -- for use of IPA in links
return process_maybe_split_categories(split_output, categories, '<span class="IPA">' .. pron .. "</span>", lang,
return '<span class="IPA">' .. pron .. '</span>', table.concat(categories), err
err)
else
return '<span class="IPA">' .. pron .. '</span>' .. err .. table.concat(categories)
end
end
end


function export.example(frame)
--[==[
local output = {}
Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to
[[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The
local m_links = require('Module:links')
pronunciations are formatted by wrapping them in the {{cd|AHD}} and {{cd|enPR}} CSS classes and adding any left and
local m_languages = require('Module:languages')
right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular
and accent qualifiers. There is a single parameter `data`, an object with the following fields:
table.insert(
* `items` is a list of enPR pronunciations, each of which is an object with the following properties:
output,
** `pron`: the enPR pronunciation;
[[
** `q`: {nil} or a list of left qualifiers (as in {{tl|q}}) to display before the formatted pronunciation;
{| class="wikitable"
** `qq`: {nil} or a list of right qualifiers to display after the formatted pronunciation;
! Term !! IPA !! Generated X-SAMPA !! Regenerated IPA !! Matched?
** `a`: {nil} or a list of left accent qualifiers (as in {{tl|a}}) to display before the formatted pronunciation;
]]
** `aa`: {nil} or a list of right accent qualifiers to after before the formatted pronunciation.
)
* `q`: {nil} or a list of left qualifiers (as in {{tl|q}}) to display at the beginning, before the formatted
local row =
  pronunciations and preceding {"enPR:"}.
[[
* `qq`: {nil} or a list of right qualifiers to display after all formatted pronunciations.
|-
* `a`: {nil} or a list of left accent qualifiers (as in {{tl|a}}) to display at the beginning, before the formatted
| link || IPA || XSAMPA || regenerated_IPA || matched
  pronunciations and preceding {"enPR:"}.
]]
* `aa`: {nil} or a list of right accent qualifiers to display after all formatted pronunciations.
]==]
local examples = mw.text.split(frame.args[1], ",%s*")
function export.format_enPR_full(data)
local prefix = "[[Appendix:English pronunciation|enPR]]: "
local m_XSAMPA = require("Module:IPA/X-SAMPA")
local lang = require("Module:languages").getByCode("en")
local parts = {}
for _, example in pairs(examples) do
 
local lang, word = match(example, "(%l%l%l?):(.+) [/%[]")
for _, item in ipairs(data.items) do
local part = '<span class="AHD enPR">' .. item.pron .. "</span>"
if lang then
 
lang = m_languages.getByCode(lang) or error('"' .. lang .. '" is not a valid language code.')
if item.q and item.q[1] or item.qq and item.qq[1] or item.a and item.a[1] or item.aa and item.aa[1] then
part = require("Module:pron qualifier").format_qualifiers {
lang = lang,
text = part,
q = item.q,
qq = item.qq,
a = item.a,
aa = item.aa,
}
end
end
insert(parts, part)
local IPA = match(example, "/[^/]+/")
end
or match(example, "%[[^%]]+%]")
 
or error('No IPA transcription found in "' .. example .. '".')
local prontext = prefix .. concat(parts, ", ")
local XSAMPA = m_XSAMPA.IPA_to_XSAMPA(IPA)
if data.q and data.q[1] or data.qq and data.qq[1] or data.a and data.a[1] or data.aa and data.aa[1] then
local regenerated_IPA = m_XSAMPA.XSAMPA_to_IPA(XSAMPA)
prontext = require(pron_qualifier_module).format_qualifiers {
lang = lang,
content = {
text = prontext,
link = lang and word and m_links.full_link{ term = word, lang = lang },
q = data.q,
matched = IPA == regenerated_IPA
qq = data.qq,
and '<span style="color: green;">yes</span>'
a = data.a,
or '<span style="color: red;">no</span>',
aa = data.aa,
IPA = '<span class="IPA">' .. IPA .. '</span>',
XSAMPA = '<code>' .. XSAMPA .. '</code>',
regenerated_IPA = '<span class="IPA">' .. regenerated_IPA .. '</span>'
}
}
local function add_content(item)
return content[item] or ""
end
local row = gsub(row, "[%a_]+", add_content)
table.insert(output, row)
end
end
 
table.insert(output, "|}")
return prontext
return table.concat(output)
end
end


return export
return export

Latest revision as of 17:57, 16 January 2025



local export = {}

local force_cat = false -- for testing

local pron_qualifier_module = "Module:pron qualifier"
local qualifier_module = "Module:qualifier"
local references_module = "Module:references"
local string_utilities_module = "Module:string utilities"
local utilities_module = "Module:utilities"

local m_data = mw.loadData("Module:IPA/data")
local m_str_utils = require(string_utilities_module)
local m_symbols = mw.loadData("Module:IPA/data/symbols")

local concat = table.concat
local decode_entities = m_str_utils.decode_entities
local find = string.find
local gmatch = m_str_utils.gmatch
local gsub = string.gsub
local insert = table.insert
local len = m_str_utils.len
local listToText = mw.text.listToText
local match = string.match
local sub = string.sub
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local umatch = m_str_utils.match
local usub = m_str_utils.sub

local namespace = mw.title.getCurrentTitle().namespace
local is_content_page = namespace == 0 or namespace == 118

-- Shared tail for the formatting entry points: decide how the pronunciation text, the categories and the optional
-- error text are handed back to the caller.
-- * `split_output == "raw"`: `categories` is returned untouched (still a list).
-- * any other value (including nil): `categories` is turned into a string, either the result of format_categories()
--   in [[Module:utilities]] or "" when the list is empty.
-- * `split_output` truthy: return `prontext`, `categories` and (only if given) `errtext` as separate values.
-- * `split_output` falsy: return a single string, `prontext .. errtext .. categories`.
local function process_maybe_split_categories(split_output, categories, prontext, lang, errtext)
	if split_output ~= "raw" then
		if not categories[1] then
			categories = ""
		else
			categories = require(utilities_module).format_categories(categories, lang, nil, nil, force_cat)
		end
	end

	-- Concatenated form.
	if not split_output then
		return prontext .. (errtext or "") .. categories
	end

	-- Split form, for use of IPA in links, etc. The number of return values depends on whether `errtext` was given.
	if errtext then
		return prontext, categories, errtext
	end
	return prontext, categories
end

--[==[
Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by
the word {"key"} linking to a page describing the language's phonology, and with an added category
{{cd|<var>lang</var> terms with IPA pronunciation}}. Other than the extra preceding text and category, this is identical
to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a
single parameter `data`, an object with the following fields:
* `lang`: Object representing the language of the pronunciations, which is used when adding cleanup categories for
   pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to
   add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category
   {{cd|<var>lang</var> terms with IPA pronunciation}}; and for determining the proper sort keys for categories. Unlike
   for {format_IPA_multiple()}, `lang` may not be {nil}.
* `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}.
* `err`: If not {nil}, a string containing an error message to use in place of the link to the language's phonology.
* `separator`: The default separator to use when separating formatted items. Defaults to {", "}. Does not apply to the
  first item, where the default separator is always the empty string. Overridden by the per-item `separator` field in
  `items`.
* `sort_key`: Explicit sort key used for categories.
* `no_count`: Suppress adding a {#-syllable words} category such as [[:Category:Italian 2-syllable words]]. Note that
  only certain languages add such categories to begin with, because it depends on knowing how to count syllables in a
  given language, which depends on the phonology of the language. Also, this does not suppress the addition of cleanup
  or other categories. If you need them suppressed, use `split_output` to return the categories separately and ignore
  them.
* `split_output`: If not given, the return value is a concatenation of the formatted pronunciation and formatted
  categories. Otherwise, two values are returned: the formatted pronunciation and the categories. If `split_output` is
  the value {"raw"}, the categories are returned in list form, where the list elements are a combination of category
  strings and category objects of the form suitable for passing to {format_categories()} in [[Module:utilities]]. If
  `split_output` is any other value besides {nil}, the categories are returned as a pre-formatted concatenated string.
* `include_langname`: If specified, prefix the result with the language name, followed by a colon.
* `q`: {nil} or a list of left qualifiers (as in {{tl|q}}) to display at the beginning, before the formatted
  pronunciations and preceding {"IPA:"}.
* `qq`: {nil} or a list of right qualifiers to display after all formatted pronunciations.
* `a`: {nil} or a list of left accent qualifiers (as in {{tl|a}}) to display at the beginning, before the formatted
  pronunciations and preceding {"IPA:"}.
* `aa`: {nil} or a list of right accent qualifiers to display after all formatted pronunciations.
]==]
function export.format_IPA_full(data)
	-- Older callers passed a language object as the first positional argument; reject that explicitly.
	if type(data) ~= "table" or data.getCode then
		error("Must now supply a table of arguments to format_IPA_full(); first argument should be that table, not a language object")
	end

	local lang = data.lang
	if not lang or not lang.getCode then
		error("Must specify language to format_IPA_full()")
	end
	local langname = lang:getCanonicalName()

	-- Text shown in superscript parentheses after "IPA": either the caller's error message or a "key" link to the
	-- page "IPA for <language name>". (An earlier variant chose between that page and a Wikipedia phonology page
	-- based on `m_data.langs_with_infopages`; that branch was disabled, so the lookup is no longer performed.)
	local prefix_text
	if data.err then
		prefix_text = '<span class="error">' .. data.err .. '</span>'
	else
		prefix_text = "[[IPA for " .. langname .. "|key]]"
	end

	local prefix = "[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>:&#32;"

	-- Ask for "raw" categories so that one more can be appended before formatting.
	local IPAs, categories = export.format_IPA_multiple(lang, data.items, data.separator, data.no_count, "raw")

	if is_content_page then
		insert(categories, {
			cat = langname .. " terms with IPA pronunciation",
			sort_key = data.sort_key
		})
	end

	local prontext = prefix .. IPAs

	-- Wrap the whole line in the overall qualifiers only if at least one was supplied.
	local q, qq, a, aa = data.q, data.qq, data.a, data.aa
	if q and q[1] or qq and qq[1] or a and a[1] or aa and aa[1] then
		prontext = require(pron_qualifier_module).format_qualifiers {
			lang = lang,
			text = prontext,
			q = q,
			qq = qq,
			a = a,
			aa = aa,
		}
	end

	if data.include_langname then
		prontext = langname .. ": " .. prontext
	end

	return process_maybe_split_categories(data.split_output, categories, prontext, lang)
end

-- Split a combined spec of the form `/.../ [...]` (optionally preceded by a reconstruction asterisk) into its
-- phonemic and phonetic halves. The asterisk, if present, is prefixed to both halves. Any other input is returned
-- unchanged as the first value, with nil as the second.
local function split_phonemic_phonetic(pron)
	local asterisk, slashed, bracketed = match(pron, "^(%*?)(/.-/)%s+(%[.-%])$")
	if not asterisk then
		-- Not a combined spec.
		return pron, nil
	end
	return asterisk .. slashed, asterisk .. bracketed
end

-- Work out which kind of transcription `pron` is by looking up its first character in
-- `m_data.representation_types` and checking that `pron` ends with the matching closing mark.
-- Returns four values: the representation name (nil if unrecognised), the opening mark, the closing mark (both ""
-- if unrecognised), and `true` if `pron` started with a reconstruction asterisk (nil otherwise).
local function determine_repr(pron)
	local reconstructed

	-- remove initial asterisk before representation marks, used on some Reconstruction pages
	if sub(pron, 1, 1) == "*" then
		pron = sub(pron, 2)
		reconstructed = true
	end

	-- First byte plus any following bytes in the range \128-\191, i.e. one whole UTF-8 character.
	local opening = match(pron, "^.[\128-\191]*")
	local repr_data = m_data.representation_types[opening]
	if not repr_data then
		return nil, "", "", reconstructed
	end

	-- The closing mark is used as a pattern and must match at the very end, somewhere after the opening mark.
	local closing = repr_data[2]
	if not match(pron, closing .. "$", #opening + 1) then
		return nil, "", "", reconstructed
	end

	return repr_data[1], opening, closing, reconstructed
end

-- Return true if `transcription` contains a syllable dot directly before a stress mark, or a stress mark directly
-- followed by a space or a dot; false otherwise.
local function hasInvalidSeparators(transcription)
	return not not (umatch(transcription, "%.[ˈˌ]") or umatch(transcription, "[ˈˌ][ .]"))
end

--[==[
Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a
category {{cd|<var>lang</var> terms with IPA pronunciation}}). Individual pronunciations are formatted using
{format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations.
Parameters accepted are:
* `lang` is an object representing the language of the pronunciations, which is used when adding cleanup categories for
   pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to
   add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); and for computing the
   proper sort keys for categories. `lang` may be {nil}.
* `items` is a list of pronunciations, each of which is an object with the following properties:
** `pron`: the pronunciation, in the same format as is accepted by {format_IPA()}, i.e. it should be either phonemic
     (surrounded by {/.../}), phonetic (surrounded by {[...]}), orthographic (surrounded by {⟨...⟩}) or a rhyme
	 (beginning with a hyphen);
** `pretext`: text to display directly before the formatted pronunciation, inside of any qualifiers or accent
     qualifiers;
** `posttext`: text to display directly after the formatted pronunciation, inside of any qualifiers or accent
     qualifiers;
** `q` or `qualifiers`: {nil} or a list of left qualifiers (as in {{tl|q}}) to display before the formatted
     pronunciation; note that `qualifiers` is deprecated;
** `qq`: {nil} or a list of right qualifiers to display after the formatted pronunciation;
** `a`: {nil} or a list of left accent qualifiers (as in {{tl|a}}) to display before the formatted pronunciation;
** `aa`: {nil} or a list of right accent qualifiers to display after the formatted pronunciation;
** `refs`: {nil} or a list of references or reference specs to add after the pronunciation and any posttext and
     qualifiers; the value of a list item is either a string containing the reference text (typically a call to a
	 citation template such as {{tl|cite-book}}, or a template wrapping such a call), or an object with fields `text`
	 (the reference text), `name` (the name of the reference, as in {{cd|<nowiki><ref name="foo">...</ref></nowiki>}}
	 or {{cd|<nowiki><ref name="foo" /></nowiki>}}) and/or `group` (the group of the reference, as in
	 {{cd|<nowiki><ref name="foo" group="bar">...</ref></nowiki>}} or
	 {{cd|<nowiki><ref name="foo" group="bar"/></nowiki>}}); this uses a parser function to format the reference
	 appropriately and insert a footnote number that hyperlinks to the actual reference, located in the
	 {{cd|<nowiki><references /></nowiki>}} section;
** `gloss`: {nil} or a gloss (definition) for this item, if different definitions have different pronunciations;
** `pos`: {nil} or a part of speech for this item, if different parts of speech have different pronunciations;
** `separator`: the separator text to insert directly before the formatted pronunciation and all qualifiers, accent
   qualifiers and pre-text; defaults to the outer `separator` parameter.
* `separator`: The default separator to use when separating formatted items. Defaults to {", "}. Does not apply to the
  first item, where the default separator is always the empty string. Overridden by the per-item `separator` field in
  `items`.
* `no_count`: Suppress adding a {#-syllable words} category such as [[:Category:Italian 2-syllable words]]. Note that
  only certain languages add such categories to begin with, because it depends on knowing how to count syllables in a
  given language, which depends on the phonology of the language. Also, this does not suppress the addition of cleanup
  categories. If you need them suppressed, use `split_output` to return the categories separately and ignore them.
* `split_output`: If not given, the return value is a concatenation of the formatted pronunciation and formatted
  categories. Otherwise, two values are returned: the formatted pronunciation and the categories. If `split_output` is
  the value {"raw"}, the categories are returned in list form, where the list elements are a combination of category
  strings and category objects of the form suitable for passing to {format_categories()} in [[Module:utilities]]. If
  `split_output` is any other value besides {nil}, the categories are returned as a pre-formatted concatenated string.
]==]
function export.format_IPA_multiple(lang, items, separator, no_count, split_output)
	-- NOTE: `no_count` is accepted for interface compatibility but is not consulted anywhere in this function.
	local categories = {}
	separator = separator or ", "

	-- With no pronunciations at all, show a placeholder on template pages and flag the page for cleanup elsewhere.
	if not items[1] then
		if namespace == 10 then -- Template
			-- Build a fresh list instead of inserting into the caller's table, so the argument is not mutated.
			items = {{pron = "/aɪ piː ˈeɪ/"}}
		else
			insert(categories, "Pronunciation templates without a pronunciation")
		end
	end

	local bits = {}

	for i, item in ipairs(items) do
		local bit

		-- If the pronunciation is entirely empty, allow this and don't do anything, so that e.g. the pretext and/or
		-- posttext can be specified to force something like ''unknown'' to appear in place of the pronunciation
		-- (as happens e.g. when ? is used as a respelling in [[Module:ca-IPA]]; see [[guèiser]] for an example).
		if item.pron == "" then
			bit = ""
		else
			-- With split_output = "raw", format_IPA() returns the categories as a list and the error text as a
			-- separate string (empty when there are no errors).
			local item_categories, errtext
			bit, item_categories, errtext = export.format_IPA(lang, item.pron, "raw")
			bit = bit .. errtext
			for _, cat in ipairs(item_categories) do
				insert(categories, cat)
			end
		end

		if item.pretext then
			bit = item.pretext .. bit
		end

		if item.posttext then
			bit = bit .. item.posttext
		end

		local has_qualifiers = item.q and item.q[1] or item.qq and item.qq[1] or item.qualifiers and item.qualifiers[1]
			or item.a and item.a[1] or item.aa and item.aa[1]
		local has_gloss_or_pos = item.gloss or item.pos
		if has_qualifiers or has_gloss_or_pos then
			-- FIXME: Currently we tack the gloss and POS (in that order) onto the end of the regular left qualifiers.
			-- Should we do something different?
			local q = item.q
			if has_gloss_or_pos then
				-- Work on a copy so the item's own `q` list is left alone.
				q = mw.clone(item.q) or {}
				if item.gloss then
					local m_qualifier = require(qualifier_module)
					insert(q, m_qualifier.wrap_qualifier_css("“", "quote") .. item.gloss ..
						m_qualifier.wrap_qualifier_css("”", "quote"))
				end
				if item.pos then
					-- FIXME: Consider expanding aliases as found in [[Module:headword/data]] or similar.
					insert(q, item.pos)
				end
			end

			-- Use the shared module-name constant, as the other callers in this file do.
			bit = require(pron_qualifier_module).format_qualifiers {
				lang = lang,
				text = bit,
				q = q,
				qq = item.qq,
				qualifiers = item.qualifiers,
				a = item.a,
				aa = item.aa,
			}
		end

		if item.note then
			-- Support removed on 2024-06-15.
			error("Support for `.note` has been removed; switch to `.refs` (which must be a list)")
		end
		if item.refs then
			local refspecs = item.refs
			if #refspecs > 0 then
				bit = bit .. require(references_module).format_references(refspecs)
			end
		end

		-- The first item gets no default separator; a per-item separator always wins.
		bit = (item.separator or (i == 1 and "" or separator)) .. bit

		insert(bits, bit)

		if lang then
			-- The nature of hasInvalidSeparators() is such that we don't have to split a combined '/.../ [...]' spec
			-- into its parts in order to process.
			if lang:getCode() == "en" and hasInvalidSeparators(item.pron) then
				insert(categories, "English IPA pronunciations with invalid separators")
			end
		end
	end

	return process_maybe_split_categories(split_output, categories, concat(bits), lang)
end

--[=[
Format a single IPA pronunciation, which cannot be a combined spec (such as {/.../ [...]}). This has been extracted from
{format_IPA()} to allow the latter to handle such combined specs. This works like {format_IPA()} but requires that
pre-created {err} (for error messages) and {categories} lists be passed in, and adds any generated error messages and
categories to those lists.

Parameters:
* `lang`: language object or {nil}; only `lang:getCode()` is used here, to look up per-language data.
* `pron`: the pronunciation string, including its representation marks and optional leading asterisk.
* `err`: list to which error message strings are appended.
* `categories`: list to which category strings or `{cat = ..., sort_key = ...}` objects are appended.

A single value is returned, the pronunciation with its asterisk and representation marks re-attached. It is usually the
same as passed in, but may have had symbols replaced, a variation selector added after certain characters, or HTML added
around invalid phonemes so they appear in red. Raises an error if `pron` contains a wikilink.
]=]
local function format_one_IPA(lang, pron, err, categories)
	-- Disallow wikilinks.
	if match(pron, "%[%[.-%]%]") then
		error("IPA input must not contain wikilinks.")
	end
	
	-- NOTE(review): presumably converts HTML entities to the characters they stand for; the helper is defined in
	-- [[Module:string utilities]] and is not visible here.
	pron = decode_entities(pron)

	-- Detect the type of transcription.
	local repr, opening, closing, reconstructed = determine_repr(pron)
	
	-- Strip any reconstruction asterisk and representation marks.
	-- NOTE(review): this uses the byte length of `closing`, which determine_repr() also uses as a pattern; assumes
	-- the two coincide (i.e. the closing mark carries no pattern escapes) -- confirm against [[Module:IPA/data]].
	pron = sub(pron, #opening + 1 + (reconstructed and 1 or 0), -#closing - 1)

	if pron == "" then
		insert(categories, "IPA pronunciations with no pronunciation present")
	end

	-- Check for obsolete and nonstandard symbols. Each entry of `m_data.nonstandard` is used as a pattern; every
	-- match adds a category, and the loop stops after the first entry that matched at all, so only that entry's
	-- matches are reported in the error message.
	for _, symbol in ipairs(m_data.nonstandard) do
		local result
		for nonstandard in gmatch(pron, symbol) do
			if not result then
				result = {}
			end
			insert(result, nonstandard)
			insert(categories,
				{cat = "IPA pronunciations with obsolete or nonstandard characters", sort_key = nonstandard}
			)
		end

		if result then
			insert(err, "obsolete or nonstandard characters (" .. concat(result) .. ")")
			break
		end
	end

	-- Build `result`, a working copy of `pron` used for the symbol checks below, by removing the following:
	--   1. wikilinks (already rejected above)
	--   2. paired HTML tags (the tag content is kept)
	--   3. bolding
	--   4. italics
	--   5. asterisk at beginning of transcription
	--   6. comma followed by spacing characters
	--   7. superscripts enclosed in superscript parentheses (done further down)
	-- `pron` itself keeps its markup.
	local found_HTML
	local result = gsub(pron, "<(%a+)[^>]*>([^<]+)</%1>",
		function(tagName, content)
			found_HTML = true
			return content
		end)
	result = gsub(result, "'''([^']*)'''", "%1")
	result = gsub(result, "''([^']*)''", "%1")
	result = gsub(result, "^%*", "")
	result = ugsub(result, ",%s+", "")

	-- VS15: if `pron` contains any character from `m_symbols.add_vs15`, first drop existing U+FE0E characters from
	-- both strings (so none end up doubled), then append U+FE0E after each such character in `pron`.
	local vs15_class = "[" .. m_symbols.add_vs15 .. "]"
	if umatch(pron, vs15_class) then
		local vs15 = u(0xFE0E)
		if find(result, vs15) then
			result = gsub(result, vs15, "")
			pron = gsub(pron, vs15, "")
		end
		pron = ugsub(pron, vs15_class, "%0" .. vs15)
	end

	if result ~= "" then
		-- `suggestions` is never read in this function.
		local suggestions = {}
		-- Replace every key of `m_symbols.invalid` found in `result` by its value, directly in `pron`.
		-- NOTE(review): the presence test is a plain-text find, but the replacement treats `k` as a Lua pattern and
		-- `v` as a replacement string; assumes neither contains pattern-magic characters -- confirm against the data.
		for k, v in pairs(m_symbols.invalid) do
			if find(result, k, 1, true) then
				pron = pron:gsub(k, v)
			end
		end
		
		-- Remove parenthesised superscripts and all valid symbols (global plus per-language) from `result`.
		-- `result` is not read again after this block, so whatever is left over is currently not reported.
		result = ugsub(result, "⁽[".. m_symbols.superscripts .. "]+⁾", "")
		local per_lang_valid
		if lang then
			per_lang_valid = m_symbols.per_lang_valid[lang:getCode()]
		end
		per_lang_valid = per_lang_valid or ""
		result = ugsub(result, "[" .. m_symbols.valid .. per_lang_valid .. "]", "")
	end

	if found_HTML then
		insert(categories, "IPA pronunciations with paired HTML tags")
	end

	-- Phoneme validation: only for phonemic and rhyme transcriptions, and only when `m_data.phonemes` has an entry
	-- for the language.
	if (repr == "phonemic" or repr == "rhyme") and lang and m_data.phonemes[lang:getCode()] then
		local valid_phonemes = m_data.phonemes[lang:getCode()]
		local rest = pron
		local phonemes = {}

		-- Consume `rest` from the left, at each step taking the longest listed phoneme that is a prefix of it.
		while #rest > 0 do
			local longestmatch, longestmatch_len = "", 0

			-- Parentheses are always accepted as-is.
			local rest_init = sub(rest, 1, 1)
			if rest_init == "(" or rest_init == ")" then
				longestmatch = rest_init
				longestmatch_len = 1
			else
				for _, phoneme in ipairs(valid_phonemes) do
					local phoneme_len = len(phoneme)
					if phoneme_len > longestmatch_len and usub(rest, 1, phoneme_len) == phoneme then
						longestmatch = phoneme
						longestmatch_len = len(longestmatch)
					end
				end
			end

			if longestmatch_len > 0 then
				insert(phonemes, longestmatch)
				rest = usub(rest, longestmatch_len + 1)
			else
				-- No listed phoneme starts here: colour one character red, add the per-language cleanup category
				-- and continue with the remainder.
				local phoneme = usub(rest, 1, 1)
				insert(phonemes, "<span style=\"color: red\">" .. phoneme .. "</span>")
				rest = usub(rest, 2)
				insert(categories, "IPA pronunciations with invalid phonemes/" .. lang:getCode())
			end
		end

		pron = concat(phonemes)
	end

	-- Re-attach the asterisk and representation marks that were stripped at the top.
	return (reconstructed and "*" or "") .. opening .. pron .. closing
end

--[==[
Format an IPA pronunciation. This wraps the pronunciation in appropriate CSS classes and adds cleanup categories and
error messages as needed. The pronunciation `pron` should be either phonemic (surrounded by {/.../}), phonetic
(surrounded by {[...]}), orthographic (surrounded by {⟨...⟩}), a rhyme (beginning with a hyphen) or a combined
phonemic/phonetic spec (of the form {/.../ [...]}). `lang` indicates the language of the pronunciation and can be {nil}.
If not {nil}, and the specified language has data in [[Module:IPA/data]] indicating the allowed phonemes, then any
phonemic or rhyme pronunciation containing a phoneme outside that set causes the page to be added to a cleanup category
and the offending characters to be shown in red in the outputted pronunciation. Note that {lang}
also determines sort key processing in the added cleanup categories. If `split_output` is not given, the return value is
a concatenation of the formatted pronunciation, error messages and formatted cleanup categories. Otherwise, three values
are returned: the formatted pronunciation, the cleanup categories and the concatenated error messages. If `split_output`
is the value {"raw"}, the cleanup categories are returned in list form, where the list elements are a combination of
category strings and category objects of the form suitable for passing to {format_categories()} in [[Module:utilities]].
If `split_output` is any other value besides {nil}, the cleanup categories are returned as a pre-formatted concatenated
string.
]==]
function export.format_IPA(lang, pron, split_output)
	-- Collected by format_one_IPA(): error message strings and cleanup categories.
	local err = {}
	local categories = {}

	-- `pron` shouldn't contain ref tags (detected via the strip marker the parser leaves in their place).
	if match(pron, "\127'\"`UNIQ%-%-ref%-[%dA-F]+%-QINU`\"'\127") then
		error("<ref> tags found inside pronunciation parameter.")
	end

	-- A combined `/.../ [...]` spec is formatted as two separate pronunciations joined by a space; anything else
	-- comes back as `phonemic` alone, with `phonetic` nil.
	local phonemic, phonetic = split_phonemic_phonetic(pron)
	pron = format_one_IPA(lang, phonemic, err, categories)
	if phonetic then
		phonetic = format_one_IPA(lang, phonetic, err, categories)
		pron = pron .. " " .. phonetic
	end

	-- Always a string, so callers using split output can concatenate it unconditionally.
	local errtext = ""
	if err[1] then
		-- The style attribute previously lacked its closing quote, which produced malformed HTML.
		errtext = '<span class="previewonly error" style="font-size: small;">&#32;' .. concat(err, ", ") .. "</span>"
	end

	return process_maybe_split_categories(split_output, categories, '<span class="IPA">' .. pron .. "</span>", lang,
		errtext)
end

--[==[
Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to
[[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The
pronunciations are formatted by wrapping them in the {{cd|AHD}} and {{cd|enPR}} CSS classes and adding any left and
right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular
and accent qualifiers. There is a single parameter `data`, an object with the following fields:
* `items` is a list of enPR pronunciations, each of which is an object with the following properties:
** `pron`: the enPR pronunciation;
** `q`: {nil} or a list of left qualifiers (as in {{tl|q}}) to display before the formatted pronunciation;
** `qq`: {nil} or a list of right qualifiers to display after the formatted pronunciation;
** `a`: {nil} or a list of left accent qualifiers (as in {{tl|a}}) to display before the formatted pronunciation;
** `aa`: {nil} or a list of right accent qualifiers to display after the formatted pronunciation.
* `q`: {nil} or a list of left qualifiers (as in {{tl|q}}) to display at the beginning, before the formatted
  pronunciations and preceding {"enPR:"}.
* `qq`: {nil} or a list of right qualifiers to display after all formatted pronunciations.
* `a`: {nil} or a list of left accent qualifiers (as in {{tl|a}}) to display at the beginning, before the formatted
  pronunciations and preceding {"enPR:"}.
* `aa`: {nil} or a list of right accent qualifiers to display after all formatted pronunciations.
]==]
-- Wrap `text` in the left/right regular and accent qualifiers (`q`, `qq`, `a`, `aa`) carried by `spec`, using
-- [[Module:pron qualifier]]. If `spec` has no non-empty qualifier list, `text` is returned unchanged and the module
-- is not loaded.
local function wrap_enPR_qualifiers(lang, text, spec)
	local q, qq, a, aa = spec.q, spec.qq, spec.a, spec.aa
	if not (q and q[1] or qq and qq[1] or a and a[1] or aa and aa[1]) then
		return text
	end
	local wrapped = require(pron_qualifier_module).format_qualifiers {
		lang = lang,
		text = text,
		q = q,
		qq = qq,
		a = a,
		aa = aa,
	}
	return wrapped
end

function export.format_enPR_full(data)
	local prefix = "[[Appendix:English pronunciation|enPR]]: "
	-- enPR is English-only, so the language object is always fetched for "en".
	local lang = require("Module:languages").getByCode("en")
	local parts = {}

	-- Format each pronunciation with its own qualifiers.
	for _, item in ipairs(data.items) do
		local part = '<span class="AHD enPR">' .. item.pron .. "</span>"
		insert(parts, (wrap_enPR_qualifiers(lang, part, item)))
	end

	-- Join the items, add the "enPR:" prefix, then apply the overall qualifiers to the whole line.
	local prontext = prefix .. concat(parts, ", ")
	return (wrap_enPR_qualifiers(lang, prontext, data))
end

return export