45,646
edits
No edit summary |
No edit summary |
||
Line 1: | Line 1: | ||
local export = {} | local export = {} | ||
local m_data = mw.loadData( | local force_cat = false -- for testing | ||
local m_symbols = mw.loadData( | |||
local m_data = mw.loadData("Module:IPA/data") | |||
local m_str_utils = require("Module:string utilities") | |||
local m_symbols = mw.loadData("Module:IPA/data/symbols") | |||
local pron_qualifier_module = "Module:pron qualifier" | |||
local qualifier_module = "Module:qualifier" | |||
local references_module = "Module:references" | |||
local syllables_module = "Module:syllables" | |||
local utilities_module = "Module:utilities" | |||
local m_syllables -- [[Module:syllables]]; loaded below if needed | local m_syllables -- [[Module:syllables]]; loaded below if needed | ||
local | local concat = table.concat | ||
local | local find = string.find | ||
local gsub = mw. | local gmatch = m_str_utils.gmatch | ||
local | local gsub = string.gsub | ||
local | local insert = table.insert | ||
local | local len = m_str_utils.len | ||
local listToText = mw.text.listToText | |||
local match = string.match | |||
local sub = string.sub | |||
local u = m_str_utils.char | |||
local ufind = m_str_utils.find | |||
local ugsub = m_str_utils.gsub | |||
local umatch = m_str_utils.match | |||
local usub = m_str_utils.sub | |||
local namespace = mw.title.getCurrentTitle().namespace | |||
local is_content_page = namespace == 0 or namespace == 118 | |||
-- Package a formatted pronunciation together with its categories and optional error text, honouring the caller's
-- `split_output` request.
-- * `split_output`: if falsy, a single string is returned (pronunciation .. error text .. formatted categories). If
--   "raw", the pronunciation and the untouched `categories` list are returned (plus `errtext` if given). For any other
--   truthy value, the pronunciation and the categories as a formatted string are returned (plus `errtext` if given).
-- * `categories`: list of category strings/objects; formatted through `format_categories()` in the utilities module
--   unless raw output was requested. An empty list formats to the empty string without loading that module.
-- * `prontext`: the already-formatted pronunciation text.
-- * `lang`: language object passed through to `format_categories()`.
-- * `errtext`: optional pre-formatted error text.
local function process_maybe_split_categories(split_output, categories, prontext, lang, errtext)
	if split_output ~= "raw" then
		if categories[1] then
			categories = require(utilities_module).format_categories(categories, lang, nil, nil, force_cat)
		else
			categories = ""
		end
	end
	if split_output then -- for use of IPA in links, etc.
		if errtext then
			return prontext, categories, errtext
		end
		return prontext, categories
	end
	-- Non-split mode: `categories` is guaranteed to be a string here because "raw" is truthy and was handled above.
	return prontext .. (errtext or "") .. categories
end
--[==[
Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by
the word {"key"} linking to an Appendix page describing the language's phonology, and with an added category
{{cd|<var>lang</var> terms with IPA pronunciation}}. Other than the extra preceding text and category, this is identical
to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a
single parameter `data`, an object with the following fields:
* `lang`: Object representing the language of the pronunciations, which is used when adding cleanup categories for
  pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to
  add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category
  {{cd|<var>lang</var> terms with IPA pronunciation}}; and for determining the proper sort keys for categories. Unlike
  for {format_IPA_multiple()}, `lang` may not be {nil}.
* `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}.
* `err`: If not {nil}, a string containing an error message to use in place of the link to the language's phonology.
* `separator`: The default separator to use when separating formatted items. Defaults to {", "}. Does not apply to the
  first item, where the default separator is always the empty string. Overridden by the per-item `separator` field in
  `items`.
* `sort_key`: Explicit sort key used for categories.
* `no_count`: Suppress adding a {#-syllable words} category such as [[:Category:Italian 2-syllable words]]. Note that
  only certain languages add such categories to begin with, because it depends on knowing how to count syllables in a
  given language, which depends on the phonology of the language. Also, this does not suppress the addition of cleanup
  or other categories. If you need them suppressed, use `split_output` to return the categories separately and ignore
  them.
* `split_output`: If not given, the return value is a concatenation of the formatted pronunciation and formatted
  categories. Otherwise, two values are returned: the formatted pronunciation and the categories. If `split_output` is
  the value {"raw"}, the categories are returned in list form, where the list elements are a combination of category
  strings and category objects of the form suitable for passing to {format_categories()} in [[Module:utilities]]. If
  `split_output` is any other value besides {nil}, the categories are returned as a pre-formatted concatenated string.
* `include_langname`: If specified, prefix the result with the language name, followed by a colon.
* `q`: {nil} or a list of left qualifiers (as in {{tl|q}}) to display at the beginning, before the formatted
  pronunciations and preceding {"IPA:"}.
* `qq`: {nil} or a list of right qualifiers to display after all formatted pronunciations.
* `a`: {nil} or a list of left accent qualifiers (as in {{tl|a}}) to display at the beginning, before the formatted
  pronunciations and preceding {"IPA:"}.
* `aa`: {nil} or a list of right accent qualifiers to display after all formatted pronunciations.
]==]
function export.format_IPA_full(data)
	-- Guard against the old calling convention, where the first argument was a language object.
	if type(data) ~= "table" or data.getCode then
		error("Must now supply a table of arguments to format_IPA_full(); first argument should be that table, not a language object")
	end

	local lang = data.lang
	local items = data.items
	local err = data.err
	local separator = data.separator
	local sort_key = data.sort_key
	local no_count = data.no_count
	local split_output = data.split_output
	local q = data.q
	local qq = data.qq
	local a = data.a
	local aa = data.aa
	local include_langname = data.include_langname

	local hasKey = m_data.langs_with_infopages

	if not lang or not lang.getCode then
		error("Must specify language to format_IPA_full()")
	end
	local langname = lang:getCanonicalName()

	-- The superscripted text after "IPA" is either the caller-supplied error or a "key" link to the phonology page.
	local prefix_text
	if err then
		prefix_text = '<span class="error">' .. err .. '</span>'
	else
		if hasKey[lang:getCode()] then
			prefix_text = "Appendix:" .. langname .. " pronunciation"
		else
			prefix_text = "wikipedia:" .. langname .. " phonology"
		end
		prefix_text = "[[" .. prefix_text .. "|key]]"
	end

	local prefix = "[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>: "

	-- Ask for the categories in raw (list) form so that the "terms with IPA pronunciation" category can be appended
	-- before everything is formatted once by process_maybe_split_categories().
	local IPAs, categories = export.format_IPA_multiple(lang, items, separator, no_count, "raw")

	if is_content_page then
		insert(categories, {
			cat = langname .. " terms with IPA pronunciation",
			sort_key = sort_key
		})
	end

	local prontext = prefix .. IPAs
	if q and q[1] or qq and qq[1] or a and a[1] or aa and aa[1] then
		prontext = require(pron_qualifier_module).format_qualifiers {
			lang = lang,
			text = prontext,
			q = q,
			qq = qq,
			a = a,
			aa = aa,
		}
	end
	if include_langname then
		prontext = langname .. ": " .. prontext
	end

	return process_maybe_split_categories(split_output, categories, prontext, lang)
end
-- Split a combined pronunciation spec of the form '/.../ [...]' (optionally preceded by a single '*', as used for
-- reconstructed terms) into its phonemic and phonetic halves. The leading '*', if present, is copied onto both halves.
-- Returns two values: the phonemic part and the phonetic part. If `pron` is not a combined spec, `pron` is returned
-- unchanged as the first value and {nil} as the second.
local function split_phonemic_phonetic(pron)
	-- `reconstructed` is "" (which is truthy in Lua) when the pattern matches without a leading asterisk, so testing it
	-- is sufficient to tell whether the pattern matched at all.
	local reconstructed, phonemic, phonetic = match(pron, "^(%*?)(/.-/)%s+(%[.-%])$")
	if reconstructed then
		return reconstructed .. phonemic, reconstructed .. phonetic
	else
		return pron, nil
	end
end
Line 35: | Line 161: | ||
local repr_mark = {} | local repr_mark = {} | ||
local repr, reconstructed | local repr, reconstructed | ||
-- remove initial asterisk before representation marks, used on some Reconstruction pages | -- remove initial asterisk before representation marks, used on some Reconstruction pages | ||
if | if sub(pron, 1, 1) == "*" then | ||
reconstructed = true | reconstructed = true | ||
pron = sub(pron, 2) | pron = sub(pron, 2) | ||
end | end | ||
local representation_types = { | local representation_types = { | ||
['/'] = { right = '/', type = 'phonemic', }, | ['/'] = { right = '/', type = 'phonemic', }, | ||
Line 48: | Line 174: | ||
['-'] = { type = 'rhyme' }, | ['-'] = { type = 'rhyme' }, | ||
} | } | ||
repr_mark.i, repr_mark.f, repr_mark.left, repr_mark.right = | repr_mark.i, repr_mark.f, repr_mark.left, repr_mark.right = ufind(pron, '^(.).-(.)$') | ||
local representation_type = representation_types[repr_mark.left] | local representation_type = representation_types[repr_mark.left] | ||
if representation_type then | if representation_type then | ||
if representation_type.right then | if representation_type.right then | ||
Line 64: | Line 190: | ||
repr = nil | repr = nil | ||
end | end | ||
return repr, reconstructed | return repr, reconstructed | ||
end | end | ||
local function hasInvalidSeparators(transcription) | local function hasInvalidSeparators(transcription) | ||
if | if match(transcription, "%.\203[\136\140]") then -- [ˈˌ] | ||
return true | return true | ||
else | else | ||
Line 76: | Line 202: | ||
end | end | ||
function export.format_IPA_multiple(lang, items, separator, no_count) | --[==[ | ||
Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a | |||
category {{cd|<var>lang</var> terms with IPA pronunciation}}). Individual pronunciations are formatted using | |||
{format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations. | |||
Parameters accepted are: | |||
* `lang` is an object representing the language of the pronunciations, which is used when adding cleanup categories for | |||
pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to | |||
add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); and for computing the | |||
proper sort keys for categories. `lang` may be {nil}. | |||
* `items` is a list of pronunciations, each of which is an object with the following properties: | |||
** `pron`: the pronunciation, in the same format as is accepted by {format_IPA()}, i.e. it should be either phonemic | |||
(surrounded by {/.../}), phonetic (surrounded by {[...]}), orthographic (surrounded by {⟨...⟩}) or a rhyme | |||
(beginning with a hyphen); | |||
** `pretext`: text to display directly before the formatted pronunciation, inside of any qualifiers or accent | |||
qualifiers; | |||
** `posttext`: text to display directly after the formatted pronunciation, inside of any qualifiers or accent | |||
qualifiers; | |||
** `q` or `qualifiers`: {nil} or a list of left qualifiers (as in {{tl|q}}) to display before the formatted | |||
pronunciation; note that `qualifiers` is deprecated; | |||
** `qq`: {nil} or a list of right qualifiers to display after the formatted pronunciation; | |||
** `a`: {nil} or a list of left accent qualifiers (as in {{tl|a}}) to display before the formatted pronunciation; | |||
** `aa`: {nil} or a list of right accent qualifiers to display after the formatted pronunciation; | |||
** `refs`: {nil} or a list of references or reference specs to add after the pronunciation and any posttext and | |||
qualifiers; the value of a list item is either a string containing the reference text (typically a call to a | |||
citation template such as {{tl|cite-book}}, or a template wrapping such a call), or an object with fields `text` | |||
(the reference text), `name` (the name of the reference, as in {{cd|<nowiki><ref name="foo">...</ref></nowiki>}} | |||
or {{cd|<nowiki><ref name="foo" /></nowiki>}}) and/or `group` (the group of the reference, as in | |||
{{cd|<nowiki><ref name="foo" group="bar">...</ref></nowiki>}} or | |||
{{cd|<nowiki><ref name="foo" group="bar"/></nowiki>}}); this uses a parser function to format the reference | |||
appropriately and insert a footnote number that hyperlinks to the actual reference, located in the | |||
{{cd|<nowiki><references /></nowiki>}} section; | |||
** `gloss`: {nil} or a gloss (definition) for this item, if different definitions have different pronunciations; | |||
** `pos`: {nil} or a part of speech for this item, if different parts of speech have different pronunciations; | |||
** `separator`: the separator text to insert directly before the formatted pronunciation and all qualifiers, accent | |||
qualifiers and pre-text; defaults to the outer `separator` parameter. | |||
* `separator`: The default separator to use when separating formatted items. Defaults to {", "}. Does not apply to the | |||
first item, where the default separator is always the empty string. Overridden by the per-item `separator` field in | |||
`items`. | |||
* `no_count`: Suppress adding a {#-syllable words} category such as [[:Category:Italian 2-syllable words]]. Note that | |||
only certain languages add such categories to begin with, because it depends on knowing how to count syllables in a | |||
given language, which depends on the phonology of the language. Also, this does not suppress the addition of cleanup | |||
categories. If you need them suppressed, use `split_output` to return the categories separately and ignore them. | |||
* `split_output`: If not given, the return value is a concatenation of the formatted pronunciation and formatted | |||
categories. Otherwise, two values are returned: the formatted pronunciation and the categories. If `split_output` is | |||
the value {"raw"}, the categories are returned in list form, where the list elements are a combination of category | |||
strings and category objects of the form suitable for passing to {format_categories()} in [[Module:utilities]]. If | |||
`split_output` is any other value besides {nil}, the categories are returned as a pre-formatted concatenated string. | |||
]==] | |||
function export.format_IPA_multiple(lang, items, separator, no_count, split_output) | |||
local categories = {} | local categories = {} | ||
separator = separator or | separator = separator or ", " | ||
-- Format | -- Format | ||
if not items[1] then | if not items[1] then | ||
if | if namespace == 10 then -- Template | ||
insert(items, {pron = "/aɪ piː ˈeɪ/"}) | |||
else | else | ||
insert(categories, "Pronunciation templates without a pronunciation") | |||
end | end | ||
end | end | ||
local bits = {} | local bits = {} | ||
for | for i, item in ipairs(items) do | ||
local bit = export.format_IPA(lang, item.pron) | local bit | ||
-- If the pronunciation is entirely empty, allow this and don't do anything, so that e.g. the pretext and/or | |||
-- posttext can be specified to force something like ''unknown'' to appear in place of the pronunciation | |||
-- (as happens e.g. when ? is used as a respelling in [[Module:ca-IPA]]; see [[guèiser]] for an example). | |||
if item.pron == "" then | |||
bit = "" | |||
else | |||
local item_categories, errtext | |||
bit, item_categories, errtext = export.format_IPA(lang, item.pron, "raw") | |||
bit = bit .. errtext | |||
for _, cat in ipairs(item_categories) do | |||
insert(categories, cat) | |||
end | |||
end | |||
if item.pretext then | if item.pretext then | ||
bit = item.pretext .. bit | bit = item.pretext .. bit | ||
end | end | ||
if item.posttext then | if item.posttext then | ||
bit = bit .. item.posttext | bit = bit .. item.posttext | ||
end | end | ||
local has_qualifiers = item.q and item.q[1] or item.qq and item.qq[1] or item.qualifiers and item.qualifiers[1] | |||
bit = require("Module:qualifier"). | or item.a and item.a[1] or item.aa and item.aa[1] | ||
local has_gloss_or_pos = item.gloss or item.pos | |||
if has_qualifiers or has_gloss_or_pos then | |||
-- FIXME: Currently we tack the gloss and POS (in that order) onto the end of the regular left qualifiers. | |||
-- Should we do something different? | |||
local q = item.q | |||
if has_gloss_or_pos then | |||
q = mw.clone(item.q) or {} | |||
if item.gloss then | |||
local m_qualifier = require(qualifier_module) | |||
insert(q, m_qualifier.wrap_qualifier_css("“", "quote") .. item.gloss .. | |||
m_qualifier.wrap_qualifier_css("”", "quote")) | |||
end | |||
if item.pos then | |||
-- FIXME: Consider expanding aliases as found in [[Module:headword/data]] or similar. | |||
insert(q, item.pos) | |||
end | |||
end | |||
bit = require("Module:pron qualifier").format_qualifiers { | |||
lang = lang, | |||
text = bit, | |||
q = q, | |||
qq = item.qq, | |||
qualifiers = item.qualifiers, | |||
a = item.a, | |||
aa = item.aa, | |||
} | |||
end | end | ||
if | if item.note then | ||
-- Support removed on 2024-06-15. | |||
error("Support for `.note` has been removed; switch to `.refs` (which must be a list)") | |||
end | |||
if item.refs then | |||
local refspecs = item.refs | |||
if #refspecs > 0 then | if #refspecs > 0 then | ||
bit = bit .. require(references_module).format_references(refspecs) | |||
end | end | ||
end | end | ||
bit = (item.separator or (i == 1 and "" or separator)) .. bit | |||
insert(bits, bit) | |||
if lang then | if lang then | ||
-- Add syllable count if the language's diphthongs are listed in [[Module:syllables]]. | -- Add syllable count if the language's diphthongs are listed in [[Module:syllables]]. | ||
-- Don't do this if the term has spaces | -- Don't do this if the term has spaces, a liaison mark (‿) or isn't in mainspace. | ||
if not no_count and | if not no_count and namespace == 0 then | ||
m_syllables = m_syllables or require( | m_syllables = m_syllables or require(syllables_module) | ||
local langcode = lang:getCode() | local langcode = lang:getCode() | ||
if m_data.langs_to_generate_syllable_count_categories[langcode] then | if m_data.langs_to_generate_syllable_count_categories[langcode] then | ||
local | local phonemic, phonetic = split_phonemic_phonetic(item.pron) | ||
local use_it | local use_it | ||
if m_data.langs_to_use_phonetic_notation[langcode] then | if not phonetic then -- not a '/.../ [...]' combined pronunciation | ||
use_it = repr == " | local repr = determine_repr(phonemic) | ||
if m_data.langs_to_use_phonetic_notation[langcode] then | |||
use_it = repr == "phonemic" | use_it = repr == "phonetic" and phonemic or nil | ||
else | |||
use_it = repr == "phonemic" and phonemic or nil | |||
end | |||
elseif repr == "phonetic" then | |||
use_it = phonetic | |||
elseif repr == "phonemic" then | |||
use_it = phonemic | |||
end | end | ||
if use_it and not find( | -- Note: two uses of find with plain patterns is much faster than umatch with [ ‿]. | ||
local syllable_count = m_syllables.getVowels( | if use_it and not (find(use_it, " ") or find(use_it, "‿")) then | ||
local syllable_count = m_syllables.getVowels(use_it, lang) | |||
if syllable_count then | if syllable_count then | ||
insert(categories, lang:getCanonicalName() .. " " .. syllable_count .. | |||
"-syllable words") | |||
end | end | ||
end | end | ||
Line 156: | Line 368: | ||
end | end | ||
-- The nature of hasInvalidSeparators() is such that we don't have to split a combined '/.../ [...]' spec | |||
-- into its parts in order to process. | |||
if lang:getCode() == "en" and hasInvalidSeparators(item.pron) then | if lang:getCode() == "en" and hasInvalidSeparators(item.pron) then | ||
insert(categories, "IPA for English using .ˈ or .ˌ") | |||
end | end | ||
end | end | ||
end | end | ||
return | return process_maybe_split_categories(split_output, categories, concat(bits), lang) | ||
end | end | ||
-- | --[=[ | ||
function | Format a single IPA pronunciation, which cannot be a combined spec (such as {/.../ [...]}). This has been extracted from | ||
{format_IPA()} to allow the latter to handle such combined specs. This works like {format_IPA()} but requires that | |||
pre-created {err} (for error messages) and {categories} lists be passed in, and adds any generated error messages and | |||
categories to those lists. A single value is returned, the pronunciation, which is usually the same as passed in, but | |||
-- Remove wikilinks, so that wikilink brackets are not misinterpreted as | may have HTML added surrounding invalid characters so they appear in red. | ||
]=] | |||
local | local function format_one_IPA(lang, pron, err, categories) | ||
-- Remove wikilinks, so that wikilink brackets are not misinterpreted as indicating phonetic transcription | |||
without_links = | local without_links = gsub(pron, "%[%[[^|%]]+|([^%]]+)%]%]", "%1") | ||
without_links = gsub(without_links, "%[%[[^%]]+%]%]", "%1") | |||
-- Detect whether this is a phonemic or phonetic transcription | -- Detect whether this is a phonemic or phonetic transcription | ||
local repr, reconstructed = determine_repr(without_links) | local repr, reconstructed = determine_repr(without_links) | ||
if reconstructed then | if reconstructed then | ||
pron = sub(pron, 2) | pron = sub(pron, 2) | ||
without_links = sub(without_links, 2) | |||
end | end | ||
-- If valid, strip the representation marks | -- If valid, strip the representation marks | ||
if repr == "phonemic" then | if repr == "phonemic" then | ||
pron = | pron = usub(pron, 2, -2) | ||
without_links = | without_links = usub(without_links, 2, -2) | ||
elseif repr == "phonetic" then | elseif repr == "phonetic" then | ||
pron = | pron = usub(pron, 2, -2) | ||
without_links = | without_links = usub(without_links, 2, -2) | ||
elseif repr == "orthographic" then | elseif repr == "orthographic" then | ||
pron = | pron = usub(pron, 2, -2) | ||
without_links = | without_links = usub(without_links, 2, -2) | ||
elseif repr == "rhyme" then | elseif repr == "rhyme" then | ||
pron = | pron = usub(pron, 2) | ||
without_links = | without_links = usub(without_links, 2) | ||
else | else | ||
insert(categories, "IPA pronunciations with invalid representation marks") | |||
-- | -- insert(err, "invalid representation marks") | ||
-- Removed because it's annoying when previewing pronunciation pages. | -- Removed because it's annoying when previewing pronunciation pages. | ||
end | end | ||
if pron == "" then | if pron == "" then | ||
insert(categories, "IPA pronunciations with no pronunciation present") | |||
end | end | ||
-- Check for obsolete and nonstandard symbols | -- Check for obsolete and nonstandard symbols | ||
for i, symbol in ipairs(m_data.nonstandard) do | for i, symbol in ipairs(m_data.nonstandard) do | ||
Line 213: | Line 429: | ||
result = {} | result = {} | ||
end | end | ||
insert(result, nonstandard) | |||
insert(categories, | |||
{cat = "IPA pronunciations with obsolete or nonstandard characters", sort_key = nonstandard} | |||
) | |||
end | end | ||
if result then | if result then | ||
insert(err, "obsolete or nonstandard characters (" .. concat(result) .. ")") | |||
break | break | ||
end | end | ||
end | end | ||
--[[ Check for invalid symbols after removing the following: | --[[ Check for invalid symbols after removing the following: | ||
1. wikilinks (handled above) | 1. wikilinks (handled above) | ||
Line 233: | Line 451: | ||
8. superscripts enclosed in superscript parentheses ]] | 8. superscripts enclosed in superscript parentheses ]] | ||
local found_HTML | local found_HTML | ||
local result = | local result = gsub(without_links, "<(%a+)[^>]*>([^<]+)</%1>", | ||
function(tagName, content) | function(tagName, content) | ||
found_HTML = true | found_HTML = true | ||
return content | return content | ||
end) | end) | ||
result = | result = gsub(result, "'''([^']*)'''", "%1") | ||
result = | result = gsub(result, "''([^']*)''", "%1") | ||
result = | result = gsub(result, "&[^;]+;", "") -- This may catch things that are not valid character entities. | ||
result = | result = gsub(result, "^%*", "") | ||
result = | result = ugsub(result, ",%s+", "") | ||
-- VS15 | -- VS15 | ||
local vs15_class = "[" .. m_symbols.add_vs15 .. "]" | local vs15_class = "[" .. m_symbols.add_vs15 .. "]" | ||
if | if umatch(pron, vs15_class) then | ||
local vs15 = | local vs15 = u(0xFE0E) | ||
if | if find(result, vs15) then | ||
result = gsub(result, vs15, "") | result = gsub(result, vs15, "") | ||
pron = | pron = gsub(pron, vs15, "") | ||
end | end | ||
pron = | pron = ugsub(pron, "(" .. vs15_class .. ")", "%1" .. vs15) | ||
end | end | ||
if result ~= | if result ~= "" then | ||
local suggestions = {} | local suggestions = {} | ||
for k, v in pairs(m_symbols.invalid) do | |||
if find(result, k, 1, true) then | |||
insert(suggestions, k .. " with " .. v) | |||
end | |||
end | end | ||
if suggestions[1] then | |||
suggestions = listToText(suggestions) | |||
if | if is_content_page then | ||
error("Invalid IPA: replace " .. suggestions) | |||
else | |||
insert(err, "replace " .. suggestions) | |||
end | end | ||
end | end | ||
result = ugsub(result, "⁽[".. m_symbols.superscripts .. "]+⁾", "") | |||
if | local per_lang_valid | ||
if lang then | |||
per_lang_valid = m_symbols.per_lang_valid[lang:getCode()] | |||
end | |||
per_lang_valid = per_lang_valid or "" | |||
result = ugsub(result, "[" .. m_symbols.valid .. per_lang_valid .. "]", "") | |||
if result ~= "" then | |||
local category = "IPA pronunciations with invalid IPA characters" | |||
if not is_content_page then | |||
category = category .. "/non_mainspace" | |||
end | |||
insert(categories, category) | |||
insert(err, "invalid IPA characters (" .. result .. ")") | |||
end | end | ||
end | end | ||
if found_HTML then | if found_HTML then | ||
insert(categories, "IPA pronunciations with paired HTML tags") | |||
end | end | ||
if repr == "phonemic" or repr == "rhyme" then | if repr == "phonemic" or repr == "rhyme" then | ||
if lang and m_data.phonemes[lang:getCode()] then | if lang and m_data.phonemes[lang:getCode()] then | ||
Line 299: | Line 514: | ||
local rest = pron | local rest = pron | ||
local phonemes = {} | local phonemes = {} | ||
while | while #rest > 0 do | ||
local longestmatch = "" | local longestmatch, longestmatch_len = "", 0 | ||
local rest_init = sub(rest, 1, 1) | |||
longestmatch = | if rest_init == "(" or rest_init == ")" then | ||
longestmatch = rest_init | |||
longestmatch_len = 1 | |||
else | else | ||
for _, phoneme in ipairs(valid_phonemes) do | for _, phoneme in ipairs(valid_phonemes) do | ||
local phoneme_len = len(phoneme) | |||
if phoneme_len > longestmatch_len and usub(rest, 1, phoneme_len) == phoneme then | |||
longestmatch = phoneme | longestmatch = phoneme | ||
longestmatch_len = len(longestmatch) | |||
end | end | ||
end | end | ||
end | end | ||
if | if longestmatch_len > 0 then | ||
insert(phonemes, longestmatch) | |||
rest = | rest = usub(rest, longestmatch_len + 1) | ||
else | else | ||
local phoneme = | local phoneme = usub(rest, 1, 1) | ||
insert(phonemes, "<span style=\"color: red\">" .. phoneme .. "</span>") | |||
rest = | rest = usub(rest, 2) | ||
insert(categories, "IPA pronunciations with invalid phonemes/" .. lang:getCode()) | |||
end | end | ||
end | end | ||
pron = | pron = concat(phonemes) | ||
end | end | ||
if repr == "phonemic" then | if repr == "phonemic" then | ||
pron = "/" .. pron .. "/" | pron = "/" .. pron .. "/" | ||
Line 337: | Line 556: | ||
pron = "⟨" .. pron .. "⟩" | pron = "⟨" .. pron .. "⟩" | ||
end | end | ||
if reconstructed then | if reconstructed then | ||
pron = "*" .. pron | pron = "*" .. pron | ||
end | end | ||
return pron | |||
end | |||
--[==[
Format an IPA pronunciation. This wraps the pronunciation in appropriate CSS classes and adds cleanup categories and
error messages as needed. The pronunciation `pron` should be either phonemic (surrounded by {/.../}), phonetic
(surrounded by {[...]}), orthographic (surrounded by {⟨...⟩}), a rhyme (beginning with a hyphen) or a combined
phonemic/phonetic spec (of the form {/.../ [...]}). `lang` indicates the language of the pronunciation and can be {nil}.
If not {nil}, and the specified language has data in [[Module:IPA/data]] indicating the allowed phonemes, then a
phonemic or rhyme pronunciation is checked against those phonemes and the page is added to a cleanup category if any
part of it is not among them. Note that {lang} also determines sort key processing in the added cleanup categories. If
`split_output` is not given, the return value is a concatenation of the formatted pronunciation, error messages and
formatted cleanup categories. Otherwise, three values are returned: the formatted pronunciation, the cleanup categories
and the concatenated error messages. If `split_output` is the value {"raw"}, the cleanup categories are returned in list
form, where the list elements are a combination of category strings and category objects of the form suitable for
passing to {format_categories()} in [[Module:utilities]]. If `split_output` is any other value besides {nil}, the
cleanup categories are returned as a pre-formatted concatenated string.
]==]
function export.format_IPA(lang, pron, split_output)
	local err = {}
	local categories = {}

	-- `pron` shouldn't contain ref tags.
	if match(pron, "\127'\"`UNIQ%-%-ref%-[%dA-F]+%-QINU`\"'\127") then
		error("<ref> tags found inside pronunciation parameter.")
	end

	-- A combined '/.../ [...]' spec is formatted as two separate pronunciations joined by a space; both halves
	-- accumulate into the same `err` and `categories` lists.
	local phonemic, phonetic = split_phonemic_phonetic(pron)
	pron = format_one_IPA(lang, phonemic, err, categories)
	if phonetic then
		phonetic = format_one_IPA(lang, phonetic, err, categories)
		pron = pron .. " " .. phonetic
	end

	if err[1] then
		-- The style attribute must be closed with a quote before the tag's closing '>'; otherwise the tag is malformed.
		err = '<span class="previewonly error" style="font-size: small;"> ' .. concat(err, ", ") .. "</span>"
	else
		err = ""
	end

	return process_maybe_split_categories(split_output, categories, '<span class="IPA">' .. pron .. "</span>", lang,
		err)
end
--[==[
Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to
[[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The
pronunciations are formatted by wrapping them in the {{cd|AHD}} and {{cd|enPR}} CSS classes and adding any left and
right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular
and accent qualifiers. There is a single parameter `data`, an object with the following fields:
* `items` is a list of enPR pronunciations, each of which is an object with the following properties:
** `pron`: the enPR pronunciation;
** `q`: {nil} or a list of left qualifiers (as in {{tl|q}}) to display before the formatted pronunciation;
** `qq`: {nil} or a list of right qualifiers to display after the formatted pronunciation;
** `a`: {nil} or a list of left accent qualifiers (as in {{tl|a}}) to display before the formatted pronunciation;
** `aa`: {nil} or a list of right accent qualifiers to display after the formatted pronunciation.
* `q`: {nil} or a list of left qualifiers (as in {{tl|q}}) to display at the beginning, before the formatted
  pronunciations and preceding {"enPR:"}.
* `qq`: {nil} or a list of right qualifiers to display after all formatted pronunciations.
* `a`: {nil} or a list of left accent qualifiers (as in {{tl|a}}) to display at the beginning, before the formatted
  pronunciations and preceding {"enPR:"}.
* `aa`: {nil} or a list of right accent qualifiers to display after all formatted pronunciations.
]==]
function export.format_enPR_full(data)
	local prefix = "[[Appendix:English pronunciation|enPR]]: "
	local lang = require("Module:languages").getByCode("en")
	local parts = {}

	for _, item in ipairs(data.items) do
		local part = '<span class="AHD enPR">' .. item.pron .. "</span>"

		-- Only load the qualifier module when this item actually has a non-empty qualifier list.
		if item.q and item.q[1] or item.qq and item.qq[1] or item.a and item.a[1] or item.aa and item.aa[1] then
			part = require(pron_qualifier_module).format_qualifiers {
				lang = lang,
				text = part,
				q = item.q,
				qq = item.qq,
				a = item.a,
				aa = item.aa,
			}
		end
		insert(parts, part)
	end

	local prontext = prefix .. concat(parts, ", ")
	-- Overall qualifiers wrap the whole line, including the "enPR:" prefix.
	if data.q and data.q[1] or data.qq and data.qq[1] or data.a and data.a[1] or data.aa and data.aa[1] then
		prontext = require(pron_qualifier_module).format_qualifiers {
			lang = lang,
			text = prontext,
			q = data.q,
			qq = data.qq,
			a = data.a,
			aa = data.aa,
		}
	end

	return prontext
end
return export | return export |