48,357
edits
No edit summary |
No edit summary |
||
| Line 3: | Line 3: | ||
local force_cat = false -- for testing | local force_cat = false -- for testing | ||
local pages_module = "Module:pages" | |||
local pron_qualifier_module = "Module:pron qualifier" | local pron_qualifier_module = "Module:pron qualifier" | ||
local qualifier_module = "Module:qualifier" | local qualifier_module = "Module:qualifier" | ||
local references_module = "Module:references" | local references_module = "Module:references" | ||
local string_utilities_module = "Module:string utilities" | local string_utilities_module = "Module:string utilities" | ||
local syllables_module = "Module:syllables" | |||
local utilities_module = "Module:utilities" | local utilities_module = "Module:utilities" | ||
local m_data = mw.loadData("Module:IPA/data") | local m_data = mw.loadData("Module:IPA/data") | ||
local m_str_utils = require(string_utilities_module) | local m_str_utils = require(string_utilities_module) | ||
local m_syllables -- [[Module:syllables]]; loaded below if needed | |||
local m_symbols = mw.loadData("Module:IPA/data/symbols") | local m_symbols = mw.loadData("Module:IPA/data/symbols") | ||
| Line 19: | Line 22: | ||
local gsub = string.gsub | local gsub = string.gsub | ||
local insert = table.insert | local insert = table.insert | ||
local is_preview = require(pages_module).is_preview | |||
local len = m_str_utils.len | local len = m_str_utils.len | ||
local listToText = mw.text.listToText | local listToText = mw.text.listToText | ||
local match = string.match | local match = string.match | ||
local pattern_escape = m_str_utils.pattern_escape | |||
local sub = string.sub | local sub = string.sub | ||
local u = m_str_utils.char | local u = m_str_utils.char | ||
| Line 52: | Line 57: | ||
--[==[ | --[==[ | ||
Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by | Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by | ||
the word {"key"} linking to | the word {"key"} linking to an Appendix page describing the language's phonology, and with an added category | ||
` ``lang`` terms with IPA pronunciation`. Other than the extra preceding text and category, this is identical | |||
to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a | to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a | ||
single parameter `data`, an object with the following fields: | single parameter `data`, an object with the following fields: | ||
| Line 59: | Line 64: | ||
pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to | pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to | ||
add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category | add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category | ||
` ``lang`` terms with IPA pronunciation`; and for determining the proper sort keys for categories. Unlike | |||
for {format_IPA_multiple()}, `lang` may not be {nil}. | for {format_IPA_multiple()}, `lang` may not be {nil}. | ||
* `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}. | * `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}. | ||
| Line 113: | Line 118: | ||
prefix_text = '<span class="error">' .. err .. '</span>' | prefix_text = '<span class="error">' .. err .. '</span>' | ||
else | else | ||
if hasKey[lang:getCode()] then | |||
prefix_text = "IPA for " .. langname | prefix_text = "IPA for " .. langname | ||
else | else | ||
prefix_text = | prefix_text = langname | ||
end | end | ||
prefix_text = "[[" .. prefix_text .. "|key]]" | prefix_text = "[[" .. prefix_text .. "|key]]" | ||
end | end | ||
local prefix = "[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>: " | local prefix = "[[wikt:Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>: " | ||
local IPAs, categories = export.format_IPA_multiple(lang, items, separator, no_count, "raw") | local IPAs, categories = export.format_IPA_multiple(lang, items, separator, no_count, "raw") | ||
| Line 162: | Line 165: | ||
local function determine_repr(pron) | local function determine_repr(pron) | ||
local reconstructed | local reconstructed | ||
-- remove initial asterisk before representation marks, | -- Temporarily remove any initial asterisk before representation marks, | ||
-- which avoids having to account for it in the data, but set the | |||
-- `reconstructed` flag. | |||
if sub(pron, 1, 1) == "*" then | if sub(pron, 1, 1) == "*" then | ||
reconstructed = true | reconstructed = true | ||
pron = sub(pron, 2) | pron = sub(pron, 2) | ||
end | end | ||
local opening = match(pron, "^.[\128-\191]*") | -- Some representation types have aliases for convenience (e.g. "// //" is | ||
local data = m_data.representation_types[opening] | -- an alias for "⫽ ⫽"), and these need to be substituted in before checking | ||
-- for other data. | |||
if | local opening, n = match(pron, "^.[\128-\191]*") | ||
local subs_data = m_data.representation_subs[opening] | |||
if | if subs_data then | ||
pron, n = ugsub(pron, subs_data[1], subs_data[2]) | |||
-- If the substitution was made, `opening` needs to be changed to the | |||
-- new opening character. | |||
if n ~= 0 then | |||
opening = subs_data[3] | |||
end | |||
end | |||
-- Get the type data based on the opening character (if any), and set the | |||
-- representation type if the closing character matches. | |||
local type_data, repr, closing = m_data.representation_types[opening] | |||
if type_data then | |||
closing = type_data[2] | |||
if type_data and match(pron, pattern_escape(closing) .. "$", #opening + 1) then | |||
repr = type_data[1] | |||
end | end | ||
end | end | ||
-- Default to the empty string. | |||
if not repr then | |||
opening, closing = "", "" | |||
end | |||
-- Reattach the asterisk if reconstructed. | |||
if reconstructed then | |||
pron = "*" .. pron | |||
end | |||
return pron, repr, opening, closing, reconstructed | |||
end | end | ||
local function hasInvalidSeparators(transcription) | local function hasInvalidSeparators(transcription) | ||
-- Escape certain characters as well as pauses, which have the format "(...)" (with any number of dots), to avoid false positives. | |||
transcription = transcription:gsub(".[\128-\191]*", m_symbols.separator_escapes) | |||
:gsub("%(%.+%)", "\3") | |||
:gsub("[()]+", "") | |||
return ( | |||
transcription:find("..", nil, true) or | |||
transcription:match("%.%f[%z \1\2\3,:;]") or | |||
transcription:match("\1%f[%z \2\3,:;]") or | |||
transcription:match("\2%f[%z \1\3,:;]") or | |||
transcription:match("\3[:;]") or | |||
transcription:match("%f[^%z \1\2\3,]%.") | |||
) and true or false | |||
end | end | ||
--[==[ | --[==[ | ||
Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a | Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a | ||
category | category ` ``lang`` terms with IPA pronunciation`). Individual pronunciations are formatted using | ||
{format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations. | {format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations. | ||
Parameters accepted are: | Parameters accepted are: | ||
| Line 325: | Line 361: | ||
if lang then | if lang then | ||
-- | -- Add syllable count if the language's diphthongs are listed in [[Module:syllables]]. | ||
-- Don't do this if the term has spaces, a liaison mark (‿) or isn't in mainspace. | |||
if not no_count and namespace == 0 then | |||
m_syllables = m_syllables or require(syllables_module) | |||
local langcode = lang:getCode() | |||
if m_data.langs_to_generate_syllable_count_categories[langcode] then | |||
local raw_phonemic, phonetic, use_it = split_phonemic_phonetic(item.pron) | |||
local phonemic, repr = determine_repr(raw_phonemic) | |||
if not phonetic then -- not a '/.../ [...]' combined pronunciation | |||
if m_data.langs_to_use_phonetic_notation[langcode] then | |||
use_it = repr == "phonetic" and phonemic or nil | |||
else | |||
use_it = repr == "phonemic" and phonemic or nil | |||
end | |||
elseif repr == "phonetic" then | |||
use_it = phonetic | |||
elseif repr == "phonemic" then | |||
use_it = phonemic | |||
end | |||
-- Note: two uses of find with plain patterns are much faster than umatch with [ ‿]. | |||
if use_it and not (find(use_it, " ") or find(use_it, "‿")) then | |||
local syllable_count = m_syllables.getVowels(use_it, lang) | |||
if syllable_count then | |||
insert(categories, lang:getCanonicalName() .. " " .. syllable_count .. | |||
"-syllable words") | |||
end | |||
end | |||
end | |||
end | end | ||
end | end | ||
| Line 343: | Line 403: | ||
may have HTML added surrounding invalid characters so they appear in red. | may have HTML added surrounding invalid characters so they appear in red. | ||
]=] | ]=] | ||
local function format_one_IPA(lang, | local function format_one_IPA(lang, raw_pron, err, categories) | ||
-- Disallow wikilinks. | -- Disallow wikilinks. | ||
if match( | if match(raw_pron, "%[%[.-%]%]") then | ||
error("IPA input must not contain wikilinks.") | error("IPA input must not contain wikilinks.") | ||
end | end | ||
raw_pron = decode_entities(raw_pron) | |||
-- Detect the type of transcription. | -- Detect the type of transcription. | ||
local repr, opening, closing, reconstructed = determine_repr( | local pron, repr, opening, closing, reconstructed = determine_repr(raw_pron) | ||
-- Strip any reconstruction asterisk and representation marks. | -- Strip any reconstruction asterisk and representation marks. | ||
pron = sub(pron, #opening + 1 + (reconstructed and 1 or 0), -#closing - 1) | pron = sub(pron, #opening + 1 + (reconstructed and 1 or 0), -#closing - 1) | ||
if not repr then | |||
insert(categories, "IPA pronunciations with invalid representation marks") | |||
-- insert(err, "invalid representation marks") | |||
-- Removed because it's annoying when previewing pronunciation pages. | |||
end | |||
if repr ~= "orthographic" and lang and lang:getCode() == "en" and hasInvalidSeparators(pron) then | |||
insert(categories, "English IPA pronunciations with invalid separators") | |||
end | |||
if pron == "" then | if pron == "" then | ||
| Line 411: | Line 480: | ||
if result ~= "" then | if result ~= "" then | ||
local suggestions | if lang then | ||
-- Get the per_lang_valid data, and convert any per-language valid sequences to spaces. | |||
local per_lang_valid = m_symbols.per_lang_valid[lang:getCode()] | |||
if per_lang_valid then | |||
if type(per_lang_valid) == "table" then | |||
for _, pattern in pairs(per_lang_valid) do | |||
result = ugsub(result, pattern, " ") | |||
end | |||
else -- Should be a string. | |||
result = ugsub(result, per_lang_valid, " ") | |||
end | |||
end | |||
end | |||
local suggestions | |||
-- Check for any invalid sequences, excluding anything in the per-language lookup table. | |||
for k, v in pairs(m_symbols.invalid) do | for k, v in pairs(m_symbols.invalid) do | ||
if find(result, k, | if find(result, k, nil, true) then | ||
if not suggestions then | |||
suggestions = {} | |||
end | |||
insert(suggestions, k .. " with " .. v) | |||
end | |||
end | |||
if suggestions and suggestions[1] then | |||
suggestions = listToText(suggestions) | |||
if is_content_page then | |||
error("Invalid IPA: replace " .. suggestions) | |||
end | end | ||
insert(err, "replace " .. suggestions) | |||
end | |||
-- Convert any valid character sequences to spaces | |||
for _, pattern in pairs(m_symbols.valid) do | |||
result = ugsub(result, pattern, " ") | |||
end | end | ||
if not match(result, "^ *$") then | |||
local category = "IPA pronunciations with invalid IPA characters" | |||
if not is_content_page then | |||
category = category .. "/non_mainspace" | |||
end | |||
insert(categories, category) | |||
insert(err, "invalid IPA characters (" .. result .. ")") | |||
end | end | ||
end | end | ||
| Line 458: | Line 555: | ||
else | else | ||
local phoneme = usub(rest, 1, 1) | local phoneme = usub(rest, 1, 1) | ||
insert(phonemes, "<span style=\"color: red\">" .. phoneme .. "</span>") | insert(phonemes, "<span style=\"color: var(--wikt-palette-red,red)\">" .. phoneme .. "</span>") | ||
rest = usub(rest, 2) | rest = usub(rest, 2) | ||
insert(categories, "IPA pronunciations with invalid phonemes/" .. lang:getCode()) | insert(categories, "IPA pronunciations with invalid phonemes/" .. lang:getCode()) | ||
| Line 501: | Line 598: | ||
end | end | ||
if err[1] then | if err[1] and is_preview() then | ||
err = '<span class=" | err = '<span class="error" style="font-size: small;> ' .. concat(err, ", ") .. "</span>" | ||
else | else | ||
err = "" | err = "" | ||
end | end | ||
return process_maybe_split_categories(split_output, categories, '<span class="IPA">' .. pron .. "</span>", lang, | return process_maybe_split_categories(split_output, categories, '<span class="IPA nowrap">' .. pron .. "</span>", lang, | ||
err) | err) | ||
end | end | ||
| Line 514: | Line 611: | ||
Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to | Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to | ||
[[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The | [[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The | ||
pronunciations are formatted by wrapping them in the | pronunciations are formatted by wrapping them in the `AHD` and `enPR` CSS classes and adding any left and | ||
right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular | right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular | ||
and accent qualifiers. There is a single parameter `data`, an object with the following fields: | and accent qualifiers. There is a single parameter `data`, an object with the following fields: | ||
| Line 531: | Line 628: | ||
]==] | ]==] | ||
function export.format_enPR_full(data) | function export.format_enPR_full(data) | ||
local prefix = "[[Appendix:English pronunciation|enPR]]: " | local prefix = "[[wikt:Appendix:English pronunciation|enPR]]: " | ||
local lang = require("Module:languages").getByCode("en") | local lang = require("Module:languages").getByCode("en") | ||
local parts = {} | local parts = {} | ||