48,404
edits
No edit summary |
No edit summary |
||
| (15 intermediate revisions by the same user not shown) | |||
| Line 3: | Line 3: | ||
local force_cat = false -- for testing | local force_cat = false -- for testing | ||
local | local pages_module = "Module:pages" | ||
local pron_qualifier_module = "Module:pron qualifier" | local pron_qualifier_module = "Module:pron qualifier" | ||
local qualifier_module = "Module:qualifier" | local qualifier_module = "Module:qualifier" | ||
local references_module = "Module:references" | local references_module = "Module:references" | ||
local string_utilities_module = "Module:string utilities" | |||
local syllables_module = "Module:syllables" | local syllables_module = "Module:syllables" | ||
local utilities_module = "Module:utilities" | local utilities_module = "Module:utilities" | ||
local m_data = mw.loadData("Module:IPA/data") | |||
local m_str_utils = require(string_utilities_module) | |||
local m_syllables -- [[Module:syllables]]; loaded below if needed | local m_syllables -- [[Module:syllables]]; loaded below if needed | ||
local m_symbols = mw.loadData("Module:IPA/data/symbols") | |||
local concat = table.concat | local concat = table.concat | ||
local decode_entities = m_str_utils.decode_entities | |||
local find = string.find | local find = string.find | ||
local gmatch = m_str_utils.gmatch | local gmatch = m_str_utils.gmatch | ||
local gsub = string.gsub | local gsub = string.gsub | ||
local insert = table.insert | local insert = table.insert | ||
local is_preview = require(pages_module).is_preview | |||
local len = m_str_utils.len | local len = m_str_utils.len | ||
local listToText = mw.text.listToText | local listToText = mw.text.listToText | ||
local match = string.match | local match = string.match | ||
local pattern_escape = m_str_utils.pattern_escape | |||
local sub = string.sub | local sub = string.sub | ||
local u = m_str_utils.char | local u = m_str_utils.char | ||
local ugsub = m_str_utils.gsub | local ugsub = m_str_utils.gsub | ||
local umatch = m_str_utils.match | local umatch = m_str_utils.match | ||
| Line 29: | Line 34: | ||
local namespace = mw.title.getCurrentTitle().namespace | local namespace = mw.title.getCurrentTitle().namespace | ||
local is_content_page = namespace == 0 or namespace == | local is_content_page = namespace == 0 or namespace == 120 | ||
local function process_maybe_split_categories(split_output, categories, prontext, lang, errtext) | local function process_maybe_split_categories(split_output, categories, prontext, lang, errtext) | ||
| Line 53: | Line 58: | ||
Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by | Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by | ||
the word {"key"} linking to an Appendix page describing the language's phonology, and with an added category | the word {"key"} linking to an Appendix page describing the language's phonology, and with an added category | ||
` ``lang`` terms with IPA pronunciation`. Other than the extra preceding text and category, this is identical | |||
to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a | to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a | ||
single parameter `data`, an object with the following fields: | single parameter `data`, an object with the following fields: | ||
| Line 59: | Line 64: | ||
pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to | pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to | ||
add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category | add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category | ||
` ``lang`` terms with IPA pronunciation`; and for determining the proper sort keys for categories. Unlike | |||
for {format_IPA_multiple()}, `lang` may not be {nil}. | for {format_IPA_multiple()}, `lang` may not be {nil}. | ||
* `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}. | * `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}. | ||
| Line 113: | Line 118: | ||
prefix_text = '<span class="error">' .. err .. '</span>' | prefix_text = '<span class="error">' .. err .. '</span>' | ||
else | else | ||
prefix_text = "IPA for " .. langname | |||
prefix_text = "[[" .. prefix_text .. "|key]]" | prefix_text = "[[" .. prefix_text .. "|key]]" | ||
end | end | ||
local prefix = "[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>: " | local prefix = "[[wikt:Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>: " | ||
local IPAs, categories = export.format_IPA_multiple(lang, items, separator, no_count, "raw") | local IPAs, categories = export.format_IPA_multiple(lang, items, separator, no_count, "raw") | ||
local prontext = prefix .. IPAs | local prontext = prefix .. IPAs | ||
| Line 159: | Line 153: | ||
local function determine_repr(pron) | local function determine_repr(pron) | ||
local | local reconstructed | ||
-- remove initial asterisk before representation marks, | -- Temporarily remove any initial asterisk before representation marks, | ||
-- which avoids having to account for it in the data, but set the | |||
-- `reconstructed` flag. | |||
if sub(pron, 1, 1) == "*" then | if sub(pron, 1, 1) == "*" then | ||
reconstructed = true | reconstructed = true | ||
| Line 168: | Line 163: | ||
end | end | ||
-- Some representation types have aliases for convenience (e.g. "// //" is | |||
-- an alias for "⫽ ⫽"), and these need to be substituted in before checking | |||
-- for other data. | |||
local opening, n = match(pron, "^.[\128-\191]*") | |||
local subs_data = m_data.representation_subs[opening] | |||
if subs_data then | |||
pron, n = ugsub(pron, subs_data[1], subs_data[2]) | |||
-- If the substitution was made, `opening` needs to be changed to the | |||
-- new opening character. | |||
if n ~= 0 then | |||
opening = subs_data[3] | |||
end | |||
end | |||
-- Get the type data based on the opening character (if any), and set the | |||
-- representation type if the closing character matches. | |||
local type_data, repr, closing = m_data.representation_types[opening] | |||
if type_data then | |||
closing = type_data[2] | |||
if type_data and match(pron, pattern_escape(closing) .. "$", #opening + 1) then | |||
repr = type_data[1] | |||
end | |||
end | |||
-- Default to the empty string. | |||
if not repr then | |||
opening, closing = "", "" | |||
end | |||
if | -- Reattach the asterisk if reconstructed. | ||
if reconstructed then | |||
pron = "*" .. pron | |||
end | end | ||
return repr, reconstructed | return pron, repr, opening, closing, reconstructed | ||
end | end | ||
-- Report whether `transcription` contains a badly placed separator.
-- Takes a transcription string and always returns a real boolean (the final
-- `and true or false` coerces the match results).
-- NOTE(review): the `|` column markers on the rows below look like residue of
-- a side-by-side diff rendering rather than Lua; confirm against the actual
-- module source before relying on this text.
local function hasInvalidSeparators(transcription) | local function hasInvalidSeparators(transcription) | ||
-- Normalisation, in three passes, so the checks below avoid false positives:
--   1. Each character (a byte plus any following \128-\191 bytes) is looked
--      up in m_symbols.separator_escapes; characters without an entry are
--      kept as they are. That table is not visible here - presumably it maps
--      certain separator characters to the placeholders \1 and \2 used
--      below; TODO confirm.
--   2. Pauses, written "(...)" with any number of dots, become "\3".
--   3. Any remaining parentheses are removed.
transcription = transcription:gsub(".[\128-\191]*", m_symbols.separator_escapes) | |||
:gsub("%(%.+%)", "\3") | |||
:gsub("[()]+", "") | |||
-- The transcription is flagged if any one of the following is found:
--   * two consecutive dots (plain-text search, so "." is literal);
--   * a dot directly before a boundary (NUL via %z, space, \1, \2, \3,
--     comma, colon or semicolon);
--   * \1 directly before such a boundary (other than another \1);
--   * \2 directly before such a boundary (other than another \2);
--   * \3 directly followed by a colon or semicolon;
--   * a dot directly after NUL, space, \1, \2, \3 or a comma.
-- The NUL (%z) in the frontier sets is presumably there so that the start and
-- end of the string count as boundaries - verify against the Lua pattern docs.
return ( | |||
transcription:find("..", nil, true) or | |||
transcription:match("%.%f[%z \1\2\3,:;]") or | |||
transcription:match("\1%f[%z \2\3,:;]") or | |||
transcription:match("\2%f[%z \1\3,:;]") or | |||
transcription:match("\3[:;]") or | |||
transcription:match("%f[^%z \1\2\3,]%.") | |||
) and true or false | |||
end | end | ||
--[==[ | --[==[ | ||
Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a | Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a | ||
category | category ` ``lang`` terms with IPA pronunciation`). Individual pronunciations are formatted using | ||
{format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations. | {format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations. | ||
Parameters accepted are: | Parameters accepted are: | ||
| Line 258: | Line 271: | ||
if namespace == 10 then -- Template | if namespace == 10 then -- Template | ||
insert(items, {pron = "/aɪ piː ˈeɪ/"}) | insert(items, {pron = "/aɪ piː ˈeɪ/"}) | ||
end | end | ||
end | end | ||
| Line 343: | Line 354: | ||
local langcode = lang:getCode() | local langcode = lang:getCode() | ||
if m_data.langs_to_generate_syllable_count_categories[langcode] then | if m_data.langs_to_generate_syllable_count_categories[langcode] then | ||
local | local raw_phonemic, phonetic, use_it = split_phonemic_phonetic(item.pron) | ||
local | local phonemic, repr = determine_repr(raw_phonemic) | ||
if not phonetic then -- not a '/.../ [...]' combined pronunciation | if not phonetic then -- not a '/.../ [...]' combined pronunciation | ||
if m_data.langs_to_use_phonetic_notation[langcode] then | if m_data.langs_to_use_phonetic_notation[langcode] then | ||
use_it = repr == "phonetic" and phonemic or nil | use_it = repr == "phonetic" and phonemic or nil | ||
| Line 366: | Line 376: | ||
end | end | ||
end | end | ||
end | end | ||
end | end | ||
| Line 386: | Line 390: | ||
may have HTML added surrounding invalid characters so they appear in red. | may have HTML added surrounding invalid characters so they appear in red. | ||
]=] | ]=] | ||
local function format_one_IPA(lang, | local function format_one_IPA(lang, raw_pron, err, categories) | ||
-- | -- Disallow wikilinks. | ||
if match(raw_pron, "%[%[.-%]%]") then | |||
error("IPA input must not contain wikilinks.") | |||
end | end | ||
raw_pron = decode_entities(raw_pron) | |||
-- | -- Detect the type of transcription. | ||
local pron, repr, opening, closing, reconstructed = determine_repr(raw_pron) | |||
-- Strip any reconstruction asterisk and representation marks. | |||
pron = sub(pron, #opening + 1 + (reconstructed and 1 or 0), -#closing - 1) | |||
if | if repr ~= "orthographic" and lang and lang:getCode() == "en" and hasInvalidSeparators(pron) then | ||
insert(categories, "IPA pronunciations with | insert(categories, "English IPA pronunciations with invalid separators") | ||
end | end | ||
-- Check for obsolete and nonstandard symbols | -- Check for obsolete and nonstandard symbols | ||
for | for _, symbol in ipairs(m_data.nonstandard) do | ||
local result | local result | ||
for nonstandard in gmatch(pron, symbol) do | for nonstandard in gmatch(pron, symbol) do | ||
| Line 430: | Line 416: | ||
end | end | ||
insert(result, nonstandard) | insert(result, nonstandard) | ||
end | end | ||
| Line 446: | Line 429: | ||
3. bolding | 3. bolding | ||
4. italics | 4. italics | ||
5 | 5. asterisk at beginning of transcription | ||
6. comma followed by spacing characters | |||
7. superscripts enclosed in superscript parentheses ]] | |||
local found_HTML | local found_HTML | ||
local result = gsub( | local result = gsub(pron, "<(%a+)[^>]*>([^<]+)</%1>", | ||
function(tagName, content) | function(tagName, content) | ||
found_HTML = true | found_HTML = true | ||
| Line 458: | Line 440: | ||
result = gsub(result, "'''([^']*)'''", "%1") | result = gsub(result, "'''([^']*)'''", "%1") | ||
result = gsub(result, "''([^']*)''", "%1") | result = gsub(result, "''([^']*)''", "%1") | ||
result = gsub(result, "^%*", "") | result = gsub(result, "^%*", "") | ||
result = ugsub(result, ",%s+", "") | result = ugsub(result, ",%s+", "") | ||
| Line 470: | Line 451: | ||
pron = gsub(pron, vs15, "") | pron = gsub(pron, vs15, "") | ||
end | end | ||
pron = ugsub(pron, | pron = ugsub(pron, vs15_class, "%0" .. vs15) | ||
end | end | ||
if result ~= "" then | if result ~= "" then | ||
local suggestions | if lang then | ||
-- Get the per_lang_valid data, and convert any per-language valid sequences to spaces. | |||
local per_lang_valid = m_symbols.per_lang_valid[lang:getCode()] | |||
if per_lang_valid then | |||
if type(per_lang_valid) == "table" then | |||
for _, pattern in pairs(per_lang_valid) do | |||
result = ugsub(result, pattern, " ") | |||
end | |||
else -- Should be a string. | |||
result = ugsub(result, per_lang_valid, " ") | |||
end | |||
end | |||
end | |||
local suggestions | |||
-- Check for any invalid sequences, excluding anything in the per-language lookup table. | |||
for k, v in pairs(m_symbols.invalid) do | for k, v in pairs(m_symbols.invalid) do | ||
if find(result, k, | if find(result, k, nil, true) then | ||
if not suggestions then | |||
suggestions = {} | |||
end | |||
insert(suggestions, k .. " with " .. v) | insert(suggestions, k .. " with " .. v) | ||
end | end | ||
end | end | ||
if suggestions[1] then | if suggestions and suggestions[1] then | ||
suggestions = listToText(suggestions) | suggestions = listToText(suggestions) | ||
if is_content_page then | if is_content_page then | ||
error("Invalid IPA: replace " .. suggestions) | error("Invalid IPA: replace " .. suggestions) | ||
end | end | ||
insert(err, "replace " .. suggestions) | |||
end | end | ||
-- Convert any valid character sequences to spaces | |||
for _, pattern in pairs(m_symbols.valid) do | |||
result = ugsub(result, pattern, " ") | |||
end | end | ||
end | end | ||
if | if (repr == "phonemic" or repr == "rhyme") and lang and m_data.phonemes[lang:getCode()] then | ||
local valid_phonemes = m_data.phonemes[lang:getCode()] | |||
local rest = pron | |||
local phonemes = {} | |||
while #rest > 0 do | |||
local longestmatch, longestmatch_len = "", 0 | |||
local | |||
local rest_init = sub(rest, 1, 1) | |||
if rest_init == "(" or rest_init == ")" then | |||
longestmatch = rest_init | |||
longestmatch_len = 1 | |||
else | |||
for _, phoneme in ipairs(valid_phonemes) do | |||
local phoneme_len = len(phoneme) | |||
if phoneme_len > longestmatch_len and usub(rest, 1, phoneme_len) == phoneme then | |||
longestmatch = phoneme | |||
longestmatch_len = len(longestmatch) | |||
end | end | ||
end | end | ||
end | |||
if longestmatch_len > 0 then | |||
insert(phonemes, longestmatch) | |||
rest = usub(rest, longestmatch_len + 1) | |||
else | |||
local phoneme = usub(rest, 1, 1) | |||
insert(phonemes, "<span style=\"color: var(--wikt-palette-red,red)\">" .. phoneme .. "</span>") | |||
rest = usub(rest, 2) | |||
end | end | ||
end | end | ||
pron = concat(phonemes) | |||
end | end | ||
return pron | return (reconstructed and "*" or "") .. opening .. pron .. closing | ||
end | end | ||
| Line 595: | Line 560: | ||
end | end | ||
if err[1] then | if err[1] and is_preview() then | ||
err = '<span class=" | err = '<span class="error" style="font-size: small;> ' .. concat(err, ", ") .. "</span>" | ||
else | else | ||
err = "" | err = "" | ||
end | end | ||
return process_maybe_split_categories(split_output, categories, '<span class="IPA">' .. pron .. "</span>", lang, | return process_maybe_split_categories(split_output, categories, '<span class="IPA nowrap">' .. pron .. "</span>", lang, | ||
err) | err) | ||
end | end | ||
| Line 608: | Line 573: | ||
Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to | Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to | ||
[[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The | [[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The | ||
pronunciations are formatted by wrapping them in the | pronunciations are formatted by wrapping them in the `AHD` and `enPR` CSS classes and adding any left and | ||
right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular | right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular | ||
and accent qualifiers. There is a single parameter `data`, an object with the following fields: | and accent qualifiers. There is a single parameter `data`, an object with the following fields: | ||
| Line 625: | Line 590: | ||
]==] | ]==] | ||
function export.format_enPR_full(data) | function export.format_enPR_full(data) | ||
local prefix = "[[Appendix:English pronunciation|enPR]]: " | local prefix = "[[wikt:Appendix:English pronunciation|enPR]]: " | ||
local lang = require("Module:languages").getByCode("en") | local lang = require("Module:languages").getByCode("en") | ||
local parts = {} | local parts = {} | ||