Module:IPA: Difference between revisions

no edit summary
(Undo revision 410245 by Sware (talk))
Tag: Undo
No edit summary
Line 3: Line 3:
local force_cat = false -- for testing
local force_cat = false -- for testing


local m_data = mw.loadData("Module:IPA/data")
local m_str_utils = require("Module:string utilities")
local m_symbols = mw.loadData("Module:IPA/data/symbols")
local pron_qualifier_module = "Module:pron qualifier"
local pron_qualifier_module = "Module:pron qualifier"
local qualifier_module = "Module:qualifier"
local qualifier_module = "Module:qualifier"
local references_module = "Module:references"
local references_module = "Module:references"
local string_utilities_module = "Module:string utilities"
local syllables_module = "Module:syllables"
local syllables_module = "Module:syllables"
local utilities_module = "Module:utilities"
local utilities_module = "Module:utilities"
local m_data = mw.loadData("Module:IPA/data")
local m_str_utils = require(string_utilities_module)
local m_syllables -- [[Module:syllables]]; loaded below if needed
local m_syllables -- [[Module:syllables]]; loaded below if needed
local m_symbols = mw.loadData("Module:IPA/data/symbols")


local concat = table.concat
local concat = table.concat
local decode_entities = m_str_utils.decode_entities
local find = string.find
local find = string.find
local gmatch = m_str_utils.gmatch
local gmatch = m_str_utils.gmatch
Line 23: Line 26:
local sub = string.sub
local sub = string.sub
local u = m_str_utils.char
local u = m_str_utils.char
local ufind = m_str_utils.find
local ugsub = m_str_utils.gsub
local ugsub = m_str_utils.gsub
local umatch = m_str_utils.match
local umatch = m_str_utils.match
Line 113: Line 115:
prefix_text = '<span class="error">' .. err .. '</span>'
prefix_text = '<span class="error">' .. err .. '</span>'
else
else
prefix_text = langname .. " pronunciation"
if hasKey[lang:getCode()] then
prefix_text = "Appendix:" .. langname .. " pronunciation"
else
prefix_text = "wikipedia:" .. langname .. " phonology"
end
prefix_text = "[[" .. prefix_text .. "|key]]"
prefix_text = "[[" .. prefix_text .. "|key]]"
end
end
Line 155: Line 161:


local function determine_repr(pron)
local function determine_repr(pron)
local repr_mark = {}
local reconstructed
local repr, reconstructed
 
-- remove initial asterisk before representation marks, used on some Reconstruction pages
-- remove initial asterisk before representation marks, used on some Reconstruction pages
if sub(pron, 1, 1) == "*" then
if sub(pron, 1, 1) == "*" then
Line 163: Line 168:
pron = sub(pron, 2)
pron = sub(pron, 2)
end
end
 
local representation_types = {
local opening = match(pron, "^.[\128-\191]*")
['/'] = { right = '/', type = 'phonemic', },
local data = m_data.representation_types[opening]
['['] = { right = ']', type = 'phonetic', },
['⟨'] = { right = '⟩', type = 'orthographic', },
if data then
['-'] = { type = 'rhyme' },
local closing = data[2]
}
if data and match(pron, closing .. "$", #opening + 1) then
 
return data[1], opening, closing, reconstructed
repr_mark.i, repr_mark.f, repr_mark.left, repr_mark.right = ufind(pron, '^(.).-(.)$')
 
local representation_type = representation_types[repr_mark.left]
 
if representation_type then
if representation_type.right then
if repr_mark.right == representation_type.right then
repr = representation_type.type
end
else
repr = representation_type.type
end
end
else
repr = nil
end
end
 
return repr, reconstructed
return nil, "", "", reconstructed
end
end


local function hasInvalidSeparators(transcription)
local function hasInvalidSeparators(transcription)
if match(transcription, "%.\203[\136\140]") then -- [ˈˌ]
if umatch(transcription, "%.[ˈˌ]") or umatch(transcription, "[ˈˌ][ .]") then
return true
return true
else
else
Line 339: Line 331:
local langcode = lang:getCode()
local langcode = lang:getCode()
if m_data.langs_to_generate_syllable_count_categories[langcode] then
if m_data.langs_to_generate_syllable_count_categories[langcode] then
local phonemic, phonetic = split_phonemic_phonetic(item.pron)
local phonemic, phonetic, use_it = split_phonemic_phonetic(item.pron)
local use_it
local repr = determine_repr(phonemic)
if not phonetic then -- not a '/.../ [...]' combined pronunciation
if not phonetic then -- not a '/.../ [...]' combined pronunciation
local repr = determine_repr(phonemic)
if m_data.langs_to_use_phonetic_notation[langcode] then
if m_data.langs_to_use_phonetic_notation[langcode] then
use_it = repr == "phonetic" and phonemic or nil
use_it = repr == "phonetic" and phonemic or nil
Line 367: Line 358:
-- into its parts in order to process.
-- into its parts in order to process.
if lang:getCode() == "en" and hasInvalidSeparators(item.pron) then
if lang:getCode() == "en" and hasInvalidSeparators(item.pron) then
insert(categories, "IPA for English using .ˈ or .ˌ")
insert(categories, "English IPA pronunciations with invalid separators")
end
end
end
end
Line 383: Line 374:
]=]
]=]
local function format_one_IPA(lang, pron, err, categories)
local function format_one_IPA(lang, pron, err, categories)
-- Remove wikilinks, so that wikilink brackets are not misinterpreted as indicating phonetic transcription
-- Disallow wikilinks.
local without_links = gsub(pron, "%[%[[^|%]]+|([^%]]+)%]%]", "%1")
if match(pron, "%[%[.-%]%]") then
without_links = gsub(without_links, "%[%[[^%]]+%]%]", "%1")
error("IPA input must not contain wikilinks.")
 
-- Detect whether this is a phonemic or phonetic transcription
local repr, reconstructed = determine_repr(without_links)
 
if reconstructed then
pron = sub(pron, 2)
without_links = sub(without_links, 2)
end
end
pron = decode_entities(pron)


-- If valid, strip the representation marks
-- Detect the type of transcription.
if repr == "phonemic" then
local repr, opening, closing, reconstructed = determine_repr(pron)
pron = usub(pron, 2, -2)
without_links = usub(without_links, 2, -2)
-- Strip any reconstruction asterisk and representation marks.
elseif repr == "phonetic" then
pron = sub(pron, #opening + 1 + (reconstructed and 1 or 0), -#closing - 1)
pron = usub(pron, 2, -2)
if not repr then
without_links = usub(without_links, 2, -2)
elseif repr == "orthographic" then
pron = usub(pron, 2, -2)
without_links = usub(without_links, 2, -2)
elseif repr == "rhyme" then
pron = usub(pron, 2)
without_links = usub(without_links, 2)
else
insert(categories, "IPA pronunciations with invalid representation marks")
insert(categories, "IPA pronunciations with invalid representation marks")
-- insert(err, "invalid representation marks")
-- insert(err, "invalid representation marks")
Line 419: Line 397:


-- Check for obsolete and nonstandard symbols
-- Check for obsolete and nonstandard symbols
for i, symbol in ipairs(m_data.nonstandard) do
for _, symbol in ipairs(m_data.nonstandard) do
local result
local result
for nonstandard in gmatch(pron, symbol) do
for nonstandard in gmatch(pron, symbol) do
Line 442: Line 420:
3. bolding
3. bolding
4. italics
4. italics
5. HTML entity for space
5. asterisk at beginning of transcription
6. asterisk at beginning of transcription
6. comma followed by spacing characters
7. comma followed by spacing characters
7. superscripts enclosed in superscript parentheses ]]
8. superscripts enclosed in superscript parentheses ]]
local found_HTML
local found_HTML
local result = gsub(without_links, "<(%a+)[^>]*>([^<]+)</%1>",
local result = gsub(pron, "<(%a+)[^>]*>([^<]+)</%1>",
function(tagName, content)
function(tagName, content)
found_HTML = true
found_HTML = true
Line 454: Line 431:
result = gsub(result, "'''([^']*)'''", "%1")
result = gsub(result, "'''([^']*)'''", "%1")
result = gsub(result, "''([^']*)''", "%1")
result = gsub(result, "''([^']*)''", "%1")
result = gsub(result, "&[^;]+;", "") -- This may catch things that are not valid character entities.
result = gsub(result, "^%*", "")
result = gsub(result, "^%*", "")
result = ugsub(result, ",%s+", "")
result = ugsub(result, ",%s+", "")
Line 466: Line 442:
pron = gsub(pron, vs15, "")
pron = gsub(pron, vs15, "")
end
end
pron = ugsub(pron, "(" .. vs15_class .. ")", "%1" .. vs15)
pron = ugsub(pron, vs15_class, "%0" .. vs15)
end
end


Line 505: Line 481:
end
end


if repr == "phonemic" or repr == "rhyme" then
if (repr == "phonemic" or repr == "rhyme") and lang and m_data.phonemes[lang:getCode()] then
if lang and m_data.phonemes[lang:getCode()] then
local valid_phonemes = m_data.phonemes[lang:getCode()]
local valid_phonemes = m_data.phonemes[lang:getCode()]
local rest = pron
local rest = pron
local phonemes = {}
local phonemes = {}


while #rest > 0 do
while #rest > 0 do
local longestmatch, longestmatch_len = "", 0
local longestmatch, longestmatch_len = "", 0


local rest_init = sub(rest, 1, 1)
local rest_init = sub(rest, 1, 1)
if rest_init == "(" or rest_init == ")" then
if rest_init == "(" or rest_init == ")" then
longestmatch = rest_init
longestmatch = rest_init
longestmatch_len = 1
longestmatch_len = 1
else
else
for _, phoneme in ipairs(valid_phonemes) do
for _, phoneme in ipairs(valid_phonemes) do
local phoneme_len = len(phoneme)
local phoneme_len = len(phoneme)
if phoneme_len > longestmatch_len and usub(rest, 1, phoneme_len) == phoneme then
if phoneme_len > longestmatch_len and usub(rest, 1, phoneme_len) == phoneme then
longestmatch = phoneme
longestmatch = phoneme
longestmatch_len = len(longestmatch)
longestmatch_len = len(longestmatch)
end
end
end
end
end
end


if longestmatch_len > 0 then
if longestmatch_len > 0 then
insert(phonemes, longestmatch)
insert(phonemes, longestmatch)
rest = usub(rest, longestmatch_len + 1)
rest = usub(rest, longestmatch_len + 1)
else
else
local phoneme = usub(rest, 1, 1)
local phoneme = usub(rest, 1, 1)
insert(phonemes, "<span style=\"color: red\">" .. phoneme .. "</span>")
insert(phonemes, "<span style=\"color: red\">" .. phoneme .. "</span>")
rest = usub(rest, 2)
rest = usub(rest, 2)
insert(categories, "IPA pronunciations with invalid phonemes/" .. lang:getCode())
insert(categories, "IPA pronunciations with invalid phonemes/" .. lang:getCode())
end
end
end
pron = concat(phonemes)
end
if repr == "phonemic" then
pron = "/" .. pron .. "/"
else
pron = "-" .. pron
end
end
elseif repr == "phonetic" then
pron = "[" .. pron .. "]"
elseif repr == "orthographic" then
pron = "⟨" .. pron .. "⟩"
end


if reconstructed then
pron = concat(phonemes)
pron = "*" .. pron
end
end


return pron
return (reconstructed and "*" or "") .. opening .. pron .. closing
end
end