Module:IPA/data: Difference between revisions

No edit summary
No edit summary
 
(One intermediate revision by the same user not shown)
Line 1: Line 1:
local list_to_set = require("Module:table").listToSet
local data = {}
local data = {}
--[=[
A list of representation types (e.g. /foo/ for phonemic and [bar] for phonetic),
given as a table. The key is the opening character, the first value the
representation type, and the second value the closing symbol.]=]
data.representation_types = {
["/"] = {"phonemic", "/"},
["["] = {"phonetic", "]"},
["⫽"] = {"morphophonemic", "⫽"},
["⟨"] = {"orthographic", "⟩"},
["-"] = {"rhyme", ""},
}
--[=[
A list of convenience inputs for certain representation types. The key is the
opening character, and the table is a three-item array consisting of (1) an
mw.ustring.gsub pattern which is anchored to the start and end of the string,
with a single capture group that excludes the characters to be substituted,
(2) a corresponding replacement pattern to be used with the pattern, and (3) the
replacement opening character.]=]
data.representation_subs = {
["<"] = {"^<(.*)>$", "⟨%1⟩", "⟨"},
["/"] = {"^//(.*)//$", "⫽%1⫽", "⫽"},
}


--[=[
--[=[
Line 7: Line 33:
the generated pronunciation links to such pages; for other languages, it links
the generated pronunciation links to such pages; for other languages, it links
to the "LANG phonology" page in Wikipedia (which may or may not exist).
to the "LANG phonology" page in Wikipedia (which may or may not exist).
[[Module:IPA]] is responsible for this linking; see format_IPA_full().
[[Module:IPA]] is responsible for this linking; see format_IPA_full().]=]
]=]
data.langs_with_infopages = list_to_set{
local langs_with_infopages = {
"acw",
"acw",
"ady",
"ady",
Line 41: Line 66:
"ga",
"ga",
"gd",
"gd",
    "gmh",
    "gmw-msc",
"got",
"got",
"he",
"he",
Line 47: Line 74:
"hu",
"hu",
"hy",
"hy",
    "id",
"id",
"ii",
"ii",
"is",
"is",
Line 107: Line 134:
"wlm",
"wlm",
"yi",
"yi",
"yrl",
"yue",
"yue",
"zlw-mas"
"zlw-mas"
}
}
data.langs_with_infopages = {}
-- Convert the list in `langs_with_infopages` to a set.
for _, langcode in ipairs(langs_with_infopages) do
data.langs_with_infopages[langcode] = true
end


--[=[
--[=[
Line 133: Line 154:
NOTE: There are some additional languages that have these categories.
NOTE: There are some additional languages that have these categories.
For example:
For example:
* Thai words have these categories added by [[Module:th-pron]].
* Thai words have these categories added by [[Module:th-pron]].]=]
]=]
data.diphthongs = {
data.diphthongs = {
["cs"] = { -- [[w:Czech phonology#Diphthongs]]
["cs"] = { -- [[w:Czech phonology#Diphthongs]]
Line 153: Line 173:
"[aʌ][ʊɪ]ə", -- May be a disyllabic sequence in some or all dialects?
"[aʌ][ʊɪ]ə", -- May be a disyllabic sequence in some or all dialects?
},
},
["eo"] = {
"[aeiou][iu]̯",
},
["grc"] = {
["grc"] = {
"[aeyo]i",
"[aeyo]i",
Line 171: Line 194:
"[aeɛoɔu]i",
"[aeɛoɔu]i",
"[aeɛioɔ]u",
"[aeɛioɔ]u",
},
["la"] = {
"[eaou]i",
"[eao]u",
"[ao]e",
},
},
["lb"] = {
["lb"] = {
Line 188: Line 206:
"LANG #-syllable words", e.g. [[:Category:Russian 3-syllable words]], should be
"LANG #-syllable words", e.g. [[:Category:Russian 3-syllable words]], should be
generated. Do not list languages here if they have an entry above under
generated. Do not list languages here if they have an entry above under
`data.diphthongs`; such languages are automatically added to this list.
`data.diphthongs`; such languages are automatically added to this list.]=]
]=]
local langs_to_generate_syllable_count_categories = list_to_set{
local langs_to_generate_syllable_count_categories = {
"ar", -- Arabic has diphthongs, but they are transcribed
"ar", -- Arabic has diphthongs, but they are transcribed
-- with semivowel symbols.
-- with semivowel symbols.
"ary", -- Moroccan Arabic has diphthongs, but they are transcribed
"ary", -- Moroccan Arabic has diphthongs, but they are transcribed
-- with semivowel symbols.
-- with semivowel symbols.
"bg",  -- Bulgarian has diphthongs with /j/ and marginally with /w/,
        -- but these are semivowels.
"ca", -- Catalan has diphthongs, but they are generally transcribed using
"ca", -- Catalan has diphthongs, but they are generally transcribed using
-- /w/ and /j/, so do not need to be listed (see [[w:Catalan language#Diphthongs and triphthongs]].
-- /w/ and /j/, so do not need to be listed (see [[w:Catalan language#Diphthongs and triphthongs]].
"es", -- Spanish has diphthongs, but they are transcribed with i̯ etc.
"es", -- Spanish has diphthongs, but they are transcribed with i̯ etc.
"eu",  -- Basque has dipthongs, but they are transcribed with i̯ and u̯.
"fi", -- Finnish has diphthongs, but they are now automatically transcribed with
"fi", -- Finnish has diphthongs, but they are now automatically transcribed with
-- the nonsyllabic diacritic
-- the nonsyllabic diacritic
"fr", -- French has diphthongs, but they are transcribed
"fr", -- French has diphthongs, but they are transcribed
-- with semivowel symbols: [[w:French phonology#Glides and diphthongs]].
-- with semivowel symbols: [[w:French phonology#Glides and diphthongs]].
    "id",   -- Indonesian has diphthongs, but they are transcribed with i̯ or /j/ etc.
"hnn",
"id", -- Indonesian has diphthongs, but they are transcribed with i̯ or /j/ etc.
"ka",
"ka",
    "kmr",
"kne",
"kmr",
"ku",
"ku",
"la", -- All diphthongs transcribed with e̯ or /j/ etc.
"mk",
"mk",
    "ms",   -- Malay has diphthongs, but they are transcribed with i̯ or /j/ etc.
"ms", -- Malay has diphthongs, but they are transcribed with i̯ or /j/ etc.
    "mt", -- Maltese has diphthongs, but they are transcribed
"mt", -- Maltese has diphthongs, but they are transcribed
-- with semivowel symbols.
-- with semivowel symbols.
"pl",   -- No diphthongs, properly speaking; sequences of a vowel and /w/ or /j/ though.
"pl", -- No diphthongs, properly speaking; sequences of a vowel and /w/ or /j/ though.
"pt", -- Portuguese has diphthongs, but they are transcribed with i̯ or /j/ etc.
"pt", -- Portuguese has diphthongs, but they are transcribed with i̯ or /j/ etc.
"rsk", -- No diphthongs but there are sequences of vowel and /j/ or /w/.
"ru", -- No diphthongs, properly speaking; sequences of a vowel and /j/ though.
"ru", -- No diphthongs, properly speaking; sequences of a vowel and /j/ though.
"sk", -- Slovak has rising diphthongs, /i̯e, i̯a, i̯u, u̯o/, which are probably always spelled with the nonsyllabic diacritic, so do not need to be listed.
"sk", -- Slovak has rising diphthongs, /i̯e, i̯a, i̯u, u̯o/, which are probably always spelled with the nonsyllabic diacritic, so do not need to be listed.
"sl", -- No diphthongs, properly speaking; sequences of a vowel, /j/ and /w/ though
"sl", -- No diphthongs, properly speaking; sequences of a vowel, /j/ and /w/ though
"sq", -- [[w:Albanian language#Vowels]] doesn't mention anything about diphthongs.
"sq", -- [[w:Albanian language#Vowels]] doesn't mention anything about diphthongs.
    "tl",   -- Tagalog has diphthongs, but they are transcribed with i̯ or /j/ etc.
"tl", -- Tagalog has diphthongs, but they are transcribed with i̯ or /j/ etc
"tsg",
"ug", -- No diphthongs.
"ug", -- No diphthongs.
}
}
data.langs_to_generate_syllable_count_categories = {}
-- Convert the list in `langs_to_generate_syllable_count_categories` to a set.
for _, langcode in ipairs(langs_to_generate_syllable_count_categories) do
data.langs_to_generate_syllable_count_categories[langcode] = true
end
-- Also add languages listed under `data.diphthongs`.
-- Also add languages listed under `data.diphthongs`.
for langcode, _ in pairs(data.diphthongs) do
for langcode, _ in pairs(data.diphthongs) do
data.langs_to_generate_syllable_count_categories[langcode] = true
langs_to_generate_syllable_count_categories[langcode] = true
end
end
 
data.langs_to_generate_syllable_count_categories = langs_to_generate_syllable_count_categories


-- Languages to use the phonetic not phonemic notation to compute syllables counts.
-- Languages to use the phonetic not phonemic notation to compute syllables counts.
local langs_to_use_phonetic_notation = {
data.langs_to_use_phonetic_notation = list_to_set{
    "bg",
"es",
"es",
"id",
"la",
"mk",
"mk",
"ms",
"rsk",
"ru",
"ru",
}
}
data.langs_to_use_phonetic_notation = {}
-- Convert the list in `langs_to_use_phonetic_notation` to a set.
for _, langcode in ipairs(langs_to_use_phonetic_notation) do
data.langs_to_use_phonetic_notation[langcode] = true
end


-- Non-standard or obsolete IPA symbols.
-- Non-standard or obsolete IPA symbols.
Line 252: Line 267:
so we can't put them in the line below. ]]
so we can't put them in the line below. ]]
"ɑ̢", "ɔ̗", "ɔ̖",
"ɑ̢", "ɔ̗", "ɔ̖",
"[?ƍσƺƪƞƛłščžǰǧǯẋᵻᵿⱻʚω∅ØȣᴀᴇⱻQKPT]"
"[?ƍσƺƪƞƛłščžǰǧǯẋⱻʚω∅ØȣᴀᴇⱻQKPT]"
}
}


Line 278: Line 293:
"a", "b", "d", "d͡ʒ", "d͡z", "e", "f", "h", "i", "j", "k",
"a", "b", "d", "d͡ʒ", "d͡z", "e", "f", "h", "i", "j", "k",
"l", "m", "n", "o", "p", "r", "s", "t", "t͡s", "t͡ʃ",
"l", "m", "n", "o", "p", "r", "s", "t", "t͡s", "t͡ʃ",
"u", "v", "w", "x", "z", "ɡ", "ʃ", "ʒ",
"u", "u̯", "v", "w", "x", "z", "ɡ", "ʃ", "ʒ",
"ˈ", ".", " ", "-",
"ˈ", ".", " ", "-",
}
}

Latest revision as of 16:01, 17 November 2025



local list_to_set = require("Module:table").listToSet

local data = {}

--[=[
A list of representation types (e.g. /foo/ for phonemic and [bar] for phonetic),
given as a table. The key is the opening character, the first value the
representation type, and the second value the closing symbol.]=]
data.representation_types = {
	["/"] = {"phonemic", "/"},
	["["] = {"phonetic", "]"},
	["⫽"] = {"morphophonemic", "⫽"},
	["⟨"] = {"orthographic", "⟩"},
	["-"] = {"rhyme", ""},
}

--[=[
A list of convenience inputs for certain representation types. The key is the
opening character, and the table is a three-item array consisting of (1) an
mw.ustring.gsub pattern which is anchored to the start and end of the string,
with a single capture group that excludes the characters to be substituted,
(2) a corresponding replacement pattern to be used with the pattern, and (3) the
replacement opening character.]=]
data.representation_subs = {
	["<"] = {"^<(.*)>$", "⟨%1⟩", "⟨"},
	["/"] = {"^//(.*)//$", "⫽%1⫽", "⫽"},
}

--[=[
This should list the language codes of all languages that have a pronunciation
page in the appendix of the form ''Appendix:LANG pronunciation'', e.g.
[[Appendix:Russian pronunciation]]. For these languages, the text "key" next to
the generated pronunciation links to such pages; for other languages, it links
to the "LANG phonology" page in Wikipedia (which may or may not exist).
[[Module:IPA]] is responsible for this linking; see format_IPA_full().]=]
data.langs_with_infopages = list_to_set{
	"acw",
	"ady",
	"ang",
	"arc",
	"ba",
	"bg",
	"bo",
	"ca",
	"cho",
	"cmn",
	"cs",
	"cv",
	"cy",
	"da",
	"de",
	"dsb",
	"dz",
	"egl",
	"egy",
	"el",
	"en",
	"enm",
	"eo",
	"es",
	"fa",
	"fi",
	"fo",
	"fr",
	"fy",
	"ga",
	"gd",
    "gmh",
    "gmw-msc",
	"got",
	"he",
	"hi",
	"hrx",
	"hu",
	"hy",
	"id",
	"ii",
	"is",
	"it",
	"iu",
	"ja",
	"jbo",
	"ka",
	"kls",
	"ko",
	"kw",
	"la",
	"lb",
	"liv",
	"lt",
	"lv",
	"mdf",
	"mfe",
	"mic",
	"mk",
	"mns-nor",
	"ms",
	"mt",
	"mul",
	"my",
	"nan",
	"nci",
	"nl",
	"nn",
	"no",
	"nov",
	"nv",
	"pjt",
	"pl",
	"ps",
	"pt",
	"ro",
	"ru",
	"scn",
	"sco",
	"sga",
	"sh",
	"sl",
	"sq",
	"sv",
	"sw",
	"syc",
	"szl",
	"tg",
	"th",
	"tl",
	"tpw",
	"tr",
	"tyv",
	"ug",
	"uk",
	"vi",
	"vo",
	"wlm",
	"yi",
	"yrl",
	"yue",
	"zlw-mas"
}

--[=[
This should list the diphthongs of a language (in the form of Lua patterns),
provided they do *NOT* contain semivowel symbols such as /j w ɰ ɥ/ or vowels
with nonsyllabic diacritics such as /i̯ u̯/. For example, list /au/ or /aʊ/,
but do not list /aw/ or /au̯/. The data in this table is used to count the
number of syllables in a word. [[Module:syllables]] automatically knows how
to correctly handle semivowel symbols and nonsyllabic diacritics.

Any language listed here will automatically have categories of the form
"LANG #-syllable words" generated. In addition, any language listed below under
`langs_to_generate_syllable_count_categories` will also have such categories
generated.

NOTE: There are some additional languages that have these categories.
For example:
* Thai words have these categories added by [[Module:th-pron]].]=]
data.diphthongs = {
	["cs"] = { -- [[w:Czech phonology#Diphthongs]]
		"[aeo]u",
		},
	["de"] = {
		"a[ɪʊ]",
		"ɔ[ʏɪ]",
		},
	["en"] = { -- from [[Appendix:English pronunciation]] mostly, but /ʌɪ/ is from the OED
		"[aɑeɛoɔʌ][ɪi]",
		"[ɑɒæo]e",
		"[əɐ]ʉ",
		"[aɒəoɔæ]ʊ",
		"æo",
		"[ɛeɪiɔʊʉ]ə",	-- /iə/ is a diphthong in NZE, but a disyllabic sequence in GA.
						-- /ɪə/ is both a disyllabic sequence and a diphthong in old-fashioned RP.
		"[aʌ][ʊɪ]ə",	-- May be a disyllabic sequence in some or all dialects?
		},
	["eo"] = {
		"[aeiou][iu]̯",
	},
	["grc"] = {
		"[aeyo]i",
		"[ae]u",
		"[ɛɔa]ː[iu]",
		},
	["hrx"] = {
		"aɪ̯",
		"aʊ̯",
		"oɪ̯",
		"eʊ̯",
	},
	["is"] = {			-- [[w:Icelandic phonology#Vowels]]
		"[aeø][iɪy]",	-- Wikipedia is oddly specific about the second element: ei and ai, but øɪ.
		"[ao]u",
		},
	["it"] = {
		"[aeɛoɔu]i",
		"[aeɛioɔ]u",
		},
	["lb"] = {
		"[iu]ə",
		"[ɜoæɑ]ɪ",
		"[əæɑ]ʊ",
	},
}

--[=[
This should list any languages for which categories of the form
"LANG #-syllable words", e.g. [[:Category:Russian 3-syllable words]], should be
generated. Do not list languages here if they have an entry above under
`data.diphthongs`; such languages are automatically added to this list.]=]
local langs_to_generate_syllable_count_categories = list_to_set{
	"ar",	-- Arabic has diphthongs, but they are transcribed
			-- with semivowel symbols.
	"ary",	-- Moroccan Arabic has diphthongs, but they are transcribed
			-- with semivowel symbols.
	"bg",   -- Bulgarian has diphthongs with /j/ and marginally with /w/,
	        -- but these are semivowels.
	"ca",	-- Catalan has diphthongs, but they are generally transcribed using
			-- /w/ and /j/, so do not need to be listed (see [[w:Catalan language#Diphthongs and triphthongs]].
	"es",	-- Spanish has diphthongs, but they are transcribed with i̯ etc.
	"eu",   -- Basque has dipthongs, but they are transcribed with i̯ and u̯.
	"fi",	-- Finnish has diphthongs, but they are now automatically transcribed with
			-- the nonsyllabic diacritic
	"fr",	-- French has diphthongs, but they are transcribed
			-- with semivowel symbols: [[w:French phonology#Glides and diphthongs]].
	"hnn",
	"id",	-- Indonesian has diphthongs, but they are transcribed with i̯ or /j/ etc.
	"ka",
	"kne",
	"kmr",
	"ku",
	"la",	-- All diphthongs transcribed with e̯ or /j/ etc.
	"mk",
	"ms",	-- Malay has diphthongs, but they are transcribed with i̯ or /j/ etc.
	"mt",	-- Maltese has diphthongs, but they are transcribed
			-- with semivowel symbols.
	"pl",	-- No diphthongs, properly speaking; sequences of a vowel and /w/ or /j/ though.
	"pt",	-- Portuguese has diphthongs, but they are transcribed with i̯ or /j/ etc.
	"rsk",	-- No diphthongs but there are sequences of vowel and /j/ or /w/.
	"ru",	-- No diphthongs, properly speaking; sequences of a vowel and /j/ though.
	"sk",	-- Slovak has rising diphthongs, /i̯e, i̯a, i̯u, u̯o/, which are probably always spelled with the nonsyllabic diacritic, so do not need to be listed.
	"sl",	-- No diphthongs, properly speaking; sequences of a vowel, /j/ and /w/ though
	"sq",	-- [[w:Albanian language#Vowels]] doesn't mention anything about diphthongs.
	"tl",	-- Tagalog has diphthongs, but they are transcribed with i̯ or /j/ etc
	"tsg",
	"ug",	-- No diphthongs.
}
-- Also add languages listed under `data.diphthongs`.
for langcode, _ in pairs(data.diphthongs) do
	langs_to_generate_syllable_count_categories[langcode] = true
end
data.langs_to_generate_syllable_count_categories = langs_to_generate_syllable_count_categories

-- Languages to use the phonetic not phonemic notation to compute syllables counts.
data.langs_to_use_phonetic_notation = list_to_set{
    "bg",
	"es",
	"id",
	"la",
	"mk",
	"ms",
	"rsk",
	"ru",
}

-- Non-standard or obsolete IPA symbols.
data.nonstandard = {
	--[[	The following symbols consist of more than one character,
			so we can't put them in the line below.		]]
	"ɑ̢", "ɔ̗", "ɔ̖",
	"[?ƍσƺƪƞƛłščžǰǧǯẋⱻʚω∅ØȣᴀᴇⱻQKPT]"
}

-- See valid IPA characters at [[Module:IPA/data/symbols]].

data.phonemes = {}
data.phonemes["dz"] = {
	"m", "n", "ŋ",
	"p", "t", "ʈ", "k",
	"pʰ", "tʰ", "ʈʰ", "kʰ",
	"t͡s", "t͡ɕ",
	"t͡sʰ", "t͡ɕʰ",
	"w", "s", "z", "ɬ", "l", "r", "ɕ", "ʑ", "j", "h",
	"ɑ", "e", "i", "o", "u",
	"ɑː", "eː", "ɛː", "iː", "oː", "øː", "uː", "yː",
	"ɑ˥", "e˥", "i˥", "o˥", "u˥",
	"ɑː˥", "eː˥", "ɛː˥", "iː˥", "oː˥", "øː˥", "uː˥", "yː˥",
	"m˥", "n˥", "ŋ˥", "p˥", "k˥", "k̚˥", "w˥", "l˥", "r˥", "ɕ˥", "j˥", ")˥",
	"ɑ˩", "e˩", "i˩", "o˩", "u˩",
	"ɑː˩", "eː˩", "ɛː˩", "iː˩", "oː˩", "øː˩", "uː˩", "yː˩",
	"m˩", "n˩", "ŋ˩", "p˩", "k˩", "k̚˩", "w˩", "l˩", "r˩", "ɕ˩", "j˩", ")˩",
	".", ",", "-",
}
data.phonemes["eo"] = {
	"a", "b", "d", "d͡ʒ", "d͡z", "e", "f", "h", "i", "j", "k",
	"l", "m", "n", "o", "p", "r", "s", "t", "t͡s", "t͡ʃ",
	"u", "u̯", "v", "w", "x", "z", "ɡ", "ʃ", "ʒ",
	"ˈ", ".", " ", "-",
}
data.phonemes["hy"] = {
	"ɑ", "b", "ɡ", "d", "e", "z", "ə", "tʰ", "ʒ", "i", "l", "χ", "t͡s",
	"k", "h", "d͡z", "ʁ", "t͡ʃ", "m", "j", "n", "ʃ", "ɔ", "t͡ʃʰ", "p", "d͡ʒ",
	"r", "s", "v", "t", "ɾ", "t͡sʰ", "v", "pʰ", "kʰ", "o", "f", "ŋɡ", "ŋk",
	"ŋχ", "u", "œ", "ʏ", "ˈ", "ˌ", ".", " ", "ː", 
}
data.phonemes["nl"] = {
	"m", "n", "ŋ",
	"p", "b", "t", "d", "k", "ɡ",
	"f", "v", "s", "z", "ʃ", "ʒ", "x", "ɣ", "ɦ",
	"ʋ", "l", "j", "r",
	"ɪ", "ʏ", "ɛ", "ə", "ɔ", "ɑ",
	"i", "iː", "y", "yː", "u", "uː", "eː", "øː", "oː", "ɛː", "œː", "ɔː", "aː",
	"ɛi̯", "œy̯", "ɔi̯", "ɑu̯", "ɑi̯",
	"iu̯", "yu̯", "ui̯", "eːu̯", "oːi̯", "aːi̯",
	"ˈ", "ˌ", ".", " ", "-",
}
data.phonemes["mt"] = {
	"m", "n",
	"p", "t", "k", "ʔ",
	"b", "d", "ɡ",
	"t͡s", "t͡ʃ",
	"d͡z", "d͡ʒ",
	"f", "s", "ʃ", "ħ",
	"v", "z", "ʒ", "ɣ",
	"l", "j", "w",
	"r",
	"ɪ", "ɛ", "ɔ", "a", "u",
	"ɛˤ", "ɔˤ", "aˤ", "əˤ",
	"ɛˤː", "ɔˤː", "aˤː", "əˤː", "ɪˤː",
	"iː", "ɪː", "ɛː", "ɔː", "aː", "uː",
	"ˈ", "ˌ", ".", " ", "‿", "-"
}

return data