Module:IPA/data

This module lacks a documentation subpage. Please create it.
Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox
local list_to_set = require("Module:table").listToSet

local data = {}

--[=[
A list of representation types (e.g. /foo/ for phonemic and [bar] for phonetic),
given as a table. The key is the opening character, the first value the
representation type, and the second value the closing symbol.]=]
data.representation_types = {
	["/"] = {"phonemic", "/"},
	["["] = {"phonetic", "]"},
	["⫽"] = {"morphophonemic", "⫽"},
	["⟨"] = {"orthographic", "⟩"},
	["-"] = {"rhyme", ""},
}

--[=[
A list of convenience inputs for certain representation types. The key is the
opening character, and the table is a three-item array consisting of (1) an
mw.ustring.gsub pattern which is anchored to the start and end of the string,
with a single capture group that excludes the characters to be substituted,
(2) a corresponding replacement pattern to be used with the pattern, and (3) the
replacement opening character.]=]
data.representation_subs = {
	["<"] = {"^<(.*)>$", "⟨%1⟩", "⟨"},
	["/"] = {"^//(.*)//$", "⫽%1⫽", "⫽"},
}

--[=[
This should list the language codes of all languages that have a pronunciation
page in the appendix of the form ''Appendix:LANG pronunciation'', e.g.
[[Appendix:Russian pronunciation]]. For these languages, the text "key" next to
the generated pronunciation links to such pages; for other languages, it links
to the "LANG phonology" page in Wikipedia (which may or may not exist).
[[Module:IPA]] is responsible for this linking; see format_IPA_full().]=]
data.langs_with_infopages = list_to_set{
	"acw",
	"ady",
	"ang",
	"arc",
	"ba",
	"bg",
	"bo",
	"ca",
	"cho",
	"cmn",
	"cs",
	"cv",
	"cy",
	"da",
	"de",
	"dsb",
	"dz",
	"egl",
	"egy",
	"el",
	"en",
	"enm",
	"eo",
	"es",
	"fa",
	"fi",
	"fo",
	"fr",
	"fy",
	"ga",
	"gd",
    "gmh",
    "gmw-msc",
	"got",
	"he",
	"hi",
	"hrx",
	"hu",
	"hy",
	"id",
	"ii",
	"is",
	"it",
	"iu",
	"ja",
	"jbo",
	"ka",
	"kls",
	"ko",
	"kw",
	"la",
	"lb",
	"liv",
	"lt",
	"lv",
	"mdf",
	"mfe",
	"mic",
	"mk",
	"mns-nor",
	"ms",
	"mt",
	"mul",
	"my",
	"nan",
	"nci",
	"nl",
	"nn",
	"no",
	"nov",
	"nv",
	"pjt",
	"pl",
	"ps",
	"pt",
	"ro",
	"ru",
	"scn",
	"sco",
	"sga",
	"sh",
	"sl",
	"sq",
	"sv",
	"sw",
	"syc",
	"szl",
	"tg",
	"th",
	"tl",
	"tpw",
	"tr",
	"tyv",
	"ug",
	"uk",
	"vi",
	"vo",
	"wlm",
	"yi",
	"yrl",
	"yue",
	"zlw-mas"
}

--[=[
This should list the diphthongs of a language (in the form of Lua patterns),
provided they do *NOT* contain semivowel symbols such as /j w ɰ ɥ/ or vowels
with nonsyllabic diacritics such as /i̯ u̯/. For example, list /au/ or /aʊ/,
but do not list /aw/ or /au̯/. The data in this table is used to count the
number of syllables in a word. [[Module:syllables]] automatically knows how
to correctly handle semivowel symbols and nonsyllabic diacritics.

Any language listed here will automatically have categories of the form
"LANG #-syllable words" generated. In addition, any language listed below under
`langs_to_generate_syllable_count_categories` will also have such categories
generated.

NOTE: There are some additional languages that have these categories.
For example:
* Thai words have these categories added by [[Module:th-pron]].]=]
data.diphthongs = {
	["cs"] = { -- [[w:Czech phonology#Diphthongs]]
		"[aeo]u",
		},
	["de"] = {
		"a[ɪʊ]",
		"ɔ[ʏɪ]",
		},
	["en"] = { -- from [[Appendix:English pronunciation]] mostly, but /ʌɪ/ is from the OED
		"[aɑeɛoɔʌ][ɪi]",
		"[ɑɒæo]e",
		"[əɐ]ʉ",
		"[aɒəoɔæ]ʊ",
		"æo",
		"[ɛeɪiɔʊʉ]ə",	-- /iə/ is a diphthong in NZE, but a disyllabic sequence in GA.
						-- /ɪə/ is both a disyllabic sequence and a diphthong in old-fashioned RP.
		"[aʌ][ʊɪ]ə",	-- May be a disyllabic sequence in some or all dialects?
		},
	["eo"] = {
		"[aeiou][iu]̯",
	},
	["grc"] = {
		"[aeyo]i",
		"[ae]u",
		"[ɛɔa]ː[iu]",
		},
	["hrx"] = {
		"aɪ̯",
		"aʊ̯",
		"oɪ̯",
		"eʊ̯",
	},
	["is"] = {			-- [[w:Icelandic phonology#Vowels]]
		"[aeø][iɪy]",	-- Wikipedia is oddly specific about the second element: ei and ai, but øɪ.
		"[ao]u",
		},
	["it"] = {
		"[aeɛoɔu]i",
		"[aeɛioɔ]u",
		},
	["lb"] = {
		"[iu]ə",
		"[ɜoæɑ]ɪ",
		"[əæɑ]ʊ",
	},
}

--[=[
This should list any languages for which categories of the form
"LANG #-syllable words", e.g. [[:Category:Russian 3-syllable words]], should be
generated. Do not list languages here if they have an entry above under
`data.diphthongs`; such languages are automatically added to this list.]=]
local langs_to_generate_syllable_count_categories = list_to_set{
	"ar",	-- Arabic has diphthongs, but they are transcribed
			-- with semivowel symbols.
	"ary",	-- Moroccan Arabic has diphthongs, but they are transcribed
			-- with semivowel symbols.
	"bg",   -- Bulgarian has diphthongs with /j/ and marginally with /w/,
	        -- but these are semivowels.
	"ca",	-- Catalan has diphthongs, but they are generally transcribed using
			-- /w/ and /j/, so do not need to be listed (see [[w:Catalan language#Diphthongs and triphthongs]].
	"es",	-- Spanish has diphthongs, but they are transcribed with i̯ etc.
	"eu",   -- Basque has dipthongs, but they are transcribed with i̯ and u̯.
	"fi",	-- Finnish has diphthongs, but they are now automatically transcribed with
			-- the nonsyllabic diacritic
	"fr",	-- French has diphthongs, but they are transcribed
			-- with semivowel symbols: [[w:French phonology#Glides and diphthongs]].
	"hnn",
	"id",	-- Indonesian has diphthongs, but they are transcribed with i̯ or /j/ etc.
	"ka",
	"kne",
	"kmr",
	"ku",
	"la",	-- All diphthongs transcribed with e̯ or /j/ etc.
	"mk",
	"ms",	-- Malay has diphthongs, but they are transcribed with i̯ or /j/ etc.
	"mt",	-- Maltese has diphthongs, but they are transcribed
			-- with semivowel symbols.
	"pl",	-- No diphthongs, properly speaking; sequences of a vowel and /w/ or /j/ though.
	"pt",	-- Portuguese has diphthongs, but they are transcribed with i̯ or /j/ etc.
	"rsk",	-- No diphthongs but there are sequences of vowel and /j/ or /w/.
	"ru",	-- No diphthongs, properly speaking; sequences of a vowel and /j/ though.
	"sk",	-- Slovak has rising diphthongs, /i̯e, i̯a, i̯u, u̯o/, which are probably always spelled with the nonsyllabic diacritic, so do not need to be listed.
	"sl",	-- No diphthongs, properly speaking; sequences of a vowel, /j/ and /w/ though
	"sq",	-- [[w:Albanian language#Vowels]] doesn't mention anything about diphthongs.
	"tl",	-- Tagalog has diphthongs, but they are transcribed with i̯ or /j/ etc
	"tsg",
	"ug",	-- No diphthongs.
}
-- Also add languages listed under `data.diphthongs`.
for langcode, _ in pairs(data.diphthongs) do
	langs_to_generate_syllable_count_categories[langcode] = true
end
data.langs_to_generate_syllable_count_categories = langs_to_generate_syllable_count_categories

-- Languages to use the phonetic not phonemic notation to compute syllables counts.
data.langs_to_use_phonetic_notation = list_to_set{
    "bg",
	"es",
	"id",
	"la",
	"mk",
	"ms",
	"rsk",
	"ru",
}

-- Non-standard or obsolete IPA symbols.
data.nonstandard = {
	--[[	The following symbols consist of more than one character,
			so we can't put them in the line below.		]]
	"ɑ̢", "ɔ̗", "ɔ̖",
	"[?ƍσƺƪƞƛłščžǰǧǯẋⱻʚω∅ØȣᴀᴇⱻQKPT]"
}

-- See valid IPA characters at [[Module:IPA/data/symbols]].

data.phonemes = {}
data.phonemes["dz"] = {
	"m", "n", "ŋ",
	"p", "t", "ʈ", "k",
	"pʰ", "tʰ", "ʈʰ", "kʰ",
	"t͡s", "t͡ɕ",
	"t͡sʰ", "t͡ɕʰ",
	"w", "s", "z", "ɬ", "l", "r", "ɕ", "ʑ", "j", "h",
	"ɑ", "e", "i", "o", "u",
	"ɑː", "eː", "ɛː", "iː", "oː", "øː", "uː", "yː",
	"ɑ˥", "e˥", "i˥", "o˥", "u˥",
	"ɑː˥", "eː˥", "ɛː˥", "iː˥", "oː˥", "øː˥", "uː˥", "yː˥",
	"m˥", "n˥", "ŋ˥", "p˥", "k˥", "k̚˥", "w˥", "l˥", "r˥", "ɕ˥", "j˥", ")˥",
	"ɑ˩", "e˩", "i˩", "o˩", "u˩",
	"ɑː˩", "eː˩", "ɛː˩", "iː˩", "oː˩", "øː˩", "uː˩", "yː˩",
	"m˩", "n˩", "ŋ˩", "p˩", "k˩", "k̚˩", "w˩", "l˩", "r˩", "ɕ˩", "j˩", ")˩",
	".", ",", "-",
}
data.phonemes["eo"] = {
	"a", "b", "d", "d͡ʒ", "d͡z", "e", "f", "h", "i", "j", "k",
	"l", "m", "n", "o", "p", "r", "s", "t", "t͡s", "t͡ʃ",
	"u", "u̯", "v", "w", "x", "z", "ɡ", "ʃ", "ʒ",
	"ˈ", ".", " ", "-",
}
data.phonemes["hy"] = {
	"ɑ", "b", "ɡ", "d", "e", "z", "ə", "tʰ", "ʒ", "i", "l", "χ", "t͡s",
	"k", "h", "d͡z", "ʁ", "t͡ʃ", "m", "j", "n", "ʃ", "ɔ", "t͡ʃʰ", "p", "d͡ʒ",
	"r", "s", "v", "t", "ɾ", "t͡sʰ", "v", "pʰ", "kʰ", "o", "f", "ŋɡ", "ŋk",
	"ŋχ", "u", "œ", "ʏ", "ˈ", "ˌ", ".", " ", "ː", 
}
data.phonemes["nl"] = {
	"m", "n", "ŋ",
	"p", "b", "t", "d", "k", "ɡ",
	"f", "v", "s", "z", "ʃ", "ʒ", "x", "ɣ", "ɦ",
	"ʋ", "l", "j", "r",
	"ɪ", "ʏ", "ɛ", "ə", "ɔ", "ɑ",
	"i", "iː", "y", "yː", "u", "uː", "eː", "øː", "oː", "ɛː", "œː", "ɔː", "aː",
	"ɛi̯", "œy̯", "ɔi̯", "ɑu̯", "ɑi̯",
	"iu̯", "yu̯", "ui̯", "eːu̯", "oːi̯", "aːi̯",
	"ˈ", "ˌ", ".", " ", "-",
}
data.phonemes["mt"] = {
	"m", "n",
	"p", "t", "k", "ʔ",
	"b", "d", "ɡ",
	"t͡s", "t͡ʃ",
	"d͡z", "d͡ʒ",
	"f", "s", "ʃ", "ħ",
	"v", "z", "ʒ", "ɣ",
	"l", "j", "w",
	"r",
	"ɪ", "ɛ", "ɔ", "a", "u",
	"ɛˤ", "ɔˤ", "aˤ", "əˤ",
	"ɛˤː", "ɔˤː", "aˤː", "əˤː", "ɪˤː",
	"iː", "ɪː", "ɛː", "ɔː", "aː", "uː",
	"ˈ", "ˌ", ".", " ", "‿", "-"
}

return data