---Transcribes a Vietnamese word or compound word into IPA. Supports
-- pronunciations in some of the main dialects of Vietnam.
--
-- This module is intended as a replacement for the lumbering monstrosity
-- [[Bản mẫu:vie-pron]], which itself is a replacement for the
-- editor-unfriendly [[Bản mẫu:VieIPA]].
-- Load the mw.ustring library, whose string functions are used throughout
-- this module.
require "mw.ustring"
-- Export table; every public entry point of the module is attached to it.
local p = {}
-- Transcription data loaded from a submodule. It is indexed by dialect name
-- (for example p.dialects["Hà Nội"]) everywhere it is used below.
p.dialects = require("Module:ViePron/dialects")
-- Dialect names in the column order used by p.tableColumnHeaders and
-- p._tableRow.
local dialects = {
"Hà Nội", "Hải Phòng",
"Vinh", "Thanh Chương", "Hà Tĩnh",
"Huế",
"Quy Nhơn", "Sài Gòn"
}
-- Every Vietnamese letter that carries a diacritic. The layout matters: after
-- the first seven letters (đ â ă ê ô ơ ư) come five groups of twelve letters,
-- and every group lists the vowels in the same order
-- (a â ă e ê i o ô ơ u ư y). Within a group, "y" therefore sits exactly six
-- positions after "i", which p._components relies on to turn an accented "y"
-- into the matching accented "i". The string is also used to build the
-- character class that p._ipa uses to pick words out of running text.
local accentedChars = "đâăêôơưáấắéếíóốớúứýàầằèềìòồờùừỳảẩẳẻểỉỏổởủửỷãẫẵẽễĩõỗỡũữỹạậặẹệịọộợụựỵ"
---Table mapping vowel characters to their toneless base letters.
-- Each row covers one base vowel: the bare letter followed by its five
-- tone-marked forms. Vowel-quality marks (â, ă, ê, ô, ơ, ư) are part of the
-- base letter and are kept. Consonants are not keys of this table, so the
-- table also serves as the "is this a vowel letter?" test in p._components.
local vowelsToBases = {
["a"] = "a", ["á"] = "a", ["à"] = "a", ["ã"] = "a", ["ả"] = "a", ["ạ"] = "a",
["â"] = "â", ["ấ"] = "â", ["ầ"] = "â", ["ẫ"] = "â", ["ẩ"] = "â", ["ậ"] = "â",
["ă"] = "ă", ["ắ"] = "ă", ["ằ"] = "ă", ["ẵ"] = "ă", ["ẳ"] = "ă", ["ặ"] = "ă",
["e"] = "e", ["é"] = "e", ["è"] = "e", ["ẽ"] = "e", ["ẻ"] = "e", ["ẹ"] = "e",
["ê"] = "ê", ["ế"] = "ê", ["ề"] = "ê", ["ễ"] = "ê", ["ể"] = "ê", ["ệ"] = "ê",
["i"] = "i", ["í"] = "i", ["ì"] = "i", ["ĩ"] = "i", ["ỉ"] = "i", ["ị"] = "i",
["o"] = "o", ["ó"] = "o", ["ò"] = "o", ["õ"] = "o", ["ỏ"] = "o", ["ọ"] = "o",
["ô"] = "ô", ["ố"] = "ô", ["ồ"] = "ô", ["ỗ"] = "ô", ["ổ"] = "ô", ["ộ"] = "ô",
["ơ"] = "ơ", ["ớ"] = "ơ", ["ờ"] = "ơ", ["ỡ"] = "ơ", ["ở"] = "ơ", ["ợ"] = "ơ",
["u"] = "u", ["ú"] = "u", ["ù"] = "u", ["ũ"] = "u", ["ủ"] = "u", ["ụ"] = "u",
["ư"] = "ư", ["ứ"] = "ư", ["ừ"] = "ư", ["ữ"] = "ư", ["ử"] = "ư", ["ự"] = "ư",
["y"] = "y", ["ý"] = "y", ["ỳ"] = "y", ["ỹ"] = "y", ["ỷ"] = "y", ["ỵ"] = "y"
}
---Table mapping vowel characters to the VIQR representation of their tones.
-- The value is the empty string for a vowel without a tone mark, and one of
-- "'", "`", "~", "?" or "." for the five tone-marked forms. Each row covers
-- one base vowel, in the same column order as the rows of vowelsToBases.
local vowelsToVIQRTones = {
["a"] = "", ["á"] = "'", ["à"] = "`", ["ã"] = "~", ["ả"] = "?", ["ạ"] = ".",
["â"] = "", ["ấ"] = "'", ["ầ"] = "`", ["ẫ"] = "~", ["ẩ"] = "?", ["ậ"] = ".",
["ă"] = "", ["ắ"] = "'", ["ằ"] = "`", ["ẵ"] = "~", ["ẳ"] = "?", ["ặ"] = ".",
["e"] = "", ["é"] = "'", ["è"] = "`", ["ẽ"] = "~", ["ẻ"] = "?", ["ẹ"] = ".",
["ê"] = "", ["ế"] = "'", ["ề"] = "`", ["ễ"] = "~", ["ể"] = "?", ["ệ"] = ".",
["i"] = "", ["í"] = "'", ["ì"] = "`", ["ĩ"] = "~", ["ỉ"] = "?", ["ị"] = ".",
["o"] = "", ["ó"] = "'", ["ò"] = "`", ["õ"] = "~", ["ỏ"] = "?", ["ọ"] = ".",
["ô"] = "", ["ố"] = "'", ["ồ"] = "`", ["ỗ"] = "~", ["ổ"] = "?", ["ộ"] = ".",
["ơ"] = "", ["ớ"] = "'", ["ờ"] = "`", ["ỡ"] = "~", ["ở"] = "?", ["ợ"] = ".",
["u"] = "", ["ú"] = "'", ["ù"] = "`", ["ũ"] = "~", ["ủ"] = "?", ["ụ"] = ".",
["ư"] = "", ["ứ"] = "'", ["ừ"] = "`", ["ữ"] = "~", ["ử"] = "?", ["ự"] = ".",
["y"] = "", ["ý"] = "'", ["ỳ"] = "`", ["ỹ"] = "~", ["ỷ"] = "?", ["ỵ"] = "."
}
---Receives a word and returns a copy of the word without tone marks.
-- Every letter that is a key of vowelsToBases is replaced by its toneless
-- base letter; all other characters (consonants, punctuation) are kept as
-- they are. Vowel-quality marks such as the circumflex in "ê" are preserved.
-- @param word The text to strip of tone marks.
-- @return The toneless copy of the text, and nothing else.
function p.detone(word)
    -- gsub returns (string, substitution count). The parentheses keep only
    -- the string, so the count cannot leak into a caller that uses this call
    -- as the last item of an argument list or of a return statement.
    return (mw.ustring.gsub(word, "%a", vowelsToBases))
end
---Returns the VIQR representation of the given glide-vowel-glide sequence's tone.
-- The letters are scanned from left to right and the first tone mark found
-- decides the result.
-- @param gvg The vowel sequence (or a whole word) to inspect.
-- @return One of "'", "`", "~", "?" or ".", or the empty string when no
-- letter carries a tone mark (the ngang tone).
function p.viqrTone(gvg)
    for letter in mw.ustring.gmatch(gvg, "%a") do
        local mark = vowelsToVIQRTones[letter]
        -- Consonants have no entry, and unmarked vowels map to "".
        -- Keep scanning in both cases.
        if mark and mark ~= "" then
            return mark
        end
    end
    -- No tone mark anywhere: ngang tone.
    return ""
end
---Returns a breakdown of the given word, formatted as
-- "initial-interior-final".
-- The word is read from the named argument "word" or, failing that, from the
-- first positional argument.
-- @usage {{#gọi:ViePron|components|tiếng}}
function p.components(frame)
    local word = frame.args.word or frame.args[1]
    local parts = p._components(word)
    return mw.ustring.format("%s-%s-%s", parts.ci, parts.gvg, parts.cf)
end
---Splits one syllable into its orthographic components.
-- @param word A single syllable. p._ipa passes lowercase text, and the
-- letter classes used here only list lowercase letters.
-- @return A table with four string fields:
--   ci  - the initial consonant cluster ("gi" and "qu" count as initials),
--   gvg - the interior glide-vowel-glide sequence, tone marks included,
--   cf  - the final consonant cluster,
--   t   - the VIQR tone of gvg, as returned by p.viqrTone.
-- Raises an assertion error when the three parts do not add up to the whole
-- word, e.g. for a polysyllabic word written without separators.
function p._components(word)
    -- Initial and final consonant clusters
    -- [[Bản mẫu:vie-pron/VieC]]
    local ci, cf = mw.ustring.match(word, "^([bcdđfghjklmnpqrstvwxz]*).-([cghmnpt]*)$")
    -- Accented "i" taken from a "gi" initial, kept in case the syllable turns
    -- out to have no other vowel.
    local giv
    if ci == "g" then
        local c2 = mw.ustring.sub(word, 2, 2)
        if c2 == "i" then
            ci = "gi"
        elseif vowelsToBases[c2] == "i" then
            ci = "gi"
            giv = c2
        elseif vowelsToBases[c2] == "y" then
            -- e.g. [[giặt gỵa]]
            -- "gy" is respelled as "d" followed by the "i" that carries the
            -- same tone as the "y".
            ci = "d"
            local respelled = "i"
            if c2 ~= "y" then
                -- In accentedChars every accented "y" sits six positions
                -- after the "i" with the same tone. Plain "y" is not in that
                -- string at all, so it must not go through this lookup: find
                -- would return nil and the subtraction would raise an error.
                local idx = mw.ustring.find(accentedChars, c2, 1, true) - 6
                respelled = mw.ustring.sub(accentedChars, idx, idx)
            end
            word = ci .. respelled .. mw.ustring.sub(word, 3)
        end
    elseif ci == "q" then
        ci = "qu"
    end
    -- Interior glide-vowel-glide sequence
    -- [[Bản mẫu:vie-pron/VieV]]
    local interior = mw.ustring.sub(word, mw.ustring.len(ci) + 1,
        mw.ustring.len(word) - mw.ustring.len(cf))
    -- Collect the leading run of vowel letters; stop at the first letter that
    -- is not a vowel.
    local gvg = {}
    for letter in mw.ustring.gmatch(interior, "%a") do
        if vowelsToBases[letter] then table.insert(gvg, letter) else break end
    end
    gvg = table.concat(gvg)
    -- TODO: Support polysyllabic words.
    assert(mw.ustring.len(ci .. gvg .. cf) == mw.ustring.len(word),
        "Từ này không tuân theo quy tắc chính tả tiếng Việt, hoặc là từ đa âm tiết được viết như một từ. " ..
        "Nếu là từ ngoại ngữ, xin hãy định rõ cách phiên âm vào tham số của bản mẫu vie-pron, " ..
        "và phân tách các âm tiết bằng dấu gạch ngang (-) hoặc khoảng cách. (“" ..
        ci .. "”+“" .. gvg .. "”+“" .. cf .. "”≠“" .. word .. "”)")
    -- Words in which “gi-” is short for “*gii-”.
    if ci == "gi" and giv and #gvg < 1 then gvg = giv end
    -- “y-”: a syllable-initial "y" is treated as "i".
    if #ci < 1 and mw.ustring.sub(gvg, 1, 1) == "y" then
        gvg = "i" .. mw.ustring.sub(gvg, 2)
    end
    -- Semisyllables, like in “H'Mông”: a bare consonant is given the vowel "ờ".
    if #ci > 0 and #gvg < 1 and #cf < 1 then gvg = "ờ" end
    -- Tone
    local t = p.viqrTone(gvg)
    return {ci = ci, gvg = gvg, cf = cf, t = t}
end
---Returns the IPA transcription of the given initial consonant cluster.
-- Template-facing wrapper around p._ciToIPA. Each value is read from its
-- named argument (ci, gvg, cf, dialect) or, failing that, from positional
-- arguments 1 to 4.
-- @usage {{#gọi:ViePron|ciToIPA|t|iế|ng|Hà Tĩnh}}
function p.ciToIPA(frame)
    local args = frame.args
    return p._ciToIPA(
        args.ci or args[1],
        args.gvg or args[2],
        args.cf or args[3],
        args.dialect or args[4])
end
---Looks up the IPA for an initial consonant cluster in one dialect.
-- @param ci The initial consonant cluster.
-- @param gvg The interior vowel sequence; passed without tone marks to the
-- dialect entry when that entry is a function.
-- @param cf The final consonant cluster; also passed to such a function.
-- @param dialect The dialect name, a key of p.dialects.
-- @return The IPA string, or the empty string when nothing matches.
function p._ciToIPA(ci, gvg, cf, dialect)
    local onsets = p.dialects[dialect].initialConsonantsToIPA
    local result = onsets[ci]
    -- Loanwords from some minority languages retain double consonants.
    if not result then
        -- NOTE(review): mw.ustring.sub(ci, 1) is the whole of ci, so this
        -- lookup repeats the one above. Possibly sub(ci, 1, 1), the first
        -- consonant only, was intended. Confirm before changing.
        result = onsets[mw.ustring.sub(ci, 1)]
    end
    if not result then
        -- Retry with the first consonant dropped.
        result = onsets[mw.ustring.sub(ci, 2)]
    end
    -- An entry may be a function that picks the IPA from the context.
    if type(result) == "function" then
        result = result(p.detone(gvg), cf)
    end
    return result or ""
end
---Returns the IPA transcription of the given glide-vowel-glide sequence.
-- Template-facing wrapper around p._gvgToIPA. Each value is read from its
-- named argument (ci, gvg, cf, t, dialect) or, failing that, from positional
-- arguments 1 to 5.
-- @usage {{#gọi:ViePron|gvgToIPA|t|iế|ng|'|Hà Tĩnh}}
function p.gvgToIPA(frame)
    local args = frame.args
    return p._gvgToIPA(
        args.ci or args[1],
        args.gvg or args[2],
        args.cf or args[3],
        args.t or args[4],
        args.dialect or args[5])
end
---Transcribes an interior glide-vowel-glide sequence into IPA for one dialect.
-- The dialect data is looked up first with the sequence as given and then
-- with its tone marks removed. An entry may be a function, which is called
-- with the initial and final consonant clusters. In the looked-up string an
-- underscore appears to mark the spot where tone-dependent marks are
-- attached; every underscore is removed before the result is returned.
-- @param ci The initial consonant cluster.
-- @param gvg The interior vowel sequence, tone marks included.
-- @param cf The final consonant cluster.
-- @param t The VIQR tone, used as a key into the dialect's toneAttributes.
-- @param dialect The dialect name, a key of p.dialects.
-- @return The IPA string only (a single value).
function p._gvgToIPA(ci, gvg, cf, t, dialect)
    local gvgData = p.dialects[dialect].interiorToIPA
    local toneAttributes = p.dialects[dialect].toneAttributes[t] or {}
    local ipa = gvgData[gvg] or gvgData[p.detone(gvg)] or ""
    if type(ipa) == "function" then ipa = ipa(ci, cf) or "" end
    -- Insert glottal stop.
    if toneAttributes.glottal then
        if toneAttributes.repeated then
            -- Vowel, glottal stop, then the vowel again; a length mark after
            -- the placeholder is dropped.
            ipa = mw.ustring.gsub(ipa, "(%a)_ː?", "%1_ʔ%1_")
        else
            -- Glottal stop after the vowel and its optional length mark.
            ipa = mw.ustring.gsub(ipa, "(%a)_(ː?)", "%1_%2ʔ")
        end
    end
    -- Insert breathy-voice diacritic.
    if toneAttributes.breathy then
        ipa = mw.ustring.gsub(ipa, "_", "\204\164_") -- U+0324
    -- Or insert creaky-voice diacritic.
    elseif toneAttributes.creaky then
        ipa = mw.ustring.gsub(ipa, "_", "\204\176_") -- U+0330
    end
    -- gsub also returns the number of substitutions. Without the parentheses
    -- that count would be returned too and, through p.gvgToIPA, would be
    -- appended to the text produced by #invoke.
    return (mw.ustring.gsub(ipa, "_", ""))
end
---Returns the IPA transcription of the given final consonant cluster.
-- Template-facing wrapper around p._cfToIPA. Each value is read from its
-- named argument (ci, gvg, cf, dialect) or, failing that, from positional
-- arguments 1 to 4.
-- @usage {{#gọi:ViePron|cfToIPA|t|iế|ng|Quy Nhơn}}
function p.cfToIPA(frame)
    local args = frame.args
    return p._cfToIPA(
        args.ci or args[1],
        args.gvg or args[2],
        args.cf or args[3],
        args.dialect or args[4])
end
---Looks up the IPA for a final consonant cluster in one dialect.
-- @param ci The initial consonant cluster; passed to the dialect entry when
-- that entry is a function.
-- @param gvg The interior vowel sequence; passed without tone marks to such
-- a function.
-- @param cf The final consonant cluster, used as the lookup key.
-- @param dialect The dialect name, a key of p.dialects.
-- @return The IPA string, or the empty string when there is no entry.
function p._cfToIPA(ci, gvg, cf, dialect)
    local ipa = p.dialects[dialect].finalConsonantsToIPA[cf]
    if type(ipa) == "function" then
        -- The parentheses keep only the first result of p.detone, so the
        -- callback receives exactly two arguments, in line with the
        -- two-argument call made for initial consonants in p._ciToIPA.
        ipa = ipa(ci, (p.detone(gvg)))
    end
    return ipa or ""
end
---Returns the IPA tone letters for the given word.
-- Template-facing wrapper around p._viqrToneToIPA. The word is read from the
-- named argument "word" or positional argument 1, the dialect from "dialect"
-- or positional argument 2.
-- [[Bản mẫu:vie-pron/VieTn]] and [[Bản mẫu:vie-pron/VieT]]
-- @usage {{#invoke:ViePron|viqrToneToIPA|tiếng|Sài Gòn}}
function p.viqrToneToIPA(frame)
    local args = frame.args
    return p._viqrToneToIPA(
        args.word or args[1],
        args.dialect or args[2])
end
---Maps the tone of a word to the tone letters of one dialect.
-- @param word The word whose tone mark is looked up.
-- @param dialect The dialect name, a key of p.dialects.
-- @return The dialect's viqrTonesToIPA entry for the word's VIQR tone, or
-- nil when the dialect has no entry for that tone.
function p._viqrToneToIPA(word, dialect)
    local mark = p.viqrTone(word)
    if mark then
        return p.dialects[dialect].viqrTonesToIPA[mark]
    end
end
---Returns the IPA transcription of the given Vietnamese text.
-- Template-facing wrapper around p._ipa. The text comes from the named
-- argument "text" or positional argument 1 (default: empty string), the
-- dialect from "dialect" or positional argument 2 (default: "Hà Nội"). The
-- "css" argument is passed on unchanged.
-- @usage {{#invoke:ViePron|ipa|tiếng Việt}}
function p.ipa(frame)
    local args = frame.args
    local text = args.text or args[1] or ""
    local dialect = args.dialect or args[2] or "Hà Nội"
    return p._ipa(text, dialect, args.css)
end
---Transcribes running text into IPA, one syllable at a time.
-- The text is lowercased, and every maximal run of Vietnamese letters is
-- treated as one syllable; anything else acts as a separator.
-- @param text The Vietnamese text.
-- @param dialect The dialect name, a key of p.dialects.
-- @param css When truthy, the tone letters of each syllable are wrapped in
-- a span of class "IPA-tone".
-- @return The syllable transcriptions joined by single spaces.
function p._ipa(text, dialect, css)
    local syllablePattern = "([a-z" .. accentedChars .. "]+)"
    local pieces = {}
    for syllable in mw.ustring.gmatch(mw.ustring.lower(text), syllablePattern) do
        local parts = p._components(syllable)
        local toneLetters = p._viqrToneToIPA(syllable, dialect)
        if css then
            toneLetters = "<span class='IPA-tone'>" .. toneLetters .. "</span>"
        end
        -- Initial consonant, interior vowels, final consonant, tone letters.
        pieces[#pieces + 1] =
            p._ciToIPA(parts.ci, parts.gvg, parts.cf, dialect) ..
            p._gvgToIPA(parts.ci, parts.gvg, parts.cf, parts.t, dialect) ..
            p._cfToIPA(parts.ci, parts.gvg, parts.cf, dialect) ..
            toneLetters
    end
    return table.concat(pieces, " ")
end
---Returns [[Bản mẫu:vie-pron/Bảng]] prefilled with IPA transcriptions of the
-- given word in several dialects.
-- The words to transcribe are taken from the first of these sources that
-- yields anything:
--   1. the non-empty "word" argument of #invoke:,
--   2. the positional arguments of #invoke:,
--   3. the positional arguments of the calling template,
--   4. the title of the current page.
-- Named arguments HN, H, SG, V, TC and HT replace the generated transcription
-- for the corresponding dialect.
-- NOTE(review): when "word" is given, the other #invoke: arguments are not
-- read as overrides. Confirm whether that is intended.
-- @usage {{#invoke:ViePron|standaloneTable}}
function p.standaloneTable(frame)
    local words = {}
    local overrides = {}
    -- Get any words passed in as arguments to #invoke:.
    local explicitWord = frame.args.word
    if explicitWord ~= nil and explicitWord ~= "" then
        words[1] = explicitWord
    else
        -- An empty "word" is treated like a missing one. It must not stay in
        -- the list, otherwise the list would never count as empty and the
        -- fallbacks below would be skipped.
        for _, arg in ipairs(frame.args) do
            table.insert(words, arg)
        end
        for k, v in pairs(frame.args) do
            overrides[k] = v
        end
    end
    -- Get any words passed in as arguments to [[Bản mẫu:vie-pron]].
    if #words < 1 then
        local template = frame:getParent()
        for _, arg in ipairs(template.args) do
            table.insert(words, arg)
        end
        for k, v in pairs(template.args) do
            overrides[k] = v
        end
    end
    -- Fall back on the page name.
    if #words < 1 then words = {mw.title.getCurrentTitle().text} end
    -- Fill in the display template.
    words = table.concat(words, " ")
    return frame:expandTemplate{
        title = "vie-pron/Bảng",
        args = {
            words,
            HN = overrides.HN or p._ipa(words, "Hà Nội", true),
            H = overrides.H or p._ipa(words, "Huế", true),
            SG = overrides.SG or p._ipa(words, "Sài Gòn", true),
            -- Three regions that are less common but linguistically notable.
            V = overrides.V or p._ipa(words, "Vinh", true),
            TC = overrides.TC or p._ipa(words, "Thanh Chương", true),
            HT = overrides.HT or p._ipa(words, "Hà Tĩnh", true),
        },
    }
end
---Returns an HTML table row with one header cell for each supported dialect.
-- The row starts with a header cell for the word column, followed by the
-- dialect names in the order of the module's dialect list.
-- @usage {{#invoke:ViePron|tableColumnHeaders}}
function p.tableColumnHeaders()
    local cells = {"<th>Từ</th>"}
    for _, name in ipairs(dialects) do
        cells[#cells + 1] = "<th>" .. name .. "</th>"
    end
    return "<tr>" .. table.concat(cells) .. "</tr>"
end
---Returns an HTML table row of IPA transcriptions of the given word in all the
-- supported dialects. Adjacent, identical table cells are combined.
-- The word is read from the named argument "word" or, failing that, from the
-- first positional argument.
-- @usage {{#invoke:ViePron|tableRow|tiếng Việt}}
function p.tableRow(frame)
    local word = frame.args.word or frame.args[1]
    return p._tableRow(word)
end
---Builds the table row for one word.
-- The word is transcribed once per dialect, in the order of the module's
-- dialect list. Neighbouring dialects with identical transcriptions share a
-- single cell that spans their columns.
-- @param word The word or phrase to transcribe; also used as the row header.
-- @return The HTML of the row.
function p._tableRow(word)
    -- Run-length encode the transcriptions: one entry per run of equal
    -- neighbours, with the number of columns the run covers.
    local runs = {}
    for i = 1, #dialects do
        local text = p._ipa(word, dialects[i], true)
        local last = runs[#runs]
        if last and last.text == text then
            last.span = last.span + 1
        else
            runs[#runs + 1] = {text = text, span = 1}
        end
    end
    local html = {}
    for i, run in ipairs(runs) do
        local attr = ""
        if run.span > 1 then
            attr = " colspan='" .. run.span .. "'"
        end
        html[i] = "<td" .. attr .. ">" .. run.text .. "</td>"
    end
    return "<tr><th scope='row'>" .. word .. "</th>" .. table.concat(html) .. "</tr>"
end
-- Hand the export table to the caller that loads this module.
return p