-- Public interface of this module; the exported functions below are attached to this table.
local export = {}
-- Replaces every character in the range ぁ-ゖ with the character 96 code points above it
-- (i.e. hiragana with the matching katakana). Other characters are left untouched.
local function str_hira_to_kata(s)
	local function shift_up(ch)
		local cp = mw.ustring.codepoint(ch)
		return mw.ustring.char(cp + 96)
	end
	-- Only the substituted string is returned; the replacement count is dropped.
	local converted = mw.ustring.gsub(s, '[ぁ-ゖ]', shift_up)
	return converted
end
-- Replaces every character in the range ァ-ヶ with the character 96 code points below it
-- (i.e. katakana with the matching hiragana). Other characters are left untouched.
local function str_kata_to_hira(s)
	local function shift_down(ch)
		local cp = mw.ustring.codepoint(ch)
		return mw.ustring.char(cp - 96)
	end
	-- Only the substituted string is returned; the replacement count is dropped.
	local converted = mw.ustring.gsub(s, '[ァ-ヶ]', shift_down)
	return converted
end
-- Compares two strings using the Wagner–Fischer algorithm.
-- str_ucompare is O(mn) in time and memory for strings of m and n characters. parse_text is O(n). For long texts, the latter is less prone to Lua timeout errors.
-- Aligns two strings character by character and splits them into parallel segments.
-- @param s1     first string
-- @param s2     second string
-- @param limit  optional budget; when (#chars of s1) * (#chars of s2) * 20 exceeds it,
--               no alignment is attempted
-- @return {r1, r2}: two lists of strings of equal length. Segments at the same index
--         correspond to each other; runs of identical characters and runs of differing
--         characters are put into separate segments. When the limit is exceeded the
--         result is {{s1}, {s2}}, i.e. the two whole strings as a single segment pair.
local function str_ucompare(s1, s2, limit)
	-- Keep s1/s2 intact: the bail-out below must return the original strings,
	-- not the per-character tables (callers treat every segment as a string).
	local chars1, chars2 = mw.text.split(s1, ''), mw.text.split(s2, '')
	local len1, len2 = #chars1, #chars2
	if limit and len1 * len2 * 20 > limit then return {{s1}, {s2}} end
	-- m_cost[i + 1][j + 1]: edit cost between the first i chars of s1 and the first j chars of s2.
	-- m_step[i + 1][j + 1]: last operation on the chosen path:
	--   1 = same character, 2 = substitution, 3 = character only in s1, 4 = character only in s2.
	local m_cost, m_step = {{0}}, {{}}
	for i = 1, len1 do
		m_cost[i + 1] = {i}
		m_step[i + 1] = {3}
	end
	for j = 1, len2 do
		m_cost[1][j + 1] = j
		m_step[1][j + 1] = 4
	end
	for i = 1, len1 do
		for j = 1, len2 do
			local b_same = chars1[i] == chars2[j]
			local c_sub = m_cost[i][j] + (b_same and 0 or 1)
			local c_del = m_cost[i][j + 1] + 1
			local c_ins = m_cost[i + 1][j] + 1
			if c_sub <= c_del and c_sub <= c_ins then
				m_cost[i + 1][j + 1] = c_sub
				m_step[i + 1][j + 1] = b_same and 1 or 2
			elseif c_del <= c_ins then
				m_cost[i + 1][j + 1] = c_del
				m_step[i + 1][j + 1] = 3
			else
				m_cost[i + 1][j + 1] = c_ins
				m_step[i + 1][j + 1] = 4
			end
		end
	end
	-- Walk back from the bottom-right cell to the origin (whose step is nil),
	-- recording each step together with the character positions it refers to.
	local i1, i2 = len1 + 1, len2 + 1
	local m_offset = {{-1, -1}, {-1, -1}, {-1, 0}, {0, -1}}
	local r_step_rev = {}
	local r_step_rev_pos1 = {}
	local r_step_rev_pos2 = {}
	local step = m_step[i1][i2]
	while step do
		table.insert(r_step_rev, step)
		i1 = i1 + m_offset[step][1]
		i2 = i2 + m_offset[step][2]
		table.insert(r_step_rev_pos1, i1)
		table.insert(r_step_rev_pos2, i2)
		step = m_step[i1][i2]
	end
	-- Replay the recorded steps in forward order, grouping consecutive "same"
	-- steps into one segment and consecutive differing steps into another.
	local r1, r2 = {}, {}
	local i = #r_step_rev
	step = r_step_rev[i]
	while i > 0 do
		local r1_f, r2_f = {}, {}
		if step == 1 then
			repeat
				table.insert(r1_f, chars1[r_step_rev_pos1[i]])
				table.insert(r2_f, chars2[r_step_rev_pos2[i]])
				i = i - 1
				step = r_step_rev[i]
			until step ~= 1
		else
			while true do
				if step == 2 then
					table.insert(r1_f, chars1[r_step_rev_pos1[i]])
					table.insert(r2_f, chars2[r_step_rev_pos2[i]])
				elseif step == 3 then
					table.insert(r1_f, chars1[r_step_rev_pos1[i]])
				elseif step == 4 then
					table.insert(r2_f, chars2[r_step_rev_pos2[i]])
				else
					-- step is 1 (a run of equal characters starts) or nil (all steps consumed)
					break
				end
				i = i - 1
				step = r_step_rev[i]
			end
		end
		table.insert(r1, table.concat(r1_f))
		table.insert(r2, table.concat(r2_f))
	end
	return {r1, r2}
end
-- Splits wikitext into a list of segments.
-- Plain stretches become {text = ...}; each innermost "[[...]]" becomes
-- {text = label, linkto = target}. For "[[target|label]]" the two parts are separated
-- (only when the "|" is neither the first nor the last character of the link body);
-- otherwise the whole body is used as both text and linkto. Empty links "[[]]" are dropped.
local function str_parse_link(s)
	local segments = {}
	local cursor = 1 -- first byte of s not yet copied into segments
	local open_at = s:find('%[%[', cursor)
	while open_at do
		local close_at = s:find('%]%]', open_at + 2)
		if not close_at then break end
		-- Move to the last "[[" that precedes the closing "]]".
		local later_open = s:find('%[%[', open_at + 1)
		while later_open and later_open < close_at do
			open_at = later_open
			later_open = s:find('%[%[', open_at + 1)
		end
		if cursor < open_at then
			segments[#segments + 1] = {text = s:sub(cursor, open_at - 1)}
		end
		if open_at + 2 < close_at then
			local body = s:sub(open_at + 2, close_at - 1)
			local bar = body:find('|')
			local entry
			if bar and bar > 1 and bar < #body then
				entry = {text = body:sub(bar + 1), linkto = body:sub(1, bar - 1)}
			else
				entry = {text = body, linkto = body}
			end
			segments[#segments + 1] = entry
		end
		cursor = close_at + 2
		open_at = s:find('%[%[', cursor)
	end
	-- Trailing text (also everything after an unclosed "[[").
	if cursor <= #s then
		segments[#segments + 1] = {text = s:sub(cursor)}
	end
	return segments
end
-- Merges a link table (entries with .text and optional .linkto) and a ruby table
-- (entries with .text and optional .ruby) into one nested table.
-- Both tables are walked in parallel by comparing the byte lengths of their texts.
-- Returns a list whose entries are either plain {text, ruby}/{text, linkto} items or
-- groups {text = {...}, ruby = ...} / {text = {...}, linkto = ...} holding the items they span.
-- Note: the .text fields of entries in link_table and ruby_table are modified while merging.
-- NOTE(review): presumably both tables describe the same underlying text; there is no
-- bounds check when an index is advanced inside the inner loops — confirm with callers.
local function table_merge(link_table, ruby_table)
local r = {}
local r_sub, r_insert
local len_cut
local id_l, id_r = 1, 1
-- Continue until either table is exhausted.
while id_l <= #link_table and id_r <= #ruby_table do
-- Byte-length difference: current link text minus current ruby base text.
len_cut = link_table[id_l].text:len() - ruby_table[id_r].text:len()
-- Case 1: the ruby entry has a reading that contains a [[link]], or its base text is
-- longer than the current link entry. The ruby becomes the outer group.
if ruby_table[id_r].ruby and (ruby_table[id_r].ruby:find'%[%[..-%]%]' or len_cut < 0) then
-- NOTE(review): the enclosing condition already requires .ruby, so the else branch
-- below looks unreachable.
if ruby_table[id_r].ruby then
r_sub = {
text = {},
-- The reading is stored as a parsed link list, not as a string.
ruby = str_parse_link(ruby_table[id_r].ruby),
}
r_insert = r_sub.text
table.insert(r, r_sub)
else
r_insert = r
end
-- Swallow whole link entries until their combined length covers the ruby base text.
while len_cut < 0 do
table.insert(r_insert, {
text = link_table[id_l].text,
linkto = link_table[id_l].linkto
})
id_l = id_l + 1
len_cut = len_cut + link_table[id_l].text:len()
end
-- Take the current link entry minus its last len_cut bytes (the part under this ruby).
table.insert(r_insert, {
text = link_table[id_l].text:sub(1, -1 - len_cut),
linkto = link_table[id_l].linkto
})
if len_cut == 0 then
id_l = id_l + 1
id_r = id_r + 1
else
-- Keep the last len_cut bytes of the link entry for the following ruby entries.
link_table[id_l].text = link_table[id_l].text:sub(-len_cut)
id_r = id_r + 1
end
else
-- Case 2: the link entry is the outer element. When it has a target, a link group is
-- opened; otherwise items go directly into the result.
if link_table[id_l].linkto then
r_sub = {
text = {},
linkto = link_table[id_l].linkto,
}
r_insert = r_sub.text
table.insert(r, r_sub)
else
r_insert = r
end
-- Copy ruby entries while the link text still has bytes left beyond them, stopping
-- early at a ruby entry whose reading contains a link.
while len_cut > 0 and not (ruby_table[id_r].ruby and ruby_table[id_r].ruby:find'%[%[..-%]%]') do
table.insert(r_insert, {
text = ruby_table[id_r].text,
ruby = ruby_table[id_r].ruby,
})
id_r = id_r + 1
len_cut = len_cut - ruby_table[id_r].text:len()
end
if len_cut == 0 then
-- The current ruby entry ends exactly where the link entry ends.
table.insert(r_insert, {
text = ruby_table[id_r].text,
ruby = ruby_table[id_r].ruby,
})
id_l = id_l + 1
id_r = id_r + 1
else
if ruby_table[id_r].ruby then
-- A ruby entry with a reading does not fit here: shrink the link entry to its
-- not-yet-emitted tail and leave both indexes unchanged for the next iteration.
link_table[id_l].text = link_table[id_l].text:sub(-(len_cut + ruby_table[id_r].text:len()))
else
-- A plain-text entry crosses the end of the link entry (len_cut is negative here):
-- emit the part inside the link and keep the last -len_cut bytes for later.
table.insert(r_insert, {
text = ruby_table[id_r].text:sub(1, -1 + len_cut),
})
ruby_table[id_r].text = ruby_table[id_r].text:sub(len_cut)
id_l = id_l + 1
end
end
end
end
return r
end
--[==[Returns the base text of a ruby table as a single string. Nested tables are flattened recursively; ruby and link targets are dropped.]==]
function export.to_text(ruby_table)
	local parts = {}
	for idx, item in ipairs(ruby_table) do
		local content = item.text
		if type(content) ~= 'string' then
			-- A group: its text is itself a ruby table.
			content = export.to_text(content)
		end
		parts[idx] = content
	end
	return table.concat(parts)
end
--[==[Returns the reading of a ruby table as a single string. Where an item has ruby, the ruby replaces the text below it; otherwise the text itself is used. Nested tables are flattened recursively and links are dropped.]==]
function export.to_ruby(ruby_table)
	local parts = {}
	for idx, item in ipairs(ruby_table) do
		local content = item.ruby or item.text
		if type(content) ~= 'string' then
			-- Either a group's text or a parsed ruby list: flatten it the same way.
			content = export.to_ruby(content)
		end
		parts[idx] = content
	end
	return table.concat(parts)
end
-- Serialises a ruby table into a string.
-- lb/lm/lf are the left/middle/right delimiters of a link (lb .. target .. lm .. text .. lf);
-- rb/rm/rf are the left/middle/right delimiters of a ruby (rb .. text .. rm .. ruby .. rf).
-- When break_link is truthy, a link group containing several items is re-arranged so that
-- every item gets its own link placed inside the ruby, instead of one link around all rubies.
local function table_to_markup(ruby_table, break_link, lb, lm, lf, rb, rm, rf)
	-- Recursive call with the same delimiters.
	local function render(tbl)
		return table_to_markup(tbl, break_link, lb, lm, lf, rb, rm, rf)
	end
	local out = {}
	for _, item in ipairs(ruby_table) do
		local target, reading = item.linkto, item.ruby
		local body = item.text
		if type(body) ~= 'string' then
			if break_link and target then
				-- Push the link target down onto each non-empty child.
				local pieces = {}
				for _, child in ipairs(body) do
					if child.text ~= '' or child.ruby and child.ruby ~= '' then
						pieces[#pieces + 1] = {
							text = {{
								text = child.text,
								linkto = target,
							}},
							ruby = child.ruby,
						}
					end
				end
				-- The group itself no longer carries a link or ruby.
				target, reading = nil, nil
				body = render(pieces)
			else
				body = render(body)
			end
		end
		local piece
		if target then
			if target ~= '' then
				-- An empty label is replaced by "_".
				piece = lb .. target .. lm .. (body ~= '' and body or '_') .. lf
			else
				piece = body
			end
		elseif reading then
			if type(reading) ~= 'string' then
				reading = render(reading)
			end
			if reading ~= '' then
				piece = rb .. body .. rm .. reading .. rf
			else
				piece = body
			end
		else
			piece = body
		end
		out[#out + 1] = piece
	end
	return table.concat(out)
end
--[==[Converts a ruby table into {"[]()"} markup.
* {options.break_link = true}: Turn {[[...|<ruby>...<ruby>]]} into {<ruby>[[...]]<ruby>}.
* {options.markup}: A table of custom delimiters replacing {"[...](...)"} and {"[[...|...]]"} (keys {link_border_left}, {link_border_middle}, {link_border_right}, {ruby_border_left}, {ruby_border_middle}, {ruby_border_right}). Custom markups are not recognized and can not be converted back.]==]
function export.to_markup(ruby_table, options)
	local opts = options or {}
	local custom = opts.markup or {}
	local link_left = custom.link_border_left or '[['
	local link_middle = custom.link_border_middle or '|'
	local link_right = custom.link_border_right or ']]'
	local ruby_left = custom.ruby_border_left or '['
	local ruby_middle = custom.ruby_border_middle or ']('
	local ruby_right = custom.ruby_border_right or ')'
	return table_to_markup(
		ruby_table, opts.break_link,
		link_left, link_middle, link_right,
		ruby_left, ruby_middle, ruby_right)
end
--[==[Converts a ruby table into wikitext, using {<ruby>}, {<rt>} and {<rp>} tags for ruby and {[[...|...]]} for links.
* The options are the same as {function export.to_markup()}]==]
function export.to_wiki(ruby_table, options)
	local opts = options or {}
	local custom = opts.markup or {}
	local link_left = custom.link_border_left or '[['
	local link_middle = custom.link_border_middle or '|'
	local link_right = custom.link_border_right or ']]'
	local ruby_left = custom.ruby_border_left or '<ruby>'
	local ruby_middle = custom.ruby_border_middle or '<rp>(</rp><rt>'
	local ruby_right = custom.ruby_border_right or '</rt><rp>)</rp></ruby>'
	return table_to_markup(
		ruby_table, opts.break_link,
		link_left, link_middle, link_right,
		ruby_left, ruby_middle, ruby_right)
end
--[==[Constructs a ruby table from {"[]()"} markups.]==]
function export.parse_markup(markup)
	-- Collect the readings in order of appearance while reducing every
	-- "[text](ruby)" to "text"; what is left only contains links.
	local readings = {}
	local without_ruby = markup:gsub('(%b[])(%b())', function(base, reading)
		readings[#readings + 1] = reading:sub(2, -2)
		return base:sub(2, -2)
	end)
	local link_table = str_parse_link(without_ruby)
	-- The markup with links removed but ruby markup kept.
	local plain_text = export.to_text(str_parse_link(markup))
	local ruby_table = {}
	local cursor = 1 -- first byte of plain_text not yet stored in ruby_table
	local reading_index = 0
	for start_pos, base, _, end_pos in plain_text:gmatch('()(%b[])(%b())()') do
		if cursor < start_pos then
			table.insert(ruby_table, {text = plain_text:sub(cursor, start_pos - 1)})
		end
		-- The n-th ruby markup found here is paired with the n-th reading collected above.
		reading_index = reading_index + 1
		local reading = readings[reading_index]
		if reading == '' then reading = nil end
		table.insert(ruby_table, {
			text = base:sub(2, -2),
			ruby = reading,
		})
		cursor = end_pos
	end
	if cursor <= #plain_text then
		table.insert(ruby_table, {text = plain_text:sub(cursor)})
	end
	return table_merge(link_table, ruby_table)
end
-- Character ranges loaded from the shared data module.
-- NOTE(review): the fields are concatenated into "[...]" pattern sets below, so they are presumably pattern-set fragments — confirm against Module:ja/data/range.
local data_range = mw.loadData'Module:ja/data/range'
-- The transliteration marks "^", "-", ".", "゠", "・". In parse_text such a character in the term may be missing from the kana, or appear there as a space or hyphen.
local range_mute = '%^%-%.゠・'
-- Hiragana and katakana; parse_text lets each of these match its hiragana or katakana counterpart.
local range_hirakata = data_range.hiragana .. data_range.katakana
-- All kana: the above plus hentaigana.
local range_kana = range_hirakata .. data_range.hentaigana
-- Characters parse_text treats as literals ("<", ">", "^", control characters, punctuation, whitespace, NUL and kana). Only characters outside this set (and, as special cases, "&" and "@") start a span that may receive ruby.
local range_noalias = '<>^%c%p%s%z' .. range_kana
-- A matched span is only given ruby when it contains a character outside this set (or "&"/"@").
local range_noruby = range_noalias .. data_range.kana_graph
-- Used by the default space handling in parse_text: spaces between characters of this set are removed.
local range_nospace = range_kana .. data_range.kanji .. data_range.ideograph .. data_range.kana_graph .. data_range.punctuation
--[==[Constructs a ruby table from the 2 strings passed to this function. The differences in the 2 strings are converted into ruby, with the corresponding part of {kana} becoming the ruby text and that of {term} becoming the text under the ruby. Links in {term} will be integrated into the result. Links in {kana} will be ignored by default.
Details about the format of {term} and {kana} can be found in [[Template:ja-r/documentation#Ruby_and_transliteration]].
To better adapt to Japanese texts, this function assumes that all non-letters (except for 5 symbols "^", ".", "-", "゠", "・" for transliteration reasons) and all kana always represent themselves. These literal characters when appearing in {term} should also appear unchanged or as a hira-kata counterpart in {kana}. This behaviour can be changed by using {options.try}, or by manually isolating a single literal character with "%".
* {options.try == nil}: Raise an error when the assumption of literal characters fails.
* {options.try == 'force'}: Discard the assumption of literal characters when it fails, and try to find any differences in the strings.
* {options.try_force_limit}: Limit the time used by {options.try == 'force'}.
* {options.space == nil}: Remove spaces between kana or kanji but preserve elsewhere.
* {options.space == 'all'}: Preserve all spaces.
* {options.space == 'none'}: Remove all spaces.
* {options.allow_ruby_link == true}: Try to match the links in {kana}.
An error is also raised when the number of "%" separators in {term} and {kana} differs.]==]
function export.parse_text(term, kana, options)
	options = options or {}
	-- _remove_space post-processes the merged table according to options.space.
	-- It modifies the table in place and returns it.
	local _remove_space
	if options.space == 'none' then
		_remove_space = function(_r)
			-- Advances a cursor over _r: p1 is the top-level index, p2 the index inside a
			-- nested text table (nil when the entry's text is a plain string).
			local function _next(p1, p2)
				if p2 and p2 < #_r[p1].text then
					return p1, p2 + 1
				end
				p1 = p1 + 1
				if p1 > #_r then
					p2 = nil
				else
					p2 = type(_r[p1].text) ~= 'string' and 1 or nil
				end
				return p1, p2
			end
			local pos1, pos2 = _next(0, nil)
			-- Must be local: this function calls itself below, and a shared (global)
			-- variable would be overwritten by the nested call before it is read again.
			local _t
			while pos1 <= #_r do
				_t = pos2 and _r[pos1].text[pos2] or _r[pos1]
				_t.text = _t.text:gsub(' ', '')
				if _t.linkto then _t.linkto = _remove_space({{text = _t.linkto}})[1].text end
				if _t.ruby then _t.ruby = _remove_space({{text = _t.ruby}})[1].text end
				if pos2 then
					-- Also clean the link target / ruby of the enclosing group.
					if _r[pos1].linkto then _r[pos1].linkto = _remove_space({{text = _r[pos1].linkto}})[1].text end
					if _r[pos1].ruby then _r[pos1].ruby = _remove_space({{text = _r[pos1].ruby}})[1].text end
				end
				pos1, pos2 = _next(pos1, pos2)
			end
			return _r
		end
	elseif options.space == 'all' then
		_remove_space = function(_r)
			return _r
		end
	else
		-- Default: drop spaces that sit between characters of range_nospace.
		-- context_ak / context_bk tell a nested call whether such a character
		-- precedes / follows the piece being processed.
		_remove_space = function(_r, context_ak, context_bk)
			-- Same cursor helper as in the 'none' variant.
			local function _next(p1, p2)
				if p2 and p2 < #_r[p1].text then
					return p1, p2 + 1
				end
				p1 = p1 + 1
				if p1 > #_r then
					p2 = nil
				else
					p2 = type(_r[p1].text) ~= 'string' and 1 or nil
				end
				return p1, p2
			end
			local pos1, pos2 = _next(0, nil)
			-- (pos3, pos4) is a look-ahead cursor used to find the next significant character.
			local pos3, pos4 = pos1, pos2
			local after_k = context_ak
			local before_k
			local _t, char
			while pos1 <= #_r do
				-- Refresh the look-ahead once the main cursor has caught up with it.
				if pos3 == pos1 and (pos4 == pos2 or pos4 < pos2) or pos3 < pos1 then
					before_k = context_bk
					pos3, pos4 = _next(pos1, pos2)
					while pos3 <= #_r do
						_t = pos4 and _r[pos3].text[pos4] or _r[pos3]
						-- First character that is neither a space nor an apostrophe.
						char = mw.ustring.find(_t.text, '[^ \']')
						if char then
							char = mw.ustring.sub(_t.text, char, char)
							before_k = mw.ustring.find(char, '['..range_nospace..']')
							break
						end
						pos3, pos4 = _next(pos3, pos4)
					end
				end
				_t = pos2 and _r[pos1].text[pos2] or _r[pos1]
				if _t.linkto then _t.linkto = _remove_space({{text = _t.linkto}}, after_k, before_k)[1].text end
				if _t.ruby then _t.ruby = _remove_space({{text = _t.ruby}}, after_k, before_k)[1].text end
				if pos2 then
					if _r[pos1].linkto then _r[pos1].linkto = _remove_space({{text = _r[pos1].linkto}}, after_k, before_k)[1].text end
					if _r[pos1].ruby then _r[pos1].ruby = _remove_space({{text = _r[pos1].ruby}}, after_k, before_k)[1].text end
				end
				-- Rebuild the text run by run: each run of range_nospace characters is kept,
				-- and the gap before it loses its spaces when it only consists of spaces and
				-- apostrophes and follows another such run.
				local seg = {}
				local i0 = 1
				for i1, m1, i2 in mw.ustring.gmatch(_t.text, '()(['..range_nospace..']+)()') do
					if after_k and not mw.ustring.sub(_t.text, i0, i1 - 1):find'[^ \']' then
						table.insert(seg, (mw.ustring.sub(_t.text, i0, i1 - 1):gsub(' ', '')))
					else
						table.insert(seg, mw.ustring.sub(_t.text, i0, i1 - 1))
					end
					table.insert(seg, m1)
					after_k = true
					i0 = i2
				end
				-- Trailing gap: spaces are only removed when the following piece starts
				-- with a range_nospace character as well.
				after_k = after_k and not mw.ustring.sub(_t.text, i0):find'[^ \']'
				if after_k and before_k then
					table.insert(seg, (mw.ustring.sub(_t.text, i0):gsub(' ', '')))
				else
					table.insert(seg, mw.ustring.sub(_t.text, i0))
				end
				_t.text = table.concat(seg)
				pos1, pos2 = _next(pos1, pos2)
			end
			return _r
		end
	end
	-- Create the link table
	-- e.g. "[[エドガー・アラン・ポー|アラン・ポー]]の[[推理 小説]]"
	local link_table = str_parse_link(term:gsub('%%', '')) -- remove '%'
	--[[link_table = {
		{text = 'アラン・ポー', linkto = 'エドガー・アラン・ポー'},
		{text = 'の'},
		{text = '推理 小説', linkto = '推理 小説'},
	}]]
	-- Remove romaji markup
	kana = kana:gsub('[%^%-%.]', '') -- remove '^', '-', '.', preserve '%', ' '
	-- Create the ruby table
	-- e.g. 'アラン・ポーの推理 小説', 'あらん ぽー の すいり しょうせつ'
	-- ("ぽお" is not allowed)
	local ruby_table = {}
	local plain_term_raw = export.to_text(str_parse_link(term)) -- Remove links: [[A|B]] -> B, [[C]] -> C
	local plain_kana_raw = options.allow_ruby_link and kana or export.to_text(str_parse_link(kana))
	-- "%" cuts both strings into pieces that are matched pairwise.
	local plain_term = mw.text.split(plain_term_raw, '%%')
	local plain_kana = mw.text.split(plain_kana_raw, '%%')
	if #plain_term ~= #plain_kana then
		mw.logObject(plain_term)
		mw.logObject(plain_kana)
		error('Separator "%" in the kanji and kana strings do not match.')
	end
	for i, plain_term_i in ipairs(plain_term) do
		if plain_term_i ~= '' or plain_kana[i] ~= '' then
			-- pattern_ruby: list of capture groups making up the matching pattern.
			-- pattern_ruby_is_ruby[n]: true when the n-th group is a '(..-)' ruby capture.
			local pattern_ruby, pattern_ruby_is_ruby = {}, {}
			-- Appends a capture group matching the literal stretch s_sub, tolerating
			-- optional spaces, hira/kata variants and XML tags.
			local function _func_pat(s_sub)
				local in_xml_tag = false
				-- The byte pattern below visits s_sub one UTF-8 character at a time.
				table.insert(pattern_ruby, '(' .. s_sub:gsub('[\1-\255][\128-\191]*', function(m0)
					if in_xml_tag then
						if m0 == '>' then in_xml_tag = false end
						return ''
					else
						if m0 == '<' then
							in_xml_tag = true
							return ' ?<.->'
						else
							local m0_m = m0
							-- Escape pattern magic characters.
							if m0:find'^[%(%)%.%%%+%-%*%?%[%]%^%$]$' then m0_m = '%' .. m0_m end
							if mw.ustring.find(m0, '^['..range_mute..']$') then m0_m = '[' .. m0_m .. ' -]?'
							elseif mw.ustring.find(m0, '^[ヶゖケ]$') then
								m0_m = "[" .. str_kata_to_hira(m0_m) .. str_hira_to_kata(m0_m) .. "かがこカガコ]"
							elseif mw.ustring.find(m0, '^['..range_hirakata..']$') then
								m0_m = "[" .. str_kata_to_hira(m0_m) .. str_hira_to_kata(m0_m) .. "]"
							end
							return ' ?' .. m0_m
						end
					end
				end) .. ' ?)')
			end
			-- Empty every XML tag so that its content is not scanned for characters needing ruby.
			local plain_term_noxml = plain_term_i:gsub('%b<>', '<>')
			local pos0 = 1
			-- Use a custom iterator so that we can exclude "&" and "@" from range_noalias, as they're part of %p.
			-- It yields (start, text, end + 1) for each stretch that may receive ruby.
			for pos1, s, pos2 in (function()
				local pos2, pos1, c = 1
				local len = mw.ustring.len(plain_term_noxml)
				return function()
					if pos2 > len then return nil end
					pos1 = math.min(
						mw.ustring.find(plain_term_noxml, "[^"..range_noalias.."]", pos2) or math.huge,
						mw.ustring.find(plain_term_noxml, "[&@]", pos2) or math.huge
					)
					if pos1 == math.huge then return nil end
					pos2 = pos1
					repeat
						pos2, c = select(2, mw.ustring.find(plain_term_noxml, "(["..range_noalias.."])", pos2 + 1))
					until (not pos2) or (pos2 and not c:find("[&@]"))
					pos2 = pos2 or len + 1
					return pos1, mw.ustring.sub(plain_term_noxml, pos1, pos2 - 1), pos2
				end
			end)() do
				if pos0 < pos1 then
					local s_sub = mw.ustring.sub(plain_term_noxml, pos0, pos1 - 1)
					-- A literal stretch made only of mute characters does not separate two ruby captures.
					if not pattern_ruby_is_ruby[#pattern_ruby] or mw.ustring.match(s_sub, '[^' .. range_mute .. ']') then
						_func_pat(s_sub)
					end
				end
				if not pattern_ruby_is_ruby[#pattern_ruby] then
					table.insert(pattern_ruby, '(..-)')
					pattern_ruby_is_ruby[#pattern_ruby] = true
				end
				pos0 = pos2
			end
			if #pattern_ruby == 0 then
				-- isolated symbol matches anything.
				table.insert(ruby_table, {
					text = plain_term_i,
					ruby = plain_kana[i] ~= plain_term_i and mw.ustring.len(plain_term_i) == 1 and plain_kana[i] or nil,
				})
			else
				if pos0 <= mw.ustring.len(plain_term_noxml) then
					_func_pat(mw.ustring.sub(plain_term_noxml, pos0))
				end
				local pat_ruby_s = table.concat(pattern_ruby)
				-- 'アラン・ポーの推理 小説' to '( ?[あア] ?[らラ] ?[んン] ?[・ -]? ?[ぽポ] ?ー ?[のノ] ?)(..-)( )(..-)'
				-- Execute matching
				local ruby_table_i_ruby = {mw.ustring.match(plain_kana[i], '^'..pat_ruby_s..'$')}
				if #ruby_table_i_ruby > 0 then
					local ruby_table_i_text = {mw.ustring.match(plain_term_i, '^'..pat_ruby_s..'$')}
					for n_match = 1, #pattern_ruby do
						-- Exclude "&" and "@" from range_noruby, as they're part of %p.
						if (
							pattern_ruby_is_ruby[n_match] and
							ruby_table_i_text[n_match] ~= ruby_table_i_ruby[n_match] and
							(
								mw.ustring.find(ruby_table_i_text[n_match], '[^' .. range_noruby .. ']') or
								ruby_table_i_text[n_match]:find("[&@]")
							)
						) then
							table.insert(ruby_table, {
								text = ruby_table_i_text[n_match],
								ruby = ruby_table_i_ruby[n_match],
							})
						else
							-- Plain text: append to the previous plain entry when there is one.
							if #ruby_table > 0 and ruby_table[#ruby_table].ruby == nil then
								ruby_table[#ruby_table].text = ruby_table[#ruby_table].text .. ruby_table_i_text[n_match]
							else
								table.insert(ruby_table, {text = ruby_table_i_text[n_match]})
							end
						end
					end
				elseif options.try == 'force' then
					-- Fall back to a character-level comparison of the two strings.
					require('Module:debug').track('ja-ruby/forced match')
					local forced_result = str_ucompare(plain_term_i, plain_kana[i], options.try_force_limit)
					for ii, vv in ipairs(forced_result[1]) do
						table.insert(ruby_table, {
							text = vv,
							ruby = forced_result[2][ii] ~= vv and forced_result[2][ii] or nil,
						})
					end
				else
					mw.log(pat_ruby_s)
					error('Can not match "' .. plain_term_i .. '" and "' .. plain_kana[i] .. '"')
				end
			end
		end
	end
	--[[ruby_table = {
		{text = 'アラン・ポーの'},
		{text = '推理', ruby = 'すいり'},
		{text = ' '}
		{text = '小説', ruby = 'しょうせつ'},
	}]]
	-- Merge the ruby and link table
	--[[return {
		{text = 'アラン・ポー', linkto = 'エドガー・アラン・ポー'},
		{text = 'の'},
		{text = {
			{text = '推理', ruby = 'すいり'},
			{text = ''}
			{text = '小説', ruby = 'しょうせつ'},
		}, linkto = '推理小説'},
	}]]
	return _remove_space(table_merge(link_table, ruby_table))
end
--[==[A shortcut for combinations like {to_wiki(parse_text(...))}. It accepts a table containing named arguments instead of positional ones.
* {term}, {kana}: Arguments for {parse_text()}. Used when both are present.
* {markup}: Argument for {parse_markup()}. Used when {term} and {kana} are not both present.
* {target} (also accepted under the name {to_target}): Default to {to_wiki()}. {'text'} for {to_text()}; {'ruby'} for {to_ruby()}; {'markup'} for {to_markup()}.
* {options}: A shared option table passed to all functions involved.
An error is raised when neither {term} and {kana} nor {markup} are given.]==]
function export.ruby_auto(args)
	-- The code historically read "target" while the documentation named the key
	-- "to_target"; both are honoured, with "target" taking precedence.
	local target = args.target
	if target == nil then target = args.to_target end
	local to_target
	if target == 'text' then
		to_target = export.to_text
	elseif target == 'ruby' then
		to_target = export.to_ruby
	elseif target == 'markup' then
		to_target = export.to_markup
	else
		to_target = export.to_wiki
	end
	if args.term and args.kana then
		return to_target(export.parse_text(args.term, args.kana, args.options), args.options)
	elseif args.markup then
		return to_target(export.parse_markup(args.markup, args.options), args.options)
	else
		error('Cannot find "term" and "kana" or "markup"')
	end
end
-- Hand the table of exported functions to whoever loads this module.
return export