local concat = table.concat
local explode = require("Module:string utilities").explode_utf8
local find = string.find
local insert = table.insert
local match = string.match
local pcall = pcall
local umatch = mw.ustring.match
local export = {}
do
local ids = mw.loadData("Module:Hani/data").ids
-- Finds where an IDS (ideographic description sequence) ends.
-- `text` is an array of characters, `i` is the index of the IDS operator, and `components` is the number of operands that operator takes (as looked up in `ids`).
-- Returns the index of the last character belonging to the sequence. Operands which are themselves IDS operators are resolved recursively.
-- On an incomplete sequence, throws a table with `_end` (index of the last character consumed) and `_expected` (number of characters still missing); any other error is rethrown unchanged.
local function find_end_of_ids(text, i, components)
	local pos, filled = i, 0
	repeat
		filled = filled + 1
		pos = pos + 1
		local char = text[pos]
		-- Running off the end of the input or hitting whitespace means this operand (and every one after it) is missing.
		if not (char and not umatch(char, "%s")) then
			error({
				_end = pos - 1,
				_expected = components - filled + 1,
			})
		end
		local nested = ids[char]
		if nested then
			-- The operand is itself an operator, so skip ahead to the end of the nested sequence.
			local ok, result = pcall(find_end_of_ids, text, pos, nested)
			if not ok then
				-- Structured failures also need to account for the operands still owed at this level; other errors have no `_expected` field.
				local missing = result._expected
				if missing then
					result._expected = missing + components - filled
				end
				error(result)
			end
			pos = result
		end
	until filled == components
	return pos
end
-- Explodes a string of characters into an array, taking into account any ideographic description characters (IDS). By default, it throws an error if invalid IDS is found. If `fallback` is set, the invalid sequence is split into the largest possible components (e.g. "⿲⿸AB⿱CD" would be split into "⿲", "⿸AB" and "⿱CD", while "⿰⿱AB⿰C" would be split into "⿰", "⿱AB", "⿰" and "C"); this is useful for sortkey contexts, as invalid sequences may occur in arbitrary input.
-- `text`: the string to explode.
-- `fallback`: if truthy, invalid IDS is logged and tracked instead of raising an error.
-- Returns an array in which each complete IDS sequence occupies a single element.
function export.explode_chars(text, fallback)
	-- Fast path: with no IDS operator present, a plain explode is enough.
	-- NOTE(review): the third check used to search for an empty pattern, which always matches and so disabled this fast path. It is assumed to have been U+31EF, the only ideographic description character outside U+2FF0-U+2FFF; confirm against the keys of the `ids` data table.
	if not (
		match(text, "\226\191[\176-\191]") -- U+2FF0 to U+2FFF
		or find(text, "〾")
		or find(text, "\227\135\175", 1, true) -- U+31EF
	) then
		return explode(text)
	end
	text = explode(text)
	local ret, text_len, i = {}, #text, 0
	-- `while` rather than `repeat ... until i == text_len`, so that an empty array ends the loop immediately instead of never reaching the exit condition.
	while i < text_len do
		i = i + 1
		local char = text[i]
		local components = ids[char]
		if components then
			local success, j = pcall(find_end_of_ids, text, i, components)
			if success then
				-- Valid sequence: merge it into one element and skip past it.
				char = concat(text, nil, i, j)
				i = j
			elseif not j._expected then -- Any other errors (e.g. stack overflows) will be strings.
				error(j)
			else
				local msg = "Invalid IDS sequence: \"" .. concat(text, nil, i, j._end) ..
					"\": expected " .. j._expected .. " additional character" ..
					(j._expected == 1 and "" or "s") .. "."
				if not fallback then
					error(msg)
				end
				-- Fallback mode: record the problem and emit the operator on its own; the following characters are then processed normally.
				mw.log(msg)
				require("Module:debug/track")("Hani/invalid ids")
			end
		end
		insert(ret, char)
	end
	return ret
end
end
-- Expands iteration marks (々 and 〻) into the characters they stand for, where a run of n iteration marks repeats the n characters before it (e.g. "時々" = "時時", "馬鹿々々しい" = "馬鹿馬鹿しい" etc). Punctuation and unconnected sets of iteration marks block iteration, and any excess marks are left as-is. For example, "X,Y々々" = "X,YY々", and "X々Y々々" = "XXYY々" (not "XXYXY").
-- `text`: the string to convert; returned unchanged if it contains no iteration marks.
function export.convert_iteration_marks(text)
	-- Byte-level check for U+3005 or U+303B, to avoid exploding text that has nothing to convert.
	if not match(text, "\227\128[\133\187]") then
		return text
	end
	local chars = explode(text)
	-- Scan from the end, so that each run of marks is resolved only against the characters directly before it (e.g. "X々Y々々" gives "XXYY々", with one leftover mark, not "XXYXY").
	local pos, pending = #chars, 0
	while pos > 0 do
		local char = chars[pos]
		if char == "々" or char == "〻" then
			pending = pending + 1
		elseif pending > 0 then
			-- Count how many characters ending at `pos` can be repeated, up to one per pending mark. The start of the text, another iteration mark or a non-alphanumeric character ends the count early.
			local usable = 0
			while usable < pending do
				local candidate = chars[pos - usable]
				if not candidate or candidate == "々" or candidate == "〻" or umatch(candidate, "%W") then
					break
				end
				usable = usable + 1
			end
			if usable > 0 then
				-- Overwrite the first `usable` marks after `pos` with copies of the characters being repeated, then carry on from the start of the repeated span.
				local first = pos - usable + 1
				for k = first, pos do
					chars[k + usable] = chars[k]
				end
				pos = first
			end
			-- Marks which could not be matched stay in place and are not carried over to an earlier run.
			pending = 0
		end
		pos = pos - 1
	end
	return concat(chars)
end
return export