#include "machines.h"
#include "error.h"
#include "logger.h"
#include "tr.h"
#include "utf8.h"

#include <algorithm>
#include <exception>
#include <iterator>
#include <string>
#include <string_view>
#include <utility>

namespace fsm {

/// Expands a placeholder pattern into a string.
///
/// Rules, as implemented by the scanner below:
///  - ordinary characters are copied to the output;
///  - outside braces, `\x` emits `x` literally;
///  - `{{` emits `{` and `}}` emits `}`;
///  - `{}` and `{...}` emit `fill`; the text between the braces is
///    discarded, except that a backslash-escaped character inside the
///    braces is appended to the output;
///  - a `{` inside a field, or a lone `}` not followed by `}`, is reported
///    through parse_error(ec::t_internal, ...).
///
/// NOTE(review): a pattern that ends in the middle of an escape, with an
/// unclosed `{`, or with a single trailing `}` is accepted silently - the
/// pending state is simply dropped. Confirm whether that is intended.
///
/// @param pattern the format string to expand
/// @param fill    the text substituted for every placeholder
/// @return the expanded string
auto defaultPlaceholder(std::string_view pattern, std::string_view fill)
    -> std::string {
    enum class scan {
        text,         // outside any braces
        escape,       // just saw '\' outside braces
        open,         // just saw '{'
        close,        // just saw '}' outside a field
        field,        // inside '{...'
        field_escape  // just saw '\' inside a field
    };
    auto s = scan::text;
    std::string out;

    // Iterate by reference: parse_error receives the address of the
    // offending character inside `pattern`, so `c` must not be a copy.
    for (const auto& c : pattern) {
        switch (s) {
        case scan::text:
            switch (c) {
            case '\\':
                s = scan::escape;
                break;
            case '{':
                s = scan::open;
                break;
            case '}':
                s = scan::close;
                break;
            default:
                out.push_back(c);
                break;
            }
            break;
        case scan::escape:
            out.push_back(c);
            s = scan::text;
            break;
        case scan::field_escape:
            out.push_back(c);
            s = scan::field;
            break;
        case scan::open:
            if (c == '{') {
                // "{{" is an escaped literal brace
                out.push_back(c);
                s = scan::text;
            } else if (c == '}') {
                // "{}" is an empty placeholder
                out.append(fill);
                s = scan::text;
            } else {
                // Any other character (including '\') starts the field body
                s = scan::field;
            }
            break;
        case scan::close:
            if (c == '}') {
                // "}}" is an escaped literal brace
                out.push_back(c);
                s = scan::text;
            } else {
                parse_error(ec::t_internal, pattern, &c);
            }
            break;
        case scan::field:
            if (c == '}') {
                out.append(fill);
                s = scan::text;
            } else if (c == '\\') {
                s = scan::field_escape;
            } else if (c == '{') {
                parse_error(ec::t_internal, pattern, &c);
            }
            // Every other character inside a field is ignored.
            break;
        }
    }
    return out;
}

/// Encodes a single code point as a UTF-8 string.
auto toStr(char32_t c) -> std::string {
    std::string encoded;
    utf8::append(c, std::back_inserter(encoded));
    return encoded;
}

/// Returns a code-point iterator positioned at the start of `input`.
/// The iterator refers to `input`'s storage and must not outlive it.
auto utf8_begin(std::string_view input)
    -> utf8::iterator<std::string_view::iterator> {
    return utf8::iterator<std::string_view::iterator>(
        input.begin(), input.begin(), input.end());
}

/// Returns the code-point iterator marking the end of `input`.
/// The iterator refers to `input`'s storage and must not outlive it.
auto utf8_end(std::string_view input)
    -> utf8::iterator<std::string_view::iterator> {
    return utf8::iterator<std::string_view::iterator>(
        input.end(), input.begin(), input.end());
}

/// Reverses `input` code point by code point (not byte by byte), so that
/// multi-byte UTF-8 sequences stay intact.
///
/// The input is decoded once and re-encoded back to front, which is linear
/// in the input size (prepending each code point to the result would be
/// quadratic).
[[nodiscard]] auto unicode_reverse(std::string_view input) -> std::string {
    std::u32string code_points;
    utf8::utf8to32(input.begin(), input.end(),
                   std::back_inserter(code_points));

    std::string ret;
    // The reversed string has exactly as many bytes as the input.
    ret.reserve(input.size());
    for (auto it = code_points.rbegin(); it != code_points.rend(); ++it) {
        utf8::append(*it, std::back_inserter(ret));
    }
    return ret;
}

/// Runs the state machine over channel `ch` of `w` and returns the word's
/// data with that channel replaced by the machine's output.
///
/// The channel text is decoded to code points (reversed first when `dir`
/// has `r_in` set), fed one code point at a time through state::step
/// starting from the state named "S", and the per-step outputs are
/// concatenated. The final state's suf() is appended, and the result is
/// reversed by code point when `dir` has `r_out` set.
///
/// @param w  the word to transform (taken by value; its data is moved out)
/// @param ch the channel of `w.data` to transform
/// @return `w.data` with channel `ch` rewritten
auto state_machine::operator()(Word w, channelID ch) const -> word_data_t {
    std::string& input = w.data[ch];
    log_debug("transform: fsm");

    std::u32string ustr;
    utf8::utf8to32(input.begin(), input.end(), std::back_inserter(ustr));
    if (dir & r_in) {
        std::reverse(ustr.begin(), ustr.end());
    }

    const state* s = &states.at("S");
    std::string output;
    // initial guess of size; is likely to be at least this large so skip any
    // smaller allocations
    output.reserve(ustr.size());

    for (auto c : ustr) {
        log_debug("In state ", kblib::quoted(s->name));
        auto&& [out, dest] = s->step(c);
        output += out;
        log_debug("Mapped ", kblib::quoted(toStr(c)), " to ",
                  kblib::quoted(out));
        if (not dest) {
            // A step must always name a next state; treat a missing one as
            // an unrecoverable internal error.
            log_err(+c);
            std::terminate();
        }
        s = dest;
    }
    output.append(s->suf());

    if (dir & r_out) {
        output = unicode_reverse(output);
    }
    // `output` is not used again, so hand its buffer over instead of copying.
    input = std::move(output);
    // `w.data` is a subobject of a parameter, so it is not implicitly moved
    // on return; the explicit move avoids a copy.
    return std::move(w.data);
}

/// Computes the output text and next state for one input code point.
///
/// Rules are tried in this order, and the first that applies wins:
///  1. `cmap`  - exact per-character rule;
///  2. `set`   - rules whose `matches` contain `c` (tested with tr::find);
///  3. `map`   - positional mapping from `in_set` to `out_set`;
///  4. `match` - pass the character through unchanged;
///  5. `def`   - user-supplied default action, if present;
///  6. identity - emit `c` unchanged and go to `ret`.
/// A rule whose destination is null keeps the machine in this state.
///
/// @param c the input code point
/// @return the text to emit and the state to move to
auto state::step(char32_t c) const -> std::pair<std::string, const state*> {
    const std::string c_asStr = toStr(c);

    // A null destination means "stay in the current state".
    const auto target_or_self = [&](const state* target) {
        return target ? target : this;
    };
    // Expands an action's format with the current character as the fill.
    const auto apply = [&](const state::action_t& action)
        -> std::pair<std::string, const state*> {
        return {defaultPlaceholder(action.format, c_asStr),
                target_or_self(action.dest)};
    };

    if (auto [it, found] = kblib::get_check(cmap, c); found) {
        log_debug("character rule");
        return apply(it->second);
    }

    for (const auto& r : set) {
        if (tr::find({r.matches.data(), r.matches.size()}, c)) {
            log_debug("set rule");
            return apply(r.action);
        }
    }

    for (const auto& m : map) {
        // find_in yields the index of `c` in in_set, or in_set.size() when
        // it is absent.
        // NOTE(review): assumes out_set has at least as many elements as
        // in_set - confirm this is enforced where the rule is built.
        if (auto p = kblib::find_in(m.in_set.begin(), m.in_set.end(), c);
            p != m.in_set.size()) {
            log_debug("map rule");
            return {toStr(*(m.out_set.begin() + p)), target_or_self(m.dest)};
        }
    }

    for (const auto& m : match) {
        // Only membership matters, so stop at the first hit.
        if (std::find(m.matches.begin(), m.matches.end(), c)
            != m.matches.end()) {
            log_debug("match rule");
            return {toStr(c), target_or_self(m.dest)};
        }
    }

    if (def) {
        log_debug("Default rule (user)");
        return apply(*def);
    }

    log_debug("Default rule (identity)");
    return {toStr(c), target_or_self(ret)};
}

} // namespace fsm