#include "tstrings.h"
#include "error.h"
#include "interpolate.h"
#include "kblib/simple.h"
#include "kblib/stats.h"
#include "srell/srell.hpp"

// NOTE(review): the copy of this file under review had three standard
// includes whose <header> names were lost. The list below is what this
// translation unit visibly uses; confirm against version control.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <memory>
#include <stdexcept>
#include <string>
#include <string_view>
#include <tuple>
#include <variant>
#include <vector>

using kblib::a;
using namespace std::literals;

/// Tokens compare equal when both their type and their text are equal.
[[nodiscard]] auto operator==(const token& a, const token& b) -> bool {
  return std::tie(a.type, a.o_val) == std::tie(b.type, b.o_val);
}

/// Lexicographic ordering on (type, text).
[[nodiscard]] auto operator<(const token& a, const token& b) -> bool {
  return std::tie(a.type, a.o_val) < std::tie(b.type, b.o_val);
}

namespace {

// move to grammar file header

/// Translate old-style doubled-character escapes into backslash escapes.
///
/// A doubled special character (one of `<`, `>`, `{`, `}`) becomes a
/// backslash followed by that character; every other character, including a
/// lone special character, is copied through unchanged.
///
/// The previous implementation never cleared its "escape pending" flag and
/// emitted the second character twice, so "a<<b" became "a\<<bb".
[[nodiscard, maybe_unused]] auto compat_filter(std::string_view in)
    -> std::string {
  constexpr std::string_view special = "<>{}";
  std::string ret;
  ret.reserve(in.size());
  bool esc = false;
  char pending{};
  for (const char c : in) {
    if (esc) {
      esc = false;
      if (c == pending) {
        ret.push_back('\\');
        ret.push_back(c);
        continue;
      }
      // Not a doubled character: the held-back one is literal. Fall through
      // so that c itself may start a new pair.
      ret.push_back(pending);
    }
    if (special.find(c) != std::string_view::npos) {
      pending = c;
      esc = true;
    } else {
      ret.push_back(c);
    }
  }
  if (esc) {
    // Input ended on a lone special character; keep it.
    ret.push_back(pending);
  }
  return ret;
}

/// Remove escape sequences from a run of text.
///
/// `\x` becomes `x`; a doubled `<<` or `>>` becomes a single character; a
/// `<` or `>` followed by a different character is kept together with that
/// character.
[[nodiscard]] auto unescape(std::string_view in) -> std::string {
  bool b_esc = false; // previous character was an unconsumed backslash
  bool d_esc = false; // previous character was an unconsumed '<' or '>'
  std::string ret;
  char prev{};
  for (const char c : in) {
    if (d_esc) {
      if (c != prev) {
        ret.push_back(prev);
      }
      ret.push_back(c);
      d_esc = false;
    } else if (b_esc) {
      ret.push_back(c);
      b_esc = false;
    } else if (c == '\\') {
      b_esc = true;
    } else if (c == '<' or c == '>') {
      d_esc = true;
    } else {
      ret.push_back(c);
    }
    prev = c;
  }
  if (d_esc) {
    // A bracket at the very end used to be dropped silently. The tokenizer's
    // text patterns only ever hand over brackets in pairs, so this only
    // matters for other callers.
    ret.push_back(prev);
  }
  return ret;
}

/// Regex-based equivalent of unescape(): collapses `<<`, `>>` and `\x`.
/// Only referenced from disabled (`#if 0`) tokenizer rules.
[[nodiscard, maybe_unused]] auto strip_escapes(std::string in) -> std::string {
  const srell::regex double_lb("<<");
  const srell::regex double_rb(">>");
  const srell::regex backslash(R"__(\\(.))__");
  in = srell::regex_replace(in, double_lb, "<");
  in = srell::regex_replace(in, double_rb, ">");
  return srell::regex_replace(in, backslash, "$1");
}

/// Input iterator that lexes a template string into tokens.
///
/// A default-constructed iterator is the end sentinel; an iterator that runs
/// off the end of its input resets itself to that state. The lexer has two
/// modes, selected by `state`: 0 is plain text, 1 is inside a `<func(...)>`
/// call, where `parenDepth` counts the currently open parentheses (-1 when
/// not inside a call).
class token_iterator {
 public:
  using value_type = token;
  using difference_type = std::ptrdiff_t;
  using pointer = const value_type*;
  using reference = const value_type&;
  using const_reference = const value_type&;
  using iterator_category = std::input_iterator_tag;

  token_iterator() noexcept = default;

  /// Starts lexing `_s` and immediately extracts the first token.
  /// Deliberately not explicit: tokenize::begin() relies on `return {data};`.
  // NOTE(review): the end of this constructor (a commented-out try/catch
  // around extract_next) and the head of operator* were damaged in the copy
  // under review and are reconstructed here; confirm against version control.
  token_iterator(std::string_view _s)
      : data(_s) {
    extract_next();
  }

  [[nodiscard]] auto operator*() const noexcept -> const_reference {
    return curr;
  }
  [[nodiscard]] auto operator->() const noexcept -> pointer { return &curr; }

  /// Advances to the next token, skipping op::null placeholders.
  auto operator++() -> token_iterator& {
    do {
      extract_next();
    } while (not data.empty() and curr.type == op::null);
    return *this;
  }
  auto operator++(int) -> const token_iterator {
    token_iterator tmp{*this};
    ++*this;
    return tmp;
  }

  [[nodiscard]] explicit operator bool() const noexcept {
    return not curr.o_val.empty();
  }

  friend auto operator==(const token_iterator& a,
                         const token_iterator& b) noexcept -> bool;

 private:
  [[noreturn]] auto error(ec err, const char* epos) -> void {
    parse_error(err, data, epos);
  }
  [[noreturn]] auto error(ec err) -> void { parse_error(err, data, cpos); }

  /// The actual tokenizer: matches the rule table for the current mode
  /// against the input at `cpos`, runs the first matching rule's action and
  /// advances `cpos` past the match.
  auto extract_next() -> void {
    try {
      if (cpos == ptrdiff_t(data.size())) {
        // revert to sentinel on end of range
        if (not data.empty()) {
          if (parenDepth != -1) {
            error(ec::eos);
          }
          *this = {};
        }
        return;
      }

      struct match_action {
        srell::regex match;
        void (*action)(token_iterator*, const srell::cmatch&);
      };
      const auto flag = srell::regex_constants::dotall;

      // Rules are tried in order; the first match wins. Every pattern must
      // be anchored with ^ because apply() advances cpos by the match
      // length only.
      const static std::vector<match_action> textmode{
          {srell::regex{R"__(^((?:[^<>\\]|<<|>>|\\.)+))__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::text, unescape(matches.str(1))};
           }},
          {srell::regex{R"__(^<>)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::early_close);
           }},
          {srell::regex{R"__(^<(\d+)>)__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::argument, matches.str(1)};
           }},
          {srell::regex{R"__(^<([\w\d]+)>)__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::special_argument, matches.str(1)};
           }},
          {srell::regex{R"__(^<(\.\.\.)>)__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::ellipsis, matches.str(1)};
           }},
          {srell::regex{R"__(^<(#[\w\d]+)>)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::overmarked_arg);
           }},
          {srell::regex{R"__(^<([^(>]*\\[^>]*)>)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::escape_nostr);
           }},
          {srell::regex{R"__(^<\()__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::blank_fname);
           }},
          // NOTE(review): the patterns of the next two rules were damaged in
          // the copy under review; they are reconstructed by analogy with
          // the `raw(` rules in escmode1. Confirm against version control.
          {srell::regex{R"__(^<raw\(([^)]*)\)>)__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::text, matches.str(1)};
           }},
          {srell::regex{R"__(^<raw\()__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::invalid_raw);
           }},
          {srell::regex{R"__(^<([-\w\d+*/^?=.]+)\()__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->state = 1;
             t->parenDepth = 1;
             // don't actually return the < token
             t->curr = {op::push_func, matches.str(1)};
           }},
#if 0
          {srell::regex{R"__(^<(?=[^>]+>))__", flag},
           [](token_iterator* t, const srell::cmatch&) {
             t->state = 1;
             // don't actually return the < token
             t->curr = {op::null, {}};
           }},
#endif
          {srell::regex{R"__(^<[^>]*>)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::bad_format);
           }},
          {srell::regex{R"__(^<[^>]*)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::extra_ab);
           }},
          {srell::regex{R"__(^.+)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::bad_format);
           }},
          {srell::regex{R"__(^$)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->extract_next();
           }},
      };

      // Inside a call, but every parenthesis has already been closed.
      const static std::vector<match_action> escmode0 = {{
          {srell::regex{R"__(^$)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::eos);
           }},
          {srell::regex{R"__(^\))__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::extra_rp);
           }},
#if 0
          {srell::regex{R"__(^\|)__", flag},
           [](token_iterator* t, const srell::cmatch&) {
             t->error(ec::extra_p);
           }},
          {srell::regex{R"__(^\()__", flag},
           [](token_iterator* t, const srell::cmatch&) {
             t->error(ec::blank_fname);
           }},
          {srell::regex{R"__(^raw\(([^)]*)\))__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::text, matches.str(1)};
           }},
          {srell::regex{R"__(^([-\w\d+*/^?=.]+)\()__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::push_func, matches.str(1)};
             ++(t->parenDepth);
           }},
          {srell::regex{R"__(^#[\w\d]+\()__", flag},
           [](token_iterator* t, const srell::cmatch&) {
             t->error(ec::call_arg);
           }},
          {srell::regex{R"__(^[^>(|]*\()__", flag},
           [](token_iterator* t, const srell::cmatch&) {
             t->error(ec::invalid_fname);
           }},
          {srell::regex{R"__(^(\.\.\.))__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::ellipsis, matches.str(1)};
           }},
          {srell::regex{R"__(^>$)__", flag},
           [](token_iterator* t, const srell::cmatch&) {
             //end of stream, correctly formed
             t->state = 0;
             t->parenDepth = -1;
             t->curr = {op::text, ""s};
           }},
          {srell::regex{R"__(^>((?:[^<>\\]|<<|>>|\\.)+))__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->state = 0;
             t->parenDepth = -1;
             //don't actually return the > token
             t->curr = {op::text, strip_escapes(matches.str(1))};
           }},
#endif
          {srell::regex{R"__(^.+)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::bad_format);
           }},
      }};

      // Inside a call with at least one parenthesis open.
      const static std::vector<match_action> escmode1 = {{
          {srell::regex{R"__(^$)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::eos);
           }},
          {srell::regex{R"__(^raw\(([^)]*)\)\|?)__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::text, matches.str(1)};
           }},
          {srell::regex{R"__(^raw\()__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::invalid_raw);
           }},
          {srell::regex{R"__(^([-\w\d+*/^?=.]+)\()__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::push_func, matches.str(1)};
             ++(t->parenDepth);
           }},
          {srell::regex{R"__(^#[\w\d]+\()__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::call_arg);
           }},
          {srell::regex{R"__(^#(\d+)\|?)__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::argument, matches.str(1)};
           }},
          {srell::regex{R"__(^#([\w\d]+)\|?)__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::special_argument, matches.str(1)};
           }},
          {srell::regex{R"__(^#([\w\d#]+))__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::extra_h);
           }},
          {srell::regex{R"__(^#[^()|<>])__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::early_close);
           }},
          {srell::regex{R"__(^(\.\.\.)\|?)__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::ellipsis, matches.str(1)};
           }},
          {srell::regex{R"__(^((?:[^|()<>\\]|\\.)+)\|?)__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::text, unescape(matches.str(1))};
           }},
          {srell::regex{R"__(^((?:[^|()<>\\]|\\.)*)\|)__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::text, unescape(matches.str(1))};
           }},
          {srell::regex{R"__(^\()__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::blank_fname);
           }},
          // This pattern used to be unanchored. regex_search could then
          // match an escaped paren far ahead of cpos (e.g. while positioned
          // on the `)` of "<f(a)> \(x\)"), emit a bogus "(" token and
          // advance cpos by 2 from the wrong position. Escaped parens at
          // cpos are already consumed by the text rules above.
          {srell::regex{R"__(^\\\()__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->curr = {op::text, "("};
           }},
          {srell::regex{R"__(^[^>(|]*\()__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::invalid_fname);
           }},
          {srell::regex{R"__(^(\))\|>)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::extra_p);
           }},
          {srell::regex{R"__(^(\))[^)|>]+>)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::extra_c);
           }},
          {srell::regex{R"__(^(\))>)__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             if (t->parenDepth != 1) {
               t->error(ec::extra_lp);
             }
             t->curr = {op::call_func, matches.str(1)};
             t->parenDepth = -1;
             t->state = 0;
           }},
          {srell::regex{R"__(^(\))\|?)__", flag},
           [](token_iterator* t, const srell::cmatch& matches) {
             t->curr = {op::call_func, matches.str(1)};
             --(t->parenDepth);
           }},
          {srell::regex{R"__(^<)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::extra_ab);
           }},
          {srell::regex{R"__(^>)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::extra_lp);
           }},
          {srell::regex{R"__(^\|)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             // don't actually return the | token
             t->curr = {op::text, ""};
           }},
          {srell::regex{R"__(^.+)__", flag},
           [](token_iterator* t, const srell::cmatch& /*unused*/) {
             t->error(ec::bad_format);
           }},
      }};

      // This closure must NOT be static: a function-local static closure is
      // initialised once, so it would keep the `this` of whichever iterator
      // tokenized first and every later (or copied) iterator would act on
      // that stale object.
      const auto apply = [this](const std::vector<match_action>& cases) {
        const auto to_match
            = data.substr(kblib::saturating_cast<std::size_t>(cpos));
        // cmatch works on const char*; string_view iterators are not
        // guaranteed to be raw pointers on every standard library.
        const char* const first = to_match.data();
        const char* const last = first + to_match.size();
        srell::cmatch m;
        for (auto& ma : cases) {
          if (not srell::regex_search(first, last, m, ma.match)) {
            continue;
          }
          ma.action(this, m);
          const auto whole = m.str(0);
          cpos += whole.size();
          if (whole.empty()) {
            // A rule that consumes nothing would loop forever.
            parse_error(
                ec::p_internal, data, cpos,
                kblib::concat("Zero length match: state=", state,
                              ", parenDepth=", parenDepth,
                              ", match index=", (&ma - cases.data())));
          }
          return;
        }
        error(ec::p_internal);
      };

      apply(state == 0 ? textmode : parenDepth == 0 ? escmode0 : escmode1);
    } catch (const srell::regex_error& e) {
      // A malformed pattern in the tables above is a programming error.
      std::cerr << e.code() << '\n';
      std::abort();
    }
  }

  std::string_view data;  // whole input; empty for the end sentinel
  std::ptrdiff_t cpos{0}; // offset of the next unread character in data
  value_type curr;        // most recently extracted token
  int parenDepth{-1};     // open parentheses inside a call, -1 outside
  int state{0};           // 0 = text mode, 1 = inside <func(...)>
};

/// Iterators are equal when they view the same input at the same offset; an
/// exhausted iterator therefore equals a default-constructed one.
[[nodiscard]] auto operator==(const token_iterator& a,
                              const token_iterator& b) noexcept -> bool {
  return (a.data == b.data) and (a.cpos == b.cpos);
}
[[nodiscard]] auto operator!=(const token_iterator& a,
                              const token_iterator& b) noexcept -> bool {
  return not (a == b);
}

/// Range adaptor: `for (auto tok : tokenize(str))`.
/// Holds only a view; the caller keeps the string alive.
class tokenize {
 public:
  using iterator = token_iterator;
  using const_iterator = token_iterator;

  tokenize(std::string_view _s) noexcept
      : data(_s) {}

  [[nodiscard]] auto begin() const -> token_iterator { return {data}; }
  [[nodiscard]] auto end() const noexcept -> token_iterator { return {}; }

 private:
  std::string_view data;
};

} // namespace

/// Tag selecting the tnode_v alternative that stores expression kind E.
// NOTE(review): both template argument lists of this declaration were lost
// in the copy under review. This reconstruction assumes the alternatives of
// tnode_v are ordered like expr_e (print_texpr casts the visited index back
// to expr_e); confirm against tstrings.h.
template <expr_e E>
constexpr auto idx = std::in_place_index<static_cast<std::size_t>(E)>;

/// Parses a template string into an expression tree.
///
/// The root is a `func` node for the function named "" and every token
/// becomes a child of the innermost open call. Throws whatever the tokenizer
/// reports through parse_error for malformed input.
// NOTE(review): children store a raw pointer to their parent, and the root
// is returned by value. First-level parent pointers stay valid only if the
// return is elided (or tnode's move fixes them up) - confirm.
auto parse_tstring(const template_machine& bm, std::string_view str)
    -> tnode {
  using namespace std::string_literals;
  tnode root{tnode_v{idx<expr_e::func>, bm.lookup_new("")}, nullptr};
  tnode* current = &root;

  // Appends a child of the given kind to `current` and returns it.
  auto insert_child = [&](auto tag, auto&&... args) -> auto& {
    return *current->children.emplace_back(std::make_unique<tnode>(
        tnode_v{tag, std::move(args)...}, current));
  };

  // tok is a copy on purpose: insert_child moves from its text.
  for (auto tok : tokenize(str)) {
    assert(current);
    switch (tok.type) {
    case op::text:
      insert_child(idx<expr_e::text>, tok.o_val);
      break;
    case op::push_func:
      // descend: following tokens are arguments of this call
      current = &insert_child(idx<expr_e::func>, bm.lookup_new(tok.o_val));
      break;
    case op::call_func:
      current = current->parent;
      break;
    case op::argument:
      insert_child(idx<expr_e::argument>, pFromStr(int, tok.o_val));
      break;
    case op::special_argument:
      insert_child(idx<expr_e::special_argument>, tok.o_val);
      break;
    case op::ellipsis:
      insert_child(idx<expr_e::ellipsis>);
      break;
    default:
      assert(false);
    }
  }
  return root;
}

// Payload printers used by print_texpr, one overload per tnode_v alternative.
auto do_print_tnode_v(std::monostate, std::ostream& os) -> std::ostream& {
  return os;
}
auto do_print_tnode_v(int x, std::ostream& os) -> std::ostream& {
  return os << ' ' << x;
}
auto do_print_tnode_v(std::string x, std::ostream& os) -> std::ostream& {
  return os << ' ' << kblib::quoted(x);
}
auto do_print_tnode_v(const t_func* x, std::ostream& os) -> std::ostream& {
  return os << ' ' << x->name; // << ' ' << std::boolalpha << bool(x->func);
}

/// Writes `expr` as an indented s-expression, one node per line, followed by
/// a newline. Returns `os`.
auto print_texpr(const tnode& expr, std::ostream& os) -> std::ostream& {
  using namespace std::literals;
  auto str = [](expr_e kind) -> std::string_view {
    switch (kind) {
    case expr_e::ellipsis:
      return "dots";
    case expr_e::argument:
      return "arg";
    case expr_e::text:
      return "text";
    case expr_e::func:
      return "func";
    case expr_e::special_argument:
      return "s_arg";
    default:
      return "???";
    }
  };
  // Recursive lambda: receives itself as `self`.
  auto do_v = [&](const tnode& node, int indent_level, auto& self) -> void {
    os << '\n' << kblib::repeat(" "s, indent_level) << '(';
    kblib::visit_indexed(node.data, [&](auto tag, auto& data) {
      os << str(static_cast<expr_e>(tag.value));
      do_print_tnode_v(data, os);
    });
    for (auto& child : node.children) {
      self(*child, indent_level + 1, self);
    }
    os << ")";
  };
  do_v(expr, 0, do_v);
  os << '\n';
  return os;
}

/// Converts one token into one bytecode operation.
///
/// Throws wordgen_error(ec::no_function) when a push_func token names a
/// transformer `b` does not have, whatever std::stoi throws for an argument
/// index that does not fit an int, and std::invalid_argument for a token
/// type outside the op enumeration (this path previously ran off the end of
/// the function when assertions were disabled).
[[nodiscard]] auto token_to_bytecode(token in, const template_machine& b)
    -> tstr_op {
  assert(in.type != op::null);
  auto at = [](const template_machine& b, std::string_view s) -> transformer* {
    auto it = b.transformers.find(s);
    if (it == b.transformers.end()) {
      throw wordgen_error(ec::no_function, kblib::quoted(s), "\n", "");
    }
    return it->second.get();
  };
  switch (in.type) {
  case op::text:
  case op::special_argument:
    return {in.type, std::move(in.o_val)};
  case op::push_func:
    return {in.type, at(b, in.o_val)};
  case op::argument:
    // Would use from_chars here but doesn't work
    // So a copy has to be made because there's no other (char* first, char*
    // last) -> int function in the standard library and stoi is the only one
    // which doesn't take an NTBS (which in.o_val.data() is not)
    return {in.type, std::stoi(in.o_val)};
  case op::call_func:
  case op::ellipsis:
  case op::null:
    return {in.type, {}};
  }
  assert(false);
  throw std::invalid_argument("invalid token type: "
                              + std::to_string(kblib::etoi(in.type)));
}

/// Compiles a template string to bytecode.
/// Throws like token_to_bytecode and the tokenizer, plus std::out_of_range
/// when no "." transformer is registered.
[[nodiscard]] tstr_ops template_machine::compile(const std::string& in) const {
  auto tokens = tokenize(in);
  // wrap entire string in a top-level echo, making eval simpler
  tstr_ops ret = {
      {op::push_func, transformers.at(".").get()},
  };
  std::transform(tokens.begin(), tokens.end(), std::back_inserter(ret),
                 [this](auto a) { return token_to_bytecode(a, *this); });
  ret.push_back({op::call_func, {}});
  return ret;
}

/// Test helper: yields the serialized form of each bytecode `in` compiles
/// to. If compilation raises a wordgen_error, the final item is "E" followed
/// by the numeric error code and the closing call is omitted.
// NOTE(review): the generator's element type was lost in the copy under
// review; std::string is what serialize() returns.
[[nodiscard]] auto template_machine::to_bytes_test(std::string_view in) const
    -> asyncpp::generator<std::string> {
  co_yield serialize({op::push_func, transformers.at(".").get()}, *this);
  std::string err;
  try {
    for (auto& tok : tokenize(in)) {
      auto bc = token_to_bytecode(tok, *this);
      co_yield serialize(bc, *this);
    }
  } catch (const wordgen_error& e) {
    // co_yield is not allowed inside a handler, so remember the error and
    // report it below.
    err.push_back('E');
    err += std::to_string(kblib::etoi(e.code()));
  }
  if (not err.empty()) {
    co_yield err;
    co_return;
  }
  co_yield serialize({op::call_func, {}}, *this);
}

/// Builds the argument table for one evaluation: positional arguments
/// followed by the special named arguments (counters, `a`, `...`, `A` and
/// the literal characters `lt`, `gt`, `p`, `b`, `lb`, `rb`).
///
/// Precondition: 0 <= argc <= p_args.size(); the iterator arithmetic below
/// is undefined otherwise.
// NOTE(review): the element type of p_args and the signed_cast target were
// lost in the copy under review; std::string and std::size_t are assumed -
// confirm against the declaration in tstrings.h.
auto template_machine::get_args(const std::vector<std::string>& p_args,
                                const counters& c, int argc) const
    -> ArgsType {
  auto sa = [&](std::string_view a) { return special_argument(a); };
  ArgsType args{origin_tag{min_argument()},
                kblib::signed_cast<std::size_t>(-min_argument())
                    + p_args.size()};
  std::copy(p_args.begin(), p_args.end(), args.begin() - min_argument());
  args[sa("d")] = std::to_string(c.depth);
  args[sa("D")] = std::to_string(c.depthLimit);
  args[sa("e")] = std::to_string(*c.expansions);
  args[sa("E")] = std::to_string(c.expansionsLimit);
  args[sa("c")] = std::to_string(args.high());
  args[sa("C")] = std::to_string(argc);
  args[sa("a")] = std::vector<std::string>{p_args.begin(),
                                           p_args.begin() + argc};
  args[sa("...")] = std::vector<std::string>{p_args.begin() + argc,
                                             p_args.end()};
  args[sa("A")] = p_args;
  args[sa("lt")] = "<";
  args[sa("gt")] = ">";
  args[sa("p")] = "|";
  args[sa("b")] = "\\";
  args[sa("lb")] = "\\{";
  args[sa("rb")] = "\\}";
  return args;
}

/// Runs compiled bytecode on a stack of call frames and returns the
/// stringized result of the outermost call.
///
/// Throws std::invalid_argument for malformed bytecode (wrong payload type,
/// unknown opcode, an operation with no open call, or running out of
/// operations before the outermost call returns).
// NOTE(review): several template arguments in this function were lost in the
// copy under review. std::get<const transformer*> may need to be
// std::get<transformer*> depending on op_data, and argslist::value_type
// stands in for the variant_cast target - confirm against tstrings.h.
[[nodiscard]] auto template_machine::eval(
    const tstr_ops& in, const std::vector<std::string>& p_args,
    const counters& c, int argc) const -> std::string {
  const ArgsType args = get_args(p_args, c, argc);

  struct frame {
    const transformer* call = nullptr;
    argslist args;
  };
  std::vector<frame> stack;

  // Innermost open call. Malformed bytecode that operates with no call open
  // used to be undefined behaviour when assertions were disabled.
  auto top = [&stack]() -> frame& {
    if (stack.empty()) {
      throw std::invalid_argument(
          "invalid bytecode received: operation outside of any call");
    }
    return stack.back();
  };

  // This precondition is established by to_bytes
  assert(not in.empty());
  assert(in.front().type == op::push_func);

  for (auto&& instr : in) {
    try {
      switch (instr.type) {
      case op::push_func:
        stack.push_back({std::get<const transformer*>(instr.data), {}});
        break;
      case op::call_func: {
        auto& fr = top();
        auto v = (*fr.call)(std::move(fr.args));
        stack.pop_back();
        if (stack.empty()) {
          return stringize(v);
        }
        stack.back().args.push_back(std::move(v));
      } break;
      case op::text:
        top().args.push_back(std::get<std::string>(instr.data));
        break;
      case op::argument:
        top().args.push_back(kblib::variant_cast<argslist::value_type>(
            args.at(std::get<int>(instr.data))));
        break;
      case op::special_argument:
        top().args.push_back(kblib::variant_cast<argslist::value_type>(
            args.at(special_argument(std::get<std::string>(instr.data)))));
        break;
      case op::ellipsis:
        top().args.push_back(kblib::variant_cast<argslist::value_type>(
            args.at(special_argument("..."))));
        break;
      case op::null:
        break; // do nothing
      default:
        throw std::invalid_argument("invalid bytecode received: type = "
                                    + std::to_string(kblib::etoi(instr.type)));
      }
    } catch (const std::bad_variant_access&) {
      throw std::invalid_argument(
          "invalid bytecode received: ill-formed operation\nop = "
          + to_string(instr, *this));
    }
  }
  // Only way to get here is if we run out of bytecodes without popping the
  // stack all the way
  throw std::invalid_argument("invalid bytecode received: not enough pops");
}

/// Maps a special argument name to its index in the argument table.
/// Throws std::runtime_error for an unknown name.
[[nodiscard]] auto template_machine::special_argument(std::string_view s) const
    -> int {
  for (const auto& [key, val] : arg_names) {
    if (key == s) {
      return val;
    }
  }
  throw std::runtime_error(kblib::concat("invalid argref '#", s, "'"));
}

/// Finds the registered name of transformer `t` by identity (address).
/// Throws std::out_of_range when `t` is not registered in this machine.
[[nodiscard]] auto template_machine::reverse_lookup(const transformer& t) const
    noexcept(false) -> const std::string& {
  const auto it
      = std::find_if(transformers.begin(), transformers.end(),
                     [&t](const auto& u) { return u.second.get() == &t; });
  if (it == transformers.end()) {
    throw std::out_of_range("");
  }
  return it->first;
}

/// Human-readable rendering of a bytecode payload, for error messages.
[[nodiscard]] auto to_string(const op_data& op, const template_machine& b)
    -> std::string {
  return kblib::visit2(
      op, [](std::monostate /*unused*/) { return "NULL"s; },
      [](int i) { return std::to_string(i); },
      [](const std::string& s) { return kblib::quoted(s); },
      [&b](const transformer* t) {
        // This runs while reporting malformed bytecode, so the pointer
        // cannot be trusted; serialize() performs the same check.
        if (not t) {
          return "NULLPTR"s;
        }
        try {
          return '"' + b.reverse_lookup(*t) + "(\"";
        } catch (const std::out_of_range&) {
          return "INVALID"s;
        }
      });
}

/// Human-readable rendering of a whole bytecode operation.
[[nodiscard]] auto to_string(const tstr_op& op, const template_machine& b)
    -> std::string {
  std::string what = "{type: ";
  what += std::to_string(kblib::etoi(op.type));
  what += ", data: ";
  what += to_string(op.data, b) + "}";
  return what;
}

/* Payload stored for each op by token_to_bytecode:
enum class op : unsigned char {
  null=0,           //monostate
  text,             //string
  push_func,        //transformer*
  call_func,        //monostate
  argument,         //int
  special_argument, //string
  ellipsis,         //monostate
};*/

/// Stable textual form of one bytecode operation: a mnemonic for the op
/// type followed by its payload. Used by to_bytes_test.
[[nodiscard]] auto serialize(const tstr_op& op, const template_machine& b)
    -> std::string {
  auto data = kblib::visit2(
      op.data, [](std::monostate /*unused*/) { return "null"s; },
      [](int i) { return std::to_string(i); },
      [](const std::string& s) { return kblib::quoted(s); },
      [&b](const transformer* t) {
        if (not t) {
          return "NULLPTR"s;
        }
        try {
          return b.reverse_lookup(*t);
        } catch (const std::out_of_range&) {
          return "INVALID"s;
        }
      });
  switch (op.type) {
  case op::null:
    return "null " + data;
  case op::text:
    return "text " + data;
  case op::push_func:
    return "func( " + data;
  case op::call_func:
    return "call) " + data;
  case op::argument:
    return "arg " + data;
  case op::special_argument:
    return "sparg " + data;
  case op::ellipsis:
    return "dots " + data;
  default:
    return "ERROR " + data;
  }
}