#include "interpolate.h" #include "logger.h" #include #include #include #include "srell.hpp" #include "cppcoro/generator.hpp" using namespace std::literals; /* parse_nodename: splits a node name at its first pipe character. Returns the text before the pipe, the number of pipe characters from that point to the end (the argument count), and whether the full name ends with three dots. A name without a pipe yields (name, 0, false). */ std::tuple datafile::parse_nodename(std::string_view name) { using Ret = std::tuple; auto split = name.find('|'); if (split == std::string_view::npos) { return Ret{name, 0, false}; } auto primary_part = name.substr(0, split); auto args_spec = name.substr(split); return Ret{primary_part, std::count(args_spec.begin(), args_spec.end(), '|'), kblib::ends_with(name, "..."sv)}; } /* find_node (mutable, by full name): a name without a pipe is looked up with argc 0. Otherwise the text before the first pipe selects an entry in nodes and the number of pipes is the argument count; the first node whose declared_argc equals that count, or which is variadic with declared_argc no greater than it, is returned. Returns nullptr when nothing matches. std::out_of_range is logged with the offending name and rethrown. */ node* datafile::find_node(std::string_view name) { try { auto split = name.find('|'); if (split == std::string_view::npos) { return find_node(name, 0); } auto primary_part = name.substr(0, split); auto args_spec = name.substr(split); auto nlist = nodes.find(primary_part); if (nlist == nodes.end()) { return nullptr; } auto argc = std::count(args_spec.begin(), args_spec.end(), '|'); for (auto&& n : nlist->second) { if (n.declared_argc == argc || (n.variadic && n.declared_argc <= argc)) { return &n; } } return nullptr; } catch (const std::out_of_range& e) { log_err("In ", kblib::quoted(name), ", error ", e.what()); throw; } } /* find_node (const, by full name): same name splitting as the mutable overload, but delegates the search to find_node(name, argc). std::out_of_range is logged and rethrown. */ const node* datafile::find_node(std::string_view name) const { try { auto split = name.find('|'); if (split == std::string_view::npos) { return find_node(name, 0); } auto primary_part = name.substr(0, split); auto args_spec = name.substr(split); return find_node(primary_part, std::count(args_spec.begin(), args_spec.end(), '|')); } catch (const std::out_of_range& e) { log_err("In ", kblib::quoted(name), ", error ", e.what()); throw; } } /* find_node (mutable, by name and argc): returns the first node registered under name whose declared_argc equals argc, or which is variadic with declared_argc no greater than argc; nullptr if the name is unknown or no overload fits. */ node* datafile::find_node(std::string_view name, int argc) { auto nlist = nodes.find(name); if (nlist == nodes.end()) { return nullptr; } for (auto&& n : nlist->second) { if (n.declared_argc == argc || (n.variadic && n.declared_argc <= argc)) { return &n; } } return nullptr; } /* find_node (const, by name and argc): const counterpart of the overload above with identical matching rules. */ const node* datafile::find_node(std::string_view name, int argc) const { auto nlist = nodes.find(name); if (nlist == 
nodes.end()) { return nullptr; } for (auto&& n : nlist->second) { if (n.declared_argc == argc || (n.variadic && n.declared_argc <= argc)) { return &n; } } return nullptr; } /* format_path_impl: appends one path subtree to out as an opening square bracket, the branch rendered by kblib::to_string<62> (presumably base 62; confirm), every child recursively, and a closing square bracket. */ void format_path_impl(const path_t& path, std::string& out) { out.push_back('['); out += kblib::to_string<62>(path.branch); for (const path_t& c : path.children) { format_path_impl(c, out); } out.push_back(']'); } /* format_path: renders a whole path as a plus sign followed by the root branch (kblib::to_string<62>) and then each child subtree via format_path_impl. */ std::string format_path(const path_t& path) { std::string out = "+"s + kblib::to_string<62>(path.branch); for (const path_t& c : path.children) { format_path_impl(c, out); } return out; } /* mod_string: applies the modifier named by mods to in, dispatching on the FNV32a hash of mods. q returns kblib::quoted(in), h the HTML-encoded text, e kblib::escapify(in), u the URL-encoded text; an empty or unrecognised modifier returns in unchanged. */ static std::string mod_string(const std::string& in, std::string_view mods) { using namespace kblib::literals; switch (kblib::FNV32a(mods)) { case ""_fnv32: default: return in; case "q"_fnv32: return kblib::quoted(in); case "h"_fnv32: return kblib::concat("", kblib::html_encode(in), ""); case "e"_fnv32: return kblib::escapify(in); case "u"_fnv32: return kblib::url_encode(in); } } /* format: expands fmt against word using the pieces yielded by basic_parser. Literal text is copied through. The spec %all% appends every value in word.data with the modifier applied, separated by tabs unless the modifier is exactly h. The spec path appends format_path(word.path). Any other spec is mapped through ch() and, when kblib::get_check reports a hit in word.data, that value is appended with the modifier applied; otherwise nothing is appended. */ std::string format(const Word& word, std::string_view fmt) { std::string out; for (const auto& [text, spec, mods] : basic_parser(fmt)) { if (!text.empty()) { out += text; } if (spec) { if (*spec == "%all%") { bool htmlMode = (mods == "h"); out += std::accumulate( word.data.begin(), word.data.end(), std::string{}, [&, mods = mods, first = true](std::string out, const std::pair& ch) mutable { if (!(first || htmlMode)) { out.push_back('\t'); } first = false; return out += mod_string(ch.second, mods); } ); } else if (*spec == "path") { out += format_path(word.path); } else if (auto check = kblib::get_check(word.data, ch(*spec))) { out += mod_string(word.data.at(ch(*spec)), mods); } } } return out; } /* freqs_of: converts every entry of n.freqs to a double. Plain doubles are used as they are; bytecode entries are evaluated with bm.eval(v, args, c, argc) and converted by kblib::lexical_cast. Either way the result is clamped so it is never below zero. */ small_vector freqs_of(const bytecode_machine& bm, const node& n, const std::vector& args, const counters& c, int argc) { return kblib::build>(n.freqs.begin(), n.freqs.end(), [&](auto v){ return std::visit(kblib::visitor{ // Negative frequencies are set to zero [](double f) {return std::max(f, 0.);}, [&](const bytecodes& v) { return 
std::max(kblib::lexical_cast(bm.eval(v, args, c, argc)), 0.); } }, v); }); } /* fixed_branch: one branch with its arguments already applied: the value text, the frequency, and the values of the other channels. */ struct fixed_branch { std::string val; double freq; vmap other_channels; }; /* apply_args_lazy: evaluates only the branch at index selection of n. The value and each other-channel entry go through bm.eval; the frequency is taken directly or evaluated from bytecode and clamped at zero. selection is used with operator[] on n.vals, n.other_channels and n.freqs. */ fixed_branch apply_args_lazy(const bytecode_machine& bm, const node& n, const std::vector& args, const counters& c, int argc, std::size_t selection) { const auto& valsel = n.vals[selection]; const auto& ocsel = n.other_channels[selection]; return fixed_branch{ bm.eval(valsel, args, c, argc), std::visit(kblib::visitor{ [](double f) {return std::max(f, 0.);}, [&](const bytecodes& v) { return std::max(kblib::lexical_cast(bm.eval(v, args, c, argc)), 0.); } }, n.freqs[selection]), [&]{ auto x = kblib::build, 4>>( ocsel.begin(), ocsel.end(), [&](const auto& alt) { return std::pair{alt.first, bm.eval(alt.second, args, c, argc)}; }); return vmap{x.begin(), x.end()}; }() }; } /* chooseFrom: picks a branch of n and expands it into a Word. Calls std::terminate after logging if n.freqs, n.vals and n.other_channels differ in size. When the expansion or depth limit has been reached, or the branch value is empty, the branch is returned without further expansion (other channels included). Otherwise each item from fparse(data, branch.val) is either appended as text or resolved as a node reference via find_node and expanded recursively with incr(c); the child data is merged, its freq multiplied into ret.freq and its path appended. NOTE(review): a missing node calls std::abort() before the throw statement, so the wordgen_error is never thrown; and a null nr.source is only guarded by assert(false). */ Word chooseFrom(const datafile& data, RandomGenerator& rng, counters c, const node& n, const std::vector& args, const std::variant freq_override) { //Algorithm: //1. Determine branch to take // 1. Apply args to node frequencies // 2. Apply frequency override // 3. categorically choose branch //2. Apply args to branch //3. 
Interpolate noderefs into branch's val if (n.freqs.size() != n.vals.size() || n.freqs.size() != n.other_channels.size()) { log_err("valsize: ", std::to_string(n.vals.size()), "\nfreqsize: ", std::to_string(n.freqs.size()), "\nocsize: ", n.other_channels.size()); std::terminate(); } // std::clog<<"node: "<::epsilon()); Word ret{{}, probability, {static_cast(selection), {}}}; auto include_ocs = [&] { for (const auto& ch : branch.other_channels) { ret.data[ch.first] += ch.second; } }; if (*c.expansions >= c.expansionsLimit) { log_warn("expansion limit reached."); ret.val() = branch.val; include_ocs(); return ret; } else if (c.depth >= c.depthLimit) { log_warn("depth limit reached."); ret.val() = branch.val; include_ocs(); return ret; } if (branch.val == "") { include_ocs(); return ret; } for (auto&& e : fparse(data, branch.val)) { std::visit(kblib::visitor{ [&](const std::string& s) { ret.val() += s; include_ocs(); }, [&](const noderef& nr) { if (!nr.source) { assert(false); } const auto* onode = (*nr.source)->find_node(nr.name, nr.args.size()); if (!onode) { log_err("Node not found: ", kblib::quoted(nr.name), " (with ", std::to_string(nr.args.size()), " arguments)."); std::abort(); throw wordgen_error(ec::internal, "", "\n", branch.val); } auto tmp = chooseFrom(**nr.source, rng, incr(c), *onode, nr.args, nr.freq_override); for (auto&& ch : tmp.data) { ret.data[ch.first] += std::move(ch.second); } ret.freq *= tmp.freq; ret.path.children.push_back(tmp.path); } }, e); } return ret; } /* basic_parser: coroutine that splits a format string into (text, spec, mods) pieces. In plain text a doubled brace produces one literal brace and a backslash escapes the next character (t n v f r 0 map to control characters, anything else to itself). An opening brace starts a spec, an exclamation mark switches to the modifier list, and the closing brace yields the piece. The remaining text is yielded last with an empty optional spec. NOTE(review): an escape inside a spec (state backslashlb) appends to text rather than spec, and the end-of-input check does not cover the lbe, mod or backslashmod states; confirm both are intended. */ cppcoro::generator basic_parser(std::string_view input) { using namespace std::literals; enum { start, backslash, lbe, rbe, lb, backslashlb, mod, backslashmod, } s = start; std::string text; std::string spec; std::string mods; for (const auto& c : input) { switch (s) { case start: switch (c) { default: text.push_back(c); break; case '\\': s = backslash; break; case '{': s = lbe; break; case '}': s = rbe; break; } break; case backslash: if (c == 't') { text.push_back('\t'); } else if (c == 
'n') { text.push_back('\n'); } else if (c == 'v') { text.push_back('\v'); } else if (c == 'f') { text.push_back('\f'); } else if (c == 'r') { text.push_back('\r'); } else if (c == '0') { text.push_back('\0'); } else { text.push_back(c); } s = start; break; case backslashlb: if (c == 't') { text.push_back('\t'); } else if (c == 'n') { text.push_back('\n'); } else if (c == 'v') { text.push_back('\v'); } else if (c == 'f') { text.push_back('\f'); } else if (c == 'r') { text.push_back('\r'); } else if (c == '0') { text.push_back('\0'); } else { text.push_back(c); } s = lb; break; case backslashmod: if (c == 't') { mods.push_back('\t'); } else if (c == 'n') { mods.push_back('\n'); } else if (c == 'v') { mods.push_back('\v'); } else if (c == 'f') { mods.push_back('\f'); } else if (c == 'r') { mods.push_back('\r'); } else if (c == '0') { mods.push_back('\0'); } else { mods.push_back(c); } s = mod; break; case lbe: if (c == '{') { text.push_back(c); s = start; } else if (c == '}') { co_yield {text, "", ""}; text.clear(); s = start; } else if (c == '!') { s = mod; } else { spec.push_back(c); s = lb; } break; case rbe: if (c == '}') { text.push_back(c); s = start; } else { parse_error(ec::t_internal, input, &c); } break; case lb: if (c == '}') { co_yield {text, spec, ""}; text.clear(); spec.clear(); s = start; } else if (c == '\\') { s = backslashlb; } else if (c == '{') { parse_error(ec::t_internal, input, &c); } else if (c == '!') { s = mod; } else { spec.push_back(c); } break; case mod: if (c == '}') { co_yield {text, spec, mods}; text.clear(); spec.clear(); mods.clear(); s = start; } else if (c == '\\') { s = backslashmod; } else if (c == '{') { parse_error(ec::t_internal, input, &c); } else { mods.push_back(c); } } } if (s == lb || s == backslash || s == backslashlb || s == rbe) { parse_error(ec::t_internal, input, input.end() - 1); } co_yield {text, std::nullopt, ""}; } /* parse_result: one item produced by the reference parsers: literal text, an optional node name, a frequency override, the argument list, and a domain. */ struct parse_result { std::string literal; std::optional name; std::variant freq_override; 
std::vector args; std::string domain; }; /* Forward declarations of the two reference parsers defined later in this file. */ cppcoro::generator refParseSM(std::string refstr); cppcoro::generator refParseRE(std::string refstr); /* fparse: turns the string s into a sequence of items that chooseFrom visits as either plain strings or noderefs. The accepted grammar is described in the comment at the top of the body. */ [[nodiscard]] cppcoro::generator> fparse(const datafile& domain, const std::string& s) { /* format: string(escape('{}', '\\', '{', '{', '}', '}')) "{" name ["|" args] [(":" flist) | ("!" ilist)] "}" name: string(escape('{}|:!', '\\')) args: simple-string ["|" args] simple-string: string(escape('{}|:!', '\\')) flist: [f-pre] [fsep] double (fsep double)* f-pre: [fsep] ("+" | "*" | "=") fsep: ignore(re('[^0-9.]*[^0-9.+*=]')) ilist: [isep] iref (isep iref)* iref: int [":" double] isep: ignore(re('[^0-9.:]+')) */ // std::clog< oldpos); } return ret; #endif } /* scan_ilist: collects (index, optional frequency) pairs from input. Each match of isep supplies the index as capture 1 (std::stoi) and, when capture 2 is non-empty, the frequency (std::stod). The block under #if 0 is a disabled hand-written scanner. NOTE(review): isep requires at least one separator character before every index, so an index at the very start of input is not matched, and the local pos is unused in the active code; confirm. */ ilist_t scan_ilist(const std::string& input) { log_debug("ilist: ", input); const static thread_local srell::u8regex isep{u8R"__([^\d.:+=-]+(\d+)(?::([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?))?)__", srell::regex_constants::dotall}; std::size_t pos = 0; ilist_t ret; for (srell::u8sregex_iterator it{input.begin(), input.end(), isep}, end{}; it != end; ++it) { ret.emplace_back(std::stoi((*it)[1]), [&]() -> std::optional { if ((*it)[2].length()) { return std::stod((*it)[2]); } else { return std::nullopt; } }()); } return ret; #if 0 while (pos < input.size()) { auto opos = pos; srell::u8smatch m; if (srell::regex_search(input.begin()+pos, input.end(), m, isep)) { pos = m[0].second - input.begin(); } try { ret.emplace_back(std::stoi(input, &pos), std::nullopt); } catch (const std::invalid_argument& e) { log_err("Failed to parse ilist index: ", kblib::quoted(input), "[", std::to_string(pos), "]"); throw; } if (input[pos] == ':') { ++pos; try { ret.back().second = std::stod(input, &pos); } catch (const std::invalid_argument& e) { log_err("Failed to parse ilist freq: ", kblib::quoted(input), "[", std::to_string(pos), "]"); throw; } } //must make progress assert(pos > opos); } return ret; #endif } // Basically copy Formatter().parse() but treat ! differently: // Only one of : or ! 
is allowed to follow the refname, and | // is used to separate arguments. Also, the subtle points of // {{ and }} interpretation are surely different. // The two functions in this file should be equivalent in behavior. /* refParseSM: state-machine version of the reference parser. It logs refstr at debug level and declares the states literal, esc, lb, rb, name, flist, ilist and args plus the string accumulators lit, domain, name, flist, ilist and the args vector. */ cppcoro::generator refParseSM(std::string refstr) { log_debug(refstr); enum class s {literal, esc, lb, rb, name, flist, ilist, args, }; std::string lit; std::string domain; std::string name; std::string flist; std::string ilist; enum class fo {none, flist, ilist} f = fo::none; std::vector args; s state = s::literal; s ostate = s::literal; auto prep_fo = [&]() -> std::variant { // std::clog< refParseRE(std::string refstr) { /* refParseRE: regex version of the reference parser. For every match of ref it yields a parse_result built from capture 1 (literal text), capture 3 (name, only when the closing brace in capture 8 matched), capture 6 or 7 (an flist or ilist, scanned after dropping its first character, else monostate), capture 4 (arguments, split with the arg regex) and capture 2 (domain). */ const static thread_local srell::u8regex ref{u8R"__((?=.)((?:[^{}]|\\[{}]|\{\{|\}\})*)(?:\{([^%{}|:!]%)?((?:[^{}|:!]|\\[{}|:!])*)((?:\|(?:[^{}|:!]|\\[{}|:!])*)*)((:[^{}|\d.+*=]*(?:\+|\*|=)?[^{}|\d.-]*[\d.-]+(?:[^{}|\d.-]+[\d.-]+)*[^{}|\d.-]*)|(![^{}|\d.:+=-]*\d+(?::[\d.-]+)?(?:[^{}|\d.:+=-]+\d+(?::[\d.-]+))*[^{}|\d.:+=-]*))?(\}))?)__", srell::regex_constants::dotall}; const static thread_local srell::u8regex arg{u8R"__((?<=\|)(?:[^{}|:!]|\\[{}|:!])*)__", srell::regex_constants::dotall}; for (srell::u8sregex_iterator it{refstr.begin(), refstr.end(), ref}, end{}; it != end; ++it) { auto&& match = *it; co_yield parse_result{ match[1], match[8].length() ? std::optional{match[3]} : std::nullopt, [&]() -> std::variant { if (match[6].length()) { return scan_flist(match[6].str().substr(1)); } else if (match[7].length()) { return scan_ilist(match[7].str().substr(1)); } else { return std::monostate{}; } }(), kblib::build_copy>( srell::u8sregex_token_iterator{match[4].first, match[4].second, arg}, srell::u8sregex_token_iterator{}), match[2]}; } } /* test_parser: NOTE(review): declared to return bool but the body is empty, so control reaches the end without a return value; calling it is undefined behaviour. */ bool test_parser(std::vector> tests) { } /* ch_db: returns a reference to a function-local static map, so every caller shares the same instance. */ std::unordered_map& datafile::ch_db() { static std::unordered_map db; return db; }