// NOTE(review): in the copy of this file that was reviewed, every `<...>` span
// that started with a letter was missing: template argument lists, one system
// #include, and part of convert::decode. Those sites are reproduced exactly as
// they appeared (e.g. `.as()`, `std::pair`, `std::vector>`, `static_cast(dir)`)
// and must be restored from version control. The logic fixes in this file do
// not depend on the missing arguments.

#include "interpolate.h"
#include "machines.h"
#include "logger.h"
#include "tr.h"
// NOTE(review): the header name of the next include was lost; restore it.
#include
#include "srell.hpp"

// Hashes a channel name with ch() and records the id -> name pair in
// datafile::ch_db(). If the id is already registered under a different name,
// the collision is logged as an error; the existing entry is left in place.
// Returns the id in every case.
channelID l_ch(std::string_view name) {
	const auto id = ch(name);
	auto& registry = datafile::ch_db();

	const auto known = registry.find(id);
	if (known == registry.end()) {
		registry.emplace(id, name);
	} else if (name != known->second) {
		log_err("Two channels hash to the same value:\n\t", name, "\n\t", known->second,
		        "\nRecommend changing one or both names, and reporting this as an issue."
		);
	}
	return id;
}

// Inspects a node name for its declared argument list.
// first:  the number of '|' characters from the first '|' onward (0 if none).
// second: whether the name (or its argument part) ends with "...".
std::pair argsDeclared(std::string_view name) {
	const auto args_pos = name.find('|');
	if (args_pos == name.npos) {
		return {0, kblib::ends_with(name, "...")};
	}
	const auto args_spec = name.substr(args_pos);
	return {std::count(args_spec.begin(), args_spec.end(), '|'),
	        kblib::ends_with(args_spec, "...")};
}

// NOTE(review): not referenced anywhere in the visible part of this file.
struct tmp_node {
	std::vector> freqs;
	small_vector vals;
	std::vector other_channels;
	int declared_argc{};
	bool variadic{};
};

// Syntax used by replacement strings in the datafile.
enum class r {
	backslash, // "\1" group references ("backslash" / "python" in control.format)
	dollar,    // "$1" group references ("dollar" / "ecmascript" in control.format)
};

// Builds the list of transforms described by a YAML sequence.
//
// Each element of `data` is one stage:
//  - a mapping with key "S" is converted as a whole with stage.as();
//  - a mapping with key "assign" becomes a callable that overwrites the
//    channel with format(w, fmt);
//  - a sequence is a list of {m, r} steps, applied in order with
//    srell::regex_replace. Steps whose "m" is empty are skipped.
// Mappings with neither key, and scalar stages, contribute nothing.
//
// match_format selects the syntax of the "r" strings. Backslash-style strings
// are translated to dollar style before being stored.
replace_sequence extractTransforms(const YAML::Node& data, const r match_format = r::backslash) {
	// Translates a backslash-style replacement string to dollar style:
	//   \N (N a digit) -> $N      \\ -> \      $ -> $$
	// Any other escape keeps its backslash. Dollar-style input is returned
	// unchanged.
	auto translate_r = [match_format](std::string s) {
		if (match_format != r::backslash) {
			return s;
		}
		std::string ret;
		//the majority of strings will not change size.
		ret.reserve(s.size());
		bool inEsc = false;
		for (const char c : s) {
			if (inEsc) {
				// Leave the escape state first, whatever the character is.
				// Previously '$' was tested before this branch, so "\$" left
				// the state set and mangled the following character.
				inEsc = false;
				if (c == '\\') {
					ret.push_back('\\');
				} else if (std::isdigit(static_cast<unsigned char>(c))) {
					// The cast is required: isdigit on a negative char
					// (e.g. a UTF-8 byte) is undefined behaviour.
					ret.push_back('$');
					ret.push_back(c);
				} else {
					ret.push_back('\\');
					if (c == '$') {
						ret.append(2, '$');
					} else {
						ret.push_back(c);
					}
				}
			} else if (c == '\\') {
				inEsc = true;
			} else if (c == '$') {
				// A bare '$' is special in dollar style, so double it.
				ret.append(2, '$');
			} else {
				ret.push_back(c);
			}
		}
		if (inEsc) {
			// A trailing lone backslash is kept rather than silently dropped.
			ret.push_back('\\');
		}
		log_debug("adjusted ", kblib::quoted(s), " to ", kblib::quoted(ret));
		return ret;
	};

	std::vector ret;
	for (const auto& stage : data) {
		if (stage.IsMap()) {
			if (stage["S"]) {
				ret.emplace_back(stage.as());
			} else if (stage["assign"]) {
				ret.emplace_back([fmt = stage["assign"].as()](Word w, channelID ch) {
					log_info("assign: ", fmt);
					w.data[ch] = format(w, fmt);
					return std::move(w.data);
				});
			}
		} else if (stage.IsSequence()) {
			std::vector> steps;
			for (const auto& step : stage) {
				std::string match = step["m"].as();
				if (match.empty()) {
					continue;
				}
				steps.emplace_back(
				    srell::regex{match, srell::regex_constants::dotall},
				    translate_r(step["r"].as()));
			}
			ret.emplace_back([steps = std::move(steps)](Word w, channelID ch) {
				log_info("regex");
				for (const auto& [m, r] : steps) {
					w.data[ch] = srell::regex_replace(std::move(w.data[ch]), m, r);
				}
				return std::move(w.data);
			});
		}
	}
	return ret;
}

// Loads a datafile from its parsed YAML document.
//
// Processing order:
//  1. "control" block: channels, format, replace, grammars, functions, start.
//     Each recognised key is removed once handled; anything left is reported
//     as unknown. The block itself is then removed from `data`.
//  2. Old-style top-level "channels", "replace", "replacement", "replaceIPA".
//     Settings from the control block take precedence.
//  3. Every key still present in `data` is loaded as a node.
//
// Throws wordgen_error for an unrecognised control.format, a malformed
// control.replace, or a branch that uses the reserved channel name "path".
datafile::datafile(YAML::Node data, RandomGenerator& rng)
    : bm(this, rng) {
	bool control_block_has_channels = false;
	bool control_block_has_replace = false;
	r rformat = r::backslash;

	if (auto&& control = data["control"]) {
		assert(control.IsMap());
		if (auto&& channels = control["channels"]) {
			control_block_has_channels = true;
			for (auto&& v : channels.as>()) {
				channelNames.emplace(l_ch(v.first), std::move(v.second));
			}
			control.remove("channels");
		}
		// Read before "replace" so the chosen syntax applies to it.
		if (auto&& format = control["format"]) {
			if (format.as() == "dollar" || kblib::toLower(format.as()) == "ecmascript") {
				rformat = r::dollar;
			} else if (format.as() == "backslash" || kblib::toLower(format.as()) == "python") {
				rformat = r::backslash;
			} else {
				log_err("control.format must be one of"
				        "\n- backslash"
				        "\n- python"
				        "\n- dollar"
				        "\n- ecmascript"
				);
				throw wordgen_error(ec::internal, "", "", "");
			}
			// Without this, "format" was reported below as an unknown control.
			control.remove("format");
		}
		if (auto&& replace = control["replace"]) {
			control_block_has_replace = true;
			// These checks validate user input, so they throw instead of
			// relying on assert(), which compiles out under NDEBUG.
			if (!replace.IsSequence()) {
				log_err("'control.replace' must be a sequence.");
				throw wordgen_error(ec::internal, "", "", "");
			}
			for (auto&& stage : replace) {
				if (!stage.IsMap()) {
					log_err("'control.replace' must be a sequence of mappings.");
					throw wordgen_error(ec::internal, "", "", "");
				}
				this->replace.emplace_back();
				auto& rs = this->replace.back();
				for (auto&& channel : stage) {
					if (!channel.second.IsSequence()) {
						log_err("'control.replace' invalid.");
						throw wordgen_error(ec::internal, "", "", "");
					}
					rs.actions[l_ch(channel.first.as())] = extractTransforms(channel.second, rformat);
				}
			}
			control.remove("replace");
		}
		if (auto&& grammars = control["grammars"]) {
			log_notice("control.grammars not yet implemented.");
			control.remove("grammars");
		}
		if (auto&& functions = control["functions"]) {
			log_notice("control.functions not yet implemented.");
			control.remove("functions");
		}
		if (auto&& start = control["start"]) {
			startNode = start.as();
			control.remove("start");
		}
		for (auto&& other : control) {
			log_notice("Ignoring unknown control \"", kblib::escapify(other.first.as()), '"');
		}
		data.remove("control");
	}

	if (auto&& channels = data["channels"]) {
		if (control_block_has_channels) {
			// The old text here talked about 'replace' (copy-paste slip).
			log_notice("'channels' in control block overrides the old style");
		} else {
			assert(channels.IsMap());
			for (auto&& v : channels.as>()) {
				channelNames.emplace(l_ch(v.first), std::move(v.second));
			}
		}
		data.remove("channels");
	}

	if (auto&& replace = data["replace"]) {
		if (control_block_has_replace) {
			log_notice("'replace' in control block overrides the old style");
		} else {
			assert(replace.IsMap());
			this->replace.resize(1);
			auto& rs = this->replace.back();
			for (auto&& ch : replace) {
				rs.actions[::l_ch(ch.first.as())] = extractTransforms(ch.second, rformat);
			}
		}
		data.remove("replace");
		if (data["replacement"]) {
			log_notice("Treating 'replacement' as normal node because 'replace' overrides it");
		}
		if (data["replaceIPA"]) {
			log_notice("Treating 'replaceIPA' as normal node because 'replace' overrides it");
		}
	} else {
		if (auto&& replaceVal = data["replacement"]) {
			log_notice("Old-style 'replacement' is deprecated, use 'replace' instead");
			assert(replaceVal.IsSequence());
			this->replace.resize(1);
			auto& rs = this->replace.back();
			rs.actions["val"_ch] = extractTransforms(replaceVal, rformat);
			data.remove("replacement");
		}
		if (auto&& replaceIPA = data["replaceIPA"]) {
			log_notice("Old-style 'replaceIPA' is deprecated, use 'replace' instead");
			assert(replaceIPA.IsSequence());
			this->replace.resize(1);
			auto& rs = this->replace.back();
			rs.actions["ipa"_ch] = extractTransforms(replaceIPA, rformat);
			data.remove("replaceIPA");
		}
	}

	// NOTE(review): stub that always answers false, so the first arm of the
	// "freq" handling below is currently never taken.
	auto isDouble = [](auto) { return false; };

	// Everything still left in the document is a node definition.
	for (auto&& n : data) {
		node tmp;
		std::string nodename = n.first.as();
		tmp.name = nodename;
		if (n.second.IsMap()) {
			log_warn("Literal nodes not implemented. Translating ", nodename, " to regular node.");
			// Wrap the mapping in a one-element sequence so it loads as a
			// single branch.
			YAML::Node branch;
			branch[0] = std::move(n.second);
			n.second = std::move(branch);
		}
		log_debug(kblib::quoted(nodename), ':');
		for (auto&& b : n.second) {
			auto& val = tmp.vals.emplace_back();
			auto& freq = tmp.freqs.emplace_back(1.0);
			auto& ocs = tmp.other_channels.emplace_back();
			for (auto&& c_v : b) {
				auto&& channel = c_v.first.as();
				auto&& value = c_v.second;
				if (channel == "val") {
					val = bm.to_bytes(value.as());
				} else if (channel == "freq") {
					assert(value.IsScalar());
					if (isDouble(value.as())) {
						freq = value.as();
					} else {
						freq = bm.to_bytes(value.as());
					}
				} else if (channel == "path") {
					throw wordgen_error(ec::internal, "'path' is reserved as a channel name.", "", "");
				} else {
					ocs[l_ch(channel)] = bm.to_bytes(value.as());
				}
			}
			if (val.size() == 0) {
				val = bm.to_bytes("");
			}
		}
		// parse_nodename is the single source for the argument metadata. The
		// earlier argsDeclared() result was always overwritten here, so that
		// call has been dropped.
		const auto& [name, argc, var] = parse_nodename(nodename);
		tmp.declared_argc = argc;
		tmp.variadic = var;
		nodes[name].push_back(std::move(tmp));
	}
	log_debug();
}

namespace fsm {

// First pass over one FSM state: builds a `state` from its YAML mapping with
// every destination pointer left null. set_dests() fills the pointers in once
// all states exist.
//
// Recognised rule keys: "set", "map", "match", "return", "end", "default".
// Any other key of at most one character is a per-character rule. Longer
// unknown keys are ignored with a warning.
// Returns std::nullopt if a "set"/"map"/"match" rule is not a sequence of
// sequences.
//Cannot get state*s out of the node
std::optional toState(const YAML::Node& node) {
	state output;
	output.ret = nullptr;
	for (const auto& rule : node) {
		auto name = rule.first.as();
		if (name == "set") {
			if (!rule.second.IsSequence()) {
				log_err("'set' rules must be a sequence.");
				return std::nullopt;
			}
			for (const auto& set : rule.second) {
				if (!set.IsSequence()) {
					log_err("'set' rules must be a sequence of sequences.");
					return std::nullopt;
				}
				output.set.push_back({set[0].as(), state::action_t{set[1].as(), nullptr}});
			}
			continue;
		}
		if (name == "map") {
			if (!rule.second.IsSequence()) {
				log_err("'map' rules must be a sequence.");
				return std::nullopt;
			}
			log_debug(name, ':');
			for (const auto& map : rule.second) {
				if (!map.IsSequence()) {
					log_err("'map' rules must be a sequence of sequences.");
					return std::nullopt;
				}
				log_debug('\t', kblib::quoted(map[0].as()), ", ", kblib::quoted(map[1].as()));
				if (map.size() > 2) {
					log_debug("\t -> ", kblib::quoted(map[2].as()));
				}
				std::pair tmp {
					tr::expand_set(tr::widen(map[0].as())),
					tr::expand_set(tr::widen(map[1].as()))
				};
				output.map.push_back({
					decltype(state::map_action_t::in_set){tmp.first.begin(), tmp.first.end()},
					decltype(state::map_action_t::out_set){tmp.second.begin(), tmp.second.end()},
					nullptr
				});
			}
			continue;
		}
		if (name == "match") {
			if (!rule.second.IsSequence()) {
				log_err("'match' rules must be a sequence.");
				return std::nullopt;
			}
			log_debug(name, ':');
			for (const auto& match : rule.second) {
				if (!match.IsSequence()) {
					log_err("'match' rules must be a sequence of sequences.");
					return std::nullopt;
				}
				log_debug('\t', kblib::quoted(match[0].as()), "\n\t -> ", kblib::quoted(match[1].as()));
				auto tmp = tr::expand_set(tr::widen(match[0].as()));
				output.match.push_back({
					decltype(state::match_action_t::matches){tmp.begin(), tmp.end()},
					nullptr
				});
			}
			continue;
		}
		if (name == "return") {
			// Destination only; resolved in set_dests().
			continue;
		}
		if (name == "end") {
			output.end = rule.second.as();
			continue;
		}
		//character rules
		if (name == "default") {
			output.def = {rule.second[0].as(), nullptr};
			continue;
		}
		auto uname = tr::widen(name);
		if (uname.length() > 1) {
			log_warn("Ignoring unknown rule '", name, "'.");
			continue;
		}
		output.cmap[uname[0]] = {rule.second[0].as(), nullptr};
	}
	return output;
}

// Second pass over one FSM state: resolves the destination state names in
// `data` into pointers into `states` and stores them in states[sname].
// A rule that names no destination points back at its own state.
//
// Returns false (after logging) if a per-character rule names an unknown
// state. NOTE(review): the other rule kinds look names up with states.at(),
// which reports an unknown name by throwing rather than returning false.
bool set_dests(std::map& states, std::string sname, const YAML::Node& data) {
	using kblib::to_signed;
	auto& cstate = states[sname];
	for (const auto& rule : data) {
		auto name = rule.first.as();
		if (name == "set") {
			for (std::ptrdiff_t i = 0; i < to_signed(cstate.set.size()); ++i) {
				if (rule.second[i][2]) {
					cstate.set[i].action.dest = &states.at(rule.second[i][2].as());
#if LOG_LOADING_NOISY
					log_debug(sname, ": ", name, " -> ", rule.second[i][2].as());
#endif
				} else {
					cstate.set[i].action.dest = &cstate;
#if LOG_LOADING_NOISY
					log_debug(sname, ": ", name, " -> ", sname);
#endif
				}
			}
			continue;
		}
		if (name == "map") {
			for (std::ptrdiff_t i = 0; i < to_signed(cstate.map.size()); ++i) {
				if (rule.second[i].size() > 2) {
					cstate.map[i].dest = &states.at(rule.second[i][2].as());
#if LOG_LOADING_NOISY
					log_debug(sname, ": ", name, " -> ", rule.second[i][2].as());
#endif
				} else {
					cstate.map[i].dest = &cstate;
#if LOG_LOADING_NOISY
					log_debug(sname, ": ", name, " -> ", sname);
#endif
				}
			}
			continue;
		}
		if (name == "match") {
			for (std::ptrdiff_t i = 0; i < to_signed(cstate.match.size()); ++i) {
				if (rule.second[i].size() > 1) {
					cstate.match[i].dest = &states.at(rule.second[i][1].as());
#if LOG_LOADING_NOISY
					log_debug(sname, ": ", name, " -> ", rule.second[i][1].as());
#endif
				} else {
					cstate.match[i].dest = &cstate;
#if LOG_LOADING_NOISY
					log_debug(sname, ": ", name, " -> ", sname);
#endif
				}
			}
			continue;
		}
		if (name == "return") {
			cstate.ret = &states.at(rule.second.as());
#if LOG_LOADING_NOISY
			log_debug(sname, ": ", name, " -> ", rule.second.as());
#endif
			continue;
		}
		if (name == "end") {
			continue;
		}
		//character rules
		if (name == "default") {
			if (rule.second.size() > 1) {
				cstate.def->dest = &states.at(rule.second[1].as());
#if LOG_LOADING_NOISY
				log_debug(sname, ": ", name, " -> ", rule.second[1].as());
#endif
			} else {
				cstate.def->dest = &cstate;
#if LOG_LOADING_NOISY
				log_debug(sname, ": ", name, " -> ", sname);
#endif
			}
			continue;
		}
		auto uname = tr::widen(name);
		if (uname.length() > 1) {
			// toState() ignored this rule (with a warning), so no cmap entry
			// exists for it. Previously this fell through and created a bogus
			// transition for the first character of the rule name.
			continue;
		}
		if (rule.second.size() > 1) {
			if (!kblib::get_check(states, rule.second[1].as())) {
				log_err("FSM state '", sname, "' rule '", name, ": [",
				        kblib::quoted(rule.second[0].as()), ", ",
				        kblib::quoted(rule.second[1].as()),
				        "]' references unknown state '", rule.second[1].as(), "'.");
				return false;
			}
			cstate.cmap[uname[0]].dest = &states.at(rule.second[1].as());
#if LOG_LOADING_NOISY
			log_debug(sname, ": ", name, " -> ", rule.second[1].as());
#endif
		} else {
			cstate.cmap[uname[0]].dest = &cstate;
#if LOG_LOADING_NOISY
			log_debug(sname, ": ", name, " -> ", sname);
#endif
		}
	}
	return true;
}

} // namespace fsm

namespace YAML {

// yaml-cpp conversion hook for fsm::state_machine.
//
// The node must be a mapping that contains an "S" state. The optional
// "reversed" key selects the direction override: values outside 0..3 are
// reduced with `& 3` after a warning. Every other key is a state.
// States are built in two passes: toState() creates them, then set_dests()
// links them to each other. Returns false if either pass fails.
bool convert::decode(const Node& node, fsm::state_machine& sm) {
	if (!node.IsMap() || !node["S"]) {
		log_err("State machines must be a mapping and must include an \"S\" state.");
		return false;
	}
	if (node["reversed"]) {
		auto dir = node["reversed"].as();
		if (dir > 3 || dir < 0) {
			log_warn("FSM directions should only be 0, 1, 2, or 3. Interpreting direction 'reversed: ",
			         dir, "' as ", dir & 3, ".");
			dir &= 3;
		}
		sm.dir = static_cast(dir);
	} else {
		sm.dir = fsm::state_machine::dir_override::none;
	}
	//populate states
	for (const auto& state : node) {
		auto name = state.first.as();
		if (name == "reversed") {
			continue;
		}
		auto tstate = fsm::toState(state.second);
		if (!tstate) {
			log_err("Failed to extract FSM state ", kblib::quoted(name), ".");
			return false;
		}
		sm.states[name] = *tstate;
#if LOG_LOADING_NOISY
		// NOTE(review): the debug print that lived here is truncated in the
		// reviewed text (only `std::clog<` survives); restore it from version
		// control.
#endif
	}
	// NOTE(review): the header of this linking loop was also truncated. It is
	// rebuilt from the surviving tail `(), state.second)) {` and the
	// set_dests() signature. "reversed" is skipped because it is not a state
	// and set_dests() would otherwise default-insert an entry for it.
	for (const auto& state : node) {
		if (state.first.as() == "reversed") {
			continue;
		}
		if (!fsm::set_dests(sm.states, state.first.as(), state.second)) {
			log_err("Reference to undefined state.");
			return false;
		}
	}
	return true;
}

} // namespace YAML