// NOTE(review): this copy of the file has lost every angle-bracketed span
// (template argument lists such as the ones on `std::pair` and `.as()`, and
// the name of one system header below). Those spots are reproduced exactly as
// found; restore them from the upstream source before building.
#include "interpolate.h"
#include "logger.h"
#include "machines.h"
#include "tr.h"
#include "unicode.h"

#include "srell/srell.hpp"
#include

/// Maps a channel name to its channelID and records the pairing in the
/// datafile channel database.
///
/// If the ID is already registered under a different name, the collision is
/// reported through log_err and the existing entry is left untouched.
///
/// @param name Channel name.
/// @return The ID produced by ch(name).
auto l_ch(std::string_view name) -> channelID {
    auto ch_id = ch(name);
    auto& ch_db = datafile::ch_db();
    if (auto it = ch_db.find(ch_id); it != ch_db.end()) {
        if (name != it->second) {
            log_err("Two channels hash to the same value:\n\t", name, "\n\t",
                    it->second,
                    "\nRecommend changing one or both names, and reporting this "
                    "as an issue.");
        }
    } else {
        ch_db.emplace(ch_id, name);
    }
    return ch_id;
}

/// Inspects a node name for its declared argument list.
///
/// Everything from the first '|' onward is the argument specification; each
/// '|' inside it counts as one declared argument.
///
/// @param name Full node name as written in the data file.
/// @return {number of '|' characters, whether the inspected text ends in
///         "..."}. Without any '|' the count is 0 and the whole name is
///         checked for the "..." suffix.
auto argsDeclared(std::string_view name) -> std::pair {
    auto args_pos = name.find('|');
    if (args_pos == std::string_view::npos) {
        return {0, kblib::ends_with(name, "...")};
    }
    auto args_spec = name.substr(args_pos);
    return {std::count(args_spec.begin(), args_spec.end(), '|'),
            kblib::ends_with(args_spec, "...")};
}

// NOTE(review): the element types of the containers below were stripped from
// this copy; the declarations are kept exactly as found.
struct tmp_node {
    std::vector> freqs;
    small_vector vals;
    std::vector other_channels;
    int declared_argc{};
    bool variadic{};
};

/// Syntax used for group references in replacement strings.
enum class r {
    backslash, ///< `\1` style; rewritten to `$1` by extractTransforms
    dollar,    ///< `$1` style; passed through unchanged
};

/// Converts one channel's list of transformation stages into a
/// replace_sequence.
///
/// Each entry of `data` is either
///   - a mapping with "S" (converted as a whole via `stage.as()`),
///     "assign" (format string evaluated against the word), or "normalize"
///     (one of default / nfd / nfc / nfkd / nfkc, lower or upper case), or
///   - a sequence of `{m: regex, r: replacement}` steps applied in order;
///     steps whose `m` is empty are skipped.
///
/// @param data         YAML sequence of stages.
/// @param match_format Replacement-string syntax used by the `r` entries.
/// @return The stages in file order.
/// @throws wordgen_error ec::invalid_normalization_form for an unrecognised
///         "normalize" value; ec::invalid_stage_type for a mapping without a
///         known key or for a stage that is neither a mapping nor a sequence.
auto extractTransforms(const YAML::Node& data,
                       const r match_format = r::backslash)
    -> replace_sequence {
    // Rewrites a backslash-style replacement string into the `$`-style one
    // handed to srell::regex_replace. Strings in r::dollar format are returned
    // unchanged.
    //
    // NOTE(review): a lone backslash at the very end of the string is dropped,
    // and a '$' seen while an escape is pending does not clear the pending
    // state. Both behaviors are preserved from the original.
    auto translate_r = [match_format](std::string s) {
        if (match_format == r::backslash) {
            std::string ret;
            // the majority of strings will not change size.
            ret.reserve(s.size());
            bool inEsc = false;
            for (char c : s) {
                if (c == '$') {
                    // A literal dollar must be doubled in the target syntax.
                    ret.append(2, '$');
                } else if (inEsc) {
                    if (c == '\\') {
                        ret.push_back(c);
                    } else if (std::isdigit(static_cast<unsigned char>(c))) {
                        // The cast matters: passing a negative char (any byte
                        // of a non-ASCII UTF-8 sequence) to std::isdigit is
                        // undefined behavior.
                        ret.push_back('$');
                        ret.push_back(c);
                    } else {
                        // Unknown escape: keep it verbatim.
                        ret.push_back('\\');
                        ret.push_back(c);
                    }
                    inEsc = false;
                } else {
                    if (c == '\\') {
                        inEsc = true;
                    } else {
                        ret.push_back(c);
                    }
                }
            }
            log_debug("adjusted ", kblib::quoted(s), " to ", kblib::quoted(ret));
            return ret;
        }
        return s;
    };

    std::vector ret;
    for (const auto& stage : data) {
        if (stage.IsMap()) {
            if (stage["S"]) {
                ret.emplace_back(stage.as());
            } else if (auto& assign = stage["assign"]) {
                ret.emplace_back(
                    [fmt = assign.as()](Word w, channelID ch) {
                        log_debug("assign: ", fmt);
                        w.data[ch] = format(w, fmt);
                        return std::move(w.data);
                    });
            } else if (auto& norm = stage["normalize"]) {
                auto type = norm.as();
                boost::locale::norm_type n;
                if (type == "default") {
                    n = boost::locale::norm_default;
                } else if (type == "nfd" or type == "NFD") {
                    n = boost::locale::norm_nfd;
                } else if (type == "nfc" or type == "NFC") {
                    n = boost::locale::norm_nfc;
                } else if (type == "nfkd" or type == "NFKD") {
                    n = boost::locale::norm_nfkd;
                } else if (type == "nfkc" or type == "NFKC") {
                    n = boost::locale::norm_nfkc;
                } else {
                    throw wordgen_error(ec::invalid_normalization_form,
                                        kblib::quoted(type), "", "");
                }
                log_debug("normalize: ", type);
                ret.emplace_back(normalizer{n});
            } else {
                // TODO(killerbee13): Improve this error handling
                throw wordgen_error(ec::invalid_stage_type, "", "", "");
            }
        } else if (stage.IsSequence()) {
            std::vector> steps;
            for (const auto& step : stage) {
                auto match = step["m"].as();
                if (match.empty()) {
                    continue;
                }
                steps.emplace_back(
                    srell::regex{match, srell::regex_constants::dotall},
                    translate_r(step["r"].as()));
            }
            ret.emplace_back([steps = std::move(steps)](Word w, channelID ch) {
                log_debug("transform: regex");
                for (const auto& [m, r] : steps) {
                    w.data[ch] = srell::regex_replace(w.data[ch], m, r);
                }
                return std::move(w.data);
            });
        } else {
            throw wordgen_error(ec::invalid_stage_type, "", "", "");
        }
    }
    return ret;
}

/// Builds a datafile from a parsed YAML document.
///
/// Reserved keys are consumed first and removed from `data`:
///   - "control": mapping that may hold "channels", "format", "replace",
///     "grammars", "functions" and "start". The two middle-named features
///     "grammars" and "functions" are only reported as not implemented. Any
///     other key is reported and ignored.
///   - top-level "channels" / "replace": used only when the control block did
///     not supply the same setting; otherwise the key is left in place and is
///     processed as a regular node.
///   - "replacement" / "replaceIPA": deprecated forms, honored only when there
///     is no top-level "replace" key.
/// Every key still present afterwards is compiled as a node and inserted into
/// `nodes[name]`, kept ordered by descending declared argument count.
///
/// Structural checks on the YAML use `assert`, so they are compiled out when
/// NDEBUG is defined.
///
/// @param data Parsed document; taken by value and modified while loading.
/// @param rng  Random generator handed to the member `bm`.
/// @throws wordgen_error ec::internal for an unrecognised control.format or a
///         branch that uses 'path' as a channel; ec::too_many_tnode_args when
///         a node declares too many template arguments; plus anything thrown
///         by extractTransforms.
datafile::datafile(YAML::Node data, RandomGenerator& rng)
    : bm(this, rng)
    , channelNames{{"val"_ch, "Words"}, {"ipa"_ch, "IPA"}, {"path"_ch, "Path"}} {
    bool control_block_has_channels = false;
    bool control_block_has_replace = false;
    r rformat = r::backslash;

    // Registers (or overrides) the display name of every listed channel.
    auto set_channel_names = [&](YAML::Node& channels) {
        for (auto&& v : channels.as>()) {
            log_debug("Set channel name ", kblib::quoted(v.first), " to ",
                      kblib::quoted(v.second));
            channelNames.insert_or_assign(l_ch(v.first), std::move(v.second));
        }
    };

    if (auto&& control = data["control"]) {
        assert(control.IsMap());
        if (auto&& channels = control["channels"]) {
            if (not channels.IsMap()) {
                log_err("'control.channels' must be a mapping.");
                assert(false);
            }
            control_block_has_channels = true;
            set_channel_names(channels);
            control.remove("channels");
        }
        // "format" is read before "replace" so it applies to the stages
        // extracted below.
        if (auto&& format = control["format"]) {
            if (format.as() == "dollar"
                or kblib::tolower(format.as()) == "ecmascript") {
                rformat = r::dollar;
            } else if (format.as() == "backslash"
                       or kblib::tolower(format.as()) == "python") {
                rformat = r::backslash;
            } else {
                log_err("control.format must be one of"
                        "\n- backslash"
                        "\n- python"
                        "\n- dollar"
                        "\n- ecmascript");
                throw wordgen_error(ec::internal, "", "", "");
            }
            // Consume the key like its siblings, so the unknown-control loop
            // below does not report a setting that was actually honored.
            control.remove("format");
        }
        if (auto&& replace = control["replace"]) {
            control_block_has_replace = true;
            if (not replace.IsSequence()) {
                log_err("'control.replace' must be a sequence.");
                assert(false);
            }
            for (auto&& stage : replace) {
                if (not stage.IsMap()) {
                    log_err("'control.replace' must be a sequence of mappings.");
                    assert(false);
                }
                this->replace.emplace_back();
                auto& rs = this->replace.back();
                assert(stage.IsMap());
                for (auto&& channel : stage) {
                    if (not channel.second.IsSequence()) {
                        log_err("'control.replace' invalid.");
                        assert(false);
                    }
                    rs.actions[l_ch(channel.first.as())]
                        = extractTransforms(channel.second, rformat);
                }
            }
            control.remove("replace");
        }
        if (auto&& grammars = control["grammars"]) {
            log_notice("control.grammars not yet implemented.");
            control.remove("grammars");
        }
        if (auto&& functions = control["functions"]) {
            log_notice("control.functions not yet implemented.");
            control.remove("functions");
        }
        if (auto&& start = control["start"]) {
            startNode = start.as();
            control.remove("start");
        }
        // Whatever is left was not recognised above.
        for (auto&& other : control) {
            log_notice("Ignoring unknown control \"",
                       kblib::escapify(other.first.as()), '"');
        }
        data.remove("control");
    }

    if (auto&& channels = data["channels"]) {
        if (control_block_has_channels) {
            log_notice(
                "treating 'channels' as normal node because control.channels "
                "overrides it");
        } else {
            if (not channels.IsMap()) {
                log_err("'channels' must be a mapping.");
                assert(false);
            }
            set_channel_names(channels);
            data.remove("channels");
        }
    }

    if (auto&& replace = data["replace"]) {
        if (control_block_has_replace) {
            log_notice("treating 'replace' as normal node because control.replace "
                       "overrides it");
        } else {
            assert(replace.IsMap());
            this->replace.resize(1);
            auto& rs = this->replace.back();
            for (auto&& ch : replace) {
                rs.actions[::l_ch(ch.first.as())]
                    = extractTransforms(ch.second, rformat);
            }
            data.remove("replace");
        }
        if (data["replacement"]) {
            log_notice("Treating 'replacement' as normal node because 'replace' "
                       "overrides it");
        }
        if (data["replaceIPA"]) {
            log_notice("Treating 'replaceIPA' as normal node because 'replace' "
                       "overrides it");
        }
    } else {
        if (auto&& replaceVal = data["replacement"]) {
            log_notice(
                "Old-style 'replacement' is deprecated, use 'replace' instead");
            assert(replaceVal.IsSequence());
            this->replace.resize(1);
            auto& rs = this->replace.back();
            rs.actions["val"_ch] = extractTransforms(replaceVal, rformat);
            data.remove("replacement");
        }
        if (auto&& replaceIPA = data["replaceIPA"]) {
            log_notice(
                "Old-style 'replaceIPA' is deprecated, use 'replace' instead");
            assert(replaceIPA.IsSequence());
            this->replace.resize(1);
            auto& rs = this->replace.back();
            rs.actions["ipa"_ch] = extractTransforms(replaceIPA, rformat);
            data.remove("replaceIPA");
        }
    }

    // Placeholder: always false, so "freq" values are always compiled as
    // expressions below rather than stored as a plain number.
    auto isDouble = [](auto /*unused*/) { return false; };

    for (auto&& n : data) {
        node tmp;
        auto nodename = n.first.as();
        tmp.name = nodename;
        // Provisional values; both fields are overwritten further down once
        // parse_nodename has run.
        std::tie(tmp.declared_argc, tmp.variadic) = argsDeclared(nodename);
        if (n.second.IsMap()) {
            log_warn("Literal nodes not implemented. Translating ", nodename,
                     " to regular node.");
            // Wrap the mapping in a one-element sequence so it is handled
            // as a single branch.
            YAML::Node branch;
            branch[0] = n.second;
            n.second = branch;
        }
        log_debug(kblib::quoted(nodename), ':');
        for (auto&& b : n.second) {
            auto& val = tmp.vals.emplace_back();
            auto& freq = tmp.freqs.emplace_back(1.0);
            auto& ocs = tmp.other_channels.emplace_back();
            for (auto&& c_v : b) {
                decltype(auto) channel = c_v.first.as();
                auto& field = c_v.second;
                if (channel == "val") {
                    val = bm.compile(field.as());
                } else if (channel == "freq") {
                    assert(field.IsScalar());
                    if (isDouble(field.as())) {
                        freq = field.as();
                    } else {
                        freq = bm.compile(field.as());
                    }
                } else if (channel == "path") {
                    throw wordgen_error(ec::internal,
                                        "'path' is reserved as a channel name.",
                                        "", "");
                } else {
                    auto ch = l_ch(channel);
                    ocs[ch] = bm.compile(field.as());
                    if (auto [it, did] = channelNames.try_emplace(ch, channel);
                        did) {
                        log_debug("Set default channel name ",
                                  kblib::quoted(channel));
                    }
                }
            }
            if (val.empty()) {
                val = bm.compile("");
            }
        }
        const auto& [name, argc, var] = parse_nodename(nodename);
        if (constexpr auto limit = static_cast(kblib::max); argc >= limit) {
            log_err("Node template arguments are limited to ",
                    static_cast(kblib::max));
            throw wordgen_error(
                ec::too_many_tnode_args, name, "\n",
                kblib::concat("declared with ", argc, " arguments."));
        }
        tmp.declared_argc = static_cast(argc);
        tmp.variadic = var;
        // Keep overloads of the same name ordered by descending arity.
        auto& matching_nodes = nodes[name];
        auto pos = std::lower_bound(matching_nodes.begin(),
                                    matching_nodes.end(), tmp,
                                    [](const node& a, const node& b) {
                                        return a.declared_argc > b.declared_argc;
                                    });
        matching_nodes.insert(pos, std::move(tmp));
        // nodes[name].push_back(std::move(tmp));
    }
    log_debug();
}

namespace fsm { //
Cannot get state*s out of the node auto toState(const YAML::Node& node) -> std::optional { state output; output.ret = nullptr; for (const auto& rule : node) { auto name = rule.first.as(); log_debug(name, ':'); if (name == "set") { if (not rule.second.IsSequence()) { log_err("'set' rules must be a sequence."); return std::nullopt; } for (const auto& set : rule.second) { if (not set.IsSequence()) { log_err("'set' rules must be a sequence of sequences."); return std::nullopt; } log_debug('\t', kblib::quoted(set[0].as()), ", ", kblib::quoted(set[1].as())); auto set1 = tr::expand_set(tr::widen(set[0].as())); log_debug("Expands to:"); log_debug('\t', kblib::quoted(set1), ", ", kblib::quoted(set[1].as())); output.set.push_back( {decltype(state::set_action_t::matches){set1.begin(), set1.end()}, state::action_t{set[1].as(), nullptr}}); } } else if (name == "map") { if (not rule.second.IsSequence()) { log_err("'map' rules must be a sequence."); return std::nullopt; } for (const auto& map : rule.second) { if (not map.IsSequence()) { log_err("'map' rules must be a sequence of sequences."); return std::nullopt; } log_debug('\t', kblib::quoted(map[0].as()), ", ", kblib::quoted(map[1].as())); log_debug("\t -> ", kblib::quoted(map[2].as())); auto set1 = tr::expand_set(tr::widen(map[0].as())); auto set2 = tr::expand_set(tr::widen(map[1].as())); log_debug("Expands to:"); log_debug('\t', kblib::quoted(set1), ", ", kblib::quoted(set2)); log_debug("\t -> ", kblib::quoted(map[2].as())); std::pair tmp{set1, set2}; output.map.push_back({decltype(state::map_action_t::in_set){ tmp.first.begin(), tmp.first.end()}, decltype(state::map_action_t::out_set){ tmp.second.begin(), tmp.second.end()}, nullptr}); } } else if (name == "match") { if (not rule.second.IsSequence()) { log_err("'match' rule must be a sequence."); return std::nullopt; } for (const auto& match : rule.second) { if (not match.IsSequence()) { log_err("'match' rule must be a sequence of sequences."); return std::nullopt; } 
log_debug('\t', kblib::quoted(match[0].as())); if (match[1]) { log_debug("\t -> ", kblib::quoted(match[1].as())); } auto tmp = tr::expand_set(tr::widen(match[0].as())); output.match.push_back({decltype(state::match_action_t::matches){ tmp.begin(), tmp.end()}, nullptr}); } } else if (name == "return") { if (not rule.second.IsScalar()) { log_err("'return' rule must be a node-name"); return std::nullopt; } // Can't get state*s at this stage, so leave this rule for later. log_debug("\t -> ", kblib::quoted(rule.second.as())); } else if (name == "end") { log_debug("\t", kblib::quoted(rule.second.as())); output.end = rule.second.as(); // character rules } else if (name == "default") { if (not rule.second.IsSequence()) { log_err("'default' rule must be a sequence"); return std::nullopt; } log_debug("\t", kblib::quoted(rule.second[0].as())); if (rule.second[1]) { log_debug("\t -> ", kblib::quoted(rule.second[1].as())); } output.def = {rule.second[0].as(), nullptr}; } else { auto uname = tr::widen(name); if (uname.length() > 1) { log_warn("Ignoring unknown rule '", name, "'."); } else { if (not rule.second.IsSequence()) { log_err("Character rule must be a sequence"); return std::nullopt; } log_debug("Character rule: ", kblib::quoted(rule.second[0].as())); if (rule.second[1]) { log_debug("\t -> ", kblib::quoted(rule.second[1].as())); } output.cmap[uname[0]] = {rule.second[0].as(), nullptr}; } } } return output; } auto set_dests(std::map& states, std::string sname, const YAML::Node& data) -> bool { using kblib::to_signed; auto& cstate = states[sname]; for (const auto& rule : data) { auto name = rule.first.as(); if (name == "set") { for (std::size_t i = 0; i < cstate.set.size(); ++i) { if (rule.second[i][2]) { cstate.set[i].action.dest = &states.at(rule.second[i][2].as()); #if LOG_LOADING_NOISY log_debug(sname, ": ", name, " -> ", rule.second[i][2].as()); #endif } else { cstate.set[i].action.dest = &cstate; #if LOG_LOADING_NOISY log_debug(sname, ": ", name, " -> ", sname); 
#endif } } continue; } if (name == "map") { for (std::size_t i = 0; i < cstate.map.size(); ++i) { if (rule.second[i].size() > 2) { cstate.map[i].dest = &states.at(rule.second[i][2].as()); #if LOG_LOADING_NOISY log_debug(sname, ": ", name, " -> ", rule.second[i][2].as()); #endif } else { cstate.map[i].dest = &cstate; #if LOG_LOADING_NOISY log_debug(sname, ": ", name, " -> ", sname); #endif } } continue; } if (name == "match") { for (std::size_t i = 0; i < cstate.match.size(); ++i) { if (rule.second[i].size() > 1) { cstate.match[i].dest = &states.at(rule.second[i][1].as()); #if LOG_LOADING_NOISY log_debug(sname, ": ", name, " -> ", rule.second[i][1].as()); #endif } else { cstate.match[i].dest = &cstate; #if LOG_LOADING_NOISY log_debug(sname, ": ", name, " -> ", sname); #endif } } continue; } if (name == "return") { cstate.ret = &states.at(rule.second.as()); #if LOG_LOADING_NOISY log_debug(sname, ": ", name, " -> ", rule.second.as()); #endif continue; } if (name == "end") { continue; } // character rules if (name == "default") { if (rule.second.size() > 1) { cstate.def->dest = &states.at(rule.second[1].as()); #if LOG_LOADING_NOISY log_debug(sname, ": ", name, " -> ", rule.second[1].as()); #endif } else { cstate.def->dest = &cstate; #if LOG_LOADING_NOISY log_debug(sname, ": ", name, " -> ", sname); #endif } continue; } auto uname = tr::widen(name); if (rule.second.size() > 1) { if (not kblib::get_check(states, rule.second[1].as())) { log_err("FSM state '", sname, "' rule '", name, ": [", kblib::quoted(rule.second[0].as()), ", ", kblib::quoted(rule.second[1].as()), "]' references unknown state '", rule.second[1].as(), "'."); return false; } cstate.cmap[uname[0]].dest = &states.at(rule.second[1].as()); #if LOG_LOADING_NOISY log_debug(sname, ": ", name, " -> ", rule.second[1].as()); #endif } else { cstate.cmap[uname[0]].dest = &cstate; #if LOG_LOADING_NOISY log_debug(sname, ": ", name, " -> ", sname); #endif } } return true; } } // namespace fsm namespace YAML { auto 
convert::decode(const Node& node, fsm::state_machine& sm) -> bool { if (not node.IsMap() or not node["S"]) { log_err( "State machines must be a mapping and must include an \"S\" state."); return false; } if (node["reversed"]) { auto dir = node["reversed"].as(); if (dir > 3 or dir < 0) { log_warn("FSM directions should only be 0, 1, 2, or 3. Interpreting " "direction 'reversed: ", dir, "' as ", kblib::to_unsigned(dir) & 3u, "."); dir &= 3u; } sm.dir = static_cast(dir); } else { sm.dir = fsm::state_machine::dir_override::none; } // populate states for (const auto& state : node) { auto name = state.first.as(); log_debug("State ", kblib::quoted(name), ':'); if (name == "reversed") { continue; } auto tstate = fsm::toState(state.second); if (not tstate) { log_err("Failed to extract FSM state ", kblib::quoted(name), "."); return false; } sm.states[name] = *tstate; auto it = sm.states.find(name); it->second.name = it->first; #if LOG_LOADING_NOISY std::clog << name << '\n'; #endif } // set dest pointers for (const auto& state : node) { if (not set_dests(sm.states, state.first.as(), state.second)) { log_err("Reference to undefined state."); return false; } } return true; } } // namespace YAML