// NOTE(review): this copy of the file went through a lossy extraction. Line
// breaks were collapsed and every angle-bracketed span is missing: the targets
// of the system #include lines, all template parameter/argument lists, and
// what appear to be markup tags inside several string literals. Those tokens
// are reproduced exactly as found instead of being guessed at; restore them
// from the real file before building.
#include "vnum.h"
#include "error.h"
#include "fda.h"
#include "interpolate.h"
#include "logger.h"
#include "tests.h"

#include
#include
#include
#include

#include "randutils.hpp"
#include
#include
#include
static randutils::random_generator randgen;

#include "boost/locale.hpp"
#include "srell/srell.hpp"
#include "strong_type.h"

#include
#include
#include
#include
#include

using std::cerr;
using std::cin;
using std::clog;
using std::cout;
using std::string;
using std::u16string;
using std::u32string;
using namespace std::string_literals;

/*
 * refParse: string -> vector
 *   class reference
 *
 * chooseFrom: grammar, node, depth, maxdepth -> word
 *   class word
 *   class grammar
 *
 * applyRE: word -> word
 *
 * listAll: grammar, node, depth, maxdepth -> list
 *
 * formatWord: word, opts, formatStr -> string
 *
 * printPath: path -> string
 * readPath: string -> path
 * followPath: path -> word
 *   class path
 *
 * main:
 *   gen: grammar, node,, count, depth -> list
 *   list: grammar, node, depth -> list
 *
 *
 *
 * loadGrammar: YAML::Node -> grammar
 * dumpGrammar: grammar -> YAML::Node
 *
 * toBNF: grammar, node -> BNFGrammar
 * fromBNF: BNFGrammar -> grammar
 *
 */

/// Print the extended help banner (with the version number) followed by the
/// text in `help_text`, which is expected to be defined by "helptext.inc".
auto printHelpText() -> void {
#include "helptext.inc"
	std::cout << "Extended help for wordgen v" << vnum << '\n'
	          << std::string_view{help_text, sizeof(help_text)};
}

/// Trait used by ParserConstraint::check to recognise optional-like results.
template constexpr inline bool is_optional = false;
template constexpr inline bool is_optional> = true;

/// Adapts a callable `Parser` (string in, parsed value out) to the
/// TCLAP::Constraint interface so that command-line values are validated by
/// running them through the parser.
template class ParserConstraint : public TCLAP::Constraint {
	using result_type = std::invoke_result_t;

 public:
	/// Take the usage string and description from the parser itself.
	ParserConstraint(Parser par)
	    : _parser(std::move(par))
	    , _usage(_parser.usage())
	    , _desc(_parser.desc()) {}
	/// Use `usage` for both the short ID and the description.
	/// (_usage is declared before _desc, so the copy happens before the move.)
	ParserConstraint(Parser par, std::string usage)
	    : _parser(std::move(par))
	    , _usage(usage)
	    , _desc(std::move(usage)) {}
	/// Explicit usage string and description.
	ParserConstraint(Parser par, std::string usage, std::string desc)
	    : _parser(std::move(par))
	    , _usage(std::move(usage))
	    , _desc(std::move(desc)) {}

	/// Run the wrapped parser on `val` and return whatever it returns.
	[[nodiscard]] auto parse(const std::string& val) const -> decltype(auto) {
		return _parser(val);
	}

	/// TCLAP validation hook.
	/// @return false if the parser throws a std::exception or yields a
	/// "failure" result; true otherwise. For variant-like results, failure
	/// means holding a sentinel alternative (the sentinel types were lost in
	/// this copy); for optional results it means being empty. Any other result
	/// type is always accepted.
	[[nodiscard]] auto check(const std::string& val) const -> bool override {
		try {
			auto result = _parser(val);
			if constexpr (kblib::is_variant_like_v) {
				using tuple_t = typename kblib::list_as_tuple::type;
				if constexpr (kblib::contains_type_v) {
					return not std::holds_alternative(result);
				} else if constexpr (kblib::contains_type_v) {
					return not std::holds_alternative(result);
				} else if constexpr (kblib::contains_type_v) {
					return not std::holds_alternative(result);
				} else if constexpr (kblib::contains_type_v) {
					return not std::holds_alternative(result);
				} else {
					return bool(result);
				}
			} else if constexpr (is_optional) {
				// `if constexpr` so that bool(result) is only instantiated for
				// optional results, not for every non-variant result type.
				return bool(result);
			} else {
				return true;
			}
		} catch (std::exception&) {
			return false;
		}
	}

	[[nodiscard]] auto shortID() const -> std::string override { return _usage; }
	[[nodiscard]] auto description() const -> std::string override {
		return _desc;
	}

 private:
	Parser _parser;
	std::string _usage;
	std::string _desc;
};

/// Parse result for the literal progress setting "none".
struct none {};
STRONG_TYPE(percent, double);
using seconds = std::chrono::duration;

/// Entry point: parse the command line, then run one of the commands
/// `gen` (generate words), `list` (enumerate words) or `diag` (diagnostics).
/// @return 0 on success; 1 on argument errors, missing nodes, unreadable
/// input, generator errors or an unimplemented command.
auto main(int argc, char** argv) -> int {
	// c_test_func();
	try {
		log_info("wordgen v", vnum);
		TCLAP::CmdLine cmd("Generate random words/sentences matching"
		                   " a context-free grammar",
		                   ' ', vnum);
		const TCLAP::UnlabeledValueArg file(
		    "file",
		    "The datafile to use, or 'help', which will print extended "
		    "information.",
		    true, "", "filename|help", cmd);
		const TCLAP::UnlabeledValueArg root("root", "Root node", true, "",
		                                    "root", cmd);
		std::vector commands_allowed{"gen", "list", "diag", "show", "xform"};
		TCLAP::ValuesConstraint commands(commands_allowed);
		const TCLAP::UnlabeledValueArg command("command", "command", false,
		                                       "gen", &commands, cmd);
		const TCLAP::MultiArg channels("c", "channel", "print CHANNEL", false,
		                               "CHANNEL", cmd);
		const TCLAP::SwitchArg ipaMode("p", "ipa",
		                               "print IPA transcriptions (-c ipa)", cmd);
		const TCLAP::SwitchArg showFreqs(
		    "f", "freqs", "print calculated frequencies (-c freq)", cmd);
		const TCLAP::SwitchArg allChannels(
		    "C", "printAll",
		    "print all channels present (overrides -c, -p, -f, -V, -F)", cmd);
		const TCLAP::ValueArg depth(
		    "d", "depth",
		    "maximum recursion depth (defaults to 24, or equal to expansion "
		    "limit if -e is used without -d.)",
		    false, -1, "int", cmd);
		const TCLAP::ValueArg expansions(
		    "e", "expansions",
		    "maximum number of expansions (implies "
		    "-d equal to -e if used alone) (defaults to depth^2)",
		    false, -1, "int", cmd);
		const TCLAP::SwitchArg HTMLMode("H", "html",
		                                "write output as HTML table", cmd);
		const TCLAP::ValueArg num("n", "num", "number of words to generate",
		                          false, 1, "int", cmd);
		const TCLAP::SwitchArg noVal("V", "noVal",
		                             "suppress implicit 'val' printing", cmd);
		const TCLAP::SwitchArg quiet("q", "quiet", "don't print the header",
		                             cmd);
		const TCLAP::SwitchArg silent(
		    "Q", "silent", "don't print anything except diagnostics", cmd);

		// Parser for the -i/--progress value: "none", a number followed by
		// 's' (time interval) or a number followed by '%' (share of the run).
		// Anything else yields std::false_type, which check() rejects.
		auto progress_parser = ParserConstraint{
		    [](std::string_view val) -> std::variant {
			    if (val.length() <= 1) {
				    return std::false_type{};
			    } else if (val == "none") {
				    return none{};
			    } else if (kblib::ends_with(val, 's')) {
				    const char* begin = val.data();
				    char* end{};
				    double value = std::strtod(begin, &end);
				    // The number must extend right up to the suffix character.
				    if (end != begin + (val.size() - 1)) {
					    return std::false_type{};
				    } else {
					    return seconds{value};
				    }
			    } else if (kblib::ends_with(val, '%')) {
				    const char* begin = val.data();
				    char* end{};
				    double value = std::strtod(begin, &end);
				    if (end != begin + (val.size() - 1)) {
					    return std::false_type{};
				    } else {
					    return percent{value};
				    }
			    } else {
				    return std::false_type{};
			    }
		    },
		    "none|s|%"};
		// Start-up self-checks of the progress parser (debug builds only).
		assert(std::get(progress_parser.parse("1%")) == percent{1});
		assert(std::get(progress_parser.parse("1s")) == seconds{1});
		assert(std::get(progress_parser.parse("10%")) == percent{10});
		assert(std::get(progress_parser.parse("5s")) == seconds{5});
		assert(std::holds_alternative(progress_parser.parse("%")));
		assert(std::holds_alternative(progress_parser.parse("")));
		assert(std::holds_alternative(progress_parser.parse("2")));
		assert(progress_parser.check("1%"));
		assert(progress_parser.check("1s"));
		assert(not progress_parser.check("%"));
		assert(not progress_parser.check(""));
		assert(not progress_parser.check("2"));

		const TCLAP::ValueArg progress(
		    "i", "progress",
		    "print periodic progress updates (works best with -Q)", false,
		    "none", &progress_parser, cmd);
		const TCLAP::ValueArg fstr("F", "fmt",
		                           "format string for printing words", false,
		                           "", "fmt_str", cmd);
		const TCLAP::ValueArg hfstr("", "hfmt", "format string for the header",
		                            false, "", "fmt_str", cmd);
		std::vector loglevels_allowed{"none",   "err",  "error", "warn",
		                              "notice", "info", "debug"};
		TCLAP::ValuesConstraint loglevels(loglevels_allowed);
		const TCLAP::ValueArg loglevel(
		    "l", "loglevel", "Set the logging level. Defaults to 'notice'.",
		    false, "notice", &loglevels, cmd);
		const TCLAP::SwitchArg listZeros(
		    "0", "listZeros", "list: include 0-frequency values. NYI", cmd);
		const TCLAP::SwitchArg dumpREs(
		    "", "regex", "diag: Dump regular expressions. NYI", cmd);
		const TCLAP::SwitchArg dumpNodes("", "nodes",
		                                 "diag: Dump switching nodes. NYI", cmd);
		const TCLAP::SwitchArg testREs(
		    "", "retest", "diag: Apply tranformations from CHANNELS to input",
		    cmd);
		const TCLAP::ValueArg check_REs(
		    "", "checktokens",
		    "diag: Check the regexes used for tokenization for correctness",
		    false, "", "filename", cmd);
		const TCLAP::MultiArg testParser(
		    "", "tstr", "diag: Test template parser with file(s)", false,
		    "filename", cmd);
		const TCLAP::SwitchArg EbnfExport(
		    "", "bnf", "diag: Export to EBNF (val only). NYI", cmd);
		const TCLAP::SwitchArg dumpfmt(
		    "", "dumpfmt", "diag: Show the effective format string.", cmd);
		const TCLAP::SwitchArg showPaths("P", "path", "Print paths (-c path)",
		                                 cmd);
		const TCLAP::SwitchArg keepHistory(
		    "K", "keepHistory",
		    "Print intermediate tranformations (may be hard to read)", cmd);
		const TCLAP::ValueArg KHSep("", "KHSep",
		                            "Separator between transformation stages",
		                            false, " → ", "string", cmd);
		const TCLAP::ValueArg seed("r", "seed", "Random seed", false, "",
		                           "seed", cmd);
		cmd.parse(argc, argv);

		const boost::locale::generator gen;
		std::locale::global(gen(""));

		if (file.getValue() == "help") {
			printHelpText();
			return EXIT_SUCCESS;
		}

		const bool noheader = quiet or silent;

		// Channels to print, in order: implicit "val" unless suppressed, the
		// shorthand switches, then every explicit -c argument.
		std::vector printChannels;
		if (not noVal.getValue()) {
			printChannels.emplace_back("val");
		}
		if (ipaMode.getValue()) {
			printChannels.emplace_back("ipa");
		}
		if (showFreqs.getValue()) {
			printChannels.emplace_back("freq");
		}
		if (showPaths.getValue()) {
			printChannels.emplace_back("path");
		}
		for (const auto& c : channels) {
			printChannels.push_back(c);
		}

		// Seed from the characters of -r if given, otherwise from
		// randutils::auto_seed_128.
		randutils::seed_seq_fe128 seed_data{{0}};
		if (not seed.getValue().empty()) {
			seed_data.seed(seed.getValue().begin(), seed.getValue().end());
		} else {
			[&](auto& seed) {
				std::array data{};
				seed.param(data.begin());
				seed_data.seed(data.begin(), data.end());
			}(randutils::auto_seed_128{}.base());
		}

		// Map the (already validated) log level name to its enumerator by
		// switching on its FNV-32a hash.
		set_log_level([&] {
			using namespace kblib::literals;
			switch (kblib::FNV32a(loglevel.getValue())) {
			case "none"_fnv32:
				return log_level::silent;
			case "err"_fnv32:
			case "error"_fnv32:
				return log_level::err;
			case "warn"_fnv32:
				return log_level::warn;
			case "notice"_fnv32:
				return log_level::notice;
			case "info"_fnv32:
				return log_level::info;
			case "debug"_fnv32:
				return log_level::debug;
			default:
				log_warn("Unknown log level ",
				         kblib::quoted(loglevel.getValue()),
				         " ignored. Validation bug?");
				return get_log_level();
			}
		}());

		// Resolve the depth / expansion limits: -d alone implies e = d^2,
		// -e alone implies d = e, neither keeps the defaults 24 / 576.
		int cdepth = 24, cexps = 576;
		if (depth.getValue() != -1 and expansions.getValue() != -1) {
			cdepth = depth.getValue();
			cexps = expansions.getValue();
		} else if (depth.getValue() != -1) {
			cdepth = depth.getValue();
			cexps = cdepth * cdepth;
		} else if (expansions.getValue() != -1) {
			cdepth = cexps = expansions.getValue();
		}

		// std::ifstream retest(file.getValue());
		// testRE(retest);
		try {
			// Build the effective format string for rows (header == false) or
			// for the header line (header == true).
			// NOTE(review): the empty string literals in the HTML branches
			// look like markup that was stripped from this copy; they are
			// reproduced as found.
			auto gen_format = [&](const auto& format, bool header) {
				std::string out;
				if (allChannels.getValue()) {
					if (HTMLMode.getValue()) {
						out = "";
						if (header) {
							if (not noVal.getValue()) {
								out += "{val!H}"s;
							}
							out += "{%all%!H}{path!H}"s;
						} else {
							if (not noVal.getValue()) {
								out += "{val!h}"s;
							}
							out += "{%all%!h}{path!h}"s;
						}
						out += "";
					} else {
						if (not noVal.getValue()) {
							out = "{val}\t";
						}
						out += "{%all%}\t{path}";
					}
				} else if (format.empty()) {
					// No user format: one {channel} field per selected channel.
					std::string flags;
					if (HTMLMode.getValue()) {
						out = "";
						if (header) {
							flags = "!H";
						} else {
							flags = "!h";
						}
					}
					auto base_str = std::accumulate(
					    printChannels.begin(), printChannels.end(), out,
					    [&, first = true](std::string out,
					                      const std::string& ch) mutable {
						    if (first or HTMLMode.getValue()) {
							    first = false;
							    return kblib::concat(out, '{', ch, flags, '}');
						    } else {
							    return kblib::concat(out, "\\t{", ch, flags,
							                         '}');
						    }
					    });
					if (HTMLMode.getValue()) {
						base_str += "";
					}
					return base_str;
				} else {
					if (HTMLMode.getValue()) {
						out = kblib::concat("", kblib::html_encode(format), "");
					} else {
						out = format;
					}
				}
				return out;
			};
			const string fmt = gen_format(fstr.getValue(), false);
			const string hfmt = gen_format(hfstr.getValue(), true);
			// cout << "Using format " << kblib::quoted(fmt) << '\n';

			if (command.getValue() == "gen"s) {
				RandomGenerator rng(seed_data);
				auto d_file = YAML::LoadFile(file.getValue());
				const auto& rootname = root.getValue();
				// A root containing '{' is treated as a template: it is
				// injected into the data as the node ":cmdline:" and that
				// node becomes the root.
				const bool direct_ref = not kblib::contains(rootname, '{');
				if (not direct_ref) {
					d_file[":cmdline:"][0]["val"] = root.getValue();
				}
				datafile Data{d_file, rng};
				node* rootnode;
				std::vector p_args;
				if (direct_ref) {
					rootnode = Data.find_node(rootname);
					if (not rootnode) {
						cerr << "Could not find node with no arguments named "
						     << kblib::quoted(root.getValue()) << '\n';
						cerr << "List of all nodes:\n";
						for (const auto& [name, nodes] : Data.nodes) {
							cerr << kblib::quoted(name) << '\t' << nodes.size()
							     << '\n';
						}
						return 1;
					}
					if (kblib::contains_any(rootname, "|:!")) {
					}
				} else {
					rootnode = Data.find_node(":cmdline:");
					assert(rootnode);
				}
				if (HTMLMode.getValue()) {
					std::cout << "";
				}
				if (not noheader) {
					std::cout << format(Data.channelNames, hfmt) << '\n';
					if (not HTMLMode.getValue()) {
						std::cout << kblib::repeat('-', 40) << '\n';
					}
				}

				// Decode -i into (percentage, time); at most one is non-zero.
				const auto [percentage_binding, time] = [&]() -> std::pair {
					using namespace std::chrono_literals;
					if (progress.getValue() != "none") {
						auto result
						    = progress_parser.parse(progress.getValue());
						if (auto pc = std::get_if(&result)) {
							return {*pc, 0s};
						} else if (auto t = std::get_if(&result)) {
							return {0., *t};
						} else {
							assert(false);
							// Not expected once the constraint has accepted
							// the value, but with NDEBUG the assert vanishes
							// and control must not run off the end of a
							// value-returning lambda.
							return {0, 0s};
						}
					} else {
						return {0, 0s};
					}
				}();
				const auto percentage
				    = percentage_binding; // work around lambda capture rules
				using clock = std::chrono::steady_clock;
				auto iterations_per_update = [&]() -> int {
					if (percentage == 0) {
						return kblib::max;
					} else {
						// An interval below one word truncates to 0, and the
						// `n % iterations_per_update` test in the loop would
						// then divide by zero; update at least every word.
						const int interval
						    = static_cast(num.getValue() * percentage / 100);
						return std::max(interval, 1);
					}
				}();
				int percent_precision{};
				int time_precision{3};
				// NOTE(review): the digit counts below include the trailing
				// unit character ("1.5%" gives 2) - confirm that is intended.
				if (percentage != 0) {
					log_info(
					    "Progress updates are percentage-based, interval: ",
					    iterations_per_update, " words");
					auto& percent_specified = progress.getValue();
					auto dot_position = percent_specified.find('.');
					if (dot_position == std::string::npos) {
						percent_precision = 0;
					} else {
						percent_precision
						    = percent_specified.length() - dot_position - 1;
					}
					percent_precision = std::min(
					    static_cast(std::lrint(std::log10(num.getValue())) - 2),
					    percent_precision);
					// For small word counts the bound above is negative, and
					// a negative stream precision falls back to the default
					// six digits; clamp like the time-based branch does.
					percent_precision = std::max(percent_precision, 0);
				} else if (time.count() > 0) {
					log_info("Progress updates are time-based, interval: ",
					         time.count(), 's');
					auto& time_specified = progress.getValue();
					auto dot_position = time_specified.find('.');
					if (dot_position == std::string::npos) {
						time_precision = 0;
					} else {
						time_precision
						    = time_specified.length() - dot_position - 1;
					}
					percent_precision = std::max(
					    static_cast(std::lrint(std::log10(num.getValue())) - 4),
					    0);
				}

				using std::chrono::duration_cast;
				const auto t_begin = clock::now();
				auto t_last_updated = clock::now();
				// Print one progress line: "n / total words [ p% ], Ts elapsed".
				auto print_update = [&](const auto n, const auto t_now) {
					std::cout << std::setw(kblib::count_digits(num.getValue()))
					          << std::setfill(' ') << std::right << n << " / "
					          << num.getValue() << " words [ " << std::fixed
					          << std::setprecision(percent_precision);
					const auto current_percent = 100. * n / num.getValue();
					// Padding so the percentage column stays aligned.
					if (percentage != 0 and n != num.getValue()) {
						std::cout << ' ';
					}
					if (current_percent < 10) {
						std::cout << ' ';
					}
					std::cout << current_percent << "% ], "
					          << std::setprecision(time_precision)
					          << duration_cast(t_now - t_begin).count()
					          << "s elapsed\n"
					          << std::defaultfloat;
				};

				for (const auto n : kblib::range(1, num.getValue() + 1)) {
					const auto t_now = clock::now();
					if ((n % iterations_per_update) == 0
					    or (time.count() != 0
					        and t_now - t_last_updated >= time)) {
						t_last_updated = t_now;
						print_update(n, t_now);
					}
					int e{};
					const auto w = format(
					    Data.generate(rng, counters{cdepth, cexps, &e},
					                  *rootnode, {}, {}, keepHistory.getValue(),
					                  KHSep.getValue()),
					    fmt);
					/*auto w = format(
					    Data.transform(chooseFrom(Data, rng,
					        counters{cdepth, cexps, &e}, *node, {}, {}),
					        keepHistory.getValue(), KHSep.getValue()),
					    fmt);*/
					if (not silent.getValue()) {
						std::cout << w << '\n';
					}
				}
				if (HTMLMode.getValue()) {
					std::cout << "\n";
				}
			} else if (command.getValue() == "list"s) {
				RandomGenerator rng(seed_data);
				const datafile Data{YAML::LoadFile(file.getValue()), rng};
				auto* node = Data.find_node(root.getValue());
				if (not node) {
					cerr << "Could not find node named "
					     << kblib::quoted(root.getValue()) << '\n';
					cerr << "List of all nodes:\n";
					for (const auto& [name, nodes] : Data.nodes) {
						cerr << kblib::quoted(name) << '\t' << nodes.size()
						     << '\n';
					}
					return 1;
				}
				if (HTMLMode.getValue()) {
					std::cout << "";
				}
				if (not noheader) {
					std::cout << format(Data.channelNames, hfmt) << '\n';
					if (not HTMLMode.getValue()) {
						std::cout << kblib::repeat('-', 40) << '\n';
					}
				}
				int e{};
				for (const auto& word :
				     enumerate(Data, counters{cdepth, cexps, &e}, *node, {}, {},
				               listZeros.getValue())) {
					auto w = format(Data.transform(word, keepHistory.getValue(),
					                               KHSep.getValue()),
					                fmt);
					if (not silent.getValue()) {
						std::cout << w << '\n';
					}
					// Reset the counter that `counters` points at before the
					// next word is produced.
					e = 0;
				}
				if (HTMLMode.getValue()) {
					std::cout << "\n";
				}
			} else if (command.getValue() == "diag"s) {
				RandomGenerator rng;
				const template_machine BM(nullptr, rng);
				if (auto& tests = testParser.getValue(); not tests.empty()) {
					for (auto& t : tests) {
						try {
							std::ifstream tfile(t);
							if (not tfile) {
								cerr << "Could not open file " << std::quoted(t)
								     << '\n';
								// Do not run the tests on a stream that never
								// opened (it would go on to report success).
								continue;
							}
							tfile.exceptions(std::ios_base::badbit);
							if (not test_file(tfile, BM)) {
								cout << "Test failure, stopping.\n";
								break;
							}
							cout << "All tests passed.\n";
						} catch (std::ios_base::failure& e) {
							cerr << "Error: " << e.what() << '\n'
							     << "While reading test file: "
							     << std::quoted(t) << "\nContinuing...\n";
						}
					}
				} else if (testREs.getValue()) {
					// Construct a regex from every line of the input file.
					std::ifstream re_file(file.getValue());
					std::string regex;
					while (re_file >> kblib::get_line(regex)) {
						srell::regex re(regex);
					}
				} else if (dumpfmt.getValue()) {
					cout << std::quoted(hfmt) << '\n';
					cout << std::quoted(fmt) << '\n';
				} else if (auto& filename = check_REs.getValue();
				           not filename.empty()) {
					std::ifstream file(filename);
					check_tokenizer(file);
				} else {
					cout << "d: " << cdepth << " e: " << cexps
					     << " n: " << num.getValue() << '\n';
				}
			} else {
				// "show" and "xform" pass argument validation but have no
				// implementation here; report that as a failure.
				cerr << "unknown command\n";
				return 1;
			}
		} catch (const YAML::BadFile& e) {
			cerr << "Bad input file.\n"
			     << e.what() << "\nAt " << e.mark.pos << ", " << e.mark.line
			     << ':' << e.mark.column << '\n';
			return 1;
		} catch (wordgen_error& e) {
			cerr << *e.l1 << *e.l2 << *e.l3 << *e.l4;
			return 1;
		}
	} catch (TCLAP::ArgException& e) {
		cerr << "error: " << e.error() << " for arg " << e.argId() << '\n';
		return 1;
	}
}