#include "tr.h"

#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <iterator>
#include <string>
#include <string_view>
#include <vector>

#include "utf8.h"

namespace tr {

namespace {

	//True when the high bit of the byte is set, i.e. the byte is not a plain
	//7-bit ASCII character.
	bool is_multibyte(char c) {
		return static_cast<bool>(kblib::to_unsigned(c) & 128);
	}

	//True when no byte of `text` has its high bit set.
	bool is_ascii(std::string_view text) {
		return std::none_of(text.begin(), text.end(), is_multibyte);
	}

	//Builds a 256-entry lookup table indexed by unsigned byte value.
	//  set1: bytes to translate from.
	//  set2: replacement values, walked in parallel with set1; the caller must
	//        make sure it holds at least as many elements as set1.
	//Returns the table; bytes not named in set1 keep their default entry.
	template <typename To, typename str>
	std::array<To, 256> ascii_buildmap(std::string_view set1, const str& set2) {
		//by default, each character maps to itself
		auto map = kblib::buildiota<std::array<To, 256>>('\0');
		//fill in mappings; a later duplicate in set1 overrides an earlier one
		kblib::for_each(set1.begin(), set1.end(), set2.begin(),
		                [&](char f, To t) { map[kblib::to_unsigned(f)] = t; });
		return map;
	}

	//Collects the elements produced by iterating a utf8_string built from
	//`text`. NOTE(review): assumes utf8_string(data, size) decodes the whole
	//view into code points - confirm against the utf8_string version in use.
	std::vector<char32_t> code_points(std::string_view text) {
		const auto decoded = utf8_string(text.data(), text.size());
		std::vector<char32_t> out;
		for (char32_t c : decoded) {
			out.push_back(c);
		}
		return out;
	}

} // namespace

//Returns the characters of `input` that also occur in `whitelist`, in their
//original order.
//  whitelist: UTF-8 text naming the characters to keep. An empty whitelist
//             keeps nothing.
//  input:     UTF-8 text to filter.
//Decoding the whitelist goes through utf8::utf8to32, so whatever that function
//does on malformed UTF-8 applies to a non-ASCII whitelist.
std::string filter(std::string whitelist, std::string_view input) {
	if (whitelist.empty()) {
		return {};
	}

	//ascii only: a per-byte table is enough, and bytes of multi-byte input
	//characters can never match it
	if (is_ascii(whitelist)) {
		std::array<bool, 256> mask{};
		for (char c : whitelist) {
			mask[kblib::to_unsigned(c)] = true;
		}
		std::string out;
		for (char c : input) {
			//index through to_unsigned: a plain char may be negative
			if (mask[kblib::to_unsigned(c)]) {
				out.push_back(c);
			}
		}
		return out;
	}

	//unicode
	std::vector<char32_t> uwhitelist;
	utf8::utf8to32(whitelist.begin(), whitelist.end(),
	               std::back_inserter(uwhitelist));
	if (uwhitelist.empty()) {
		return {};
	}
	const char32_t highest
	    = *std::max_element(uwhitelist.begin(), uwhitelist.end());
	//work around std::vector<bool>
	//+1 so that the highest whitelisted code point is itself a valid index
	std::vector<char> mask(static_cast<std::size_t>(highest) + 1);
	for (char32_t c : uwhitelist) {
		mask[c] = 1;
	}

	//compare whole code points, not bytes, and re-encode the survivors
	const auto uinput = utf8_string(input.data(), input.size());
	utf8_string kept;
	for (char32_t c : uinput) {
		if (c < mask.size() && mask[c]) {
			kept.push_back(c);
		}
	}
	return kept.cpp_str();
}

//Replaces every character of `input` that occurs in `set1` with the character
//at the same position in `set2`; all other characters are copied unchanged.
//  set1:  characters to translate from. If empty, `input` is returned as is.
//  set2:  characters to translate to. Must not be empty (asserted). If it has
//         fewer characters than set1 it is extended with its last character.
//  input: text to translate.
//When both sets are pure ASCII the translation is done byte by byte; otherwise
//all three strings are handled as sequences of code points.
std::string translate(std::string_view set1, std::string set2,
                      std::string_view input) {
	if (set1.empty()) {
		return std::string(input);
	}
	assert(set2.length());
	//with NDEBUG the assert disappears; do not call back() on an empty string
	if (set2.empty()) {
		return std::string(input);
	}

	if (is_ascii(set1) && is_ascii(set2)) {
		//ascii to ascii only: one byte is one character, so padding by bytes
		//is padding by characters
		if (set2.length() < set1.length()) {
			set2.append(set1.length() - set2.length(), set2.back());
		}
		const auto map = ascii_buildmap<char>(set1, set2);
		return kblib::build<std::string>(
		    input.begin(), input.end(),
		    [&](char c) { return map[kblib::to_unsigned(c)]; });
	}

	//at least one set holds multi-byte characters: work on code points
	const std::vector<char32_t> from = code_points(set1);
	std::vector<char32_t> to = code_points(set2);
	if (from.empty() || to.empty()) {
		return std::string(input);
	}
	if (to.size() < from.size()) {
		//pad with the last whole code point, never with a lone trailing byte
		const char32_t pad = to.back();
		to.resize(from.size(), pad);
	}

	//identity table just large enough to index every code point of set1
	const char32_t top = *std::max_element(from.begin(), from.end());
	auto map = kblib::buildiota<std::vector<char32_t>>(
	    static_cast<std::size_t>(top) + 1, U'\0');
	for (std::size_t i = 0; i < from.size(); ++i) {
		map[from[i]] = to[i];
	}

	const auto uinput = utf8_string(input.data(), input.size());
	return kblib::build_dy<utf8_string>(
	           uinput.begin(), uinput.end(),
	           //code points beyond the table are not in set1: pass through
	           [&](char32_t c) { return c < map.size() ? map[c] : c; })
	    .cpp_str();
}

//Algorithm adapted from python-tr make_char_list.
//This does not support all features of POSIX tr; to be replaced later by the
//algorithm used in the heirloom toolchest's tr.
//
//Expands range expressions in a character set description.
//  set: the description, e.g. U"a-e".
//Returns the expanded list, e.g. U"abcde".
//Rules implemented here:
//  - "x-y" expands to every code point from x to y inclusive. If y does not
//    come after x, no characters are inserted between the two endpoints.
//  - "\-" yields a literal '-'. A backslash before any other character is
//    kept in the output together with that character.
//  - A '-' that is the first or the last character of the set is literal.
std::u32string expand_set(std::u32string_view set) {
	std::u32string list;
	bool escape{false};
	bool hyphen{false};
	for (char32_t c : set) {
		if (c == U'\\') {
			if (!escape) {
				escape = true;
				//kept for now; removed again if the next character is '-'
				list.push_back(c);
				continue;
			}
		} else if (c == U'-') {
			if (escape) {
				//drop the backslash pushed on the previous iteration
				list.pop_back();
			} else if (!list.empty()) {
				hyphen = true;
				continue;
			}
			//otherwise there is no range start yet: '-' is literal
		} else if (hyphen) {
			//hyphen is only ever set while list is non-empty
			const char32_t first = list.back();
			if (first < c) {
				for (char32_t v = first + 1; v != c; ++v) {
					list.push_back(v);
				}
			}
		}
		list.push_back(c);
		escape = false;
		hyphen = false;
	}
	//a range that was opened by the final character never got an end point
	if (hyphen && !escape) {
		list.push_back(U'-');
	}
	return list;
}

} //namespace tr