#include "tr.h"
#include "utf8.h"

// NOTE(review): the targets of the four angle-bracket includes were lost when
// this file was extracted. The list below is rebuilt from the names the code
// uses; confirm the kblib header path against the build configuration.
#include <kblib/kblib.h>

#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <iterator>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

namespace tr {

namespace {

    // Number of distinct byte values; lookup tables indexed by a raw byte must
    // have this many entries so that non-ASCII input bytes stay in bounds.
    constexpr std::size_t byte_values = 256;

    // Converts a (possibly negative) char into a table index in [0, 256).
    constexpr auto byte_index(char c) noexcept -> std::size_t {
        return static_cast<unsigned char>(c);
    }

    // True when the byte has its high bit set, i.e. it is not 7-bit ASCII.
    constexpr auto is_multibyte(char c) noexcept -> bool {
        return (static_cast<unsigned char>(c) & 128u) != 0;
    }

    // True when every byte of `text` is 7-bit ASCII.
    auto is_ascii(std::string_view text) -> bool {
        return std::none_of(text.begin(), text.end(), is_multibyte);
    }

    // Decodes UTF-8 `text` into code points. The utf8 library's checked
    // decoder is used, so malformed input raises one of its exceptions.
    auto decode_utf8(std::string_view text) -> std::u32string {
        std::u32string code_points;
        utf8::utf8to32(text.begin(), text.end(),
                       std::back_inserter(code_points));
        return code_points;
    }

    // Builds a byte -> byte translation table: every byte maps to itself
    // except the bytes of `set1`, which map to the byte at the same position
    // in `set2`. Later occurrences in `set1` override earlier ones.
    auto ascii_buildmap(std::string_view set1, std::string_view set2)
        -> std::array<char, byte_values> {
        std::array<char, byte_values> map{};
        // by default, each character maps to itself
        for (std::size_t i = 0; i < map.size(); ++i) {
            map[i] = static_cast<char>(static_cast<unsigned char>(i));
        }
        // fill in mappings
        const std::size_t pairs = std::min(set1.size(), set2.size());
        for (std::size_t i = 0; i < pairs; ++i) {
            map[byte_index(set1[i])] = set2[i];
        }
        return map;
    }

} // namespace

// Returns the characters of `input` that occur in `whitelist`, in their
// original order.
//
// - An empty whitelist yields an empty string.
// - If the whitelist is pure ASCII, `input` is processed byte by byte and is
//   not validated; bytes >= 0x80 are never whitelisted and are dropped.
// - Otherwise both strings are treated as UTF-8 and compared code point by
//   code point. Malformed UTF-8 in either string raises an exception from the
//   utf8 library.
auto filter(std::string whitelist, std::string_view input) -> std::string {
    if (whitelist.empty()) {
        return {};
    }

    std::string out;

    // ascii only
    if (is_ascii(whitelist)) {
        // One slot per byte value so any input byte is a valid index.
        std::array<bool, byte_values> mask{};
        for (char c : whitelist) {
            mask[byte_index(c)] = true;
        }
        for (char c : input) {
            if (mask[byte_index(c)]) {
                out.push_back(c);
            }
        }
        return out;
    }

    // unicode
    const std::u32string allowed = decode_utf8(whitelist);
    // `allowed` is non-empty here: the whitelist is non-empty and decoded
    // successfully, so it holds at least one code point.
    const char32_t highest = *std::max_element(allowed.begin(), allowed.end());
    // vector<char> rather than vector<bool> to avoid the packed-bit
    // specialisation. The extra slot makes `highest` itself addressable.
    std::vector<char> mask(static_cast<std::size_t>(highest) + 1, 0);
    for (char32_t code_point : allowed) {
        mask[code_point] = 1;
    }

    auto cursor = input.begin();
    const auto last = input.end();
    while (cursor != last) {
        const auto sequence_start = cursor;
        // Advances `cursor` past exactly one encoded code point.
        const char32_t code_point = utf8::next(cursor, last);
        if (code_point < mask.size() and mask[code_point]) {
            // Copy the original bytes; no re-encoding is needed.
            out.append(sequence_start, cursor);
        }
    }
    return out;
}

// Wraps the UTF-8 string `s` in a range whose elements are decoded code
// points. The range refers to `s` and must not outlive it.
template <typename Str>
auto u8_range(Str& s) {
    using octet_iterator = decltype(s.begin());
    return kblib::indirect(
        utf8::iterator<octet_iterator>(s.begin(), s.begin(), s.end()),
        utf8::iterator<octet_iterator>(s.end(), s.begin(), s.end()));
}

// Replaces every character of `input` that occurs in `set1` with the
// character at the same position in `set2`; all other characters are copied
// unchanged.
//
// - An empty `set1` returns `input` unchanged.
// - `set2` must not be empty (asserted). In builds without assertions an
//   empty `set2` returns `input` unchanged instead of invoking undefined
//   behaviour.
// - If `set2` is shorter than `set1`, its last character is repeated to make
//   up the difference. If a character occurs more than once in `set1`, the
//   last occurrence wins.
// - If both sets are pure ASCII, `input` is processed byte by byte and is not
//   validated. Otherwise all three strings are treated as UTF-8 and malformed
//   sequences raise an exception from the utf8 library.
auto translate(std::string_view set1, std::string set2, std::string_view input)
    -> std::string {
    if (set1.empty()) {
        return std::string(input);
    }
    assert(set2.length());
    if (set2.empty()) {
        return std::string(input);
    }

    if (is_ascii(set1) and is_ascii(set2)) {
        // ascii to ascii only: padding by bytes is safe because every
        // character is exactly one byte long.
        if (set2.length() < set1.length()) {
            set2.append(set1.length() - set2.length(), set2.back());
        }
        const auto map = ascii_buildmap(set1, set2);
        std::string out(input.size(), '\0');
        std::transform(input.begin(), input.end(), out.begin(),
                       [&](char c) { return map[byte_index(c)]; });
        return out;
    }

    // At least one set contains multi-byte characters: work on code points so
    // that padding, pairing and lookup never split a UTF-8 sequence.
    const std::u32string from = decode_utf8(set1);
    std::u32string to = decode_utf8(set2);
    if (to.size() < from.size()) {
        to.append(from.size() - to.size(), to.back());
    }

    std::unordered_map<char32_t, char32_t> map;
    map.reserve(from.size());
    for (std::size_t i = 0; i < from.size(); ++i) {
        map.insert_or_assign(from[i], to[i]);
    }

    std::u32string mapped = decode_utf8(input);
    for (char32_t& code_point : mapped) {
        // Characters without a mapping are left as they are.
        if (const auto hit = map.find(code_point); hit != map.end()) {
            code_point = hit->second;
        }
    }

    std::string out;
    utf8::utf32to8(mapped.begin(), mapped.end(), std::back_inserter(out));
    return out;
}

// Algorithm adapted from python-tr make_char_list.
// This does not support all features of POSIX tr; to be replaced later by the
// algorithm used in the heirloom toolchest's tr.
//
// Expands ranges such as U"a-e" into U"abcde".
//
// - `\-` yields a literal hyphen. A backslash before any other character is
//   kept in the output together with that character.
// - A hyphen with nothing before it, or at the very end of `set`, is a
//   literal hyphen.
// - A range whose end is not greater than its start (e.g. "z-a", "a-a")
//   expands to nothing; only its two endpoints are emitted.
auto expand_set(std::u32string_view set) -> std::u32string {
    std::u32string list;
    bool escape{false};
    bool hyphen{false};
    for (char32_t c : set) {
        if (c == U'\\') {
            if (not escape) {
                escape = true;
                // Kept for now; removed again if the next character is '-'.
                list.push_back(c);
                continue;
            }
        } else if (c == U'-') {
            if (escape) {
                // Drop the backslash pushed on the previous iteration.
                list.pop_back();
            } else if (not list.empty()) {
                hyphen = true;
                continue;
            }
            // Otherwise there is no range start yet: fall through and treat
            // the hyphen as a literal.
        } else if (hyphen) {
            // `hyphen` is only ever set while `list` is non-empty, so back()
            // is valid. The loop body never runs when c <= start, which keeps
            // degenerate and reversed ranges from wrapping around.
            const char32_t start = list.back() + 1;
            for (char32_t next = start; next < c; ++next) {
                list.push_back(next);
            }
        }
        list.push_back(c);
        escape = false;
        hyphen = false;
    }
    if (hyphen) {
        // A range operator with no end point is an ordinary hyphen.
        list.push_back(U'-');
    }
    return list;
}

} // namespace tr