|  | 
| 1 | 1 | #include <algorithm> | 
| 2 |  | -#include <array> | 
| 3 |  | -#include <bitset> | 
| 4 | 2 | #include <cassert> | 
| 5 |  | -#include <cctype> | 
| 6 |  | -#include <cstddef> | 
| 7 |  | -#include <limits> | 
|  | 3 | +#include <iostream> | 
| 8 | 4 | #include <memory> | 
| 9 | 5 | #include <string> | 
| 10 |  | -#include <utility> | 
|  | 6 | +#include <unordered_map> | 
| 11 | 7 | #include <vector> | 
| 12 | 8 | 
 | 
| 13 |  | -#include <iostream> | 
| 14 |  | - | 
| 15 |  | -using std::begin; | 
| 16 |  | -using std::end; | 
| 17 |  | - | 
| 18 |  | -namespace huffman { | 
| 19 |  | - | 
| 20 |  | -[[noreturn]] inline void unreachable() { | 
| 21 |  | - std::cerr << "this should never happen\n"; | 
| 22 |  | - std::abort(); | 
| 23 |  | -} | 
| 24 |  | - | 
| 25 |  | -// --- interface --- | 
| 26 |  | -class codebook { | 
| 27 |  | - struct node { | 
| 28 |  | - int frequency; | 
| 29 |  | - | 
| 30 |  | - node(int freq) : frequency(freq) {} | 
| 31 |  | - virtual ~node() = 0; | 
| 32 |  | - }; | 
| 33 |  | - | 
| 34 |  | - // never null | 
| 35 |  | - using node_ptr = std::unique_ptr<node const>; | 
| 36 |  | - using bitstring = std::vector<bool>; | 
| 37 |  | - | 
| 38 |  | - // this is a flatmap between char and a bitstring | 
| 39 |  | - // there should only ever be one character with a given | 
| 40 |  | - // value at any time. | 
| 41 |  | - using encoder_t = std::vector<std::pair<char, bitstring>>; | 
| 42 |  | - | 
| 43 |  | - struct leaf final : node { | 
| 44 |  | - char key; | 
| 45 |  | - | 
| 46 |  | - leaf(int freq, char key) : node(freq), key(key) {} | 
| 47 |  | - }; | 
| 48 |  | - | 
| 49 |  | - struct branch final : node { | 
| 50 |  | - node_ptr lhs; | 
| 51 |  | - node_ptr rhs; | 
| 52 |  | - | 
| 53 |  | - branch(node_ptr lhs, node_ptr rhs) | 
| 54 |  | - : node(lhs->frequency + rhs->frequency), lhs(std::move(lhs)), | 
| 55 |  | - rhs(std::move(rhs)) {} | 
| 56 |  | - }; | 
| 57 |  | - | 
| 58 |  | - // this allows us to share [codebook]s among encoded strings | 
| 59 |  | - struct data { | 
| 60 |  | - node_ptr decoder; | 
| 61 |  | - encoder_t encoder; | 
| 62 |  | - }; | 
| 63 |  | - std::shared_ptr<data const> underlying_; | 
// One node of the Huffman tree.
//
// A node is either a leaf (carries `ch`, has no children) or a branch (owns
// `left` and `right`). Every node also records a non-owning pointer to its
// parent and which side of that parent it hangs on, so a code can be read by
// walking from a leaf up to the root.
struct node {
  // Creates a leaf for character `a_ch` that occurs `a_weight` times.
  node(int a_weight, char a_ch) : ch(a_ch), weight(a_weight) {}

  // Creates a branch that takes ownership of both children. Its weight is the
  // sum of the children's weights. Both children must be non-null.
  node(std::unique_ptr<node>&& a_left, std::unique_ptr<node>&& a_right)
      : left(std::move(a_left)), right(std::move(a_right)) {
    assert(left != nullptr && right != nullptr);
    weight = left->weight + right->weight;
    left->parent = this;
    left->branch = branch::left;
    right->parent = this;
    right->branch = branch::right;
  }

  // Children keep a raw pointer to this object, so relocating a node would
  // leave them pointing at the old address. Declaring the copy operations as
  // deleted also suppresses the implicit move operations.
  node(const node&) = delete;
  node& operator=(const node&) = delete;

  // leaf fields
  char ch = '\0';

  // branch fields
  std::unique_ptr<node> left;
  std::unique_ptr<node> right;

  // common fields
  node* parent = nullptr;  // non-owning; nullptr for the root
  enum class branch { left, right };
  // The member reuses the enum's name; the elaborated type specifier
  // (`enum branch`) names the type unambiguously regardless of the member.
  enum branch branch = branch::left;
  int weight = 0;
};
| 75 | 33 | 
 | 
| 76 |  | -struct encoded_string { | 
| 77 |  | - codebook codes; | 
| 78 |  | - std::vector<bool> string; | 
| 79 |  | - | 
| 80 |  | - explicit encoded_string(std::string const& s) | 
| 81 |  | - : codes(begin(s), end(s)), string(codes.encode(begin(s), end(s))) {} | 
| 82 |  | - | 
| 83 |  | - encoded_string(codebook codes, std::string const& s) | 
| 84 |  | - : codes(codes), string(codes.encode(begin(s), end(s))) {} | 
|  | 34 | +struct codebook { | 
|  | 35 | + // maps characters to leafs in the tree | 
|  | 36 | + std::unordered_map<char, node*> char_map; | 
| 85 | 37 | 
 | 
| 86 |  | - std::string decoded() const { | 
| 87 |  | - return codes.decode(begin(string), end(string)); | 
| 88 |  | - } | 
|  | 38 | + // root of the tree | 
|  | 39 | + std::unique_ptr<node> root; | 
| 89 | 40 | }; | 
| 90 | 41 | 
 | 
| 91 |  | -// --- implementation --- | 
| 92 |  | -inline codebook::node::~node() {} | 
| 93 |  | - | 
| 94 |  | -inline std::vector<bool> with_new_bit(std::vector<bool> bits, bool b) { | 
| 95 |  | - bits.push_back(b); | 
| 96 |  | - return bits; | 
| 97 |  | -} | 
| 98 |  | - | 
| 99 |  | -template <typename Iter> | 
| 100 |  | -codebook::codebook(Iter const first, Iter const last) { | 
| 101 |  | - struct helper { | 
| 102 |  | - static node_ptr make_decoder(Iter const first, Iter const last) { | 
| 103 |  | - // in this part of the function, we build up a frequency list | 
| 104 |  | - // sorted by frequency, descending | 
| 105 |  | - auto freq = std::vector<leaf>(); | 
| 106 |  | - | 
| 107 |  | - std::for_each(first, last, [&freq](char c) { | 
| 108 |  | - auto const it = std::find_if( | 
| 109 |  | - begin(freq), end(freq), [c](auto const& p) { return p.key == c; }); | 
| 110 |  | - if (it != end(freq)) { | 
| 111 |  | - // it's already in the list | 
| 112 |  | - it->frequency += 1; | 
| 113 |  | - } else { | 
| 114 |  | - // it's not already in the list | 
| 115 |  | - freq.emplace_back(1, c); | 
| 116 |  | - } | 
| 117 |  | - }); | 
| 118 |  | - | 
| 119 |  | - if (freq.empty()) { | 
| 120 |  | - throw std::invalid_argument("attempted to codebook an empty range"); | 
| 121 |  | - } | 
| 122 |  | - | 
| 123 |  | - std::sort(begin(freq), end(freq), [](auto const& lhs, auto const& rhs) { | 
| 124 |  | - return lhs.frequency > rhs.frequency; | 
| 125 |  | - }); | 
| 126 |  | - | 
| 127 |  | - auto ret = std::vector<std::unique_ptr<node>>(); | 
| 128 |  | - std::transform( | 
| 129 |  | - begin(freq), end(freq), std::back_inserter(ret), [](auto const l) { | 
| 130 |  | - return std::make_unique<leaf>(l); | 
| 131 |  | - }); | 
| 132 |  | - | 
| 133 |  | - while (ret.size() > 1) { | 
| 134 |  | - auto rhs = std::move(ret.back()); | 
| 135 |  | - ret.pop_back(); | 
| 136 |  | - auto lhs = std::move(ret.back()); | 
| 137 |  | - ret.pop_back(); | 
| 138 |  | - | 
| 139 |  | - auto new_node = | 
| 140 |  | - std::make_unique<branch>(std::move(lhs), std::move(rhs)); | 
| 141 |  | - auto const new_freq = new_node->frequency; | 
| 142 |  | - | 
| 143 |  | - // look for the first element with a smaller frequency | 
| 144 |  | - auto const it = | 
| 145 |  | - std::find_if(begin(ret), end(ret), [new_freq](auto const& n) { | 
| 146 |  | - return n->frequency < new_freq; | 
| 147 |  | - }); | 
| 148 |  | - // and insert the new_node before that element | 
| 149 |  | - ret.insert(it, std::move(new_node)); | 
| 150 |  | - // in this way, we keep the list sorted by frequency | 
| 151 |  | - } | 
| 152 |  | - | 
| 153 |  | - return std::move(ret.back()); | 
| 154 |  | - } | 
| 155 |  | - | 
| 156 |  | - static void | 
| 157 |  | - encoder_rec(node const* cur, std::vector<bool> bits, encoder_t& out) { | 
| 158 |  | - if (auto l = dynamic_cast<leaf const*>(cur)) { | 
| 159 |  | - out.push_back(std::make_pair(l->key, std::move(bits))); | 
| 160 |  | - } else if (auto b = dynamic_cast<branch const*>(cur)) { | 
| 161 |  | - encoder_rec(b->lhs.get(), with_new_bit(bits, 0), out); | 
| 162 |  | - encoder_rec(b->rhs.get(), with_new_bit(std::move(bits), 1), out); | 
| 163 |  | - } else { | 
| 164 |  | - unreachable(); | 
| 165 |  | - } | 
| 166 |  | - } | 
| 167 |  | - | 
| 168 |  | - static encoder_t make_encoder(node const& decoder) { | 
| 169 |  | - auto ret = encoder_t(); | 
| 170 |  | - | 
| 171 |  | - encoder_rec(&decoder, std::vector<bool>(), ret); | 
| 172 |  | - | 
| 173 |  | - return ret; | 
| 174 |  | - } | 
| 175 |  | - }; | 
| 176 |  | - | 
| 177 |  | - auto decoder = helper::make_decoder(first, last); | 
| 178 |  | - auto encoder = helper::make_encoder(*decoder); | 
| 179 |  | - underlying_ = std::make_shared<data const>( | 
| 180 |  | - data{std::move(decoder), std::move(encoder)}); | 
| 181 |  | -} | 
| 182 |  | - | 
| 183 |  | -template <typename Iter> | 
| 184 |  | -std::vector<bool> codebook::encode(Iter const first, Iter const last) const { | 
| 185 |  | - std::vector<bool> ret; | 
|  | 42 | +std::vector<bool> encode(const std::string& string_to_encode, codebook& cb) { | 
|  | 43 | + // first pass | 
|  | 44 | + // calculate weights | 
|  | 45 | + std::unordered_map<char, int> weights; | 
|  | 46 | + std::vector<std::unique_ptr<node>> orphans; | 
|  | 47 | + for (char ch : string_to_encode) { | 
|  | 48 | + ++weights[ch]; | 
|  | 49 | + } | 
| 186 | 50 | 
 | 
| 187 |  | - auto& encoder = underlying_->encoder; | 
| 188 |  | - std::for_each(first, last, [&ret, &encoder](char c) { | 
| 189 |  | - auto const it = | 
| 190 |  | - std::find_if(begin(encoder), end(encoder), [c](auto const& p) { | 
| 191 |  | - return p.first == c; | 
|  | 51 | + // build tree | 
|  | 52 | + for (auto&& w : weights) { | 
|  | 53 | + orphans.push_back(std::make_unique<node>(w.second, w.first)); | 
|  | 54 | + cb.char_map[orphans.back()->ch] = orphans.back().get(); | 
|  | 55 | + } | 
|  | 56 | + // make parents for orphans until only one root orphan is remained | 
|  | 57 | + while (orphans.size() > 1) { | 
|  | 58 | + std::sort( | 
|  | 59 | + std::begin(orphans), | 
|  | 60 | + std::end(orphans), | 
|  | 61 | + [](const std::unique_ptr<node>& a, const std::unique_ptr<node>& b) { | 
|  | 62 | + return a->weight > b->weight; | 
| 192 | 63 |  }); | 
| 193 |  | - if (it != end(encoder)) { | 
| 194 |  | - auto const& code = it->second; | 
| 195 |  | - std::copy(begin(code), end(code), std::back_inserter(ret)); | 
| 196 |  | - } else { | 
| 197 |  | - throw std::invalid_argument( | 
| 198 |  | - "The range has a character which was not in the huffman set"); | 
| 199 |  | - } | 
| 200 |  | - }); | 
| 201 |  | - | 
| 202 |  | - return ret; | 
| 203 |  | -} | 
| 204 |  | - | 
| 205 |  | -template <typename Iter> | 
| 206 |  | -std::string codebook::decode(Iter const first, Iter const last) const { | 
| 207 |  | - std::string ret; | 
| 208 |  | - | 
| 209 |  | - node const* const top = underlying_->decoder.get(); | 
| 210 |  | - | 
| 211 |  | - // returns a pair: | 
| 212 |  | - // the second member is the decoded character | 
| 213 |  | - // the first member is the place we've gotten to in the range | 
| 214 |  | - // i.e., if [0] is an 'a', and we have | 
| 215 |  | - // [it, last) = { 0, 1, 1, 0 } | 
| 216 |  | - // we return (it', 'a') such that | 
| 217 |  | - // [it', last) = { 1, 1, 0 } | 
| 218 |  | - auto decode_single = | 
| 219 |  | - [top](Iter it, Iter const last) -> std::pair<Iter, char> { | 
| 220 |  | - node const* current_node = top; | 
|  | 64 | + auto right = std::move(orphans.back()); | 
|  | 65 | + orphans.pop_back(); | 
|  | 66 | + auto left = std::move(orphans.back()); | 
|  | 67 | + orphans.pop_back(); | 
|  | 68 | + orphans.push_back( | 
|  | 69 | + std::make_unique<node>(std::move(left), std::move(right))); | 
|  | 70 | + } | 
| 221 | 71 | 
 | 
| 222 |  | - for (; it != last; ++it) { | 
| 223 |  | - if (auto l = dynamic_cast<leaf const*>(current_node)) { | 
| 224 |  | - return std::make_pair(it, l->key); | 
| 225 |  | - } else if (auto b = dynamic_cast<branch const*>(current_node)) { | 
| 226 |  | - if (*it) { | 
| 227 |  | - current_node = b->rhs.get(); | 
| 228 |  | - } else { | 
| 229 |  | - current_node = b->lhs.get(); | 
| 230 |  | - } | 
| 231 |  | - } else { | 
| 232 |  | - unreachable(); | 
| 233 |  | - } | 
|  | 72 | + // move last orphan to codebook | 
|  | 73 | + assert(orphans.size() == 1); | 
|  | 74 | + cb.root = std::move(orphans.front()); | 
|  | 75 | + | 
|  | 76 | + // second pass | 
|  | 77 | + // encoding | 
|  | 78 | + std::vector<bool> res; | 
|  | 79 | + for (auto ch : string_to_encode) { | 
|  | 80 | + auto node = cb.char_map[ch]; | 
|  | 81 | + std::vector<bool> compressed_char; | 
|  | 82 | + while (node->parent != nullptr) { | 
|  | 83 | + compressed_char.push_back(node->branch != node::branch::left); | 
|  | 84 | + node = node->parent; | 
| 234 | 85 |  } | 
|  | 86 | + res.insert(std::end(res), compressed_char.rbegin(), compressed_char.rend()); | 
|  | 87 | + } | 
|  | 88 | + return res; | 
|  | 89 | +} | 
| 235 | 90 | 
 | 
| 236 |  | - if (auto l = dynamic_cast<leaf const*>(current_node)) { | 
| 237 |  | - return std::make_pair(last, l->key); | 
| 238 |  | - } else { | 
| 239 |  | - throw std::invalid_argument( | 
| 240 |  | - "The range was not encoded with this huffman set"); | 
|  | 91 | +std::string decode(const codebook& cb, const std::vector<bool>& compressed_bit_stream) { | 
|  | 92 | + std::string res; | 
|  | 93 | + node* node = cb.root.get(); | 
|  | 94 | + for (auto bit : compressed_bit_stream) { | 
|  | 95 | + if (!bit) | 
|  | 96 | + node = node->left.get(); | 
|  | 97 | + else | 
|  | 98 | + node = node->right.get(); | 
|  | 99 | + if (node->right == nullptr) { | 
|  | 100 | + res.push_back(node->ch); | 
|  | 101 | + node = cb.root.get(); | 
| 241 | 102 |  } | 
| 242 |  | - }; | 
| 243 |  | - | 
| 244 |  | - for (auto it = first; it != last;) { | 
| 245 |  | - auto p = decode_single(it, last); | 
| 246 |  | - it = p.first; | 
| 247 |  | - ret.push_back(p.second); | 
| 248 | 103 |  } | 
| 249 |  | - | 
| 250 |  | - return ret; | 
|  | 104 | + return res; | 
| 251 | 105 | } | 
| 252 | 106 | 
 | 
| 253 |  | -} // namespace huffman | 
| 254 |  | - | 
| 255 | 107 | int main() { | 
| 256 |  | - std::string to_be_encoded = R"(bibbity bobbity)"; | 
|  | 108 | + std::string to_be_encoded = "bibbity bobbity"; | 
|  | 109 | + | 
|  | 110 | + codebook cb; | 
| 257 | 111 | 
 | 
| 258 |  | - auto encoded = huffman::encoded_string(to_be_encoded); | 
|  | 112 | + auto encoded = encode(to_be_encoded, cb); | 
| 259 | 113 | 
 | 
| 260 | 114 |  std::cout << "Encoded, the string looks like: "; | 
| 261 |  | - for (bool b : encoded.string) { | 
|  | 115 | + for (bool b : encoded) { | 
| 262 | 116 |  std::cout << b; | 
| 263 | 117 |  } | 
| 264 |  | - std::cout << "\nand decoded, the string looks like: " << encoded.decoded(); | 
|  | 118 | + | 
|  | 119 | + std::cout << "\nand decoded, the string looks like: " | 
|  | 120 | + << decode(cb, encoded); | 
| 265 | 121 |  std::cout << "\n\nAs opposed to the original, which is " | 
| 266 | 122 |  << to_be_encoded.size() * 8 << " bits, the encoded has size " | 
| 267 |  | - << encoded.string.size() << ".\n"; | 
|  | 123 | + << encoded.size() << ".\n"; | 
| 268 | 124 | } | 
0 commit comments