Skip to content

Commit 71c0d42

Browse files
committed
Rework C++ implementation for Huffman encoding
- Make the implementation shorter and easier to understand - Flatten the namespace hierarchy - Combine Leaf and Branch into a single Node class for the tree - Make the codebook explicit - Use std::string for string_to_encode - Remove noise (for instance, delete the function with_new_bit()) - Use an unordered map for the weights instead of a vector - Instead of inserting into the orphan array at the sorted position, sort it on every iteration (debatable) - Remove the structure for caching the bits of each character; calculate them every time - Remove error handling - Change the encoding interface: encode() and decode() instead of huffman::encode_string() and encoded.decoded()
1 parent 4aa3dd8 commit 71c0d42

File tree

1 file changed

+92
-236
lines changed

1 file changed

+92
-236
lines changed
Lines changed: 92 additions & 236 deletions
Original file line numberDiff line numberDiff line change
@@ -1,268 +1,124 @@
11
#include <algorithm>
2-
#include <array>
3-
#include <bitset>
42
#include <cassert>
5-
#include <cctype>
6-
#include <cstddef>
7-
#include <limits>
3+
#include <iostream>
84
#include <memory>
95
#include <string>
10-
#include <utility>
6+
#include <unordered_map>
117
#include <vector>
128

13-
#include <iostream>
14-
15-
using std::begin;
16-
using std::end;
17-
18-
namespace huffman {
19-
20-
[[noreturn]] inline void unreachable() {
21-
std::cerr << "this should never happen\n";
22-
std::abort();
23-
}
24-
25-
// --- interface ---
26-
class codebook {
27-
struct node {
28-
int frequency;
29-
30-
node(int freq) : frequency(freq) {}
31-
virtual ~node() = 0;
32-
};
33-
34-
// never null
35-
using node_ptr = std::unique_ptr<node const>;
36-
using bitstring = std::vector<bool>;
37-
38-
// this is a flatmap between char and a bitstring
39-
// there should only ever be one character with a given
40-
// value at any time.
41-
using encoder_t = std::vector<std::pair<char, bitstring>>;
42-
43-
struct leaf final : node {
44-
char key;
45-
46-
leaf(int freq, char key) : node(freq), key(key) {}
47-
};
48-
49-
struct branch final : node {
50-
node_ptr lhs;
51-
node_ptr rhs;
52-
53-
branch(node_ptr lhs, node_ptr rhs)
54-
: node(lhs->frequency + rhs->frequency), lhs(std::move(lhs)),
55-
rhs(std::move(rhs)) {}
56-
};
57-
58-
// this allows us to share [codebook]s among encoded strings
59-
struct data {
60-
node_ptr decoder;
61-
encoder_t encoder;
62-
};
63-
std::shared_ptr<data const> underlying_;
9+
// A node of the Huffman tree.
//
// A leaf carries a character (`ch`) and has no children; a branch owns its
// two children through `left` / `right`. Every node also records a non-owning
// pointer to its parent and which side of that parent it hangs on, so that a
// code can be read by walking from a leaf up to the root.
//
// The branch constructor stores `this` in its children, so a branch must stay
// at the address it was constructed at (e.g. behind a std::unique_ptr);
// moving a branch object would leave its children's `parent` pointers stale.
struct node {
  // Builds a leaf for character `a_ch` with weight `a_weight`.
  node(int a_weight, char a_ch) : ch(a_ch), weight(a_weight) {}

  // Builds a branch that takes ownership of both children. Its weight is the
  // sum of the children's weights. Both children must be non-null.
  node(std::unique_ptr<node>&& a_left, std::unique_ptr<node>&& a_right)
      : left(std::move(a_left)), right(std::move(a_right)) {
    assert(left != nullptr && right != nullptr);
    // Computed in the body rather than in the initializer list so that it
    // does not depend on the declaration order of `left`, `right`, `weight`.
    weight = left->weight + right->weight;
    left->parent = this;
    left->branch = branch::left;
    right->parent = this;
    right->branch = branch::right;
  }

  // leaf fields
  char ch = '\0';

  // branch fields
  std::unique_ptr<node> left;
  std::unique_ptr<node> right;

  // common fields
  node* parent = nullptr;  // non-owning; nullptr for a node without a parent
  // The enum and the member share one declaration on purpose: naming the type
  // `branch` first and declaring a member `branch` afterwards changes the
  // meaning of the name inside the class, which GCC rejects. Declared this
  // way, `node::branch::left` and `n->branch` both remain valid.
  enum class branch { left, right } branch = branch::left;
  int weight = 0;
};
7533

76-
struct encoded_string {
77-
codebook codes;
78-
std::vector<bool> string;
79-
80-
explicit encoded_string(std::string const& s)
81-
: codes(begin(s), end(s)), string(codes.encode(begin(s), end(s))) {}
82-
83-
encoded_string(codebook codes, std::string const& s)
84-
: codes(codes), string(codes.encode(begin(s), end(s))) {}
34+
struct codebook {
35+
// maps characters to leafs in the tree
36+
std::unordered_map<char, node*> char_map;
8537

86-
std::string decoded() const {
87-
return codes.decode(begin(string), end(string));
88-
}
38+
// root of the tree
39+
std::unique_ptr<node> root;
8940
};
9041

91-
// --- implementation ---
92-
inline codebook::node::~node() {}
93-
94-
inline std::vector<bool> with_new_bit(std::vector<bool> bits, bool b) {
95-
bits.push_back(b);
96-
return bits;
97-
}
98-
99-
template <typename Iter>
100-
codebook::codebook(Iter const first, Iter const last) {
101-
struct helper {
102-
static node_ptr make_decoder(Iter const first, Iter const last) {
103-
// in this part of the function, we build up a frequency list
104-
// sorted by frequency, descending
105-
auto freq = std::vector<leaf>();
106-
107-
std::for_each(first, last, [&freq](char c) {
108-
auto const it = std::find_if(
109-
begin(freq), end(freq), [c](auto const& p) { return p.key == c; });
110-
if (it != end(freq)) {
111-
// it's already in the list
112-
it->frequency += 1;
113-
} else {
114-
// it's not already in the list
115-
freq.emplace_back(1, c);
116-
}
117-
});
118-
119-
if (freq.empty()) {
120-
throw std::invalid_argument("attempted to codebook an empty range");
121-
}
122-
123-
std::sort(begin(freq), end(freq), [](auto const& lhs, auto const& rhs) {
124-
return lhs.frequency > rhs.frequency;
125-
});
126-
127-
auto ret = std::vector<std::unique_ptr<node>>();
128-
std::transform(
129-
begin(freq), end(freq), std::back_inserter(ret), [](auto const l) {
130-
return std::make_unique<leaf>(l);
131-
});
132-
133-
while (ret.size() > 1) {
134-
auto rhs = std::move(ret.back());
135-
ret.pop_back();
136-
auto lhs = std::move(ret.back());
137-
ret.pop_back();
138-
139-
auto new_node =
140-
std::make_unique<branch>(std::move(lhs), std::move(rhs));
141-
auto const new_freq = new_node->frequency;
142-
143-
// look for the first element with a smaller frequency
144-
auto const it =
145-
std::find_if(begin(ret), end(ret), [new_freq](auto const& n) {
146-
return n->frequency < new_freq;
147-
});
148-
// and insert the new_node before that element
149-
ret.insert(it, std::move(new_node));
150-
// in this way, we keep the list sorted by frequency
151-
}
152-
153-
return std::move(ret.back());
154-
}
155-
156-
static void
157-
encoder_rec(node const* cur, std::vector<bool> bits, encoder_t& out) {
158-
if (auto l = dynamic_cast<leaf const*>(cur)) {
159-
out.push_back(std::make_pair(l->key, std::move(bits)));
160-
} else if (auto b = dynamic_cast<branch const*>(cur)) {
161-
encoder_rec(b->lhs.get(), with_new_bit(bits, 0), out);
162-
encoder_rec(b->rhs.get(), with_new_bit(std::move(bits), 1), out);
163-
} else {
164-
unreachable();
165-
}
166-
}
167-
168-
static encoder_t make_encoder(node const& decoder) {
169-
auto ret = encoder_t();
170-
171-
encoder_rec(&decoder, std::vector<bool>(), ret);
172-
173-
return ret;
174-
}
175-
};
176-
177-
auto decoder = helper::make_decoder(first, last);
178-
auto encoder = helper::make_encoder(*decoder);
179-
underlying_ = std::make_shared<data const>(
180-
data{std::move(decoder), std::move(encoder)});
181-
}
182-
183-
template <typename Iter>
184-
std::vector<bool> codebook::encode(Iter const first, Iter const last) const {
185-
std::vector<bool> ret;
42+
std::vector<bool> encode(const std::string& string_to_encode, codebook& cb) {
43+
// first pass
44+
// calculate weights
45+
std::unordered_map<char, int> weights;
46+
std::vector<std::unique_ptr<node>> orphans;
47+
for (char ch : string_to_encode) {
48+
++weights[ch];
49+
}
18650

187-
auto& encoder = underlying_->encoder;
188-
std::for_each(first, last, [&ret, &encoder](char c) {
189-
auto const it =
190-
std::find_if(begin(encoder), end(encoder), [c](auto const& p) {
191-
return p.first == c;
51+
// build tree
52+
for (auto&& w : weights) {
53+
orphans.push_back(std::make_unique<node>(w.second, w.first));
54+
cb.char_map[orphans.back()->ch] = orphans.back().get();
55+
}
56+
// make parents for orphans until only one root orphan is remained
57+
while (orphans.size() > 1) {
58+
std::sort(
59+
std::begin(orphans),
60+
std::end(orphans),
61+
[](const std::unique_ptr<node>& a, const std::unique_ptr<node>& b) {
62+
return a->weight > b->weight;
19263
});
193-
if (it != end(encoder)) {
194-
auto const& code = it->second;
195-
std::copy(begin(code), end(code), std::back_inserter(ret));
196-
} else {
197-
throw std::invalid_argument(
198-
"The range has a character which was not in the huffman set");
199-
}
200-
});
201-
202-
return ret;
203-
}
204-
205-
template <typename Iter>
206-
std::string codebook::decode(Iter const first, Iter const last) const {
207-
std::string ret;
208-
209-
node const* const top = underlying_->decoder.get();
210-
211-
// returns a pair:
212-
// the second member is the decoded character
213-
// the first member is the place we've gotten to in the range
214-
// i.e., if [0] is an 'a', and we have
215-
// [it, last) = { 0, 1, 1, 0 }
216-
// we return (it', 'a') such that
217-
// [it', last) = { 1, 1, 0 }
218-
auto decode_single =
219-
[top](Iter it, Iter const last) -> std::pair<Iter, char> {
220-
node const* current_node = top;
64+
auto right = std::move(orphans.back());
65+
orphans.pop_back();
66+
auto left = std::move(orphans.back());
67+
orphans.pop_back();
68+
orphans.push_back(
69+
std::make_unique<node>(std::move(left), std::move(right)));
70+
}
22171

222-
for (; it != last; ++it) {
223-
if (auto l = dynamic_cast<leaf const*>(current_node)) {
224-
return std::make_pair(it, l->key);
225-
} else if (auto b = dynamic_cast<branch const*>(current_node)) {
226-
if (*it) {
227-
current_node = b->rhs.get();
228-
} else {
229-
current_node = b->lhs.get();
230-
}
231-
} else {
232-
unreachable();
233-
}
72+
// move last orphan to codebook
73+
assert(orphans.size() == 1);
74+
cb.root = std::move(orphans.front());
75+
76+
// second pass
77+
// encoding
78+
std::vector<bool> res;
79+
for (auto ch : string_to_encode) {
80+
auto node = cb.char_map[ch];
81+
std::vector<bool> compressed_char;
82+
while (node->parent != nullptr) {
83+
compressed_char.push_back(node->branch != node::branch::left);
84+
node = node->parent;
23485
}
86+
res.insert(std::end(res), compressed_char.rbegin(), compressed_char.rend());
87+
}
88+
return res;
89+
}
23590

236-
if (auto l = dynamic_cast<leaf const*>(current_node)) {
237-
return std::make_pair(last, l->key);
238-
} else {
239-
throw std::invalid_argument(
240-
"The range was not encoded with this huffman set");
91+
std::string decode(const codebook& cb, const std::vector<bool>& compressed_bit_stream) {
92+
std::string res;
93+
node* node = cb.root.get();
94+
for (auto bit : compressed_bit_stream) {
95+
if (!bit)
96+
node = node->left.get();
97+
else
98+
node = node->right.get();
99+
if (node->right == nullptr) {
100+
res.push_back(node->ch);
101+
node = cb.root.get();
241102
}
242-
};
243-
244-
for (auto it = first; it != last;) {
245-
auto p = decode_single(it, last);
246-
it = p.first;
247-
ret.push_back(p.second);
248103
}
249-
250-
return ret;
104+
return res;
251105
}
252106

253-
} // namespace huffman
254-
255107
int main() {
256-
std::string to_be_encoded = R"(bibbity bobbity)";
108+
std::string to_be_encoded = "bibbity bobbity";
109+
110+
codebook cb;
257111

258-
auto encoded = huffman::encoded_string(to_be_encoded);
112+
auto encoded = encode(to_be_encoded, cb);
259113

260114
std::cout << "Encoded, the string looks like: ";
261-
for (bool b : encoded.string) {
115+
for (bool b : encoded) {
262116
std::cout << b;
263117
}
264-
std::cout << "\nand decoded, the string looks like: " << encoded.decoded();
118+
119+
std::cout << "\nand decoded, the string looks like: "
120+
<< decode(cb, encoded);
265121
std::cout << "\n\nAs opposed to the original, which is "
266122
<< to_be_encoded.size() * 8 << " bits, the encoded has size "
267-
<< encoded.string.size() << ".\n";
123+
<< encoded.size() << ".\n";
268124
}

0 commit comments

Comments
 (0)