-
- Notifications
You must be signed in to change notification settings - Fork 359
huffman implementation in C++ #75
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,280 @@ | ||
#include <algorithm>
#include <array>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
| ||
using std::begin; | ||
using std::end; | ||
| ||
namespace huffman {

// Reports an impossible state on stderr and aborts the program.
// [[noreturn]] is the standard attribute meaning "this function does not
// return"; it lets the call sit in the final branch of an if/else chain.
[[noreturn]] inline void unreachable() {
  std::cerr << "this should never happen\n";
  std::abort();
}

// --- interface ---

// A Huffman code built from a range of characters.
//
// A codebook owns a decoding tree and a matching encoding table. Both live
// behind a shared_ptr to const data, so copying a codebook only shares it.
class codebook {
  // Base of the tree nodes; abstract through its pure virtual destructor.
  struct node {
    int frequency;

    explicit node(int freq) : frequency(freq) {}
    virtual ~node() = 0;
  };

  // never null
  using node_ptr = std::unique_ptr<node const>;
  using bitstring = std::vector<bool>;

  // this is a flatmap between char and a bitstring
  // there should only ever be one character with a given
  // value at any time.
  using encoder_t = std::vector<std::pair<char, bitstring>>;

  // A single character together with its number of occurrences.
  struct leaf final : node {
    char key;

    leaf(int freq, char key) : node(freq), key(key) {}
  };

  // An inner node; its frequency is the sum of both children.
  struct branch final : node {
    node_ptr lhs;
    node_ptr rhs;

    // The base is initialised first, so both parameters are still valid when
    // their frequencies are read; only afterwards are they moved from.
    branch(node_ptr lhs, node_ptr rhs)
      : node(lhs->frequency + rhs->frequency), lhs(std::move(lhs)),
        rhs(std::move(rhs)) {}
  };

  // this allows us to share [codebook]s among encoded strings
  struct data {
    node_ptr decoder;
    encoder_t encoder;
  };
  std::shared_ptr<data const> underlying_;

  // Builds the decoding tree for the characters in [first, last).
  template <class Iter>
  static node_ptr make_decoder(Iter first, Iter last);

  // Appends one (character, code) entry to `out` for every leaf below `cur`;
  // `prefix` holds the bits on the path from the root down to `cur`.
  static void collect_codes(node const* cur, bitstring prefix, encoder_t& out);

  // Derives the encoding table from a finished decoding tree.
  static encoder_t make_encoder(node const& root);

public:
  // The member functions are templated on the iterator type so that the same
  // code accepts any iterator range: std::string iterators for text and
  // std::vector<bool> iterators for bits are different types.

  // Builds a codebook from the characters in [first, last).
  // Throws std::invalid_argument if the range is empty.
  template <class Iter>
  codebook(Iter first, Iter last);

  // Returns the concatenated codes of the characters in [first, last).
  // Throws std::invalid_argument for a character that is not in the codebook.
  template <class Iter>
  std::vector<bool> encode(Iter first, Iter last) const;

  // Returns the characters spelled out by the bits in [first, last).
  // Throws std::invalid_argument if the bits do not form whole codes.
  template <class Iter>
  std::string decode(Iter first, Iter last) const;
};

// A bit string bundled with the codebook needed to decode it.
// `codes` has to stay declared before `string`: the constructors encode with
// the already-initialised `codes` member.
struct encoded_string {
  codebook codes;
  std::vector<bool> string;

  // Builds a codebook from `s` and encodes `s` with it.
  explicit encoded_string(std::string const& s)
    : codes(s.begin(), s.end()), string(codes.encode(s.begin(), s.end())) {}

  // Encodes `s` with an existing codebook. The parameter shadows the member
  // and is moved from, hence `this->codes` for the encode call.
  encoded_string(codebook codes, std::string const& s)
    : codes(std::move(codes)),
      string(this->codes.encode(s.begin(), s.end())) {}

  // Decodes the stored bits back into text.
  std::string decoded() const {
    return codes.decode(string.begin(), string.end());
  }
};

// --- implementation ---

// A pure virtual destructor still needs a definition, because the derived
// destructors call it.
inline codebook::node::~node() {}

// Returns a copy of `bits` with `b` appended.
inline std::vector<bool> with_new_bit(std::vector<bool> bits, bool b) {
  bits.push_back(b);
  return bits;
}

template <class Iter>
codebook::node_ptr
codebook::make_decoder(Iter const first, Iter const last) {
  // Count occurrences. `freq` keeps the characters in order of first
  // appearance; `slot` maps a character value to its index in `freq`, so each
  // input character costs one table lookup instead of a linear search.
  constexpr std::size_t alphabet_size =
    static_cast<std::size_t>(std::numeric_limits<unsigned char>::max()) + 1;
  // `freq` never holds more than alphabet_size entries, so this value is
  // never a valid index and can mark "not seen yet".
  constexpr std::size_t unseen = alphabet_size;

  std::array<std::size_t, alphabet_size> slot;
  slot.fill(unseen);
  std::vector<leaf> freq;

  std::for_each(first, last, [&](char c) {
    std::size_t& index = slot[static_cast<unsigned char>(c)];
    if (index == unseen) {
      index = freq.size();
      freq.emplace_back(1, c);
    } else {
      freq[index].frequency += 1;
    }
  });

  if (freq.empty()) {
    throw std::invalid_argument("attempted to codebook an empty range");
  }

  // sorted by frequency, descending
  std::sort(freq.begin(), freq.end(), [](leaf const& lhs, leaf const& rhs) {
    return lhs.frequency > rhs.frequency;
  });

  std::vector<std::unique_ptr<node>> pool;
  pool.reserve(freq.size());
  for (leaf const& l : freq) {
    pool.push_back(std::make_unique<leaf>(l));
  }

  // Repeatedly merge the two rarest nodes, which sit at the back of the list.
  while (pool.size() > 1) {
    auto rhs = std::move(pool.back());
    pool.pop_back();
    auto lhs = std::move(pool.back());
    pool.pop_back();

    auto merged = std::make_unique<branch>(std::move(lhs), std::move(rhs));
    int const merged_freq = merged->frequency;

    // Insert before the first node with a smaller frequency; in this way the
    // list stays sorted by frequency, descending.
    auto const pos = std::find_if(
      pool.begin(), pool.end(), [merged_freq](auto const& n) {
        return n->frequency < merged_freq;
      });
    pool.insert(pos, std::move(merged));
  }

  return std::move(pool.back());
}

inline void codebook::collect_codes(
  node const* cur, bitstring prefix, encoder_t& out) {
  if (auto const* l = dynamic_cast<leaf const*>(cur)) {
    out.emplace_back(l->key, std::move(prefix));
  } else if (auto const* b = dynamic_cast<branch const*>(cur)) {
    // left child = bit 0, right child = bit 1
    collect_codes(b->lhs.get(), with_new_bit(prefix, false), out);
    collect_codes(b->rhs.get(), with_new_bit(std::move(prefix), true), out);
  } else {
    unreachable();
  }
}

inline codebook::encoder_t codebook::make_encoder(node const& root) {
  encoder_t table;

  if (auto const* only = dynamic_cast<leaf const*>(&root)) {
    // A tree with one distinct character has no branches, so the path to the
    // leaf is empty. Give it the one-bit code "0" so that every occurrence
    // still takes up a bit and the text survives a round trip.
    table.emplace_back(only->key, bitstring(1, false));
    return table;
  }

  collect_codes(&root, bitstring(), table);
  return table;
}

template <class Iter>
codebook::codebook(Iter const first, Iter const last) {
  auto decoder = make_decoder(first, last);
  auto encoder = make_encoder(*decoder);
  // Allocate as non-const `data`; the conversion to shared_ptr<data const>
  // happens on assignment.
  underlying_ =
    std::make_shared<data>(data{std::move(decoder), std::move(encoder)});
}

template <class Iter>
std::vector<bool> codebook::encode(Iter const first, Iter const last) const {
  std::vector<bool> bits;

  encoder_t const& table = underlying_->encoder;
  std::for_each(first, last, [&bits, &table](char c) {
    auto const entry = std::find_if(
      table.begin(), table.end(), [c](auto const& p) { return p.first == c; });
    if (entry == table.end()) {
      throw std::invalid_argument(
        "The range has a character which was not in the huffman set");
    }
    bitstring const& code = entry->second;
    bits.insert(bits.end(), code.begin(), code.end());
  });

  return bits;
}

template <class Iter>
std::string codebook::decode(Iter const first, Iter const last) const {
  std::string text;

  node const* const root = underlying_->decoder.get();

  if (auto const* only = dynamic_cast<leaf const*>(root)) {
    // One-character codebook: every character is the single bit "0".
    for (auto it = first; it != last; ++it) {
      if (*it) {
        throw std::invalid_argument(
          "The range was not encoded with this huffman set");
      }
      text.push_back(only->key);
    }
    return text;
  }

  // Walk down the tree one bit at a time; reaching a leaf yields a character
  // and restarts the walk at the root.
  node const* cur = root;
  for (auto it = first; it != last; ++it) {
    auto const* b = dynamic_cast<branch const*>(cur);
    if (b == nullptr) {
      unreachable();
    }
    cur = (*it ? b->rhs : b->lhs).get();
    if (auto const* l = dynamic_cast<leaf const*>(cur)) {
      text.push_back(l->key);
      cur = root;
    }
  }

  // Stopping anywhere but the root means the bits ended in the middle of a
  // code.
  if (cur != root) {
    throw std::invalid_argument(
      "The range was not encoded with this huffman set");
  }

  return text;
}

} // namespace huffman
| ||
int main() { | ||
std::string to_be_encoded = | ||
R"(She should have died hereafter. | ||
There would have been a time for such a word. | ||
Tomorrow, and tomorrow, and tomorrow, | ||
Creeps in this petty pace from day to day | ||
To the last syllable of recorded time, | ||
And all our yesterdays have lighted fools | ||
The way to dusty death. Out, out, brief candle! | ||
Life's but a walking shadow, a poor player | ||
That struts and frets his hour upon the stage | ||
And then is heard no more. It is a tale | ||
Told by an idiot, full of sound and fury, | ||
Signifying nothing.)"; | ||
| ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just a note here that the main function is quite clean and that the Sound and the Fury is one of my favorite books (I know the quote's from Macbeth) | ||
auto encoded = huffman::encoded_string(to_be_encoded); | ||
| ||
std::cout << "Encoded, the string looks like: "; | ||
for (bool b : encoded.string) { | ||
std::cout << b; | ||
} | ||
std::cout << "\nand decoded, the string looks like: " << encoded.decoded(); | ||
std::cout << "\n\nAs opposed to the original, which is " | ||
<< to_be_encoded.size() * 8 << " bits, the encoded has size " | ||
<< encoded.string.size() << ".\n"; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we maybe add a few spaces in subsequent lines so we know it's within the `huffman` namespace? I've seen both spacing and no spacing, so I am not pushing one way or another here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The style for C++ is without spacing for a single namespace; we'd have to change all the other C++. This is one of the issues with single-file solutions :P
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fair enough. I just didn't remember the style. Thanks