tudocomp
– The TU Dortmund Compression Framework
LZWCompressor.hpp
Go to the documentation of this file.
1 #pragma once
2 
4 
7 
8 #include <tudocomp/Range.hpp>
9 #include <tudocomp/Coder.hpp>
10 
12 
13 // For default params
16 
17 namespace tdc {
18 
19 template<typename coder_t, typename dict_t>
20 class LZWCompressor: public Compressor {
21 private:
22  using node_t = typename dict_t::node_t;
23 
24  const lz78::factorid_t m_dict_max_size {0};
25 public:
26  inline LZWCompressor(Env&& env):
27  Compressor(std::move(env)),
28  m_dict_max_size(env.option("dict_size").as_integer())
29  {}
30 
31  inline static Meta meta() {
32  Meta m("compressor", "lzw", "Lempel-Ziv-Welch\n\n" LZ78_DICT_SIZE_DESC);
33  m.option("coder").templated<coder_t, BitCoder>("coder");
34  m.option("lz78trie").templated<dict_t, lz78::TernaryTrie>("lz78trie");
35  m.option("dict_size").dynamic(0);
36  return m;
37  }
38 
39  virtual void compress(Input& input, Output& out) override {
40  const size_t n = input.size();
41  const size_t reserved_size = isqrt(n)*2;
42  auto is = input.as_stream();
43 
44  // Stats
45  StatPhase phase("LZW Compression");
46  IF_STATS(size_t stat_dictionary_resets = 0);
47  IF_STATS(size_t stat_dict_counter_at_last_reset = 0);
48  IF_STATS(size_t stat_factor_count = 0);
49  size_t factor_count = 0;
50 
51  size_t remaining_characters = n; // position in the text
52  dict_t dict(env().env_for_option("lz78trie"), n, remaining_characters, reserved_size+ULITERAL_MAX+1);
53  auto reset_dict = [&dict] () {
54  dict.clear();
55  std::stringstream ss;
56  for(size_t i = 0; i < ULITERAL_MAX+1; ++i) {
57  const node_t node = dict.add_rootnode(i);
58  DCHECK_EQ(node.id(), dict.size() - 1);
59  DCHECK_EQ(node.id(), i);
60  ss << node.id() << ", ";
61  }
62  };
63  reset_dict();
64 
65  typename coder_t::Encoder coder(env().env_for_option("coder"), out, NoLiterals());
66 
67  char c;
68  if(!is.get(c)) return;
69 
70  node_t node = dict.get_rootnode(static_cast<uliteral_t>(c));
71 
72  while(is.get(c)) {
73  --remaining_characters;
74  node_t child = dict.find_or_insert(node, static_cast<uliteral_t>(c));
75  DVLOG(2) << " child " << child.id() << " #factor " << factor_count << " size " << dict.size() << " node " << node.id();
76 
77  if(child.is_new()) {
78  coder.encode(node.id(), Range(factor_count + ULITERAL_MAX + 1));
79  IF_STATS(stat_factor_count++);
80  factor_count++;
81  DCHECK_EQ(factor_count+ULITERAL_MAX+1, dict.size());
82  node = dict.get_rootnode(static_cast<uliteral_t>(c));
83  // dictionary's maximum size was reached
84  if(dict.size() == m_dict_max_size) {
85  DCHECK_GT(dict.size(),0);
86  reset_dict();
87  factor_count = 0; //coder.dictionary_reset();
88  IF_STATS(stat_dictionary_resets++);
89  IF_STATS(stat_dict_counter_at_last_reset = m_dict_max_size);
90  }
91  } else { // traverse further
92  node = child;
93  }
94  }
95 
96  DLOG(INFO) << "End node id of LZW parsing " << node.id();
97  // take care of left-overs. We do not assume that the stream has a sentinel
98  DCHECK_NE(node.id(), lz78::undef_id);
99  coder.encode(node.id(), Range(factor_count + ULITERAL_MAX + 1)); //LZW
100  IF_STATS(stat_factor_count++);
101  factor_count++;
102 
103  IF_STATS(
104  phase.log_stat("factor_count", stat_factor_count);
105  phase.log_stat("dictionary_reset_counter", stat_dictionary_resets);
106  phase.log_stat("max_factor_counter", stat_dict_counter_at_last_reset);
107  )
108  }
109 
110  virtual void decompress(Input& input, Output& output) override final {
111  const size_t reserved_size = input.size();
112  //TODO C::decode(in, out, dms, reserved_size);
113  auto out = output.as_stream();
114  typename coder_t::Decoder decoder(env().env_for_option("coder"), input);
115 
116  size_t counter = 0;
117 
118  //TODO file_corrupted not used!
119  lzw::decode_step([&](lz78::factorid_t& entry, bool reset, bool &file_corrupted) -> bool {
120  if (reset) {
121  counter = 0;
122  }
123 
124  if(decoder.eof()) {
125  return false;
126  }
127 
128  lzw::Factor factor(decoder.template decode<len_t>(Range(counter + ULITERAL_MAX + 1)));
129  counter++;
130  entry = factor;
131  return true;
132  }, out, m_dict_max_size == 0 ? lz78::DMS_MAX : m_dict_max_size, reserved_size);
133  }
134 
135 };
136 
137 }
138 
Represents a generic range of positive integers.
Definition: Range.hpp:16
int_t isqrt(int_t num)
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
virtual void decompress(Input &input, Output &output) override final
Decompress the given input to the given output.
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
Base for data compressors.
Definition: Compressor.hpp:19
void decode_step(F next_code_callback, std::ostream &out, const CodeType dms, const CodeType reserve_dms)
Definition: LZWDecoding.hpp:13
LZWCompressor(Env &&env)
Maximum dictionary size before reset, 0 == unlimited.
Defines data encoding to and decoding from a stream of binary integer representations.
Definition: BitCoder.hpp:13
InputStream as_stream() const
Creates a stream that allows for character-wise reading of the input.
Definition: Input.hpp:264
Provides access to runtime and memory measurement in statistics phases.
Definition: StatPhase.hpp:44
#define LZ78_DICT_SIZE_DESC
Definition: LZ78Trie.hpp:35
len_compact_t Factor
Definition: LZWFactor.hpp:8
constexpr size_t ULITERAL_MAX
The maximum value of uliteral_t.
Definition: def.hpp:134
Env & env()
Provides access to the environment that the algorithm works in.
Definition: Algorithm.hpp:51
OutputStream as_stream() const
Creates a stream that allows for character-wise output.
An empty literal iterator that yields no literals whatsoever.
Definition: Literal.hpp:37
An abstraction layer for algorithm output.
Definition: Output.hpp:23
#define IF_STATS(x)
x is compiled only when the STATS_DISABLED macro is undefined.
Definition: def.hpp:59
virtual void compress(Input &input, Output &out) override
Compress the given input to the given output.
void log_stat(const char *key, const T &value)
Logs a user statistic for this phase.
Definition: StatPhase.hpp:298
constexpr size_t DMS_MAX
Maximum legal dictionary size.
Definition: LZ78Trie.hpp:17
void templated(const std::string &accepted_type)
Declares that this option accepts values of the specified Algorithm type T.
Definition: Meta.hpp:93
size_t size() const
Yields the total amount of characters in the input.
Definition: Input.hpp:233
uint32_t factorid_t
Type for the factor indices, bounded by the number of LZ78 trie nodes.
Definition: LZ78Trie.hpp:11
constexpr factorid_t undef_id
Id that can be used for a non-existing factor.
Definition: LZ78Trie.hpp:14
static Meta meta()
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
Local environment for a compression/encoding/decompression call.
LZ78 Trie Implementation based on Julius Pettersson (MIT/Expat License.) and Juha Nieminen&#39;s work...
Definition: TernaryTrie.hpp:16
void dynamic()
Declares that this option accepts values of a simple type that can be parsed from a string (e...
Definition: Meta.hpp:150
An abstraction layer for algorithm input.
Definition: Input.hpp:37