tudocomp
– The TU Dortmund Compression Framework
RePairCompressor.hpp
Go to the documentation of this file.
1 #pragma once
2 
4 
5 #include <tudocomp/Range.hpp>
6 #include <tudocomp/coders/BitCoder.hpp> //default
7 
9 
11 
12 namespace tdc {
13 
14 template <typename coder_t>
15 class RePairCompressor : public Compressor {
16 private:
17  typedef uint32_t sym_t;
18  typedef uint64_t digram_t;
19  typedef std::vector<digram_t> grammar_t;
20  static const size_t digram_shift = 32UL;
21  static const sym_t sigma = 256; //TODO
22 
23  inline static digram_t digram(sym_t l, sym_t r) {
24  return (digram_t(l) << digram_shift) | digram_t(r);
25  }
26 
27  inline static sym_t left(digram_t di) {
28  return sym_t(di >> digram_shift);
29  }
30 
31  inline static sym_t right(digram_t di) {
32  return sym_t(di);
33  }
34 
35  template<typename text_t>
36  class Literals : LiteralIterator {
37  private:
38  const text_t* m_text;
39  len_t m_text_size;
40  const len_compact_t* m_next;
41  len_t m_pos;
42 
43  std::vector<uliteral_t> m_g_literals;
44  len_t m_g_pos;
45 
46  public:
47  inline Literals(const text_t& text,
48  len_t text_size,
49  const len_compact_t* next,
50  const grammar_t& grammar)
51  : m_text(&text), m_text_size(text_size), m_next(next),
52  m_pos(0), m_g_pos(0) {
53 
54  // count literals from right side of grammar rules
55  for(digram_t di : grammar) {
56  sym_t l = left(di);
57  if(l < sigma) m_g_literals.push_back(uliteral_t(l));
58 
59  sym_t r = right(di);
60  if(r < sigma) m_g_literals.push_back(uliteral_t(r));
61  }
62  }
63 
64  inline bool has_next() const {
65  return m_pos < m_text_size || m_g_pos < m_g_literals.size();
66  }
67 
68  inline Literal next() {
69  assert(has_next());
70 
71  if(m_pos < m_text_size) {
72  // from encoded text
73  auto l = Literal { uliteral_t((*m_text)[m_pos]), m_pos };
74  m_pos = m_next[m_pos];
75  return l;
76  } else {
77  // from grammar right sides
78  auto l = Literal { m_g_literals[m_g_pos],
79  m_text_size + 2 * m_g_pos };
80  ++m_g_pos;
81  return l;
82  }
83  }
84  };
85 
86 public:
87  inline static Meta meta() {
88  Meta m("compressor", "repair", "Re-Pair compression");
89  m.option("coder").templated<coder_t, BitCoder>("coder");
90  m.option("max_rules").dynamic(0);
91  return m;
92  }
93 
94  inline RePairCompressor(Env&& env) : Compressor(std::move(env)) {}
95 
96  virtual void compress(Input& input, Output& output) override {
97  // options
98  size_t max_rules = env().option("max_rules").as_integer();
99  if(max_rules == 0) max_rules = SIZE_MAX;
100 
101  // prepare editable text
102  len_t n;
103  sym_t *text;
104  len_compact_t *next; //TODO use an int vector of required bit width
105 
106  {
107  auto view = input.as_view();
108  n = view.size();
109  text = new sym_t[n];
110  next = new len_compact_t[n];
111 
112  for(size_t i = 0; i < n; i++) {
113  text[i] = view[i];
114  next[i] = i + 1;
115  }
116  }
117 
118  // compute RePair grammar
119  grammar_t grammar;
120 
121  size_t num_replaced = 0;
122 
123  do {
124  // count digrams
125  digram_t max;
126  size_t max_count = 0;
127 
128  {
129  std::unordered_map<digram_t, size_t> count;
130  // TODO: probably not optimal, but twice as fast as std::map
131 
132  size_t i = 0;
133  while(i < n - 1) {
134  size_t j = next[i];
135  if(j >= n) break; // break if at end
136 
137  digram_t di = digram(text[i], text[j]);
138 
139  // update counter
140  size_t c = count[di] + 1;
141  count[di] = c;
142 
143  // update max
144  if(c > max_count) {
145  max = di;
146  max_count = c;
147  }
148 
149  // advance
150  i = j;
151  }
152  }
153 
154  // replace most common digram (max)
155  if(max_count > 1) {
156  sym_t new_sym = sigma + grammar.size();
157  grammar.push_back(max);
158 
159  size_t i = 0;
160  while(i < n - 1) {
161  size_t j = next[i];
162  if(j >= n) break; // break if at end
163 
164  digram_t di = digram(text[i], text[j]);
165  if(di == max) {
166  text[i] = new_sym; // replace symbol at i by new symbol
167  next[i] = next[j];
168 
169  ++num_replaced;
170  }
171 
172  // advance
173  i = next[i];
174  }
175  } else {
176  break; // done
177  }
178  } while(grammar.size() < max_rules);
179 
180  // debug
181  /*
182  {
183  DLOG(INFO) << "grammar:";
184  for(size_t i = 0; i < grammar.size(); i++) {
185  digram_t di = grammar[i];
186  sym_t l = left(di);
187  sym_t r = right(di);
188 
189  DLOG(INFO) << uint8_t('A' + i) << " -> " <<
190  uint8_t((l < sigma) ? l : ('A' + l - sigma)) <<
191  uint8_t((r < sigma) ? r : ('A' + r - sigma));
192  }
193 
194  std::ostringstream start;
195  for(size_t i = 0; i < n; i = next[i]) {
196  sym_t x = text[i];
197  start << uint8_t((x < sigma) ? x : ('A' + x - sigma));
198  }
199 
200  DLOG(INFO) << "S -> " << start.str();
201  }
202  */
203 
204  StatPhase::log("rules", grammar.size());
205  StatPhase::log("replaced", num_replaced);
206 
207  // instantiate encoder
208  typename coder_t::Encoder coder(env().env_for_option("coder"),
209  output, Literals<sym_t*>(text, n, next, grammar));
210 
211  // encode amount of grammar rules
212  coder.encode(grammar.size(), len_r);
213 
214  // lambda for encoding symbols
215  auto encode_sym = [&](sym_t x, const Range& r) {
216  if(x < sigma) {
217  coder.encode(false, bit_r);
218  coder.encode(x, literal_r);
219  } else {
220  coder.encode(true, bit_r);
221  coder.encode(x - sigma, r);
222  }
223  };
224 
225  // encode grammar rules
226  size_t num_grammar_terminals = 0;
227  size_t num_grammar_nonterminals = 0;
228 
229  for(size_t i = 0; i < grammar.size(); i++) {
230  digram_t di = grammar[i];
231 
232  Range grammar_r(i);
233 
234  // statistics
235  sym_t l = left(di);
236  if(l < sigma) ++num_grammar_terminals;
237  else ++num_grammar_nonterminals;
238 
239  sym_t r = right(di);
240  if(r < sigma) ++num_grammar_terminals;
241  else ++num_grammar_nonterminals;
242 
243  encode_sym(l, grammar_r);
244  encode_sym(r, grammar_r);
245  }
246 
247  StatPhase::log("grammar_terms", num_grammar_terminals);
248  StatPhase::log("grammar_nonterms", num_grammar_nonterminals);
249 
250  // encode compressed text (start rule)
251  size_t num_text_terminals = 0;
252  size_t num_text_nonterminals = 0;
253 
254  Range grammar_r(grammar.size());
255  for(size_t i = 0; i < n; i = next[i]) {
256  sym_t x = text[i];
257 
258  // statistics
259  if(x < sigma) ++num_text_terminals;
260  else ++num_text_nonterminals;
261 
262  encode_sym(text[i], grammar_r);
263  }
264 
265  StatPhase::log("text_terms", num_text_terminals);
266  StatPhase::log("text_nonterms", num_text_nonterminals);
267 
268  // clean up
269  delete[] next;
270  delete[] text;
271  }
272 
273 private:
274  inline static void decode(sym_t x, const grammar_t& grammar, std::ostream& ostream) {
275  if(x < sigma) {
276  // terminal
277  ostream << uliteral_t(x);
278  } else {
279  // non-terminal
280  digram_t di = grammar[x - sigma];
281  decode(left(di), grammar, ostream);
282  decode(right(di), grammar, ostream);
283  }
284  }
285 
286 public:
287  virtual void decompress(Input& input, Output& output) override {
288  // instantiate decoder
289  typename coder_t::Decoder decoder(env().env_for_option("coder"), input);
290 
291  // lambda for decoding symbols
292  auto decode_sym = [&](const Range& r) {
293  bool is_nonterminal = decoder.template decode<bool>(bit_r);
294  if(is_nonterminal) {
295  auto dec = decoder.template decode<sym_t>(r);
296  return sigma + dec;
297  } else {
298  auto dec = sym_t(decoder.template decode<uliteral_t>(literal_r));
299  return dec;
300  }
301  };
302 
303  // decode grammar
304  grammar_t grammar;
305  {
306  auto num_rules = decoder.template decode<size_t>(len_r);
307  while(num_rules--) {
308  Range grammar_r(grammar.size());
309  sym_t l = decode_sym(grammar_r);
310  sym_t r = decode_sym(grammar_r);
311  grammar.push_back(digram(l, r));
312  }
313  }
314 
315  // debug
316  /*{
317  DLOG(INFO) << "decoded grammar:";
318  for(size_t i = 0; i < grammar.size(); i++) {
319  digram_t di = grammar[i];
320  sym_t l = left(di);
321  sym_t r = right(di);
322 
323  DLOG(INFO) << uint8_t('A' + i) << " -> " <<
324  uint8_t((l < sigma) ? l : ('A' + l - sigma)) <<
325  uint8_t((r < sigma) ? r : ('A' + r - sigma));
326  }
327  }*/
328 
329  // decode text
330  Range grammar_r(grammar.size());
331 
332  auto ostream = output.as_stream();
333  while(!decoder.eof()) {
334  decode(decode_sym(grammar_r), grammar, ostream);
335  }
336  }
337 };
338 
339 }
340 
Represents a generic range of positive integers.
Definition: Range.hpp:16
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
constexpr auto bit_r
Global predefined range for bits (0 or 1).
Definition: Range.hpp:108
uint8_t uliteral_t
Type to represent signed single literals.
Definition: def.hpp:131
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
Definition: Env.hpp:44
Base for data compressors.
Definition: Compressor.hpp:19
Defines data encoding to and decoding from a stream of binary integer representations.
Definition: BitCoder.hpp:13
size_type size() const
Returns size of the View.
uint64_t as_integer() const
Env & env()
Provides access to the environment that the algorithm works in.
Definition: Algorithm.hpp:51
OutputStream as_stream() const
Creates a stream that allows for character-wise output.
Contains a literal and its location in the input.
Definition: Literal.hpp:16
constexpr auto literal_r
Global predefined reange for literals.
Definition: Range.hpp:111
An abstraction layer for algorithm output.
Definition: Output.hpp:23
uint32_t len_compact_t
Type to represent an bit-compact length value.
Definition: def.hpp:103
fast_t< len_compact_t > len_t
Type to represent an length value.
Definition: def.hpp:114
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
Definition: StatPhase.hpp:218
void templated(const std::string &accepted_type)
Declares that this option accepts values of the specified Algorithm type T.
Definition: Meta.hpp:93
constexpr auto len_r
Global predefined range for len_t.
Definition: Range.hpp:115
InputView as_view() const
Provides a view on the input that allows for random access.
Definition: Input.hpp:260
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
virtual void decompress(Input &input, Output &output) override
Decompress the given input to the given output.
Local environment for a compression/encoding/decompression call.
virtual void compress(Input &input, Output &output) override
Compress the given input to the given output.
void dynamic()
Declares that this option accepts values of a simple type that can be parsed from a string (e...
Definition: Meta.hpp:150
An abstraction layer for algorithm input.
Definition: Input.hpp:37