tudocomp
– The TU Dortmund Compression Framework
SLECoder.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <tudocomp/util.hpp>
4 #include <tudocomp/Coder.hpp>
6 
7 namespace tdc {
8 
9 class SLECoder : public Algorithm {
10 private:
11  typedef size_t sym_t;
12  static const size_t max_kmer = sizeof(sym_t) - 1;
13  static const size_t kmer_mask = 0xFFUL << (8UL * max_kmer);
14 
15  static inline bool is_kmer(sym_t x) {
16  return (x & kmer_mask) == kmer_mask;
17  }
18 
19  static inline sym_t compile_kmer(const uliteral_t* kmer, size_t k) {
20  sym_t kmer_sym = 0;
21 
22  for(size_t i = 0; i < k; i++) {
23  kmer_sym |= (sym_t(kmer[k-1-i]) << 8UL * i);
24  }
25 
26  return kmer_sym | kmer_mask;
27  }
28 
29  static inline void decompile_kmer(sym_t x, uliteral_t* kmer, size_t k) {
30  for(size_t i = 0; i < k; i++) {
31  kmer[k-1-i] = (x >> 8UL * i) & 0xFFUL;
32  }
33  }
34 
35 public:
36  inline static Meta meta() {
37  Meta m("coder", "sle", "Static low entropy encoding conforming [Dinklage, 2015]");
38  m.option("kmer").dynamic(3);
39  return m;
40  }
41 
42  SLECoder() = delete;
43 
44  class Encoder : public tdc::Encoder {
45  private:
46  size_t m_k;
47 
48  uliteral_t* m_kmer;
49  size_t m_kmer_cur;
50 
51  Counter<sym_t>::ranking_t m_ranking;
52  size_t m_sigma_bits;
53 
54  inline bool kmer_full() {
55  return m_kmer_cur == m_k;
56  }
57 
58  inline bool kmer_roll(uliteral_t c, uliteral_t& out) {
59  bool roll_out = kmer_full();
60  if(roll_out) {
61  out = m_kmer[0];
62 
63  for(size_t i = 0; i < m_k - 1; i++) m_kmer[i] = m_kmer[i+1];
64  --m_kmer_cur;
65  }
66 
67  m_kmer[m_kmer_cur++] = c;
68  return roll_out;
69  }
70 
71  //DEBUG
72  inline std::string kmer_str() {
73  std::ostringstream s;
74  for(size_t i = 0; i < m_k; i++) s << m_kmer[i];
75  return s.str();
76  }
77 
78  public:
79  template<typename literals_t>
80  inline Encoder(Env&& env, std::shared_ptr<BitOStream> out, literals_t&& literals)
81  : tdc::Encoder(std::move(env), out, literals) {
82 
83  m_k = this->env().option("kmer").as_integer();
84  assert(m_k <= max_kmer);
85 
86  m_kmer = new uliteral_t[m_k];
87  m_kmer_cur = 0;
88  size_t last_literal_pos = 0;
89 
90  Counter<sym_t> alphabet;
91  Counter<sym_t> kmers;
92 
93  while(literals.has_next()) {
94  Literal l = literals.next();
95 
96  if(m_k > 1) {
97  // update k-mer buffer
98  if(l.pos != last_literal_pos + 1) {
99  m_kmer_cur = 0; //reset
100  }
101 
102  uliteral_t out;
103  kmer_roll(l.c, out);
104 
105  // count k-mer if complete
106  if(kmer_full()) {
107  auto x = compile_kmer(m_kmer, m_k);
108  //DLOG(INFO) << "count k-mer: " << kmer_str() <<
109  // " => 0x" << std::hex << x;
110 
111  kmers.increase(x);
112  }
113  }
114 
115  // count single literal
116  alphabet.increase(sym_t(l.c));
117 
118  // save position
119  last_literal_pos = l.pos;
120  }
121 
122  auto sigma = alphabet.getNumItems();
123  m_sigma_bits = bits_for(sigma - 1);
124 
125  // determine alphabet extension size (see [Dinklage, 2015])
126  if(m_k > 1) {
127  size_t eta_add_bits = ((1UL << m_sigma_bits) == sigma) ? 1 : 2;
128  size_t eta = (1UL << (m_sigma_bits + eta_add_bits)) - sigma;
129 
130  // merge eta most common k-mers into alphabet
131  /*DLOG(INFO) <<
132  "sigma0 = " << sigma <<
133  ", eta = " << eta <<
134  ", kmers = " << kmers.getNumItems();*/
135  for(auto e : kmers.getSorted()) {
136  alphabet.setCount(e.first, e.second);
137  if(--eta == 0) break; //no more than eta
138  }
139 
140  // update sigma
141  sigma = alphabet.getNumItems();
142  m_sigma_bits = bits_for(sigma - 1);
143  }
144 
145  // create ranking
146  m_ranking = alphabet.createRanking();
147 
148  // debug
149  /*{
150  DLOG(INFO) << "m_sigma_bits = " << m_sigma_bits;
151  DLOG(INFO) << "ranking:";
152  for(auto e : m_ranking) {
153  DLOG(INFO) << "\t'" << std::hex << e.first << "' -> " <<
154  std::dec << e.second << ", is_kmer = " << is_kmer(e.first);
155  }
156  }*/
157 
158  // encode ranking
159  m_out->write_compressed_int(sigma);
160  for(auto e : alphabet.getSorted()) {
161  m_out->write_compressed_int(e.first);
162  }
163 
164  // reset current k-mer
165  m_kmer_cur = 0;
166  }
167 
168  template<typename literals_t>
169  inline Encoder(Env&& env, Output& out, literals_t&& literals)
170  : Encoder(std::move(env), std::make_shared<BitOStream>(out), literals) {
171  }
172 
174  // flush
175  flush_kmer();
176 
177  // clean up
178  delete[] m_kmer;
179  }
180 
181  private:
182  inline void flush_kmer() {
183  // encode literals in k-mer buffer
184  for(size_t i = 0; i < m_kmer_cur; i++) {
185  encode_sym(sym_t(m_kmer[i]));
186  }
187 
188  // reset current k-mer
189  m_kmer_cur = 0;
190  }
191 
192  inline void encode_sym(sym_t x) {
193  auto r = m_ranking[x];
194  //DLOG(INFO) << "encode_sym(0x" << std::hex << x << "), r = " << std::dec << r;
195 
196  // see [Dinklage, 2015]
197  if(m_sigma_bits < 4) {
198  m_out->write_int(r, m_sigma_bits);
199  } else if(m_sigma_bits < 6) {
200  if(r < 4) {
201  m_out->write_bit(0);
202  m_out->write_int(r, 2);
203  } else {
204  m_out->write_bit(1);
205  m_out->write_int(r, m_sigma_bits);
206  }
207  } else if(m_sigma_bits == 6) {
208  if(r < 8) {
209  m_out->write_int(0, 2);
210  m_out->write_int(r, 3);
211  } else if(r < 16) {
212  m_out->write_int(1, 2);
213  m_out->write_int(r - 8UL, 3);
214  } else if(r < 32) {
215  m_out->write_int(2, 2);
216  m_out->write_int(r - 16UL, 4);
217  } else {
218  m_out->write_int(3, 2);
219  m_out->write_int(r, m_sigma_bits);
220  }
221  } else {
222  if(r < 4) {
223  m_out->write_int(0, 3);
224  m_out->write_int(r, 2);
225  } else if(r < 8) {
226  m_out->write_int(1, 3);
227  m_out->write_int(r - 4UL, 2);
228  } else if(r < 12) {
229  m_out->write_int(2, 3);
230  m_out->write_int(r - 8UL, 2);
231  } else if(r < 16) {
232  m_out->write_int(3, 3);
233  m_out->write_int(r - 12UL, 2);
234  } else if(r < 24) {
235  m_out->write_int(4, 3);
236  m_out->write_int(r - 16UL, 3);
237  } else if(r < 32) {
238  m_out->write_int(5, 3);
239  m_out->write_int(r - 24UL, 3);
240  } else if(r < 40) {
241  m_out->write_int(6, 3);
242  m_out->write_int(r - 32UL, 3);
243  } else {
244  m_out->write_int(7, 3);
245  m_out->write_int(r, m_sigma_bits);
246  }
247  }
248  }
249 
250  public:
251  template<typename value_t>
252  inline void encode(value_t v, const LiteralRange&) {
253  auto c = uliteral_t(v);
254 
255  uliteral_t out = 0;
256  if(kmer_roll(c, out)) {
257  // encode rolled out character
258  encode_sym(sym_t(out));
259  }
260 
261  if(kmer_full()) {
262  sym_t x = compile_kmer(m_kmer, m_k);
263  if(m_ranking.find(x) != m_ranking.end()) {
264  // current k-mer exists in ranking
265  encode_sym(x);
266  m_kmer_cur = 0; // reset
267  }
268  }
269  }
270 
271  template<typename value_t>
272  inline void encode(value_t v, const Range& r) {
273  flush_kmer(); // k-mer interrupted
274  m_out->write_int(v - r.min(), bits_for(r.delta()));
275  }
276 
277  template<typename value_t>
278  inline void encode(value_t v, const MinDistributedRange& r) {
279  flush_kmer(); // k-mer interrupted
280 
281  // see [Dinklage, 2015]
282  v -= r.min();
283  auto bits = bits_for(r.delta());
284 
285  if(bits <= 5) {
286  m_out->write_int(v, bits);
287  } else {
288  if(v < 8u) {
289  m_out->write_int(0, 2);
290  m_out->write_int(v, 3);
291  } else if(v < 16u) {
292  m_out->write_int(1, 2);
293  m_out->write_int(v - value_t(8), 3);
294  } else if(v < 32u) {
295  m_out->write_int(2, 2);
296  m_out->write_int(v - value_t(16), 4);
297  } else {
298  m_out->write_int(3, 2);
299  m_out->write_int(v, bits);
300  }
301  }
302  }
303 
304  template<typename value_t>
305  inline void encode(value_t v, const BitRange&) {
306  flush_kmer(); // k-mer interrupted
307  m_out->write_bit(v);
308  }
309  };
310 
311  class Decoder : public tdc::Decoder {
312  private:
313  size_t m_k;
314 
315  size_t m_sigma_bits;
316  sym_t* m_inv_ranking;
317 
318  uliteral_t* m_kmer;
319  size_t m_kmer_read;
320 
321  inline void reset_kmer() {
322  m_kmer_read = SIZE_MAX;
323  }
324 
325  public:
326  inline Decoder(Env&& env, std::shared_ptr<BitIStream> in)
327  : tdc::Decoder(std::move(env), in) {
328  m_k = this->env().option("kmer").as_integer();
329 
330  m_kmer = new uliteral_t[m_k];
331  reset_kmer();
332 
333  // decode literal ranking
334  auto sigma = m_in->read_compressed_int<size_t>();
335 
336  m_sigma_bits = bits_for(sigma - 1);
337  m_inv_ranking = new sym_t[sigma];
338 
339  for(size_t rank = 0; rank < sigma; rank++) {
340  auto c = m_in->read_compressed_int<sym_t>();
341 
342  //DLOG(INFO) << "decoded rank: " << rank << " -> " <<
343  // std::hex << c << ", is_kmer = " << is_kmer(c);
344 
345  m_inv_ranking[rank] = c;
346  }
347  }
348 
349  inline Decoder(Env&& env, Input& in)
350  : Decoder(std::move(env), std::make_shared<BitIStream>(in)) {
351  }
352 
354  delete[] m_kmer;
355  delete[] m_inv_ranking;
356  }
357 
358  inline bool eof() const {
359  if(m_kmer_read < m_k) {
360  // still decoding a k-mer
361  return false;
362  } else {
363  // check if stream is over
364  return tdc::Decoder::m_in->eof();
365  }
366  }
367 
368  template<typename value_t>
369  inline value_t decode(const LiteralRange&) {
370  if(m_kmer_read < m_k) {
371  // continue reading from current k-mer
372  return value_t(m_kmer[m_kmer_read++]);
373  }
374 
375  size_t r = 0;
376 
377  // see [Dinklage, 2015]
378  if(m_sigma_bits < 4) {
379  r = m_in->read_int<size_t>(m_sigma_bits);
380  } else if(m_sigma_bits < 6) {
381  auto b = m_in->read_bit();
382  if(b == 0) r = m_in->read_int<size_t>(2);
383  else r = m_in->read_int<size_t>(m_sigma_bits);
384  } else if(m_sigma_bits == 6) {
385  auto x = m_in->read_int<uint8_t>(2);
386  switch(x) {
387  case 0: r = m_in->read_int<size_t>(3); break;
388  case 1: r = 8UL + m_in->read_int<size_t>(3); break;
389  case 2: r = 16UL + m_in->read_int<size_t>(4); break;
390  case 3: r = m_in->read_int<size_t>(m_sigma_bits); break;
391  }
392  } else {
393  auto x = m_in->read_int<uint8_t>(3);
394  switch(x) {
395  case 0: r = m_in->read_int<size_t>(2); break;
396  case 1: r = 4UL + m_in->read_int<size_t>(2); break;
397  case 2: r = 8UL + m_in->read_int<size_t>(2); break;
398  case 3: r = 12UL + m_in->read_int<size_t>(2); break;
399  case 4: r = 16UL + m_in->read_int<size_t>(3); break;
400  case 5: r = 24UL + m_in->read_int<size_t>(3); break;
401  case 6: r = 32UL + m_in->read_int<size_t>(3); break;
402  case 7: r = m_in->read_int<size_t>(m_sigma_bits); break;
403  }
404  }
405 
406  auto x = m_inv_ranking[r];
407  //DLOG(INFO) << "decoded 0x" << std::hex << x << " from r = " << std::dec << r;
408 
409  if(is_kmer(x)) {
410  decompile_kmer(x, m_kmer, m_k);
411  m_kmer_read = 1;
412  return m_kmer[0];
413  } else {
414  return value_t(x);
415  }
416  }
417 
418  template<typename value_t>
419  inline value_t decode(const Range& r) {
420  reset_kmer(); // current k-mer interrupted
421  return m_in->read_int<value_t>(bits_for(r.delta())) + value_t(r.min());
422  }
423 
424  template<typename value_t>
425  inline value_t decode(const MinDistributedRange& r) {
426  reset_kmer(); // current k-mer interrupted
427 
428  auto bits = bits_for(r.delta());
429  value_t v = 0;
430 
431  if(bits <= 5) {
432  v = m_in->read_int<value_t>(bits);
433  } else {
434  auto x = m_in->read_int<uint8_t>(2);
435  switch(x) {
436  case 0: v = m_in->read_int<value_t>(3); break;
437  case 1: v = value_t(8) + m_in->read_int<value_t>(3); break;
438  case 2: v = value_t(16) + m_in->read_int<value_t>(4); break;
439  case 3: v = m_in->read_int<value_t>(bits); break;
440  }
441  }
442 
443  return v + value_t(r.min());
444  }
445 
446  template<typename value_t>
447  inline value_t decode(const BitRange&) {
448  reset_kmer(); // current k-mer interrupted
449  return value_t(m_in->read_bit());
450  }
451  };
452 };
453 
454 }
455 
SLECoder()=delete
Represents a generic range of positive integers.
Definition: Range.hpp:16
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
size_t delta() const
Yields the difference between the range&#39;s minimum and maximum values.
Definition: Range.hpp:47
constexpr uint_fast8_t bits_for(size_t n)
Computes the number of bits required to store the given integer value.
Represents the range of valid tdc::uliteral_t values.
Definition: Range.hpp:90
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
value_t decode(const MinDistributedRange &r)
Definition: SLECoder.hpp:425
Represents a compiler-level fixed range.
Definition: Range.hpp:83
void encode(value_t v, const MinDistributedRange &r)
Definition: SLECoder.hpp:278
uint8_t uliteral_t
Type to represent signed single literals.
Definition: def.hpp:131
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
Definition: Env.hpp:44
Decoder(Env &&env, std::shared_ptr< BitIStream > in)
Definition: SLECoder.hpp:326
Wrapper for input streams that provides bitwise reading functionality.
Definition: BitIStream.hpp:16
std::vector< std::pair< T, size_t > > getSorted()
Return a list of value-count pairs, sorted with the most-seen first.
Definition: Counter.hpp:47
A data structure for counting occurences of values of a given type T.
Definition: Counter.hpp:15
std::shared_ptr< BitIStream > m_in
The underlying bit input stream.
Definition: Coder.hpp:91
Represents a range of positive integers that tend to be distributed towards the minimum.
Definition: Range.hpp:56
uint64_t as_integer() const
size_t getNumItems()
Return how many differnt values have been seen so far.
Definition: Counter.hpp:42
Decoder(Env &&env, Input &in)
Definition: SLECoder.hpp:349
uliteral_t c
The literal.
Definition: Literal.hpp:18
Env & env()
Provides access to the environment that the algorithm works in.
Definition: Algorithm.hpp:51
void encode(value_t v, const LiteralRange &)
Definition: SLECoder.hpp:252
void increase(const T &item)
Increase the counter for the passed value by one, setting it to 1 if it was not yet seen...
Definition: Counter.hpp:24
Contains a literal and its location in the input.
Definition: Literal.hpp:16
Wrapper for output streams that provides bitwise writing functionality.
Definition: BitOStream.hpp:17
size_t min() const
Yields the range&#39;s minimum value.
Definition: Range.hpp:37
An abstraction layer for algorithm output.
Definition: Output.hpp:23
ranking_t createRanking(size_t num=SIZE_MAX)
Return a map of the num-most common values, keyed by their common-ness, with map[0] being hte most co...
Definition: Counter.hpp:75
void encode(value_t v, const Range &r)
Definition: SLECoder.hpp:272
value_t decode(const Range &r)
Definition: SLECoder.hpp:419
static Meta meta()
Definition: SLECoder.hpp:36
void setCount(const T &item, size_t count)
Set the count of an item to a specific value.
Definition: Counter.hpp:32
value_t decode(const LiteralRange &)
Definition: SLECoder.hpp:369
std::shared_ptr< BitOStream > m_out
The underlying bit output stream.
Definition: Coder.hpp:18
Base for data encoders.
Definition: Coder.hpp:14
Encoder(Env &&env, std::shared_ptr< BitOStream > out, literals_t &&literals)
Definition: SLECoder.hpp:80
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
Local environment for a compression/encoding/decompression call.
len_t pos
The position of the literal in the input.
Definition: Literal.hpp:21
Base for data decoders.
Definition: Coder.hpp:87
Interface for algorithms.
Definition: Algorithm.hpp:15
value_t decode(const BitRange &)
Definition: SLECoder.hpp:447
Encoder(Env &&env, Output &out, literals_t &&literals)
Definition: SLECoder.hpp:169
void dynamic()
Declares that this option accepts values of a simple type that can be parsed from a string (e...
Definition: Meta.hpp:150
void encode(value_t v, const BitRange &)
Definition: SLECoder.hpp:305
An abstraction layer for algorithm input.
Definition: Input.hpp:37