tudocomp
– The TU Dortmund Compression Framework
LFS2BSTCompressor.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 //std includes:
4 #include <vector>
5 #include <tuple>
6 #include <unordered_map>
7 
8 //general includes:
10 #include <tudocomp/util.hpp>
11 #include <tudocomp/io.hpp>
14 
15 
16 
18 
19 
20 //includes encoding:
23 #include <tudocomp/Literal.hpp>
25 
28 
29 
31 
32 
33 namespace tdc {
34 namespace lfs {
35 
36 template<typename literal_coder_t = HuffmanCoder, typename len_coder_t = EliasGammaCoder >
37 class LFS2BSTCompressor : public Compressor {
38 private:
39 
40 
41 
42  std::unique_ptr<BinarySuffixTree> stree;
43 
44  //Stores nts_symbols of first layer
45  IntVector<uint> first_layer_nts;
46  // offset to begin of last nts +1. if ==0 no substitution
47  IntVector<uint> fl_offsets;
48  // stores subs in first layer symbols
49  IntVector<uint> second_layer_nts;
50  // dead pos in first layer
51  BitVector second_layer_dead;
52 
53  uint node_count;
54  uint max_depth;
55 
56 
57  //pair contains begin pos, length
58  std::vector<std::pair<uint, uint> > non_terminal_symbols;
59 
60 
61  //stores node_ids of corresponding factor length
62  std::vector<std::vector<uint> > bins;
63 
64 
65  //stores beginning positions corresponding to node_ids
66  std::unordered_map< uint , std::vector< uint > > node_begins;
67 
68  inline virtual void compute_string_depth(uint node, uint str_depth){
69  //resize if str depth grater than bins size
70  uint child = stree->get_first_child(node);
71 
72 
73  if(child != 0){
74  while(str_depth>= bins.size()){
75  bins.resize(bins.size()*2);
76  }
77 
78 
79 
80  if(str_depth>max_depth){
81  max_depth=str_depth;
82  }
83  node_count++;
84  if(str_depth>0){
85 
86  bins[str_depth].push_back(node);
87  }
88  while (child != 0){
89  compute_string_depth(child, str_depth + stree->get_edge_length(child));
90  child=stree->get_next_sibling(child);
91  }
92 
93  }
94 
95  }
96 
97 
98 
99 
100 public:
101 
102  inline static Meta meta() {
103  Meta m("compressor", "lfs2bst",
104  "lfs2 with bst");
106  m.option("min_lrf").dynamic(5);
107  m.option("lfs2_lit_coder").templated<literal_coder_t, HuffmanCoder>("lfs2_lit_coder");
108  m.option("lfs2_len_coder").templated<len_coder_t, EliasGammaCoder>("lfs2_len_coder");
109 
110  return m;
111  }
112 
113 
115  Compressor(std::move(env))
116  {
117  DLOG(INFO) << "Compressor lfs2bst instantiated";
118  }
119  inline virtual void compress(Input& input, Output& output) override {
120  uint min_lrf = env().option("min_lrf").as_integer();
121 
122  auto in = input.as_view();
123 
124  //create vectors:
125  first_layer_nts = IntVector<uint>(input.size(), 0);
126  fl_offsets = IntVector<uint>(input.size(), 0);
127  second_layer_nts = IntVector<uint>(input.size(), 0);
128  second_layer_dead = BitVector(input.size(), 0);
129 
130 
131  DLOG(INFO) << "text length: "<<in.size();
132 
133  if(in.size() >= min_lrf){
134 
135 
136  StatPhase::wrap("Constructing ST", [&]{
137  DLOG(INFO)<<"Constructing ST";
138  stree = std::make_unique<BinarySuffixTree>(in);
139  StatPhase::log("Number of Nodes", stree->get_tree_size());
140  });
141 
142 
143  StatPhase::wrap("Computing LRF", [&]{
144 
145 
146  StatPhase::wrap("Computing String Depth", [&]{
147  bins.resize(200);
148  node_count=0;
149  max_depth=0;
150  compute_string_depth(0,0);
151 
152  bins.resize(max_depth +1);
153  bins.shrink_to_fit();
154  });
155 
156 
157 
158  node_begins.reserve(node_count);
159 
160  uint nts_number = 1 ;
161  StatPhase::wrap("Iterate over Node Bins", [&]{
162  //iterate node bins top down
163  DLOG(INFO)<<"iterate over Node Bins";
164  for(uint i = bins.size()-1; i>=min_lrf; i--){
165 
166  //iterate over ids in bin:
167  while(!bins[i].empty()){
168  uint no_leaf_id = bins[i].back();
169  bins[i].pop_back();
170 
171 
172  auto bp = node_begins.find(no_leaf_id);
173 
174  //no begin pos found, get from children
175 
176  if(bp == node_begins.end()){
177 
178 
179  std::vector<uint> positions;
180  //get leaves or merge child vectors
181  std::vector<uint> offsets;
182  std::vector<uint> leaf_bps;
183 
184  uint inner = stree->get_first_child(no_leaf_id);
185 
186  while (inner != 0){
187 
188  if(stree->get_first_child(inner) == 0){
189 
190 
191  leaf_bps.push_back(stree->get_suffix(inner));
192 
193  } else {
194  auto & child_bp = node_begins[inner];
195  if(!child_bp.empty()){
196  offsets.push_back(positions.size());
197 
198  positions.insert(positions.end(), child_bp.begin(), child_bp.end());
199 
200 
201  node_begins.erase(inner);
202 
203  }
204 
205  }
206  inner = stree->get_next_sibling(inner);
207  }
208 
209  std::sort(leaf_bps.begin(), leaf_bps.end());
210  offsets.push_back(positions.size());
211  positions.insert(positions.end(),leaf_bps.begin(), leaf_bps.end());
212  for(uint k = 0; k < offsets.size()-1; k++){
213  std::inplace_merge(positions.begin(), positions.begin()+ offsets[k], positions.begin()+ offsets[k+1]);
214 
215  }
216  //now inplace merge to end
217  std::inplace_merge(positions.begin(), positions.begin()+ offsets.back(), positions.end());
218 
219 
220  if(!std::is_sorted(positions.begin(), positions.end() )){
221  DLOG(INFO)<<"positions not sorted!!!";
222  }
223 
224 
225  node_begins[no_leaf_id]=positions;
226 
227 
228 
229 
230  }
231 
232  std::vector<uint> begin_pos = node_begins[no_leaf_id];
233  //if still empty, because everything is substituted...
234  if(begin_pos.empty()){
235  continue;
236  }
237  //check if viable lrf, else sort higher!
238  if(begin_pos.size()>=2){
239 
240  if ( ( begin_pos.back() - begin_pos.front() ) >= i ){
241 
242  //greedily iterate over occurences
243  signed long last = 0 - (long) i;
244  std::vector<uint> first_layer_viable;
245  std::vector<uint> second_layer_viable;
246  for(uint occurence : begin_pos){
247  //check for viability
248  if( (last+i <= (long) occurence)){
249  if(fl_offsets[occurence] == 0){
250  if(fl_offsets[occurence + i -1] == 0){
251  //Position is first layer viable
252  first_layer_viable.push_back(occurence);
253  last= occurence;
254  }
255  } else {
256  //find nts number of symbol that corresponds to substitued occ
257  uint parent_nts= first_layer_nts[ occurence - (fl_offsets[occurence] -1) ];
258  auto nts = non_terminal_symbols[parent_nts-1];
259  //if length of parent nts is greater than current len + offset
260  if(nts.second >=fl_offsets[occurence]-1 + i ){
261  second_layer_viable.push_back(occurence);
262  }
263  }
264 
265  }
266 
267  }
268 
269 
270  //and substitute
271  //if at least 2 first level layer occs viable:
272  if(first_layer_viable.size() >=1 &&(first_layer_viable.size() + second_layer_viable.size() >= 2) ) {
273  std::pair<uint,uint> nts = std::make_pair(first_layer_viable.front(), i);
274  non_terminal_symbols.push_back(nts);
275 
276  //iterate over vector, make first layer unviable:
277  for(uint occ : first_layer_viable){
278  first_layer_nts[occ]= nts_number;
279 
280  for(uint nts_length =0; nts_length < i; nts_length++){
281  fl_offsets[occ + nts_length] = nts_length+1;
282  }
283  }
284 
285 
286 
287  for(uint sl_occ :second_layer_viable){
288  uint parent_nts= first_layer_nts[ sl_occ - (fl_offsets[sl_occ] -1) ];
289 
290  auto parent_sym = non_terminal_symbols[parent_nts-1];
291  uint parent_start= parent_sym.first;
292  uint sl_start = (parent_start + fl_offsets[sl_occ] -1);
293  uint sl_end = sl_start+i-1;
294  if(second_layer_dead[sl_start] == (uint)0 && second_layer_dead[sl_end] == (uint)0){
295 
296  second_layer_nts[sl_start]=nts_number;
297 
298  for(uint dead = sl_start; dead<=sl_end;dead++){
299  second_layer_dead[dead]=1;
300  }
301  }
302  }
303 
304  //raise nts number:
305  nts_number++;
306 
307  }
308 
309 
310 
311  }
312  }
313  }
314 
315  }
316 
317  });
318 
319  });
320 
321  DLOG(INFO)<<"Computing symbol depth";
322 
323  IntVector<uint> nts_depth(non_terminal_symbols.size(), 0);
324 
325  for(uint nts_num =0; nts_num<non_terminal_symbols.size(); nts_num++){
326  auto symbol = non_terminal_symbols[nts_num];
327  uint cur_depth = nts_depth[nts_num];
328 
329  for(uint pos = symbol.first; pos < symbol.second + symbol.first ; pos++){
330  if(second_layer_nts[pos]>0){
331 
332  uint symbol_num = second_layer_nts[pos] -1;
333  if(nts_depth[symbol_num]< cur_depth+1){
334  nts_depth[symbol_num]= cur_depth+1;
335  }
336 
337  }
338 
339 
340  }
341 
342  }
343 
344  DLOG(INFO)<<"Computing done";
345 
346  std::sort(nts_depth.begin(), nts_depth.end());
347  if(nts_depth.size()>0){
348 
349  uint max_depth = nts_depth[nts_depth.size()-1];
350 
351  DLOG(INFO)<<"Max CFG Depth: "<< max_depth;
352  DLOG(INFO)<<"Number of CFG rules: "<< non_terminal_symbols.size();
353 
354  if(nts_depth.size()>=4){
355  uint quarter = nts_depth.size() /4;
356 
357  StatPhase::log("25 \% quantil CFG Depth", nts_depth[quarter -1]);
358  StatPhase::log("50 \% quantil CFG Depth", nts_depth[(2*quarter) -1]);
359  StatPhase::log("75 \% quantil CFG Depth", nts_depth[(3*quarter) -1]);
360 
361  }
362  StatPhase::log("Max CFG Depth", max_depth);
363  }
364 
365  //input size end
366  }
367 
368 
369  StatPhase::log("Number of CFG rules", non_terminal_symbols.size());
370 
371  std::stringstream literals;
372 
373  for(uint position = 0; position< in.size(); position++){
374  if(fl_offsets[position]==0){
375  literals << in[position];
376  }
377  }
378  for(uint nts_num = 0; nts_num<non_terminal_symbols.size(); nts_num++){
379 
380  auto symbol = non_terminal_symbols[nts_num];
381 
382  for(uint pos = symbol.first; pos < symbol.second + symbol.first ; pos++){
383 
384  if(second_layer_nts[pos] == 0 && pos < in.size()){
385  literals<< in[pos];
386 
387  }
388  }
389  }
390 
391 
392  DLOG(INFO)<<"encoding";
393 
394  StatPhase::wrap("Encoding Comp", [&]{
395  // encode dictionary:
396  DLOG(INFO) << "encoding dictionary symbol sizes ";
397 
398  std::shared_ptr<BitOStream> bitout = std::make_shared<BitOStream>(output);
399  typename literal_coder_t::Encoder lit_coder(
400  env().env_for_option("lfs2_lit_coder"),
401  bitout,
402  ViewLiterals(literals.str())
403  );
404  typename len_coder_t::Encoder len_coder(
405  env().env_for_option("lfs2_len_coder"),
406  bitout,
407  NoLiterals()
408  );
409 
410  //encode lengths:
411  DLOG(INFO)<<"number nts: " << non_terminal_symbols.size();
412  Range intrange (0, UINT_MAX);//uint32_r
413  //encode first length:
414  if(non_terminal_symbols.size()>=1){
415  auto symbol = non_terminal_symbols[0];
416  uint last_length=symbol.second;
417  //Range for encoding nts number
418  Range s_length_r (0,last_length);
419  len_coder.encode(last_length,intrange);
420  //encode delta length of following symbols
421  for(uint nts_num = 1; nts_num < non_terminal_symbols.size(); nts_num++){
422  symbol = non_terminal_symbols[nts_num];
423  len_coder.encode(last_length-symbol.second,s_length_r);
424  last_length=symbol.second;
425 
426  }
427  //encode last length, to have zero length last
428  len_coder.encode(symbol.second,s_length_r);
429  }else {
430  len_coder.encode(0,intrange);
431 
432  }
433  Range dict_r(0, non_terminal_symbols.size());
434 
435 
436  long buf_size = bitout->tellp();
437 
438  StatPhase::log("Bytes Length Encoding", buf_size);
439  DLOG(INFO)<<"Bytes Length Encoding: "<< buf_size;
440 
441 
442  DLOG(INFO) << "encoding dictionary symbols";
443  uint dict_literals=0;
444 
445  // encode dictionary strings, backwards, to directly decode strings:
446  if(non_terminal_symbols.size()>=1){
447  std::pair<uint,uint> symbol;
448  for(long nts_num =non_terminal_symbols.size()-1; nts_num >= 0; nts_num--){
449  symbol = non_terminal_symbols[nts_num];
450 
451  for(uint pos = symbol.first; pos < symbol.second + symbol.first ; pos++){
452  if(second_layer_nts[pos] > 0){
453  lit_coder.encode(1, bit_r);
454  lit_coder.encode(second_layer_nts[pos], dict_r);
455  auto symbol = non_terminal_symbols[second_layer_nts[pos] -1];
456 
457 
458 
459  pos += symbol.second - 1;
460 
461  } else {
462  lit_coder.encode(0, bit_r);
463  lit_coder.encode(in[pos],literal_r);
464  dict_literals++;
465 
466  }
467  }
468 
469  }
470  }
471 
472  uint literals=0;
473 
474 
475 
476 
477  buf_size = long(bitout->tellp()) - buf_size;
478  StatPhase::log("Bytes Non-Terminal Symbol Encoding", buf_size);
479 
480 
481  DLOG(INFO)<<"Bytes Non-Terminal Symbol Encoding: "<< buf_size;
482 
483  //encode start symbol
484 
485  DLOG(INFO)<<"encode start symbol";
486  for(uint pos = 0; pos < in.size(); pos++){
487  if(first_layer_nts[pos]>0){
488  lit_coder.encode(1, bit_r);
489  lit_coder.encode(first_layer_nts[pos], dict_r);
490  auto symbol = non_terminal_symbols[first_layer_nts[pos] -1];
491 
492  pos += symbol.second - 1;
493 
494  } else {
495  lit_coder.encode(0, bit_r);
496  lit_coder.encode(in[pos],literal_r);
497  literals++;
498 
499 
500  }
501  }
502 
503  buf_size = long(bitout->tellp()) - buf_size;
504  StatPhase::log("Bytes Start Symbol Encoding", buf_size);
505 
506 
507  DLOG(INFO)<<"Bytes Start Symbol Encoding: "<< buf_size;
508 
509 
510  StatPhase::log("Literals in Dictionary", dict_literals);
511 
512  StatPhase::log("Literals in Start Symbol", literals);
513  StatPhase::log("Literals in Input", in.size());
514 
515  double literal_percent = ((double)dict_literals + (double)literals)/ (double)in.size();
516  StatPhase::log("Literals Encoding / Literals Input", literal_percent);
517 
518  DLOG(INFO)<<"encoding done";
519 
520  });
521  }
522 
523  inline virtual void decompress(Input& input, Output& output) override {
524 
525  DLOG(INFO) << "decompress lfs";
526  std::shared_ptr<BitIStream> bitin = std::make_shared<BitIStream>(input);
527 
528  typename literal_coder_t::Decoder lit_decoder(
529  env().env_for_option("lfs2_lit_coder"),
530  bitin
531  );
532  typename len_coder_t::Decoder len_decoder(
533  env().env_for_option("lfs2_len_coder"),
534  bitin
535  );
536  Range int_r (0,UINT_MAX);
537 
538  uint symbol_length = len_decoder.template decode<uint>(int_r);
539  Range slength_r (0, symbol_length);
540  std::vector<uint> dict_lengths;
541  dict_lengths.reserve(symbol_length);
542  dict_lengths.push_back(symbol_length);
543  while(symbol_length>0){
544 
545  uint current_delta = len_decoder.template decode<uint>(slength_r);
546  symbol_length-=current_delta;
547  dict_lengths.push_back(symbol_length);
548  }
549  dict_lengths.pop_back();
550 
551  DLOG(INFO)<<"decoded number of nts: "<< dict_lengths.size();
552 
553 
554 
555  std::vector<std::string> dictionary;
556  uint dictionary_size = dict_lengths.size();
557 
558  Range dictionary_r (0, dictionary_size);
559 
560 
561  dictionary.resize(dict_lengths.size());
562 
563  std::stringstream ss;
564  uint symbol_number;
565  char c1;
566 
567  DLOG(INFO) << "reading dictionary";
568  for(long i = dict_lengths.size() -1; i>=0 ;i--){
569 
570  ss.str("");
571  ss.clear();
572  long size_cur = (long) dict_lengths[i];
573  while(size_cur > 0){
574  bool bit1 = lit_decoder.template decode<bool>(bit_r);
575 
576 
577  if(bit1){
578  //bit = 1, is nts, decode nts num and copy
579  symbol_number = lit_decoder.template decode<uint>(dictionary_r); // Dekodiere Literal
580 
581  symbol_number-=1;
582 
583  if(symbol_number < dictionary.size()){
584 
585  ss << dictionary.at(symbol_number);
586  size_cur-= dict_lengths[symbol_number];
587  } else {
588  break;
589  }
590 
591  } else {
592  //bit = 0, decode literal
593  c1 = lit_decoder.template decode<char>(literal_r); // Dekodiere Literal
594  size_cur--;
595 
596  ss << c1;
597 
598  }
599 
600  }
601 
602  dictionary[i]=ss.str();
603 
604 
605  }
606 
607  auto ostream = output.as_stream();
608  //reading start symbol:
609  while(!lit_decoder.eof()){
610  //decode bit
611  bool bit1 = lit_decoder.template decode<bool>(bit_r);
612  char c1;
613  uint symbol_number;
614  // if bit = 0 its a literal
615  if(!bit1){
616  c1 = lit_decoder.template decode<char>(literal_r); // Dekodiere Literal
617 
618  ostream << c1;
619  } else {
620  //else its a non-terminal
621  symbol_number = lit_decoder.template decode<uint>(dictionary_r); // Dekodiere Literal
622  symbol_number-=1;
623 
624  if(symbol_number < dictionary.size()){
625 
626  ostream << dictionary.at(symbol_number);
627  } else {
628  DLOG(INFO)<< "too large symbol: " << symbol_number;
629  }
630 
631  }
632  }
633  }
634 
635 };
636 
637 //namespaces closing
638 }}
Represents a generic range of positive integers.
Definition: Range.hpp:16
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
constexpr auto bit_r
Global predefined range for bits (0 or 1).
Definition: Range.hpp:108
A vector over arbitrary unsigned integer types.
Definition: IntVector.hpp:175
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
Definition: Env.hpp:44
Base for data compressors.
Definition: Compressor.hpp:19
void push_back(const value_type &val)
Definition: IntVector.hpp:426
IntVector< uint_t< 1 > > BitVector
Represents a bit vector, alias for IntVector with a fixed bit width of 1.
Definition: IntVector.hpp:545
unsigned int uint
Definition: characterhash.h:6
virtual void compress(Input &input, Output &output) override
Compress the given input to the given output.
uint64_t as_integer() const
A literal iterator that yields every character from a View.
Definition: Literal.hpp:41
void needs_sentinel_terminator()
Indicates that this Algorithm requires a null terminator symbol in Input.
Definition: Meta.hpp:271
Env & env()
Provides access to the environment that the algorithm works in.
Definition: Algorithm.hpp:51
OutputStream as_stream() const
Creates a stream that allows for character-wise output.
An empty literal iterator that yields no literals whatsoever.
Definition: Literal.hpp:37
void resize(size_type n)
Definition: IntVector.hpp:327
constexpr auto literal_r
Global predefined reange for literals.
Definition: Range.hpp:111
An abstraction layer for algorithm output.
Definition: Output.hpp:23
virtual void decompress(Input &input, Output &output) override
Decompress the given input to the given output.
len_compact_t pos
Definition: LZSSFactors.hpp:38
Defines data encoding to and decoding from a stream of Elias-Gamma codes.
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
Definition: StatPhase.hpp:218
void templated(const std::string &accepted_type)
Declares that this option accepts values of the specified Algorithm type T.
Definition: Meta.hpp:93
size_t size() const
Yields the total amount of characters in the input.
Definition: Input.hpp:233
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Definition: StatPhase.hpp:143
InputView as_view() const
Provides a view on the input that allows for random access.
Definition: Input.hpp:260
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
Local environment for a compression/encoding/decompression call.
size_type size() const
Definition: IntVector.hpp:284
void dynamic()
Declares that this option accepts values of a simple type that can be parsed from a string (e...
Definition: Meta.hpp:150
An abstraction layer for algorithm input.
Definition: Input.hpp:37