tudocomp
– The TU Dortmund Compression Framework
LZ78UCompressor.hpp
Go to the documentation of this file.
1 #pragma once
2 
4 
5 #include <sdsl/cst_fully.hpp>
6 #include <sdsl/cst_sada.hpp>
7 
8 #include <tudocomp/Range.hpp>
9 #include <tudocomp/ds/TextDS.hpp>
10 
12 
13 #include "lz78u/SuffixTree.hpp"
14 
15 #include "lz78u/pre_header.hpp"
16 
18 
19 namespace tdc {
20 namespace lz78u {
21 
22  // TODO: Define factorid for lz78u uniformly
23 
24  class Decompressor {
25  std::vector<lz78::factorid_t> indices;
26  std::vector<uliteral_t> literal_strings;
27  std::vector<size_t> start_literal_strings;
28 
29  std::vector<uliteral_t> buffer;
30 
31  public:
33  DCHECK_NE(index, 0);
34  size_t i = index - 1;
35  return indices[i];
36  }
37  inline View str_at(lz78::factorid_t index) const {
38  DCHECK_NE(index, 0);
39  size_t i = index - 1;
40  View ls = literal_strings;
41 
42  size_t start = start_literal_strings[i];
43  size_t end = 0;
44  if ((i + 1) < start_literal_strings.size()) {
45  end = start_literal_strings[i + 1];
46  } else {
47  end = ls.size();
48  }
49 
50  return ls.slice(start, end);
51  }
52 
53  inline void decompress(lz78::factorid_t index, View literals, std::ostream& out) {
54  indices.push_back(index);
55  start_literal_strings.push_back(literal_strings.size());
56  for (auto c : literals) {
57  literal_strings.push_back(c);
58  }
59 
60  //std::cout << " indices: " << vec_to_debug_string(indices) << "\n";
61  //std::cout << " start lit str: " << vec_to_debug_string(start_literal_strings) << "\n";
62  //std::cout << " literal_strings: " << vec_to_debug_string(literal_strings) << "\n";
63 
64  buffer.clear();
65 
66  while(true) {
67  auto inv_old_size = buffer.size();
68  for (size_t i = 0; i < literals.size(); i++) {
69  buffer.push_back(literals[literals.size() - i - 1]);
70  }
71 
72  DCHECK_EQ(inv_old_size + literals.size(), buffer.size());
73 
74  if (index == 0) {
75  break;
76  }
77  literals = str_at(index);
78  index = ref_at(index);
79  }
80 
81  std::reverse(buffer.begin(), buffer.end());
82  //std::cout << " reconstructed: " << vec_to_debug_string(buffer) << "\n";
83  out << buffer;
84  }
85 
86  };
87 }
88 
89 template<typename strategy_t, typename ref_coder_t>
90 class LZ78UCompressor: public Compressor {
91 private:
92  using node_type = lz78u::SuffixTree::node_type;
93 
94  using RefEncoder = typename ref_coder_t::Encoder;
95  using RefDecoder = typename ref_coder_t::Decoder;
96 
97  using CompressionStrat
98  = typename strategy_t::template Compression<RefEncoder>;
99  using DecompressionStrat
100  = typename strategy_t::template Decompression<RefDecoder>;
101 
102 public:
103  inline LZ78UCompressor(Env&& env):
104  Compressor(std::move(env))
105  {}
106 
107  inline static Meta meta() {
108  Meta m("compressor", "lz78u", "Lempel-Ziv 78 U\n\n" );
109  m.option("comp").templated<strategy_t>("lz78u_strategy");
110  m.option("coder").templated<ref_coder_t>("coder");
111  m.option("threshold").dynamic("3");
112  // m.option("dict_size").dynamic("inf");
114  return m;
115  }
116 
117  virtual void compress(Input& input, Output& out) override {
118  StatPhase phase1("lz78u");
119  //std::cout << "START COMPRESS\n";
120  const len_t threshold = env().option("threshold").as_integer(); //factor threshold
121  phase1.log_stat("threshold", threshold);
122 
123  auto iview = input.as_view();
124  View T = iview;
125 
126  lz78u::SuffixTree::cst_t backing_cst;
127  StatPhase::wrap("construct suffix tree", [&]{
128  // TODO: Specialize sdsl template for less alloc here
129  std::string bad_copy_1 = T.slice(0, T.size() - 1);
130  //std::cout << "text: " << vec_to_debug_string(bad_copy_1) << "\n";
131 
132  construct_im(backing_cst, bad_copy_1, 1);
133  });
134  lz78u::SuffixTree ST(backing_cst);
135 
136  const size_t max_z = T.size() * bits_for(ST.cst.csa.sigma) / bits_for(T.size());
137  phase1.log_stat("max z", max_z);
138 
139  sdsl::int_vector<> R(ST.internal_nodes,0,bits_for(max_z));
140 
141  len_t pos = 0;
142  len_t z = 0;
143 
144  typedef lz78u::SuffixTree::node_type node_t;
145 
146  CompressionStrat strategy {
147  env().env_for_option("comp"),
148  env().env_for_option("coder"),
149  std::make_shared<BitOStream>(out)
150  };
151 
152  len_t factor_count = 0;
153 
154  // `begin` and `end` denote a half-open range [begin, end)
155  // of characters of the input string
156  auto output = [&](len_t begin, len_t end, size_t ref) {
157  // if trailing 0, remove
158  while(T[end-1] == 0) --end;
159 
160  // First, encode the reference
161  DVLOG(2) << "reference";
162  strategy.encode_ref(ref, Range(factor_count));
163 
164  // factorize the factor label if the label is above the threshold
165  if (end-begin >= threshold) {
166  DVLOG(2) << "factorized";
167 
168  // indicate that this is a factorized string
169  strategy.encode_sep(false);
170 
171  for(len_t pos = begin; pos < end;) {
172  // similar to the normal LZ78U factorization, but does not introduce new factor ids
173 
174  const node_t leaf = ST.select_leaf(ST.cst.csa.isa[pos]);
175  len_t d = 1;
176  node_t parent = ST.root;
177  node_t node = ST.level_anc(leaf, d);
178  while(!ST.cst.is_leaf(node) && R[ST.nid(node)] != 0) {
179  parent = node;
180  node = ST.level_anc(leaf, ++d);
181  } // not a good feature: We lost the factor ids of the leaves, since R only stores the IDs of internal nodes
182  const len_t depth = ST.str_depth(parent);
183  // if the largest factor is not large enough, we only store the current character and move one text position to the right
184  if(depth < threshold) {
185  DVLOG(2) << "sub char";
186  strategy.encode_sep(false);
187  strategy.encode_char(T[pos]);
188  ++pos;
189  } else {
190  DVLOG(2) << "sub ref";
191  strategy.encode_sep(true);
192  strategy.encode_ref(R[ST.nid(parent)], Range(factor_count));
193  pos += depth;
194  // taking the last factor can make the string larger than intended, so we have to store a cut-value at the last position
195  if(pos > end) {
196  // TODO: Is this encoding efficient enough?
197 
198  // 0 factors would not appear here, so use 0 as a escape
199  // char to indicate that a cut-value follows
200  DVLOG(2) << "special sub ref";
201  strategy.encode_sep(true);
202  strategy.encode_ref(0, Range(factor_count));
203  strategy.encode_ref(pos-end, len_r);
204  }
205  }
206  }
207  // all char sequencens in a factor need to be 0 terminated
208  DVLOG(2) << "sub char term";
209  strategy.encode_sep(false);
210  strategy.encode_char(0);
211  } else {
212  // else just output the string as a whole
213 
214  // indicate that this is not a factorized string
215  DVLOG(2) << "plain";
216  strategy.encode_sep(true);
217  strategy.encode_str(T.slice(begin,end));
218  }
219 
220  factor_count++;
221  };
222 
223  DVLOG(2) << "[ compress ]";
224 
225  // Skip the trailing 0
226  while(pos < T.size() - 1) {
227  const node_t l = ST.select_leaf(ST.cst.csa.isa[pos]);
228  const len_t leaflabel = pos;
229 
230  if(ST.parent(l) == ST.root || R[ST.nid(ST.parent(l))] != 0) {
231  const len_t parent_strdepth = ST.str_depth(ST.parent(l));
232 
233  //std::cout << "out leaf: [" << (pos+parent_strdepth) << ","<< (pos + parent_strdepth + 1) << "] ";
234  output(pos + parent_strdepth,
235  pos + parent_strdepth + 1,
236  R[ST.nid(ST.parent(l))]);
237 
238  pos += parent_strdepth+1;
239  ++z;
240 
241  DCHECK_EQ(z, factor_count);
242 
243  continue;
244  }
245 
246  len_t d = 1;
247  node_t parent = ST.root;
248  node_t node = ST.level_anc(l, d);
249 
250 
251  while(R[ST.nid(node)] != 0) {
252  parent = node;
253  node = ST.level_anc(l, ++d);
254  }
255  pos += ST.str_depth(parent);
256 
257 
258  // const auto& str = T.slice(leaflabel + ST.str_depth(parent), leaflabel + ST.str_depth(node));
259  const len_t begin = leaflabel + ST.str_depth(parent);
260  const len_t end = leaflabel + ST.str_depth(node);
261 
262  //std::cout << "out slice: [ "<< (leaflabel + ST.str_depth(parent)) << ", "<< (leaflabel + ST.str_depth(node))<< " ] ";
263  output(begin,
264  end,
265  R[ST.nid(ST.parent(node))]);
266  R[ST.nid(node)] = ++z;
267 
268  pos += end - begin;
269  DCHECK_EQ(z, factor_count);
270  }
271  }
272 
273  virtual void decompress(Input& input, Output& output) override final {
274  //std::cout << "START DECOMPRESS\n";
275  DVLOG(2) << "[ decompress ]";
276  auto out = output.as_stream();
277 
278  {
279  DecompressionStrat strategy {
280  env().env_for_option("comp"),
281  env().env_for_option("coder"),
282  std::make_shared<BitIStream>(input)
283  };
284 
285  uint64_t factor_count = 0;
286 
287  lz78u::Decompressor decomp;
288 
289  std::vector<uliteral_t> rebuilt_buffer;
290 
291  while (!strategy.eof()) {
292  auto ref = strategy.decode_ref(Range(factor_count));
293  DVLOG(2) << "";
294  DVLOG(2) << "decode ref: " << ref;
295  DVLOG(2) << "";
296  bool not_factorized = strategy.decode_sep();
297  DVLOG(2) << "decode sep: " << int(not_factorized);
298 
299  if (not_factorized) {
300  auto str = strategy.decode_str();
301  DVLOG(2) << "decode str: '" << str << "\\0'";
302  DVLOG(2) << " ...'"
303  << ((ref > 0) ? decomp.str_at(ref) : ""_v)
304  << "' '"
305  << str
306  << "'";
307  decomp.decompress(ref, str, out);
308  } else {
309  // rebuild the factorized string label
310  rebuilt_buffer.clear();
311 
312  while (true) {
313  bool is_sub_char = !strategy.decode_sep();
314  DVLOG(2) << "decode sep: " << int(!is_sub_char);
315 
316  if (is_sub_char) {
317  auto sub_chr = strategy.decode_char();
318  DVLOG(2) << "decode chr: " << int(sub_chr);
319 
320  rebuilt_buffer.push_back(sub_chr);
321  } else {
322  auto sub_ref = strategy.decode_ref(Range(factor_count));
323  DVLOG(2) << "decode ref: " << sub_ref;
324 
325  if (sub_ref == 0) {
326  // found a cut-value
327  auto cut = strategy.decode_ref(len_r);
328  DVLOG(2) << "decode special ref: " << cut;
329  while (cut > 0) {
330  rebuilt_buffer.pop_back();
331  --cut;
332  }
333  } else {
334  size_t prev_r = sub_ref;
335  size_t old_end = rebuilt_buffer.size();
336 
337  // push the chars in reverse order
338  // to allow efficient appending of prefixes
339  do {
340  View s = decomp.str_at(prev_r);
341  prev_r = decomp.ref_at(prev_r);
342 
343  for (size_t j = 0; j < s.size(); j++) {
344  rebuilt_buffer.push_back(s[s.size() - j - 1]);
345  }
346  } while (prev_r != 0);
347  DCHECK_EQ(prev_r, 0);
348 
349  // reverse suffix containing the new string fragment
350  std::reverse(rebuilt_buffer.begin() + old_end,
351  rebuilt_buffer.end());
352  }
353  }
354 
355  if (rebuilt_buffer.size() > 0 && rebuilt_buffer.back() == 0) {
356  rebuilt_buffer.pop_back();
357  break;
358  }
359  }
360  DVLOG(2) << "USED!";
361  DVLOG(2) << " ...'"
362  << ((ref > 0) ? decomp.str_at(ref) : ""_v)
363  << "' '"
364  << rebuilt_buffer << "'";
365  decomp.decompress(ref, rebuilt_buffer, out);
366  }
367 
368  /*
369  std::cout << "in m (s,r): ("
370  << vec_to_debug_string(factor.string)
371  << ", " << int(factor.ref) << ")\n";
372  */
373 
374  factor_count++;
375  }
376  }
377 
378  out << '\0';
379  out.flush();
380  }
381 
382 };
383 
384 
385 }//ns
Represents a generic range of positive integers.
Definition: Range.hpp:16
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
constexpr uint_fast8_t bits_for(size_t n)
Computes the number of bits required to store the given integer value.
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
A const view into a slice of memory.
Base for data compressors.
Definition: Compressor.hpp:19
Describes a set of restrictions placed on input data.
Provides access to runtime and memory measurement in statistics phases.
Definition: StatPhase.hpp:44
size_type size() const
Returns size of the View.
void input_restrictions(const io::InputRestrictions &restr)
Places byte restrictions on the Input.
Definition: Meta.hpp:262
An abstraction layer for algorithm output.
Definition: Output.hpp:23
ConstGenericView slice(size_type from, size_type to=npos) const
Construct a new View that is a sub view into the current one.
fast_t< len_compact_t > len_t
Type to represent an length value.
Definition: def.hpp:114
virtual void compress(Input &input, Output &out) override
Compress the given input to the given output.
len_compact_t pos
Definition: LZSSFactors.hpp:38
void log_stat(const char *key, const T &value)
Logs a user statistic for this phase.
Definition: StatPhase.hpp:298
void templated(const std::string &accepted_type)
Declares that this option accepts values of the specified Algorithm type T.
Definition: Meta.hpp:93
constexpr auto len_r
Global predefined range for len_t.
Definition: Range.hpp:115
lz78::factorid_t ref_at(lz78::factorid_t index) const
uint32_t factorid_t
Type for the factor indices, bounded by the number of LZ78 trie nodes.
Definition: LZ78Trie.hpp:11
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Definition: StatPhase.hpp:143
InputView as_view() const
Provides a view on the input that allows for random access.
Definition: Input.hpp:260
This is a wrapper class around the sdsl-lite library to get a easier translation between the pseudoco...
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
View str_at(lz78::factorid_t index) const
Local environment for a compression/encoding/decompression call.
virtual void decompress(Input &input, Output &output) override final
Decompress the given input to the given output.
void decompress(lz78::factorid_t index, View literals, std::ostream &out)
void dynamic()
Declares that this option accepts values of a simple type that can be parsed from a string (e...
Definition: Meta.hpp:150
An abstraction layer for algorithm input.
Definition: Input.hpp:37