tudocomp
– The TU Dortmund Compression Framework
ScanDec.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <vector>
4 #include <sdsl/int_vector.hpp>
5 #include <sdsl/rank_support.hpp>
6 #include <tudocomp/def.hpp>
8 #include <tudocomp/Algorithm.hpp>
9 #include <algorithm>
10 
12 
13 namespace tdc {
14 namespace lcpcomp {
15 
16 
25  class EagerScanDec {
26  Env& m_env;
27  IntVector<uliteral_t>& m_buffer;
28  const sdsl::bit_vector m_bv;
29  const sdsl::bit_vector::rank_1_type m_rank;
30  const len_t m_empty_entries;
31  len_compact_t**const m_fwd = nullptr;
32 
33  IF_STATS(len_t m_longest_chain = 0);
34  IF_STATS(len_t m_current_chain = 0);
35 
36  public:
38  : m_env(env)
39  , m_buffer { buffer }
40  , m_bv ( [&buffer] () -> sdsl::bit_vector {
41  sdsl::bit_vector bv { buffer.size(),0 };
42  for(len_t i = 0; i < buffer.size(); ++i) {
43  if(buffer[i]) continue;
44  bv[i] = 1;
45  }
46  return bv;
47  }() )
48  , m_rank { &m_bv }
49  //, m_empty_entries { static_cast<len_t>( buffer.size()) }
50  , m_empty_entries { static_cast<len_t>(std::count_if(buffer.cbegin(), buffer.cend(), [] (const uliteral_t& i) { return i == 0; })) }
51  , m_fwd { new len_compact_t*[m_empty_entries+1] }
52  {
53  std::fill(m_fwd,m_fwd+m_empty_entries,nullptr);
54  }
55 
56  len_t rank(len_t i) const {
57  DCHECK(m_bv[i]);
58  return m_rank.rank(i+1);
59  }
60 
61  void decode(const std::vector<len_compact_t>& m_target_pos, const std::vector<len_compact_t>& m_source_pos, const std::vector<len_compact_t>& m_length) {
62  StatPhase phase("Decoding Factors");
63  const len_t factors = m_source_pos.size();
64  phase.log_stat("factors", factors);
65 
66  for(len_t j = 0; j < factors; ++j) {
67  const len_compact_t& target_position = m_target_pos[j];
68  const len_compact_t& source_position = m_source_pos[j];
69  const len_compact_t& factor_length = m_length[j];
70  for(len_t i = 0; i < factor_length; ++i) {
71  if(m_buffer[source_position+i]) {
72  decode_literal_at(target_position+i, m_buffer[source_position+i]);
73  } else {
74  DCHECK_EQ(m_bv[source_position+i],1);
75  len_compact_t*& bucket = m_fwd[rank(source_position+i)];
76  if(bucket == nullptr) {
77  bucket = new len_compact_t[2];
78  bucket[0] = 2;
79  bucket[1] = target_position+i;
80  }
81  else
82  { // this block implements the call of m_fwd[src]->push_back(m_cursor);
83  ++bucket[0]; // increase the size of a bucket only by one!
84  bucket = (len_compact_t*) realloc(bucket, sizeof(len_compact_t)*bucket[0]);
85  bucket[bucket[0]-1] = target_position+i;
86  }
87  }
88 
89  }
90 
91  IF_STATS({
92  if((j+1) % ((factors+5)/5) == 0 ) {
93  phase.split(
94  std::string("Decoding Factors at position ")
95  + std::to_string(target_position));
96  }
97  })
98  }
99  }
101  IF_STATS(++m_current_chain);
102  IF_STATS(m_longest_chain = std::max(m_longest_chain, m_current_chain));
103 
104  DCHECK(m_buffer[pos] == 0 || m_buffer[pos] == c) << "would write " << c << " to mbuffer[" << pos << "] = " << m_buffer[pos];
105  m_buffer[pos] = c;
106  DCHECK(c != 0 || pos == m_buffer.size()-1); // we assume that the text to restore does not contain a NULL-byte but at its very end
107 
108  if(m_bv[pos] == 1) {
109  const len_t rankpos = rank(pos);
110  DCHECK_LE(rankpos, m_empty_entries);
111  if(m_fwd[rankpos] != nullptr) {
112  const len_compact_t*const& bucket = m_fwd[rankpos];
113  for(size_t i = 1; i < bucket[0]; ++i) {
114  decode_literal_at(bucket[i], c); // recursion
115  }
116  delete [] m_fwd[rankpos];
117  m_fwd[rankpos] = nullptr;
118  }
119  }
120 
121  IF_STATS(--m_current_chain);
122  }
123 
125  inline len_t longest_chain() const {
126  return m_longest_chain;
127  })
128 
129  ~EagerScanDec() {
130  DCHECK(m_fwd != nullptr);
131  for(size_t i = 0; i < m_empty_entries; ++i) {
132  if(m_fwd[i] == nullptr) continue;
133  delete [] m_fwd[i];
134  }
135  delete [] m_fwd;
136  }
137 
138 
139  };
140 
146 class ScanDec : public Algorithm {
147 public:
148  inline static Meta meta() {
149  Meta m("lcpcomp_dec", "scan");
150  m.option("scans").dynamic(6);
151  return m;
152 
153  }
154  inline void decode_lazy() {
155  StatPhase::log("remaining factors", m_target_pos.size());
156  StatPhase::log("scans", m_scans);
157  size_t lazy = m_scans;
158  while(lazy > 0) {
159  decode_lazy_();
160  --lazy;
161  }
162  }
163  inline void decode_eagerly() {
164  EagerScanDec* decoder = StatPhase::wrap("Initialize Bit Vector", [&]{
165  return new EagerScanDec(this->env(),m_buffer);
166  });
167 
168  decoder->decode(m_target_pos, m_source_pos, m_length);
169  IF_STATS(m_longest_chain = decoder->longest_chain());
170 
171  StatPhase::wrap("Destructor ScanDec", [&]{
172  delete decoder;
173  });
174  }
175 
176 private:
177  inline void decode_lazy_() {
178  const len_t factors = m_source_pos.size();
179  for(len_t j = 0; j < factors; ++j) {
180  const len_compact_t& target_position = m_target_pos[j];
181  const len_compact_t& source_position = m_source_pos[j];
182  const len_compact_t& factor_length = m_length[j];
183  for(len_t i = 0; i < factor_length; ++i) {
184  //DCHECK(m_buffer[source_position+i] == 0 && m_buffer[target_position+i] == 0);
185  m_buffer[target_position+i] = m_buffer[source_position+i];
186  }
187  }
188  }
189  const size_t m_scans; // number of scan rounds
190 
191  len_t m_cursor;
192 
193  IntVector<uliteral_t> m_buffer;
194 
195  //storing factors
196  std::vector<len_compact_t> m_target_pos;
197  std::vector<len_compact_t> m_source_pos;
198  std::vector<len_compact_t> m_length;
199 
200  IF_STATS(len_t m_longest_chain = 0);
201 
202 public:
203  ScanDec(ScanDec&& other)
204  : Algorithm(std::move(*this))
205  , m_scans(std::move(other.m_scans))
206  , m_cursor(std::move(other.m_cursor))
207  , m_buffer(std::move(other.m_buffer))
208  { }
209 
210  inline ScanDec(Env&& env, len_t size)
211  : Algorithm(std::move(env))
212  , m_scans(this->env().option("scans").as_integer())
213  , m_cursor(0)
214  , m_buffer(size,0)
215  { }
216 
217  inline void decode_literal(uliteral_t c) {
218  m_buffer[m_cursor++] = c;
219  DCHECK(c != 0 || m_cursor == m_buffer.size()); // we assume that the text to restore does not contain a NULL-byte but at its very end
220  }
221 
222  inline void decode_factor(const len_t source_position, const len_t factor_length) {
223  bool factor_stored = false;
224  for(len_t i = 0; i < factor_length; ++i) {
225  const len_t src_pos = source_position+i;
226  if(m_buffer[src_pos]) {
227  m_buffer[m_cursor] = m_buffer[src_pos];
228  }
229  else if(factor_stored == false) {
230  factor_stored = true;
231  m_target_pos.push_back(m_cursor);
232  m_source_pos.push_back(src_pos);
233  m_length.push_back(factor_length-i);
234  }
235  ++m_cursor;
236  }
237  }
238 
239  IF_STATS(
240  inline len_t longest_chain() const {
241  return m_longest_chain;
242  })
243 
244  inline void write_to(std::ostream& out) const {
245  for(auto c : m_buffer) out << c;
246  }
247 };
248 
249 }} //ns
250 
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
A vector over arbitrary unsigned integer types.
Definition: IntVector.hpp:175
uint8_t uliteral_t
Type to represent signed single literals.
Definition: def.hpp:131
void push_back(const value_type &val)
Definition: IntVector.hpp:426
std::string to_string(tdc::uint_impl_t< N > value)
Definition: uint_t.hpp:241
void decode_factor(const len_t source_position, const len_t factor_length)
Definition: ScanDec.hpp:222
Provides access to runtime and memory measurement in statistics phases.
Definition: StatPhase.hpp:44
void split(const char *new_title)
Starts a new phase as a sibling, reusing the same object.
Definition: StatPhase.hpp:264
ScanDec(ScanDec &&other)
Definition: ScanDec.hpp:203
EagerScanDec(Env &env, IntVector< uliteral_t > &buffer)
Definition: ScanDec.hpp:37
Runs a number of scans of the factors.
Definition: ScanDec.hpp:146
void decode_literal(uliteral_t c)
Definition: ScanDec.hpp:217
static Meta meta()
Definition: ScanDec.hpp:148
IF_STATS(inline len_t longest_chain() const { return m_longest_chain;}) ~EagerScanDec()
Definition: ScanDec.hpp:124
uint32_t len_compact_t
Type to represent an bit-compact length value.
Definition: def.hpp:103
len_t rank(len_t i) const
Definition: ScanDec.hpp:56
fast_t< len_compact_t > len_t
Type to represent an length value.
Definition: def.hpp:114
len_compact_t pos
Definition: LZSSFactors.hpp:38
#define IF_STATS(x)
x is compiled only when the STATS_DISABLED macro is undefined.
Definition: def.hpp:59
void decode(const std::vector< len_compact_t > &m_target_pos, const std::vector< len_compact_t > &m_source_pos, const std::vector< len_compact_t > &m_length)
Definition: ScanDec.hpp:61
void log_stat(const char *key, const T &value)
Logs a user statistic for this phase.
Definition: StatPhase.hpp:298
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
Definition: StatPhase.hpp:218
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Definition: StatPhase.hpp:143
void decode_literal_at(len_t pos, uliteral_t c)
Definition: ScanDec.hpp:100
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
Local environment for a compression/encoding/decompression call.
size_type size() const
Definition: IntVector.hpp:284
Interface for algorithms.
Definition: Algorithm.hpp:15
ScanDec(Env &&env, len_t size)
Definition: ScanDec.hpp:210
void dynamic()
Declares that this option accepts values of a simple type that can be parsed from a string (e...
Definition: Meta.hpp:150