tudocomp
– The TU Dortmund Compression Framework
EncodeStrategy.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 
4 
6 #include <tudocomp/Range.hpp>
7 #include <tudocomp/util.hpp>
8 #include <vector>
9 #include <tuple>
10 
11 #include <tudocomp/io.hpp>
12 
13 
16 
17 
19 
20 
21 
22 
23 
24 #include <tudocomp/ds/TextDS.hpp>
25 
26 #include <tudocomp/Algorithm.hpp>
27 
28 #include <tudocomp/Literal.hpp>
31 
33 
34 
35 //#include <tudocomp/tudocomp.hpp>
36 
37 
38 namespace tdc {
39 namespace lfs {
40 
41 template<typename literal_coder_t, typename len_coder_t >
42 class EncodeStrategy : public Algorithm {
43 private:
44 
45  typedef std::tuple<uint,uint,uint> non_term;
46  typedef std::vector<non_term> non_terminal_symbols;
47 
48  typedef std::vector<std::pair<uint,uint>> rules;
49 
50 public:
51 
52  using Algorithm::Algorithm; //import constructor
53 
54  inline static Meta meta() {
55  Meta m("lfs_comp_enc", "lfs_enocde_strat");
56  m.option("lfs_lit_coder").templated<literal_coder_t, HuffmanCoder>("lfs_lit_coder");
57  m.option("lfs_len_coder").templated<len_coder_t, EliasGammaCoder>("lfs_len_coder");
58  return m;
59  }
60 
61 
62  inline void encode(io::InputView & in, Output & output, rules & dictionary, non_terminal_symbols & nts_symbols){
63 
64 
65  // encode dictionary:
66  DLOG(INFO) << "encoding dictionary symbol sizes ";
67 
68  std::shared_ptr<BitOStream> bitout = std::make_shared<BitOStream>(output);
69  typename literal_coder_t::Encoder lit_coder(
70  env().env_for_option("lfs_lit_coder"),
71  bitout,
72  ViewLiterals(in)
73  );
74  typename len_coder_t::Encoder len_coder(
75  env().env_for_option("lfs_len_coder"),
76  bitout,
77  NoLiterals()
78  );
79 
80  auto it = dictionary.begin();
81 
82  Range intrange (0, UINT_MAX);//uint32_r
83  if(dictionary.size() >=1 ){
84 
85  std::pair<uint,uint> symbol = *it;
86  uint last_length=symbol.second;
87  Range s_length_r (0,last_length);
88  len_coder.encode(last_length,intrange);
89  it++;
90 
91  while (it != dictionary.end()){
92  symbol=*it;
93  len_coder.encode(last_length-symbol.second,s_length_r);
94  last_length=symbol.second;
95  it++;
96 
97  }
98  len_coder.encode(symbol.second,s_length_r);
99  }else {
100  len_coder.encode(0,intrange);
101 
102  }
103 
104  long buf_size = bitout->tellp();
105 
106  StatPhase::log("Bytes Length Encoding", buf_size);
107  uint literals=0;
108 
109 
110  DLOG(INFO) << "encoding dictionary symbols";
111  // encode dictionary strings:
112  if(dictionary.size() >=1 ){
113  auto it = dictionary.begin();
114  std::pair<uint,uint> symbol;
115 
116  while(it != dictionary.end()){
117  //first length of non terminal symbol
118  symbol = *it;
119 
120  for(uint k = 0; k<symbol.second; k++){
121  lit_coder.encode(in[symbol.first + k],literal_r);
122  literals++;
123  }
124  it++;
125  }
126  }
127 
128  StatPhase::log("Literals in Dictionary", literals);
129 
130  buf_size = long(bitout->tellp()) - buf_size;
131  StatPhase::log("Bytes Non-Terminal Symbol Encoding", buf_size);
132 
133  literals=0;
134 
135  Range dict_r(0, dictionary.size());
136  //encode string
137  uint pos = 0;
138  uint start_position;
139  uint symbol_number ;
140  uint symbol_length;
141  bool first_char = true;
142  for(auto it = nts_symbols.begin(); it!= nts_symbols.end(); it++){
143 
144 
145  std::tuple<uint,uint,uint> next_position = *it;
146 
147  start_position = std::get<0>(next_position);
148  symbol_number =std::get<1>(next_position);
149  symbol_length = std::get<2>(next_position);
150 
151  while(pos< start_position){
152  //get original text, because no symbol...
153  lit_coder.encode(0, bit_r);
154  lit_coder.encode(in[pos], literal_r);
155 
156  literals++;
157 
158  if(first_char){
159  first_char=false;
160  }
161  pos++;
162  }
163 
164  //write symbol number
165 
166  lit_coder.encode(1, bit_r);
167  lit_coder.encode(symbol_number, dict_r);
168 
169  pos += symbol_length;
170 
171  }
172  //if no more terminals, write rest of text
173  while( pos<in.size()){
174  lit_coder.encode(0, bit_r);
175  lit_coder.encode(in[pos], literal_r);
176  pos++;
177 
178  literals++;
179  }
180 
181  buf_size = long(bitout->tellp()) - buf_size;
182  StatPhase::log("Bytes Start Symbol Encoding", buf_size);
183 
184  StatPhase::log("Literals in Start Symbol", literals);
185 
186  DLOG(INFO) << "compression with lfs done";
187 
188  }
189 
190  inline void decode(Input & input, Output & output){
191 
192  DLOG(INFO) << "decompress lfs";
193  std::shared_ptr<BitIStream> bitin = std::make_shared<BitIStream>(input);
194 
195  typename literal_coder_t::Decoder lit_decoder(
196  env().env_for_option("lfs_lit_coder"),
197  bitin
198  );
199  typename len_coder_t::Decoder len_decoder(
200  env().env_for_option("lfs_len_coder"),
201  bitin
202  );
203  Range int_r (0,UINT_MAX);
204 
205  uint symbol_length = len_decoder.template decode<uint>(int_r);
206  Range slength_r (0, symbol_length);
207  std::vector<uint> dict_lengths;
208  dict_lengths.reserve(symbol_length);
209  dict_lengths.push_back(symbol_length);
210  while(symbol_length>0){
211 
212  uint current_delta = len_decoder.template decode<uint>(slength_r);
213  symbol_length-=current_delta;
214  dict_lengths.push_back(symbol_length);
215  }
216  dict_lengths.pop_back();
217 
218  std::vector<std::string> dictionary;
219  uint dictionary_size = dict_lengths.size();
220 
221  Range dictionary_r (0, dictionary_size);
222 
223 
224  uint length_of_symbol;
225  std::string non_terminal_symbol;
226  DLOG(INFO) << "reading dictionary";
227  for(uint i = 0; i< dict_lengths.size();i++){
228  non_terminal_symbol ="";
229  char c1;
230  length_of_symbol=dict_lengths[i];
231  for(uint i =0; i< length_of_symbol;i++){
232  c1 = lit_decoder.template decode<char>(literal_r);
233  non_terminal_symbol += c1;
234  }
235  dictionary.push_back(non_terminal_symbol);
236  }
237  auto ostream = output.as_stream();
238 
239  while(!lit_decoder.eof()){
240  //decode bit
241  bool bit1 = lit_decoder.template decode<bool>(bit_r);
242  char c1;
243  uint symbol_number;
244  // if bit = 0 its a literal
245  if(!bit1){
246  c1 = lit_decoder.template decode<char>(literal_r); // Dekodiere Literal
247 
248  ostream << c1;
249  } else {
250  //else its a non-terminal
251  symbol_number = lit_decoder.template decode<uint>(dictionary_r); // Dekodiere Literal
252 
253  if(symbol_number < dictionary.size()){
254 
255  ostream << dictionary.at(symbol_number);
256  } else {
257  DLOG(INFO)<< "too large symbol: " << symbol_number;
258  }
259 
260  }
261  }
262  }
263 
264 
265 
266 };
267 }
268 }
269 
Represents a generic range of positive integers.
Definition: Range.hpp:16
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
constexpr auto bit_r
Global predefined range for bits (0 or 1).
Definition: Range.hpp:108
unsigned int uint
Definition: characterhash.h:6
size_type size() const
Returns size of the View.
A literal iterator that yields every character from a View.
Definition: Literal.hpp:41
void decode(Input &input, Output &output)
Algorithm(Algorithm const &)=default
Env & env()
Provides access to the environment that the algorithm works in.
Definition: Algorithm.hpp:51
OutputStream as_stream() const
Creates a stream that allows for character-wise output.
An empty literal iterator that yields no literals whatsoever.
Definition: Literal.hpp:37
constexpr auto literal_r
Global predefined reange for literals.
Definition: Range.hpp:111
An abstraction layer for algorithm output.
Definition: Output.hpp:23
len_compact_t pos
Definition: LZSSFactors.hpp:38
Defines data encoding to and decoding from a stream of Elias-Gamma codes.
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
Definition: StatPhase.hpp:218
void encode(io::InputView &in, Output &output, rules &dictionary, non_terminal_symbols &nts_symbols)
void templated(const std::string &accepted_type)
Declares that this option accepts values of the specified Algorithm type T.
Definition: Meta.hpp:93
Provides a view on the input that allows for random access.
Definition: InputView.hpp:29
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
Interface for algorithms.
Definition: Algorithm.hpp:15
An abstraction layer for algorithm input.
Definition: Input.hpp:37