tudocomp
– The TU Dortmund Compression Framework
compressors/esp/ArithmeticCoder.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <sstream>
4 #include <tudocomp/Coder.hpp>
5 #include <iostream>
6 
7 namespace tdc {namespace esp {
8 
11  private:
12  using CountMap = std::vector<size_t>;
13 
14  CountMap m_count_map;
15  size_t m_codebook_size = 0;
16  size_t m_literal_count = 0;
17  size_t m_min_range = std::numeric_limits<size_t>::max();
18  size_t m_literal_counter = 0;
19 
20  size_t m_lower_bound = 0;
21  size_t m_upper_bound = std::numeric_limits<size_t>::max();
22 
23  std::shared_ptr<BitOStream> m_out;
24 
30  template<class T>
31  CountMap count_alphabet_literals(const T& input) {
32  size_t max_value = 0;
33  for (size_t i = 0; i < input.size(); i++) {
34  max_value = std::max(max_value, input[i]);
35  }
36  CountMap ret;
37  size_t overrite = 0; //ULITERAL_MAX+1
38  ret.reserve(std::max(max_value + 1, overrite));
39  ret.resize(std::max(max_value + 1, overrite));
40 
41  for (size_t i = 0; i < input.size(); i++) {
42  size_t val = input[i];
43  DCHECK_LT(ret[val], std::numeric_limits<size_t>::max());
44  ret[val]++;
45  }
46 
47  std::cout << vec_to_debug_string(ret) << "\n";
48 
49  return ret;
50  }
51 
57  void build_intervals(CountMap& c) {
58  if(c[0]) {
59  m_codebook_size++;
60  }
61  size_t min = std::numeric_limits<size_t>::max();
62  //calculates difference to the entry before, searches min and counts entries != 0
63  for(size_t i = 1; i < c.size(); i++) {
64  if(c[i] != 0) {
65  m_codebook_size++;
66  min = std::min(min, c[i]);
67  }
68  c[i] = c[i] + c[i - 1]; // ?
69  }
70  m_literal_count = c[c.size() - 2]; // ???
71 
72  //normalize all Intervals
73  for(size_t i = 1; i < c.size(); i++) {
74  c[i] = c[i] / min;
75  }
76  m_min_range = c[c.size() - 2]; // ???
77  }
78 
79 
80  inline void setNewBounds(size_t v) {
81  size_t range = m_upper_bound - m_lower_bound;
82  //check if all characters can be uniquely mapped to range
83  if(range < m_min_range) {
84  writeCode(m_lower_bound);
85  m_lower_bound = 0;
86  m_upper_bound = std::numeric_limits<size_t>::max();
87  range = m_upper_bound - m_lower_bound;
88  }
89  DCHECK_NE(m_lower_bound, m_upper_bound);
90 
91  const size_t literal_count = m_count_map[m_count_map.size() - 1];
92 
93  //unsure if this is needed
94  const size_t offset_upper = range <= literal_count
95  ? (range * m_count_map[v]) / literal_count
96  : (range / literal_count) * m_count_map[v];
97 
98  m_upper_bound = m_lower_bound + offset_upper;
99 
100  if(v != 0) { //first interval starts at zero
101  const size_t offset_lower = range <= literal_count
102  ? (range * m_count_map[v - 1]) / literal_count
103  : (range / literal_count) * m_count_map[v - 1];
104  m_lower_bound = m_lower_bound + offset_lower;
105  }
106  }
107 
108  inline void writeCodebook() {
115 
116  // TODO: Bit compressed integers benutzen
117 
118  //write count of expected chars
119  m_out->write_int<size_t>(m_literal_count);
120  std::cout<< " " << __LINE__ << " write " << size_t(m_literal_count) << " \n";
121 
122  //write codebook size in outstream
123  m_out->write_int<size_t>(m_codebook_size);
124  std::cout<< " " << __LINE__ << " write " << size_t(m_codebook_size) << " \n";
125 
126  if(m_count_map[0] != 0) {
127  m_out->write_int<size_t>(0);
128  std::cout<< " " << __LINE__ << " write " << size_t(0) << " \n";
129  m_out->write_int<size_t>(m_count_map[0]);
130  std::cout<< " " << __LINE__ << " write " << size_t(m_count_map[0]) << " \n";
131  }
132  for(size_t i = 1; i < m_count_map.size(); i++) {
133  if(m_count_map[i] != m_count_map[i-1]) {
134  m_out->write_int<size_t>(i);
135  std::cout<< " " << __LINE__ << " write " << size_t(i) << " \n";
136  m_out->write_int<size_t>(m_count_map[i]);
137  std::cout<< " " << __LINE__ << " write " << size_t(m_count_map[i]) << " \n";
138  }
139  }
140  }
141 
142  inline void writeCode(size_t code) {
143  m_out->write_int<size_t>(code);
144  std::cout<< " " << __LINE__ << " write " << size_t(code) << " \n";
145  }
146 
147  //write last code-block
148  void postProcessing() {
149  writeCode(m_lower_bound);
150  //dummy codeblock - needed to read until no more code-blocks
151  // TODO: Make sure this works with bit optimal encoding
152  m_out->write_int<size_t>(std::numeric_limits<size_t>::max());
153  std::cout<< " " << __LINE__ << " write " << size_t(std::numeric_limits<size_t>::max()) << " \n";
154  }
155 
156  public:
157  template<typename input_t>
158  ArithmeticEncoder(const std::shared_ptr<BitOStream>& out,
159  const input_t& literals):
160  m_out(out),
161  m_count_map(count_alphabet_literals(literals))
162  {
163  build_intervals(m_count_map);
164  writeCodebook();
165  }
166 
167  inline void encode(size_t v) {
168  m_literal_counter++;
169  setNewBounds(v);
170 
171  if(m_literal_counter == m_literal_count) {
172  postProcessing();
173  }
174  }
175  };
176 
179  private:
180  std::shared_ptr<BitIStream> m_in;
181 
182  size_t m_codebook_size = 0;
183  size_t m_literal_count = 0;
184  std::vector<std::pair<size_t, size_t>> m_literals;
185  size_t m_min_range = std::numeric_limits<size_t>::max();
186  size_t m_literals_read = 0;
187  size_t m_literal_counter = 0;
188 
189  std::vector<size_t> m_decoded;
190 
191  void decode(size_t code) {
192  size_t lower_bound = 0;
193  size_t upper_bound = std::numeric_limits<size_t>::max();
194  std::vector<size_t> os;
195  size_t interval_parts = m_literals[m_codebook_size - 1].second;
196  //count of characters in stream
197 
198  size_t range = upper_bound - lower_bound;
199 
200  //stop decoding code-bllock when range is too small or all literals read
201  while(m_min_range <= range && m_literal_counter < m_literal_count) {
202  size_t interval_lower_bound = lower_bound;
203 
204  //search the right interval
205  for(size_t i = 0; i < m_codebook_size ; i++) {
206  const std::pair<size_t, size_t>& pair = m_literals[i];
207 
208  const size_t offset = range <= interval_parts
209  ? range * pair.second / interval_parts
210  : range / interval_parts * pair.second;
211 
212  upper_bound = lower_bound + offset;
213 
214  std::cout << "i: " << i << ", code: " << code << ", upper_bound: " << upper_bound << "\n";
215 
216  if(code < upper_bound) {
217  //character decoded
218  os.push_back(pair.first);
219  lower_bound = interval_lower_bound;
220  break;
221  }
222  interval_lower_bound = upper_bound;
223  }
224  m_literal_counter++;
225  range = upper_bound - lower_bound;
226  }
227 
228  m_decoded = os;
229 
230  std::cout << "decoded: " << vec_to_debug_string(m_decoded) << "\n";
231 
232  m_literals_read = 0;
233  }
234 
235  public:
236  ArithmeticDecoder(const std::shared_ptr<BitIStream>& in):
237  m_in(in)
238  {
239  //read codebook size
240  m_literal_count = m_in->read_int<size_t>();
241  std::cout<< " " << __LINE__ << " read " << size_t(m_literal_count) << " \n";
242  m_codebook_size = m_in->read_int<size_t>();
243  std::cout<< " " << __LINE__ << " read " << size_t(m_codebook_size) << " \n";
244 
245  m_literals.reserve(m_codebook_size);
246  m_literals.resize(m_codebook_size);
247 
248  //read and parse dictionary - it is already "normalized"
249  for (size_t i = 0; i < m_codebook_size; i++) {
250  size_t c = m_in->read_int<size_t>();
251  std::cout<< " " << __LINE__ << " read " << size_t(c) << " \n";
252  size_t val = m_in->read_int<size_t>();
253  std::cout<< " " << __LINE__ << " read " << size_t(val) << " \n";
254  m_literals[i] = std::pair<size_t, size_t>(c, val);
255  }
256 
257  m_min_range = m_literals[m_codebook_size - 1].second;
258  }
259 
260  inline size_t decode() {
261  //read code if nothing buffered
262  if(!m_decoded.size()) {
263  std::cout<< " " << __LINE__ << " cond 1 " << " \n";
264  size_t code = m_in->read_int<size_t>();
265  std::cout<< " " << __LINE__ << " read " << size_t(code) << " \n";
266  // TODO: NB! Special care if bit optimal
267  if(code != std::numeric_limits<size_t>::max()) {
268  std::cout<< " " << __LINE__ << " cond 2 " << " \n";
269  //code must not be a dummy-code
270  decode(code);
271  }
272  }
273 
274  size_t val = m_decoded[m_literals_read++];
275  std::cout << "val: " << size_t(val) << ", lread: " << m_literals_read << "\n";
276 
277  //if all buffered literals are read: decode the next buffer
278  if(m_literals_read == m_decoded.size()) {
279  std::cout<< " " << __LINE__ << " cond 3 " << " \n";
280  size_t code = m_in->read_int<size_t>();
281  std::cout<< " " << __LINE__ << " read " << size_t(code) << " \n";
282  // TODO: NB! Special care if bit optimal
283  if(code != std::numeric_limits<size_t>::max()) {
284  std::cout<< " " << __LINE__ << " cond 4 " << " \n";
285  //code must not be a dummy-code
286  decode(code);
287  }
288  }
289 
290  return val;
291  }
292  };
293 }}
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Encodes data to an ASCII character stream.
std::string vec_to_debug_string(const T &s, size_t indent=0)
Builds the string representation of a vector of byte values, sorrounded by square brackets ([ and ])...
Decodes data from an Arithmetic character stream.
ArithmeticEncoder(const std::shared_ptr< BitOStream > &out, const input_t &literals)
ArithmeticDecoder(const std::shared_ptr< BitIStream > &in)