tudocomp
– The TU Dortmund Compression Framework
test/test/util.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <cstdint>
4 #include <fstream>
5 #include <iostream>
6 #include <string>
7 #include <memory>
8 
9 #include <glog/logging.h>
10 #include <gtest/gtest.h>
11 
12 #include <sys/stat.h>
13 
14 #include <tudocomp/Algorithm.hpp>
16 #include <tudocomp/Env.hpp>
17 #include <tudocomp/Registry.hpp>
18 #include <tudocomp/Compressor.hpp>
20 #include <tudocomp/io.hpp>
21 #include <tudocomp/util/View.hpp>
26 #include <tudocomp/Literal.hpp>
27 #include <tudocomp/Range.hpp>
28 #include <tudocomp/io/Path.hpp>
29 
30 namespace tdc {
31 namespace test {
32 
33 // TODO: Actually specialize the 3 kinds
34 
36 template<class T, class U>
37 void assert_eq_strings(const T& expected_, const U& actual_) {
38  std::string expected(expected_.begin(), expected_.end());
39  std::string actual(actual_.begin(), actual_.end());
40 
41  ASSERT_EQ(expected, actual);
42 }
43 
45 template<class T, class U>
46 void assert_eq_integers(const T& expected_, const U& actual_) {
47  std::vector<uint64_t> expected(expected_.begin(), expected_.end());
48  std::vector<uint64_t> actual(actual_.begin(), actual_.end());
49 
50  ASSERT_EQ(expected, actual);
51 }
52 
/// Error diagnostic optimized for mixed binary/ascii data.
///
/// Currently this is a plain `ASSERT_EQ` on the two arguments without
/// any conversion, so `T` and `U` must be directly comparable.
///
/// \param expected The expected value.
/// \param actual   The value actually produced.
template<class T, class U>
void assert_eq_hybrid_strings(const T& expected, const U& actual)
{
    ASSERT_EQ(expected, actual);
}
58 
/// Error diagnostic optimized for arbitrary data.
///
/// Compares two indexable sequences: first their sizes, then every
/// element pair in order. The first mismatch aborts the calling test
/// function's helper with a message that names the offending index.
///
/// \param expected The expected sequence (needs `size()` and `operator[]`).
/// \param actual   The sequence actually produced (same requirements).
template<class T, class U>
void assert_eq_sequence(const T& expected, const U& actual) {
    ASSERT_EQ(expected.size(), actual.size())
        << "assert_eq_sequence: sizes differ";

    size_t i = 0;
    while (i < expected.size()) {
        ASSERT_EQ(expected[i], actual[i])
            << "assert_eq_sequence: failed at i=" << i;
        i++;
    }
}
66 
/// Temporarily provides an ostream to write into, and returns what was
/// written as a string.
///
/// \param f A callable that is invoked once with a `std::ostream&`.
/// \return  Everything `f` wrote to the stream.
template<class Lambda>
std::string ostream_to_string(Lambda f) {
    std::stringstream buffer;

    // Hand out the stream through its std::ostream interface only.
    f(static_cast<std::ostream&>(buffer));

    return buffer.str();
}
81 
/// Temporarily provides an ostream to write into, and returns what was
/// written as a byte vector.
///
/// \param f A callable that is invoked once with a `std::ostream&`.
/// \return  Everything `f` wrote to the stream, one byte per character.
template<class Lambda>
std::vector<uint8_t> ostream_to_bytes(Lambda f) {
    const std::string written = ostream_to_string(f);

    std::vector<uint8_t> bytes;
    bytes.assign(written.begin(), written.end());
    return bytes;
}
95 
98 template<class F>
99 void roundtrip_batch(F f) {
100  f("abcdebcdeabc"_v);
101  f("a"_v);
102  f(""_v);
103 
104  f("aaaaaaaaa"_v); \
105  f("banana"_v); \
106  f("ananas"_v); \
107  f("abcdefgh#defgh_abcde"_v); \
108 
109  f("abcdebcdeabcd"_v);
110  f("foobar"_v);
111  f("abcabcabcabc"_v);
112 
113  f("abc abc abc"_v);
114 
115  f("abaaabbababb"_v);
116 
117  f(
118  "asdfasctjkcbweasbebvtiwetwcnbwbbqnqxernqzezwuqwezuet"
119  "qcrnzxbneqebwcbqwicbqcbtnqweqxcbwuexcbzqwezcqbwecqbw"
120  "dassdasdfzdfgfsdfsdgfducezctzqwebctuiqwiiqcbnzcebzqc"_v);
121 
122  f("ประเทศไทย中华Việt Nam"_v);
123 
124  f(
125  "Lorem ipsum dolor sit amet, sea ut etiam solet salut"
126  "andi, sint complectitur et his, ad salutandi imperdi"
127  "et gubergren per mei."_v);
128 
129  f(
130  "Лорэм атоморюм ут хаж, эа граэки емпыдит ёудёкабет "
131  "мэль, декам дежпютатионй про ты. Нэ ёужто жэмпэр"
132  " жкрибэнтур векж, незл коррюмпит."_v);
133 
134  f(
135  "報チ申猛あち涙境ワセ周兵いわ郵入せすをだ漏告されて話巡わッき"
136  "や間紙あいきり諤止テヘエラ鳥提フ健2銀稿97傷エ映田ヒマ役請多"
137  "暫械ゅにうて。関国ヘフヲオ場三をおか小都供セクヲ前俳著ゅ向深"
138  "まも月10言スひす胆集ヌヱナ賀提63劇とやぽ生牟56詰ひめつそ総愛"
139  "ス院攻せいまて報当アラノ日府ラのがし。"_v);
140 
141  f(
142  "Εαμ ανσιλλαε περισυλα συαφιθαθε εξ, δυο ιδ ρεβυμ σομ"
143  "μοδο. Φυγιθ ηομερω ιυς ατ, ει αυδιρε ινθελλεγαμ νες."
144  " Ρεκυε ωμνιυμ μανδαμυς κυο εα. Αδμοδυμ σωνσεκυαθ υθ "
145  "φιξ, εσθ ετ πρωβατυς συαφιθαθε ραθιονιβυς, ταντας αυ"
146  "διαμ ινστρυσθιορ ει σεα."_v);
147 
148  f("struct Foo { uint8_t bar }"_v);
149 
150  f("ABBCBCABA"_v);
151 
152  f("abcabca"_v);
153 
154  f("abbbbbbbbbbcbbbbbbbbbb"_v);
155 
156  //f("abc\0"_v);
157 
158  std::vector<uint8_t> all_bytes {
159  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
160  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
161  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
162  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
163  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
164  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
165  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
166  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
167  128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
168  144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
169  160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
170  176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
171  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
172  208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
173  224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
174  240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
175  };
176 
177  //f(View(all_bytes));
178 }
179 
180 template<class F>
181 void on_string_generators(F func, size_t n) {
182  VLOG(1) << "fibonacci_word ...";
183  for(size_t i = 0; i < n; ++i) {
184  std::string s = FibonacciGenerator::generate(i);
185  func(s);
186  }
187 
188  VLOG(1) << "thue_morse_word ...";
189  for(size_t i = 0; i < n; ++i) {
190  std::string s = ThueMorseGenerator::generate(i);
191  func(s);
192  }
193 
194  VLOG(1) << "rich ...";
195  for(size_t i = 0; i < n; ++i) {
196  std::string s = RunRichGenerator::generate(i);
197  func(s);
198  }
199 
200  VLOG(1) << "random ...";
201  for(size_t i = 2; i < n; ++i) {
202  for(size_t j = 0; j < 2+50/(i+1); ++j) {
203  std::string s = RandomUniformGenerator::generate(1<<i, j);
204  func(s);
205  }
206  }
207 }
208 
/// Name of the directory (relative to the working directory) that holds
/// the files created and read by the test helpers.
const std::string TEST_FILE_PATH = "test_files";

/// Builds the path of a test file.
///
/// \param filename File name relative to the test directory.
/// \return `TEST_FILE_PATH`, a `/`, and `filename` concatenated.
inline std::string test_file_path(const std::string& filename) {
    std::string path = TEST_FILE_PATH;
    path += "/";
    path += filename;
    return path;
}
214 
215 inline bool test_file_exists(const std::string& filename) {
216  std::string test_file_name = test_file_path(filename);
217 
218  struct stat buf;
219  return (stat(test_file_name.c_str(), &buf) == 0);
220 }
221 
222 inline std::string read_test_file(const std::string& filename) {
223  std::ostringstream sout;
224 
225  std::string test_file_name = test_file_path(filename);
226  std::ifstream fin(test_file_name);
227  if(fin) {
228  sout << fin.rdbuf();
229 
230  fin.close();
231  } else {
232  std::string msg = "Could not open test file \"";
233  msg += test_file_name;
234  msg += "\"";
235  throw std::runtime_error(msg);
236  }
237 
238  auto s = sout.str();
239 
240  return s;
241 }
242 
243 inline void create_test_directory() {
244  mkdir(TEST_FILE_PATH.c_str(), 0777);
245 }
246 
247 inline void write_test_file(const std::string& filename, string_ref text) {
249  std::ofstream fout(test_file_path(filename));
250  if(fout) {
251  fout << text;
252  fout.close();
253  }
254 }
255 
256 inline void remove_test_file(const std::string& filename) {
258  remove(test_file_path(filename).c_str());
259 }
260 
261 inline std::vector<uint8_t> pack_integers(std::vector<uint64_t> ints) {
262  CHECK(ints.size() % 2 == 0);
263  std::vector<uint8_t> bits;
264 
265  uint bit_pos = 8;
266  for (size_t i = 0; i < ints.size(); i += 2) {
267  uint64_t val = ints[i];
268  uint64_t val_bits = ints[i + 1];
269  for (uint64_t bit = 0; bit < val_bits; bit++) {
270  if (bit_pos == 8) {
271  bits.push_back(0);
272  bit_pos = 0;
273  }
274 
275  uint8_t& b = bits[bits.size() - 1];
276  if (val & (uint64_t(1) << (val_bits - bit - 1))) {
277  b |= (1 << (7 - bit_pos));
278  }
279 
280  bit_pos++;
281  }
282  }
283 
284  return bits;
285 }
286 
/// Builds a per-character diff marker line for two strings.
///
/// The result is as long as the longer input. Position `i` holds `'-'`
/// if both strings have a character at `i` and the characters are equal,
/// and `'#'` otherwise (mismatch, or one string already ended).
///
/// Declared `inline` because it is defined in a header; without it every
/// translation unit including this file would emit its own definition.
///
/// \param a First string.
/// \param b Second string.
/// \return The marker line.
inline std::string format_diff(const std::string& a, const std::string& b) {
    const size_t common = std::min(a.size(), b.size());
    const size_t total = std::max(a.size(), b.size());

    std::string diff;
    diff.reserve(total);
    for(size_t i = 0; i < total; i++) {
        const bool same = (i < common) && (a[i] == b[i]);
        diff.push_back(same ? '-' : '#');
    }
    return diff;
}
300 
/// Builds a per-character diff marker line for two bit strings that use
/// spaces as group separators.
///
/// The result is as long as the longer input. Position `i` holds
/// - `' '` if both strings have a space at `i`,
/// - `'-'` if both strings have the same non-space character at `i`,
/// - `'#'` otherwise (mismatch, or one string already ended).
///
/// Declared `inline` because it is defined in a header; without it every
/// translation unit including this file would emit its own definition.
///
/// \param a First string.
/// \param b Second string.
/// \return The marker line.
inline std::string format_diff_bin(const std::string& a, const std::string& b) {
    const size_t common = std::min(a.size(), b.size());
    const size_t total = std::max(a.size(), b.size());

    std::string diff;
    diff.reserve(total);
    for(size_t i = 0; i < total; i++) {
        const bool in_both = i < common;
        if (in_both && a[i] == ' ' && b[i] == ' ') {
            diff.push_back(' ');
        } else if (in_both && a[i] == b[i]) {
            diff.push_back('-');
        } else {
            diff.push_back('#');
        }
    }
    return diff;
}
318 
319 using PacketIntegers = std::vector<uint64_t>;
321  auto out = test::pack_integers(expected);
322  if (actual != out) {
323  auto print_bits = [&expected](string_ref s, bool byte_units = false) -> std::string {
324  std::stringstream ss;
325 
326  // iterate over bits
327 
328  size_t packed_i = 0;
329  size_t last_packed_i = 0;
330 
331  for (uint64_t i = 0; i < s.size() * 8; i++) {
332  if (!byte_units && packed_i < expected.size() && i == last_packed_i + expected[packed_i + 1]) {
333  ss << " ";
334  last_packed_i = i;
335  packed_i += 2;
336  } else if (byte_units && i > 0 && i % 8 == 0) {
337  ss << " ";
338  }
339 
340  uint8_t c = s[i / 8];
341  ss << int((c >> (8 - (i % 8) - 1)) & 1);
342  }
343 
344  return ss.str();
345  };
346 
347  auto p_is = print_bits(actual);
348  auto p_should = print_bits(out);
349  auto p_diff = format_diff_bin(p_is, p_should);
350 
351  auto p_is_b = print_bits(actual, true);
352  auto p_should_b = print_bits(out, true);
353  auto p_diff_b = format_diff_bin(p_is_b, p_should_b);
354 
355  FAIL()
356  << "Should Be: " << p_should << "\n"
357  << " Is: " << p_is << "\n"
358  << " Diff: " << p_diff << "\n"
359  << "As Bytes:" << "\n"
360  << "Should Be: " << p_should_b << "\n"
361  << " Is: " << p_is_b << "\n"
362  << " Diff: " << p_diff_b << "\n";
363  }
364 }
365 
366 template<class C>
368 private:
369  Registry<Compressor> m_registry;
370 public:
371  std::vector<uint8_t> bytes;
372  std::string str;
373  std::string orginal_text;
374  std::string options;
375 
377  std::vector<uint8_t>&& p_bytes,
378  std::string&& p_str,
379  std::string&& p_original,
380  std::string&& p_options):
381  m_registry(registry),
382  bytes(std::move(p_bytes)),
383  str(std::move(p_str)),
384  orginal_text(std::move(p_original)),
385  options(std::move(p_options)) {}
386 
388  std::vector<uint8_t> decoded_buffer;
389  {
390  Input text_in = Input::from_memory(bytes);
391  Output decoded_out = Output::from_memory(decoded_buffer);
392 
393  auto compressor = create_algo_with_registry<C>(options, m_registry);
394 
395  if (C::meta().textds_flags().has_restrictions()) {
396  decoded_out = Output(decoded_out, C::meta().textds_flags());
397  }
398 
399  compressor.decompress(text_in, decoded_out);
400  }
401  std::string decompressed_text {
402  decoded_buffer.begin(),
403  decoded_buffer.end(),
404  };
405  ASSERT_EQ(orginal_text, decompressed_text);
406  }
407 
409  std::vector<uint8_t> decompressed_bytes;
410  {
411  Input text_in = Input::from_memory(bytes);
412  Output decoded_out = Output::from_memory(decompressed_bytes);
413 
414  auto compressor = create_algo_with_registry<C>(options, m_registry);
415 
416  if (C::meta().textds_flags().has_restrictions()) {
417  decoded_out = Output(decoded_out, C::meta().textds_flags());
418  }
419 
420  compressor.decompress(text_in, decoded_out);
421  }
422  std::vector<uint8_t> orginal_bytes {
423  orginal_text.begin(),
424  orginal_text.end(),
425  };
426  ASSERT_EQ(orginal_bytes, decompressed_bytes);
427  }
428 };
429 
430 template<class C>
431 class RoundTrip {
432  std::string m_options;
433  Registry<Compressor> m_registry;
434 public:
435  inline RoundTrip(const std::string& options = "",
436  const Registry<Compressor>& registry = Registry<Compressor>("compressor")):
437  m_options(options),
438  m_registry(registry)
439  {
440  }
441 
443  std::vector<uint8_t> encoded_buffer;
444  {
445  Input text_in = Input::from_memory(text);
446  Output encoded_out = Output::from_memory(encoded_buffer);
447 
448  auto compressor = create_algo_with_registry<C>(m_options, m_registry);
449 
450  if (C::meta().textds_flags().has_restrictions()) {
451  text_in = Input(text_in, C::meta().textds_flags());
452  }
453  compressor.compress(text_in, encoded_out);
454  }
455  std::string s(encoded_buffer.begin(), encoded_buffer.end());
456  return CompressResult<C> {
457  m_registry,
458  std::move(encoded_buffer),
459  std::move(s),
460  std::string(text),
461  std::string(m_options),
462  };
463  }
464 };
465 
466 template<class T>
468  const std::string& options = "",
469  const Registry<Compressor>& registry = Registry<Compressor>("compressor")) {
470  return RoundTrip<T>(options, registry).compress(text);
471 }
472 
473 template<class T>
474 inline void roundtrip_ex(string_ref original_text,
475  string_ref expected_compressed_text,
476  const std::string& options = "",
477  const Registry<Compressor>& registry = Registry<Compressor>("compressor")) {
478  auto e = RoundTrip<T>(options, registry).compress(original_text);
479  auto& compressed_text = e.str;
480 
481  if(expected_compressed_text.size() > 0) {
482  ASSERT_EQ(std::string(expected_compressed_text), compressed_text);
483  }
484 
485  e.assert_decompress();
486 }
487 
488 template<class T>
489 inline void roundtrip(string_ref original_text) {
490  roundtrip_ex<T>(original_text, "");
491 }
492 
493 template<class T>
494 inline void roundtrip_binary(string_ref original_text,
495  const std::vector<uint64_t>& expected_compressed_text_packed_ints = {},
496  const std::string& options = "",
497  const Registry<Compressor>& registry = Registry<Compressor>("compressor")) {
498  auto e = RoundTrip<T>(options, registry).compress(original_text);
499  auto& compressed_text = e.bytes;
500 
501  if(expected_compressed_text_packed_ints.size() > 0)
502  assert_eq_binary(compressed_text, expected_compressed_text_packed_ints);
503 
504  e.assert_decompress_bytes();
505 }
506 
507 class TestInput: public Input {
508 public:
509  inline TestInput(string_ref text, bool sentinel): Input(text) {
510  if (sentinel) {
511  ((Input&) *this) = Input(*this, io::InputRestrictions({0}, true));
512  }
513  }
514  inline TestInput(io::Path&& path, bool sentinel): Input(std::move(path)) {
515  if (sentinel) {
516  ((Input&) *this) = Input(*this, io::InputRestrictions({0}, true));
517  }
518  }
519 };
520 
521 class TestOutput: std::vector<uint8_t>, public Output {
522 public:
523  inline TestOutput(bool sentinel): std::vector<uint8_t>(),
524  Output(static_cast<std::vector<uint8_t>&>(*this))
525  {
526  if (sentinel) {
527  ((Output&) *this) = Output(*this, io::InputRestrictions({0}, true));
528  }
529  }
530 
531  inline string_ref result() { return *this; }
532 };
533 
538  return TestInput(text, true);
539 }
540 
545  return TestInput(io::Path{std::string(path)}, true);
546 }
547 
552  return TestOutput(false);
553 }
554 
559  return TestInput(text, false);
560 }
561 
566  return TestInput(io::Path{std::string(path)}, false);
567 }
568 
573  return TestOutput(true);
574 }
575 
576 
577 template<typename Coder>
578 void test_binary_out(string_ref in, std::vector<uint64_t> packed_ints_out, bool interleave = false) {
579  using namespace tdc;
580 
581  auto v = in;
582  test::TestOutput o(false);
583  {
584  auto env = tdc::builder<Coder>().env();
585  std::shared_ptr<BitOStream> bo = std::make_shared<BitOStream>(o);
586  typename Coder::Encoder coder(std::move(env), bo, ViewLiterals(v));
587 
588  bool was_zero = true;
589  for (auto c : v) {
590  if (was_zero && interleave) {
591  bo->write_int<uliteral_t>(0b01010101, 8);
592  was_zero = false;
593  }
594  coder.encode(c, literal_r);
595  if (c == 0) {
596  was_zero = true;
597  }
598  }
599  }
600  auto res = o.result();
601  test::assert_eq_binary(res, packed_ints_out);
602 }
603 
604 }} //ns
605 
TestOutput decompress_output()
Creates an instance of a tdc::Output to be used with Compressor::decompress().
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
std::string ostream_to_string(Lambda f)
Temporarily provides an ostream to write into, and returns what was written as a string.
CompressResult< C > compress(string_ref text)
TestInput(string_ref text, bool sentinel)
virtual std::string generate() override
Generates a string based on the environment settings.
io::Input Input
Convenience shortcut to io::Input.
Definition: io.hpp:17
void roundtrip_binary(string_ref original_text, const std::vector< uint64_t > &expected_compressed_text_packed_ints={}, const std::string &options="", const Registry< Compressor > &registry=Registry< Compressor >("compressor"))
A const view into a slice of memory.
void assert_eq_integers(const T &expected_, const U &actual_)
Error diagnostic optimized for binary data.
void remove_test_file(const std::string &filename)
uint8_t uliteral_t
Type to represent unsigned single literals.
Definition: def.hpp:131
void roundtrip_batch(F f)
Call the given function with a number of different strings testing common corner cases and unicode in...
void assert_eq_hybrid_strings(const T &expected, const U &actual)
Error diagnostic optimized for mixed binary/ascii data.
void on_string_generators(F func, size_t n)
Describes a set of restrictions placed on input data.
bool test_file_exists(const std::string &filename)
virtual std::string generate() override
Generates a string based on the environment settings.
unsigned int uint
Definition: characterhash.h:6
void create_test_directory()
TestInput decompress_input_file(string_ref path)
Creates an instance of a tdc::Input to be used with Compressor::decompress().
void test_binary_out(string_ref in, std::vector< uint64_t > packed_ints_out, bool interleave=false)
CompressResult< T > compress(string_ref text, const std::string &options="", const Registry< Compressor > &registry=Registry< Compressor >("compressor"))
TestInput compress_input_file(string_ref path)
Creates an instance of a tdc::Input to be used with Compressor::compress().
size_type size() const
Returns size of the View.
A literal iterator that yields every character from a View.
Definition: Literal.hpp:41
std::vector< uint8_t > ostream_to_bytes(Lambda f)
Temporarily provides an ostream to write into, and returns what was written as a byte vector.
void assert_eq_sequence(const T &expected, const U &actual)
Error diagnostic optimized for arbitrary data.
A registry for algorithms to be made available in the driver application.
void roundtrip_ex(string_ref original_text, string_ref expected_compressed_text, const std::string &options="", const Registry< Compressor > &registry=Registry< Compressor >("compressor"))
static Input from_memory(const std::vector< uint8_t > &buf)
Constructs a file input reading from a byte buffer.
Definition: Input.hpp:187
Represents a file path.
Definition: Path.hpp:8
void assert_eq_binary(string_ref actual, PacketIntegers expected)
CompressResult(const Registry< Compressor > &registry, std::vector< uint8_t > &&p_bytes, std::string &&p_str, std::string &&p_original, std::string &&p_options)
std::string read_test_file(const std::string &filename)
const std::string TEST_FILE_PATH
static Output from_memory(std::vector< uint8_t > &buf)
Constructs an output to a byte buffer.
Definition: Output.hpp:147
TestInput decompress_input(string_ref text)
Creates an instance of a tdc::Input to be used with Compressor::decompress().
constexpr auto literal_r
Global predefined range for literals.
Definition: Range.hpp:111
An abstraction layer for algorithm output.
Definition: Output.hpp:23
TestInput compress_input(string_ref text)
Creates an instance of a tdc::Input to be used with Compressor::compress().
TestInput(io::Path &&path, bool sentinel)
std::string format_diff(const std::string &a, const std::string &b)
virtual std::string generate() override
Generates a string based on the environment settings.
RoundTrip(const std::string &options="", const Registry< Compressor > &registry=Registry< Compressor >("compressor"))
std::vector< uint8_t > bytes
void roundtrip(string_ref original_text)
void write_test_file(const std::string &filename, string_ref text)
std::string format_diff_bin(const std::string &a, const std::string &b)
void assert_eq_strings(const T &expected_, const U &actual_)
Error diagnostic optimized for string data.
TestOutput compress_output()
Creates an instance of a tdc::Output to be used with Compressor::compress().
std::string test_file_path(const std::string &filename)
std::vector< uint64_t > PacketIntegers
std::vector< uint8_t > pack_integers(std::vector< uint64_t > ints)
TestOutput(bool sentinel)
virtual std::string generate() override
Generates a string based on the environment settings.
io::Output Output
Convenience shortcut to io::Output.
Definition: io.hpp:20
An abstraction layer for algorithm input.
Definition: Input.hpp:37