tudocomp
– The TU Dortmund Compression Framework
bwt.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <cstdint>
4 #include <tudocomp/util/View.hpp>
5 #include <tudocomp/util.hpp>
6 #include <tudocomp/def.hpp>
7 
8 namespace tdc {
9 
12 namespace bwt {
13  static_assert(std::is_same<View::value_type, uliteral_t>::value, "View::value_type and uliteral_t must be the same");
14 
19 template<typename text_t, typename sa_t>
20 inline typename text_t::value_type bwt(const text_t& text, const sa_t& sa, const size_t i) {
21  return (sa[i] == 0u) ? text[sa.size()-1] : text[sa[i]-1];
22 }
23 
28 template<typename bwt_t>
29 len_compact_t* compute_LF(const bwt_t& bwt, const size_t bwt_length) {
30  DVLOG(2) << "Computing LF";
31  if(bwt_length == 0) return nullptr;
32  len_compact_t C[ULITERAL_MAX+1] { 0 }; // alphabet counter
33  for(auto& c : bwt) {
34  if(literal2int(c) != ULITERAL_MAX) {
35  ++C[literal2int(c)+1];
36  }
37  }
38  for(size_t i = 1; i < ULITERAL_MAX; ++i) {
39  DCHECK_LT(static_cast<size_t>(C[i]),bwt.size()+1 - C[i-1]);
40  C[i] += C[i-1];
41  }
42  DVLOG(2) << "C: " << arr_to_debug_string(C,ULITERAL_MAX);
43  DCHECK_EQ(C[0],0u); // no character preceeds 0
44  DCHECK_EQ(C[1],1u); // there is exactly only one '\0' byte
45 
46  len_compact_t* LF { new len_compact_t[bwt_length] };
47  for(len_t i = 0; i < bwt_length; ++i) {
48  DCHECK_LE(literal2int(bwt[i]), ULITERAL_MAX);
49  LF[i] = C[literal2int(bwt[i])];
50  ++C[literal2int(bwt[i])];
51  }
52 
53  DVLOG(2) << "LF: " << arr_to_debug_string(LF, bwt_length);
54 
56  DCHECK([&] () { // unique invariant of the LF mapping
57  assert_permutation(LF,bwt_length);
58  for(len_t i = 0; i < bwt_length; ++i)
59  for(len_t j = i+1; j < bwt_length; ++j) {
60  if(bwt[i] != bwt[j]) continue;
61  DCHECK_LT(LF[i], LF[j]);
62  }
63  return true;
64  }());
65  )
66 
67  DVLOG(2) << "Finished Computing LF";
68  return LF;
69 }
70 
71 
76 template<typename bwt_t>
77 std::string decode_bwt(const bwt_t& bwt) {
78  const size_t bwt_length = bwt.size();
79  VLOG(2) << "InputSize: " << bwt_length;
80  if(tdc_unlikely(bwt_length <= 1)) return std::string();
81 
82  const len_compact_t*const LF { compute_LF(bwt, bwt_length) };
83 
84  std::string decoded_string(bwt_length-1, 0);
85  //decoded_string[bwt_length-1] = 0;
86 
87  len_t i = 0;
88  for(len_t j = 1; j < bwt_length; ++j) {
89  decoded_string[bwt_length - j-1] = bwt[i];
90  i = LF[i];
91 
92  // this debug line only holds for texts that are guaranteed not
93  // to contain no ZERO bytes
94  //DCHECK( (bwt[i] == 0 && j+1 == bwt_length) || (bwt[i] != 0 && j+1 < bwt_length));
95  }
96  delete [] LF;
97  return decoded_string;
98 }
99 
100 }}//ns
101 
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
std::string decode_bwt(const bwt_t &bwt)
Decodes a BWT It is assumed that the BWT is stored in a container with access to operator[] and ...
Definition: bwt.hpp:77
void assert_permutation(const T &, size_t)
#define tdc_unlikely(x)
Provides a hint to the compiler that x is expected to resolve to false.
Definition: def.hpp:23
constexpr T literal2int(uliteral_t c)
Converts a literal to an integer value as if unsigned.
Definition: def.hpp:142
len_compact_t * compute_LF(const bwt_t &bwt, const size_t bwt_length)
Computes the LF table used for decoding the BWT Input is a BWT and its length.
Definition: bwt.hpp:29
text_t::value_type bwt(const text_t &text, const sa_t &sa, const size_t i)
Computes the value BWT[i] of a text T given its suffix array SA Runs in O(1) time since BWT[i] = SA[(...
Definition: bwt.hpp:20
constexpr size_t ULITERAL_MAX
The maximum value of uliteral_t.
Definition: def.hpp:134
uint32_t len_compact_t
Type to represent an bit-compact length value.
Definition: def.hpp:103
fast_t< len_compact_t > len_t
Type to represent an length value.
Definition: def.hpp:114
std::string arr_to_debug_string(const T *s, size_t length)
Builds the string representation of an array of printable values, sorrounded by square brackets ([ an...
#define IF_PARANOID(x)
x is compiled only in debug builds and when the PARANOID macro is defined.
Definition: def.hpp:49