tudocomp
– The TU Dortmund Compression Framework
LCPSada.hpp
Go to the documentation of this file.
1 #pragma once
2 #include <tudocomp/def.hpp>
3 #include <tudocomp/util.hpp>
4 #include <tudocomp/Env.hpp>
6 #include <sdsl/select_support_mcl.hpp> // for the select data structure
7 
9 
10 namespace tdc {
11 
12 
18 template<class phi_t, class sa_t>
19 inline phi_t construct_phi_array(const sa_t& sa) {
20  const len_t n = sa.size();
21  //assert_permutation(sa,n);
22 
23  phi_t phi(n, 0, bits_for(n));
24  for(size_t i = 1, prev = sa[0]; i < n; i++) {
25  phi[sa[i]] = prev;
26  prev = sa[i];
27  }
28  phi[sa[0]] = sa[n-1];
29  //assert_permutation(phi,n);
30  return phi;
31 }
32 
39 template<typename phi_t, typename text_t>
40 inline void phi_algorithm(phi_t& phi, const text_t& text) {
41  const size_t n = phi.size();
42  DCHECK_EQ(text[n-1],0);
43  for(len_t i = 0, l = 0; i < n - 1; ++i) {
44  const len_t phii = phi[i];
45  DCHECK_LT(i+l, n);
46  DCHECK_LT(phii+l, n);
47  DCHECK_NE(i, phii);
48  while(text[i+l] == text[phii+l]) {
49  l++;
50  DCHECK_LT(i+l, n);
51  DCHECK_LT(phii+l, n);
52  }
53  phi[i] = l;
54  if(l) {
55  --l;
56  }
57  }
58  phi[n-1]=0;
59 }
60 
61 
62 
63 template<
64 typename sa_t,
65 typename select_t = sdsl::select_support_mcl<1,1>>
66 class LCPSada {
67  const sa_t& m_sa;
68  const sdsl::bit_vector m_bv;
69  const select_t m_select;
70  public:
71  LCPSada(const sa_t& sa, const sdsl::bit_vector&& bv)
72  : m_sa(sa)
73  , m_bv(bv)
74  , m_select(&m_bv)
75  {
76  }
77  len_t operator[](len_t i) const {
78  if(size() == 1) return 0;
79  const len_t idx = m_sa[i];
80  return m_select.select(idx+1) - 2*idx;
81  }
82  len_t plcp(len_t idx) const {
83  if(size() == 1) return 0;
84  return m_select.select(idx+1) - 2*idx;
85  }
86  inline len_t size() const {
87  return m_sa.size();
88  }
89 
90  // only for reference:
91  // len_t naive_select(len_t idx) const {
92  // DCHECK_GT(m_bv.size(), 1);
93  // DCHECK_GT(idx,0);
94  // DCHECK_LT(idx, m_bv.size());
95  // const len_t chunk_size = 1 + ((m_bv.size()-1)/64);
96  // len_t pos = 0;
97  // len_t rank = 0;
98  // const uint64_t*const data = m_bv.data();
99  // for(pos = 0; pos < chunk_size; ++pos) {
100  // const uint_fast8_t ones = sdsl::bits::cnt(data[pos]);
101  // if(rank+ones >= idx) break;
102  // rank += ones;
103  // }
104  // if(pos == chunk_size) return m_bv.size();
105  // return 64*pos + sdsl::bits::sel(data[pos], idx-rank);
106  // }
107  //
108  // len_t naive_plcp(len_t idx) const {
109  // if(size() == 1) return 0;
110  // return naive_select(idx+1) - 2*idx;
111  // }
112 
113 };
114 
116  sdsl::bit_vector m_bv;
117 
118  len_t m_idx = 0; // current select parameter
119  len_t m_block = 0; // block index
120  len_t m_blockrank = 0; //number of ones up to previous block
121 
122  public:
123  LCPForwardIterator(sdsl::bit_vector&& bv) : m_bv(bv) {}
124 
125  len_t index() const { return m_idx; }
126 
128  DCHECK_GE(m_bv.size(), 1);
129 // DCHECK_GT(m_idx,0);
130  DCHECK_LT(m_idx+1, m_bv.size());
131  const len_t chunk_size = 1 + ((m_bv.size()-1)/64); //TODO: in constructor
132  const uint64_t*const data = m_bv.data();
133  while(m_block < chunk_size) {
134  const uint_fast8_t ones = sdsl::bits::cnt(data[m_block]); // TODO: make member variable to speed up
135  if(m_blockrank+ones >= m_idx+1) break;
136  m_blockrank += ones;
137  ++m_block;
138  }
139  if(m_block == chunk_size) return m_bv.size();
140  return 64*m_block + sdsl::bits::sel(data[m_block], m_idx+1-m_blockrank);
141 
142  }
144  const len_t ret = next_select() - 2*m_idx;
145  return ret;
146  }
147  void advance() {
148  ++m_idx;
149  }
150 };
151 
152 template<class plcp_t>
153 inline static sdsl::bit_vector construct_plcp_bitvector(const plcp_t& plcp) {
154  const len_t n = plcp.size();
155  len_t len = plcp[0];
156  for(len_t i = 0; i+1 < n; i++) {
157  DCHECK_GE(plcp[i+1]+1, plcp[i]);
158  len += plcp[i+1]-plcp[i]+1;
159  }
160  sdsl::bit_vector bv(len+n,0);
161  bv[plcp[0]]=1;
162  len=plcp[0];
163  for(len_t i = 0; i+1 < n; i++) {
164  len += plcp[i+1]-plcp[i]+2;
165  DCHECK_LT(len, bv.size());
166  bv[len] = 1;
167  }
168  DCHECK_EQ(len, bv.size()-1);
169  DCHECK_EQ(plcp.size(), std::accumulate(bv.begin(), bv.end(), 0));
170  return bv;
171 }
172 
173 template<class sa_t, class text_t, class select_t = sdsl::select_support_mcl<1,1>>
174 sdsl::bit_vector construct_plcp_bitvector(Env&, const sa_t& sa, const text_t& text) {
175  typedef DynamicIntVector phi_t;
176 
177  phi_t phi = StatPhase::wrap("Construct Phi Array", [&]{
178  return construct_phi_array<phi_t,sa_t>(sa);
179  });
180 
181  StatPhase::wrap("Phi-Algorithm", [&]{
182  phi_algorithm(phi, text);
183  });
184 
185  return StatPhase::wrap("Build Sada Bit Vector", [&]{
186  auto ret = construct_plcp_bitvector(phi);
187  StatPhase::log("bit vector length", ret.bit_size());
188  return ret;
189  });
190 }
191 
192 template<class sa_t, class text_t, class select_t = sdsl::select_support_mcl<1,1>>
193 LCPSada<sa_t,select_t> construct_lcp_sada(Env& env, const sa_t& sa, const text_t& text) {
194  return StatPhase::wrap("Build Select on Bit Vector", [&]{
195  sdsl::bit_vector bv = construct_plcp_bitvector(env, sa, text);
196  return LCPSada<sa_t,select_t> { sa, std::move(bv) };
197  });
198 }
199 
200 }//ns
201 
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
constexpr uint_fast8_t bits_for(size_t n)
Computes the number of bits required to store the given integer value.
phi_t construct_phi_array(const sa_t &sa)
Constructs the phi array with phi[sa[i]] = sa[i-1].
Definition: LCPSada.hpp:19
LCPSada< sa_t, select_t > construct_lcp_sada(Env &env, const sa_t &sa, const text_t &text)
Definition: LCPSada.hpp:193
A vector over arbitrary unsigned integer types.
Definition: IntVector.hpp:175
void phi_algorithm(phi_t &phi, const text_t &text)
Constructs the PLCP array.
Definition: LCPSada.hpp:40
len_t operator[](len_t i) const
Definition: LCPSada.hpp:77
len_t index() const
Definition: LCPSada.hpp:125
len_compact_t len
Definition: LZSSFactors.hpp:38
len_t plcp(len_t idx) const
Definition: LCPSada.hpp:82
len_t size() const
Definition: LCPSada.hpp:86
fast_t< len_compact_t > len_t
Type to represent an length value.
Definition: def.hpp:114
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
Definition: StatPhase.hpp:218
LCPSada(const sa_t &sa, const sdsl::bit_vector &&bv)
Definition: LCPSada.hpp:71
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Definition: StatPhase.hpp:143
Local environment for a compression/encoding/decompression call.
LCPForwardIterator(sdsl::bit_vector &&bv)
Definition: LCPSada.hpp:123