tudocomp
– The TU Dortmund Compression Framework
ESAStrategy.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <tudocomp/util.hpp>
4 #include <tudocomp/io.hpp>
6 #include <tudocomp/ds/TextDS.hpp>
7 #include <tudocomp/Algorithm.hpp>
8 
9 #include <vector>
10 #include <tuple>
11 
12 namespace tdc {
13 namespace lfs {
14 
15 template<typename text_t = TextDS<> , uint min_lrf = 2>
16 class ESAStrategy : public Algorithm {
17 private:
18 
19  //(position in text, non_terminal_symbol_number, length_of_symbol);
20  typedef std::tuple<uint,uint,uint> non_term;
21  typedef std::vector<non_term> non_terminal_symbols;
22  typedef std::vector<std::pair<uint,uint>> rules;
23 
24  BitVector dead_positions;
25 
26  inline virtual std::vector<uint> select_starting_positions(std::vector<uint> starting_positions, uint length){
27  std::vector<uint> selected_starting_positions;
28  std::sort(starting_positions.begin(), starting_positions.end());
29  //select occurences greedily non-overlapping:
30  selected_starting_positions.reserve(starting_positions.size());
31 
32  int last = 0-length;
33  uint current;
34  for (std::vector<uint>::iterator it=starting_positions.begin(); it!=starting_positions.end(); ++it){
35 
36  current = *it;
37  //DLOG(INFO) << "checking starting position: " << current << " length: " << top.first << "last " << last;
38  if(last+length <= current){
39  selected_starting_positions.push_back(current);
40  last = current;
41 
42  }
43 
44  }
45  return selected_starting_positions;
46  }
47 
48 
49 
50  //could be node_type
51  std::vector<std::vector<uint> > lcp_bins;
52 public:
53 
54  using Algorithm::Algorithm; //import constructor
55 
56  inline static Meta meta() {
57  Meta m("lfs_comp", "esa");
58 
59  m.option("textds").templated<text_t, TextDS<>>("textds");
60 
62 
63  return m;
64  }
65 
66 
67  inline void compute_rules(io::InputView & input, rules & dictionary, non_terminal_symbols & nts_symbols){
68 
69 
70  text_t t(env().env_for_option("textds"), input);
71  DLOG(INFO) << "building sa and lcp";
72  StatPhase::wrap("computing sa and lcp", [&]{
73 
74  t.require(text_t::SA | text_t::ISA | text_t::LCP);
75  });
76  auto& sa_t = t.require_sa();
77  auto& lcp_t = t.require_lcp();
78 
79  StatPhase::wrap("computing lrf occurences", [&]{
80 
81  // iterate over lcp array, add indexes with non overlapping prefix length greater than min_lrf_length to vector
82 
83 
84  StatPhase::wrap("computing lrf occs", [&]{
85 
86  DLOG(INFO) << "iterate over lcp";
87  uint dif ;
88  uint factor_length;
89 
90  uint max = 0;
91 
92  for(uint i = 1; i<lcp_t.size(); i++){
93 
94 
95  if(lcp_t[i] >= min_lrf){
96  if(max < lcp_t[i]){
97  max = lcp_t[i];
98  lcp_bins.resize(max+1);
99  }
100  //compute length of non-overlapping factor:
101 
102  dif = std::abs((long)(sa_t[i-1] - sa_t[i]));
103  factor_length = lcp_t[i];
104  factor_length = std::min(factor_length, dif);
105 
106  //maybe greater non-overlapping factor possible with smaller suffix?
107  int j =i-1;
108  uint alt_dif;
109  while(j>0 && lcp_t[j]>factor_length ){
110  alt_dif = std::abs((long)(sa_t[j] - sa_t[i]));
111  if(alt_dif>dif){
112  dif = alt_dif;
113  }
114  j--;
115 
116 
117  }
118  factor_length = lcp_t[i];
119  factor_length = std::min(factor_length, dif);
120  //second is position in suffix array
121  //first is length of common prefix
122 
123  lcp_bins[factor_length].push_back(i);
124  }
125  }
126 
127  DLOG(INFO)<<"max lcp: "<<max;
128  DLOG(INFO)<<"lcp bins: "<<lcp_bins.size();
129 
130  });
131 
132  if(lcp_bins.size()< min_lrf){
133  DLOG(INFO)<<"nothing to replace, returning";
134  return;
135  }
136 
137  //Pq for the non-terminal symbols
138  //the first in pair is position, the seconds the number of the non terminal symbol
139 
140  DLOG(INFO) << "computing lrfs";
141 
142 
143  StatPhase::wrap("selecting occs", [&]{
144 
145  // Pop PQ, Select occurences of suffix, check if contains replaced symbols
146  dead_positions = BitVector(t.size(), 0);
147 
148  nts_symbols.reserve(lcp_bins.size());
149  uint non_terminal_symbol_number = 0;
150 
151  for(uint lcp_len = lcp_bins.size()-1; lcp_len>= min_lrf; lcp_len--){
152  for(auto bin_it = lcp_bins[lcp_len].begin(); bin_it!=lcp_bins[lcp_len].end(); bin_it++){
153 
154 
155 
156  //detect all starting positions of this string using the sa and lcp:
157  std::vector<uint> starting_positions;
158  starting_positions.reserve(20);
159 
160 
161 
162  // and ceck in bitvector viable starting positions
163  // there is no 1 bit on the corresponding positions
164  // it suffices to check start and end position, because lrf can only be same length and shorter
165  uint i = *bin_it;
166 
167  uint shorter_dif = lcp_len;
168 
169 
170  while(i>=0 && ( lcp_t[i])>=lcp_len){
171  if(!dead_positions[sa_t[i-1]] && !dead_positions[sa_t[i-1]+lcp_len-1]){
172  starting_positions.push_back(sa_t[i-1]);
173  }
174 
175  if(!dead_positions[sa_t[i-1]] && dead_positions[sa_t[i-1]+lcp_len-1] ){
176  while(!dead_positions[sa_t[i-1]+lcp_len-shorter_dif]){
177  shorter_dif--;
178  }
179  }
180  i--;
181 
182  }
183  i = *bin_it;
184  while(i< lcp_t.size() && lcp_t[i]>=lcp_len){
185 
186  if(!dead_positions[sa_t[i]] && !dead_positions[sa_t[i]+lcp_len-1]){
187  starting_positions.push_back(sa_t[i]);
188  }
189  if(!dead_positions[sa_t[i]] && dead_positions[sa_t[i]+lcp_len-1] ){
190  while(!dead_positions[sa_t[i]+lcp_len-shorter_dif]){
191  shorter_dif--;
192  }
193  }
194  i++;
195  }
196  if(starting_positions.size()>=2){
197  std::vector<uint> selected_starting_positions = select_starting_positions(starting_positions, lcp_len);
198  //computing substring to be replaced
199 
200  if(selected_starting_positions.size()>=2){
201 
202 
203 
204  uint offset = sa_t[*bin_it];
205  std::pair<uint,uint> longest_repeating_factor(offset, lcp_len);
206  for (std::vector<uint>::iterator it=selected_starting_positions.begin(); it!=selected_starting_positions.end(); ++it){
207  for(uint k = 0; k<lcp_len; k++){
208  dead_positions[*it+k]=1;
209  }
210 
211  uint length_of_symbol = lcp_len;
212  std::tuple<uint,uint,uint> symbol(*it, non_terminal_symbol_number, length_of_symbol);
213  nts_symbols.push_back(symbol);
214  }
215 
216  dictionary.push_back(longest_repeating_factor);
217  non_terminal_symbol_number++;
218  }
219  }
220 
221  }
222  }
223  });
224  });
225 
226  StatPhase::wrap("sorting symbols", [&]{
227  DLOG(INFO) << "sorting symbols";
228  std::sort(nts_symbols.begin(), nts_symbols.end());
229 
230  });
231 
232  }
233 };
234 }
235 }
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
A vector over arbitrary unsigned integer types.
Definition: IntVector.hpp:175
constexpr dsflags_t ISA
Definition: TextDSFlags.hpp:12
IntVector< uint_t< 1 > > BitVector
Represents a bit vector, alias for IntVector with a fixed bit width of 1.
Definition: IntVector.hpp:545
unsigned int uint
Definition: characterhash.h:6
Algorithm(Algorithm const &)=default
void uses_textds(ds::dsflags_t flags)
Indicates that this Algorithm uses the TextDS class, and how it does.
Definition: Meta.hpp:277
Env & env()
Provides access to the environment that the algorithm works in.
Definition: Algorithm.hpp:51
static Meta meta()
Definition: ESAStrategy.hpp:56
void templated(const std::string &accepted_type)
Declares that this option accepts values of the specified Algorithm type T.
Definition: Meta.hpp:93
constexpr dsflags_t SA
Definition: TextDSFlags.hpp:11
void reserve(size_type n)
Definition: IntVector.hpp:357
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Definition: StatPhase.hpp:143
Provides a view on the input that allows for random access.
Definition: InputView.hpp:29
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
constexpr dsflags_t LCP
Definition: TextDSFlags.hpp:13
Manages text related data structures.
Definition: TextDS.hpp:30
Interface for algorithms.
Definition: Algorithm.hpp:15
void compute_rules(io::InputView &input, rules &dictionary, non_terminal_symbols &nts_symbols)
Definition: ESAStrategy.hpp:67