tudocomp
– The TU Dortmund Compression Framework
BSTStrategy.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <vector>
4 #include <tuple>
5 
6 #include <unordered_map>
7 
8 
9 #include <tudocomp/util.hpp>
10 #include <tudocomp/io.hpp>
12 #include <tudocomp/Algorithm.hpp>
14 
15 
16 
17 
18 namespace tdc {
19 namespace lfs {
20 
21 class BSTStrategy : public Algorithm {
22 private:
23 
24  //(position in text, non_terminal_symbol_number, length_of_symbol);
25  typedef std::tuple<uint,uint,uint> non_term;
26  typedef std::vector<non_term> non_terminal_symbols;
27  typedef std::vector<std::pair<uint,uint>> rules;
28  typedef uint node_type;
29 
30 
31  std::unique_ptr<BinarySuffixTree> stree;
32  uint min_lrf;
33 
34  BitVector dead_positions;
35 
36  std::vector<std::vector<uint> > bins;
37 
38 
39  std::unordered_map< uint , std::vector< uint > > beginning_positions;
40 
41 
42 
43  //stats
44  uint node_count;
45  uint max_depth;
46 
47 
48  inline virtual void compute_string_depth(uint node, uint str_depth){
49  //resize if str depth grater than bins size
50  uint child = stree->get_first_child(node);
51 
52 
53  if(child != 0){
54  while(str_depth>= bins.size()){
55  bins.resize(bins.size()*2);
56  }
57 
58 
59 
60  if(str_depth>max_depth){
61  max_depth=str_depth;
62  }
63  node_count++;
64  if(str_depth>0){
65 
66  bins[str_depth].push_back(node);
67  }
68  while (child != 0){
69  compute_string_depth(child, str_depth + stree->get_edge_length(child));
70  child=stree->get_next_sibling(child);
71  }
72 
73  }
74 
75  }
76 
77 
78 
79 
80 
81 
82 
83 
84  inline virtual std::vector<uint> select_starting_positions(node_type node, uint length){
85 
86 
87  std::vector<uint> & starting_positions = beginning_positions[node];
88  std::vector<uint> selected_starting_positions;
89  std::vector<uint> not_selected_starting_positions;
90 
91  //select occurences greedily non-overlapping:
92  selected_starting_positions.reserve(starting_positions.size());
93 
94  long last = 0- (long) length - 1;
95  long current;
96  for (auto it=starting_positions.begin(); it!=starting_positions.end(); ++it){
97 
98  current = (long) *it;
99  if(last+length <= current && !dead_positions[current] && !dead_positions[current+length-1]){
100 
101  selected_starting_positions.push_back(current);
102  last = current;
103 
104  } else {
105 
106  if( !dead_positions[current] ){
107  not_selected_starting_positions.push_back(current);
108  }
109 
110  }
111 
112 
113  }
114 
115  if(selected_starting_positions.size() >= 2){
116  beginning_positions[node] = not_selected_starting_positions;
117  }
118 
119 
120  return selected_starting_positions;
121  }
122 
123 
124 public:
125 
126  using Algorithm::Algorithm; //import constructor
127 
128  inline static Meta meta() {
129  Meta m("lfs_comp", "bst");
130  m.option("min_lrf").dynamic(2);
131  return m;
132  }
133 
134 
135  inline void compute_rules(io::InputView & input, rules & dictionary, non_terminal_symbols & nts_symbols){
136  min_lrf = env().option("min_lrf").as_integer();
137 
138  StatPhase::wrap("Constructing ST", [&]{
139  DLOG(INFO)<<"Constructing ST";
140  stree = std::make_unique<BinarySuffixTree>(input);
141  StatPhase::log("Number of Nodes", stree->get_tree_size());
142  });
143 
144 
145 
146  StatPhase::wrap("Computing String Depth", [&]{
147  bins.resize(200);
148  node_count=0;
149  max_depth=0;
150  compute_string_depth(0,0);
151 
152  bins.resize(max_depth +1);
153  bins.shrink_to_fit();
154  });
155 
156  StatPhase::log("Number of inner Nodes", node_count);
157  StatPhase::log("Max Depth inner Nodes", max_depth);
158  DLOG(INFO)<<"max depth: "<<max_depth;
159 
160 
161 
162 
163 
164 
165  StatPhase::wrap("Computing LRF Substitution", [&]{
166  dead_positions = BitVector(input.size(), 0);
167  uint nts_number =0;
168 
169  beginning_positions.reserve(node_count);
170 
171 
172 
173 
174  DLOG(INFO)<<"Iterating nodes";
175 
176  for(uint i = bins.size()-1; i>=min_lrf; i--){
177 
178  auto bin_it = bins[i].begin();
179  while (bin_it!= bins[i].end()){
180 
181 
182  node_type node = *bin_it;
183 
184 
185  auto bp = beginning_positions.find(node);
186 
187  //no begin pos found, get from children
188 
189  if(bp == beginning_positions.end()){
190 
191 
192  std::vector<uint> positions;
193  //get leaves or merge child vectors
194  std::vector<uint> offsets;
195  std::vector<uint> leaf_bps;
196 
197  node_type inner = stree->get_first_child(node);
198 
199  while (inner != 0){
200 
201  if(stree->get_first_child(inner) == 0){
202  uint temp = stree->get_suffix(inner);
203  if(!dead_positions[temp]){
204 
205  leaf_bps.push_back(stree->get_suffix(inner));
206  }
207  } else {
208  auto & child_bp = beginning_positions[inner];
209  if(!child_bp.empty()){
210  offsets.push_back(positions.size());
211 
212  positions.insert(positions.end(), child_bp.begin(), child_bp.end());
213 
214 
215  beginning_positions.erase(inner);
216 
217  }
218 
219  }
220  inner = stree->get_next_sibling(inner);
221  }
222 
223  std::sort(leaf_bps.begin(), leaf_bps.end());
224  offsets.push_back(positions.size());
225  positions.insert(positions.end(),leaf_bps.begin(), leaf_bps.end());
226  for(uint k = 0; k < offsets.size()-1; k++){
227  std::inplace_merge(positions.begin(), positions.begin()+ offsets[k], positions.begin()+ offsets[k+1]);
228 
229  }
230  //now inplace merge to end
231  std::inplace_merge(positions.begin(), positions.begin()+ offsets.back(), positions.end());
232 
233  beginning_positions[node]=positions;
234 
235 
236 
237 
238  }
239  std::vector<uint> begin_pos = beginning_positions[node];
240  //check if repeating factor:
241  if(begin_pos.size() >= 2 && ( (begin_pos.back() ) - (begin_pos.front()) >= i)){
242  //check dead positions:
243  if(!(
244  dead_positions[(begin_pos.back())] ||
245  dead_positions[(begin_pos.front())]
246  )
247 
248 
249  ){
250  std::vector<uint> sel_pos = select_starting_positions(node, i);
251 
252  if(! (sel_pos.size() >=2) ){
253  bin_it++;
254  continue;
255  }
256  //vector of text position, length
257  std::pair<uint,uint> rule = std::make_pair(sel_pos.at(0), i);
258  dictionary.push_back(rule);
259 
260  //iterate over selected pos, add non terminal symbols
261  for(auto bp_it = sel_pos.begin(); bp_it != sel_pos.end(); bp_it++){
262  //(position in text, non_terminal_symbol_number, length_of_symbol);
263  non_term nts = std::make_tuple(*bp_it, nts_number, i);
264  nts_symbols.push_back(nts);
265  //mark as used
266  for(uint pos = 0; pos<i;pos++){
267  dead_positions[pos+ *bp_it] = 1;
268  }
269  }
270  nts_number++;
271  }
272 
273 
274  }
275 
276 
277 
278  bin_it++;
279 
280  }
281  }
282 
283 
284 
285 
286  });
287  StatPhase::wrap("Sorting occurences", [&]{
288  DLOG(INFO) << "sorting occurences";
289  std::sort(nts_symbols.begin(), nts_symbols.end());
290  });
291 
292 
293  }
294 };
295 }
296 
297 }
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
A vector over arbitrary unsigned integer types.
Definition: IntVector.hpp:175
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
Definition: Env.hpp:44
IntVector< uint_t< 1 > > BitVector
Represents a bit vector, alias for IntVector with a fixed bit width of 1.
Definition: IntVector.hpp:545
unsigned int uint
Definition: characterhash.h:6
void compute_rules(io::InputView &input, rules &dictionary, non_terminal_symbols &nts_symbols)
size_type size() const
Returns size of the View.
uint64_t as_integer() const
Algorithm(Algorithm const &)=default
Env & env()
Provides access to the environment that the algorithm works in.
Definition: Algorithm.hpp:51
void resize(size_type n)
Definition: IntVector.hpp:327
len_compact_t pos
Definition: LZSSFactors.hpp:38
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
Definition: StatPhase.hpp:218
void reserve(size_type n)
Definition: IntVector.hpp:357
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Definition: StatPhase.hpp:143
Provides a view on the input that allows for random access.
Definition: InputView.hpp:29
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
Interface for algorithms.
Definition: Algorithm.hpp:15
void dynamic()
Declares that this option accepts values of a simple type that can be parsed from a string (e...
Definition: Meta.hpp:150