tudocomp
– The TU Dortmund Compression Framework
STStrategy.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <vector>
4 #include <tuple>
5 
6 #include <unordered_map>
7 
8 
9 #include <tudocomp/util.hpp>
10 #include <tudocomp/io.hpp>
12 #include <tudocomp/Algorithm.hpp>
14 
15 
16 
17 
18 namespace tdc {
19 namespace lfs {
20 
21 class STStrategy : public Algorithm {
22 private:
23 
24  typedef std::tuple<uint,uint,uint> non_term;
25  typedef std::vector<non_term> non_terminal_symbols;
26  typedef std::vector<std::pair<uint,uint>> rules;
27 
28 
29  SuffixTree stree;
30  uint min_lrf;
31 
32  BitVector dead_positions;
33 
34  std::vector<std::vector<SuffixTree::STNode*> > bins;
35 
36  std::unordered_map<SuffixTree::STNode *, std::vector<uint> > beginning_positions;
37 
38  //stats
39  uint node_count;
40  uint max_depth;
41 
42 
43 
44 
45  inline virtual void compute_string_depth(SuffixTree::STNode* node, uint str_depth){
46  //resize if str depth grater than bins size
47 
48 
49 
50 
51 
52  SuffixTree::STInnerNode * inner = dynamic_cast<SuffixTree::STInnerNode *>(node);
53  if(inner){
54 
55  inner->string_depth = str_depth;
56 
57  if(str_depth>= bins.size()){
58  bins.resize(bins.size()*2);
59 
60  std::cerr<<"resizing: "<<bins.size();
61  }
62 
63  if(str_depth>max_depth){
64  max_depth=str_depth;
65  }
66  node_count++;
67  if(str_depth>0){
68 
69  bins[str_depth].push_back(node);
70  }
71 
72 
73  auto it = inner->child_nodes.begin();
74  while (it != inner->child_nodes.end()){
75  auto child = *it;
76 
77  SuffixTree::STInnerNode * child_inner = dynamic_cast<SuffixTree::STInnerNode *>(child.second);
78  if(child_inner){
79  child_inner->parent = inner;
80 
81  uint child_depth = (str_depth+stree.edge_length(child.second));
82  compute_string_depth( child.second, child_depth);
83  }
84  it++;
85  }
86 
87  }
88  }
89 
90 
91 
92 
93 
94  inline virtual std::vector<uint> select_starting_positions(SuffixTree::STNode * node, uint length){
95 
96 
97  std::vector<uint> starting_positions = beginning_positions[node];
98  std::vector<uint> selected_starting_positions;
99 
100  long min_shorter = 1;
101 
102  //select occurences greedily non-overlapping:
103  selected_starting_positions.reserve(starting_positions.size());
104 
105  long last = 0- (long) length - 1;
106  long current;
107  for (auto it=starting_positions.begin(); it!=starting_positions.end(); ++it){
108 
109  current = (long) *it;
110  if(last+length <= current && !dead_positions[current] && !dead_positions[current+length-1]){
111 
112  selected_starting_positions.push_back(current);
113  last = current;
114 
115  }
116 
117  if(current < (long) dead_positions.size() && !dead_positions[current] && dead_positions[current+length-1]){
118 
119 
120  while((current+min_shorter < (long) dead_positions.size()) && !dead_positions[current+min_shorter]){
121  min_shorter++;
122  }
123 
124  }
125 
126  }
127 
128  if(min_shorter < length){
129 
130  if(min_shorter >= (int) min_lrf){
131  //check if parent node is shorter
132  SuffixTree::STInnerNode * inner = dynamic_cast<SuffixTree::STInnerNode *>(node);
133 
134  SuffixTree::STInnerNode * parent = inner->parent;
135  uint depth = parent->string_depth;
136  if(depth < (uint)(min_shorter)){
137 
138  //just re-add node, if the possible replaceable lrf is longer than dpeth of parent node
139  bins[min_shorter].push_back(node);
140  }
141  }
142  }
143 
144 
145  return selected_starting_positions;
146  }
147 
148 public:
149 
150  using Algorithm::Algorithm; //import constructor
151 
152  inline static Meta meta() {
153  Meta m("lfs_comp", "st");
154  m.option("min_lrf").dynamic(2);
155  return m;
156  }
157 
158 
159  inline void compute_rules(io::InputView & input, rules & dictionary, non_terminal_symbols & nts_symbols){
160  min_lrf = env().option("min_lrf").as_integer();
161 
162  StatPhase::wrap("Constructing ST", [&]{
163  stree = SuffixTree(input);
164  });
165 
166 
167  StatPhase::wrap("Computing String Depth", [&]{
168  bins.resize(200);
169  node_count=0;
170  max_depth=0;
171  compute_string_depth(stree.get_root(),0);
172  });
173 
174  StatPhase::log("Number of inner Nodes", node_count);
175  StatPhase::log("Max Depth inner Nodes", max_depth);
176  DLOG(INFO)<<"max depth: "<<max_depth;
177 
178 
179 
180  StatPhase::wrap("Computing LRF Substitution", [&]{
181  dead_positions = BitVector(stree.get_size(), 0);
182  uint nts_number =0;
183 
184  beginning_positions.reserve(node_count);
185 
186  uint rd_counter =0;
187 
188 
189 
190  for(uint i = bins.size()-1; i>=min_lrf; i--){
191  auto bin_it = bins[i].begin();
192  while (bin_it!= bins[i].end()){
193 
194  SuffixTree::STNode * node = *bin_it;
195 
196  auto bp = beginning_positions.find(node);
197 
198  //no begin poss found, get from children
199 
200  if(bp == beginning_positions.end()){
201 
202  std::vector<uint> positions;
203 
204  SuffixTree::STInnerNode * inner = dynamic_cast<SuffixTree::STInnerNode *>(node);
205  auto it = inner->child_nodes.begin();
206  while (it != inner->child_nodes.end()){
207  auto child = *it;
208 
209  SuffixTree::STNode * child_node = child.second;
210 
211  if(SuffixTree::STInnerNode * inner_child = dynamic_cast<SuffixTree::STInnerNode *>(child_node)){
212  auto child_bp = beginning_positions.find(inner_child);
213  if(child_bp != beginning_positions.end()){
214 
215  positions.insert(positions.end(), (*child_bp).second.begin(), (*child_bp).second.end());
216 
217  beginning_positions.erase((*child_bp).first);
218 
219 
220  }
221 
222 
223  }
224  if(SuffixTree::STLeaf * leaf_child = dynamic_cast<SuffixTree::STLeaf *>(child_node)){
225  positions.push_back(leaf_child->suffix);
226 
227  }
228 
229 
230 
231 
232  it++;
233  }
234  std::sort(positions.begin(), positions.end());
235 
236  uint real_depth = positions.back() - positions.front();
237 
238  if(real_depth<i){
239  rd_counter++;
240  }
241 
242  beginning_positions[node]=positions;
243 
244 
245 
246 
247  }
248  std::vector<uint> & begin_pos = beginning_positions[node];
249 
250  //check if repeating factor:
251  if(begin_pos.size() >= 2 && ( (begin_pos.back() ) - (begin_pos.front()) >= i)){
252 
253  //check dead positions:
254  if(!(
255  dead_positions[(begin_pos.back())] ||
256  dead_positions[(begin_pos.front())]
257  )
258 
259 
260  ){
261 
262 
263  std::vector<uint> sel_pos = select_starting_positions(node, i);
264 
265 
266 
267 
268 
269  if(! (sel_pos.size() >=2) ){
270  bin_it++;
271  continue;
272  }
273  //vector of text position, length
274  std::pair<uint,uint> rule = std::make_pair(sel_pos.at(0), i);
275  dictionary.push_back(rule);
276 
277  //iterate over selected pos, add non terminal symbols
278  for(auto bp_it = sel_pos.begin(); bp_it != sel_pos.end(); bp_it++){
279  non_term nts = std::make_tuple(*bp_it, nts_number, i);
280  nts_symbols.push_back(nts);
281  //mark as used
282  for(uint pos = 0; pos<i;pos++){
283  dead_positions[pos+ *bp_it] = 1;
284  }
285  }
286  nts_number++;
287  }
288 
289 
290  }
291 
292 
293  bin_it++;
294 
295  }
296  }
297 
298  StatPhase::log("real depth counter",rd_counter);
299 
300 
301 
302 
303  });
304  StatPhase::wrap("Sorting occurences", [&]{
305  DLOG(INFO) << "sorting occurences";
306  std::sort(nts_symbols.begin(), nts_symbols.end());
307  });
308  }
309 };
310 }
311 
312 }
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
A vector over arbitrary unsigned integer types.
Definition: IntVector.hpp:175
void compute_rules(io::InputView &input, rules &dictionary, non_terminal_symbols &nts_symbols)
Definition: STStrategy.hpp:159
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
Definition: Env.hpp:44
IntVector< uint_t< 1 > > BitVector
Represents a bit vector, alias for IntVector with a fixed bit width of 1.
Definition: IntVector.hpp:545
unsigned int uint
Definition: characterhash.h:6
uint64_t as_integer() const
Algorithm(Algorithm const &)=default
Env & env()
Provides access to the environment that the algorithm works in.
Definition: Algorithm.hpp:51
len_compact_t pos
Definition: LZSSFactors.hpp:38
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
Definition: StatPhase.hpp:218
SuffixTree::STNode * get_root()
void reserve(size_type n)
Definition: IntVector.hpp:357
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Definition: StatPhase.hpp:143
Provides a view on the input that allows for random access.
Definition: InputView.hpp:29
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
uint edge_length(STNode *node)
static Meta meta()
Definition: STStrategy.hpp:152
std::unordered_map< char, STNode * > child_nodes
size_type size() const
Definition: IntVector.hpp:284
Interface for algorithms.
Definition: Algorithm.hpp:15
void dynamic()
Declares that this option accepts values of a simple type that can be parsed from a string (e...
Definition: Meta.hpp:150