tudocomp
– The TU Dortmund Compression Framework
ds/SuffixTree.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 //Original Implementation::
4 // https://gist.github.com/makagonov/f7ed8ce729da72621b321f0ab547debb
5 //from: http://stackoverflow.com/questions/9452701/ukkonens-suffix-tree-algorithm-in-plain-english/9513423#9513423
6 #include <string>
7 #include <map>
8 #include <vector>
9 #include <tuple>
10 
11 #include <unordered_map>
12 
13 
14 #include <tudocomp/io.hpp>
15 
16 
17 namespace tdc {
18 
19 class SuffixTree{
20 public:
21  struct STNode;
22  struct STLeaf;
23  struct STInnerNode;
24 private:
25 
26 
27  //root node
28  STInnerNode* root;
29 
30  //text added to st
31  std::string Text;
32  int pos;
33  uint suffix;
34 
35  //number of suffixes to be added;
36  uint remainder;
37 
38  //active point, from where to start inserting new suffixes
39  STNode* active_node;
40  char active_edge;
41  uint active_length;
42 
43 
44  //saves last added node
45  STInnerNode* last_added_sl;
46 
47  void add_sl(STInnerNode* node){
48 
49  if(last_added_sl != root) {
50  last_added_sl->suffix_link=node;
51  }
52  last_added_sl=node;
53  }
54 
55 public:
56 
57 
58  //computes edge length:
60  if(node->start==-1){
61  return 0;
62  }
63 
64  if(node->end == 0){
65  return pos - node->start+1;
66  } else {
67  return node->end - node->start;
68  }
69  }
70  struct STNode{
71 
72  //represents the edge leading to this node
73  int start;
74  int end;
75  STNode(int s, int e = 0) : start(s), end (e) {}
76  virtual ~STNode(){}
77 
78  };
79 
80  struct STLeaf : STNode {
81  //suffix if leaf
83  //constructor. e=0
84  STLeaf(int s, int e = 0) : STNode(s,e){}
85 
86  virtual ~STLeaf() {}
87 
88  };
89 
90  struct STInnerNode : STNode {
91  //
94 
95  // child nodes
96  std::unordered_map<char, STNode*> child_nodes;
97  //suffix link
99  //constructor. e=0
100 
101  // start(s), end (e)
102  STInnerNode(int s, int e = 0) : STNode(s,e){ suffix_link = NULL; child_nodes.reserve(6);}
103 
104  virtual ~STInnerNode(){}
105 
106  };
107 
108  //constructor
110  //no Text is read
111  pos=-1;
112  Text="";
113  remainder=0;
114  suffix=0;
115 
116 
117  root = new STInnerNode(-1);
118  root->string_depth=0;
119 
120 
121  //active start node is root
122  active_node=root;
123  active_length=0;
124 
125  last_added_sl=root;
126 
127  }
129  root=NULL;
130 
131  active_node=root;
132  last_added_sl=root;
133  }
134 
135 private:
136  inline void add_char(char c){
137  //Text += c;
138  pos++;
139  remainder++;
140  last_added_sl=root;
141 
142  while(remainder > 0){
143  if(active_length==0){
144  active_edge = c;
145  }
146 
147 
148 
149  //if the active node doesnt have the corresponding edge:
150 
151  STInnerNode * active_inner = dynamic_cast<STInnerNode *> (active_node);
152 
153  auto next_it = active_inner->child_nodes.find(active_edge);
154  if(next_it==active_inner->child_nodes.end()){
155  //insert new leaf
156  STLeaf* new_leaf = new STLeaf(pos);
157  active_inner->child_nodes[active_edge] = new_leaf;
158  // leaves.push_back(new_leaf);
159  new_leaf->suffix=suffix++;
160  add_sl(active_inner);
161 
162  } else {
163  STNode* next = active_inner->child_nodes[active_edge] ;
164  //if the active length is greater than the edge length:
165  //switch active node to that
166  //walk down
167  if(active_length>= edge_length(next)){
168  active_node = next;
169  active_length -= edge_length(next);
170  active_edge = Text[pos-active_length];
171  continue;
172  }
173 
174  //if that suffix is already in the tree::
175  if(Text[next->start +active_length] == c){
176  active_length++;
177  add_sl(active_inner);
178  break;
179  }
180 
181  //now split edge if the edge is found
182 
183  STInnerNode* split = new STInnerNode(next->start, next->start+active_length);
184  active_inner->child_nodes[active_edge] = split;
185 
186  STLeaf* leaf = new STLeaf(pos);
187  leaf->suffix=suffix++;
188  split->child_nodes[c] = leaf;
189 
190  next->start=next->start + active_length;
191  split->child_nodes[Text[next->start]] = next;
192  add_sl(split);
193  // leaves.push_back(leaf);
194  }
195  remainder--;
196  if(active_node==root && active_length>0){
197  active_length--;
198  active_edge = Text[pos-remainder+1];
199  }else {
200  if(active_inner->suffix_link != NULL){
201  active_node = active_inner->suffix_link;
202  } else {
203  active_node = root;
204  }
205 
206  }
207  }
208 
209  }
210 public:
211  inline void append_char(char c){
212 
213  Text +=c;
214  add_char(c);
215  }
216 
217  inline void append_string(std::string input){
218  Text += input;
219  for(uint i = 0; i<input.length();i++){
220  add_char(input[i]);
221  }
222  }
223 
224  inline void append_input(Input& input){
225  auto iview = input.as_view();
226  append_input(iview);
227  }
228 
229  inline void append_input(io::InputView & input){
230  Text += input;
231  for (uint i = 0; i < input.size(); i++) {
232  uint8_t c = input[i];
233  add_char(c);
234  }
235  }
236 
237  inline std::string get_text(){
238  return Text;
239  }
240 
241  inline uint get_size(){
242  return Text.size();
243  }
244 
245 
247  return root;
248  }
249 
250  inline std::string get_string_of_edge(STNode* node){
251  return Text.substr(node->start, edge_length(node));
252  }
253 
254 
256  append_input(input);
257  }
258 
260  append_input(input);
261  }
262 
263  SuffixTree(std::string input) : SuffixTree(){
264  append_string(input);
265  }
266 
267 
268 };
269 
270 }
271 
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
void append_input(Input &input)
void append_input(io::InputView &input)
unsigned int uint
Definition: characterhash.h:6
std::string get_text()
SuffixTree(std::string input)
size_type size() const
Returns size of the View.
STNode(int s, int e=0)
SuffixTree(io::InputView &input)
STLeaf(int s, int e=0)
len_compact_t pos
Definition: LZSSFactors.hpp:38
void append_string(std::string input)
SuffixTree::STNode * get_root()
std::string get_string_of_edge(STNode *node)
InputView as_view() const
Provides a view on the input that allows for random access.
Definition: Input.hpp:260
Provides a view on the input that allows for random access.
Definition: InputView.hpp:29
uint edge_length(STNode *node)
std::unordered_map< char, STNode * > child_nodes
SuffixTree(Input &input)
void append_char(char c)
An abstraction layer for algorithm input.
Definition: Input.hpp:37