tudocomp
– The TU Dortmund Compression Framework
divsufsort_trsort.hpp
Go to the documentation of this file.
1 /*
2  * This file integrates customized parts of divsufsort into tudocomp.
3  * divsufsort is licensed under the MIT License, which follows.
4  *
5  * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person
8  * obtaining a copy of this software and associated documentation
9  * files (the "Software"), to deal in the Software without
10  * restriction, including without limitation the rights to use,
11  * copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following
14  * conditions:
15  *
16  * The above copyright notice and this permission notice shall be
17  * included in all copies or substantial portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
21  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
23  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
24  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26  * OTHER DEALINGS IN THE SOFTWARE.
27  */
28 
29 #pragma once
30 
33 
35 namespace tdc {
36 namespace libdivsufsort {
37 
38 /*---------------------------------------------------------------------------*/
39 
40 /* Simple insertionsort for small size groups. */
41 template<typename buffer_t>
42 inline void tr_insertionsort(buffer_t& B, const saidx_t ISAd, saidx_t first, saidx_t last) {
43  saidx_t a, b;
44  saidx_t t, r;
45 
46  for(a = first + 1; a < last; ++a) {
47  for(t = B[a], b = a - 1; 0 > (r = B[ISAd + t] - B[ISAd + B[b]]);) {
48  do { B[b + 1] = B[b]; } while((first <= --b) && (B[b] < 0));
49  if(b < first) { break; }
50  }
51  if(r == 0) { B[b] = ~B[b]; }
52  B[b + 1] = t;
53  }
54 }
55 
56 
57 /*---------------------------------------------------------------------------*/
58 
59 template<typename buffer_t>
60 inline void tr_fixdown(buffer_t& B, const saidx_t ISAd, saidx_t SA, saidx_t i, saidx_t size) {
61  saidx_t j, k;
62  saidx_t v;
63  saidx_t c, d, e;
64 
65  for(v = B[SA + i], c = B[ISAd + v]; (j = 2 * i + 1) < size; B[SA + i] = B[SA + k], i = k) {
66  k = j++;
67  d = B[ISAd + B[SA + k]];
68  if(d < (e = B[ISAd + B[SA + j]])) { k = j; d = e; }
69  if(d <= c) { break; }
70  }
71  B[SA + i] = v;
72 }
73 
74 /* Simple top-down heapsort. */
75 template<typename buffer_t>
76 inline void tr_heapsort(buffer_t& B, const saidx_t ISAd, saidx_t SA, saidx_t size) {
77  saidx_t i, m;
78  saidx_t t;
79 
80  m = size;
81  if((size % 2) == 0) {
82  m--;
83  if(B[ISAd + B[SA + m / 2]] < B[ISAd + B[SA + m]]) { SWAP(B[SA + m], B[SA + m / 2]); }
84  }
85 
86  for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(B, ISAd, SA, i, m); }
87  if((size % 2) == 0) { SWAP(B[SA + 0], B[SA + m]); tr_fixdown(B, ISAd, SA, 0, m); }
88  for(i = m - 1; 0 < i; --i) {
89  t = B[SA + 0], B[SA + 0] = B[SA + i];
90  tr_fixdown(B, ISAd, SA, 0, i);
91  B[SA + i] = t;
92  }
93 }
94 
95 
96 /*---------------------------------------------------------------------------*/
97 
98 /* Returns the median of three elements. */
99 template<typename buffer_t>
100 inline saidx_t tr_median3(buffer_t& B, const saidx_t ISAd, saidx_t v1, saidx_t v2, saidx_t v3) {
101  saidx_t t;
102  if(B[ISAd + B[v1]] > B[ISAd + B[v2]]) { SWAP(v1, v2); }
103  if(B[ISAd + B[v2]] > B[ISAd + B[v3]]) {
104  if(B[ISAd + B[v1]] > B[ISAd + B[v3]]) { return v1; }
105  else { return v3; }
106  }
107  return v2;
108 }
109 
110 /* Returns the median of five elements. */
111 template<typename buffer_t>
112 inline saidx_t tr_median5(buffer_t& B, const saidx_t ISAd,
113  saidx_t v1, saidx_t v2, saidx_t v3, saidx_t v4, saidx_t v5) {
114  saidx_t t;
115  if(B[ISAd + B[v2]] > B[ISAd + B[v3]]) { SWAP(v2, v3); }
116  if(B[ISAd + B[v4]] > B[ISAd + B[v5]]) { SWAP(v4, v5); }
117  if(B[ISAd + B[v2]] > B[ISAd + B[v4]]) { SWAP(v2, v4); SWAP(v3, v5); }
118  if(B[ISAd + B[v1]] > B[ISAd + B[v3]]) { SWAP(v1, v3); }
119  if(B[ISAd + B[v1]] > B[ISAd + B[v4]]) { SWAP(v1, v4); SWAP(v3, v5); }
120  if(B[ISAd + B[v3]] > B[ISAd + B[v4]]) { return v4; }
121  return v3;
122 }
123 
124 /* Returns the pivot element. */
125 template<typename buffer_t>
126 inline saidx_t tr_pivot(buffer_t& B, const saidx_t ISAd, saidx_t first, saidx_t last) {
127  saidx_t middle;
128  saidx_t t;
129 
130  t = last - first;
131  middle = first + t / 2;
132 
133  if(t <= 512) {
134  if(t <= 32) {
135  return tr_median3(B, ISAd, first, middle, last - 1);
136  } else {
137  t >>= 2;
138  return tr_median5(B, ISAd, first, first + t, middle, last - 1 - t, last - 1);
139  }
140  }
141  t >>= 3;
142  first = tr_median3(B, ISAd, first, first + t, first + (t << 1));
143  middle = tr_median3(B, ISAd, middle - t, middle, middle + t);
144  last = tr_median3(B, ISAd, last - 1 - (t << 1), last - 1 - t, last - 1);
145  return tr_median3(B, ISAd, first, middle, last);
146 }
147 
148 
149 /*---------------------------------------------------------------------------*/
150 
151 typedef struct _trbudget_t trbudget_t;
152 struct _trbudget_t {
153  saidx_t chance;
154  saidx_t remain;
155  saidx_t incval;
156  saidx_t count;
157 };
158 
159 inline void trbudget_init(trbudget_t *budget, saidx_t chance, saidx_t incval) {
160  budget->chance = chance;
161  budget->remain = budget->incval = incval;
162 }
163 
164 inline saint_t trbudget_check(trbudget_t *budget, saidx_t size) {
165  if(size <= budget->remain) { budget->remain -= size; return 1; }
166  if(budget->chance == 0) { budget->count += size; return 0; }
167  budget->remain += budget->incval - size;
168  budget->chance -= 1;
169  return 1;
170 }
171 
172 
173 /*---------------------------------------------------------------------------*/
174 
175 template<typename buffer_t>
176 inline void tr_partition(buffer_t& B, const saidx_t ISAd,
177  saidx_t first, saidx_t middle, saidx_t last,
178  saidx_t *pa, saidx_t *pb, saidx_t v) {
179  saidx_t a, b, c, d, e, f;
180  saidx_t t, s;
181  saidx_t x = 0;
182 
183  for(b = middle - 1; (++b < last) && ((x = B[ISAd + B[b]]) == v);) { }
184  if(((a = b) < last) && (x < v)) {
185  for(; (++b < last) && ((x = B[ISAd + B[b]]) <= v);) {
186  if(x == v) { SWAP(B[b], B[a]); ++a; }
187  }
188  }
189  for(c = last; (b < --c) && ((x = B[ISAd + B[c]]) == v);) { }
190  if((b < (d = c)) && (x > v)) {
191  for(; (b < --c) && ((x = B[ISAd + B[c]]) >= v);) {
192  if(x == v) { SWAP(B[c], B[d]); --d; }
193  }
194  }
195  for(; b < c;) {
196  SWAP(B[b], B[c]);
197  for(; (++b < c) && ((x = B[ISAd + B[b]]) <= v);) {
198  if(x == v) { SWAP(B[b], B[a]); ++a; }
199  }
200  for(; (b < --c) && ((x = B[ISAd + B[c]]) >= v);) {
201  if(x == v) { SWAP(B[c], B[d]); --d; }
202  }
203  }
204 
205  if(a <= d) {
206  c = b - 1;
207  if((s = a - first) > (t = b - a)) { s = t; }
208  for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(B[e], B[f]); }
209  if((s = d - c) > (t = last - d - 1)) { s = t; }
210  for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(B[e], B[f]); }
211  first += (b - a), last -= (d - c);
212  }
213  *pa = first, *pb = last;
214 }
215 
216 template<typename buffer_t>
217 inline void tr_copy(buffer_t& B, saidx_t ISA, const saidx_t SA,
218  saidx_t first, saidx_t a, saidx_t b, saidx_t last,
219  saidx_t depth) {
220  /* sort suffixes of middle partition
221  by using sorted order of suffixes of left and right partition. */
222  saidx_t c, d, e;
223  saidx_t s, v;
224 
225  v = b - SA - 1;
226  for(c = first, d = a - 1; c <= d; ++c) {
227  if((0 <= (s = B[c] - depth)) && (B[ISA + s] == v)) {
228  B[++d] = s;
229  B[ISA + s] = d - SA;
230  }
231  }
232  for(c = last - 1, e = d + 1, d = b; e < d; --c) {
233  if((0 <= (s = B[c] - depth)) && (B[ISA + s] == v)) {
234  B[--d] = s;
235  B[ISA + s] = d - SA;
236  }
237  }
238 }
239 
240 template<typename buffer_t>
241 inline void tr_partialcopy(buffer_t& B, saidx_t ISA, const saidx_t SA,
242  saidx_t first, saidx_t a, saidx_t b, saidx_t last,
243  saidx_t depth) {
244  saidx_t c, d, e;
245  saidx_t s, v;
246  saidx_t rank, lastrank, newrank = -1;
247 
248  v = b - SA - 1;
249  lastrank = -1;
250  for(c = first, d = a - 1; c <= d; ++c) {
251  if((0 <= (s = B[c] - depth)) && (B[ISA + s] == v)) {
252  B[++d] = s;
253  rank = B[ISA + s + depth];
254  if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
255  B[ISA + s] = newrank;
256  }
257  }
258 
259  lastrank = -1;
260  for(e = d; first <= e; --e) {
261  rank = B[ISA + B[e]];
262  if(lastrank != rank) { lastrank = rank; newrank = e - SA; }
263  if(newrank != rank) { B[ISA + B[e]] = newrank; }
264  }
265 
266  lastrank = -1;
267  for(c = last - 1, e = d + 1, d = b; e < d; --c) {
268  if((0 <= (s = B[c] - depth)) && (B[ISA + s] == v)) {
269  B[--d] = s;
270  rank = B[ISA + s + depth];
271  if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
272  B[ISA + s] = newrank;
273  }
274  }
275 }
276 
277 template<typename buffer_t>
278 inline void tr_introsort(buffer_t& B, saidx_t ISA, saidx_t ISAd,
279  saidx_t SA, saidx_t first, saidx_t last,
280  trbudget_t *budget) {
281 #define STACK_SIZE TR_STACKSIZE
282  struct { saidx_t a, b, c; saint_t d, e; }stack[STACK_SIZE];
283  saidx_t a, b, c;
284  saidx_t t;
285  saidx_t v, x = 0;
286  saidx_t incr = ISAd - ISA;
287  saint_t limit, next;
288  saint_t ssize, trlink = -1;
289 
290  for(ssize = 0, limit = ilg<saidx_t>(last - first);;) {
291 
292  if(limit < 0) {
293  if(limit == -1) {
294  /* tandem repeat partition */
295  tr_partition(B, ISAd - incr, first, first, last, &a, &b, last - SA - 1);
296 
297  /* update ranks */
298  if(a < last) {
299  for(c = first, v = a - SA - 1; c < a; ++c) { B[ISA + B[c]] = v; }
300  }
301  if(b < last) {
302  for(c = a, v = b - SA - 1; c < b; ++c) { B[ISA + B[c]] = v; }
303  }
304 
305  /* push */
306  if(1 < (b - a)) {
307  STACK_PUSH5(-1, a, b, 0, 0); //TODO: is -1 instead of NULL good?
308  STACK_PUSH5(ISAd - incr, first, last, -2, trlink);
309  trlink = ssize - 2;
310  }
311  if((a - first) <= (last - b)) {
312  if(1 < (a - first)) {
313  STACK_PUSH5(ISAd, b, last, ilg<saidx_t>(last - b), trlink);
314  last = a, limit = ilg<saidx_t>(a - first);
315  } else if(1 < (last - b)) {
316  first = b, limit = ilg<saidx_t>(last - b);
317  } else {
318  STACK_POP5(ISAd, first, last, limit, trlink);
319  }
320  } else {
321  if(1 < (last - b)) {
322  STACK_PUSH5(ISAd, first, a, ilg<saidx_t>(a - first), trlink);
323  first = b, limit = ilg<saidx_t>(last - b);
324  } else if(1 < (a - first)) {
325  last = a, limit = ilg<saidx_t>(a - first);
326  } else {
327  STACK_POP5(ISAd, first, last, limit, trlink);
328  }
329  }
330  } else if(limit == -2) {
331  /* tandem repeat copy */
332  a = stack[--ssize].b, b = stack[ssize].c;
333  if(stack[ssize].d == 0) {
334  tr_copy(B, ISA, SA, first, a, b, last, ISAd - ISA);
335  } else {
336  if(0 <= trlink) { stack[trlink].d = -1; }
337  tr_partialcopy(B, ISA, SA, first, a, b, last, ISAd - ISA);
338  }
339  STACK_POP5(ISAd, first, last, limit, trlink);
340  } else {
341  /* sorted partition */
342  if(0 <= B[first]) {
343  a = first;
344  do { B[ISA + B[a]] = a - SA; } while((++a < last) && (0 <= B[a]));
345  first = a;
346  }
347  if(first < last) {
348  a = first; do { B[a] = ~B[a]; } while(B[++a] < 0);
349  next = (B[ISA + B[a]] != B[ISAd + B[a]]) ? ilg<saidx_t>(a - first + 1) : -1;
350  if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { B[ISA + B[b]] = v; } }
351 
352  /* push */
353  if(trbudget_check(budget, a - first)) {
354  if((a - first) <= (last - a)) {
355  STACK_PUSH5(ISAd, a, last, -3, trlink);
356  ISAd += incr, last = a, limit = next;
357  } else {
358  if(1 < (last - a)) {
359  STACK_PUSH5(ISAd + incr, first, a, next, trlink);
360  first = a, limit = -3;
361  } else {
362  ISAd += incr, last = a, limit = next;
363  }
364  }
365  } else {
366  if(0 <= trlink) { stack[trlink].d = -1; }
367  if(1 < (last - a)) {
368  first = a, limit = -3;
369  } else {
370  STACK_POP5(ISAd, first, last, limit, trlink);
371  }
372  }
373  } else {
374  STACK_POP5(ISAd, first, last, limit, trlink);
375  }
376  }
377  continue;
378  }
379 
380  if((last - first) <= TR_INSERTIONSORT_THRESHOLD) {
381  tr_insertionsort(B, ISAd, first, last);
382  limit = -3;
383  continue;
384  }
385 
386  if(limit-- == 0) {
387  tr_heapsort(B, ISAd, first, last - first);
388  for(a = last - 1; first < a; a = b) {
389  for(x = B[ISAd + B[a]], b = a - 1; (first <= b) && (B[ISAd + B[b]] == x); --b) { B[b] = ~B[b]; }
390  }
391  limit = -3;
392  continue;
393  }
394 
395  /* choose pivot */
396  a = tr_pivot(B, ISAd, first, last);
397  SWAP(B[first], B[a]);
398  v = B[ISAd + B[first]];
399 
400  /* partition */
401  tr_partition(B, ISAd, first, first + 1, last, &a, &b, v);
402  if((last - first) != (b - a)) {
403  next = (B[ISA + B[a]] != v) ? ilg<saidx_t>(b - a) : -1;
404 
405  /* update ranks */
406  for(c = first, v = a - SA - 1; c < a; ++c) { B[ISA + B[c]] = v; }
407  if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { B[ISA + B[c]] = v; } }
408 
409  /* push */
410  if((1 < (b - a)) && (trbudget_check(budget, b - a))) {
411  if((a - first) <= (last - b)) {
412  if((last - b) <= (b - a)) {
413  if(1 < (a - first)) {
414  STACK_PUSH5(ISAd + incr, a, b, next, trlink);
415  STACK_PUSH5(ISAd, b, last, limit, trlink);
416  last = a;
417  } else if(1 < (last - b)) {
418  STACK_PUSH5(ISAd + incr, a, b, next, trlink);
419  first = b;
420  } else {
421  ISAd += incr, first = a, last = b, limit = next;
422  }
423  } else if((a - first) <= (b - a)) {
424  if(1 < (a - first)) {
425  STACK_PUSH5(ISAd, b, last, limit, trlink);
426  STACK_PUSH5(ISAd + incr, a, b, next, trlink);
427  last = a;
428  } else {
429  STACK_PUSH5(ISAd, b, last, limit, trlink);
430  ISAd += incr, first = a, last = b, limit = next;
431  }
432  } else {
433  STACK_PUSH5(ISAd, b, last, limit, trlink);
434  STACK_PUSH5(ISAd, first, a, limit, trlink);
435  ISAd += incr, first = a, last = b, limit = next;
436  }
437  } else {
438  if((a - first) <= (b - a)) {
439  if(1 < (last - b)) {
440  STACK_PUSH5(ISAd + incr, a, b, next, trlink);
441  STACK_PUSH5(ISAd, first, a, limit, trlink);
442  first = b;
443  } else if(1 < (a - first)) {
444  STACK_PUSH5(ISAd + incr, a, b, next, trlink);
445  last = a;
446  } else {
447  ISAd += incr, first = a, last = b, limit = next;
448  }
449  } else if((last - b) <= (b - a)) {
450  if(1 < (last - b)) {
451  STACK_PUSH5(ISAd, first, a, limit, trlink);
452  STACK_PUSH5(ISAd + incr, a, b, next, trlink);
453  first = b;
454  } else {
455  STACK_PUSH5(ISAd, first, a, limit, trlink);
456  ISAd += incr, first = a, last = b, limit = next;
457  }
458  } else {
459  STACK_PUSH5(ISAd, first, a, limit, trlink);
460  STACK_PUSH5(ISAd, b, last, limit, trlink);
461  ISAd += incr, first = a, last = b, limit = next;
462  }
463  }
464  } else {
465  if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; }
466  if((a - first) <= (last - b)) {
467  if(1 < (a - first)) {
468  STACK_PUSH5(ISAd, b, last, limit, trlink);
469  last = a;
470  } else if(1 < (last - b)) {
471  first = b;
472  } else {
473  STACK_POP5(ISAd, first, last, limit, trlink);
474  }
475  } else {
476  if(1 < (last - b)) {
477  STACK_PUSH5(ISAd, first, a, limit, trlink);
478  first = b;
479  } else if(1 < (a - first)) {
480  last = a;
481  } else {
482  STACK_POP5(ISAd, first, last, limit, trlink);
483  }
484  }
485  }
486  } else {
487  if(trbudget_check(budget, last - first)) {
488  limit = ilg<saidx_t>(last - first), ISAd += incr;
489  } else {
490  if(0 <= trlink) { stack[trlink].d = -1; }
491  STACK_POP5(ISAd, first, last, limit, trlink);
492  }
493  }
494  }
495 #undef STACK_SIZE
496 }
497 
498 
499 
500 /*---------------------------------------------------------------------------*/
501 
502 /*- Function -*/
503 
504 /* Tandem repeat sort */
505 template<typename buffer_t>
506 inline void trsort(buffer_t& B, saidx_t ISA, saidx_t SA, saidx_t n, saidx_t depth) {
507  saidx_t ISAd;
508  saidx_t first, last;
509  trbudget_t budget;
510  saidx_t t, skip, unsorted;
511 
512  trbudget_init(&budget, ilg<saidx_t>(n) * 2 / 3, n);
513  for(ISAd = ISA + depth; -n < B[SA]; ISAd += ISAd - ISA) {
514  first = SA;
515  skip = 0;
516  unsorted = 0;
517  do {
518  if((t = B[first]) < 0) { first -= t; skip += t; }
519  else {
520  if(skip != 0) { B[first + skip] = skip; skip = 0; }
521  last = SA + B[ISA + t] + 1;
522  if(1 < (last - first)) {
523  budget.count = 0;
524  tr_introsort(B, ISA, ISAd, SA, first, last, &budget);
525  if(budget.count != 0) { unsorted += budget.count; }
526  else { skip = first - last; }
527  } else if((last - first) == 1) {
528  skip = -1;
529  }
530  first = last;
531  }
532  } while(first < (SA + n));
533  if(skip != 0) { B[first + skip] = skip; }
534  if(unsorted == 0) { break; }
535  }
536 }
537 
538 }} //ns
540 
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
constexpr dsflags_t ISA
Definition: TextDSFlags.hpp:12
constexpr dsflags_t SA
Definition: TextDSFlags.hpp:11