All Classes Namespaces Files Functions Variables Typedefs Enumerations Friends Macros Pages
suffix_array.hpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2013 Piotr Smulewicz
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //=======================================================================
15 #ifndef PAAL_SUFFIX_ARRAY_HPP
16 #define PAAL_SUFFIX_ARRAY_HPP
17 
18 #include "paal/utils/irange.hpp"
19 
20 #include <boost/range/algorithm/fill.hpp>
21 #include <boost/range/algorithm/max_element.hpp>
22 #include <boost/range/numeric.hpp>
23 
24 /*
25  * algorithm from:
26  *
27  * http://www.cs.cmu.edu/~guyb/realworld/papersS04/KaSa03.pdf
28  *
29  */
30 
31 namespace paal {
32 
33 namespace detail {
44 // class suffix_array{
45 template <typename Letter>
46 inline bool leq(Letter a1, int a2, Letter b1,
47  int b2) { // lexic. order for pairs
48  return (a1 < b1 || (a1 == b1 && a2 <= b2));
49 }
62 template <typename Letter>
63 inline bool leq(Letter a1, Letter a2, int a3, Letter b1, Letter b2, int b3) {
64  return (a1 < b1 || (a1 == b1 && leq(a2, a3, b2, b3)));
65 }
66 
67 struct radix_pass {
68 
69  template <typename Letter>
70  radix_pass(int max_letter, const std::vector<Letter> & text) {
71  if (max_letter == 0) {
72  max_letter = *boost::max_element(text);
73  }
74  c.resize(max_letter + 2);
75  }
76 
77  // stably sort sortFrom[0..n-1] to sortTo[0..n-1] with keys in 0..K from r
78  template <typename Iterator>
79  void operator()(std::vector<int> const &sortFrom,
80  std::vector<int> &sortTo, Iterator r, int n) { // count occurrences
81  boost::fill(c, 0);
82  for (auto i : irange(n)) {
83  ++c[r[sortFrom[i]] + 1]; // count occurrences
84  }
85 
86  boost::partial_sum(c, c.begin());
87 
88  for (auto i : irange(n)) {
89  sortTo[c[r[sortFrom[i]]]++] = sortFrom[i]; // sort
90  }
91  }
92 private:
93  std::vector<int> c;
94 };
95 
108 // find the suffix array SA of text[0..n-1] in {1..max_letter}^n
109 template <typename Letter>
110 void suffix_array(std::vector<Letter> &text, std::vector<int> &SA,
111  Letter max_letter) {
112  int n = text.size() - 3;
113  assert(n >= 0);
114  int n0 = (n + 2) / 3, n1 = (n + 1) / 3, n2 = n / 3, n02 = n0 + n2;
115  text.resize(text.size() + 3);
116  std::vector<int> text12;
117  std::vector<int> SA12;
118  std::vector<int> text0;
119  std::vector<int> SA0;
120  radix_pass radix{max_letter, text};
121  // generate positions of mod 1 and mod 2 suffixes
122  // the "+(n0-n1)" adds a dummy mod 1 suffix if n%3 == 1
123  for (auto i : irange(n + (n0 - n1))) {
124  if (i % 3 != 0) {
125  text12.push_back(i);
126  }
127  }
128  SA0.resize(n0);
129  SA12.resize(n02 + 3);
130  text12.resize(n02 + 3);
131  // lsb radix sort the mod 1 and mod 2 triples
132  radix(text12, SA12, text.begin() + 2, n02);
133  radix(SA12, text12, text.begin() + 1, n02);
134  radix(text12, SA12, text.begin(), n02);
135 
136  // find lexicographic names of triples
137  int name = 0;
138  Letter c0 = Letter{}, c1 = Letter{}, c2 = Letter{};
139  for (auto i : irange(n02)) {
140  if (text[SA12[i]] != c0 || text[SA12[i] + 1] != c1 ||
141  text[SA12[i] + 2] != c2 || name == 0) {
142  name++;
143  c0 = text[SA12[i]];
144  c1 = text[SA12[i] + 1];
145  c2 = text[SA12[i] + 2];
146  }
147  if (SA12[i] % 3 == 1) {
148  text12[SA12[i] / 3] = name; // left half
149  } else {
150  text12[SA12[i] / 3 + n0] = name; // right half
151  }
152  }
153 
154  // recurse if names are not yet unique
155  if (name < n02) {
156  suffix_array<int>(text12, SA12, name); // parametrized by int intentionally
157  // store unique names in s12 using the suffix array
158  for (auto i : irange(n02)) {
159  text12[SA12[i]] = i + 1;
160  }
161  } else { // generate the suffix array of s12 directly
162  for (auto i : irange(n02)) {
163  SA12[text12[i] - 1] = i;
164  }
165  }
166 
167  // stably sort the mod 0 suffixes from SA12 by their first character
168  for (auto i : irange(n02)) {
169  if (SA12[i] < n0) {
170  text0.push_back(3 * SA12[i]);
171  }
172  }
173  radix(text0, SA0, text.begin(), n0);
174  auto GetI = [&](int t)->int {
175  return SA12[t] < n0 ? SA12[t] * 3 + 1 : (SA12[t] - n0) * 3 + 2;
176  };
177 
178  // merge sorted SA0 suffixes and sorted SA12 suffixes
179  auto p = SA0.begin();
180  int t = n0 - n1;
181  for (auto k = SA.begin(); k < SA.begin() + n; k++) {
182  int i = GetI(t); // pos of current offset 12 suffix
183  int j = (*p); // pos of current offset 0 suffix
184  if (SA12[t] < n0
185  ? leq(text[i], text12[SA12[t] + n0], text[j], text12[j / 3])
186  : leq(text[i], text[i + 1], text12[SA12[t] - n0 + 1], text[j],
187  text[j + 1], text12[j / 3 + n0])) { // suffix from SA12 is
188  // smaller
189  (*k) = i;
190  t++;
191  if (t == n02) { // done --- only SA0 suffixes left
192  k++;
193  if (p < SA0.end()) {
194  k = std::copy(p, SA0.end(), k);
195  p = SA0.end();
196  }
197  }
198  } else {
199  (*k) = j;
200  p++;
201  if (p == SA0.end()) { // done --- only SA12 suffixes left
202  for (k++; t < n02; t++, k++) {
203  (*k) = GetI(t);
204  }
205  }
206  }
207  }
208 }
209 
210 }
211 
212 
225 // find the suffix array SA of text[0..n-1] in {1..max_letter}^n
226 template <typename Letter>
227 void suffix_array(std::vector<Letter> &text, std::vector<int> &SA,
228  Letter max_letter = 0) {
229  text.resize(text.size() + 3);
230  detail::suffix_array(text, SA, max_letter);
231  text.resize(text.size() - 3);
232 }
233 
234 }
235 #endif // PAAL_SUFFIX_ARRAY_HPP
bool leq(Letter a1, int a2, Letter b1, int b2)
return true if pair (a1,a2) is smaller than pair (b1,b2) in lexicographic order and false otherwise ...
auto irange(T begin, T end)
irange
Definition: irange.hpp:22
void suffix_array(std::vector< Letter > &text, std::vector< int > &SA, Letter max_letter=0)
detail
void suffix_array(std::vector< Letter > &text, std::vector< int > &SA, Letter max_letter)
require text[n]=text[n+1]=text[n+2]=0, n&gt;=2 fill suffix_array suffix_array[i] contains the starting po...