All Classes Namespaces Files Functions Variables Typedefs Enumerations Friends Macros Pages
prefix_tree.hpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2013 Piotr Smulewicz
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //=======================================================================
15 #ifndef PAAL_PREFIX_TREE_HPP
16 #define PAAL_PREFIX_TREE_HPP
17 
18 #include <vector>
19 namespace paal {
20 namespace greedy {
21 namespace detail {
22 
23 template <typename Letter> class prefix_tree {
24 
25  struct Node {
26  Letter letter = DELIMITER;
27  Node *son = CHILDLESS;
28  std::vector<int> prefixes; // ends of all prefixes of single words in
29  // concatenate words corresponding to Node
30  Node(int _letter) : letter(_letter) {};
31  Node() {}; // root
32  };
33 
34  public:
35  prefix_tree(int length, std::vector<int> const &suffix_array,
36  std::vector<Letter> const &sumWords,
37  std::vector<int> const &lcp,
38  std::vector<int> const &lengthSuffixWord)
39  : m_length(length), m_prefix_tree(m_length), m_which_son_am_i(m_length),
40  m_prefix_to_tree(m_length), m_suffix_to_tree(m_length),
41  m_suffix_array(suffix_array), m_sum_words(sumWords), m_lcp(lcp),
42  m_length_suffix_word(lengthSuffixWord) {}
43 
44  void build_prefix_tree() {
45  m_prefix_tree.push_back(Node()); // root
46  for (auto suffix : m_suffix_array) {
47  // memory protection and we add only whole words in lexographic
48  // order
49  if ((suffix != 0) && (m_sum_words[suffix - 1] == DELIMITER)) {
50  add_word_to_prefix_tree(suffix);
51  }
52  }
53  }
54 
55  void erase_word_form_prefix_tree(int wordBegin) {
56  for (int letterOfWord = 0;
57  m_sum_words[letterOfWord + wordBegin] != DELIMITER;
58  ++letterOfWord) {
59  auto letterIdx = wordBegin + letterOfWord;
60  auto whichSon = m_which_son_am_i[letterIdx];
61  auto &nodePrefixes = m_prefix_to_tree[letterIdx]->prefixes;
62  assert(std::size_t(whichSon) < nodePrefixes.size());
63  int lastPrefix = nodePrefixes.back();
64  nodePrefixes[whichSon] = lastPrefix;
65  m_which_son_am_i[lastPrefix + letterOfWord] = whichSon;
66  nodePrefixes.pop_back();
67  }
68  }
69 
70  // for all suffix of word: if suffix is equal to any prefix of word we
71  // remember position in prefix tree coresponding to suffix
72  void fill_suffix_to_tree() {
73  for (int suffix = m_length - 1, lastWord = 0, commonPrefix = 0;
74  suffix > 0; suffix--) {
75  auto beginOfSuffix = m_suffix_array[suffix];
76  if (beginOfSuffix == 0 ||
77  m_sum_words[beginOfSuffix - 1] == DELIMITER) {
78  lastWord = beginOfSuffix;
79  commonPrefix = m_lcp[suffix];
80  } else {
81  if (commonPrefix == m_length_suffix_word[beginOfSuffix]) {
82  m_suffix_to_tree[suffix] =
83  m_prefix_to_tree[lastWord + commonPrefix - 1];
84  }
85  if (m_lcp[suffix] < commonPrefix) {
86  commonPrefix = m_lcp[suffix];
87  }
88  }
89  }
90  }
91 
92  int get_prefixequal_to_suffix(int suffix, int firstWordInBlock) {
93  Node *nodeCorrespondingToSuffix = m_suffix_to_tree[suffix];
94  if (nodeCorrespondingToSuffix == NO_SUFFIX_IN_TREE) {
95  return NOT_PREFIX;
96  }
97  auto const &overlapPrefixes = nodeCorrespondingToSuffix->prefixes;
98 
99  if (overlapPrefixes.size()) {
100  int whichPrefix = ANY_PREFIX; // which prefix of prefixes equal to
101  // suffix, will be joined
102  // check if first prefix belong to same block as prefix (avoid
103  // loops)
104  if (overlapPrefixes[whichPrefix] == firstWordInBlock) {
105  if (overlapPrefixes.size() >= 2) {
106  whichPrefix = ANY_OTHER_PREFIX;
107  } else {
108  return NOT_PREFIX;
109  }
110  }
111  return overlapPrefixes[whichPrefix];
112  } else {
113  return NOT_PREFIX;
114  }
115  }
116 
117  private:
118  void add_word_to_prefix_tree(int word) {
119  Node *node = &m_prefix_tree[ROOT];
120  int letter = word;
121  // we go by patch until Letter on patch all equal to letter in words
122  // we only check last son because we add words in lexographic order
123  while (node->son != CHILDLESS &&
124  node->son->letter == m_sum_words[letter] &&
125  m_sum_words[letter] != DELIMITER) {
126  node = node->son;
127  ++letter;
128  }
129  // we add new Node
130  while (m_sum_words[letter]) {
131  // if this asserts, you have very strange implementation of stl
132  assert(m_prefix_tree.capacity() > m_prefix_tree.size());
133  m_prefix_tree.push_back(Node(m_sum_words[letter]));
134  node->son = &m_prefix_tree.back();
135  node = &m_prefix_tree.back();
136  ++letter;
137  }
138  node = &m_prefix_tree[ROOT];
139  letter = word;
140  // we fill:
141  // m_prefix_to_tree
142  // m_which_son_am_i
143  // and add to Node.prefixes coresponding prefixes
144  while (m_sum_words[letter] != DELIMITER) {
145  node = node->son;
146  m_prefix_to_tree[letter] = node;
147  m_which_son_am_i[letter] = node->prefixes.size();
148  node->prefixes.push_back(word);
149  ++letter;
150  }
151  }
152  int m_length;
153 
154  std::vector<Node> m_prefix_tree;
155  std::vector<int> m_which_son_am_i;
156 
157  std::vector<Node *> m_prefix_to_tree;
158  std::vector<Node *> m_suffix_to_tree;
159 
160  const std::vector<int> &m_suffix_array;
161  const std::vector<Letter> &m_sum_words;
162  const std::vector<int> &m_lcp;
163  const std::vector<int> &m_length_suffix_word;
164 
165  const static int ROOT = 0;
166  const static int NOT_PREFIX = -1;
167  const static int ANY_PREFIX = 0;
168  const static int ANY_OTHER_PREFIX = 1;
169  const static std::nullptr_t NO_SUFFIX_IN_TREE;
170  const static std::nullptr_t CHILDLESS;
171 
172  public:
173  const static Letter DELIMITER = 0;
174 };
175 }
176 }
177 }
178 #endif // PAAL_PREFIX_TREE_HPP
void lcp(std::vector< int > const &suffix_array, std::vector< int > const &rank, std::vector< int > &lcp, std::vector< Letter > const &sumWords)
fill array lcp lcp[0] is undefined and lcp[i] stores the largest common prefix of the lexicographical...
Definition: lcp.hpp:37
void suffix_array(std::vector< Letter > &text, std::vector< int > &SA, Letter max_letter=0)
detail