All Classes Namespaces Files Functions Variables Typedefs Enumerations Friends Macros Pages
shortest_superstring.hpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2013 Piotr Smulewicz
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //=======================================================================
15 #ifndef PAAL_SHORTEST_SUPERSTRING_HPP
16 #define PAAL_SHORTEST_SUPERSTRING_HPP
17 
18 
24 #include "paal/utils/irange.hpp"
25 
26 #include <boost/range/adaptors.hpp>
27 
28 #include <vector>
29 #include <algorithm>
30 #include <utility>
31 #include <type_traits>
32 
33 namespace paal {
34 namespace greedy {
35 namespace detail {
36 
48 template <typename Words> class shortest_superstring {
49  public:
50  typedef range_to_elem_t<Words> Word;
51  typedef range_to_elem_t<Word> Letter;
52 
53  shortest_superstring(const Words &words)
54  : m_length(count_sum_lenght(words)),
55  m_prefix_tree(m_length, m_suffix_array, m_sum_words, m_lcp,
56  m_length_suffix_word) {
57 
58  initialize(words);
59 
60  suffix_array<Letter>(m_sum_words, m_suffix_array);
61 
62  data_structures::rank(m_suffix_array, m_rank);
63 
64  lcp(m_suffix_array, m_rank, m_lcp, m_sum_words);
65 
66  m_prefix_tree.build_prefix_tree();
67 
68  m_prefix_tree.fill_suffix_to_tree();
69 
70  join_all_words();
71  }
72 
82  Word get_solution() {
83  Word answer;
84  for (auto posInSumWords : irange(1, m_length)) {
85  if ((!m_is_joined_sufiix[m_pos_to_word[posInSumWords]]) &&
86  (m_sum_words[posInSumWords - 1] == m_prefix_tree.DELIMITER)) {
87  for (int nextLetter = posInSumWords;
88  m_sum_words[nextLetter] != m_prefix_tree.DELIMITER;) {
89  answer.push_back(m_sum_words[nextLetter]);
90  if (m_res[nextLetter] == NO_OVERLAP_STARTS_HERE) {
91  ++nextLetter;
92  } else {
93  nextLetter = m_res[nextLetter];
94  }
95  }
96  }
97  }
98  return answer;
99  }
100 
101  private:
102  int count_sum_lenght(const Words &words) {
103  int length = 1;
104  for (auto const &word : words) {
105  length += word.size() + 1;
106  }
107  return length;
108  }
109 
110  void initialize(const Words &words) {
111  m_nu_words = words.size();
112  m_first_word_in_block_to_last_word_in_block.resize(m_length);
113  m_last_word_in_block_to_first_word_in_block.resize(m_length);
114  m_pos_to_word.resize(m_length);
115  m_length_suffix_word.resize(m_length);
116 
117  m_suffix_array.resize(m_length);
118  m_lcp.resize(m_length);
119  m_rank.resize(m_length);
120  m_res.resize(m_length);
121  m_sum_words.resize(m_length);
122  m_is_joined_prefix.resize(m_nu_words);
123  m_is_joined_sufiix.resize(m_nu_words);
124  m_length_to_pos.resize(m_length);
125 
126  m_length = 1;
127  int wordsId = 0;
128  for (auto const &word : words) {
129  auto wordSize = boost::distance(word);
130  m_length_words.push_back(wordSize);
131  m_length_to_pos[wordSize].push_back(m_length);
132  int noLetterInWord = 0;
133  for (auto letter : word) {
134  assert(letter != 0);
135  auto globalLetterId = m_length + noLetterInWord;
136  m_sum_words[globalLetterId] = letter;
137  m_pos_to_word[globalLetterId] = wordsId;
138  m_length_suffix_word[globalLetterId] =
139  wordSize - noLetterInWord;
140  ++noLetterInWord;
141  }
142  m_first_word_in_block_to_last_word_in_block[m_length] = m_length;
143  m_last_word_in_block_to_first_word_in_block[m_length] = m_length;
144  m_length += wordSize + 1;
145  ++wordsId;
146  }
147  }
148 
149  void erase_word_form_prefix_tree(int word) {
150  m_is_joined_sufiix[m_pos_to_word[word]] = JOINED;
151  m_prefix_tree.erase_word_form_prefix_tree(word);
152  }
153 
154  void join_all_words() {
155  auto ovelapSizeRange = irange(m_length) | boost::adaptors::reversed;
156  for (auto overlapSize : ovelapSizeRange) {
157  for (auto word : m_length_to_pos[overlapSize]) {
158  if (m_lcp[m_rank[word]] >= overlapSize) { // check if word is
159  // substring
160  erase_word_form_prefix_tree(word);
161  }
162  }
163  }
164 
165  // in each iteration we join all pair of words who have overlap size
166  // equal overlapSize
167  for (auto overlapSize : ovelapSizeRange) {
168  for (auto word : m_long_words) {
169  join_word(word, overlapSize);
170  }
171  for (auto word : m_length_to_pos[overlapSize]) {
172  if (m_lcp[m_rank[word]] < overlapSize) { // check if word is not
173  // substring
174  m_long_words.push_back(word);
175  }
176  }
177  }
178  }
179 
180  void join_word(int ps, int overlap) {
181  if (m_is_joined_prefix[m_pos_to_word[ps]] == JOINED) {
182  return;
183  };
184 
185  int suffix = m_rank[ps + m_length_words[m_pos_to_word[ps]] - overlap];
186 
187  int prefix = m_prefix_tree.get_prefixequal_to_suffix(
188  suffix, m_last_word_in_block_to_first_word_in_block[ps]);
189 
190  if (prefix == NOT_PREFIX) {
191  return;
192  }
193  m_res[ps + m_length_words[m_pos_to_word[ps]] - overlap - 1] = prefix;
194  m_is_joined_prefix[m_pos_to_word[ps]] = JOINED;
195 
196  m_last_word_in_block_to_first_word_in_block[
197  m_first_word_in_block_to_last_word_in_block[prefix]] =
198  m_last_word_in_block_to_first_word_in_block[ps];
199  m_first_word_in_block_to_last_word_in_block[
200  m_last_word_in_block_to_first_word_in_block[prefix]] = prefix;
201  erase_word_form_prefix_tree(prefix);
202  }
203 
204  int m_length, m_nu_words;
205  std::vector<Letter> m_sum_words;
206  std::vector<int> m_first_word_in_block_to_last_word_in_block,
207  m_last_word_in_block_to_first_word_in_block, m_pos_to_word,
208  m_length_words, m_length_suffix_word, m_suffix_array, m_lcp, m_rank,
209  m_res, m_long_words;
210  std::vector<bool> m_is_joined_prefix, m_is_joined_sufiix;
211  std::vector<std::vector<int>> m_length_to_pos;
212 
213  prefix_tree<Letter> m_prefix_tree;
214 
215  const static bool JOINED = true;
216 
217  const static int NO_OVERLAP_STARTS_HERE = 0;
218  const static int NOT_PREFIX = -1;
219 };
220 }
221 
232 template <typename Words>
233 auto shortestSuperstring(const Words &words)->decltype(
234  std::declval<detail::shortest_superstring<Words>>().get_solution()) {
236  return solver.get_solution();
237 }
238 ;
239 
240 }
241 }
242 #endif // PAAL_SHORTEST_SUPERSTRING_HPP
typename boost::range_value< Range >::type range_to_elem_t
for given range returns type of its element
void lcp(std::vector< int > const &suffix_array, std::vector< int > const &rank, std::vector< int > &lcp, std::vector< Letter > const &sumWords)
fill array lcp lcp[0] is undefined and lcp[i] stores the largest common prefix of the lexicographical...
Definition: lcp.hpp:37
Word get_solution()
return word contains all words as subwords, of lenght at most 3.5 larger than shortest superstring...
auto irange(T begin, T end)
irange
Definition: irange.hpp:22
class to solve shortest superstring 3.5 aproximation, using greedy algorithm: contract pair of words ...
auto shortestSuperstring(const Words &words) -> decltype(std::declval< detail::shortest_superstring< Words >>().get_solution())
detail
void rank(std::vector< T > const &m_id_to_t, std::vector< Idx > &m_t_to_id, int INVALID_IDX=0)
computes rank i.e. index of element in range
Definition: bimap.hpp:364