15 #ifndef PAAL_SHORTEST_SUPERSTRING_HPP
16 #define PAAL_SHORTEST_SUPERSTRING_HPP
26 #include <boost/range/adaptors.hpp>
31 #include <type_traits>
54 : m_length(count_sum_lenght(words)),
55 m_prefix_tree(m_length, m_suffix_array, m_sum_words, m_lcp,
56 m_length_suffix_word) {
60 suffix_array<Letter>(m_sum_words, m_suffix_array);
64 lcp(m_suffix_array, m_rank, m_lcp, m_sum_words);
66 m_prefix_tree.build_prefix_tree();
68 m_prefix_tree.fill_suffix_to_tree();
84 for (
auto posInSumWords :
irange(1, m_length)) {
85 if ((!m_is_joined_sufiix[m_pos_to_word[posInSumWords]]) &&
86 (m_sum_words[posInSumWords - 1] == m_prefix_tree.DELIMITER)) {
87 for (
int nextLetter = posInSumWords;
88 m_sum_words[nextLetter] != m_prefix_tree.DELIMITER;) {
89 answer.push_back(m_sum_words[nextLetter]);
90 if (m_res[nextLetter] == NO_OVERLAP_STARTS_HERE) {
93 nextLetter = m_res[nextLetter];
102 int count_sum_lenght(
const Words &words) {
104 for (
auto const &word : words) {
105 length += word.size() + 1;
110 void initialize(
const Words &words) {
111 m_nu_words = words.size();
112 m_first_word_in_block_to_last_word_in_block.resize(m_length);
113 m_last_word_in_block_to_first_word_in_block.resize(m_length);
114 m_pos_to_word.resize(m_length);
115 m_length_suffix_word.resize(m_length);
117 m_suffix_array.resize(m_length);
118 m_lcp.resize(m_length);
119 m_rank.resize(m_length);
120 m_res.resize(m_length);
121 m_sum_words.resize(m_length);
122 m_is_joined_prefix.resize(m_nu_words);
123 m_is_joined_sufiix.resize(m_nu_words);
124 m_length_to_pos.resize(m_length);
128 for (
auto const &word : words) {
129 auto wordSize = boost::distance(word);
130 m_length_words.push_back(wordSize);
131 m_length_to_pos[wordSize].push_back(m_length);
132 int noLetterInWord = 0;
133 for (
auto letter : word) {
135 auto globalLetterId = m_length + noLetterInWord;
136 m_sum_words[globalLetterId] = letter;
137 m_pos_to_word[globalLetterId] = wordsId;
138 m_length_suffix_word[globalLetterId] =
139 wordSize - noLetterInWord;
142 m_first_word_in_block_to_last_word_in_block[m_length] = m_length;
143 m_last_word_in_block_to_first_word_in_block[m_length] = m_length;
144 m_length += wordSize + 1;
149 void erase_word_form_prefix_tree(
int word) {
150 m_is_joined_sufiix[m_pos_to_word[word]] = JOINED;
151 m_prefix_tree.erase_word_form_prefix_tree(word);
154 void join_all_words() {
155 auto ovelapSizeRange =
irange(m_length) | boost::adaptors::reversed;
156 for (
auto overlapSize : ovelapSizeRange) {
157 for (
auto word : m_length_to_pos[overlapSize]) {
158 if (m_lcp[m_rank[word]] >= overlapSize) {
160 erase_word_form_prefix_tree(word);
167 for (
auto overlapSize : ovelapSizeRange) {
168 for (
auto word : m_long_words) {
169 join_word(word, overlapSize);
171 for (
auto word : m_length_to_pos[overlapSize]) {
172 if (m_lcp[m_rank[word]] < overlapSize) {
174 m_long_words.push_back(word);
180 void join_word(
int ps,
int overlap) {
181 if (m_is_joined_prefix[m_pos_to_word[ps]] == JOINED) {
185 int suffix = m_rank[ps + m_length_words[m_pos_to_word[ps]] - overlap];
187 int prefix = m_prefix_tree.get_prefixequal_to_suffix(
188 suffix, m_last_word_in_block_to_first_word_in_block[ps]);
190 if (prefix == NOT_PREFIX) {
193 m_res[ps + m_length_words[m_pos_to_word[ps]] - overlap - 1] = prefix;
194 m_is_joined_prefix[m_pos_to_word[ps]] = JOINED;
196 m_last_word_in_block_to_first_word_in_block[
197 m_first_word_in_block_to_last_word_in_block[prefix]] =
198 m_last_word_in_block_to_first_word_in_block[ps];
199 m_first_word_in_block_to_last_word_in_block[
200 m_last_word_in_block_to_first_word_in_block[prefix]] = prefix;
201 erase_word_form_prefix_tree(prefix);
204 int m_length, m_nu_words;
205 std::vector<Letter> m_sum_words;
206 std::vector<int> m_first_word_in_block_to_last_word_in_block,
207 m_last_word_in_block_to_first_word_in_block, m_pos_to_word,
208 m_length_words, m_length_suffix_word, m_suffix_array, m_lcp, m_rank,
210 std::vector<bool> m_is_joined_prefix, m_is_joined_sufiix;
211 std::vector<std::vector<int>> m_length_to_pos;
213 prefix_tree<Letter> m_prefix_tree;
215 const static bool JOINED =
true;
217 const static int NO_OVERLAP_STARTS_HERE = 0;
218 const static int NOT_PREFIX = -1;
232 template <
typename Words>
242 #endif // PAAL_SHORTEST_SUPERSTRING_HPP
typename boost::range_value< Range >::type range_to_elem_t
for given range returns type of its element
void lcp(std::vector< int > const &suffix_array, std::vector< int > const &rank, std::vector< int > &lcp, std::vector< Letter > const &sumWords)
fill array lcp lcp[0] is undefined and lcp[i] stores the largest common prefix of the lexicographical...
Word get_solution()
return word contains all words as subwords, of lenght at most 3.5 larger than shortest superstring...
auto irange(T begin, T end)
irange
class to solve shortest superstring 3.5 aproximation, using greedy algorithm: contract pair of words ...
auto shortestSuperstring(const Words &words) -> decltype(std::declval< detail::shortest_superstring< Words >>().get_solution())
detail
void rank(std::vector< T > const &m_id_to_t, std::vector< Idx > &m_t_to_id, int INVALID_IDX=0)
computes rank i.e. index of element in range