All Classes Namespaces Files Functions Variables Typedefs Enumerations Friends Macros Pages
lsh_functions.hpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2014 Andrzej Pacuk, Piotr Wygocki
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //=======================================================================
15 #ifndef PAAL_LSH_FUNCTIONS_HPP
16 #define PAAL_LSH_FUNCTIONS_HPP
17 
19 #include "paal/utils/functors.hpp"
21 
22 #include <boost/range/algorithm/equal.hpp>
23 #include <boost/range/algorithm/generate.hpp>
24 #include <boost/range/algorithm/min_element.hpp>
25 #include <boost/range/algorithm_ext/iota.hpp>
26 #include <boost/range/counting_range.hpp>
27 #include <boost/range/empty.hpp>
28 #include <boost/range/iterator.hpp>
29 #include <boost/range/numeric.hpp>
30 #include <boost/range/size.hpp>
31 #include <boost/range/adaptor/transformed.hpp>
32 #include <boost/numeric/ublas/vector.hpp>
33 #include <boost/numeric/ublas/vector_sparse.hpp>
34 #include <boost/numeric/ublas/vector_expression.hpp>
35 
36 #include <cassert>
37 #include <cmath>
38 #include <cstddef>
39 #include <random>
40 #include <type_traits>
41 #include <utility>
42 
43 namespace paal {
44 
45 namespace lsh {
46 
50 template <typename IntType = std::size_t>
52  IntType m_chosen_position;
53 
54 public:
56  template<class Archive>
57  void serialize(Archive & ar, const unsigned int version) {
58  ar & m_chosen_position;
59  }
60 
62  projection_hash_function() = default;
63 
70  projection_hash_function(IntType chosen_position) :
71  m_chosen_position(chosen_position) {
72  }
73 
75  bool operator==(projection_hash_function const & other) const {
76  return m_chosen_position == other.m_chosen_position;
77  }
78 
87  template <typename Range>
88  //TODO change to decltype(auto), when it starts working
89  auto operator()(Range &&range) const ->
90  range_to_elem_t<decltype(range)> {
91  //TODO takes to long, use boost assert
92  //assert(m_chosen_position < boost::size(range));
93  return range[m_chosen_position];
94  }
95 };
96 
100 template <typename RandomEngine = std::default_random_engine,
101  typename IntType = std::size_t>
103  mutable RandomEngine m_generator;
104  using distribution_t = std::uniform_int_distribution<IntType>;
105  mutable distribution_t m_distribution;
106 
107  static IntType convert_to_inclusive(IntType upper_bound) {
108  assert(upper_bound > 0);
109  return upper_bound - 1;
110  }
111 
112 public:
120  RandomEngine random_engine = RandomEngine{}) :
121  m_generator(std::move(random_engine)),
122  m_distribution(0, convert_to_inclusive(range_size)) {
123  }
124 
130  //TODO change to auto, when it starts working
132  return projection_hash_function<IntType>(m_distribution(m_generator));
133  }
134 };
135 
136 using hamming_hash_function_generator =
137  random_projection_hash_function_generator<>;
138 
141 template <typename FloatType = double>
144 public:
145  using r_param_t = boost::numeric::ublas::vector<FloatType>;
146 
147 private:
148  r_param_t m_r;
149  FloatType m_b;
150  FloatType m_w;
151 
152 public:
153 
155  template<class Archive>
156  void serialize(Archive & ar, const unsigned int version) {
157  ar & m_r;
158  ar & m_b;
159  ar & m_w;
160  }
161 
170  l_p_hash_function(r_param_t r,
171  FloatType b, FloatType w) :
172  m_r(std::move(r)), m_b(b), m_w(w) {
173  }
174 
176  l_p_hash_function() = default;
177 
179  bool operator==(l_p_hash_function const & other) const {
180  return boost::equal(m_r, other.m_r) &&
181  m_b == other.m_b &&
182  m_w == other.m_w;
183  }
184 
193  template <typename Range>
194  auto operator()(Range &&range) const {
195  //TODO use concept check with VectorExpressionConcept<Range>,
196  //when it works
197 
198  //TODO takes to long, use boost assert
199  //assert(boost::size(m_r) == boost::size(range));
200 
201  using boost::numeric::ublas::inner_prod;
202  auto inner_product = inner_prod(m_r, range);
203  return std::floor((m_b + inner_product) / m_w);
204  }
205 
206 };
207 
211 template <typename FloatType = double,
212  typename RandomEngine = std::default_random_engine,
213  typename Distribution = std::normal_distribution<FloatType>>
215  std::size_t m_range_size;
216  FloatType m_w;
217  mutable RandomEngine m_generator;
218  mutable Distribution m_r_distribution;
219  mutable std::uniform_real_distribution<FloatType> m_b_distribution;
220 
221 public:
229  l_p_hash_function_generator(std::size_t range_size,
230  FloatType w,
231  RandomEngine random_engine = RandomEngine{}) :
232  m_range_size(range_size),
233  m_w(w),
234  m_generator(std::move(random_engine)),
235  m_b_distribution(FloatType{}, w) {
236  }
237 
244  typename l_p_hash_function<FloatType>::r_param_t r(m_range_size);
245  boost::generate(r, [&]() {
246  return m_r_distribution(m_generator);
247  });
248 
249  FloatType b = m_b_distribution(m_generator);
250  return l_p_hash_function<FloatType>(std::move(r), b, m_w);
251  }
252 };
253 
255 template <typename FloatType = double,
256  typename RandomEngine = std::default_random_engine>
257 using l_1_hash_function_generator =
258  l_p_hash_function_generator<FloatType, RandomEngine,
259  std::cauchy_distribution<FloatType>>;
260 
262 template <typename FloatType = double,
263  typename RandomEngine = std::default_random_engine>
264 using l_2_hash_function_generator =
265  l_p_hash_function_generator<FloatType, RandomEngine>;
266 
269  //permutation
270  std::vector<std::size_t> m_perm;
271 
272 public:
273 
275  template<class Archive>
276  void serialize(Archive & ar, const unsigned int version) {
277  ar & m_perm;
278  }
279 
281  template <typename RandomEngine>
282  min_hash_function(std::size_t set_element_upper_bound, RandomEngine && rng)
283  : m_perm(set_element_upper_bound) {
284  boost::iota(m_perm, 0);
285  std::shuffle(m_perm.begin(), m_perm.end(), rng);
286  }
287 
289  min_hash_function() = default;
290 
292  bool operator==(min_hash_function const & other) const {
293  return boost::equal(m_perm, other.m_perm);
294  }
295 
305  template <typename Range>
306  auto operator()(Range &&range) const {
307  static_assert(data_structures::is_sparse_row<Range>::value, "vector must be sparse");
308  using iter_t = typename boost::range_iterator<Range>::type;
309 
310  auto perm_function = [&](iter_t it) {
311  auto set_elem = it.index();
312  assert(set_elem < m_perm.size());
313  return m_perm[set_elem];
314  };
315  auto perm_range =
316  boost::counting_range(range)
317  | boost::adaptors::transformed(utils::make_assignable_functor(perm_function));
318 
319  assert(!boost::empty(perm_range));
320  return *boost::min_element(perm_range);
321  }
322 };
323 
325 template <typename RandomEngine = std::default_random_engine>
327  std::size_t m_range_size;
328  mutable RandomEngine m_generator;
329 
330 public:
332  min_hash_function_generator(std::size_t range_size,
333  RandomEngine random_engine = RandomEngine{}) :
334  m_range_size(range_size),
335  m_generator(std::move(random_engine)) { }
336 
339  return min_hash_function(m_range_size, m_generator);
340  }
341 };
342 using jaccard_hash_function_generator =
343  min_hash_function_generator<>;
344 
345 }
346 
347 }
348 
349 #endif // PAAL_LSH_FUNCTIONS_HPP
350 
typename boost::range_value< Range >::type range_to_elem_t
for given range returns type of its element
bool operator==(projection_hash_function const &other) const
operator==
Factory class for projection_hash_function.
projection_hash_function< IntType > operator()() const
operator()
min_hash_function operator()() const
operator()
l_p_hash_function()=default
default constructor
Factory class for l_p_hash_function.
random_projection_hash_function_generator(IntType range_size, RandomEngine random_engine=RandomEngine{})
constructor
min_hash_function_generator(std::size_t range_size, RandomEngine random_engine=RandomEngine{})
constructor
auto make_assignable_functor(Functor &f)
make function for assignable_functor
Definition: functors.hpp:421
This file contains set of simple useful functors or functor adapters.
void serialize(Archive &ar, const unsigned int version)
serialize
min-wise independent permutations locality sensitive hashing (Jaccard)
min_hash_function(std::size_t set_element_upper_bound, RandomEngine &&rng)
constructor
l_p_hash_function_generator(std::size_t range_size, FloatType w, RandomEngine random_engine=RandomEngine{})
constructor
void serialize(Archive &ar, const unsigned int version)
serialize
void serialize(Archive &ar, const unsigned int version)
serialize
l_p_hash_function(r_param_t r, FloatType b, FloatType w)
constructor
auto operator()(Range &&range) const
operator()
projection_hash_function()=default
default constructor
auto operator()(Range &&range) const
operator()
l_p_hash_function< FloatType > operator()() const
operator()
bool operator==(min_hash_function const &other) const
operator==
bool operator==(l_p_hash_function const &other) const
operator==
hash_function for l_p distance for p in range (0,2]
Factory class for min_hash_function.
projection_hash_function(IntType chosen_position)
constructor
auto operator()(Range &&range) const -> range_to_elem_t< decltype(range)>
operator()
min_hash_function()=default
default constructor