All Classes Namespaces Files Functions Variables Typedefs Enumerations Friends Macros Pages
lsh-regression.cpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2014 Karol Wegrzycki, Piotr Wygocki
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //=======================================================================
18 #include "paal/utils/functors.hpp"
20 #include "paal/utils/read_svm.hpp"
23 
24 #include <boost/algorithm/string.hpp>
25 #include <boost/archive/binary_iarchive.hpp>
26 #include <boost/archive/binary_oarchive.hpp>
27 #include <boost/function_output_iterator.hpp>
28 #include <boost/iostreams/device/mapped_file.hpp>
29 #include <boost/numeric/ublas/vector_sparse.hpp>
30 #include <boost/program_options.hpp>
31 #include <boost/range/adaptor/transformed.hpp>
32 #include <boost/range/algorithm/copy.hpp>
33 #include <boost/range/algorithm/fill.hpp>
34 #include <boost/range/empty.hpp>
35 #include <boost/range/size.hpp>
36 
37 #include <cstdlib>
38 #include <iostream>
39 #include <random>
40 #include <sstream>
41 #include <string>
42 #include <thread>
43 #include <utility>
44 
45 using point_type_sparse = boost::numeric::ublas::compressed_vector<double>;
46 using point_type_dense = boost::numeric::ublas::vector<double>;
47 
49 namespace utils = paal::utils;
50 namespace po = boost::program_options;
51 
/// Distance metric selectable through the --metric command-line option.
enum Metric {
    HAMMING,
    L1,
    L2,
    JACCARD
};
/// Row representation selectable through the --dense command-line option.
enum Vector_type {
    SPARSE,
    DENSE
};
54 
// Empty tag types: one per metric, used only to select the matching
// get_function_generator / choose_vector_type_main overload.
struct l1_tag {};
struct l2_tag {};
struct ham_tag {};
struct jaccard_tag {};
59 
60 std::istream& operator>>(std::istream& in, Metric& metr) {
61  std::string token;
62  in >> token;
63  boost::algorithm::to_lower(token);
64  if (token == "hamming" || token == "h")
65  metr = HAMMING;
66  else if (token == "l1")
67  metr = L1;
68  else if (token == "l2")
69  metr = L2;
70  else if (token == "jaccard")
71  metr = JACCARD;
72  else
73  assert(0 && "couldn't conclude metric name");
74  return in;
75 }
76 
77 std::istream& operator>>(std::istream& in, Vector_type & vec_type) {
78  std::string token;
79  in >> token;
80  boost::algorithm::to_lower(token);
81  if (token == "dense")
82  vec_type = DENSE;
83  else if (token == "sparse")
84  vec_type = SPARSE;
85  else
86  assert(0 && "couldn't conclude sparse/dense representation");
87  return in;
88 }
89 
90 struct params {
91  unsigned m_passes;
92  unsigned m_nthread;
93  std::size_t m_dimensions;
94  std::size_t m_row_buffer_size;
95  unsigned m_precision;
96  int m_seed;
97  double m_w;
98  Vector_type m_dense;
99  Metric m_metric;
100 };
101 
102 auto get_function_generator(l1_tag, params const &p) {
103  return paal::lsh::l_1_hash_function_generator<>{p.m_dimensions, p.m_w, std::default_random_engine(p.m_seed)};
104 }
105 
106 auto get_function_generator(l2_tag, params const &p) {
107  return paal::lsh::l_2_hash_function_generator<>{p.m_dimensions, p.m_w, std::default_random_engine(p.m_seed)};
108 }
109 
110 auto get_function_generator(ham_tag, params const &p) {
111  return paal::lsh::hamming_hash_function_generator(p.m_dimensions, std::default_random_engine(p.m_seed));
112 }
113 
114 auto get_function_generator(jaccard_tag, params const &p) {
115  return paal::lsh::jaccard_hash_function_generator(p.m_dimensions, std::default_random_engine(p.m_seed));
116 }
117 
// m_main: runs the whole pipeline for one row representation (Row) and one
// hash-function family (chosen by `tag`):
//   1. load a serialized model ("model_in") or construct a new one,
//   2. optionally train it on "training_file",
//   3. optionally evaluate it on "test_file" (and dump predictions to
//      "result_file"),
//   4. optionally serialize it to "model_out".
//
// vm  - parsed command-line options; only the keys quoted above are read here.
// p   - run parameters (dimensions, passes, thread count, ...).
// tag - forwarded to get_function_generator to pick the generator overload.
//
// NOTE(review): the embedded line numbering of this listing jumps over the
// original lines 123, 130 and 150, so part of the lsh_fun alias, the
// declaration of model_t and the beginning of the model-construction
// statement are not visible here.
118 template <typename Row = point_type_sparse, typename LshFunctionTag>
119 void m_main(po::variables_map const &vm,
120  params const &p,
121  LshFunctionTag tag) {
// NOTE(review): original line 123 (between the next two lines) is missing
// from this listing.
122  using lsh_fun = paal::pure_result_of_t<
124  decltype(get_function_generator(tag, p))
125  >()
126  >;
// Type produced by applying lsh_fun to a Row, with any reference removed.
127  using hash_result = typename std::remove_reference<
128  typename std::result_of<lsh_fun(Row)>::type
129  >::type;
// NOTE(review): original line 130 is missing here; presumably it declared
// model_t, which is used below - confirm against the real source.
// A training row: element 0 holds the coordinates, element 1 the int label.
131  using point_with_result_t = std::tuple<Row, int>;
132  using boost::adaptors::transformed;
133  constexpr paal::utils::tuple_get<0> get_coordinates{};
134  constexpr paal::utils::tuple_get<1> get_result{};
135  std::vector<point_with_result_t> points_buffer;
136 
137  model_t model;
138 
// Step 1: restore a stored model, or build a new one from the (still empty)
// points_buffer.
139  if (vm.count("model_in")) {
140  Metric metric;
141  std::size_t dimensions;
// NOTE(review): the stream is opened without std::ios::binary although it
// feeds a binary archive, and it is not checked for open failure - confirm
// this is intended.
142  std::ifstream ifs(vm["model_in"].as<std::string>());
143  boost::archive::binary_iarchive ia(ifs);
// Read order mirrors the write order used for "model_out" below.
144  ia >> metric;
145  ia >> dimensions;
146  ia >> model;
// Stored metric/dimensions must agree with p; these checks are asserts, so
// they are compiled out when NDEBUG is defined.
147  assert(metric == p.m_metric);
148  assert(dimensions == p.m_dimensions);
149  } else {
// NOTE(review): original line 150 is missing here; the following lines are
// the argument list of the model-construction call whose start is not shown.
151  points_buffer | transformed(get_coordinates),
152  points_buffer | transformed(get_result),
153  p.m_passes,
154  get_function_generator(tag, p) , p.m_precision, p.m_nthread);
155  }
156 
// Callback for malformed input rows: logs the offending line and returns
// true (passed to read_svm as its should_ignore_bad_row argument).
157  auto ignore_bad_row = [&](std::string const &bad_line) {
158  utils::warning("following line will be ignored cause of bad format (typically more columns than passed dimensions): ", bad_line);
159  return true;
160  };
161 
// Step 2: training. The file is consumed in batches of at most
// p.m_row_buffer_size rows; the model is updated once per batch.
162  if (vm.count("training_file")) {
163  std::ifstream training_file_stream(vm["training_file"].as<std::string>());
164  if (!training_file_stream.good()) {
165  utils::failure("training file does not exist or is empty!");
166  }
167 
168  points_buffer.reserve(p.m_row_buffer_size);
169  while (training_file_stream.good()) {
170  points_buffer.clear();
// read_svm takes the dimension count by non-const reference, and p is const,
// so a mutable copy is handed over.
171  auto dimensions = p.m_dimensions;
172  paal::read_svm(training_file_stream, dimensions, points_buffer, p.m_row_buffer_size, ignore_bad_row);
173 
174  model.update(points_buffer | transformed(get_coordinates),
175  points_buffer | transformed(get_result),
176  p.m_nthread);
177  }
178  }
179 
// Step 3: testing.
180  if (vm.count("test_file")) {
181  // Our algorithm returns range<tuple<Prediction,RealValue>>
182  constexpr paal::utils::tuple_get<0> get_prediction{};
183  constexpr paal::utils::tuple_get<1> get_real_value{};
184  std::string const test_path = vm["test_file"].as<std::string>();
185 
186  // Lambda - for given line return tuple<LshPrediction,RealValue>>
187  auto line_tester = [&](std::string const & line) {
188  double test_result{};
189 
190  std::stringstream row_stream(line);
191  using result_t = int;
192  paal::detail::svm_row<Row, result_t> row{p.m_dimensions};
193  if (!(row_stream >> row)) {
194  ignore_bad_row(line);
195  // prediction equal = result, should have the same impact on global result as ignoring the row
196  return std::make_tuple(test_result, static_cast<result_t>(test_result));
197  }
198 
// The model is queried with a one-element range; the output iterator stores
// whatever value the model writes into test_result.
199  auto test_point_iterator = paal::utils::make_singleton_range(row.get_coordinates());
200  auto output_iterator = boost::make_function_output_iterator([&](double x){test_result = x;});
201 
202  model.test(test_point_iterator, output_iterator);
203 
204  return std::make_tuple(test_result,row.get_result());
205  };
206 
207  // compute results using n threads and our lambda
208  auto test_results = paal::data_structures::for_each_line(line_tester, test_path, p.m_nthread);
209 
210  auto loss = paal::log_loss<double>(test_results | transformed(get_prediction),
211  test_results | transformed(get_real_value));
212 
213  utils::info("logloss on test set = ", loss, ", likelihood = ", paal::likelihood_from_log_loss(loss));
214 
// Optionally dump the predictions, one per line.
215  if (vm.count("result_file") > 0) {
216  std::ofstream result_file(vm["result_file"].as<std::string>());
217 
218  for (auto d: test_results | transformed(get_prediction)) {
219  result_file << d << "\n";
220  }
221  }
222  }
223 
// Step 4: serialize metric, dimensions and model (same order as read above).
// NOTE(review): like the input stream, this one is opened without
// std::ios::binary - confirm.
224  if (vm.count("model_out")) {
225  std::ofstream ofs(vm["model_out"].as<std::string>());
226  boost::archive::binary_oarchive oa(ofs);
227  oa << p.m_metric;
228  oa << p.m_dimensions;
229  oa << model;
230  }
231 }
232 
233 template <typename LshFunctionTag>
234 void choose_vector_type_main(po::variables_map const &vm,
235  params const &p,
236  LshFunctionTag tag) {
237  if (p.m_dense == DENSE) {
238  m_main<point_type_dense>(vm,p,tag);
239  } else if( p.m_dense == SPARSE) {
240  m_main<point_type_sparse>(vm,p,tag);
241  }
242 }
243 
244 void choose_vector_type_main(po::variables_map const &vm,
245  params const &p,
246  jaccard_tag tag) {
247  assert(p.m_dense == SPARSE && "Jaccard metric supports only sparse rows representation.");
248  m_main<point_type_sparse>(vm,p,tag);
249 }
250 
251 int main(int argc, char** argv)
252 {
253  params p{};
254 
255  po::options_description desc("LSH nearest neighbors regression - \n"\
256  "suite for fast machine learning KNN algorithm which is using "\
257  "locality sensitive hashing functions\n\nUsage:\n"\
258  "This command will train on training file and output the predictions in test_file:\n"
259  "\tlsh-regression --training_file path_to_training_file --test_file path_to_test_file --dimensions number_of_dimensions\n\n"\
260  "If you want to use L1 metric, with 7 passes and 10 threads, and save model\n"\
261  "you can use following command:\n"
262  "\tlsh-regression -d training.svm -i 7 -n 10 -m L1 --dimensions number_of_dimensions --model_out model.lsh\n\n"\
263  "Then if you want to use this model to make a prediction to result_file:\n"\
264  "\tlsh-regression -t test.svm --model_in model.lsh -o results.txt\n\n"
265  "Options description");
266 
267  desc.add_options()
268  ("help,h", "help message")
269  ("training_file,d", po::value<std::string>(), "training file path (in SVM format)")
270  ("test_file,t", po::value<std::string>(), "test file path (in SVM format, it doesn't matter what label says)")
271  ("model_in", po::value<std::string>(), "path to model, before doing any training or testing")
272  ("model_out", po::value<std::string>(), "Write the model to this file when everything is done")
273  ("result_file,o", po::value<std::string>(), "path to the file with prediction " \
274  "(float for every test in test set)")
275  ("dimensions", po::value<std::size_t>(&p.m_dimensions), "number of dimensions")
276  ("passes,i", po::value<unsigned>(&p.m_passes)->default_value(3), "number of iteration (default value = 3)")
277  ("nthread,n", po::value<unsigned>(&p.m_nthread)->default_value(std::thread::hardware_concurrency()),
278  "number of threads (default = number of cores)")
279  ("dense", po::value<Vector_type>(&p.m_dense)->default_value(SPARSE), "Dense/Sparse - Allows to use dense\
280  array representation (It might significantly speed up program for dense data)")
281  ("metric,m", po::value<Metric>(&p.m_metric)->default_value(HAMMING), "Metric used for determining " \
282  "similarity between objects - [HAMMING/L1/L2/JACCARD] (default = Hamming)")
283  ("precision,b", po::value<unsigned>(&p.m_precision)->default_value(10), "Number " \
284  "of hashing function that are encoding the object")
285  ("parm_w,w", po::value<double>(&p.m_w)->default_value(1000.), "Parameter w " \
286  "should be essentially bigger than radius of expected test point neighborhood")
287  ("row_buffer_size", po::value<std::size_t>(&p.m_row_buffer_size)->default_value(100000),
288  "size of row buffer (default value = 100000)")
289  ("seed", po::value<int>(&p.m_seed)->default_value(0), "Seed of random number generator, (default = random)")
290  ;
291 
292  po::variables_map vm;
293  po::store(po::parse_command_line(argc, argv, desc), vm);
294  po::notify(vm);
295 
296  auto param_is_set_explicitly = [&vm] (const std::string &param_name) {
297  return vm.count(param_name) > 0 && !vm[param_name].defaulted();
298  };
299 
300  if (vm.count("help")) {
301  utils::info(desc);
302  return EXIT_SUCCESS;
303  }
304 
305  auto error_with_usage = [&] (const std::string &message) {
306  utils::failure(message, "\n", desc);
307  };
308 
309  if (vm.count("training_file") == 0 && vm.count("test_file") == 0) {
310  error_with_usage("Neither training_file nor test_file were set");
311  }
312 
313  if (vm.count("dimensions") == 0 && vm.count("model_in") == 0) {
314  error_with_usage("Parameter dimensions was not set");
315  }
316 
317  if (vm.count("training_file") == 0 && vm.count("model_in") == 0) {
318  error_with_usage("If you don't set training file (training_file) you have to set input model (model_in)");
319  }
320 
321  if (vm.count("model_in")) {
322  Metric m;
323  std::size_t dimensions;
324  std::ifstream ifs(vm["model_in"].as<std::string>());
325  boost::archive::binary_iarchive ia(ifs);
326 
327  ia >> m;
328  ia >> dimensions;
329 
330  auto ignore_serializable_param = [&](std::string const &param_name, bool param_is_equal_to_serialized) {
331  if (param_is_set_explicitly(param_name)) {
332  if (param_is_equal_to_serialized) {
333  utils::warning("if input model is specified one does not have to specify param ", param_name);
334  } else {
335  utils::warning("the specified param ", param_name, " is ignored, because it differs from the input model param ", param_name);
336  }
337  }
338  };
339  ignore_serializable_param("metric", m == p.m_metric);
340  ignore_serializable_param("dimensions", dimensions == p.m_dimensions);
341 
342  auto ignored = [&](std::string const & param, std::string const & param_display) {
343  if (param_is_set_explicitly(param)) {
344  utils::warning("parameter ", param_display, " was set, but model_in is used, param ", param_display, " is discarded");
345  }
346  };
347  ignored("parm_w", "w");
348  ignored("precision", "precision");
349  ignored("seed", "seed");
350  ignored("passes", "passes");
351 
352  p.m_metric = m;
353  p.m_dimensions = dimensions;
354  } else {
355  if (p.m_metric == HAMMING && param_is_set_explicitly("parm_w")) {
356  utils::warning("parameter w was set, but hamming metric is used, param w is discarded");
357  }
358  }
359 
360  switch (p.m_metric) {
361  case L1: choose_vector_type_main(vm, p, l1_tag{});
362  break;
363 
364  case L2: choose_vector_type_main(vm, p, l2_tag{});
365  break;
366 
367  case HAMMING: choose_vector_type_main(vm, p, ham_tag{});
368  break;
369 
370  case JACCARD: choose_vector_type_main(vm, p, jaccard_tag{});
371  break;
372 
373  default:
374  assert(false);
375  }
376  return EXIT_SUCCESS;
377 }
auto for_each_line(Functor f, std::string const &file_path, unsigned threads_count=std::thread::hardware_concurrency())
for_each_line function provides basic functionality for processing text files quickly and clearly...
auto make_singleton_range(Elem &&elem) -> decltype(boost::make_iterator_range(make_singleton_iterator_begin(std::forward< Elem >(elem)), make_singleton_iterator_end< Elem >()))
function to create a singleton range
Factory class for projection_hash_function.
auto info(Arg &&arg, Args...args)
prints info message
Factory class for l_p_hash_function.
Interface for using mmaped files with threads.
auto make_lsh_nearest_neighbors_regression_tuple_hash(TrainingPoints &&training_points, TrainingResults &&training_results, unsigned passes, FunctionGenerator &&function_generator, unsigned hash_functions_per_point, unsigned threads_count=std::thread::hardware_concurrency())
This is the special version of make_lsh_nearest_neighbors_regression. This version assumes that hash ...
This file contains set of simple useful functors or functor adapters.
class that can read single svm row
Definition: read_svm.hpp:64
FloatType likelihood_from_log_loss(FloatType log_loss)
auto warning(Arg &&arg, Args...args)
prints warning message
functor for std::tuple::get<I>
Definition: functors.hpp:1002
auto failure(Arg &&arg, Args...args)
prints failure message
void read_svm(std::istream &input_stream, std::size_t &max_dimensions, std::vector< std::tuple< RowType, ResultType >> &points, std::size_t max_points_to_read, ShouldIgnoreBadRow &&should_ignore_bad_row=ShouldIgnoreBadRow{})
detail
Definition: read_svm.hpp:206
Factory class for min_hash_function.
typename std::decay< typename std::result_of< F >::type >::type pure_result_of_t
return pure type of function (decays const and reference)