#include <boost/algorithm/string.hpp>
#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/function_output_iterator.hpp>
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/numeric/ublas/vector_sparse.hpp>
#include <boost/program_options.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/algorithm/copy.hpp>
#include <boost/range/algorithm/fill.hpp>
#include <boost/range/empty.hpp>
#include <boost/range/size.hpp>
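// lsh-regression: k-nearest-neighbors regression accelerated with locality
// sensitive hashing. The tool trains on an SVM-format file and/or loads a
// serialized model, predicts labels for a test file, and can write both the
// model and the predictions back to disk (see the --help text in main()).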
using point_type_sparse = boost::numeric::ublas::compressed_vector<double>;
using point_type_dense = boost::numeric::ublas::vector<double>;

namespace utils = paal::utils;
namespace po = boost::program_options;

enum Metric {HAMMING, L1, L2, JACCARD};
enum Vector_type {SPARSE, DENSE};
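// Stream extraction operators let boost::program_options parse the --metric and
// --dense options directly into the enums above; matching is case-insensitive.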
std::istream& operator>>(std::istream& in, Metric& metr) {
    std::string token;
    in >> token;
    boost::algorithm::to_lower(token);
    if (token == "hamming" || token == "h")
        metr = HAMMING;
    else if (token == "l1")
        metr = L1;
    else if (token == "l2")
        metr = L2;
    else if (token == "jaccard")
        metr = JACCARD;
    else
        assert(0 && "couldn't determine metric name");
    return in;
}
std::istream& operator>>(std::istream& in, Vector_type& vec_type) {
    std::string token;
    in >> token;
    boost::algorithm::to_lower(token);
    if (token == "dense")
        vec_type = DENSE;
    else if (token == "sparse")
        vec_type = SPARSE;
    else
        assert(0 && "couldn't determine sparse/dense representation");
    return in;
}
    std::size_t m_dimensions;
    std::size_t m_row_buffer_size;
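// m_main drives one full run: it either deserializes a model from --model_in or
// builds a new LSH regression model, optionally updates it with batches read
// from --training_file, evaluates --test_file, and finally writes the
// predictions and/or the serialized model back to disk.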
template <typename Row = point_type_sparse, typename LshFunctionTag>
void m_main(po::variables_map const &vm,
            /* ... */
            LshFunctionTag tag) {
    using lsh_fun = /* ... */ decltype(get_function_generator(tag, p)) /* ... */;

    using hash_result = typename std::remove_reference<
        typename std::result_of<lsh_fun(Row)>::type
    >::type;

    using point_with_result_t = std::tuple<Row, int>;
    using boost::adaptors::transformed;

    std::vector<point_with_result_t> points_buffer;
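    // Either restore a previously trained model from --model_in (its metric and
    // dimensionality must match the current parameters) or build a fresh LSH
    // regression model.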
    if (vm.count("model_in")) {
        Metric metric;
        std::size_t dimensions;
        std::ifstream ifs(vm["model_in"].as<std::string>());
        boost::archive::binary_iarchive ia(ifs);
        /* ... read the model, the metric and the dimensions from the archive ... */
        assert(metric == p.m_metric);
        assert(dimensions == p.m_dimensions);
    } else {
        model = paal::make_lsh_nearest_neighbors_regression_tuple_hash(
                points_buffer | transformed(get_coordinates),
                points_buffer | transformed(get_result),
                p.m_passes,
                get_function_generator(tag, p), p.m_precision, p.m_nthread);
    }
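    // Rows that cannot be parsed are reported and skipped. Training data is then
    // read in chunks of at most row_buffer_size rows, and every chunk updates the
    // model. Input rows are expected in SVM (libsvm-style) format, e.g.
    //   1 3:0.5 7:1.25
    // i.e. a label followed by sparse index:value pairs (format assumed from
    // paal::read_svm).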
    auto ignore_bad_row = [&](std::string const &bad_line) {
        utils::warning("the following line will be ignored because of bad format "
                       "(typically more columns than the passed number of dimensions): ", bad_line);
        return true;
    };
    if (vm.count("training_file")) {
        std::ifstream training_file_stream(vm["training_file"].as<std::string>());
        if (!training_file_stream.good()) {
            /* ... report failure to open the file ... */
        }
        points_buffer.reserve(p.m_row_buffer_size);
        while (training_file_stream.good()) {
            points_buffer.clear();
            auto dimensions = p.m_dimensions;
            paal::read_svm(training_file_stream, dimensions, points_buffer,
                           p.m_row_buffer_size, ignore_bad_row);

            model.update(points_buffer | transformed(get_coordinates),
                         points_buffer | transformed(get_result),
                         p.m_nthread);
        }
    }
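    // Evaluate the test file: each line is parsed into a point, pushed through
    // model.test(), and collected as a (prediction, true label) tuple; the
    // aggregate log loss is computed afterwards.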
    if (vm.count("test_file")) {
        /* ... */
        std::string const test_path = vm["test_file"].as<std::string>();
        /* ... */
        auto line_tester = [&](std::string const &line) {
            double test_result{};

            std::stringstream row_stream(line);
            using result_t = int;
            /* ... construct the svm row reader `row` ... */
            if (!(row_stream >> row)) {
                ignore_bad_row(line);
                return std::make_tuple(test_result, static_cast<result_t>(test_result));
            }
            /* ... build `test_point_iterator` from the parsed row ... */
            auto output_iterator = boost::make_function_output_iterator(
                    [&](double x) { test_result = x; });

            model.test(test_point_iterator, output_iterator);

            return std::make_tuple(test_result, row.get_result());
        };
        /* ... apply line_tester to every line of test_path, collecting the tuples in test_results ... */
        auto loss = paal::log_loss<double>(test_results | transformed(get_prediction),
                                           test_results | transformed(get_real_value));
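        // Optionally write one prediction per line to --result_file.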
        if (vm.count("result_file") > 0) {
            std::ofstream result_file(vm["result_file"].as<std::string>());

            for (auto d : test_results | transformed(get_prediction)) {
                result_file << d << "\n";
            }
        }
    }
    if (vm.count("model_out")) {
        std::ofstream ofs(vm["model_out"].as<std::string>());
        boost::archive::binary_oarchive oa(ofs);
        /* ... serialize the model and the metric ... */
        oa << p.m_dimensions;
    }
}
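// Dispatch on the row representation chosen with --dense: dense uBLAS vectors
// or sparse compressed vectors.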
template <typename LshFunctionTag>
void choose_vector_type_main(po::variables_map const &vm,
                             /* ... */
                             LshFunctionTag tag) {
    if (p.m_dense == DENSE) {
        m_main<point_type_dense>(vm, p, tag);
    } else if (p.m_dense == SPARSE) {
        m_main<point_type_sparse>(vm, p, tag);
    }
}
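// Overload for the Jaccard metric: only the sparse representation is supported,
// as the assert below enforces.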
void choose_vector_type_main(po::variables_map const &vm,
                             /* ... */
                             jaccard_tag tag) {
    assert(p.m_dense == SPARSE &&
           "The Jaccard metric supports only the sparse row representation.");
    m_main<point_type_sparse>(vm, p, tag);
}
int main(int argc, char** argv) {
    /* ... */
    po::options_description desc(
        "LSH nearest neighbors regression - \n"
        "a suite for the fast machine-learning KNN algorithm based on "
        "locality sensitive hashing functions\n\nUsage:\n"
        "This command trains on the training file and writes predictions for the test file:\n"
        "\tlsh-regression --training_file path_to_training_file --test_file path_to_test_file --dimensions number_of_dimensions\n\n"
        "If you want to use the L1 metric with 7 passes and 10 threads, and save the model,\n"
        "you can use the following command:\n"
        "\tlsh-regression -d training.svm -i 7 -n 10 -m L1 --dimensions number_of_dimensions --model_out model.lsh\n\n"
        "Then, to use this model to write predictions to a result file:\n"
        "\tlsh-regression -t test.svm --model_in model.lsh -o results.txt\n\n"
        "Options description");
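    // Each option is bound directly to a field of the parameter struct p; a short
    // single-letter alias follows some of the long names (e.g. "passes,i").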
    desc.add_options()
        ("help,h", "help message")
        ("training_file,d", po::value<std::string>(), "training file path (in SVM format)")
        ("test_file,t", po::value<std::string>(), "test file path (in SVM format; labels may be arbitrary)")
        ("model_in", po::value<std::string>(), "path to an input model, read before any training or testing")
        ("model_out", po::value<std::string>(), "write the model to this file when everything is done")
        ("result_file,o", po::value<std::string>(),
                "path to the output file with predictions (one float for every test point)")
        ("dimensions", po::value<std::size_t>(&p.m_dimensions), "number of dimensions")
        ("passes,i", po::value<unsigned>(&p.m_passes)->default_value(3),
                "number of iterations (default = 3)")
        ("nthread,n", po::value<unsigned>(&p.m_nthread)->default_value(std::thread::hardware_concurrency()),
                "number of threads (default = number of cores)")
        ("dense", po::value<Vector_type>(&p.m_dense)->default_value(SPARSE),
                "Dense/Sparse - use a dense array representation (might significantly speed up the program for dense data)")
        ("metric,m", po::value<Metric>(&p.m_metric)->default_value(HAMMING),
                "metric used for determining similarity between objects - [HAMMING/L1/L2/JACCARD] (default = HAMMING)")
        ("precision,b", po::value<unsigned>(&p.m_precision)->default_value(10),
                "number of hash functions encoding an object")
        ("parm_w,w", po::value<double>(&p.m_w)->default_value(1000.),
                "parameter w; it should be substantially larger than the radius of the expected test point neighborhood")
        ("row_buffer_size", po::value<std::size_t>(&p.m_row_buffer_size)->default_value(100000),
                "size of the row buffer (default = 100000)")
        ("seed", po::value<int>(&p.m_seed)->default_value(0),
                "seed of the random number generator (default = random)");
    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, desc), vm);
    po::notify(vm);

    auto param_is_set_explicitly = [&vm](const std::string &param_name) {
        return vm.count(param_name) > 0 && !vm[param_name].defaulted();
    };
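    // Validate the option combination: at least one of training_file/test_file
    // must be given, and either dimensions or an input model must be provided.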
    if (vm.count("help")) {
        /* ... print desc and exit ... */
    }

    auto error_with_usage = [&](const std::string &message) {
        /* ... print the message and the usage text, then exit with an error ... */
    };

    if (vm.count("training_file") == 0 && vm.count("test_file") == 0) {
        error_with_usage("Neither training_file nor test_file was set");
    }

    if (vm.count("dimensions") == 0 && vm.count("model_in") == 0) {
        error_with_usage("Parameter dimensions was not set");
    }

    if (vm.count("training_file") == 0 && vm.count("model_in") == 0) {
        error_with_usage("If you do not set a training file (training_file), you have to set an input model (model_in)");
    }
    if (vm.count("model_in")) {
        Metric m;
        std::size_t dimensions;
        std::ifstream ifs(vm["model_in"].as<std::string>());
        boost::archive::binary_iarchive ia(ifs);
        /* ... read the metric m and the dimensions from the archive ... */

        auto ignore_serializable_param = [&](std::string const &param_name,
                                             bool param_is_equal_to_serialized) {
            if (param_is_set_explicitly(param_name)) {
                if (param_is_equal_to_serialized) {
                    utils::warning("if an input model is specified, one does not have to specify param ",
                                   param_name);
                } else {
                    utils::warning("the specified param ", param_name,
                                   " is ignored, because it differs from the input model param ",
                                   param_name);
                }
            }
        };
        ignore_serializable_param("metric", m == p.m_metric);
        ignore_serializable_param("dimensions", dimensions == p.m_dimensions);
        auto ignored = [&](std::string const &param, std::string const &param_display) {
            if (param_is_set_explicitly(param)) {
                utils::warning("parameter ", param_display,
                               " was set, but model_in is used, so param ", param_display,
                               " is discarded");
            }
        };
        ignored("parm_w", "w");
        ignored("precision", "precision");
        ignored("seed", "seed");
        ignored("passes", "passes");

        p.m_dimensions = dimensions;
    }
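    // Parameter w is not used by the Hamming hash family, so an explicitly set
    // value is reported and discarded.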
    if (p.m_metric == HAMMING && param_is_set_explicitly("parm_w")) {
        utils::warning("parameter w was set, but the Hamming metric is used, so param w is discarded");
    }
    switch (p.m_metric) {
        case L1:
            choose_vector_type_main(vm, p, l1_tag{});
            break;
        case L2:
            choose_vector_type_main(vm, p, l2_tag{});
            break;
        case HAMMING:
            choose_vector_type_main(vm, p, ham_tag{});
            break;
        case JACCARD:
            choose_vector_type_main(vm, p, jaccard_tag{});
            break;
    }
}
// Related PAAL components used by this example (briefs from the library
// documentation):
//  - read_svm(input_stream, max_dimensions, points, max_points_to_read,
//    should_ignore_bad_row): reads SVM-format rows into a vector of
//    (row, result) tuples; a companion row class can read a single svm row
//  - make_lsh_nearest_neighbors_regression_tuple_hash(training_points,
//    training_results, passes, function_generator, hash_functions_per_point,
//    threads_count): a specialized version of make_lsh_nearest_neighbors_regression
//  - for_each_line(f, file_path, threads_count): basic functionality for
//    processing text files quickly and clearly; interface for using mmaped
//    files with threads
//  - make_singleton_range(elem): creates a singleton range
//  - factory classes for projection_hash_function, l_p_hash_function and
//    min_hash_function: the LSH hash-function families
//  - utils::info / warning / failure: print diagnostic messages; simple useful
//    functors and functor adapters, including one for std::tuple::get<I>
//  - log_loss and likelihood_from_log_loss(log_loss): prediction quality measures
//  - pure_result_of_t: the pure result type of a function (decays const and
//    reference)