All Classes Namespaces Files Functions Variables Typedefs Enumerations Friends Macros Pages
frequent-directions.cpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2015
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //=======================================================================
17 #include "paal/utils/functors.hpp"
18 #include "paal/utils/irange.hpp"
20 #include "paal/utils/read_rows.hpp"
22 
23 #include <boost/archive/binary_iarchive.hpp>
24 #include <boost/archive/binary_oarchive.hpp>
25 #include <boost/numeric/ublas/io.hpp>
26 #include <boost/numeric/ublas/matrix.hpp>
27 #include <boost/program_options.hpp>
28 
29 #include <cstdlib>
30 #include <iostream>
31 #include <fstream>
32 #include <string>
33 #include <vector>
34 
35 namespace utils = paal::utils;
36 namespace po = boost::program_options;
37 using coordinate_t = double;
38 using matrix_t = boost::numeric::ublas::matrix<coordinate_t>;
40 
/**
 * @brief Numeric and boolean settings filled in from the command line.
 *
 * Every field has a default member initializer so that a default-constructed
 * object never holds indeterminate values; the defaults are the same values
 * that value-initialization (`params p{}`) produces.
 */
struct params {
    /// Number of sketch rows (option "sketch_rows").
    std::size_t m_sketch_rows = 0;
    /// Sketch compress size (option "sketch_compress_size").
    std::size_t m_sketch_compress_size = 0;
    //TODO
    //unsigned m_nthread;
    /// Size of the row buffer used while reading input (option "row_buffer_size").
    std::size_t m_row_buffer_size = 0;
    /// Whether the sketch is compressed after all data was processed (option "final_compress").
    bool m_compress_at_end = false;
};
49 
50 void m_main(po::variables_map const &vm, params const &p,
51  std::istream &input_stream, std::ostream &output_stream) {
52  fd_t fd_sketch;
53 
54  std::vector<std::vector<coordinate_t>> row_buffer;
55  row_buffer.reserve(p.m_row_buffer_size);
56 
57  auto ignore_bad_row = [&](std::string const &bad_line) {
58  utils::warning("following line will be ignored cause of bad format: ", bad_line);
59  return true;
60  };
61 
62  std::size_t rows_count;
63  std::size_t columns_count;
64  if (vm.count("model_in")) {
65  std::ifstream ifs(vm["model_in"].as<std::string>());
66  boost::archive::binary_iarchive ia(ifs);
67  ia >> fd_sketch;
68  auto sketch = fd_sketch.get_sketch().first;
69  rows_count = sketch.size1();
70  columns_count = sketch.size2();
71  }
72  else {
73  paal::read_rows_first_row_size<coordinate_t>
74  (input_stream, row_buffer, p.m_row_buffer_size, ignore_bad_row);
75 
76  if(row_buffer.empty()) {
77  utils::failure("Empty input data");
78  }
79 
80  rows_count = p.m_sketch_rows;
81  columns_count = boost::size(row_buffer.front());
82  if(vm.count("sketch_compress_size")) {
83  fd_sketch = paal::make_frequent_directions<coordinate_t>(rows_count, columns_count, p.m_sketch_compress_size);
84  }
85  else {
86  fd_sketch = paal::make_frequent_directions<coordinate_t>(rows_count, columns_count);
87  }
88 
89  fd_sketch.update_range(row_buffer);
90  }
91 
92  while (input_stream.good()) {
93  row_buffer.clear();
94  paal::read_rows<coordinate_t>
95  (input_stream, row_buffer, columns_count, p.m_row_buffer_size, ignore_bad_row);
96  fd_sketch.update_range(row_buffer);
97  }
98 
99  if (p.m_compress_at_end) {
100  fd_sketch.compress();
101  }
102 
103  auto sketch = fd_sketch.get_sketch().first;
104  boost::numeric::ublas::matrix_range<matrix_t> sketch_range (sketch,
105  boost::numeric::ublas::range(0, fd_sketch.get_sketch().second),
106  boost::numeric::ublas::range(0, columns_count));
107  paal::print_matrix(output_stream, sketch_range, " ");
108  output_stream << std::endl;
109 
110  if (vm.count("model_out")) {
111  std::ofstream ofs(vm["model_out"].as<std::string>());
112  boost::archive::binary_oarchive oa(ofs);
113  oa << fd_sketch;
114  }
115 
116 }
117 
118 int main(int argc, char** argv) {
119  params p{};
120 
121  po::options_description desc("Frequent-directions - \n"\
122  "suite for a matrix sketching using Singular Value Decomposition\n\nUsage:\n"\
123  "This command will read data from standard input and write computed sketch to standard output:\n"\
124  "\tfrequent-directions --sketch_rows numer_of_sketch_rows\n\n"\
125  "If you want to read data from an input_file and write computed sketch to an output_file you can use following command:\n"\
126  "\tfrequent-directions --input input_file --output output_file -r rows\n\n"\
127  "If you want to change compress_size and save model you can use following command:\n"\
128  "\tfrequent-directions -i input_file -r rows -s compress_size --model_out model\n\n"\
129  "Then if you want to use this model and add additional data:\n"\
130  "\tfrequent-directions -i input_file --model_in model\n\n"\
131  "Options description");
132 
133  desc.add_options()
134  ("help,h", "help message")
135  ("input,i", po::value<std::string>(), "path to the file with input data in csv format with space as delimiter, "\
136  "(default read from standart input)")
137  ("output,o", po::value<std::string>(), "path to the file with result sketch matrix, only nonzero rows are printed, "\
138  "(default write to standart output)")
139  ("sketch_rows,r", po::value<std::size_t>(&p.m_sketch_rows), "number of sketch rows")
140  ("sketch_compress_size,s", po::value<size_t>(&p.m_sketch_compress_size), "sketch compress size, "\
141  "(default is half of number of sketch rows)")
142  ("model_in", po::value<std::string>(), "read the sketch model from this file")
143  ("model_out", po::value<std::string>(), "write the sketch model to this file")
144  ("final_compress", po::value<bool>(&p.m_compress_at_end)->default_value(true),
145  "determine if sketch will be compressed after update all data, "\
146  "compression in the final phase is necessary to fulfill sketch approximation ratios")
147  //TODO
148  // ("nthread,n", po::value<unsigned>(&p.m_nthread)->default_value(std::thread::hardware_concurrency()),
149  // "number of threads (default = number of cores)")
150  ("row_buffer_size", po::value<std::size_t>(&p.m_row_buffer_size)->default_value(100000),
151  "size of row buffer (default value = 100000)")
152  ;
153 
154  po::variables_map vm;
155  po::store(po::parse_command_line(argc, argv, desc), vm);
156  po::notify(vm);
157 
158  auto param_is_set_explicitly = [&vm] (const std::string &param_name) {
159  return vm.count(param_name) > 0 && !vm[param_name].defaulted();
160  };
161 
162  if (vm.count("help")) {
163  utils::info(desc);
164  return EXIT_SUCCESS;
165  }
166 
167  auto error_with_usage = [&] (const std::string &message) {
168  utils::failure(message, "\n", desc);
169  };
170 
171  if (vm.count("model_in") == 0 && vm.count("sketch_rows") == 0) {
172  error_with_usage("Input model sketch or number of sketch rows was not set");
173  }
174 
175  if (vm.count("model_in")) {
176  auto ignored = [&](std::string const & param) {
177  if (param_is_set_explicitly(param)) {
178  utils::warning("parameter ", param, " was set, but model_in is used, param ", param, " is discarded");
179  }
180  };
181  ignored("sketch_rows");
182  ignored("sketch_compress_size");
183  }
184 
185  if (p.m_row_buffer_size <= 0) {
186  error_with_usage("Size of row buffer must be positive");
187  }
188 
189  std::ifstream ifs;
190  if (vm.count("input")) {
191  ifs.open(vm["input"].as<std::string>());
192  }
193 
194  std::ofstream ofs;
195  if (vm.count("output")) {
196  ofs.open(vm["output"].as<std::string>());
197  }
198 
199  m_main(vm, p,
200  vm.count("input") ? ifs : std::cin,
201  vm.count("output") ? ofs : std::cout);
202 
203 
204  return EXIT_SUCCESS;
205 }
Represents sketch of matrix.
void print_matrix(Stream &o, Matrix &&m, const std::string &del)
prints matrix with delimiters
auto info(Arg &&arg, Args...args)
prints info message
This file contains set of simple useful functors or functor adapters.
auto warning(Arg &&arg, Args...args)
prints warning message
auto failure(Arg &&arg, Args...args)
prints failure message
std::pair< Matrix const &, std::size_t > get_sketch()