BitMagic-C++
xsample03.cpp
Go to the documentation of this file.
1 /*
2 Copyright(c) 2018 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3 
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 
16 For more information please visit: http://bitmagic.io
17 */
18 
19 /** \example xsample03.cpp
20  Seach for human mutation (SNP) in within chr1.
21  Benchmark comaprison of different methods
22 
23  \sa bm::sparse_vector
24  \sa bm::rsc_sparse_vector
25  \sa bm::sparse_vector_scanner
26 */
27 
28 /*! \file xsample03.cpp
29  \brief Example: SNP search in human genome
30 
31  Brief description of used method:
32  1. Parse SNP chromosome report and extract information about SNP number and
33  location in the chromosome
34  2. Use this information to build bit-transposed sparse_vector<>
35  where vector position matches chromosome position and SNP ids (aka rsid)
36  is kept as a bit-transposed matrix
37  3. Build rank-select compressed sparse vector, dropping all NULL columns
38  (this data format is pretty sparse, since number of SNPs is significantly
39  less than number of chromosome bases (1:5 or less)
40  Use memory report to understand memory footprint for each form of storage
41  4. Run benchmarks searching for 500 randomly selected SNPs using
42  - bm::sparse_vector<>
43  - bm::rsc_sparse_vector<>
44  - std::vector<pair<unsigned, unsigned> >
45 
46  This example should be useful for construction of compressed columnar
47  tables with parallel search capabilities.
48 
49 */
50 
51 #include <iostream>
52 #include <sstream>
53 #include <chrono>
54 #include <regex>
55 #include <time.h>
56 #include <stdio.h>
57 
58 
59 #include <vector>
60 #include <chrono>
61 #include <map>
62 #include <utility>
63 
64 #include "bm.h"
65 #include "bmalgo.h"
66 #include "bmserial.h"
67 #include "bmrandom.h"
68 #include "bmsparsevec.h"
69 #include "bmsparsevec_compr.h"
70 #include "bmsparsevec_algo.h"
71 #include "bmsparsevec_serial.h"
72 #include "bmalgo_similarity.h"
73 #include "bmsparsevec_util.h"
74 
75 
76 #include "bmdbg.h"
77 #include "bmtimer.h"
78 
79 static
80 void show_help()
81 {
82  std::cerr
83  << "BitMagic SNP Search Sample Utility (c) 2018" << std::endl
84  << "-isnp file-name -- input set file (SNP FASTA) to parse" << std::endl
85  << "-svout spase vector output -- sparse vector name to save" << std::endl
86  << "-rscout rs-compressed spase vector output -- name to save" << std::endl
87  << "-svin sparse vector input -- sparse vector file name to load " << std::endl
88  << "-rscin rs-compressed sparse vector input -- file name to load " << std::endl
89  << "-diag -- run diagnostics" << std::endl
90  << "-timing -- collect timings" << std::endl
91  ;
92 }
93 
94 
95 
96 
97 // Arguments
98 //
99 std::string sv_out_name;
100 std::string rsc_out_name;
101 std::string sv_in_name;
102 std::string rsc_in_name;
103 std::string isnp_name;
104 bool is_diag = false;
105 bool is_timing = false;
106 bool is_bench = false;
107 
108 static
109 int parse_args(int argc, char *argv[])
110 {
111  for (int i = 1; i < argc; ++i)
112  {
113  std::string arg = argv[i];
114  if ((arg == "-h") || (arg == "--help"))
115  {
116  show_help();
117  return 0;
118  }
119 
120  if (arg == "-svout" || arg == "--svout")
121  {
122  if (i + 1 < argc)
123  {
124  sv_out_name = argv[++i];
125  }
126  else
127  {
128  std::cerr << "Error: -svout requires file name" << std::endl;
129  return 1;
130  }
131  continue;
132  }
133  if (arg == "-rscout" || arg == "--rscout")
134  {
135  if (i + 1 < argc)
136  {
137  rsc_out_name = argv[++i];
138  }
139  else
140  {
141  std::cerr << "Error: -rscout requires file name" << std::endl;
142  return 1;
143  }
144  continue;
145  }
146 
147  if (arg == "-svin" || arg == "--svin")
148  {
149  if (i + 1 < argc)
150  {
151  sv_in_name = argv[++i];
152  }
153  else
154  {
155  std::cerr << "Error: -svin requires file name" << std::endl;
156  return 1;
157  }
158  continue;
159  }
160 
161  if (arg == "-rscin" || arg == "--rscin")
162  {
163  if (i + 1 < argc)
164  {
165  rsc_in_name = argv[++i];
166  }
167  else
168  {
169  std::cerr << "Error: -rscin requires file name" << std::endl;
170  return 1;
171  }
172  continue;
173  }
174 
175  if (arg == "-isnp" || arg == "--isnp" || arg == "-snp" || arg == "--snp")
176  {
177  if (i + 1 < argc)
178  {
179  isnp_name = argv[++i];
180  }
181  else
182  {
183  std::cerr << "Error: -isnp requires file name" << std::endl;
184  return 1;
185  }
186  continue;
187  }
188 
189  if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
190  is_diag = true;
191  if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
192  is_timing = true;
193  if (arg == "-bench" || arg == "--bench" || arg == "-b" || arg == "--b")
194  is_bench = true;
195 
196  } // for i
197  return 0;
198 }
199 
200 
201 // Global types
202 //
205 typedef std::vector<std::pair<unsigned, unsigned> > vector_pairs;
206 
207 // ----------------------------------------------------------------------------
208 
210 
211 // SNP report format parser (extraction and transformation)
212 // Parser extracts SNP id (rsid) and positio on genome to build
213 // sparse vector where index (position in vector) metches position on the
214 // chromosome 1.
215 //
216 static
217 int load_snp_report(const std::string& fname, sparse_vector_u32& sv)
218 {
219  bm::chrono_taker tt1("1. parse input SNP chr report", 1, &timing_map);
220 
221  std::ifstream fin(fname.c_str(), std::ios::in);
222  if (!fin.good())
223  return -1;
224 
225  unsigned rs_id, rs_pos;
226  size_t idx;
227 
228  std::string line;
229  std::string delim = " \t";
230 
231  std::regex reg("\\s+");
232  std::sregex_token_iterator it_end;
233 
234  bm::bvector<> bv_rs;
235  bv_rs.init();
236 
237  unsigned rs_cnt = 0;
238  for (unsigned i = 0; std::getline(fin, line); ++i)
239  {
240  if (line.empty() ||
241  !isdigit(line.front())
242  )
243  continue;
244 
245  // regex based tokenizer
246  std::sregex_token_iterator it(line.begin(), line.end(), reg, -1);
247  std::vector<std::string> line_vec(it, it_end);
248  if (line_vec.empty())
249  continue;
250 
251  // parse columns of interest
252  try
253  {
254  rs_id = unsigned(std::stoul(line_vec.at(0), &idx));
255 
256  if (bv_rs.test(rs_id))
257  {
258  continue;
259  }
260  rs_pos = unsigned(std::stoul(line_vec.at(11), &idx));
261 
262  bv_rs.set_bit_no_check(rs_id);
263  sv.set(rs_pos, rs_id);
264 
265  ++rs_cnt;
266  }
267  catch (std::exception& /*ex*/)
268  {
269  continue; // detailed disgnostics commented out
270  // error detected, because some columns are sometimes missing
271  // just ignore it
272  //
273  /*
274  std::cerr << ex.what() << "; ";
275  std::cerr << "rs=" << line_vec.at(0) << " pos=" << line_vec.at(11) << std::endl;
276  continue;
277  */
278  }
279  if (rs_cnt % (4 * 1024) == 0)
280  std::cout << "\r" << rs_cnt << " / " << i; // PROGRESS report
281  } // for i
282 
283  std::cout << std::endl;
284  std::cout << "SNP count=" << rs_cnt << std::endl;
285  return 0;
286 }
287 
288 // Generate random subset of random values from a sparse vector
289 //
290 static
291 void generate_random_subset(const sparse_vector_u32& sv, std::vector<unsigned>& vect, unsigned count)
292 {
294 
295  bm::random_subset<bm::bvector<> > rand_sampler;
296  bm::bvector<> bv_sample;
297  rand_sampler.sample(bv_sample, *bv_null, count);
298 
299  bm::bvector<>::enumerator en = bv_sample.first();
300  for (; en.valid(); ++en)
301  {
302  unsigned idx = *en;
303  unsigned v = sv[idx];
304  vect.push_back(v);
305  }
306 }
307 
308 // build std::vector of pairs (position to rs)
309 //
310 static
312 {
315 
316  for (; it != it_end; ++it)
317  {
318  if (!it.is_null())
319  {
320  std::pair<unsigned, unsigned> pos2rs = std::make_pair(it.pos(), it.value());
321  vp.push_back(pos2rs);
322  }
323  }
324 }
325 
326 // O(N) -- O(N/2) linear search in vector of pairs (position - rsid)
327 //
328 static
329 bool search_vector_pairs(const vector_pairs& vp, unsigned rs_id, unsigned& pos)
330 {
331  for (unsigned i = 0; i < vp.size(); ++i)
332  {
333  if (vp[i].second == rs_id)
334  {
335  pos = vp[i].first;
336  return true;
337  }
338  }
339  return false;
340 }
341 
342 // SNP search benchmark
343 // Search for SNPs using different data structures (Bitmagic and STL)
344 //
345 // An extra step is verification, to make sure all methods are consistent
346 //
347 static
349 {
350  const unsigned rs_sample_count = 2000;
351 
352  std::vector<unsigned> rs_vect;
353  generate_random_subset(sv, rs_vect, rs_sample_count);
354  if (rs_vect.empty())
355  {
356  std::cerr << "Benchmark subset empty!" << std::endl;
357  return;
358  }
359 
360  // build traditional sparse vector
361  vector_pairs vp;
362  build_vector_pairs(sv, vp);
363 
364  // search result bit-vectors
365  //
366  bm::bvector<> bv_found1;
367  bm::bvector<> bv_found2;
368  bm::bvector<> bv_found3;
369 
370  bv_found1.init(); bv_found2.init(); bv_found3.init();// pre-initialize vectors
371 
372  if (!sv.empty())
373  {
374  bm::chrono_taker tt1("09. rs search (sv)", unsigned(rs_vect.size()), &timing_map);
375 
376  // scanner class
377  // it's important to keep scanner class outside the loop to avoid
378  // unnecessary re-allocs and construction costs
379  //
380 
382 
383  for (unsigned i = 0; i < rs_vect.size(); ++i)
384  {
385  unsigned rs_id = rs_vect[i];
386  unsigned rs_pos;
387  bool found = scanner.find_eq(sv, rs_id, rs_pos);
388 
389  if (found)
390  {
391  bv_found1.set_bit_no_check(rs_pos);
392  }
393  else
394  {
395  std::cout << "Error: rs_id = " << rs_id << " not found!" << std::endl;
396  }
397  } // for
398  }
399 
400  if (!csv.empty())
401  {
402  bm::chrono_taker tt1("10. rs search (rsc_sv)", unsigned(rs_vect.size()), &timing_map);
403 
404  bm::sparse_vector_scanner<rsc_sparse_vector_u32> scanner; // scanner class
405 
406  for (unsigned i = 0; i < rs_vect.size(); ++i)
407  {
408  unsigned rs_id = rs_vect[i];
409  unsigned rs_pos;
410  bool found = scanner.find_eq(csv, rs_id, rs_pos);
411 
412  if (found)
413  {
414  bv_found2.set_bit_no_check(rs_pos);
415  }
416  else
417  {
418  std::cout << "rs_id = " << rs_id << " not found!" << std::endl;
419  }
420  } // for
421  }
422 
423  if (vp.size())
424  {
425  bm::chrono_taker tt1("11. rs search (std::vector<>)", unsigned(rs_vect.size()), &timing_map);
426 
427  for (unsigned i = 0; i < rs_vect.size(); ++i)
428  {
429  unsigned rs_id = rs_vect[i];
430  unsigned rs_pos;
431  bool found = search_vector_pairs(vp, rs_id, rs_pos);
432 
433  if (found)
434  {
435  bv_found3.set_bit_no_check(rs_pos);
436  }
437  else
438  {
439  std::cout << "rs_id = " << rs_id << " not found!" << std::endl;
440  }
441  } // for
442  }
443 
444  // compare results from various methods (check integrity)
445  int res = bv_found1.compare(bv_found2);
446  if (res != 0)
447  {
448  std::cerr << "Error: search discrepancy (sparse search) detected!" << std::endl;
449  }
450  res = bv_found1.compare(bv_found3);
451  if (res != 0)
452  {
453  std::cerr << "Error: search discrepancy (std::vector<>) detected!" << std::endl;
454  }
455 
456 }
457 
458 
459 int main(int argc, char *argv[])
460 {
461  if (argc < 3)
462  {
463  show_help();
464  return 1;
465  }
466 
469 
470  try
471  {
472  auto ret = parse_args(argc, argv);
473  if (ret != 0)
474  return ret;
475 
476  if (!isnp_name.empty())
477  {
478  auto res = load_snp_report(isnp_name, sv);
479  if (res != 0)
480  {
481  return res;
482  }
483  }
484  if (!sv_in_name.empty())
485  {
486  bm::chrono_taker tt1("02. Load sparse vector", 1, &timing_map);
487  file_load_svector(sv, sv_in_name);
488  }
489 
490  // load rs-compressed sparse vector
491  if (!rsc_in_name.empty())
492  {
493  bm::chrono_taker tt1("03. Load rsc sparse vector", 1, &timing_map);
494  file_load_svector(csv, rsc_in_name);
495  }
496 
497  // if rs-compressed vector is not available - build it on the fly
498  if (csv.empty() && !sv.empty())
499  {
501  {
502  bm::chrono_taker tt1("04. rs compress sparse vector", 1, &timing_map);
503  csv.load_from(sv);
504  }
505  {
506  bm::chrono_taker tt1("05. rs de-compress sparse vector", 1, &timing_map);
507  csv.load_to(sv2);
508  }
509 
510  if (!sv.equal(sv2)) // diagnostics check (just in case)
511  {
512  std::cerr << "Error: rs-compressed vector check failed!" << std::endl;
513  return 1;
514  }
515  }
516 
517  // save SV vector
518  if (!sv_out_name.empty() && !sv.empty())
519  {
520  bm::chrono_taker tt1("07. Save sparse vector", 1, &timing_map);
521  sv.optimize();
522  file_save_svector(sv, sv_out_name);
523  }
524 
525  // save RS sparse vector
526  if (!rsc_out_name.empty() && !csv.empty())
527  {
528  bm::chrono_taker tt1("08. Save RS sparse vector", 1, &timing_map);
529  csv.optimize();
530  file_save_svector(csv, rsc_out_name);
531  }
532 
533  if (is_diag) // print memory diagnostics
534  {
535  if (!sv.empty())
536  {
537  std::cout << std::endl
538  << "sparse vector statistics:"
539  << std::endl;
540  bm::print_svector_stat(sv, true);
541  }
542  if (!csv.empty())
543  {
544  std::cout << std::endl
545  << "RS compressed sparse vector statistics:"
546  << std::endl;
547  bm::print_svector_stat(csv, true);
548  }
549  }
550 
551  if (is_bench) // run set of benchmarks
552  {
553  run_benchmark(sv, csv);
554  }
555 
556  if (is_timing) // print all collected timings
557  {
558  std::cout << std::endl << "Performance:" << std::endl;
560  }
561  }
562  catch (std::exception& ex)
563  {
564  std::cerr << "Error:" << ex.what() << std::endl;
565  return 1;
566  }
567 
568  return 0;
569 }
570 
build_vector_pairs
static void build_vector_pairs(const sparse_vector_u32 &sv, vector_pairs &vp)
Definition: xsample03.cpp:311
bm::rsc_sparse_vector::empty
bool empty() const
return true if vector is empty
Definition: bmsparsevec_compr.h:374
bmsparsevec_algo.h
Algorithms for bm::sparse_vector.
bm::sparse_vector::optimize
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector plains
Definition: bmsparsevec.h:1750
bmrandom.h
Generation of random subset.
bm::sparse_vector< unsigned, bm::bvector<> >::const_iterator
friend const_iterator
Definition: bmsparsevec.h:348
bm::rsc_sparse_vector::load_to
void load_to(sparse_vector_type &sv) const
Exort compressed vector to a sparse vector (with NULLs)
Definition: bmsparsevec_compr.h:1088
vector_pairs
std::vector< std::pair< unsigned, unsigned > > vector_pairs
Definition: xsample03.cpp:205
bm::sparse_vector
sparse vector with runtime compression using bit transposition method
Definition: bmsparsevec.h:81
bm::chrono_taker
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:39
rsc_in_name
std::string rsc_in_name
Definition: xsample03.cpp:102
sv_out_name
std::string sv_out_name
Definition: xsample03.cpp:99
bm::bvector::set_bit_no_check
void set_bit_no_check(size_type n)
Set bit without checking preconditions (size, etc)
Definition: bm.h:3494
bmsparsevec.h
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
bm::bvector::enumerator
Constant iterator designed to enumerate "ON" bits.
Definition: bm.h:599
bm::sparse_vector_scanner
algorithms for sparse_vector scan/search
Definition: bmsparsevec_algo.h:481
bm::bvector::compare
int compare(const bvector< Alloc > &bvect) const BMNOEXCEPT
Lexicographical comparison with a bitvector.
Definition: bm.h:3185
is_timing
bool is_timing
Definition: xsample03.cpp:105
load_snp_report
static int load_snp_report(const std::string &fname, sparse_vector_u32 &sv)
Definition: xsample03.cpp:217
is_diag
bool is_diag
Definition: xsample03.cpp:104
bmsparsevec_serial.h
Serialization for sparse_vector<>
bmalgo.h
Algorithms for bvector<> (main include)
isnp_name
std::string isnp_name
Definition: xsample03.cpp:103
bm::sparse_vector::equal
bool equal(const sparse_vector< Val, BV > &sv, bm::null_support null_able=bm::use_null) const BMNOEXCEPT
check if another sparse vector has the same content and size
Definition: bmsparsevec.h:1912
rsc_out_name
std::string rsc_out_name
Definition: xsample03.cpp:100
bmsparsevec_compr.h
Compressed sparse container rsc_sparse_vector<> for integer types.
bm::bvector<>
show_help
static void show_help()
Definition: xsample03.cpp:80
bm::use_null
@ use_null
support "non-assigned" or "NULL" logic
Definition: bmconst.h:215
bm::random_subset::sample
void sample(BV &bv_out, const BV &bv_in, size_type sample_count)
Get random subset of input vector.
Definition: bmrandom.h:140
bm::sparse_vector::end
const_iterator end() const BMNOEXCEPT
Provide const iterator access to the end
Definition: bmsparsevec.h:493
bm::bvector::iterator_base::valid
bool valid() const BMNOEXCEPT
Checks if iterator is still valid. Analog of != 0 comparison for pointers.
Definition: bm.h:280
timing_map
bm::chrono_taker::duration_map_type timing_map
Definition: xsample03.cpp:209
bmsparsevec_util.h
sv_in_name
std::string sv_in_name
Definition: xsample03.cpp:101
bm::bvector::init
void init()
Explicit post-construction initialization.
Definition: bm.h:2149
parse_args
static int parse_args(int argc, char *argv[])
Definition: xsample03.cpp:109
bmtimer.h
Timing utilities for benchmarking (internal)
bm::sparse_vector::begin
const_iterator begin() const BMNOEXCEPT
Provide const iterator access to container content
Definition: bmsparsevec.h:1922
is_bench
bool is_bench
Definition: xsample03.cpp:106
bm::chrono_taker::ct_ops_per_sec
@ ct_ops_per_sec
Definition: bmtimer.h:59
search_vector_pairs
static bool search_vector_pairs(const vector_pairs &vp, unsigned rs_id, unsigned &pos)
Definition: xsample03.cpp:329
bm::random_subset
Definition: bmrandom.h:56
bm::rsc_sparse_vector::load_from
void load_from(const sparse_vector_type &sv_src)
Load compressed vector from a sparse vector (with NULLs)
Definition: bmsparsevec_compr.h:1047
sparse_vector_u32
bm::sparse_vector< unsigned, bm::bvector<> > sparse_vector_u32
Definition: xsample03.cpp:203
bm::sparse_vector::push_back
void push_back(value_type v)
push value back into vector
Definition: bmsparsevec.h:1504
rsc_sparse_vector_u32
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
Definition: xsample03.cpp:204
bm::bvector::test
bool test(size_type n) const BMNOEXCEPT
returns true if bit n is set and false is bit n is 0.
Definition: bm.h:1454
bm::chrono_taker::duration_map_type
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:65
bm::rsc_sparse_vector
Rank-Select compressed sparse vector.
Definition: bmsparsevec_compr.h:58
bmserial.h
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
bm::rsc_sparse_vector::optimize
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, statistics *stat=0)
run memory optimization for all vector plains
Definition: bmsparsevec_compr.h:1239
bm::chrono_taker::print_duration_map
static void print_duration_map(const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:127
bmalgo_similarity.h
bm::sparse_vector::empty
bool empty() const BMNOEXCEPT
return true if vector is empty
Definition: bmsparsevec.h:649
bm::bvector::first
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
Definition: bm.h:1796
main
int main(int argc, char *argv[])
Definition: xsample03.cpp:459
bm.h
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
generate_random_subset
static void generate_random_subset(const sparse_vector_u32 &sv, std::vector< unsigned > &vect, unsigned count)
Definition: xsample03.cpp:291
bm::sparse_vector::set
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
Definition: bmsparsevec.h:1475
run_benchmark
static void run_benchmark(const sparse_vector_u32 &sv, const rsc_sparse_vector_u32 &csv)
Definition: xsample03.cpp:348
bm::base_sparse_vector::get_null_bvector
const bvector_type * get_null_bvector() const BMNOEXCEPT
Get bit-vector of assigned values or NULL (if not constructed that way)
Definition: bmbmatrix.h:329
bm::sparse_vector_scanner::find_eq
void find_eq(const SV &sv, typename SV::value_type value, typename SV::bvector_type &bv_out)
find all sparse vector elements EQ to search value
Definition: bmsparsevec_algo.h:2033