BitMagic-C++
xsample04.cpp
Go to the documentation of this file.
1 /*
2 Copyright(c) 2018 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3 
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 
16 For more information please visit: http://bitmagic.io
17 */
18 
19 /** \example xsample04.cpp
20 
21 */
22 
23 /*! \file xsample04.cpp
24  \brief Example: DNA substring search
25 
26 */
27 
28 #include <iostream>
29 #include <sstream>
30 #include <chrono>
31 #include <regex>
32 #include <time.h>
33 #include <stdio.h>
34 
35 #include <stdexcept>
36 #include <memory>
37 #include <vector>
38 #include <chrono>
39 #include <map>
40 #include <utility>
41 #include <algorithm>
42 #include <unordered_map>
43 
44 #include "bm.h"
45 #include "bmalgo.h"
46 #include "bmserial.h"
47 #include "bmaggregator.h"
48 
49 #include "bmdbg.h"
50 #include "bmtimer.h"
51 
52 
53 using namespace std;
54 
55 static
56 void show_help()
57 {
58  std::cerr
59  << "BitMagic DNA Search Sample (c) 2018" << std::endl
60  << "-fa file-name -- input FASTA file" << std::endl
61  << "-s hi|lo -- run substring search benchmark" << std::endl
62  << "-diag -- run diagnostics" << std::endl
63  << "-timing -- collect timings" << std::endl
64  ;
65 }
66 
67 
68 
69 
70 // Arguments
71 //
72 std::string ifa_name;
73 bool is_diag = false;
74 bool is_timing = false;
75 bool is_bench = false;
76 bool is_search = false;
77 bool h_word_set = true;
78 
79 static
80 int parse_args(int argc, char *argv[])
81 {
82  for (int i = 1; i < argc; ++i)
83  {
84  std::string arg = argv[i];
85  if ((arg == "-h") || (arg == "--help"))
86  {
87  show_help();
88  return 0;
89  }
90 
91  if (arg == "-fa" || arg == "--fa")
92  {
93  if (i + 1 < argc)
94  {
95  ifa_name = argv[++i];
96  }
97  else
98  {
99  std::cerr << "Error: -fa requires file name" << std::endl;
100  return 1;
101  }
102  continue;
103  }
104 
105  if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
106  is_diag = true;
107  if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
108  is_timing = true;
109  if (arg == "-bench" || arg == "--bench" || arg == "-b" || arg == "--b")
110  is_bench = true;
111  if (arg == "-search" || arg == "--search" || arg == "-s" || arg == "--s")
112  {
113  is_search = true;
114  if (i + 1 < argc)
115  {
116  std::string a = argv[i+1];
117  if (a != "-")
118  {
119  if (a == "l" || a == "lo")
120  {
121  h_word_set = false;
122  ++i;
123  }
124  else
125  if (a == "h" || a == "hi")
126  {
127  h_word_set = true;
128  ++i;
129  }
130  }
131  }
132  }
133 
134  } // for i
135  return 0;
136 }
137 
138 
139 // Global types
140 //
141 typedef std::map<std::string, unsigned> freq_map;
142 typedef std::vector<std::pair<unsigned, std::string> > dict_vect;
143 
145 
146 // ----------------------------------------------------------------------------
147 
149 
150 // FASTA format parser
151 static
152 int load_FASTA(const std::string& fname, std::vector<char>& seq_vect)
153 {
154  bm::chrono_taker tt1("1. Parse FASTA", 1, &timing_map);
155 
156  seq_vect.resize(0);
157  std::ifstream fin(fname.c_str(), std::ios::in);
158  if (!fin.good())
159  return -1;
160 
161  std::string line;
162  for (unsigned i = 0; std::getline(fin, line); ++i)
163  {
164  if (line.empty() ||
165  line.front() == '>')
166  continue;
167 
168  for (std::string::iterator it = line.begin(); it != line.end(); ++it)
169  seq_vect.push_back(*it);
170  } // for
171  return 0;
172 }
173 
174 
175 
176 /**
177  Utility for keeping all DNA finger print vectors and search
178  using various techniques
179 */
181 {
182 public:
183  enum { eA = 0, eC, eG, eT, eN, eEnd };
184 
186 
187  /// Build fingerprint bit-vectors from the original sequence
188  ///
189  void Build(const vector<char>& sequence)
190  {
191  // use bulk insert iterator (faster way to construct a bit-index)
192  //
198 
199  for (size_t i = 0; i < sequence.size(); ++i)
200  {
201  unsigned pos = unsigned(i);
202  switch (sequence[i])
203  {
204  case 'A':
205  iA = pos;
206  break;
207  case 'C':
208  iC = pos;
209  break;
210  case 'G':
211  iG = pos;
212  break;
213  case 'T':
214  iT = pos;
215  break;
216  case 'N':
217  iN = pos;
218  break;
219  default:
220  break;
221  }
222  }
223  }
224 
225  /// Return fingerprint bit-vector
226  const bm::bvector<>& GetVector(char letter) const
227  {
228  switch (letter)
229  {
230  case 'A':
231  return m_FPrintBV[eA];
232  case 'C':
233  return m_FPrintBV[eC];
234  case 'G':
235  return m_FPrintBV[eG];
236  case 'T':
237  return m_FPrintBV[eT];
238  case 'N':
239  return m_FPrintBV[eN];
240  default:
241  break;
242  }
243  throw runtime_error("Error. Invalid letter!");
244  }
245 
246  /// Find word strings
247  /// using shift + and on fingerprint vectors
248  /// (horizontal, non-fused basic method)
249  ///
250  void Find(const string& word, vector<unsigned>& res)
251  {
252  if (word.empty())
253  return;
254  bm::bvector<> bv(GetVector(word[0])); // step 1: copy first vector
255 
256  // run series of shifts + logical ANDs
257  for (size_t i = 1; i < word.size(); ++i)
258  {
259  bv.shift_right(); // SHIFT the accumulator bit-vector
260  // get and AND the next fingerprint
261  const bm::bvector<>& bv_mask = GetVector(word[i]);
262  bv &= bv_mask;
263 
264  auto any = bv.any();
265  if (!any)
266  break;
267  }
268 
269  // translate results from bvector of word ends to result
270  unsigned ws = unsigned(word.size()) - 1;
271  TranslateResults(bv, ws, res);
272  };
273 
274 
275  /// This method uses cache blocked aggregator with fused SHIFT+AND kernel
276  ///
277  void FindAggFused(const string& word, vector<unsigned>& res)
278  {
279  if (word.empty())
280  return;
281  // first we setup aggregator, add a group of vectors to be processed
282  m_Agg.reset();
283  for (size_t i = 0; i < word.size(); ++i)
284  {
285  const bm::bvector<>& bv_mask = GetVector(word[i]);
286  m_Agg.add(&bv_mask);
287  }
288 
289  // now run the whole algorithm to get benefits of cache blocking
290  //
291  bm::bvector<> bv;
292  m_Agg.combine_shift_right_and(bv);
293 
294  // translate results from bvector of word ends to result
295  unsigned ws = unsigned(word.size()) - 1;
296  TranslateResults(bv, ws, res);
297  };
298 
299  /// Find a set of words in one pass using pipeline
300  /// of aggregators (this is very experimental)
301  ///
302  void FindCollection(const vector<tuple<string,int> >& words,
303  vector<vector<unsigned>>& hits)
304  {
305  vector<unique_ptr<aggregator_type> > agg_pipeline;
306  unsigned ws = 0;
307 
308  for (const auto& w : words)
309  {
310  unique_ptr<aggregator_type> agg_ptr(new aggregator_type());
311  agg_ptr->set_operation(aggregator_type::BM_SHIFT_R_AND);
312 
313  const string& word = get<0>(w);
314  for (size_t i = 0; i < word.size(); ++i)
315  {
316  const bm::bvector<>& bv_mask = GetVector(word[i]);
317  agg_ptr->add(&bv_mask);
318  }
319 
320  agg_pipeline.emplace_back(agg_ptr.release());
321  ws = unsigned(word.size()) - 1;
322  }
323 
324  // run the pipeline
326  vector<unique_ptr<aggregator_type> >::iterator>(agg_pipeline.begin(), agg_pipeline.end());
327 
328  // convert the results
329  for (size_t i = 0; i < agg_pipeline.size(); ++i)
330  {
331  const aggregator_type* agg_ptr = agg_pipeline[i].get();
332  auto bv = agg_ptr->get_target();
333  vector<unsigned> res;
334  res.reserve(12000);
335  TranslateResults(*bv, ws, res);
336  hits.emplace_back(res);
337  }
338  }
339 
340 protected:
341 
342  /// Translate search results vector using (word size) left shift
343  ///
345  unsigned left_shift,
346  vector<unsigned>& res)
347  {
349  for (; en.valid(); ++en)
350  {
351  auto pos = *en;
352  res.push_back(pos - left_shift);
353  }
354  }
355 
356 private:
357  bm::bvector<> m_FPrintBV[eEnd];
358  aggregator_type m_Agg;
359 };
360 
361 static const size_t WORD_SIZE = 28;
362 using THitList = vector<unsigned>;
363 
364 /// generate the most frequent words of specified length from the input sequence
365 ///
366 static
367 void generate_kmers(vector<tuple<string,int>>& top_words,
368  vector<tuple<string,int>>& lo_words,
369  const vector<char>& data,
370  size_t N,
371  unsigned word_size)
372 {
373  cout << "k-mer generation... " << endl;
374 
375  top_words.clear();
376  lo_words.clear();
377 
378  if (data.size() < word_size)
379  return;
380 
381  size_t end_pos = data.size() - word_size;
382  size_t i = 0;
383  map<string, int> words;
384  while (i < end_pos)
385  {
386  string s(&data[i], word_size);
387  if (s.find('N') == string::npos)
388  words[s] += 1;
389  i += word_size;
390  if (i % 10000 == 0)
391  {
392  cout << "\r" << i << "/" << end_pos << flush;
393  }
394  }
395 
396  cout << endl << "Picking k-mer samples..." << flush;
397 
398  multimap<int,string, greater<int>> dst;
399  for_each(words.begin(), words.end(), [&](const std::pair<string,int>& p)
400  {
401  dst.emplace(p.second, p.first);
402  });
403  {
404  auto it = dst.begin();
405  for(size_t count = 0; count < N && it !=dst.end(); ++it,++count)
406  top_words.emplace_back(it->second, it->first);
407  }
408 
409  {
410  auto it = dst.rbegin();
411  for(size_t count = 0; count < N && it !=dst.rend(); ++it, ++count)
412  lo_words.emplace_back(it->second, it->first);
413  }
414 
415  cout << "OK" << endl;
416 }
417 
418 /// 2-way string matching
419 ///
420 static
421 void find_word_2way(vector<char>& data,
422  const char* word, unsigned word_size,
423  THitList& r)
424 {
425  if (data.size() < word_size)
426  return;
427 
428  size_t i = 0;
429  size_t end_pos = data.size() - word_size;
430  while (i < end_pos)
431  {
432  bool found = true;
433  for (size_t j = i, k = 0, l = word_size - 1; l > k; ++j, ++k, --l)
434  {
435  if (data[j] != word[k] || data[i + l] != word[l])
436  {
437  found = false;
438  break;
439  }
440  }
441  if (found)
442  r.push_back(unsigned(i));
443  ++i;
444  }
445 }
446 
447 /// Find all words in one pass (cache coherent algorithm)
448 /// (variation of 2-way string matching for collection search)
449 ///
450 static
451 void find_words(const vector<char>& data,
452  vector<const char*> words,
453  unsigned word_size,
454  vector<vector<unsigned>>& hits)
455 {
456  if (data.size() < word_size)
457  return;
458 
459  size_t i = 0;
460  size_t end_pos = data.size() - word_size;
461  size_t words_size = words.size();
462  while (i < end_pos)
463  {
464  for (size_t idx = 0; idx < words_size; ++idx)
465  {
466  auto& word = words[idx];
467  bool found = true;
468  for (size_t j = i, k = 0, l = word_size - 1; l > k; ++j, ++k, --l)
469  {
470  if (data[j] != word[k] || data[i + l] != word[l])
471  {
472  found = false;
473  break;
474  }
475  } // for
476  if (found)
477  {
478  hits[idx].push_back(unsigned(i));
479  break;
480  }
481  } // for
482  ++i;
483  } // while
484 }
485 
486 
487 /// Check search result match
488 ///
489 static
490 bool hitlist_compare(const THitList& h1, const THitList& h2)
491 {
492  if (h1.size() != h2.size())
493  {
494  cerr << "size1 = " << h1.size() << " size2 = " << h2.size() << endl;
495  return false;
496  }
497  for (size_t i = 0; i < h1.size(); ++i)
498  {
499  if (h1[i] != h2[i])
500  return false;
501  }
502  return true;
503 }
504 
505 
506 
507 
508 int main(int argc, char *argv[])
509 {
510  if (argc < 3)
511  {
512  show_help();
513  return 1;
514  }
515 
516  std::vector<char> seq_vect;
517 
518  try
519  {
520  auto ret = parse_args(argc, argv);
521  if (ret != 0)
522  return ret;
523 
525 
526  if (!ifa_name.empty())
527  {
528  auto res = load_FASTA(ifa_name, seq_vect);
529  if (res != 0)
530  return res;
531  std::cout << "FASTA sequence size=" << seq_vect.size() << std::endl;
532 
533  {
534  bm::chrono_taker tt1("2. Build DNA index", 1, &timing_map);
535  idx.Build(seq_vect);
536  }
537  }
538 
539 
540  if (is_search)
541  {
542  vector<tuple<string,int> > h_words;
543  vector<tuple<string,int> > l_words;
544 
545  vector<tuple<string,int>>& words = h_word_set ? h_words : l_words;
546 
547  // generate search sets for benchmarking
548  //
549  generate_kmers(h_words, l_words, seq_vect, 25, WORD_SIZE);
550 
551 
552  vector<THitList> word_hits;
553  vector<THitList> word_hits_agg;
554 
555  // search all words in one pass and
556  // store results in list of hits according to the order of words
557  // (this method uses memory proximity
558  // of searched words to maximize CPU cache hit rate)
559 
560  {
561  vector<const char*> word_list;
562  for (const auto& w : words)
563  {
564  word_list.push_back(get<0>(w).c_str());
565  }
566  word_hits.resize(words.size());
567  for_each(word_hits.begin(), word_hits.end(), [](THitList& ht) {
568  ht.reserve(12000);
569  });
570 
571  bm::chrono_taker tt1("6. String search 2-way single pass",
572  unsigned(words.size()), &timing_map);
573  find_words(seq_vect, word_list, unsigned(WORD_SIZE), word_hits);
574  }
575 
576  // collection search, runs all hits at once
577  //
578  {
579  bm::chrono_taker tt1("7. Aggregated search single pass",
580  unsigned(words.size()), &timing_map);
581 
582  idx.FindCollection(words, word_hits_agg);
583  }
584 
585  // a few variants of word-by-word searches
586  //
587  for (size_t word_idx = 0; word_idx < words.size(); ++ word_idx)
588  {
589  auto& word = get<0>(words[word_idx]);
590  THitList hits1;
591 
592  {
593  bm::chrono_taker tt1("3. String search 2-way", 1, &timing_map);
594  find_word_2way(seq_vect,
595  word.c_str(), unsigned(word.size()),
596  hits1);
597  }
598  THitList hits2;
599  {
600  bm::chrono_taker tt1("4. Search with bvector SHIFT+AND", 1, &timing_map);
601  idx.Find(word, hits2);
602  }
603  THitList hits4;
604  {
605  bm::chrono_taker tt1("5. Search with aggregator fused SHIFT+AND", 1, &timing_map);
606  idx.FindAggFused(word, hits4);
607  }
608 
609  // check correctness
610  if (!hitlist_compare(hits1, hits2)
611  || !hitlist_compare(hits2, hits4))
612  {
613  cout << "Mismatch ERROR for: " << word << endl;
614  }
615  else
616  if (!hitlist_compare(word_hits[word_idx], hits1)
617  || !hitlist_compare(word_hits_agg[word_idx], hits1))
618  {
619  cout << "Sigle pass mismatch ERROR for: " << word << endl;
620  }
621  else
622  {
623  cout << word_idx << ": " << word << ": " << hits1.size() << " hits " << endl;
624  }
625  }
626 
627  }
628 
629  if (is_timing) // print all collected timings
630  {
631  std::cout << std::endl << "Performance:" << std::endl;
633  }
634  }
635  catch (std::exception& ex)
636  {
637  std::cerr << "Error:" << ex.what() << std::endl;
638  return 1;
639  }
640 
641  return 0;
642 }
Output iterator iterator designed to set "ON" bits based on input sequence of integers.
Definition: bm.h:476
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
void aggregator_pipeline_execute(It first, It last)
Experimental method ro run multiple aggregators in sync.
Definition: bmaggregator.h:464
void Find(const string &word, vector< unsigned > &res)
Find word strings using shift + and on fingerprint vectors (horizontal, non-fused basic method) ...
Definition: xsample04.cpp:250
void Build(const vector< char > &sequence)
Build fingerprint bit-vectors from the original sequence.
Definition: xsample04.cpp:189
bm::chrono_taker::duration_map_type timing_map
Definition: xsample04.cpp:148
static bool hitlist_compare(const THitList &h1, const THitList &h2)
Check search result match.
Definition: xsample04.cpp:490
Timing utilities for benchmarking (internal)
static void find_words(const vector< char > &data, vector< const char *> words, unsigned word_size, vector< vector< unsigned >> &hits)
Find all words in one pass (cache coherent algorithm) (variation of 2-way string matching for collect...
Definition: xsample04.cpp:451
static void find_word_2way(vector< char > &data, const char *word, unsigned word_size, THitList &r)
2-way string matching
Definition: xsample04.cpp:421
Utility for keeping all DNA finger print vectors and search using various techniques.
Definition: xsample04.cpp:180
Algorithms for bvector<> (main include)
void FindCollection(const vector< tuple< string, int > > &words, vector< vector< unsigned >> &hits)
Find a set of words in one pass using pipeline of aggregators (this is very experimental) ...
Definition: xsample04.cpp:302
static int load_FASTA(const std::string &fname, std::vector< char > &seq_vect)
Definition: xsample04.cpp:152
Algorithms for fast aggregation of N bvectors.
void TranslateResults(const bm::bvector<> &bv, unsigned left_shift, vector< unsigned > &res)
Translate search results vector using (word size) left shift.
Definition: xsample04.cpp:344
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
Definition: bm.h:2098
input set is sorted (ascending order)
Definition: bmconst.h:168
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:65
const bvector_type * get_target() const
Definition: bmaggregator.h:318
static void print_duration_map(const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:127
bool valid() const
Checks if iterator is still valid.
Definition: bm.h:300
bool is_bench
Definition: xsample04.cpp:75
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:39
bool is_timing
Definition: xsample04.cpp:74
bool any() const
Returns true if any bits in this bitset are set, and otherwise returns false.
Definition: bm.h:2486
Algorithms for fast aggregation of a group of bit-vectors.
Definition: bmaggregator.h:51
bool is_search
Definition: xsample04.cpp:76
static void generate_kmers(vector< tuple< string, int >> &top_words, vector< tuple< string, int >> &lo_words, const vector< char > &data, size_t N, unsigned word_size)
generate the most frequent words of specified length from the input sequence
Definition: xsample04.cpp:367
std::string ifa_name
Definition: xsample04.cpp:72
Serialization / compression of bvector<>. Set operations on compressed BLOBs.
static int parse_args(int argc, char *argv[])
Definition: xsample04.cpp:80
std::map< std::string, unsigned > freq_map
Definition: xsample04.cpp:141
Constant iterator designed to enumerate "ON" bits.
Definition: bm.h:601
void FindAggFused(const string &word, vector< unsigned > &res)
This method uses cache blocked aggregator with fused SHIFT+AND kernel.
Definition: xsample04.cpp:277
int main(int argc, char *argv[])
Definition: xsample04.cpp:508
static void show_help()
Definition: xsample04.cpp:56
bm::aggregator< bm::bvector<> > aggregator_type
Definition: xsample04.cpp:144
const bm::bvector & GetVector(char letter) const
Return fingerprint bit-vector.
Definition: xsample04.cpp:226
bool shift_right()
Shift right by 1 bit, fill with zero return carry over.
Definition: bm.h:4136
vector< unsigned > THitList
Definition: xsample04.cpp:362
bool is_diag
Definition: xsample04.cpp:73
std::vector< std::pair< unsigned, std::string > > dict_vect
Definition: xsample04.cpp:142
static const size_t WORD_SIZE
Definition: xsample04.cpp:361
bool h_word_set
Definition: xsample04.cpp:77