BitMagic-C++
xsample04.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2018 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example xsample04.cpp
20
21*/
22
23/*! \file xsample04.cpp
24 \brief Example: DNA substring search
25
26*/
27
28#include <iostream>
29#include <sstream>
30#include <chrono>
31#include <regex>
32#include <time.h>
33#include <stdio.h>
34
35#include <stdexcept>
36#include <memory>
37#include <vector>
38#include <chrono>
39#include <map>
40#include <utility>
41#include <algorithm>
42#include <unordered_map>
43
44#include "bm.h"
45#include "bmalgo.h"
46#include "bmserial.h"
47#include "bmaggregator.h"
48
49#include "bmdbg.h"
50#include "bmtimer.h"
51#include "bmundef.h" /* clear the pre-proc defines from BM */
52
53using namespace std;
54
55static
57{
58 std::cerr
59 << "BitMagic DNA Search Sample (c) 2018" << std::endl
60 << "-fa file-name -- input FASTA file" << std::endl
61 << "-s hi|lo -- run substring search benchmark" << std::endl
62 << "-diag -- run diagnostics" << std::endl
63 << "-timing -- collect timings" << std::endl
64 ;
65}
66
67
68
69
70// Arguments
71//
72std::string ifa_name;
73bool is_diag = false;
74bool is_timing = false;
75bool is_bench = false;
76bool is_search = false;
77bool h_word_set = true;
78
79static
80int parse_args(int argc, char *argv[])
81{
82 for (int i = 1; i < argc; ++i)
83 {
84 std::string arg = argv[i];
85 if ((arg == "-h") || (arg == "--help"))
86 {
87 show_help();
88 return 0;
89 }
90
91 if (arg == "-fa" || arg == "--fa")
92 {
93 if (i + 1 < argc)
94 {
95 ifa_name = argv[++i];
96 }
97 else
98 {
99 std::cerr << "Error: -fa requires file name" << std::endl;
100 return 1;
101 }
102 continue;
103 }
104
105 if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
106 is_diag = true;
107 if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
108 is_timing = true;
109 if (arg == "-bench" || arg == "--bench" || arg == "-b" || arg == "--b")
110 is_bench = true;
111 if (arg == "-search" || arg == "--search" || arg == "-s" || arg == "--s")
112 {
113 is_search = true;
114 if (i + 1 < argc)
115 {
116 std::string a = argv[i+1];
117 if (a != "-")
118 {
119 if (a == "l" || a == "lo")
120 {
121 h_word_set = false;
122 ++i;
123 }
124 else
125 if (a == "h" || a == "hi")
126 {
127 h_word_set = true;
128 ++i;
129 }
130 }
131 }
132 }
133
134 } // for i
135 return 0;
136}
137
138
139// Global types
140//
141typedef std::map<std::string, unsigned> freq_map;
142typedef std::vector<std::pair<unsigned, std::string> > dict_vect;
143
145
146// ----------------------------------------------------------------------------
147
149
150// FASTA format parser
151static
152int load_FASTA(const std::string& fname, std::vector<char>& seq_vect)
153{
154 bm::chrono_taker tt1(cout, "1. Parse FASTA", 1, &timing_map);
155
156 seq_vect.resize(0);
157 std::ifstream fin(fname.c_str(), std::ios::in);
158 if (!fin.good())
159 return -1;
160
161 std::string line;
162 for (unsigned i = 0; std::getline(fin, line); ++i)
163 {
164 if (line.empty() ||
165 line.front() == '>')
166 continue;
167
168 for (std::string::iterator it = line.begin(); it != line.end(); ++it)
169 seq_vect.push_back(*it);
170 } // for
171 return 0;
172}
173
174
175
176/**
177 Utility for keeping all DNA finger print vectors and search
178 using various techniques
179*/
181{
182public:
183 enum { eA = 0, eC, eG, eT, eN, eEnd };
184
186
187 /// Build fingerprint bit-vectors from the original sequence
188 ///
189 void Build(const vector<char>& sequence)
190 {
191 // use bulk insert iterator (faster way to construct a bit-index)
192 //
198
199 for (size_t i = 0; i < sequence.size(); ++i)
200 {
201 unsigned pos = unsigned(i);
202 switch (sequence[i])
203 {
204 case 'A':
205 iA = pos;
206 break;
207 case 'C':
208 iC = pos;
209 break;
210 case 'G':
211 iG = pos;
212 break;
213 case 'T':
214 iT = pos;
215 break;
216 case 'N':
217 iN = pos;
218 break;
219 default:
220 break;
221 }
222 }
223 }
224
225 /// Return fingerprint bit-vector
226 const bm::bvector<>& GetVector(char letter) const
227 {
228 switch (letter)
229 {
230 case 'A':
231 return m_FPrintBV[eA];
232 case 'C':
233 return m_FPrintBV[eC];
234 case 'G':
235 return m_FPrintBV[eG];
236 case 'T':
237 return m_FPrintBV[eT];
238 case 'N':
239 return m_FPrintBV[eN];
240 default:
241 break;
242 }
243 throw runtime_error("Error. Invalid letter!");
244 }
245
246 /// Find word strings
247 /// using shift + and on fingerprint vectors
248 /// (horizontal, non-fused basic method)
249 ///
250 void Find(const string& word, vector<unsigned>& res)
251 {
252 if (word.empty())
253 return;
254 bm::bvector<> bv(GetVector(word[0])); // step 1: copy first vector
255
256 // run series of shifts + logical ANDs
257 for (size_t i = 1; i < word.size(); ++i)
258 {
259 bv.shift_right(); // SHIFT the accumulator bit-vector
260 // get and AND the next fingerprint
261 const bm::bvector<>& bv_mask = GetVector(word[i]);
262 bv &= bv_mask;
263
264 auto any = bv.any();
265 if (!any)
266 break;
267 }
268
269 // translate results from bvector of word ends to result
270 unsigned ws = unsigned(word.size()) - 1;
271 TranslateResults(bv, ws, res);
272 };
273
274
275 /// This method uses cache blocked aggregator with fused SHIFT+AND kernel
276 ///
277 void FindAggFused(const string& word, vector<unsigned>& res)
278 {
279 if (word.empty())
280 return;
281 // first we setup aggregator, add a group of vectors to be processed
282 m_Agg.reset();
283 for (size_t i = 0; i < word.size(); ++i)
284 {
285 const bm::bvector<>& bv_mask = GetVector(word[i]);
286 m_Agg.add(&bv_mask);
287 }
288
289 // now run the whole algorithm to get benefits of cache blocking
290 //
291 bm::bvector<> bv;
292 m_Agg.combine_shift_right_and(bv);
293
294 // translate results from bvector of word ends to result
295 unsigned ws = unsigned(word.size()) - 1;
296 TranslateResults(bv, ws, res);
297 };
298
299 /// Find a set of words in one pass using pipeline
300 /// of aggregators (this is very experimental)
301 ///
302 void FindCollection(const vector<tuple<string,int> >& words,
303 vector<vector<unsigned>>& hits)
304 {
305 vector<unique_ptr<aggregator_type> > agg_pipeline;
306 unsigned ws = 0;
307
308 for (const auto& w : words)
309 {
310 unique_ptr<aggregator_type> agg_ptr(new aggregator_type());
311 agg_ptr->set_operation(aggregator_type::BM_SHIFT_R_AND);
312
313 const string& word = get<0>(w);
314 for (size_t i = 0; i < word.size(); ++i)
315 {
316 const bm::bvector<>& bv_mask = GetVector(word[i]);
317 agg_ptr->add(&bv_mask);
318 }
319
320 agg_pipeline.emplace_back(agg_ptr.release());
321 ws = unsigned(word.size()) - 1;
322 }
323
324 // run the pipeline
326 vector<unique_ptr<aggregator_type> >::iterator>(agg_pipeline.begin(), agg_pipeline.end());
327
328 // convert the results
329 for (size_t i = 0; i < agg_pipeline.size(); ++i)
330 {
331 const aggregator_type* agg_ptr = agg_pipeline[i].get();
332 auto bv = agg_ptr->get_target();
333 vector<unsigned> res;
334 res.reserve(12000);
335 TranslateResults(*bv, ws, res);
336 hits.emplace_back(res);
337 }
338 }
339
340protected:
341
342 /// Translate search results vector using (word size) left shift
343 ///
345 unsigned left_shift,
346 vector<unsigned>& res)
347 {
349 for (; en.valid(); ++en)
350 {
351 auto pos = *en;
352 res.push_back(pos - left_shift);
353 }
354 }
355
356private:
357 bm::bvector<> m_FPrintBV[eEnd];
358 aggregator_type m_Agg;
359};
360
361static const size_t WORD_SIZE = 28;
362using THitList = vector<unsigned>;
363
364/// generate the most frequent words of specified length from the input sequence
365///
366static
367void generate_kmers(vector<tuple<string,int>>& top_words,
368 vector<tuple<string,int>>& lo_words,
369 const vector<char>& data,
370 size_t N,
371 unsigned word_size)
372{
373 cout << "k-mer generation... " << endl;
374
375 top_words.clear();
376 lo_words.clear();
377
378 if (data.size() < word_size)
379 return;
380
381 size_t end_pos = data.size() - word_size;
382 size_t i = 0;
383 map<string, int> words;
384 while (i < end_pos)
385 {
386 string s(&data[i], word_size);
387 if (s.find('N') == string::npos)
388 words[s] += 1;
389 i += word_size;
390 if (i % 10000 == 0)
391 {
392 cout << "\r" << i << "/" << end_pos << flush;
393 }
394 }
395
396 cout << endl << "Picking k-mer samples..." << flush;
397
398 multimap<int,string, greater<int>> dst;
399 for_each(words.begin(), words.end(), [&](const std::pair<string,int>& p)
400 {
401 dst.emplace(p.second, p.first);
402 });
403 {
404 auto it = dst.begin();
405 for(size_t count = 0; count < N && it !=dst.end(); ++it,++count)
406 top_words.emplace_back(it->second, it->first);
407 }
408
409 {
410 auto it = dst.rbegin();
411 for(size_t count = 0; count < N && it !=dst.rend(); ++it, ++count)
412 lo_words.emplace_back(it->second, it->first);
413 }
414
415 cout << "OK" << endl;
416}
417
418/// 2-way string matching
419///
420static
421void find_word_2way(vector<char>& data,
422 const char* word, unsigned word_size,
423 THitList& r)
424{
425 if (data.size() < word_size)
426 return;
427
428 size_t i = 0;
429 size_t end_pos = data.size() - word_size;
430 while (i < end_pos)
431 {
432 bool found = true;
433 for (size_t j = i, k = 0, l = word_size - 1; l > k; ++j, ++k, --l)
434 {
435 if (data[j] != word[k] || data[i + l] != word[l])
436 {
437 found = false;
438 break;
439 }
440 }
441 if (found)
442 r.push_back(unsigned(i));
443 ++i;
444 }
445}
446
447/// Find all words in one pass (cache coherent algorithm)
448/// (variation of 2-way string matching for collection search)
449///
450static
451void find_words(const vector<char>& data,
452 vector<const char*> words,
453 unsigned word_size,
454 vector<vector<unsigned>>& hits)
455{
456 if (data.size() < word_size)
457 return;
458
459 size_t i = 0;
460 size_t end_pos = data.size() - word_size;
461 size_t words_size = words.size();
462 while (i < end_pos)
463 {
464 for (size_t idx = 0; idx < words_size; ++idx)
465 {
466 auto& word = words[idx];
467 bool found = true;
468 for (size_t j = i, k = 0, l = word_size - 1; l > k; ++j, ++k, --l)
469 {
470 if (data[j] != word[k] || data[i + l] != word[l])
471 {
472 found = false;
473 break;
474 }
475 } // for
476 if (found)
477 {
478 hits[idx].push_back(unsigned(i));
479 break;
480 }
481 } // for
482 ++i;
483 } // while
484}
485
486
487/// Check search result match
488///
489static
490bool hitlist_compare(const THitList& h1, const THitList& h2)
491{
492 if (h1.size() != h2.size())
493 {
494 cerr << "size1 = " << h1.size() << " size2 = " << h2.size() << endl;
495 return false;
496 }
497 for (size_t i = 0; i < h1.size(); ++i)
498 {
499 if (h1[i] != h2[i])
500 return false;
501 }
502 return true;
503}
504
505
506
507
508int main(int argc, char *argv[])
509{
510 if (argc < 3)
511 {
512 show_help();
513 return 1;
514 }
515
516 std::vector<char> seq_vect;
517
518 try
519 {
520 auto ret = parse_args(argc, argv);
521 if (ret != 0)
522 return ret;
523
525
526 if (!ifa_name.empty())
527 {
528 auto res = load_FASTA(ifa_name, seq_vect);
529 if (res != 0)
530 return res;
531 std::cout << "FASTA sequence size=" << seq_vect.size() << std::endl;
532
533 {
534 bm::chrono_taker tt1(cout, "2. Build DNA index", 1, &timing_map);
535 idx.Build(seq_vect);
536 }
537 }
538
539
540 if (is_search)
541 {
542 vector<tuple<string,int> > h_words;
543 vector<tuple<string,int> > l_words;
544
545 vector<tuple<string,int>>& words = h_word_set ? h_words : l_words;
546
547 // generate search sets for benchmarking
548 //
549 generate_kmers(h_words, l_words, seq_vect, 25, WORD_SIZE);
550
551
552 vector<THitList> word_hits;
553 vector<THitList> word_hits_agg;
554
555 // search all words in one pass and
556 // store results in list of hits according to the order of words
557 // (this method uses memory proximity
558 // of searched words to maximize CPU cache hit rate)
559
560 {
561 vector<const char*> word_list;
562 for (const auto& w : words)
563 {
564 word_list.push_back(get<0>(w).c_str());
565 }
566 word_hits.resize(words.size());
567 for_each(word_hits.begin(), word_hits.end(), [](THitList& ht) {
568 ht.reserve(12000);
569 });
570
571 bm::chrono_taker tt1(cout, "6. String search 2-way single pass",
572 unsigned(words.size()), &timing_map);
573 find_words(seq_vect, word_list, unsigned(WORD_SIZE), word_hits);
574 }
575
576 // collection search, runs all hits at once
577 //
578 {
579 bm::chrono_taker tt1(cout, "7. Aggregated search single pass",
580 unsigned(words.size()), &timing_map);
581
582 idx.FindCollection(words, word_hits_agg);
583 }
584
585 // a few variants of word-by-word searches
586 //
587 for (size_t word_idx = 0; word_idx < words.size(); ++ word_idx)
588 {
589 auto& word = get<0>(words[word_idx]);
590 THitList hits1;
591
592 {
593 bm::chrono_taker tt1(cout, "3. String search 2-way", 1, &timing_map);
594 find_word_2way(seq_vect,
595 word.c_str(), unsigned(word.size()),
596 hits1);
597 }
598 THitList hits2;
599 {
600 bm::chrono_taker tt1(cout, "4. Search with bvector SHIFT+AND", 1, &timing_map);
601 idx.Find(word, hits2);
602 }
603 THitList hits4;
604 {
605 bm::chrono_taker tt1(cout, "5. Search with aggregator fused SHIFT+AND", 1, &timing_map);
606 idx.FindAggFused(word, hits4);
607 }
608
609 // check correctness
610 if (!hitlist_compare(hits1, hits2)
611 || !hitlist_compare(hits2, hits4))
612 {
613 cout << "Mismatch ERROR for: " << word << endl;
614 }
615 else
616 if (!hitlist_compare(word_hits[word_idx], hits1)
617 || !hitlist_compare(word_hits_agg[word_idx], hits1))
618 {
619 cout << "Sigle pass mismatch ERROR for: " << word << endl;
620 }
621 else
622 {
623 cout << word_idx << ": " << word << ": " << hits1.size() << " hits " << endl;
624 }
625 }
626
627 }
628
629 if (is_timing) // print all collected timings
630 {
631 std::cout << std::endl << "Performance:" << std::endl;
633 }
634 }
635 catch (std::exception& ex)
636 {
637 std::cerr << "Error:" << ex.what() << std::endl;
638 return 1;
639 }
640
641 return 0;
642}
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
Algorithms for fast aggregation of N bvectors.
Algorithms for bvector<> (main include)
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Utility for keeping all DNA finger print vectors and search using various techniques.
Definition: xsample04.cpp:181
void FindAggFused(const string &word, vector< unsigned > &res)
This method uses cache blocked aggregator with fused SHIFT+AND kernel.
Definition: xsample04.cpp:277
void Find(const string &word, vector< unsigned > &res)
Find word strings using shift + and on fingerprint vectors (horizontal, non-fused basic method)
Definition: xsample04.cpp:250
void Build(const vector< char > &sequence)
Build fingerprint bit-vectors from the original sequence.
Definition: xsample04.cpp:189
const bm::bvector & GetVector(char letter) const
Return fingerprint bit-vector.
Definition: xsample04.cpp:226
void TranslateResults(const bm::bvector<> &bv, unsigned left_shift, vector< unsigned > &res)
Translate search results vector using (word size) left shift.
Definition: xsample04.cpp:344
void FindCollection(const vector< tuple< string, int > > &words, vector< vector< unsigned >> &hits)
Find a set of words in one pass using pipeline of aggregators (this is very experimental)
Definition: xsample04.cpp:302
Algorithms for fast aggregation of a group of bit-vectors.
Definition: bmaggregator.h:122
void combine_shift_right_and(bvector_type &bv_target)
Aggregate added group of vectors using SHIFT-RIGHT and logical AND Operation does NOT perform an expl...
void reset()
Reset aggregate groups, forget all attached vectors.
Definition: bmaggregator.h:932
const bvector_type * get_target() const BMNOEXCEPT
Definition: bmaggregator.h:623
size_t add(const bvector_type *bv, unsigned agr_group=0)
Attach source bit-vector to a argument group (0 or 1).
Definition: bmaggregator.h:986
Output iterator iterator designed to set "ON" bits based on input sequence of integers.
Definition: bm.h:465
Constant iterator designed to enumerate "ON" bits.
Definition: bm.h:603
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Definition: bm.h:283
Bitvector Bit-vector container with runtime compression of bits.
Definition: bm.h:115
bool any() const BMNOEXCEPT
Returns true if any bits in this bitset are set, and otherwise returns false.
Definition: bm.h:2416
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
Definition: bm.h:1849
bool shift_right()
Shift right by 1 bit, fill with zero return carry out.
Definition: bm.h:5185
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:41
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:66
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:150
@ BM_SORTED
input set is sorted (ascending order)
Definition: bmconst.h:205
void aggregator_pipeline_execute(It first, It last)
Experimental method ro run multiple aggregators in sync.
Definition: bmaggregator.h:865
static void find_words(const vector< char > &data, vector< const char * > words, unsigned word_size, vector< vector< unsigned >> &hits)
Find all words in one pass (cache coherent algorithm) (variation of 2-way string matching for collect...
Definition: xsample04.cpp:451
int main(int argc, char *argv[])
Definition: xsample04.cpp:508
static int parse_args(int argc, char *argv[])
Definition: xsample04.cpp:80
vector< unsigned > THitList
Definition: xsample04.cpp:362
std::map< std::string, unsigned > freq_map
Definition: xsample04.cpp:141
bool is_search
Definition: xsample04.cpp:76
static const size_t WORD_SIZE
Definition: xsample04.cpp:361
bool is_bench
Definition: xsample04.cpp:75
bm::aggregator< bm::bvector<> > aggregator_type
Definition: xsample04.cpp:144
static void show_help()
Definition: xsample04.cpp:56
static bool hitlist_compare(const THitList &h1, const THitList &h2)
Check search result match.
Definition: xsample04.cpp:490
std::string ifa_name
Definition: xsample04.cpp:72
bm::chrono_taker ::duration_map_type timing_map
Definition: xsample04.cpp:148
bool h_word_set
Definition: xsample04.cpp:77
bool is_diag
Definition: xsample04.cpp:73
std::vector< std::pair< unsigned, std::string > > dict_vect
Definition: xsample04.cpp:142
static void find_word_2way(vector< char > &data, const char *word, unsigned word_size, THitList &r)
2-way string matching
Definition: xsample04.cpp:421
static int load_FASTA(const std::string &fname, std::vector< char > &seq_vect)
Definition: xsample04.cpp:152
bool is_timing
Definition: xsample04.cpp:74
static void generate_kmers(vector< tuple< string, int >> &top_words, vector< tuple< string, int >> &lo_words, const vector< char > &data, size_t N, unsigned word_size)
generate the most frequent words of specified length from the input sequence
Definition: xsample04.cpp:367