BitMagic-C++
xsample04a.cpp
Go to the documentation of this file.
1 /*
2 Copyright(c) 2018 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3 
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 
16 For more information please visit: http://bitmagic.io
17 */
18 
19 /** \example xsample04a.cpp
20 
21 */
22 
23 /*! \file xsample04a.cpp
24  \brief Example: DNA index construction
25 
26 */
27 
28 #include <iostream>
29 #include <sstream>
30 #include <regex>
31 #include <time.h>
32 #include <stdio.h>
33 
34 #include <stdexcept>
35 #include <memory>
36 #include <vector>
37 
38 #include <future>
39 #include <thread>
40 #include <mutex>
41 
42 #include "bm.h"
43 
44 #include "bmdbg.h"
45 #include "bmtimer.h"
46 
47 
48 using namespace std;
49 
50 static
51 void show_help()
52 {
53  std::cerr
54  << "BitMagic DNA Index Build Sample (c) 2018" << std::endl
55  << "-fa file-name -- input FASTA file" << std::endl
56  << "-j number -- number of parallel jobs to run" << std::endl
57  << "-timing -- collect timings" << std::endl
58  ;
59 }
60 
61 
62 
63 
64 // Arguments
65 //
66 std::string ifa_name;
67 bool is_timing = false;
68 unsigned parallel_jobs = 4;
69 
70 static
71 int parse_args(int argc, char *argv[])
72 {
73  for (int i = 1; i < argc; ++i)
74  {
75  std::string arg = argv[i];
76  if ((arg == "-h") || (arg == "--help"))
77  {
78  show_help();
79  return 0;
80  }
81  if (arg == "-fa" || arg == "--fa")
82  {
83  if (i + 1 < argc)
84  {
85  ifa_name = argv[++i];
86  }
87  else
88  {
89  std::cerr << "Error: -fa requires file name" << std::endl;
90  return 1;
91  }
92  continue;
93  }
94  if (arg == "-j" || arg == "--j")
95  {
96  if (i + 1 < argc)
97  {
98  parallel_jobs = unsigned(::atoi(argv[++i]));
99  }
100  else
101  {
102  std::cerr << "Error: -j requires number of jobs" << std::endl;
103  return 1;
104  }
105  continue;
106  }
107 
108  if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
109  is_timing = true;
110 
111  } // for i
112  return 0;
113 }
114 
115 
116 
117 // ----------------------------------------------------------------------------
118 
120 
121 // FASTA format parser
122 static
123 int load_FASTA(const std::string& fname, std::vector<char>& seq_vect)
124 {
125  bm::chrono_taker tt1("1. Parse FASTA", 1, &timing_map);
126 
127  seq_vect.resize(0);
128  std::ifstream fin(fname.c_str(), std::ios::in);
129  if (!fin.good())
130  return -1;
131 
132  std::string line;
133  for (unsigned i = 0; std::getline(fin, line); ++i)
134  {
135  if (line.empty() ||
136  line.front() == '>')
137  continue;
138 
139  for (std::string::iterator it = line.begin(); it != line.end(); ++it)
140  seq_vect.push_back(*it);
141  } // for
142  return 0;
143 }
144 
145 
146 
147 /**
148  Utility for keeping all DNA finger print vectors and search
149  using various techniques
150 */
152 {
153 public:
154  enum { eA = 0, eC, eG, eT, eN, eEnd };
155 
157 
158  /// Build fingerprint bit-vectors from the original sequence
159  ///
160  void Build(const vector<char>& sequence)
161  {
162  bm::bvector<>::insert_iterator iA = m_FPrintBV[eA].inserter();
163  bm::bvector<>::insert_iterator iC = m_FPrintBV[eC].inserter();
164  bm::bvector<>::insert_iterator iG = m_FPrintBV[eG].inserter();
165  bm::bvector<>::insert_iterator iT = m_FPrintBV[eT].inserter();
166  bm::bvector<>::insert_iterator iN = m_FPrintBV[eN].inserter();
167 
168  for (size_t i = 0; i < sequence.size(); ++i)
169  {
170  unsigned pos = unsigned(i);
171  switch (sequence[i])
172  {
173  case 'A':
174  iA = pos;
175  break;
176  case 'C':
177  iC = pos;
178  break;
179  case 'G':
180  iG = pos;
181  break;
182  case 'T':
183  iT = pos;
184  break;
185  case 'N':
186  iN = pos;
187  break;
188  default:
189  break;
190  }
191  }
192  }
193 
194  /// Build index using bulk insert iterator
195  ///
196  void BuildBulk(const vector<char>& sequence)
197  {
203 
204  for (size_t i = 0; i < sequence.size(); ++i)
205  {
206  unsigned pos = unsigned(i);
207  switch (sequence[i])
208  {
209  case 'A':
210  iA = pos;
211  break;
212  case 'C':
213  iC = pos;
214  break;
215  case 'G':
216  iG = pos;
217  break;
218  case 'T':
219  iT = pos;
220  break;
221  case 'N':
222  iN = pos;
223  break;
224  default:
225  break;
226  }
227  }
228  }
229 
230 
231  /// Build fingerprint bit-vectors using bulk insert iterator and parallel
232  /// processing
233  ///
234  void BuildParallel(const vector<char>& sequence, unsigned threads)
235  {
236  struct Func
237  {
238  DNA_FingerprintScanner* target_idx;
239  const std::vector<char>* src_sequence;
240 
241  Func(DNA_FingerprintScanner* idx, const vector<char>& src)
242  : target_idx(idx), src_sequence(&src) {}
243 
244  void operator() (size_t from, size_t to)
245  {
246  const vector<char>& sequence = *src_sequence;
247  bm::bvector<> bvA, bvT, bvG, bvC, bvN;
248 
249  {
255  for (size_t i = from; i < sequence.size() && (i < to); ++i)
256  {
257  unsigned pos = unsigned(i);
258  switch (sequence[i])
259  {
260  case 'A':
261  iA = pos;
262  break;
263  case 'C':
264  iC = pos;
265  break;
266  case 'G':
267  iG = pos;
268  break;
269  case 'T':
270  iT = pos;
271  break;
272  case 'N':
273  iN = pos;
274  break;
275  default:
276  break;
277  }
278  } // for
279  // Bulk insert iterator keeps an buffer, which has to be
280  // flushed, before all bits appear in the target vector
281  //
282  iA.flush();
283  iC.flush();
284  iT.flush();
285  iG.flush();
286  iN.flush();
287  }
288  // merge results of parallel processing back to index
289  target_idx->MergeVector('A', bvA);
290  target_idx->MergeVector('T', bvT);
291  target_idx->MergeVector('G', bvG);
292  target_idx->MergeVector('C', bvC);
293  target_idx->MergeVector('N', bvN);
294  }
295  };
296 
297  if (threads <= 1)
298  {
299  BuildBulk(sequence);
300  return;
301  }
302 
303  // Create parallel async tasks running on a range of source sequence
304  //
305  std::vector<std::future<void> > futures;
306  futures.reserve(8);
307  unsigned range = unsigned(sequence.size() / threads);
308 
309  for (unsigned k = 0; k < sequence.size(); k += range)
310  {
311  futures.emplace_back(std::async(std::launch::async,
312  Func(this, sequence), k, k + range));
313  }
314 
315  // wait for all tasks
316  for (auto& e : futures)
317  {
318  e.wait();
319  }
320  }
321 
322  /// Thread sync bit-vector merge
323  ///
324  void MergeVector(char letter, bm::bvector<>& bv)
325  {
326  static std::mutex mtx_A;
327  static std::mutex mtx_T;
328  static std::mutex mtx_G;
329  static std::mutex mtx_C;
330  static std::mutex mtx_N;
331 
332  switch (letter)
333  {
334  case 'A':
335  {
336  std::lock_guard<std::mutex> guard(mtx_A);
337  m_FPrintBV[eA].merge(bv);
338  }
339  break;
340  case 'C':
341  {
342  std::lock_guard<std::mutex> guard(mtx_C);
343  m_FPrintBV[eC].merge(bv);
344  }
345  break;
346  case 'G':
347  {
348  std::lock_guard<std::mutex> guard(mtx_G);
349  m_FPrintBV[eG].merge(bv);
350  }
351  break;
352  case 'T':
353  {
354  std::lock_guard<std::mutex> guard(mtx_T);
355  m_FPrintBV[eT].merge(bv);
356  }
357  break;
358  case 'N':
359  {
360  std::lock_guard<std::mutex> guard(mtx_N);
361  m_FPrintBV[eN].merge(bv);
362  }
363  break;
364  default:
365  break;
366  }
367 
368  }
369 
370  /// Return fingerprint bit-vector
371  const bm::bvector<>& GetVector(char letter) const
372  {
373  switch (letter)
374  {
375  case 'A':
376  return m_FPrintBV[eA];
377  case 'C':
378  return m_FPrintBV[eC];
379  case 'G':
380  return m_FPrintBV[eG];
381  case 'T':
382  return m_FPrintBV[eT];
383  case 'N':
384  return m_FPrintBV[eN];
385  default:
386  break;
387  }
388  throw runtime_error("Error. Invalid letter!");
389  }
390 
391 private:
392  bm::bvector<> m_FPrintBV[eEnd];
393 };
394 
395 /// Check correctness of indexes constructed using different methods
396 ///
397 static
399  const DNA_FingerprintScanner& idx2)
400 {
401  std::vector<char> letters {'A', 'T', 'G', 'C'};
402  for (char base : letters)
403  {
404  const bm::bvector<>& bv1 = idx1.GetVector(base);
405  const bm::bvector<>& bv2 = idx2.GetVector(base);
406 
407  int cmp = bv1.compare(bv2);
408  if (cmp != 0)
409  {
410  throw runtime_error(string("Fingerprint mismatch for:") + string(1, base));
411  }
412  } // for
413 }
414 
415 
416 
417 int main(int argc, char *argv[])
418 {
419  if (argc < 3)
420  {
421  show_help();
422  return 1;
423  }
424 
425  std::vector<char> seq_vect;
426 
427  try
428  {
429  auto ret = parse_args(argc, argv);
430  if (ret != 0)
431  return ret;
432 
435 
436  if (!ifa_name.empty())
437  {
438  auto res = load_FASTA(ifa_name, seq_vect);
439  if (res != 0)
440  return res;
441  std::cout << "FASTA sequence size=" << seq_vect.size() << std::endl;
442 
443  {
444  bm::chrono_taker tt1("2. Build DNA index", 1, &timing_map);
445  idx1.Build(seq_vect);
446  }
447 
448  if (parallel_jobs > 0)
449  {
450  std::cout << "jobs = " << parallel_jobs << std::endl;
451  bm::chrono_taker tt1("3. Build DNA index (bulk, parallel)", 1, &timing_map);
452  idx2.BuildParallel(seq_vect, parallel_jobs);
453  }
454 
455  // compare results (correctness check)
456  //
457  fingerprint_compare(idx1, idx2);
458  }
459 
460  if (is_timing) // print all collected timings
461  {
462  std::cout << std::endl << "Performance:" << std::endl;
464  }
465  }
466  catch (std::exception& ex)
467  {
468  std::cerr << "Error:" << ex.what() << std::endl;
469  return 1;
470  }
471 
472  return 0;
473 }
parallel_jobs
unsigned parallel_jobs
Definition: xsample04a.cpp:68
bm::chrono_taker
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:39
bm::bvector::insert_iterator
Output iterator iterator designed to set "ON" bits based on input sequence of integers (bit indeces).
Definition: bm.h:377
DNA_FingerprintScanner::BuildParallel
void BuildParallel(const vector< char > &sequence, unsigned threads)
Build fingerprint bit-vectors using bulk insert iterator and parallel processing.
Definition: xsample04a.cpp:234
DNA_FingerprintScanner::BuildBulk
void BuildBulk(const vector< char > &sequence)
Build index using bulk insert iterator.
Definition: xsample04a.cpp:196
bm::BM_SORTED
@ BM_SORTED
input set is sorted (ascending order)
Definition: bmconst.h:191
DNA_FingerprintScanner::Build
void Build(const vector< char > &sequence)
Build fingerprint bit-vectors from the original sequence.
Definition: xsample04a.cpp:160
bm::bvector::compare
int compare(const bvector< Alloc > &bvect) const BMNOEXCEPT
Lexicographical comparison with a bitvector.
Definition: bm.h:3185
DNA_FingerprintScanner
Utility for keeping all DNA finger print vectors and search using various techniques.
Definition: xsample04.cpp:180
ifa_name
std::string ifa_name
Definition: xsample04a.cpp:66
main
int main(int argc, char *argv[])
Definition: xsample04a.cpp:417
DNA_FingerprintScanner::MergeVector
void MergeVector(char letter, bm::bvector<> &bv)
Thread sync bit-vector merge.
Definition: xsample04a.cpp:324
is_timing
bool is_timing
Definition: xsample04a.cpp:67
DNA_FingerprintScanner::DNA_FingerprintScanner
DNA_FingerprintScanner()
Definition: xsample04a.cpp:156
bm::bvector<>
timing_map
bm::chrono_taker::duration_map_type timing_map
Definition: xsample04a.cpp:119
show_help
static void show_help()
Definition: xsample04a.cpp:51
bmtimer.h
Timing utilities for benchmarking (internal)
bm::chrono_taker::ct_all
@ ct_all
Definition: bmtimer.h:60
fingerprint_compare
static void fingerprint_compare(const DNA_FingerprintScanner &idx1, const DNA_FingerprintScanner &idx2)
Check correctness of indexes constructed using different methods.
Definition: xsample04a.cpp:398
bm::chrono_taker::duration_map_type
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:65
bm::chrono_taker::print_duration_map
static void print_duration_map(const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:127
bm::bvector::bulk_insert_iterator::flush
void flush()
Definition: bm.h:563
load_FASTA
static int load_FASTA(const std::string &fname, std::vector< char > &seq_vect)
Definition: xsample04a.cpp:123
bm::bvector::bulk_insert_iterator
Output iterator iterator designed to set "ON" bits based on input sequence of integers.
Definition: bm.h:461
bm.h
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
parse_args
static int parse_args(int argc, char *argv[])
Definition: xsample04a.cpp:71
DNA_FingerprintScanner::GetVector
const bm::bvector & GetVector(char letter) const
Return fingerprint bit-vector.
Definition: xsample04a.cpp:371