BitMagic-C++
xsample06.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2002-2019 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example xsample06.cpp
20 Use of sparse vector for compressed DNA strings (2 bits per bp)
21 Construction of comparison functions on bit-transposed compressed
22 containers, benchmarking.
23
24 \sa bm::sparse_vector
25 \sa bm::sparse_vector_find_first_mismatch
26
27*/
28
29/*! \file xsample06.cpp
30 \brief Example: Use of sparse vector for compressed DNA strings
31*/
32
33#include <assert.h>
34#include <stdlib.h>
35
36#include <iostream>
37#include <vector>
38#include <algorithm>
39#include <utility>
40
41#include "bm.h"
42#include "bmsparsevec.h"
43#include "bmsparsevec_algo.h"
44#include "bmsparsevec_serial.h"
45
46// BitMagic utilities for debug and timings
47#include "bmdbg.h"
48#include "bmtimer.h"
49#include "bmundef.h" /* clear the pre-proc defines from BM */
50
51using namespace std;
52
53
54/// Translate DNA letter to integer code
55/// Please note this function uses extended alphabet ATGC plus 'N' and '$'
56/// Ns are used to indicate ambiguity and $ is part of Burrows-Wheeler transform
57///
58inline unsigned DNA2int(char DNA_bp)
59{
60 switch (DNA_bp)
61 {
62 case 'A':
63 return 0; // 000
64 case 'T':
65 return 1; // 001
66 case 'G':
67 return 2; // 010
68 case 'C':
69 return 3; // 011
70 case 'N':
71 return 4; // 100
72 case '$':
73 return 5; // 101
74 default:
75 assert(0);
76 return 0;
77 }
78}
79/// Translate integer code to DNA letter
80///
81inline char int2DNA(unsigned code)
82{
83 static char lut[] = { 'A', 'T', 'G', 'C', 'N', '$' };
84 if (code < 5)
85 return lut[code];
86 assert(0);
87 return 'N';
88}
89
90/// Print sparse vector
91template<typename SV> void PrintSV(const SV& sv)
92{
93 cout << "size() = " << sv.size() << " : ";
94 auto it = sv.begin(); auto it_end = sv.end();
95 for (; it != it_end; ++it)
96 {
97 auto v = *it;
98 char bp = int2DNA(v);
99 cout << bp;
100 }
101 cout << endl;
102}
103
104/// Test sparse vector against reference vector
105/// (for QA purposes)
106///
107template<typename SV, typename VECT>
108void TestSV(const SV& sv, const VECT& vect)
109{
110 auto it = sv.begin(); auto it_end = sv.end();
111 auto itv = vect.begin(); auto itv_end = vect.end();
112 for (; it != it_end && itv != itv_end; ++it, ++itv)
113 {
114 auto v = *it;
115 char bp = int2DNA(v);
116 char bpv = *itv;
117 assert(bp == bpv);
118 if (bp != bpv)
119 {
120 cerr << "Error: reference vector mismatch!" << endl;
121 exit(1);
122 }
123 }
124 if (it != it_end || itv != itv_end)
125 {
126 cerr << "Error: reference size mismatch!" << endl;
127 exit(1);
128 }
129}
130
131
133typedef std::vector<char> vector_char_type;
134typedef std::vector<std::pair<unsigned, unsigned> > vector_pairs_type;
135
136/*
137 Lexicographical compare of two sparse vectors using bit-plane
138 mismatch search
139*/
140static
141int compare_sv(const svector_u32& sv1, const svector_u32& sv2)
142{
144 bool found = bm::sparse_vector_find_first_mismatch(sv1, sv2, pos);
145 if (found)
146 {
147 auto v1 = sv1[pos];
148 auto v2 = sv2[pos];
149 if (v1 < v2)
150 return -1;
151 return 1;
152 }
153 return 0; // EQ
154}
155
156/*
157 Lexicographical compare of two sparse vectors using
158 iterators. This variant is quite slow, because sparse vector iterators
159 are heavy classes doing reverse transformation on the fly
160*/
161static
162int compare_sv_it(const svector_u32& sv1, const svector_u32& sv2)
163{
166 svector_u32::size_type sz = sv1.size();
167 for (svector_u32::size_type i = 0; i < sz; ++i, ++it1, ++it2)
168 {
169 auto v1 = *it1; auto v2 = *it2;
170 if (v1 == v2)
171 continue;
172 if (v1 < v2)
173 return -1;
174 return 1;
175 }
176 return 0;
177}
178
179/*
180 Utility function to generate random vector of DNA chars (4 letters)
181*/
182static
184{
186 for (unsigned i = 0; i < sz; ++i)
187 {
188 unsigned code = rand() % 4; // generate code between 0 and 3
189 char bp = int2DNA(code);
190 assert(bp == 'A' || bp == 'T' || bp == 'G' || bp == 'C');
191 vect.push_back(bp);
192 bi = code;
193 }
194 bi.flush();
195}
196
197/*
198 Utility function to add symulated centromer (block of Ns in the middle)
199*/
200static
201void add_centromer_Ns(svector_u32& sv, vector_char_type& vect, unsigned csz)
202{
203 assert(csz);
204
206 assert(sz == svector_u32::size_type(vect.size()));
207 assert(csz < sz);
208
209 svector_u32::size_type half = sz / 2; // center position
210 svector_u32::size_type c_half = csz / 2;
211 svector_u32::size_type c_start = half - c_half; // simulated centromere start
212 svector_u32::size_type c_end = half + c_half; // simulated centromere end
213
214 // fill-in simulated centromere 'NNNNNN...NNNN'
215 for (svector_u32::size_type i = c_start; i < c_end; ++i)
216 {
217 vect[i] = 'N';
218 sv[i] = DNA2int('N'); // 4
219 }
220}
221
222/*
223 Generate large vectors to simulate human chr1
224*/
225static
227{
228 const unsigned case_size = 200000000; // 200M bps
229 const unsigned centr_size = 7000000; // 7M centromere Ns
230
231 generate_DNA_vector(sv, vect, case_size);
232 add_centromer_Ns(sv, vect, centr_size);
233
234 sv.optimize(); // this will compress the sparse (N) plane
235
236 TestSV(sv, vect); // paranoiya check
237}
238
239/*
240 Generate benchmark set of mismatches
241*/
242static
244 const vector_char_type& vect,
245 unsigned m_count)
246{
247 vect_m.resize(0);
248 if (!vect.size())
249 return;
250
251 vector_char_type::size_type sz = vect.size();
252 vector_char_type::size_type delta = sz / m_count;
253
254 for (vector_char_type::size_type i = 0; i < sz; i += delta)
255 {
256 vector_char_type::value_type v1 = vect[i];
257 unsigned code = rand() % 4;
258 vector_char_type::value_type v2 = int2DNA(code);
259 if (v2 == v1)
260 continue;
261 vect_m.push_back(vector_pairs_type::value_type(unsigned(i), code));
262 } // for i
263
264 // add some extra with a distrubution skewed to the beginning
265 //
266 for (vector_char_type::size_type i = 1; i < sz / 4; i += (rand()%(1024 * 10)))
267 {
268 vector_char_type::value_type v1 = vect[i];
269 unsigned code = rand() % 4;
270 vector_char_type::value_type v2 = int2DNA(code);
271 if (v2 == v1)
272 continue;
273 vect_m.push_back(vector_pairs_type::value_type(unsigned(i), code));
274 } // for i
275
276}
277
278// -------------------------------------------------------------------
279
280const unsigned rep = 20000;
282
283
284/*
285 Mismatch search benchmark for bm::sparse_vector_find_first_mismatch
286*/
287static
289{
290 svector_u32 sv1(sv); // copy sv
292
293 unsigned cnt = 0;
294
295 bm::chrono_taker tt1(cout, "1. SV mismatch", sz, &timing_map);
296
297 for (unsigned k = 0; k < sz; ++k)
298 {
299 unsigned i = vect_m[k].first;
300 svector_u32::value_type v1 = sv1[i];
301 svector_u32::value_type v2 = vect_m[k].second;
302 assert(v1 != v2);
303
304 sv1.set(i, v2); // change the copy vector
305
307 bool found = bm::sparse_vector_find_first_mismatch(sv, sv1, pos);
308 cnt += found;
309 assert(found);
310 assert(pos == i);
311
312 sv1.set(pos, v1); // restore old value
313 } // for
314 assert(cnt == vect_m.size());
315}
316
317/*
318 Mismatch search benchmark for std::vector::const_iterator
319*/
320static
322 const vector_pairs_type& vect_m)
323{
324 vector_char_type vect1(vect);
325 unsigned sz = (unsigned)vect_m.size();
326
327 unsigned cnt = 0;
328
329 bm::chrono_taker tt1(cout, "2. STL iterator", sz, &timing_map);
330
331 for (unsigned k = 0; k < sz; ++k)
332 {
333 unsigned i = vect_m[k].first;
334 vector_char_type::value_type v1 = vect[i];
335 vector_char_type::value_type v2 = int2DNA(vect_m[k].second);
336 assert(v1 != v2);
337
338 vect1[i] = v2; // change the copy vector
339
340 // possible to use std::mismatch() here
341 bool found = false;
342 vector_char_type::size_type pos;
343 auto it = vect.begin(); auto it_end = vect.end();
344 auto it1 = vect1.begin();
345 for (; it != it_end; ++it, ++it1)
346 {
347 if (*it != *it1)
348 {
349 found = true;
350 pos = vector_char_type::size_type(it - vect.begin());
351 break;
352 }
353 } // for it
354 cnt += found;
355 assert(found);
356 assert(pos == i);
357 vect1[i] = v1; // restore old value
358 } // for
359 assert(cnt == vect_m.size());
360}
361
362
363/*
364 Mismatch search benchmark for std::vector using strncmp()
365*/
366static
367void test_strcmp(const vector_char_type& vect, const vector_pairs_type& vect_m)
368{
369 vector_char_type vect1(vect);
370 unsigned sz = (unsigned)vect_m.size();
371 unsigned cnt = 0;
372
373 bm::chrono_taker tt1(cout, "3. strcmp() test ", sz, &timing_map);
374
375 unsigned vs = unsigned(vect.size());
376
377 for (unsigned k = 0; k < sz; ++k)
378 {
379 unsigned i = vect_m[k].first;
380 vector_char_type::value_type v1 = vect[i];
381 assert(vect_m[k].second < 4);
382 vector_char_type::value_type v2 = int2DNA(vect_m[k].second);
383 assert(v1 != v2);
384
385 vect1[i] = v2; // change the copy vector
386
387 const char* s1 = vect.data();
388 const char* s2 = vect1.data();
389
390 int res = ::strncmp(s1, s2, vs);
391 cnt += bool(res);
392 assert(res);
393 vect1[i] = v1; // restore
394 } // for
395 assert(cnt == vect_m.size());
396}
397
398/*
399 Sparse vector compare benchmark
400*/
401static
402void test_sv_cmp(const svector_u32& sv, const vector_pairs_type& vect_m)
403{
404 svector_u32 sv1(sv);
405 unsigned sz = (unsigned)vect_m.size();
406 unsigned cnt = 0;
407
408 bm::chrono_taker tt1(cout, "4. sv compare", sz, &timing_map);
409
410 for (unsigned k = 0; k < sz; ++k)
411 {
412 unsigned i = vect_m[k].first;
413 svector_u32::value_type v1 = sv1[i];
414 svector_u32::value_type v2 = vect_m[k].second;
415 assert(v1 != v2);
416
417 sv1.set(i, v2); // change the copy vector
418
419 int res = compare_sv(sv, sv1);
420 assert(res != 0);
421 cnt += bool(res);
422
423 sv1.set(i, v1); // restore
424 } // for
425 assert(cnt == vect_m.size());
426}
427
428/*
429 Sparse vector compare benchmark using iterators (very slow)
430*/
431inline
432void test_sv_cmp_it(const svector_u32& sv, const vector_pairs_type& vect_m)
433{
434 svector_u32 sv1(sv);
435 unsigned sz = (unsigned)vect_m.size();
436 unsigned cnt = 0;
437
438 bm::chrono_taker tt1(cout, "4. sv-cmp-it()", sz, &timing_map);
439
440 for (unsigned k = 0; k < sz; ++k)
441 {
442 unsigned i = vect_m[k].first;
443 svector_u32::value_type v1 = sv1[i];
444 svector_u32::value_type v2 = vect_m[k].second;
445 assert(v1 != v2);
446
447 sv1.set(i, v2); // change the copy vector
448
449 int res = compare_sv_it(sv, sv1);
450 cnt += bool(res);
451
452 assert(res != 0);
453
454 sv1.set(i, v1); // restore
455 } // for
456 assert(cnt == vect_m.size());
457}
458
459// -------------------------------------------------------------------
460
461int main(void)
462{
463 try
464 {
465 // generate short DNA vector as STL vector and BitMagic
466 // bit-transposed container, print the results (as a demo)
467 // we don't have to re-load it every time,
468 // bit transposed container is serializable
469 {
470 const char* s = "ATGTCNNNNNTATA";
471 svector_u32 sv;
472 {
474 for (unsigned i = 0; s[i]; ++i)
475 {
476 bi = DNA2int(s[i]);
477 }
478 bi.flush();
479 }
480 sv.optimize(); // this will compress the sparse (N) plane
481 PrintSV(sv);
482 }
483
484 // run a battery of benchmarks for variants of mismatch searches
485 // and compare functions
486 {
487 svector_u32 sv1;
488 vector_char_type dna_vect;
489 vector_pairs_type vect_m;
490
491 cout << "Generate benchmark test data." << endl;
492 generate_big_case(sv1, dna_vect);
493
494 generate_mismatches(vect_m, dna_vect, rep);
495 cout << "generated mismatches=" << vect_m.size() << endl;
496
497 cout << "SV mismatch search test" << endl;
498 test_mismatch_search(sv1, vect_m);
499
500 cout << "STL interator mismatch test" << endl;
501 test_vect_mismatch_search(dna_vect, vect_m);
502
503 // test compare functions
504 //
505
506 cout << "strncmp() test" << endl;
507 test_strcmp(dna_vect, vect_m);
508
509 cout << "SV compare test" << endl;
510 test_sv_cmp(sv1, vect_m);
511
512 // really sow, not worth testing
513 //
514 #if 0
515 cout << "SV compare iterators test" << endl;
516 test_sv_cmp_it(sv1, vect_m);
517 #endif
518
519 }
520
521 std::cout << std::endl << "Performance:" << std::endl;
523
524 }
525 catch(std::exception& ex)
526 {
527 std::cerr << ex.what() << std::endl;
528 return 1;
529 }
530
531
532
533 return 0;
534}
535
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
Algorithms for bm::sparse_vector.
Serialization for sparse_vector<>
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:41
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:66
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:150
succinct sparse vector with runtime compression using bit-slicing / transposition method
Definition: bmsparsevec.h:87
size_type size() const BMNOEXCEPT
return size of the vector
Definition: bmsparsevec.h:711
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
Definition: bmsparsevec.h:1804
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
Definition: bmsparsevec.h:572
const_iterator begin() const BMNOEXCEPT
Provide const iterator access to container content
Definition: bmsparsevec.h:2256
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector planes
Definition: bmsparsevec.h:2094
bool sparse_vector_find_first_mismatch(const SV &sv1, const SV &sv2, typename SV::size_type &midx, bm::null_support null_proc=bm::use_null)
Find first mismatch (element which is different) between two sparse vectors (uses linear scan in bit-...
static void test_sv_cmp(const svector_u32 &sv, const vector_pairs_type &vect_m)
Definition: xsample06.cpp:402
void TestSV(const SV &sv, const VECT &vect)
Test sparse vector against reference vector (for QA purposes)
Definition: xsample06.cpp:108
bm::sparse_vector< unsigned, bm::bvector<> > svector_u32
Definition: xsample06.cpp:132
static int compare_sv_it(const svector_u32 &sv1, const svector_u32 &sv2)
Definition: xsample06.cpp:162
static int compare_sv(const svector_u32 &sv1, const svector_u32 &sv2)
Definition: xsample06.cpp:141
static void test_strcmp(const vector_char_type &vect, const vector_pairs_type &vect_m)
Definition: xsample06.cpp:367
void test_sv_cmp_it(const svector_u32 &sv, const vector_pairs_type &vect_m)
Definition: xsample06.cpp:432
static void add_centromer_Ns(svector_u32 &sv, vector_char_type &vect, unsigned csz)
Definition: xsample06.cpp:201
int main(void)
Definition: xsample06.cpp:461
static void test_mismatch_search(const svector_u32 &sv, const vector_pairs_type &vect_m)
Definition: xsample06.cpp:288
bm::chrono_taker ::duration_map_type timing_map
Definition: xsample06.cpp:281
std::vector< char > vector_char_type
Definition: xsample06.cpp:133
static void test_vect_mismatch_search(const vector_char_type &vect, const vector_pairs_type &vect_m)
Definition: xsample06.cpp:321
char int2DNA(unsigned code)
Translate integer code to DNA letter.
Definition: xsample06.cpp:81
void PrintSV(const SV &sv)
Print sparse vector.
Definition: xsample06.cpp:91
static void generate_DNA_vector(svector_u32 &sv, vector_char_type &vect, unsigned sz)
Definition: xsample06.cpp:183
std::vector< std::pair< unsigned, unsigned > > vector_pairs_type
Definition: xsample06.cpp:134
static void generate_mismatches(vector_pairs_type &vect_m, const vector_char_type &vect, unsigned m_count)
Definition: xsample06.cpp:243
static void generate_big_case(svector_u32 &sv, vector_char_type &vect)
Definition: xsample06.cpp:226
unsigned DNA2int(char DNA_bp)
Translate DNA letter to integer code Please note this function uses extended alphabet ATGC plus 'N' a...
Definition: xsample06.cpp:58
const unsigned rep
Definition: xsample06.cpp:280