Use of sparse vector for compressed DNA strings (2 bits per bp) Construction of comparison functions on bit-transposed compressed containers, benchmarking.
#include <assert.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <algorithm>
#include <utility>
#include "bmdbg.h"
using namespace std;
inline unsigned DNA2int(
char DNA_bp)
{
switch (DNA_bp)
{
case 'A':
return 0;
case 'T':
return 1;
case 'G':
return 2;
case 'C':
return 3;
case 'N':
return 4;
case '$':
return 5;
default:
assert(0);
return 0;
}
}
{
static char lut[] = { 'A', 'T', 'G', 'C', 'N', '$' };
if (code < 5)
return lut[code];
assert(0);
return 'N';
}
template<
typename SV>
void PrintSV(
const SV& sv)
{
cout << "size() = " << sv.size() << " : ";
auto it = sv.begin(); auto it_end = sv.end();
for (; it != it_end; ++it)
{
auto v = *it;
cout << bp;
}
cout << endl;
}
template<typename SV, typename VECT>
void TestSV(
const SV& sv,
const VECT& vect)
{
auto it = sv.begin(); auto it_end = sv.end();
auto itv = vect.begin(); auto itv_end = vect.end();
for (; it != it_end && itv != itv_end; ++it, ++itv)
{
auto v = *it;
char bpv = *itv;
assert(bp == bpv);
if (bp != bpv)
{
cerr << "Error: reference vector mismatch!" << endl;
exit(1);
}
}
if (it != it_end || itv != itv_end)
{
cerr << "Error: reference size mismatch!" << endl;
exit(1);
}
}
static
{
if (found)
{
auto v1 = sv1[pos];
auto v2 = sv2[pos];
if (v1 < v2)
return -1;
return 1;
}
return 0;
}
static
{
{
auto v1 = *it1; auto v2 = *it2;
if (v1 == v2)
continue;
if (v1 < v2)
return -1;
return 1;
}
return 0;
}
static
{
for (unsigned i = 0; i < sz; ++i)
{
unsigned code = rand() % 4;
assert(bp == 'A' || bp == 'T' || bp == 'G' || bp == 'C');
vect.push_back(bp);
bi = code;
}
bi.flush();
}
static
{
assert(csz);
assert(csz < sz);
{
vect[i] = 'N';
}
}
static
{
const unsigned case_size = 200000000;
const unsigned centr_size = 7000000;
}
static
unsigned m_count)
{
vect_m.resize(0);
if (!vect.size())
return;
vector_char_type::size_type sz = vect.size();
vector_char_type::size_type delta = sz / m_count;
for (vector_char_type::size_type i = 0; i < sz; i += delta)
{
vector_char_type::value_type v1 = vect[i];
unsigned code = rand() % 4;
vector_char_type::value_type v2 =
int2DNA(code);
if (v2 == v1)
continue;
vect_m.push_back(vector_pairs_type::value_type(unsigned(i), code));
}
for (vector_char_type::size_type i = 1; i < sz / 4; i += (rand()%(1024 * 10)))
{
vector_char_type::value_type v1 = vect[i];
unsigned code = rand() % 4;
vector_char_type::value_type v2 =
int2DNA(code);
if (v2 == v1)
continue;
vect_m.push_back(vector_pairs_type::value_type(unsigned(i), code));
}
}
const unsigned rep = 20000;
static
{
unsigned cnt = 0;
for (unsigned k = 0; k < sz; ++k)
{
unsigned i = vect_m[k].first;
assert(v1 != v2);
cnt += found;
assert(found);
assert(pos == i);
}
assert(cnt == vect_m.size());
}
static
{
unsigned sz = (unsigned)vect_m.size();
unsigned cnt = 0;
for (unsigned k = 0; k < sz; ++k)
{
unsigned i = vect_m[k].first;
vector_char_type::value_type v1 = vect[i];
vector_char_type::value_type v2 =
int2DNA(vect_m[k].second);
assert(v1 != v2);
vect1[i] = v2;
bool found = false;
vector_char_type::size_type pos;
auto it = vect.begin(); auto it_end = vect.end();
auto it1 = vect1.begin();
for (; it != it_end; ++it, ++it1)
{
if (*it != *it1)
{
found = true;
pos = vector_char_type::size_type(it - vect.begin());
break;
}
}
cnt += found;
assert(found);
assert(pos == i);
vect1[i] = v1;
}
assert(cnt == vect_m.size());
}
static
{
unsigned sz = (unsigned)vect_m.size();
unsigned cnt = 0;
unsigned vs = unsigned(vect.size());
for (unsigned k = 0; k < sz; ++k)
{
unsigned i = vect_m[k].first;
vector_char_type::value_type v1 = vect[i];
assert(vect_m[k].second < 4);
vector_char_type::value_type v2 =
int2DNA(vect_m[k].second);
assert(v1 != v2);
vect1[i] = v2;
const char* s1 = vect.data();
const char* s2 = vect1.data();
int res = ::strncmp(s1, s2, vs);
cnt += bool(res);
assert(res);
vect1[i] = v1;
}
assert(cnt == vect_m.size());
}
static
{
unsigned sz = (unsigned)vect_m.size();
unsigned cnt = 0;
for (unsigned k = 0; k < sz; ++k)
{
unsigned i = vect_m[k].first;
assert(v1 != v2);
assert(res != 0);
cnt += bool(res);
}
assert(cnt == vect_m.size());
}
inline
{
unsigned sz = (unsigned)vect_m.size();
unsigned cnt = 0;
for (unsigned k = 0; k < sz; ++k)
{
unsigned i = vect_m[k].first;
assert(v1 != v2);
cnt += bool(res);
assert(res != 0);
}
assert(cnt == vect_m.size());
}
{
try
{
{
const char* s = "ATGTCNNNNNTATA";
{
for (unsigned i = 0; s[i]; ++i)
{
}
bi.flush();
}
}
{
cout << "Generate benchmark test data." << endl;
cout << "generated mismatches=" << vect_m.size() << endl;
cout << "SV mismatch search test" << endl;
cout << "STL interator mismatch test" << endl;
cout << "strncmp() test" << endl;
cout << "SV compare test" << endl;
#if 0
cout << "SV compare iterators test" << endl;
#endif
}
std::cout << std::endl << "Performance:" << std::endl;
}
catch(std::exception& ex)
{
std::cerr << ex.what() << std::endl;
return 1;
}
return 0;
}
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
Algorithms for bm::sparse_vector.
Serialization for sparse_vector<>
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
succinct sparse vector with runtime compression using bit-slicing / transposition method
bvector_type::size_type size_type
size_type size() const BMNOEXCEPT
return size of the vector
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
friend back_insert_iterator
const_iterator begin() const BMNOEXCEPT
Provide const iterator access to container content
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector planes
bool sparse_vector_find_first_mismatch(const SV &sv1, const SV &sv2, typename SV::size_type &midx, bm::null_support null_proc=bm::use_null)
Find first mismatch (element which is different) between two sparse vectors (uses linear scan in bit-...
static void test_sv_cmp(const svector_u32 &sv, const vector_pairs_type &vect_m)
void TestSV(const SV &sv, const VECT &vect)
Test sparse vector against reference vector (for QA purposes)
bm::sparse_vector< unsigned, bm::bvector<> > svector_u32
static int compare_sv_it(const svector_u32 &sv1, const svector_u32 &sv2)
static int compare_sv(const svector_u32 &sv1, const svector_u32 &sv2)
static void test_strcmp(const vector_char_type &vect, const vector_pairs_type &vect_m)
void test_sv_cmp_it(const svector_u32 &sv, const vector_pairs_type &vect_m)
static void add_centromer_Ns(svector_u32 &sv, vector_char_type &vect, unsigned csz)
static void test_mismatch_search(const svector_u32 &sv, const vector_pairs_type &vect_m)
bm::chrono_taker ::duration_map_type timing_map
std::vector< char > vector_char_type
static void test_vect_mismatch_search(const vector_char_type &vect, const vector_pairs_type &vect_m)
char int2DNA(unsigned code)
Translate integer code to DNA letter.
void PrintSV(const SV &sv)
Print sparse vector.
static void generate_DNA_vector(svector_u32 &sv, vector_char_type &vect, unsigned sz)
std::vector< std::pair< unsigned, unsigned > > vector_pairs_type
static void generate_mismatches(vector_pairs_type &vect_m, const vector_char_type &vect, unsigned m_count)
static void generate_big_case(svector_u32 &sv, vector_char_type &vect)
unsigned DNA2int(char DNA_bp)
Translate DNA letter to integer code Please note this function uses extended alphabet ATGC plus 'N' a...