Seach for human mutation (SNP) in within chr1. Benchmark comaprison of different methods
#include <iostream>
#include <sstream>
#include <chrono>
#include <regex>
#include <time.h>
#include <stdio.h>
#include <vector>
#include <chrono>
#include <map>
#include <utility>
using namespace std;
#include "bmdbg.h"
static
{
std::cerr
<< "BitMagic SNP Search Sample Utility (c) 2018" << std::endl
<< "-isnp file-name -- input set file (SNP FASTA) to parse" << std::endl
<< "-svout spase vector output -- sparse vector name to save" << std::endl
<< "-rscout rs-compressed spase vector output -- name to save" << std::endl
<< "-svin sparse vector input -- sparse vector file name to load " << std::endl
<< "-rscin rs-compressed sparse vector input -- file name to load " << std::endl
<< "-diag -- run diagnostics" << std::endl
<< "-timing -- collect timings" << std::endl
;
}
static
{
for (int i = 1; i < argc; ++i)
{
std::string arg = argv[i];
if ((arg == "-h") || (arg == "--help"))
{
return 0;
}
if (arg == "-svout" || arg == "--svout")
{
if (i + 1 < argc)
{
}
else
{
std::cerr << "Error: -svout requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-rscout" || arg == "--rscout")
{
if (i + 1 < argc)
{
}
else
{
std::cerr << "Error: -rscout requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-svin" || arg == "--svin")
{
if (i + 1 < argc)
{
}
else
{
std::cerr << "Error: -svin requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-rscin" || arg == "--rscin")
{
if (i + 1 < argc)
{
}
else
{
std::cerr << "Error: -rscin requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-isnp" || arg == "--isnp" || arg == "-snp" || arg == "--snp")
{
if (i + 1 < argc)
{
}
else
{
std::cerr << "Error: -isnp requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
if (arg == "-bench" || arg == "--bench" || arg == "-b" || arg == "--b")
}
return 0;
}
typedef std::vector<std::pair<unsigned, unsigned> >
vector_pairs;
static
{
std::ifstream fin(fname.c_str(), std::ios::in);
if (!fin.good())
return -1;
unsigned rs_id, rs_pos;
size_t idx;
std::string line;
std::string delim = " \t";
std::regex reg("\\s+");
std::sregex_token_iterator it_end;
unsigned rs_cnt = 0;
for (unsigned i = 0; std::getline(fin, line); ++i)
{
if (line.empty() ||
!isdigit(line.front())
)
continue;
std::sregex_token_iterator it(line.begin(), line.end(), reg, -1);
std::vector<std::string> line_vec(it, it_end);
if (line_vec.empty())
continue;
try
{
rs_id = unsigned(std::stoul(line_vec.at(0), &idx));
{
continue;
}
rs_pos = unsigned(std::stoul(line_vec.at(11), &idx));
++rs_cnt;
}
catch (std::exception& )
{
continue;
}
if (rs_cnt % (4 * 1024) == 0)
std::cout << "\r" << rs_cnt << " / " << i;
}
std::cout << std::endl;
std::cout << "SNP count=" << rs_cnt << std::endl;
return 0;
}
static
{
rand_sampler.
sample(bv_sample, *bv_null, count);
{
unsigned idx = *en;
unsigned v = sv[idx];
}
}
static
{
for (; it != it_end; ++it)
{
if (!it.is_null())
{
std::pair<unsigned, unsigned> pos2rs = std::make_pair(it.pos(), it.value());
vp.push_back(pos2rs);
}
}
}
static
{
for (unsigned i = 0; i < vp.size(); ++i)
{
if (vp[i].second == rs_id)
{
pos = vp[i].first;
return true;
}
}
return false;
}
static
{
const unsigned rs_sample_count = 2000;
std::vector<unsigned> rs_vect;
if (rs_vect.empty())
{
std::cerr << "Benchmark subset empty!" << std::endl;
return;
}
{
for (unsigned i = 0; i < rs_vect.size(); ++i)
{
unsigned rs_id = rs_vect[i];
unsigned rs_pos;
bool found = scanner.
find_eq(sv, rs_id, rs_pos);
if (found)
{
}
else
{
std::cout << "Error: rs_id = " << rs_id << " not found!" << std::endl;
}
}
}
{
for (unsigned i = 0; i < rs_vect.size(); ++i)
{
unsigned rs_id = rs_vect[i];
unsigned rs_pos;
bool found = scanner.
find_eq(csv, rs_id, rs_pos);
if (found)
{
}
else
{
std::cout << "rs_id = " << rs_id << " not found!" << std::endl;
}
}
}
if (vp.size())
{
for (unsigned i = 0; i < rs_vect.size(); ++i)
{
unsigned rs_id = rs_vect[i];
unsigned rs_pos;
if (found)
{
}
else
{
std::cout << "rs_id = " << rs_id << " not found!" << std::endl;
}
}
}
int res = bv_found1.
compare(bv_found2);
if (res != 0)
{
std::cerr << "Error: search discrepancy (sparse search) detected!" << std::endl;
}
res = bv_found1.
compare(bv_found3);
if (res != 0)
{
std::cerr << "Error: search discrepancy (std::vector<>) detected!" << std::endl;
}
}
int main(
int argc,
char *argv[])
{
if (argc < 3)
{
return 1;
}
try
{
if (ret != 0)
return ret;
{
if (res != 0)
{
return res;
}
}
{
}
{
}
{
{
}
{
}
{
std::cerr << "Error: rs-compressed vector check failed!" << std::endl;
return 1;
}
}
{
}
{
}
{
{
std::cout << std::endl
<< "sparse vector statistics:"
<< std::endl;
bm::print_svector_stat(std::cout, sv, true);
}
{
std::cout << std::endl
<< "RS compressed sparse vector statistics:"
<< std::endl;
bm::print_svector_stat(std::cout, csv, true);
}
}
{
}
{
std::cout << std::endl << "Performance:" << std::endl;
}
}
catch (std::exception& ex)
{
std::cerr << "Error:" << ex.what() << std::endl;
return 1;
}
return 0;
}
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
Algorithms for bvector<> (main include)
Generation of random subset.
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
Algorithms for bm::sparse_vector.
Compressed sparse container rsc_sparse_vector<> for integer types.
Serialization for sparse_vector<>
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
const bvector_type * get_null_bvector() const BMNOEXCEPT
Get bit-vector of assigned values or NULL (if not constructed that way)
Constant iterator designed to enumerate "ON" bits.
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Bitvector Bit-vector container with runtime compression of bits.
bool test(size_type n) const BMNOEXCEPT
returns true if bit n is set and false is bit n is 0.
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
void init()
Explicit post-construction initialization. Must be caled to make sure safe use of *_no_check() method...
int compare(const bvector< Alloc > &bvect) const BMNOEXCEPT
Lexicographical comparison with a bitvector.
void set_bit_no_check(size_type n)
Set bit without checking preconditions (size, etc)
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
void sample(BV &bv_out, const BV &bv_in, size_type sample_count)
Get random subset of input vector.
void load_from(const sparse_vector_type &sv_src)
Load compressed vector from a sparse vector (with NULLs)
void load_to(sparse_vector_type &sv) const
Exort compressed vector to a sparse vector (with NULLs)
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, statistics *stat=0)
run memory optimization for all vector slices
bool empty() const BMNOEXCEPT
return true if vector is empty
algorithms for sparse_vector scan/search
void find_eq(const SV &sv, value_type value, bvector_type &bv_out)
find all sparse vector elements EQ to search value
succinct sparse vector with runtime compression using bit-slicing / transposition method
bool equal(const sparse_vector< Val, BV > &sv, bm::null_support null_able=bm::use_null) const BMNOEXCEPT
check if another sparse vector has the same content and size
void push_back(value_type v)
push value back into vector
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
const_iterator end() const BMNOEXCEPT
Provide const iterator access to the end
bool empty() const BMNOEXCEPT
return true if vector is empty
const_iterator begin() const BMNOEXCEPT
Provide const iterator access to container content
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector planes
@ use_null
support "non-assigned" or "NULL" logic
int main(int argc, char *argv[])
static int parse_args(int argc, char *argv[])
std::vector< std::pair< unsigned, unsigned > > vector_pairs
static void build_vector_pairs(const sparse_vector_u32 &sv, vector_pairs &vp)
bm::sparse_vector< unsigned, bm::bvector<> > sparse_vector_u32
static bool search_vector_pairs(const vector_pairs &vp, unsigned rs_id, unsigned &pos)
bm::chrono_taker ::duration_map_type timing_map
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
static void generate_random_subset(const sparse_vector_u32 &sv, std::vector< unsigned > &vect, unsigned count)
static int load_snp_report(const std::string &fname, sparse_vector_u32 &sv)
static void run_benchmark(const sparse_vector_u32 &sv, const rsc_sparse_vector_u32 &csv)