Use of bvector<> for k-mer fingerprint K should be short, no minimizers are used here (the approach can be used to create a scheme with minimizers).This example:
#include <assert.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <map>
#include <algorithm>
#include <utility>
#include <future>
#include <thread>
#include <mutex>
#include <atomic>
#include "bm64.h"
#include "bmdbg.h"
#include "dna_finger.h"
using namespace std;
#include "cmd_args.h"
static
{
seq_vect.resize(0);
std::ifstream fin(fname.c_str(), std::ios::in);
if (!fin.good())
return -1;
std::string line;
for (unsigned i = 0; std::getline(fin, line); ++i)
{
if (line.empty() ||
line.front() == '>')
continue;
for (std::string::iterator it = line.begin(); it != line.end(); ++it)
seq_vect.push_back(*it);
}
return 0;
}
inline
{
switch (bp)
{
case 'A':
dna_code = 0;
break;
case 'T':
dna_code = 1;
break;
case 'G':
dna_code = 2;
break;
case 'C':
dna_code = 3;
break;
default:
return false;
}
return true;
}
inline
size_t pos, unsigned k_size,
{
unsigned shift = 0;
dna += pos;
for (size_t i = 0; i < k_size; ++i)
{
char bp = dna[i];
if (!valid)
return false;
k_acc |= (dna_code << shift);
shift += 2;
}
k_mer = k_acc;
return true;
}
inline
{
static char lut[] = { 'A', 'T', 'G', 'C', 'N', '$' };
if (code < 5)
return lut[code];
assert(0);
return 'N';
}
inline
{
dna.resize(k_size);
for (size_t i = 0; i < k_size; ++i)
{
unsigned dna_code = unsigned(kmer_code & 3);
dna[i] = bp;
kmer_code >>= 2;
}
assert(!kmer_code);
}
inline
size_t pos, unsigned k_size,
{
for (size_t i = 0; i < k_size; ++i)
{
char bp = dna[pos+i];
unsigned dna_code = unsigned(k_mer & 3ull);
if (bp != bp_c)
{
if (bp == 'N' && bp_c != 'A')
{
cerr << bp << " " << bp_c << endl;
cerr << "Error! N code mismatch at pos = " << pos+i
<< endl;
exit(1);
}
}
k_mer >>= 2;
}
if (k_mer)
{
cerr << "Error! non-zero k-mer remainder at " << pos << endl;
exit(1);
}
}
template<typename VECT>
{
std::sort(vect.begin(), vect.end());
auto last = std::unique(vect.begin(), vect.end());
vect.erase(last, vect.end());
}
template<typename VECT, typename COUNT_VECT>
{
if (!vect.size())
return;
std::sort(vect.begin(), vect.end());
typename VECT::value_type prev = vect[0];
typename COUNT_VECT::value_type cnt = 1;
auto vsize = vect.size();
size_t i = 1;
for (; i < vsize; ++i)
{
auto v = vect[i];
if (v == prev)
{
++cnt;
continue;
}
cvect.inc_not_null(prev, cnt);
prev = v; cnt = 1;
}
cvect.inc_not_null(prev, cnt);
assert(cvect.in_sync());
}
template<typename BV>
unsigned k_size,
bool check)
{
bv.clear();
bv.init();
if (seq_vect.empty())
return;
const char* dna_str = &seq_vect[0];
std::vector<bm::id64_t> k_buf;
k_buf.reserve(chunk_size);
{
vector_char_type::size_type dna_sz = seq_vect.size()-(k_size-1);
vector_char_type::size_type pos = 0;
bool valid = false;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
k_buf.push_back(k_mer_code);
break;
}
}
const unsigned k_shift = (k_size-1) * 2;
if (valid)
{
for (++pos; pos < dna_sz; ++pos)
{
if (!valid)
{
pos += k_size;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
k_buf.push_back(k_mer_code);
break;
}
}
continue;
}
k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
k_buf.push_back(k_mer_code);
if (check)
{
assert(valid);
assert(k_check == k_mer_code);
}
if (k_buf.size() >= chunk_size)
{
if (k_buf.size())
{
k_buf.resize(0);
bv.optimize();
}
float pcnt = float(pos) / float(dna_sz);
pcnt *= 100;
cout << "\r" << unsigned(pcnt) << "% of " << dna_sz
<< " (" << (pos+1) <<") "
<< flush;
}
}
}
if (k_buf.size())
{
cout << "Unique k-mers: " << k_buf.size() << endl;
}
}
bv.optimize();
}
inline
unsigned k_size,
{
if (seq_vect.empty())
return;
const char* dna_str = &seq_vect[0];
std::vector<bm::id64_t> k_buf;
k_buf.reserve(chunk_size);
vector_char_type::size_type dna_sz = seq_vect.size()-(k_size-1);
vector_char_type::size_type pos = 0;
bool valid = false;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
k_buf.push_back(k_mer_code);
break;
}
}
const unsigned k_shift = (k_size-1) * 2;
if (valid)
{
for (++pos; pos < dna_sz; ++pos)
{
if (!valid)
{
pos += k_size;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
k_buf.push_back(k_mer_code);
break;
}
}
continue;
}
k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
k_buf.push_back(k_mer_code);
if (k_buf.size() == chunk_size)
{
k_buf.resize(0);
float pcnt = float(pos) / float(dna_sz);
pcnt *= 100;
cout << "\r" << unsigned(pcnt) << "% of " << dna_sz
<< " (" << (pos+1) <<") "
<< flush;
}
}
}
}
template<typename BV>
{
public:
unsigned k_size,
: m_seq_vect(seq_vect), m_k_size(k_size), m_from(from), m_to(to),
m_kmer_counts(kmer_counts)
{}
: m_seq_vect(func.m_seq_vect), m_k_size(func.m_k_size),
m_from(func.m_from), m_to(func.m_to),
m_kmer_counts(func.m_kmer_counts)
{}
{
kmer_counts_part.sync();
if (m_seq_vect.empty())
return;
const char* dna_str = &m_seq_vect[0];
std::vector<bm::id64_t> k_buf;
k_buf.reserve(chunk_size);
const auto k_size = m_k_size;
vector_char_type::size_type dna_sz = m_seq_vect.size()-(m_k_size-1);
vector_char_type::size_type pos = 0;
bool valid = false;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
if (k_mer_code >= m_from && k_mer_code <= m_to)
k_buf.push_back(k_mer_code);
break;
}
}
const unsigned k_shift = (k_size-1) * 2;
if (valid)
{
for (++pos; pos < dna_sz; ++pos)
{
if (!valid)
{
pos += k_size;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
if (k_mer_code >= m_from && k_mer_code <= m_to)
k_buf.push_back(k_mer_code);
break;
}
}
continue;
}
k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
if (k_mer_code >= m_from && k_mer_code <= m_to)
{
k_buf.push_back(k_mer_code);
if (k_buf.size() == chunk_size)
{
k_buf.resize(0);
}
}
}
}
{
static std::mutex mtx_counts_lock;
std::lock_guard<std::mutex> guard(mtx_counts_lock);
}
}
private:
unsigned m_k_size;
};
template<typename BV>
unsigned k_size,
unsigned concurrency)
{
if (split_rank < concurrency || concurrency < 2)
{
return;
}
std::vector<std::future<void> > futures;
futures.reserve(concurrency);
for (size_t k = 0; k < pair_vect.size(); ++k)
{
futures.emplace_back(std::async(std::launch::async,
pair_vect[k].first,
pair_vect[k].second,
kmer_counts)));
}
for (auto& e : futures)
{
unsigned m = 0;
while(1)
{
std::future_status status = e.wait_for(std::chrono::seconds(60));
if (status == std::future_status::ready)
break;
cout << "\r" << ++m << " min" << flush;
}
}
cout << endl;
}
template<typename BV>
{
std::string kmer_str;
typename BV::size_type cnt = 0;
typename BV::enumerator en = bv_kmers.first();
for ( ;en.valid(); ++en, ++cnt)
{
auto k_mer_code = *en;
typename BV::size_type count =
dna_scanner.FindCount(kmer_str);
assert(count);
kmer_counts.
set(k_mer_code, (
unsigned)count);
if ((cnt % 1000) == 0)
cout << "\r" << cnt << flush;
}
}
template<typename DNA_Scan>
{
public:
: m_parent_scanner(parent_scanner), m_bv_kmers(bv_kmers),
m_from(from), m_to(to), m_kmer_counts(kmer_counts)
{}
: m_parent_scanner(func.m_parent_scanner), m_bv_kmers(func.m_bv_kmers),
m_from(func.m_from), m_to(func.m_to), m_kmer_counts(func.m_kmer_counts)
{}
{
std::string kmer_str;
static std::mutex mtx_counts_lock;
for ( ;en.
valid(); ++en, ++cnt)
{
auto k_mer_code = *en;
if (k_mer_code > m_to)
break;
m_Agg.reset();
m_Agg.set_compute_count(true);
for (size_t i = 0; i < kmer_str.size(); ++i)
{
const bvector_type& bv_mask = m_parent_scanner.GetVector(kmer_str[i]);
m_Agg.add(&bv_mask);
}
m_Agg.combine_shift_right_and(bv_search_res);
{
std::lock_guard<std::mutex> guard(mtx_counts_lock);
m_kmer_counts.
set(k_mer_code,
unsigned(search_count));
}
}
}
private:
const DNA_Scan& m_parent_scanner;
};
template<typename BV>
unsigned concurrency)
{
if (split_rank < concurrency || concurrency < 2)
{
return;
}
std::vector<std::future<void> > futures;
futures.reserve(concurrency);
for (size_t k = 0; k < pair_vect.size(); ++k)
{
futures.emplace_back(std::async(std::launch::async,
pair_vect[k].first,
pair_vect[k].second,
kmer_counts)));
}
for (auto& e : futures)
{
unsigned long long c_prev = 0;
while(1)
{
std::future_status status = e.wait_for(std::chrono::seconds(60));
if (status == std::future_status::ready)
break;
auto delta = c - c_prev;
c_prev = c;
auto remain_cnt = cnt - c;
auto remain_min = remain_cnt / delta;
cout << "\r" << c << ": progress per minute=" << delta;
if (remain_min < 120)
{
cout << " wait for " << remain_min << "m " << flush;
}
else
{
auto remain_h = remain_min / 60;
cout << " wait for " << remain_h << "h " << flush;
}
}
}
cout << endl;
}
static
{
auto en = bv_null->first();
{
auto kmer_code = *en;
auto kmer_count = kmer_counts.
get(kmer_code);
auto mit = hmap.find(kmer_count);
if (mit == hmap.end())
hmap[kmer_count] = 1;
else
mit->second++;
}
}
static
{
ofstream outf;
outf.open(fname, ios::out | ios::trunc );
outf << "kmer count \t number of kmers\n";
auto it = hmap.rbegin(); auto it_end = hmap.rend();
for (; it != it_end; ++it)
{
outf << it->first << "\t" << it->second << endl;
}
}
template<typename BV>
unsigned percent,
unsigned k_size)
{
(void)k_size;
frequent_bv.clear();
if (!percent)
return;
BV bv_found;
auto total_kmers = bv_null->count();
bm::id64_t target_f_count = (total_kmers * percent) / 100;
auto it = hmap.rbegin();
auto it_end = hmap.rend();
for (; it != it_end; ++it)
{
auto kmer_count = it->first;
scanner.
find_eq(kmer_counts, kmer_count, bv_found);
auto found_cnt = bv_found.count();
(void)found_cnt;
assert(found_cnt);
{
{
auto kmer_code = *en;
unsigned k_count = kmer_counts.
get(kmer_code);
if (k_count == 1)
continue;
if (it->second == 1)
{
assert(k_count == kmer_count);
}
frequent_bv.set(kmer_code);
if (kmer_count >= target_f_count)
{
frequent_bv.optimize();
return;
}
target_f_count -= 1;
}
}
}
frequent_bv.optimize();
}
int main(
int argc,
char *argv[])
{
try
{
if (ret != 0)
{
cerr << "cmd-line parse error. " << endl;
return ret;
}
{
if (res != 0)
return res;
std::cout << "FASTA sequence size=" << seq_vect.size() << std::endl;
}
if (seq_vect.size())
{
cout <<
"k-mer generation for k=" <<
ik_size << endl;
cout <<
"Found " << bv_kmers.
count() <<
" k-mers." << endl;
{
bm::print_bvector_stat(cout,bv_kmers);
size_t blob_size = bm::compute_serialization_size(bv_kmers);
cout << "DNA 2-bit coded FASTA size=" << seq_vect.size()/4 << endl;
cout << " Compressed size=" << blob_size << endl;
}
}
{
bm::SaveBVector(
ikd_name.c_str(), bv_kmers);
}
if (seq_vect.size())
{
}
if (seq_vect.size() &&
{
rsc_kmer_counts.sync();
rsc_kmer_counts2.sync();
cout << " Searching for k-mer counts..." << endl;
{
cout << " ... using bm::aggregator<>" << endl;
rsc_kmer_counts.optimize();
}
{
cout << " ... using std::sort() and count" << endl;
rsc_kmer_counts2.optimize();
{
if (res)
{
cerr << "Error: Count vector save failed!" << endl;
exit(1);
}
}
{
bool eq = rsc_kmer_counts.equal(rsc_kmer_counts2);
if(!eq)
{
rsc_kmer_counts2,
idx);
auto v1 = rsc_kmer_counts.get(idx);
auto v2 = rsc_kmer_counts2.get(idx);
cerr << "Mismatch at idx=" << idx << " v1=" << v1 << " v2=" << v2 << endl;
assert(found); (void)found;
assert(eq);
cerr << "Integrity check failed!" << endl;
exit(1);
}
}
}
{
}
{
}
{
}
{
}
cout << "Found frequent k-mers: " << bv_freq.count() << endl;
{
}
}
{
std::cout << std::endl << "Performance:" << std::endl;
}
}
catch(std::exception& ex)
{
std::cerr << ex.what() << std::endl;
return 1;
}
return 0;
}
Algorithms for fast aggregation of N bvectors.
Algorithms for bvector<> (main include)
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Algorithms for bm::sparse_vector.
Compressed sparse container rsc_sparse_vector<> for integer types.
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
k-mer counting job functor class using bm::aggregator<>
DNA_Scan::bvector_type bvector_type
Counting_JobFunctor(const DNA_Scan &parent_scanner, const bvector_type &bv_kmers, size_type from, size_type to, rsc_sparse_vector_u32 &kmer_counts)
constructor
bvector_type::size_type size_type
void operator()()
Main logic (functor)
Utility for keeping all DNA finger print vectors and search using various techniques.
void BuildParallel(const vector< char > &sequence, unsigned threads)
Build fingerprint bit-vectors using bulk insert iterator and parallel processing.
Functor to process job batch (task)
void operator()()
Main logic (functor)
SortCounting_JobFunctor(const vector_char_type &seq_vect, unsigned k_size, size_type from, size_type to, rsc_sparse_vector_u32 &kmer_counts)
constructor
bvector_type::size_type size_type
Constant iterator designed to enumerate "ON" bits.
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Bitvector Bit-vector container with runtime compression of bits.
size_type count() const BMNOEXCEPT
population count (count of ON bits)
bm::bvector< Alloc > & bit_sub(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode=opt_none)
3-operand SUB : this := bv1 MINUS bv2 SUBtraction is also known as AND NOT
void optimize(bm::word_t *temp_block=0, optmode opt_mode=opt_compress, statistics *stat=0)
Optimize memory bitvector's memory allocation.
bvector_size_type size_type
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
void merge_not_null(rsc_sparse_vector< Val, SV > &csv)
merge two vectors (argument gets destroyed) It is important that both vectors have the same NULL vect...
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
value_type get(size_type idx) const BMNOEXCEPT
get specified element without bounds checking
const bvector_type * get_null_bvector() const BMNOEXCEPT
Get bit-vector of assigned values (or NULL)
bool in_sync() const BMNOEXCEPT
returns true if prefix sum table is in sync with the vector
SV::bvector_type bvector_type
algorithms for sparse_vector scan/search
void find_eq(const SV &sv, value_type value, bvector_type &bv_out)
find all sparse vector elements EQ to search value
succinct sparse vector with runtime compression using bit-slicing / transposition method
@ BM_SORTED
input set is sorted (ascending order)
@ BM_GAP
GAP compression is ON.
void rank_range_split(const BV &bv, typename BV::size_type rank, PairVect &target_v)
Algorithm to identify bit-vector ranges (splits) for the rank.
bool sparse_vector_find_first_mismatch(const SV &sv1, const SV &sv2, typename SV::size_type &midx, bm::null_support null_proc=bm::use_null)
Find first mismatch (element which is different) between two sparse vectors (uses linear scan in bit-...
unsigned long long int id64_t
bm::bvector ::size_type bv_size_type
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
static int parse_args(int argc, char *argv[])
bm::aggregator< bm::bvector<> > aggregator_type
std::vector< char > vector_char_type
DNA_FingerprintScanner< bm::bvector<> > dna_scanner_type
int main(int argc, char *argv[])
void validate_k_mer(const char *dna, size_t pos, unsigned k_size, bm::id64_t k_mer)
QA function to validate if reverse k-mer decode gives the same string.
void compute_frequent_kmers(BV &frequent_bv, const histogram_map_u32 &hmap, const rsc_sparse_vector_u32 &kmer_counts, unsigned percent, unsigned k_size)
Create vector, representing subset of k-mers of high frequency.
void generate_k_mer_bvector(BV &bv, const vector_char_type &seq_vect, unsigned k_size, bool check)
This function turns each k-mer into an integer number and encodes it in a bit-vector (presense vector...
void translate_kmer(std::string &dna, bm::id64_t kmer_code, unsigned k_size)
Translate k-mer code into ATGC DNA string.
static void report_hmap(const string &fname, const histogram_map_u32 &hmap)
Save TSV report of k-mer frequences (reverse sorted, most frequent k-mers first)
std::map< unsigned, unsigned > histogram_map_u32
static void compute_kmer_histogram(histogram_map_u32 &hmap, const rsc_sparse_vector_u32 &kmer_counts)
Compute a map of how often each k-mer frequency is observed in the k-mer counts vector.
bm::sparse_vector< unsigned, bm::bvector<> > sparse_vector_u32
void count_kmers(const vector_char_type &seq_vect, unsigned k_size, rsc_sparse_vector_u32 &kmer_counts)
k-mer counting algorithm using reference sequence, regenerates k-mer codes, sorts them and counts
dna_scanner_type dna_scanner
std::string ikd_freq_name
std::string ikd_counts_name
bm::chrono_taker ::duration_map_type timing_map
bool get_kmer_code(const char *dna, size_t pos, unsigned k_size, bm::id64_t &k_mer)
Calculate k-mer as an unsigned long integer.
std::vector< char > vector_char_type
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
void sort_unique(VECT &vect)
Auxiliary function to do sort+unique on a vactor of ints removes duplicate elements.
char int2DNA(unsigned code)
Translate integer code to DNA letter.
void count_kmers_parallel(const BV &bv_kmers, const vector_char_type &seq_vect, rsc_sparse_vector_u32 &kmer_counts, unsigned k_size, unsigned concurrency)
MT k-mer counting.
static int load_FASTA(const std::string &fname, vector_char_type &seq_vect)
really simple FASTA parser (one entry per file)
void sort_count(VECT &vect, COUNT_VECT &cvect)
Auxiliary function to do sort+unique on a vactor of ints and save results in a counts vector.
bool get_DNA_code(char bp, bm::id64_t &dna_code)
std::atomic_ullong k_mer_progress_count(0)