BitMagic-C++
|
Example: Use of bvector<> for k-mer fingerprint. K should be short, no minimizers here. More...
#include <assert.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <map>
#include <algorithm>
#include <utility>
#include <future>
#include <thread>
#include <mutex>
#include <atomic>
#include "bm64.h"
#include "bmalgo.h"
#include "bmserial.h"
#include "bmaggregator.h"
#include "bmsparsevec_compr.h"
#include "bmsparsevec_algo.h"
#include "bmundef.h"
#include "bmdbg.h"
#include "bmtimer.h"
#include "dna_finger.h"
#include "cmd_args.h"
Go to the source code of this file.
Data Structures | |
class | SortCounting_JobFunctor< BV > |
Functor to process job batch (task) More... | |
class | Counting_JobFunctor< DNA_Scan > |
k-mer counting job functor class using bm::aggregator<> More... | |
Typedefs | |
typedef std::vector< char > | vector_char_type |
typedef DNA_FingerprintScanner< bm::bvector<> > | dna_scanner_type |
typedef bm::sparse_vector< unsigned, bm::bvector<> > | sparse_vector_u32 |
typedef bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > | rsc_sparse_vector_u32 |
typedef std::map< unsigned, unsigned > | histogram_map_u32 |
Functions | |
std::atomic_ullong | k_mer_progress_count (0) |
More... | |
static int | load_FASTA (const std::string &fname, vector_char_type &seq_vect) |
really simple FASTA parser (one entry per file) More... | |
bool | get_DNA_code (char bp, bm::id64_t &dna_code) |
More... | |
bool | get_kmer_code (const char *dna, size_t pos, unsigned k_size, bm::id64_t &k_mer) |
Calculate k-mer as an unsigned long integer. More... | |
char | int2DNA (unsigned code) |
Translate integer code to DNA letter. More... | |
void | translate_kmer (std::string &dna, bm::id64_t kmer_code, unsigned k_size) |
Translate k-mer code into ATGC DNA string. More... | |
void | validate_k_mer (const char *dna, size_t pos, unsigned k_size, bm::id64_t k_mer) |
QA function to validate if reverse k-mer decode gives the same string. More... | |
template<typename VECT > | |
void | sort_unique (VECT &vect) |
Auxiliary function to do sort+unique on a vector of ints; removes duplicate elements. More... | |
template<typename VECT , typename COUNT_VECT > | |
void | sort_count (VECT &vect, COUNT_VECT &cvect) |
Auxiliary function to do sort+unique on a vector of ints and save results in a counts vector. More... | |
template<typename BV > | |
void | generate_k_mer_bvector (BV &bv, const vector_char_type &seq_vect, unsigned k_size, bool check) |
This function turns each k-mer into an integer number and encodes it in a bit-vector (presence vector). The natural limitation here is that the integer has to be less than 48 bits (limitation of bm::bvector<>). This method builds a presence k-mer fingerprint vector which can be used for Jaccard distance comparison. More... | |
void | count_kmers (const vector_char_type &seq_vect, unsigned k_size, rsc_sparse_vector_u32 &kmer_counts) |
k-mer counting algorithm using reference sequence, regenerates k-mer codes, sorts them and counts More... | |
template<typename BV > | |
void | count_kmers_parallel (const BV &bv_kmers, const vector_char_type &seq_vect, rsc_sparse_vector_u32 &kmer_counts, unsigned k_size, unsigned concurrency) |
MT k-mer counting. More... | |
template<typename BV > | |
void | count_kmers (const BV &bv_kmers, rsc_sparse_vector_u32 &kmer_counts) |
k-mer counting method using the Bitap algorithm for occurrence search; this method is significantly slower than direct regeneration of k-mer codes followed by sorting and counting. More... | |
template<typename BV > | |
void | count_kmers_parallel (const BV &bv_kmers, rsc_sparse_vector_u32 &kmer_counts, unsigned concurrency) |
Runs k-mer counting in parallel. More... | |
static void | compute_kmer_histogram (histogram_map_u32 &hmap, const rsc_sparse_vector_u32 &kmer_counts) |
Compute a map of how often each k-mer frequency is observed in the k-mer counts vector. More... | |
static void | report_hmap (const string &fname, const histogram_map_u32 &hmap) |
Save TSV report of k-mer frequencies (reverse sorted, most frequent k-mers first) More... | |
template<typename BV > | |
void | compute_frequent_kmers (BV &frequent_bv, const histogram_map_u32 &hmap, const rsc_sparse_vector_u32 &kmer_counts, unsigned percent, unsigned k_size) |
Create vector, representing subset of k-mers of high frequency. More... | |
int | main (int argc, char *argv[]) |
More... | |
Variables | |
std::string | ifa_name |
More... | |
std::string | ikd_name |
More... | |
std::string | ikd_counts_name |
More... | |
std::string | kh_name |
More... | |
std::string | ikd_rep_name |
More... | |
std::string | ikd_freq_name |
More... | |
bool | is_diag = false |
More... | |
bool | is_timing = false |
More... | |
bool | is_bench = false |
unsigned | ik_size = 8 |
More... | |
unsigned | parallel_jobs = 4 |
More... | |
unsigned | f_percent = 5 |
More... | |
bm::chrono_taker ::duration_map_type | timing_map |
More... | |
dna_scanner_type | dna_scanner |
More... | |
Example: Use of bvector<> for k-mer fingerprint. K should be short, no minimizers here.
Definition in file xsample07.cpp.
void compute_frequent_kmers | ( | BV & | frequent_bv, |
const histogram_map_u32 & | hmap, | ||
const rsc_sparse_vector_u32 & | kmer_counts, | ||
unsigned | percent, | ||
unsigned | k_size | ||
) |
Create vector, representing subset of k-mers of high frequency.
frequent_bv | - [out] bit-vector of frequent k-mers (subset of all k-mers) |
hmap | - histogram map of all k-mers |
kmer_counts | - kmer frequency(counts) vector |
percent | - percent of frequent k-mers to build a subset (5%); percent here is of the total number of k-mers (not percent of all occurrences) |
k_size | - k-mer size |
Definition at line 905 of file xsample07.cpp.
References bm::sparse_vector_scanner< SV >::find_eq(), bm::rsc_sparse_vector< Val, SV >::get(), bm::rsc_sparse_vector< Val, SV >::get_null_bvector(), and bm::bvector< Alloc >::iterator_base::valid().
Referenced by main().
|
static |
Compute a map of how often each k-mer frequency is observed in the k-mer counts vector.
hmap | - [out] histogram map |
kmer_counts | - [in] kmer counts vector |
Definition at line 859 of file xsample07.cpp.
References bm::rsc_sparse_vector< Val, SV >::get(), and bm::rsc_sparse_vector< Val, SV >::get_null_bvector().
Referenced by main().
void count_kmers | ( | const BV & | bv_kmers, |
rsc_sparse_vector_u32 & | kmer_counts | ||
) |
k-mer counting method using the Bitap algorithm for occurrence search; this method is significantly slower than direct regeneration of k-mer codes followed by sorting and counting.
Definition at line 653 of file xsample07.cpp.
References dna_scanner, ik_size, bm::rsc_sparse_vector< Val, SV >::set(), and translate_kmer().
|
inline |
k-mer counting algorithm using reference sequence, regenerates k-mer codes, sorts them and counts
Definition at line 408 of file xsample07.cpp.
References get_DNA_code(), get_kmer_code(), and sort_count().
Referenced by count_kmers_parallel().
void count_kmers_parallel | ( | const BV & | bv_kmers, |
const vector_char_type & | seq_vect, | ||
rsc_sparse_vector_u32 & | kmer_counts, | ||
unsigned | k_size, | ||
unsigned | concurrency | ||
) |
MT k-mer counting.
Definition at line 594 of file xsample07.cpp.
References count_kmers(), ik_size, and bm::rank_range_split().
Referenced by main().
void count_kmers_parallel | ( | const BV & | bv_kmers, |
rsc_sparse_vector_u32 & | kmer_counts, | ||
unsigned | concurrency | ||
) |
Runs k-mer counting in parallel.
Definition at line 781 of file xsample07.cpp.
References count_kmers(), dna_scanner, k_mer_progress_count(), and bm::rank_range_split().
void generate_k_mer_bvector | ( | BV & | bv, |
const vector_char_type & | seq_vect, | ||
unsigned | k_size, | ||
bool | check | ||
) |
This function turns each k-mer into an integer number and encodes it in a bit-vector (presence vector). The natural limitation here is that the integer has to be less than 48 bits (limitation of bm::bvector<>). This method builds a presence k-mer fingerprint vector which can be used for Jaccard distance comparison.
bv | - [out] - target bit-vector |
seq_vect | - [in] DNA sequence vector |
k_size | - dimension for k-mer generation |
Definition at line 306 of file xsample07.cpp.
References bm::BM_SORTED, get_DNA_code(), get_kmer_code(), sort_unique(), timing_map, and validate_k_mer().
Referenced by main().
|
inline |
Definition at line 138 of file xsample07.cpp.
Referenced by count_kmers(), generate_k_mer_bvector(), get_kmer_code(), and SortCounting_JobFunctor< BV >::operator()().
|
inline |
Calculate k-mer as an unsigned long integer.
Definition at line 165 of file xsample07.cpp.
References get_DNA_code().
Referenced by count_kmers(), generate_k_mer_bvector(), and SortCounting_JobFunctor< BV >::operator()().
|
inline |
Translate integer code to DNA letter.
Definition at line 192 of file xsample07.cpp.
Referenced by translate_kmer(), and validate_k_mer().
std::atomic_ullong k_mer_progress_count | ( | 0 | ) |
Referenced by count_kmers_parallel(), and Counting_JobFunctor< DNA_Scan >::operator()().
|
static |
really simple FASTA parser (one entry per file)
Definition at line 116 of file xsample07.cpp.
References timing_map.
Referenced by main().
int main | ( | int | argc, |
char * | argv[] | ||
) |
Definition at line 966 of file xsample07.cpp.
References bm::bvector< Alloc >::bit_sub(), bm::BM_GAP, DNA_FingerprintScanner::BuildParallel(), compute_frequent_kmers(), compute_kmer_histogram(), bm::bvector< Alloc >::count(), count_kmers_parallel(), dna_scanner, bm::rsc_sparse_vector< Val, SV >::equal(), f_percent, generate_k_mer_bvector(), bm::rsc_sparse_vector< Val, SV >::get(), ifa_name, ik_size, ikd_counts_name, ikd_freq_name, ikd_name, ikd_rep_name, is_diag, is_timing, kh_name, load_FASTA(), bm::bvector< Alloc >::optimize(), bm::rsc_sparse_vector< Val, SV >::optimize(), parallel_jobs, parse_args(), bm::chrono_taker< TOut >::print_duration_map(), report_hmap(), bm::sparse_vector_find_first_mismatch(), bm::rsc_sparse_vector< Val, SV >::sync(), and timing_map.
|
static |
Save TSV report of k-mer frequencies (reverse sorted, most frequent k-mers first)
Definition at line 881 of file xsample07.cpp.
Referenced by main().
void sort_count | ( | VECT & | vect, |
COUNT_VECT & | cvect | ||
) |
Auxiliary function to do sort+unique on a vector of ints and save results in a counts vector.
Definition at line 268 of file xsample07.cpp.
Referenced by count_kmers(), and SortCounting_JobFunctor< BV >::operator()().
void sort_unique | ( | VECT & | vect | ) |
Auxiliary function to do sort+unique on a vector of ints; removes duplicate elements.
Definition at line 256 of file xsample07.cpp.
Referenced by generate_k_mer_bvector().
|
inline |
Translate k-mer code into ATGC DNA string.
dna | - target string |
kmer_code | - k-mer code |
k_size | - k-mer size |
Definition at line 207 of file xsample07.cpp.
References int2DNA().
Referenced by count_kmers(), and Counting_JobFunctor< DNA_Scan >::operator()().
|
inline |
QA function to validate if reverse k-mer decode gives the same string.
Definition at line 224 of file xsample07.cpp.
References int2DNA().
Referenced by generate_k_mer_bvector().