BitMagic-C++
Data Structures | Typedefs | Functions | Variables
xsample07a.cpp File Reference

Example: Use of bvector<> for k-mer fingerprint K should be short, no minimizers here (k < 24) More...

#include <assert.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <utility>
#include <memory>
#include <future>
#include <thread>
#include <mutex>
#include <atomic>
#include "bm64.h"
#include "bmalgo.h"
#include "bmserial.h"
#include "bmaggregator.h"
#include "bmsparsevec_compr.h"
#include "bmsparsevec_algo.h"
#include "bmrandom.h"
#include "bmdbg.h"
#include "bmtimer.h"
#include "bmundef.h"
#include "dna_finger.h"
#include "cmd_args.h"
Include dependency graph for xsample07a.cpp:

Go to the source code of this file.

Data Structures

class  CSequenceColl
 Collection of sequences and k-mer fingerprint vectors. More...
 
class  CSeqGroup
 Group (clustrer) of sequences. More...
 
class  CSeqClusters
 
struct  CKMerAcc
 Utility class to accumulate cahnges to cluster before commiting it (mutex syncronous operation) More...
 

Typedefs

typedef bm::bvector bvector_type
 
typedef std::vector< char > vector_char_type
 
typedef bm::dynamic_heap_matrix< unsigned, bm::bvector<>::allocator_type > distance_matrix_type
 
typedef std::vector< std::unique_ptr< bvector_type > > bvector_ptr_vector_type
 
typedef bvector_type::size_type bv_size_type
 
typedef std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
 

Functions

template<typename FV >
void wait_for_slot (FV &futures, unsigned *parallel_cnt, unsigned concurrency)
 wait for any opening in a list of futures used to schedule parallel tasks with CPU overbooking control More...
 
static int load_FASTA (const std::string &fname, CSequenceColl &seq_coll)
 Load multi-sequence FASTA. More...
 
static void save_kmer_buffers (const std::string &fname, const CSequenceColl &seq_coll)
 save k-mer vectors to a file More...
 
static void load_kmer_buffers (const std::string &fname, CSequenceColl &seq_coll)
 Load k-mer vectors. More...
 
bool get_DNA_code (char bp, bm::id64_t &dna_code)
 
bool get_kmer_code (const char *dna, size_t pos, unsigned k_size, bm::id64_t &k_mer)
 Calculate k-mer as an unsigned long integer. More...
 
char int2DNA (unsigned code)
 Translate integer code to DNA letter. More...
 
void translate_kmer (std::string &dna, bm::id64_t kmer_code, unsigned k_size)
 Translate k-mer code into ATGC DNA string. More...
 
template<typename BV >
void generate_k_mer_bvector (BV &bv, const vector_char_type &seq_vect, unsigned k_size, std::vector< bm::id64_t > &k_buf, const bm::id64_t chunk_size=400000000)
 This function turns each k-mer into an integer number and encodes it in a bit-vector (presense vector) The natural limitation here is that integer has to be less tha 48-bits (limitations of bm::bvector<>) This method build a presense k-mer fingerprint vector which can be used for Jaccard distance comparison. More...
 
std::atomic_ullong k_mer_progress_count (0)
 
static void generate_k_mers (CSequenceColl &seq_coll, unsigned k_size, size_t from, size_t to)
 
static void generate_k_mers_parallel (CSequenceColl &seq_coll, unsigned k_size, unsigned concurrency)
 
void resolve_duplicates (CSeqGroup &seq_group1, CSeqGroup &seq_group2, const CSequenceColl &seq_coll)
 Resolve duplicate members between two groups. More...
 
static void compute_and_sim_row (unsigned *row, const bm::bvector<> *bv_i, size_t i, const std::vector< std::unique_ptr< bvector_type > > &k_mers_vect)
 Compute similarity distances for one row/vector (1:N) of distance matrix. More...
 
static void compute_and_sim (distance_matrix_type &dm, const CSequenceColl &seq_coll, const bm::bvector<> &bv_mem, bm::bvector<>::size_type bv_mem_cnt, unsigned concurrency)
 Compute similarity distances matrix (COUNT(AND(a, b)) More...
 
static void compute_seq_group_union (CSeqGroup &seq_group, const CSequenceColl &seq_coll)
 Compute union (Universe) of all k-mers in the cluster group Implemented as a OR of all k-mer fingerprints. More...
 
static void compute_group (CSeqGroup &seq_group, const CSequenceColl &seq_coll, const bm::bvector<> &bv_exceptions, float similarity_cut_off)
 
static void assign_to_best_cluster (CSeqClusters &cluster_groups, const CSequenceColl &seq_coll, const bm::bvector<> &bv_seq_ids, bm::bvector<>::size_type seq_id_from, bm::bvector<>::size_type seq_id_to)
 Compute AND similarity to all available clusters assign to the most similar using cluster representative. More...
 
static void assign_to_best_cluster_union (CSeqClusters &cluster_groups, const CSequenceColl &seq_coll, const bm::bvector<> &bv_seq_ids, bm::bvector<>::size_type seq_id_from, bm::bvector<>::size_type seq_id_to)
 Compute AND similarity to all available clusters assign to the most similar using UNION of k-mers in the cluster This is a more relaxed assignmnet, used when representative does not work. More...
 
static void compute_random_clusters (CSeqClusters &cluster_groups, const CSequenceColl &seq_coll, const bm::bvector<> &bv_total, bm::random_subset< bvector_type > &rsub, unsigned num_clust, float similarity_cut_off, unsigned concurrency)
 Pick random sequences as cluster seed elements, try attach initial sequences based on weighted similarity measure. More...
 
static void compute_jaccard_clusters (CSeqClusters &seq_clusters, const CSequenceColl &seq_coll, unsigned num_clust, float similarity_cut_off, unsigned concurrency)
 
int main (int argc, char *argv[])
 

Variables

std::string ifa_name
 
std::string ikd_name
 
std::string ikd_counts_name
 
std::string kh_name
 
std::string ikd_rep_name
 
std::string ikd_freq_name
 
bool is_diag = false
 
bool is_timing = false
 
bool is_bench = false
 
unsigned ik_size = 8
 
unsigned parallel_jobs = 4
 
unsigned f_percent = 5
 
bm::chrono_taker::duration_map_type timing_map
 

Detailed Description

Example: Use of bvector<> for k-mer fingerprint K should be short, no minimizers here (k < 24)

Example loads FASTA file (large multi-molecule file is expected, builds a collection of k-mers for each molecule and runs clusterization algorithm on the input collection using set intersect (logical AND) as a similarity measure.

Example uses std::async for running parallel jobs.

Definition in file xsample07a.cpp.

Typedef Documentation

◆ bv_ranges_vector

typedef std::vector<std::pair<bv_size_type, bv_size_type> > bv_ranges_vector
Examples:
xsample07a.cpp.

Definition at line 99 of file xsample07a.cpp.

◆ bv_size_type

Examples:
xsample07a.cpp.

Definition at line 98 of file xsample07a.cpp.

◆ bvector_ptr_vector_type

typedef std::vector<std::unique_ptr<bvector_type> > bvector_ptr_vector_type
Examples:
xsample07a.cpp.

Definition at line 97 of file xsample07a.cpp.

◆ bvector_type

Examples:
xsample07.cpp, and xsample07a.cpp.

Definition at line 94 of file xsample07a.cpp.

◆ distance_matrix_type

typedef bm::dynamic_heap_matrix<unsigned, bm::bvector<>::allocator_type> distance_matrix_type
Examples:
xsample07a.cpp.

Definition at line 96 of file xsample07a.cpp.

◆ vector_char_type

typedef std::vector<char> vector_char_type
Examples:
xsample07a.cpp.

Definition at line 95 of file xsample07a.cpp.

Function Documentation

◆ assign_to_best_cluster()

static void assign_to_best_cluster ( CSeqClusters cluster_groups,
const CSequenceColl seq_coll,
const bm::bvector<> &  bv_seq_ids,
bm::bvector<>::size_type  seq_id_from,
bm::bvector<>::size_type  seq_id_to 
)
static

Compute AND similarity to all available clusters assign to the most similar using cluster representative.

Examples:
xsample07a.cpp.

Definition at line 1245 of file xsample07a.cpp.

References BM_DECLARE_TEMP_BLOCK, bm::count_and(), bm::deserialize(), CSequenceColl::get_buf(), CSeqClusters::get_group(), CSeqGroup::get_rep(), CSeqClusters::groups_size(), CSeqGroup::merge_member_sync(), and bm::bvector< Alloc >::set_allocator_pool().

Referenced by compute_jaccard_clusters().

◆ assign_to_best_cluster_union()

static void assign_to_best_cluster_union ( CSeqClusters cluster_groups,
const CSequenceColl seq_coll,
const bm::bvector<> &  bv_seq_ids,
bm::bvector<>::size_type  seq_id_from,
bm::bvector<>::size_type  seq_id_to 
)
static

Compute AND similarity to all available clusters assign to the most similar using UNION of k-mers in the cluster This is a more relaxed assignmnet, used when representative does not work.

Examples:
xsample07a.cpp.

Definition at line 1315 of file xsample07a.cpp.

References CSeqGroup::add_member_sync(), BM_DECLARE_TEMP_BLOCK, CSeqGroup::count_and_union_sync(), bm::deserialize(), CSequenceColl::get_buf(), CSeqClusters::get_group(), bm::bvector< Alloc >::enumerator::go_to(), CSeqClusters::groups_size(), bm::bvector< Alloc >::set_allocator_pool(), and bm::bvector< Alloc >::iterator_base::valid().

Referenced by compute_jaccard_clusters().

◆ compute_and_sim()

static void compute_and_sim ( distance_matrix_type dm,
const CSequenceColl seq_coll,
const bm::bvector<> &  bv_mem,
bm::bvector<>::size_type  bv_mem_cnt,
unsigned  concurrency 
)
static

Compute similarity distances matrix (COUNT(AND(a, b))

Examples:
xsample07a.cpp.

Definition at line 910 of file xsample07a.cpp.

References compute_and_sim_row(), CSequenceColl::deserialize_k_mers(), bm::random_subset< BV >::sample(), and wait_for_slot().

Referenced by CSeqClusters::elect_leaders().

◆ compute_and_sim_row()

static void compute_and_sim_row ( unsigned *  row,
const bm::bvector<> *  bv_i,
size_t  i,
const std::vector< std::unique_ptr< bvector_type > > &  k_mers_vect 
)
static

Compute similarity distances for one row/vector (1:N) of distance matrix.

Examples:
xsample07a.cpp.

Definition at line 890 of file xsample07a.cpp.

References bm::bvector< Alloc >::count(), and bm::count_and().

Referenced by compute_and_sim().

◆ compute_group()

static void compute_group ( CSeqGroup seq_group,
const CSequenceColl seq_coll,
const bm::bvector<> &  bv_exceptions,
float  similarity_cut_off 
)
static

◆ compute_jaccard_clusters()

static void compute_jaccard_clusters ( CSeqClusters seq_clusters,
const CSequenceColl seq_coll,
unsigned  num_clust,
float  similarity_cut_off,
unsigned  concurrency 
)
static

◆ compute_random_clusters()

static void compute_random_clusters ( CSeqClusters cluster_groups,
const CSequenceColl seq_coll,
const bm::bvector<> &  bv_total,
bm::random_subset< bvector_type > &  rsub,
unsigned  num_clust,
float  similarity_cut_off,
unsigned  concurrency 
)
static

Pick random sequences as cluster seed elements, try attach initial sequences based on weighted similarity measure.

Examples:
xsample07a.cpp.

Definition at line 1374 of file xsample07a.cpp.

References CSeqClusters::add_group(), compute_group(), bm::bvector< Alloc >::first(), bm::random_subset< BV >::sample(), and wait_for_slot().

Referenced by compute_jaccard_clusters().

◆ compute_seq_group_union()

static void compute_seq_group_union ( CSeqGroup seq_group,
const CSequenceColl seq_coll 
)
static

Compute union (Universe) of all k-mers in the cluster group Implemented as a OR of all k-mer fingerprints.

Examples:
xsample07a.cpp.

Definition at line 973 of file xsample07a.cpp.

References BM_DECLARE_TEMP_BLOCK, bm::bvector< Alloc >::clear(), bm::deserialize(), CSequenceColl::get_buf(), CSeqGroup::get_kmer_union(), CSeqGroup::get_members(), bm::bvector< Alloc >::optimize(), and bm::bvector< Alloc >::iterator_base::valid().

Referenced by CSeqClusters::elect_leaders().

◆ generate_k_mer_bvector()

template<typename BV >
void generate_k_mer_bvector ( BV &  bv,
const vector_char_type seq_vect,
unsigned  k_size,
std::vector< bm::id64_t > &  k_buf,
const bm::id64_t  chunk_size = 400000000 
)

This function turns each k-mer into an integer number and encodes it in a bit-vector (presense vector) The natural limitation here is that integer has to be less tha 48-bits (limitations of bm::bvector<>) This method build a presense k-mer fingerprint vector which can be used for Jaccard distance comparison.

Parameters
bv- [out] - target bit-vector
seq_vect- [out] DNA sequence vector
k-size- dimention for k-mer generation
k_buf- sort buffer for generated k-mers
chunk_size- sort buffer size (number of k-mers per sort)
Examples:
xsample07a.cpp.

Definition at line 474 of file xsample07a.cpp.

References bm::BM_SORTED, get_DNA_code(), get_kmer_code(), and k_mer_progress_count().

Referenced by generate_k_mers().

◆ generate_k_mers()

static void generate_k_mers ( CSequenceColl seq_coll,
unsigned  k_size,
size_t  from,
size_t  to 
)
static

◆ generate_k_mers_parallel()

static void generate_k_mers_parallel ( CSequenceColl seq_coll,
unsigned  k_size,
unsigned  concurrency 
)
static

◆ get_DNA_code()

bool get_DNA_code ( char  bp,
bm::id64_t dna_code 
)
inline
Examples:
xsample07a.cpp.

Definition at line 374 of file xsample07a.cpp.

Referenced by generate_k_mer_bvector(), and get_kmer_code().

◆ get_kmer_code()

bool get_kmer_code ( const char *  dna,
size_t  pos,
unsigned  k_size,
bm::id64_t k_mer 
)
inline

Calculate k-mer as an unsigned long integer.

Returns
true - if k-mer is "true" (not 'NNNNNN')
Examples:
xsample07a.cpp.

Definition at line 402 of file xsample07a.cpp.

References get_DNA_code().

Referenced by generate_k_mer_bvector().

◆ int2DNA()

char int2DNA ( unsigned  code)
inline

Translate integer code to DNA letter.

Examples:
xsample07a.cpp.

Definition at line 429 of file xsample07a.cpp.

Referenced by translate_kmer().

◆ k_mer_progress_count()

std::atomic_ullong k_mer_progress_count ( )

◆ load_FASTA()

static int load_FASTA ( const std::string &  fname,
CSequenceColl seq_coll 
)
static

Load multi-sequence FASTA.

Examples:
xsample07a.cpp.

Definition at line 244 of file xsample07a.cpp.

References CSequenceColl::add_sequence().

Referenced by main().

◆ load_kmer_buffers()

static void load_kmer_buffers ( const std::string &  fname,
CSequenceColl seq_coll 
)
static

Load k-mer vectors.

Examples:
xsample07a.cpp.

Definition at line 332 of file xsample07a.cpp.

References CSequenceColl::set_buffer().

Referenced by main().

◆ main()

int main ( int  argc,
char *  argv[] 
)

◆ resolve_duplicates()

void resolve_duplicates ( CSeqGroup seq_group1,
CSeqGroup seq_group2,
const CSequenceColl seq_coll 
)

◆ save_kmer_buffers()

static void save_kmer_buffers ( const std::string &  fname,
const CSequenceColl seq_coll 
)
static

save k-mer vectors to a file

Examples:
xsample07a.cpp.

Definition at line 294 of file xsample07a.cpp.

References CSequenceColl::get_buf(), CSequenceColl::get_buf_size(), and CSequenceColl::size().

Referenced by main().

◆ translate_kmer()

void translate_kmer ( std::string &  dna,
bm::id64_t  kmer_code,
unsigned  k_size 
)
inline

Translate k-mer code into ATGC DNA string.

Parameters
dna- target string
k_mer- k-mer code
k_size-
Examples:
xsample07a.cpp.

Definition at line 444 of file xsample07a.cpp.

References int2DNA().

◆ wait_for_slot()

template<typename FV >
void wait_for_slot ( FV &  futures,
unsigned *  parallel_cnt,
unsigned  concurrency 
)

wait for any opening in a list of futures used to schedule parallel tasks with CPU overbooking control

Examples:
xsample07a.cpp.

Definition at line 111 of file xsample07a.cpp.

Referenced by compute_and_sim(), and compute_random_clusters().

Variable Documentation

◆ f_percent

unsigned f_percent = 5
Examples:
xsample07a.cpp.

Definition at line 86 of file xsample07a.cpp.

◆ ifa_name

std::string ifa_name
Examples:
xsample07a.cpp.

Definition at line 75 of file xsample07a.cpp.

Referenced by main().

◆ ik_size

unsigned ik_size = 8
Examples:
xsample07a.cpp.

Definition at line 84 of file xsample07a.cpp.

Referenced by main().

◆ ikd_counts_name

std::string ikd_counts_name
Examples:
xsample07a.cpp.

Definition at line 77 of file xsample07a.cpp.

◆ ikd_freq_name

std::string ikd_freq_name
Examples:
xsample07a.cpp.

Definition at line 80 of file xsample07a.cpp.

◆ ikd_name

std::string ikd_name
Examples:
xsample07a.cpp.

Definition at line 76 of file xsample07a.cpp.

Referenced by main().

◆ ikd_rep_name

std::string ikd_rep_name
Examples:
xsample07a.cpp.

Definition at line 79 of file xsample07a.cpp.

◆ is_bench

bool is_bench = false
Examples:
xsample07a.cpp.

Definition at line 83 of file xsample07a.cpp.

◆ is_diag

bool is_diag = false
Examples:
xsample07a.cpp.

Definition at line 81 of file xsample07a.cpp.

◆ is_timing

bool is_timing = false
Examples:
xsample07a.cpp.

Definition at line 82 of file xsample07a.cpp.

Referenced by main().

◆ kh_name

std::string kh_name
Examples:
xsample07a.cpp.

Definition at line 78 of file xsample07a.cpp.

◆ parallel_jobs

unsigned parallel_jobs = 4
Examples:
xsample07a.cpp.

Definition at line 85 of file xsample07a.cpp.

Referenced by main().

◆ timing_map

Examples:
xsample07a.cpp.

Definition at line 104 of file xsample07a.cpp.

Referenced by main().