42#include <unordered_map>
59 <<
"BitMagic DNA Search Sample (c) 2018" << std::endl
60 <<
"-fa file-name -- input FASTA file" << std::endl
61 <<
"-s hi|lo -- run substring search benchmark" << std::endl
62 <<
"-diag -- run diagnostics" << std::endl
63 <<
"-timing -- collect timings" << std::endl
82 for (
int i = 1; i < argc; ++i)
84 std::string arg = argv[i];
85 if ((arg ==
"-h") || (arg ==
"--help"))
91 if (arg ==
"-fa" || arg ==
"--fa")
99 std::cerr <<
"Error: -fa requires file name" << std::endl;
105 if (arg ==
"-diag" || arg ==
"--diag" || arg ==
"-d" || arg ==
"--d")
107 if (arg ==
"-timing" || arg ==
"--timing" || arg ==
"-t" || arg ==
"--t")
109 if (arg ==
"-bench" || arg ==
"--bench" || arg ==
"-b" || arg ==
"--b")
111 if (arg ==
"-search" || arg ==
"--search" || arg ==
"-s" || arg ==
"--s")
116 std::string a = argv[i+1];
119 if (a ==
"l" || a ==
"lo")
125 if (a ==
"h" || a ==
"hi")
142typedef std::vector<std::pair<unsigned, std::string> >
dict_vect;
152int load_FASTA(
const std::string& fname, std::vector<char>& seq_vect)
157 std::ifstream fin(fname.c_str(), std::ios::in);
162 for (
unsigned i = 0; std::getline(fin, line); ++i)
168 for (std::string::iterator it = line.begin(); it != line.end(); ++it)
169 seq_vect.push_back(*it);
189 void Build(
const vector<char>& sequence)
199 for (
size_t i = 0; i < sequence.size(); ++i)
201 unsigned pos = unsigned(i);
231 return m_FPrintBV[
eA];
233 return m_FPrintBV[
eC];
235 return m_FPrintBV[
eG];
237 return m_FPrintBV[
eT];
239 return m_FPrintBV[
eN];
243 throw runtime_error(
"Error. Invalid letter!");
250 void Find(
const string& word, vector<unsigned>& res)
257 for (
size_t i = 1; i < word.size(); ++i)
270 unsigned ws = unsigned(word.size()) - 1;
283 for (
size_t i = 0; i < word.size(); ++i)
295 unsigned ws = unsigned(word.size()) - 1;
303 vector<vector<unsigned>>& hits)
305 vector<unique_ptr<aggregator_type> > agg_pipeline;
308 for (
const auto& w : words)
313 const string& word = get<0>(w);
314 for (
size_t i = 0; i < word.size(); ++i)
317 agg_ptr->add(&bv_mask);
320 agg_pipeline.emplace_back(agg_ptr.release());
321 ws = unsigned(word.size()) - 1;
326 vector<unique_ptr<aggregator_type> >::iterator>(agg_pipeline.begin(), agg_pipeline.end());
329 for (
size_t i = 0; i < agg_pipeline.size(); ++i)
333 vector<unsigned> res;
336 hits.emplace_back(res);
346 vector<unsigned>& res)
349 for (; en.
valid(); ++en)
352 res.push_back(pos - left_shift);
368 vector<tuple<string,int>>& lo_words,
369 const vector<char>& data,
373 cout <<
"k-mer generation... " << endl;
378 if (data.size() < word_size)
381 size_t end_pos = data.size() - word_size;
383 map<string, int> words;
386 string s(&data[i], word_size);
387 if (s.find(
'N') == string::npos)
392 cout <<
"\r" << i <<
"/" << end_pos << flush;
396 cout << endl <<
"Picking k-mer samples..." << flush;
398 multimap<int,string, greater<int>> dst;
399 for_each(words.begin(), words.end(), [&](
const std::pair<string,int>& p)
401 dst.emplace(p.second, p.first);
404 auto it = dst.begin();
405 for(
size_t count = 0; count < N && it !=dst.end(); ++it,++count)
406 top_words.emplace_back(it->second, it->first);
410 auto it = dst.rbegin();
411 for(
size_t count = 0; count < N && it !=dst.rend(); ++it, ++count)
412 lo_words.emplace_back(it->second, it->first);
415 cout <<
"OK" << endl;
422 const char* word,
unsigned word_size,
425 if (data.size() < word_size)
429 size_t end_pos = data.size() - word_size;
433 for (
size_t j = i, k = 0, l = word_size - 1; l > k; ++j, ++k, --l)
435 if (data[j] != word[k] || data[i + l] != word[l])
442 r.push_back(
unsigned(i));
452 vector<const char*> words,
454 vector<vector<unsigned>>& hits)
456 if (data.size() < word_size)
460 size_t end_pos = data.size() - word_size;
461 size_t words_size = words.size();
464 for (
size_t idx = 0; idx < words_size; ++idx)
466 auto& word = words[idx];
468 for (
size_t j = i, k = 0, l = word_size - 1; l > k; ++j, ++k, --l)
470 if (data[j] != word[k] || data[i + l] != word[l])
478 hits[idx].push_back(
unsigned(i));
492 if (h1.size() != h2.size())
494 cerr <<
"size1 = " << h1.size() <<
" size2 = " << h2.size() << endl;
497 for (
size_t i = 0; i < h1.size(); ++i)
508int main(
int argc,
char *argv[])
516 std::vector<char> seq_vect;
531 std::cout <<
"FASTA sequence size=" << seq_vect.size() << std::endl;
542 vector<tuple<string,int> > h_words;
543 vector<tuple<string,int> > l_words;
545 vector<tuple<string,int>>& words =
h_word_set ? h_words : l_words;
552 vector<THitList> word_hits;
553 vector<THitList> word_hits_agg;
561 vector<const char*> word_list;
562 for (
const auto& w : words)
564 word_list.push_back(get<0>(w).c_str());
566 word_hits.resize(words.size());
567 for_each(word_hits.begin(), word_hits.end(), [](
THitList& ht) {
587 for (
size_t word_idx = 0; word_idx < words.size(); ++ word_idx)
589 auto& word = get<0>(words[word_idx]);
595 word.c_str(),
unsigned(word.size()),
601 idx.
Find(word, hits2);
613 cout <<
"Mismatch ERROR for: " << word << endl;
619 cout <<
"Sigle pass mismatch ERROR for: " << word << endl;
623 cout << word_idx <<
": " << word <<
": " << hits1.size() <<
" hits " << endl;
631 std::cout << std::endl <<
"Performance:" << std::endl;
635 catch (std::exception& ex)
637 std::cerr <<
"Error:" << ex.what() << std::endl;
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
Algorithms for fast aggregation of N bvectors.
Algorithms for bvector<> (main include)
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Utility for keeping all DNA finger print vectors and search using various techniques.
void FindAggFused(const string &word, vector< unsigned > &res)
This method uses cache blocked aggregator with fused SHIFT+AND kernel.
void Find(const string &word, vector< unsigned > &res)
Find word strings using shift + and on fingerprint vectors (horizontal, non-fused basic method)
void Build(const vector< char > &sequence)
Build fingerprint bit-vectors from the original sequence.
const bm::bvector & GetVector(char letter) const
Return fingerprint bit-vector.
void TranslateResults(const bm::bvector<> &bv, unsigned left_shift, vector< unsigned > &res)
Translate search results vector using (word size) left shift.
void FindCollection(const vector< tuple< string, int > > &words, vector< vector< unsigned >> &hits)
Find a set of words in one pass using pipeline of aggregators (this is very experimental)
Algorithms for fast aggregation of a group of bit-vectors.
void combine_shift_right_and(bvector_type &bv_target)
Aggregate added group of vectors using SHIFT-RIGHT and logical AND Operation does NOT perform an expl...
void reset()
Reset aggregate groups, forget all attached vectors.
const bvector_type * get_target() const BMNOEXCEPT
size_t add(const bvector_type *bv, unsigned agr_group=0)
Attach source bit-vector to a argument group (0 or 1).
Output iterator iterator designed to set "ON" bits based on input sequence of integers.
Constant iterator designed to enumerate "ON" bits.
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Bitvector Bit-vector container with runtime compression of bits.
bool any() const BMNOEXCEPT
Returns true if any bits in this bitset are set, and otherwise returns false.
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
bool shift_right()
Shift right by 1 bit, fill with zero return carry out.
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
@ BM_SORTED
input set is sorted (ascending order)
void aggregator_pipeline_execute(It first, It last)
Experimental method ro run multiple aggregators in sync.
static void find_words(const vector< char > &data, vector< const char * > words, unsigned word_size, vector< vector< unsigned >> &hits)
Find all words in one pass (cache coherent algorithm) (variation of 2-way string matching for collect...
int main(int argc, char *argv[])
static int parse_args(int argc, char *argv[])
vector< unsigned > THitList
std::map< std::string, unsigned > freq_map
static const size_t WORD_SIZE
bm::aggregator< bm::bvector<> > aggregator_type
static bool hitlist_compare(const THitList &h1, const THitList &h2)
Check search result match.
bm::chrono_taker ::duration_map_type timing_map
std::vector< std::pair< unsigned, std::string > > dict_vect
static void find_word_2way(vector< char > &data, const char *word, unsigned word_size, THitList &r)
2-way string matching
static int load_FASTA(const std::string &fname, std::vector< char > &seq_vect)
static void generate_kmers(vector< tuple< string, int >> &top_words, vector< tuple< string, int >> &lo_words, const vector< char > &data, size_t N, unsigned word_size)
generate the most frequent words of specified length from the input sequence