#include #include #include #include #include #include #include #include #include "boost/foreach.hpp" #include "opt.h" #include "string_tools.h" #include "search.h" #include "pattern.h" #include "pwm.h" #include "dna.h" #include "hit.h" #include "index_trie_scan.h" #include "cluster.h" #include "parse_example.h" #include "input_grammar.hpp" #include "parse_funcs.hpp" #include "boost/spirit/tree/parse_tree.hpp" #include "boost/spirit/tree/tree_to_xml.hpp" #include "boost/spirit/utility/confix.hpp" #define foreach BOOST_FOREACH float MarkovLogProb(NUC * pfx, int sz){ static float logfreqs[] = { MIN_SCORE , log(0.33 ), log( 0.17 ), log(0.33 + 0.17 ), log( 0.17 ), log(0.33 + 0.17 ), log( 0.17 + 0.17 ), log(0.33 + 0.17 + 0.17 ), log( 0.33), log(0.33 + 0.33), log( 0.17 + 0.33), log(0.33 + 0.17 + 0.33), log( 0.17 + 0.33), log(0.33 + 0.17 + 0.33), log( 0.17 + 0.17 + 0.33), log(0.33 + 0.17 + 0.17 + 0.33) }; float frac = 0.0; for (int i=0; i < sz; ++i) frac += logfreqs[static_cast(pfx[i])]; return frac; } //here we need the log-sum-exp trick... //given t1 = log(f1), t2 = log(f2), updates t1 := log(f1 + f2); void MarkovLogSum(void *accu, NUC * pfx, int sz){ float & t1 = *(float *)accu; //t is log(a), log(b) = MarkovProb(), we want log(a + b) float t2 = MarkovLogProb(pfx, sz); float m = std::max(t1, t2); float tp1 = t1 - m, tp2 = t2 - m; float s = exp(tp1) + exp(tp2); t1 = log(s) + m; } void Increment(void *accumulator, NUC * prefix, int size){ int & counter = * static_cast(accumulator); counter++; } struct score_below : public std::unary_function { public: int thresh; score_below(int t) : thresh(t) {} bool operator()(LOCUS const& l){ return l.first->score(l.second.sequence) < thresh; } }; char *input = NULL; char *hits_file = NULL; char *cluster_file = NULL; long lTrieBuffer = 100000; long lGPosBuffer = 100000; //should be longer than longest possible chromosome, //but shorter than roughly the maximum memory to hold //1/100 that number of hits const int g_max_dna_search_length = 1000000000; int icisj(int argc,char const** argv){ if (input == NULL){ fprintf(stdout, "You must provide the name of a properly formatted input file\n"); fprintf(stdout, "For example:\n\n\n"); fprintf(stdout, "%s", INPUT_EXAMPLE); return 0; } litestream::postype GPosBuffer = lGPosBuffer; litestream::postype TrieBuffer = lTrieBuffer; std::ifstream input_stream(input); if (! input_stream){ cerr<<"Couldn't open input "< input_contents; std::copy(std::istream_iterator(input_stream), std::istream_iterator(), std::back_inserter(input_contents)); input_stream.close(); // ensure that we have enough trailing newlines to eliminate // the need to check for end of file in the grammar. input_contents.push_back('\n'); input_contents.push_back('\n'); std::vector::const_iterator begin_of_input = input_contents.begin(); std::vector::const_iterator end_of_input = input_contents.end(); parsed Parsed; cis_grammar Parsing(Parsed); tree_parse_info::const_iterator> parse_tree = boost::spirit::ast_parse(begin_of_input, end_of_input, Parsing, (space_p | comment_p("#"))); FILE * cluster_stream = fopen(cluster_file, "w"); if (cluster_stream == NULL){ fprintf(stderr, "Couldn't open output clusters file %s", cluster_file); exit(31); } FILE * hits_stream = fopen(hits_file, "w"); if (hits_stream == NULL){ fprintf(stderr, "Couldn't open output hits file %s", hits_file); exit(31); } //tree_to_xml(std::cout, parse_tree.trees, ""); if (! parse_tree.full){ fprintf(stderr, "Couldn't parse input."); exit(30); } set included_species; foreach(string sp, Parsed.species) included_species.insert(sp); cis::pattern *pat; //Reverse complement all patterns std::vector patterns_rc; for (size_t i=0; i != Parsed.patterns.size(); ++i){ if (Parsed.patterns[i]->kind == "matrix" && (Parsed.patterns[i]->label.strand == cis::POS || Parsed.patterns[i]->label.strand == cis::NEG)){ patterns_rc.push_back(cis::MakeRCPattern(Parsed.patterns[i])); } } Parsed.prepend_paths(); std::vector pattern_names; foreach(pat, Parsed.patterns) pattern_names.push_back(pat->label.id); int max_patlength = 0; foreach(pat, Parsed.patterns) max_patlength = std::max(max_patlength, pat->max_length); //parse the species file to get a list of species names that correspond to the std::string dna_index_path(Parsed.dnaindexfile); std::ifstream species_stream(dna_index_path.c_str()); if (! species_stream){ cerr<<"Couldn't open species list file "< > scantrie(dnac, std::string(Parsed.treefile), std::string(Parsed.posfile), max_patlength); std::cout<<"Parsed index trie info, with "< loci; //now add the reverse complemented patterns. Parsed.patterns.insert(Parsed.patterns.end(), patterns_rc.begin(), patterns_rc.end()); //display the matrices of the parsed patterns for (size_t p = 0; p != Parsed.patterns.size(); ++p){ if (Parsed.patterns[p]->kind == "matrix"){ printf("Matrix for %s %s\n", Parsed.patterns[p]->label.id.c_str(), cis::PrintDNAStrand(Parsed.patterns[p]->label.strand).c_str()); PrintMatrix(* static_cast(Parsed.patterns[p]->ptr2)); PrintPWMMatrix(* static_cast(Parsed.patterns[p]->ptr)); } } //set score thresholds foreach(pat, Parsed.patterns){ if (pat->kind == "matrix"){ std::pair threshold = std::make_pair(0,0); int outer_max_matrix_score = 0; int max_matrix_score = 0; int min_matrix_score = 0; int num_estimated_hits = 0; float logfreq = -1000000.0; int pattern_count = 0; int num_hits_to_find = pat->total_hits; pwm_trie * ptrie = static_cast(pat->t); while (num_hits_to_find > 0 && min_matrix_score >= pat->min_score){ if (min_matrix_score != pat->min_score){ ptrie->set_scores(min_matrix_score, min_matrix_score); ptrie->set_curscore(0); pattern_count = 0; ptrie->apply_to_leaves(&Increment, &pattern_count); if (! pattern_count) { --min_matrix_score; continue; } ptrie->set_scores(min_matrix_score, max_matrix_score); ptrie->apply_to_leaves(&MarkovLogSum, &logfreq); num_estimated_hits = static_cast(std::floor(exp(logfreq + logGenomeSize))); printf("For %s %s, estimated %i hits of %i desired at scores [%i, %i], log val = (%f)\n", pat->label.id.c_str(), cis::PrintDNAStrand(pat->label.strand).c_str(), num_estimated_hits, num_hits_to_find, min_matrix_score, max_matrix_score, logfreq + logGenomeSize); //if the number of estimated hits that would be found by this //score range is less than the number needed, lower the score //band (min,max) by one and continue. if (num_estimated_hits < num_hits_to_find){ --min_matrix_score; max_matrix_score = min_matrix_score; continue; } } //if we get here it means we have met the desired number of hits printf("Searching in score range [%i, %i]\n", min_matrix_score, outer_max_matrix_score); ptrie->set_scores(min_matrix_score, outer_max_matrix_score); ptrie->set_curscore(0); // trie_stream.seekg(0, litestream::POS_BEGIN); // node_info node = readNode(trie_stream); LOCI loci_tmp = Search(scantrie, pat->t, trie_stream, gpos_stream, pat->max_length); loci[pat].insert(loci[pat].end(), loci_tmp.begin(), loci_tmp.end()); FindThresholdScore(loci_tmp, num_hits_to_find, &threshold); num_hits_to_find -= threshold.second; num_estimated_hits = 0; logfreq = -1000000.0; --min_matrix_score; outer_max_matrix_score = min_matrix_score; } score_below pred(threshold.first); LOCI & ploc = loci[pat]; std::cout<label.id<<"("<cluster_group<<") before remove_if"<t, trie_stream, gpos_stream, pat->max_length); std::cout<label.id<<"("<cluster_group<<")"< cluster_names; foreach(clust, Parsed.clusters) cluster_names.push_back(clust->name); std::string ctlist = join_token(cluster_names, ","); std::string patlist = join_token(pattern_names, ","); std::string splist = join_token(Parsed.species, ","); //since it may be memory prohibitive to make all hits at once, //make them for a fraction of dna pieces and then filter them by //clustering... //traverse the pieces of dna in groups, advancing cis::dna_collection::const_iterator start_dna = dnac.begin(); cis::dna_collection::const_iterator end_dna = dnac.begin(); cis::dna_collection::const_iterator dna_iter; while (end_dna != dnac.end()){ while (end_dna != dnac.end() && dnac.TotalLength(start_dna, end_dna) < g_max_dna_search_length){ end_dna++; } // if (included_species.find(dna->species()) == included_species.end()) continue; foreach(pat, Parsed.patterns){ cis::REG_MAP hits_tmp = makeHits(loci[pat], pat, gpos_stream, dnac, start_dna, end_dna, pat->max_length); for (cis::REG_MAP::const_iterator rit = hits_tmp.begin(); rit != hits_tmp.end(); ++rit) hits[(*rit).first].insert(rit->second.begin(), rit->second.end()); } for (cis::REG_MAP::iterator rit = hits.begin(); rit != hits.end(); ++rit){ cis::DNA_P this_dna = rit->first; cis::REGIONS_MULTI & this_dna_hits = rit->second; foreach(clust, Parsed.clusters){ clust->set_hits_container(&this_dna_hits); clust->clear_tags(this_dna_hits.begin(), this_dna_hits.end()); clust->assign_clusters(); if (Parsed.merge_overlap) cis::MergeOverlappingClusters(this_dna_hits); cluster_num = cis::ReduceClusterNumbers(this_dna_hits, cluster_num); PrintHitClusters(this_dna_hits, cluster_stream); } //filter clusters if there are any clustering criteria if (! Parsed.clusters.empty()) for (cis::RIT rit = this_dna_hits.begin(); rit != this_dna_hits.end(); rit++){ cis::hit const* h = static_cast(*rit); if (h->is_cluster == 0) { delete h; this_dna_hits.erase(rit); } } //now append the output to the output file cis::PrintHits(this_dna_hits, *this_dna, hits_stream); for (cis::RIT rit = this_dna_hits.begin(); rit != this_dna_hits.end(); rit++){ cis::hit const* h = static_cast(*rit); delete h; } this_dna_hits.clear(); } hits.clear(); } fclose(cluster_stream); fclose(hits_stream); for (size_t p = 0; p != Parsed.patterns.size(); ++p) delete Parsed.patterns[p]; opt(&argc,&argv); return 0; } int main(int argc, char const**argv){ OptRegister(&input,OPT_STRING, 'i', "input","input file or stream"); OptRegister(&lTrieBuffer,OPT_LONG, 't', "trie-buffer-size","size of buffer for buffered reading of trie file"); OptRegister(&lGPosBuffer,OPT_LONG, 'g', "GPos-buffer-size","size of buffer for buffered reading of gpos file"); //OptRegister(&memory_scan_mode,OPT_BOOL, 'm', "memory-scan-mode","set to true if you want to load index trie and gpos into memory first"); OptRegister(&hits_file,OPT_STRING, 'h', "hits-file-output","hits output file"); OptRegister(&cluster_file,OPT_STRING, 'c', "cluster-file-output","cluster output file"); optMain(icisj); if (argc==1){ char const**myargv; char const* mybuf[2]; mybuf[0]=argv[0]; mybuf[1]="$"; myargv=mybuf; int myargc=2; std::cout<<"Welcome to cis. Search for clustered binding sites in multiple genomes"<