#include #include #include #include #include #define __STDC_FORMAT_MACROS #include #include "region.h" #include "hit.h" #include "misc.h" #include "enum.h" #include "dnacol.h" using namespace std; using namespace cis; using namespace cis_enum; namespace cis { int StrandToEnum(char s) { int es; switch (s) { case '+': case '1': case 'p': case 'P': es = POS; break; case '-': case '0': case 'n': case 'N': es = NEG; break; default: cerr<<"Warning: don't understand strand type " < dna.length() || end > dna.length()) { cerr<<"Warning: region "< end) { cerr<<"Error: Trying to create a backwards region."<>(istream&i, region& r) { char strand; i>>r.start>>r.end>>strand; switch (strand) { case '+': case '1': case 'p': case 'P': r.strand = POS; break; case '-': case '0': case 'n': case 'N': r.strand = NEG; break; default: cerr<<"Warning: don't understand strand type " <= r.end) { cerr<<"Error: region has start="<= end="<= r.end; } bool region::overlaps(REG_CR r) const { assert(dna == r.dna); return end > r.start && r.end > start; } bool operator<(REG_CR r1, REG_CR r2) { return // r1.dna < r2.dna || // (r1.dna == r2.dna && (r1.start < r2.start || (r1.start == r2.start && (r1.end < r2.end || (r1.end == r2.end && (r1.strand < r2.strand))))); } // string lower_case_regions(string const& seq, REG_PC r, // cis::region_tree * regs) //{ // REGIONS_MULTI overlaps = cis::IntervalOverlap(*regs, *r); // int start, end, size; // string seq_mask(seq); // REGIONS_MULTI::iterator overlaps_iter; // for (overlaps_iter = overlaps.begin(); // overlaps_iter != overlaps.end(); ++overlaps_iter) //{ // REG_PC o = (*overlaps_iter); // start = max((int)o->start - (int)r->start, 0); // end = min((int)r->end, (int)o->end - (int)r->start); // size = end - start; // string sub = seq_mask.substr(start, size); // boost::to_lower(sub); // seq_mask.replace(start, size, sub); // } // cout<= main.end) r = DOWNSTREAM; // else r = IN; // } // else { // if (sub.end <= main.start) r = DOWNSTREAM; // else if (sub.start >= main.end) r = UPSTREAM; // else r = IN; // } // return r; // } //a kludge for the lack of random access iterators... RIT_M_R min_iter(REGIONS_MULTI * c, RIT_M_R a, RIT_M_R b) { if (a == c->end()) return b; else if (b == c->end()) return a; else { assert((*a)->dna == (*b)->dna); return (c->key_comp()(*a, *b)) ? a : b; } } REGIONS_MULTI MergeRegions(REG_MAP r) { REGIONS_MULTI reg; REG_MAP::iterator reg_iter; for (reg_iter = r.begin(); reg_iter != r.end(); ++reg_iter) { REG_MAP::value_type & d = (*reg_iter); reg.insert(d.second.begin(), d.second.end()); } return reg; } //here we don't depend on the source information to tell us what numbers the exons have... REG_P MakeRegion(cis::dna_t const* dna, int64_t start, int64_t end, string const& strand, string const& name, int start_adjust, int five_prime_flank, int three_prime_flank, int id, int group_id) { dna_strand bstrand = POS; ostringstream errmsg; bstrand = (dna_strand)cis_enum::from_string(strand); REG_P reg = NULL; if (start == 0 && start_adjust == -1) { cerr<<"this file appears to be 0-based, but was specified as ones-based"< dna->length() || end > dna->length()) { if (num_out_of_range < 5) { fprintf(stderr, "Warning: Taking subset of region %"PRId64" -> %"PRId64 "on dna %s %s (%"PRId64" bases long)\n", start, end, dna->species().c_str(), dna->name.c_str(), dna->length() ); num_out_of_range++; if (num_out_of_range == 5) fprintf(stderr, "Further warnings omitted\n"); } } int64_t start_final = bstrand == POS ? max(int64_t(0), (int64_t)(start + start_adjust - five_prime_flank)) : max(int64_t(0), (int64_t)(start + start_adjust - three_prime_flank)); int64_t end_final = bstrand == POS ? min((int64_t)dna->length(), (int64_t)(end + three_prime_flank)) : min((int64_t)dna->length(), (int64_t)(end + five_prime_flank)); reg = new region(*dna, name, start_final, end_final, bstrand, id, group_id, region::NONE); return reg; } //load regions from an rdbfile. REG_MAP RDBToRegions(DNAS& dnas, char const* rdbfile, bool with_group_field) { int region_id, group_id = -1; int64_t bound1, bound2; char species[256], dnaname[256], strand[16], name[256]; REG_MAP regions; FILE * rdbstream = fopen(rdbfile, "r"); if (rdbstream == NULL) { fprintf(stderr, "Couldn't open RDB regions file %s\n", rdbfile); exit(50); } int num_fields = with_group_field ? 7 : 6; int num_assign = 0; while (! feof(rdbstream)) { if (with_group_field) num_assign = std::fscanf(rdbstream, "%i %i %s %s %"PRId64" %"PRId64" %s\n", ®ion_id, &group_id, species, dnaname, &bound1, &bound2, strand); else num_assign = std::fscanf(rdbstream, "%i %s %s %"PRId64" %"PRId64" %s\n", ®ion_id, species, dnaname, &bound1, &bound2, strand); if (num_assign == num_fields) { //success reading this line std::string sspecies(species); std::string sdnaname(dnaname); cis::dna_t const* dna = GetDNAByName(dnas, sspecies, sdnaname); if (dna == NULL) { } else { int64_t start = min(bound1,bound2); int64_t end = max(bound1,bound2); REG_P reg = MakeRegion(dna, start, end, std::string(strand), std::string(name), 0, 0, 0, region_id, group_id); if (regions.find(dna) == regions.end()) regions[dna] = REGIONS_MULTI(); if (reg != NULL) regions[dna].insert(reg); } } else { if (! feof(rdbstream)) { char line[1000]; fgets(line, 1000, rdbstream); fprintf(stderr, "Found %i fields (should be %i) in RDB regions file. (Next line is:\n%s\n", num_assign, num_fields, line); exit(35); } } } fclose(rdbstream); return regions; } //find the left and rightmost intervals among each with the same group_id //and set their group_relation_ mask void SetGroupRelation(RIT const& begin, RIT const& end) { set group_ids; int group_id; for (RIT rit = begin; rit != end; ++rit) { group_id = (*rit)->group_id; if (group_ids.find(group_id) == group_ids.end()) { group_ids.insert(group_id); const_cast(*rit)->group_relation_ |= region::LEFT_MOST; } } group_ids.clear(); for (RIT_REV rit = RIT_REV(end); rit != RIT_REV(begin); ++rit) { group_id = (*rit)->group_id; if (group_ids.find(group_id) == group_ids.end()) { group_ids.insert(group_id); const_cast(*rit)->group_relation_ |= region::RIGHT_MOST; } } } //returns a tab-separated representation of the region and its DNA sequence, //reverse complemented if region is negative stranded void PrintDNARegion(REG_PC dna_region, FILE * outfile) { std::fprintf(outfile, "%i\t%s\t%s\t%"PRId64"\t%"PRId64"\t%s\n", dna_region->id, dna_region->dna.species().c_str(), dna_region->dna.name.c_str(), dna_region->start, dna_region->end, PrintDNAStrand(dna_region->strand).c_str()); } //simply prints the region's id and sequence (which is reverse complemented if //the region is on the negative strand) void PrintDNARegionSimple(REG_PC dna_region, int max_sequence_length, FILE * outfile) { if (dna_region->length() > max_sequence_length) return; string dna_sequence = dna_region->dna.sequence(dna_region->start, dna_region->end); if (dna_region->strand == cis::NEG) dna_sequence = RevCompStr(dna_sequence); std::fprintf(outfile, "%i\t%s\n", dna_region->id, dna_sequence.c_str()); } } // namespace cis