/* yeastint - look for yeast intron (or other organism introns * where don't want to be tied to the whole work database.) */ #include "common.h" #include "portable.h" #include "dnautil.h" #include "nt4.h" #include "localmem.h" #include "cdnaAli.h" struct nt4Seq **chromNt4; void usage() /* Explain usage and abort. */ { errAbort( "yeastInt - look for yeast (or other non-worm) introns in refiAli good.txt\n" "alignment output file. Usage:\n" " yeastInt good.txt ntDir out.fa out.gff out.txt\n"); } int cloneAliReverseStrand(struct ali *ali) /* Returns TRUE (1) if the alignment of the clone EST came from is on minus * strand, FALSE (0) otherwise. */ { struct cdna *cdna = ali->cdna; char *name = cdna->name; int nameLen = strlen(name); boolean isReverse = FALSE; if (nameLen > 1 && name[nameLen-1] == '3' && name[nameLen-2] == '.') isReverse = TRUE; if (ali->strand == '-') isReverse = !isReverse; return (isReverse ? 1 : 0); } int consensusStrand(struct cdnaRef *refList) /* Return most common strand in refList. */ { struct cdnaRef *ref; int counts[2]; counts[0] = counts[1] = 0; for (ref = refList; ref != NULL; ref = ref->next) { counts[cloneAliReverseStrand(ref->ali)] += 1; } return (counts[0] >= counts[1] ? 0 : 1); } char intronStrand(DNA *start, int size, boolean *retPerfect) /* Returns '+', '-' or '.' for forward, reverse, and who knows. */ { DNA a,b,y,z; int fCount = 0, rCount = 0; a = start[0]; b = start[1]; y = start[size-2]; z = start[size-1]; if (a == 'g') ++fCount; if (b == 't') ++fCount; if (y == 'a') ++fCount; if (z == 'g') ++fCount; if (a == 'c') ++rCount; if (b == 't') ++rCount; if (y == 'a') ++rCount; if (z == 'c') ++rCount; *retPerfect = (fCount >= 4 || rCount >= 4); if (fCount >= 3 || rCount >= 3) return (fCount > rCount) ? '+' : '-'; else return '.'; } void writeIntron(FILE *f, struct feature *intron, DNA *dna, int dnaSize, char strand, int startExtra, int intronSize, int insideIntronSize) { int i; struct cdnaRef *ref; fprintf(f, "%d %c %s ", intron->cdnaCount, strand, intron->gene); for (i=0; icdnaRefs; ref != NULL; ref = ref->next) fprintf(f, " %s", ref->ali->cdna->name); fputc('\n', f); } void convertToRna(char *dna) /* Convert from lower case DNA to upper case RNA */ { char c; while ((c = *dna) != 0) { c = toupper(c); if (c == 'T') c = 'U'; *dna++ = c; } } void writeDnaText(FILE *f, char *dna, int dnaSize, int lineSize) /* Write out DNA to a file. */ { while (dnaSize > 0) { if (lineSize > dnaSize) lineSize = dnaSize; mustWrite(f, dna, lineSize); fputc('\n', f); dna += lineSize; dnaSize -= lineSize; } } void writeIntrons(struct feature *intronList, char *gffName, char *txtName, char *faName) { FILE *gff = mustOpen(gffName, "w"); FILE *txt = mustOpen(txtName, "w"); FILE *fa = mustOpen(faName, "w"); struct feature *intron; int intronSize; int intronCount = 0; int idealExtra = 5; int insideIntronSize = 10; int startExtra, endExtra; int dnaStart, dnaEnd, dnaSize; struct nt4Seq *chrom; DNA *dna; char strand; boolean sureAboutStrand; int fCount=0, rCount=0, qCount=0; static char strandIxToC[2] = {'+', '-'}; char cloneStrand; int consMismatchCount = 0; int nonconsMismatchCount = 0; printf("Writing introns gff\n"); fprintf(gff, "##gff-version 1\n"); fprintf(gff, "##source-version cDnaIntrons 1.0\n"); for (intron = intronList; intron != NULL; intron = intron->next) { /* Get the dna for this intron, and a little extra. */ chrom = chromNt4[ixInStrings(intron->chrom, chromNames, chromCount)]; dnaStart = intron->start - idealExtra; if (dnaStart < 0) dnaStart = 0; intronSize = intron->end - intron->start; startExtra = intron->start - dnaStart; dnaEnd = intron->end + idealExtra; if (dnaEnd > chrom->baseCount) dnaEnd = chrom->baseCount; endExtra = dnaEnd - intron->end; dnaSize = dnaEnd - dnaStart; dna = nt4Unpack(chrom, dnaStart, dnaSize); /* Figure out what strand it's on by match to consensus sequence. */ strand = intronStrand(dna + startExtra, intronSize, &sureAboutStrand); cloneStrand = strandIxToC[consensusStrand(intron->cdnaRefs)]; if (strand == '.') strand = cloneStrand; if (cloneStrand != strand) { //warn("Mixed signals on %s intron strand pos %s:%d-%d", // (sureAboutStrand ? "consensus" : "non-consensus"), // intron->chrom, intron->start, intron->end); if (sureAboutStrand) consMismatchCount += 1; else nonconsMismatchCount += 1; } if (strand == '.') ++qCount; else if (strand == '+') ++fCount; else if (strand == '-') ++rCount; /* Write line to gff file. */ fprintf(gff, "%s\t%s\tintron\t%d\t%d\t%d\t%c\t.\t%s\n", intron->chrom, intron->cdnaRefs->ali->cdna->name, intron->start + 1, intron->end, intron->cdnaCount, strand, intron->gene); /* Write line to txt file, reverse complementing if necessary. */ if (strand == '-') { int temp; reverseComplement(dna, dnaSize); temp = startExtra; startExtra = endExtra; endExtra = temp; } convertToRna(dna); writeIntron(txt, intron, dna, dnaSize, strand, startExtra, intronSize, insideIntronSize); /* Write to fa file. */ fprintf(fa, ">%s_%d_%d%c\n", intron->chrom, intron->start+1, intron->end, strand); writeDnaText(fa, dna + startExtra, intronSize, 50); freeMem(dna); ++intronCount; } fclose(fa); fclose(gff); fclose(txt); printf("Wrote %d introns: %d + %d - %d ?\n", intronCount, fCount, rCount, qCount); printf("%d consensus mismatches, %d nonconsensus mismatches on strand\n", consMismatchCount, nonconsMismatchCount); } int main(int argc, char *argv[]) { char *goodName; char *ntDir; char *outFa, *outGff, *outTxt; long startTime, endTime; struct cdna *cdnaList; struct feature *intronList; /* Check command line. */ if (argc != 6) usage(); goodName = argv[1]; ntDir = argv[2]; outFa = argv[3]; outGff = argv[4]; outTxt = argv[5]; cdnaAliInit(); startTime = clock1000(); loadGenome(ntDir, &chromNt4, &chromNames, &chromCount); endTime = clock1000(); printf("Loaded %d genome files in %f seconds\n", chromCount, (endTime - startTime)*0.001); startTime = clock1000(); cdnaList = loadCdna(goodName); endTime = clock1000(); printf("Loaded %s in %f seconds\n", goodName, (endTime - startTime)*0.001); intronList = makeIntrons(cdnaList); writeIntrons(intronList, outGff, outTxt, outFa); return 0; }