/* psl.c was originally generated by the autoSql program, which also * generated as_psl.h and as_psl.sql. This module links the database and the RAM * representation of objects. * * This file is copyright 2002 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #include "common.h" #include "sqlNum.h" #include "sqlList.h" #include "localmem.h" #include "psl.h" #include "hash.h" #include "linefile.h" #include "dnaseq.h" #include "dystring.h" #include "fuzzyFind.h" #include "aliType.h" #include "binRange.h" #include "rangeTree.h" static char *createString = "CREATE TABLE %s (\n" "%s" /* Optional bin */ "matches int unsigned not null, # Number of bases that match that aren't repeats\n" "misMatches int unsigned not null, # Number of bases that don't match\n" "repMatches int unsigned not null, # Number of bases that match but are part of repeats\n" "nCount int unsigned not null, # Number of 'N' bases\n" "qNumInsert int unsigned not null, # Number of inserts in query\n" "qBaseInsert int unsigned not null, # Number of bases inserted in query\n" "tNumInsert int unsigned not null, # Number of inserts in target\n" "tBaseInsert int unsigned not null, # Number of bases inserted in target\n" "strand char(2) not null, # + or - for strand. First character is query, second is target.\n" "qName varchar(255) not null, # Query sequence name\n" "qSize int unsigned not null, # Query sequence size\n" "qStart int unsigned not null, # Alignment start position in query\n" "qEnd int unsigned not null, # Alignment end position in query\n" "tName varchar(255) not null, # Target sequence name\n" "tSize int unsigned not null, # Target sequence size\n" "tStart int unsigned not null, # Alignment start position in target\n" "tEnd int unsigned not null, # Alignment end position in target\n" "blockCount int unsigned not null, # Number of blocks in alignment\n" "blockSizes longblob not null, # Size of each block\n" "qStarts longblob not null, # Start of each block in query.\n" "tStarts longblob not null, # Start of each block in target.\n"; static char *indexString = "#Indices\n" "%s" /* Optional bin. */ "INDEX(qName(12))\n" ")\n"; struct psl *pslxLoad(char **row) /* Load a psl from row fetched with select * from psl * from database. Dispose of this with pslFree(). */ { struct psl *ret = pslLoad(row); int retSize; sqlStringDynamicArray(row[21],&ret->qSequence, &retSize); sqlStringDynamicArray(row[22],&ret->tSequence, &retSize); return ret; } struct psl *pslLoad(char **row) /* Load a psl from row fetched with select * from psl * from database. Dispose of this with pslFree(). */ { struct psl *ret; int sizeOne; AllocVar(ret); ret->blockCount = sqlUnsigned(row[17]); ret->match = sqlUnsigned(row[0]); ret->misMatch = sqlUnsigned(row[1]); ret->repMatch = sqlUnsigned(row[2]); ret->nCount = sqlUnsigned(row[3]); ret->qNumInsert = sqlUnsigned(row[4]); ret->qBaseInsert = sqlSigned(row[5]); ret->tNumInsert = sqlUnsigned(row[6]); ret->tBaseInsert = sqlSigned(row[7]); strcpy(ret->strand, row[8]); ret->qName = cloneString(row[9]); ret->qSize = sqlUnsigned(row[10]); ret->qStart = sqlUnsigned(row[11]); ret->qEnd = sqlUnsigned(row[12]); ret->tName = cloneString(row[13]); ret->tSize = sqlUnsigned(row[14]); ret->tStart = sqlUnsigned(row[15]); ret->tEnd = sqlUnsigned(row[16]); sqlUnsignedDynamicArray(row[18], &ret->blockSizes, &sizeOne); if (sizeOne != ret->blockCount) { printf("sizeOne bloxksizes %d bs %d block=%s\n",sizeOne, ret->blockCount,row[18]); } assert(sizeOne == ret->blockCount); sqlUnsignedDynamicArray(row[19], &ret->qStarts, &sizeOne); if (sizeOne != ret->blockCount) { printf("sizeOne qStarts %d bs %d\n",sizeOne, ret->blockCount); } assert(sizeOne == ret->blockCount); sqlUnsignedDynamicArray(row[20], &ret->tStarts, &sizeOne); if (sizeOne != ret->blockCount) { printf("sizeOne tStarts %d bs %d\n",sizeOne, ret->blockCount); } assert(sizeOne == ret->blockCount); return ret; } struct psl *pslCommaIn(char **pS, struct psl *ret) /* Create a psl out of a comma separated string. * This will fill in ret if non-null, otherwise will * return a new psl */ { char *s = *pS; int i; if (ret == NULL) AllocVar(ret); ret->match = sqlUnsignedComma(&s); ret->misMatch = sqlUnsignedComma(&s); ret->repMatch = sqlUnsignedComma(&s); ret->nCount = sqlUnsignedComma(&s); ret->qNumInsert = sqlUnsignedComma(&s); ret->qBaseInsert = sqlSignedComma(&s); ret->tNumInsert = sqlUnsignedComma(&s); ret->tBaseInsert = sqlSignedComma(&s); sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand)); ret->qName = sqlStringComma(&s); ret->qSize = sqlUnsignedComma(&s); ret->qStart = sqlUnsignedComma(&s); ret->qEnd = sqlUnsignedComma(&s); ret->tName = sqlStringComma(&s); ret->tSize = sqlUnsignedComma(&s); ret->tStart = sqlUnsignedComma(&s); ret->tEnd = sqlUnsignedComma(&s); ret->blockCount = sqlUnsignedComma(&s); s = sqlEatChar(s, '{'); AllocArray(ret->blockSizes, ret->blockCount); for (i=0; iblockCount; ++i) { ret->blockSizes[i] = sqlUnsignedComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); s = sqlEatChar(s, '{'); AllocArray(ret->qStarts, ret->blockCount); for (i=0; iblockCount; ++i) { ret->qStarts[i] = sqlUnsignedComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); s = sqlEatChar(s, '{'); AllocArray(ret->tStarts, ret->blockCount); for (i=0; iblockCount; ++i) { ret->tStarts[i] = sqlUnsignedComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); *pS = s; return ret; } void pslFree(struct psl **pEl) /* Free a single dynamically allocated psl such as created * with pslLoad(). */ { struct psl *el; if ((el = *pEl) == NULL) return; freeMem(el->qName); freeMem(el->tName); freeMem(el->blockSizes); freeMem(el->qStarts); freeMem(el->tStarts); if (el->qSequence) { freeMem(el->qSequence[0]); freeMem(el->qSequence); } if (el->tSequence) { freeMem(el->tSequence[0]); freeMem(el->tSequence); } freez(pEl); } void pslFreeList(struct psl **pList) /* Free a list of dynamically allocated psl's */ { struct psl *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; pslFree(&el); } *pList = NULL; } void pslOutput(struct psl *el, FILE *f, char sep, char lastSep) /* Print out psl. Separate fields with sep. Follow last field with lastSep. */ { int i; fprintf(f, "%u", el->match); fputc(sep,f); fprintf(f, "%u", el->misMatch); fputc(sep,f); fprintf(f, "%u", el->repMatch); fputc(sep,f); fprintf(f, "%u", el->nCount); fputc(sep,f); fprintf(f, "%u", el->qNumInsert); fputc(sep,f); fprintf(f, "%d", el->qBaseInsert); fputc(sep,f); fprintf(f, "%u", el->tNumInsert); fputc(sep,f); fprintf(f, "%d", el->tBaseInsert); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->strand); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->qName); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->qSize); fputc(sep,f); fprintf(f, "%u", el->qStart); fputc(sep,f); fprintf(f, "%u", el->qEnd); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->tName); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->tSize); fputc(sep,f); fprintf(f, "%u", el->tStart); fputc(sep,f); fprintf(f, "%u", el->tEnd); fputc(sep,f); fprintf(f, "%u", el->blockCount); fputc(sep,f); if (sep == ',') fputc('{',f); for (i=0; iblockCount; ++i) { fprintf(f, "%u", el->blockSizes[i]); fputc(',', f); } if (sep == ',') fputc('}',f); fputc(sep,f); if (sep == ',') fputc('{',f); for (i=0; iblockCount; ++i) { fprintf(f, "%u", el->qStarts[i]); fputc(',', f); } if (sep == ',') fputc('}',f); fputc(sep,f); if (sep == ',') fputc('{',f); for (i=0; iblockCount; ++i) { fprintf(f, "%u", el->tStarts[i]); fputc(',', f); } if (sep == ',') fputc('}',f); if (el->qSequence) { fputc(sep,f); if (sep == ',') fputc('{',f); for (i=0; iblockCount; ++i) { fprintf(f, "%s", el->qSequence[i]); fputc(',', f); } if (sep == ',') fputc('}',f); fputc(sep,f); if (sep == ',') fputc('{',f); for (i=0; iblockCount; ++i) { fprintf(f, "%s", el->tSequence[i]); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(lastSep,f); if (ferror(f)) { perror("Error writing psl file\n"); errAbort("\n"); } } /* ----- end autoSql generated part --------------- */ void pslOutputShort(struct psl *el, FILE *f, char sep, char lastSep) /* Print out psl. Separate fields with sep. Follow last field with lastSep. */ { fprintf(f, "%u", el->match); fputc(sep,f); fprintf(f, "%u", el->misMatch); fputc(sep,f); fprintf(f, "%u", el->repMatch); fputc(sep,f); fprintf(f, "%u", el->qNumInsert); fputc(sep,f); fprintf(f, "%d", el->qBaseInsert); fputc(sep,f); fprintf(f, "%u", el->tNumInsert); fputc(sep,f); fprintf(f, "%d", el->tBaseInsert); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->strand); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->qName); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->qStart); fputc(sep,f); fprintf(f, "%u", abs(el->qEnd - el->qStart)); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->tName); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->tStart); fputc(sep,f); fprintf(f, "%u", abs(el->tEnd - el->tStart)); fputc(sep,f); fprintf(f, "%u", el->blockCount); fputc(sep,f); if (sep == ',') fputc('{',f); fputc(lastSep,f); if (ferror(f)) { perror("Error writing psl file\n"); errAbort("\n"); } } void pslOutFormat(struct psl *el, FILE *f, char sep, char lastSep) /* Print out selected psl values. Separate fields with sep. Follow last field with lastSep. */ /* Prints out a better format with bold field headings followed by value */ /* Requires further upstream work to ensure that only the field headers */ /* declared here are printed if replacing an existing psl print function*/ { const char *headers[] = {"Matches", "Mismatches", "Matches in repeats", "Number of N bases", "Query name", "Size", "Start", "End", "Chromosome", "Strand", "Start", "End"}; char *hformat = "%s: "; /* string for formatted print for headers */ char *uformat = "%s: %u%c"; /* string for formatted print for unsigned variable */ char *targName; fprintf(f, uformat, headers[0], el->match, sep); fprintf(f, uformat, headers[1], el->misMatch, sep); fprintf(f, uformat, headers[2], el->repMatch, sep); fprintf(f, uformat, headers[3], el->nCount, sep); fprintf(f, hformat, headers[4]); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->qName); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, uformat, headers[5], el->qSize, sep); fprintf(f, uformat, headers[6], el->qStart, sep); fprintf(f, uformat, headers[7], el->qEnd, sep); fprintf(f, hformat, headers[8]); if (sep == ',') fputc('"',f); /* skip leading 'chr' in string to get only chromosome part */ targName = el->tName; if (startsWith("chr", el->tName)) targName += 3; fprintf(f, "%s", targName); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, hformat, headers[9]); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->strand); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, uformat, headers[10], el->tStart, sep); fprintf(f, uformat, headers[11], el->tEnd, sep); fputc(lastSep,f); if (ferror(f)) { perror("Error writing psl file\n"); errAbort("\n"); } } struct psl *pslLoadAll(char *fileName) /* Load all psl's in file. */ { struct lineFile *lf = pslFileOpen(fileName); struct psl *pslList = NULL, *psl; while ((psl = pslNext(lf)) != NULL) { slAddHead(&pslList, psl); } slReverse(&pslList); lineFileClose(&lf); return pslList; } int pslCmpQuery(const void *va, const void *vb) /* Compare to sort based on query start. */ { const struct psl *a = *((struct psl **)va); const struct psl *b = *((struct psl **)vb); int dif; dif = strcmp(a->qName, b->qName); if (dif == 0) dif = a->qStart - b->qStart; return dif; } int pslCmpTarget(const void *va, const void *vb) /* Compare to sort based on target start. */ { const struct psl *a = *((struct psl **)va); const struct psl *b = *((struct psl **)vb); int dif; dif = strcmp(a->tName, b->tName); if (dif == 0) dif = a->tStart - b->tStart; return dif; } int pslCmpTargetAndStrand(const void *va, const void *vb) /* Compare to sort based on target, strand, tStart. */ { const struct psl *a = *((struct psl **)va); const struct psl *b = *((struct psl **)vb); int dif; dif = strcmp(a->tName, b->tName); if (dif == 0) dif = strcmp(a->strand, b->strand); if (dif == 0) dif = a->tStart - b->tStart; return dif; } int pslCmpScore(const void *va, const void *vb) /* Compare to sort based on score (descending). */ { const struct psl *a = *((struct psl **)va); const struct psl *b = *((struct psl **)vb); return pslScore(b) - pslScore(a); } int pslCmpQueryScore(const void *va, const void *vb) /* Compare to sort based on query then score (descending). */ { const struct psl *a = *((struct psl **)va); const struct psl *b = *((struct psl **)vb); int diff = strcmp(a->qName, b->qName); if (diff == 0) diff = pslScore(b) - pslScore(a); return diff; } int pslCmpMatch(const void *va, const void *vb) /* Compare to sort based on match */ { const struct psl *a = *((struct psl **)va); const struct psl *b = *((struct psl **)vb); return b->match - a->match; } static void pslLabelColumns(FILE *f) /* Write column info. */ { fputs("\n" "match\tmis- \trep. \tN's\tQ gap\tQ gap\tT gap\tT gap\tstrand\tQ \tQ \tQ \tQ \tT \tT \tT \tT \tblock\tblockSizes \tqStarts\t tStarts\n" " \tmatch\tmatch\t \tcount\tbases\tcount\tbases\t \tname \tsize\tstart\tend\tname \tsize\tstart\tend\tcount\n" "---------------------------------------------------------------------------------------------------------------------------------------------------------------\n", f); } void pslxWriteHead(FILE *f, enum gfType qType, enum gfType tType) /* Write header for extended (possibly protein) psl file. */ { fprintf(f, "psLayout version 4 %s %s\n", gfTypeName(qType), gfTypeName(tType)); pslLabelColumns(f); } void pslWriteHead(FILE *f) /* Write head of psl. */ { fputs("psLayout version 3\n", f); pslLabelColumns(f); } void pslWriteAll(struct psl *pslList, char *fileName, boolean writeHeader) /* Write a psl file from list. */ { FILE *f; struct psl *psl; f = mustOpen(fileName, "w"); if (writeHeader) pslWriteHead(f); for (psl = pslList; psl != NULL; psl = psl->next) pslTabOut(psl, f); fclose(f); } void pslxFileOpen(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf) /* Read header part of psl and make sure it's right. Return * sequence types and file handle. */ { char *line; int lineSize; char *words[30]; char *version; int wordCount; int i; enum gfType qt = gftRna, tt = gftDna; struct lineFile *lf = lineFileOpen(fileName, TRUE); if (!lineFileNext(lf, &line, &lineSize)) warn("%s is empty", fileName); else { if (startsWith("psLayout version", line)) { wordCount = chopLine(line, words); if (wordCount < 3) errAbort("%s is not a psLayout file", fileName); version = words[2]; if (sameString(version, "3")) { } else if (sameString(version, "4")) { qt = gfTypeFromName(words[3]); tt = gfTypeFromName(words[4]); } else { errAbort("%s is version %s of psLayout, this program can only handle through version 4", fileName, version); } for (i=0; i<4; ++i) { if (!lineFileNext(lf, &line, &lineSize)) errAbort("%s severely truncated", fileName); } } else { char *s = cloneString(line); while (line != NULL && line[0] == '#') { freeMem(s); lineFileNext(lf, &line, &lineSize); s = cloneString(line); } wordCount = chopLine(s, words); if (wordCount < 21 || wordCount > 23 || (words[8][0] != '+' && words[8][0] != '-')) errAbort("%s is not a psLayout file", fileName); else lineFileReuse(lf); freeMem(s); } } *retQueryType = qt; *retTargetType = tt; *retLf = lf; } static void pslxFileOpenWithMetaConfig(char *fileName, bool isMetaUnique, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f) /* Read header part of psl and make sure it's right. Return * sequence types and file handle and send meta data to output file f */ { char *line; int lineSize; char *words[30]; char *version; int wordCount; int i; enum gfType qt = gftRna, tt = gftDna; struct lineFile *lf = lineFileOpen(fileName, TRUE); lineFileSetMetaDataOutput(lf, f); if (isMetaUnique) lineFileSetUniqueMetaData(lf); if (!lineFileNext(lf, &line, &lineSize)) warn("%s is empty", fileName); else { if (startsWith("psLayout version", line)) { wordCount = chopLine(line, words); if (wordCount < 3) errAbort("%s is not a psLayout file", fileName); version = words[2]; if (sameString(version, "3")) { } else if (sameString(version, "4")) { qt = gfTypeFromName(words[3]); tt = gfTypeFromName(words[4]); } else { errAbort("%s is version %s of psLayout, this program can only handle through version 4", fileName, version); } for (i=0; i<4; ++i) { if (!lineFileNext(lf, &line, &lineSize)) errAbort("%s severely truncated", fileName); } } else { char *s = cloneString(line); boolean eof = FALSE; while ((line[0] == '#') && (!eof)) { freeMem(s); if (!lineFileNext(lf, &line, &lineSize)) eof = TRUE; s = cloneString(line); } wordCount = chopLine(s, words); if ((wordCount < 21 || wordCount > 23 || (words[8][0] != '+' && words[8][0] != '-')) && (!eof)) errAbort("%s is not a psLayout file", fileName); else lineFileReuse(lf); freeMem(s); } } *retQueryType = qt; *retTargetType = tt; *retLf = lf; } void pslxFileOpenWithMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f) /* Read header part of psl and make sure it's right. Return * sequence types and file handle and send meta data to output file f */ { pslxFileOpenWithMetaConfig(fileName, FALSE, retQueryType, retTargetType, retLf, f); } void pslxFileOpenWithUniqueMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f) /* Read header part of psl and make sure it's right. Return * sequence types and file handle and send only unique meta data to output f */ { pslxFileOpenWithMetaConfig(fileName, TRUE, retQueryType, retTargetType, retLf, f); } struct lineFile *pslFileOpen(char *fileName) /* Read header part of psl and make sure it's right. * Return line file handle to it. */ { enum gfType qt, tt; struct lineFile *lf; pslxFileOpen(fileName, &qt, &tt, &lf); return lf; } struct lineFile *pslFileOpenWithMeta(char *fileName, FILE *f) /* Read header part of psl and make sure it's right. * Return line file handle to it. */ { enum gfType qt, tt; struct lineFile *lf; pslxFileOpenWithMeta(fileName, &qt, &tt, &lf, f); return lf; } struct lineFile *pslFileOpenWithUniqueMeta(char *fileName, FILE *f) /* Read header part of psl and make sure it's right. * Set flag to suppress duplicate header comments. * Return line file handle to it. */ { enum gfType qt, tt; struct lineFile *lf; pslxFileOpenWithUniqueMeta(fileName, &qt, &tt, &lf, f); return lf; } struct psl *pslNext(struct lineFile *lf) /* Read next line from file and convert it to psl. Return * NULL at eof. */ { char *line; int lineSize; char *words[32]; int wordCount; static int lineAlloc = 0; static char *chopBuf = NULL; if (!lineFileNextReal(lf, &line)) { return NULL; } lineSize = strlen(line); if (lineSize >= lineAlloc) { lineAlloc = lineSize+256; chopBuf = needMoreMem(chopBuf, 0, lineAlloc); } memcpy(chopBuf, line, lineSize+1); wordCount = chopLine(chopBuf, words); if (wordCount == 21) { return pslLoad(words); } if (wordCount == 23) { return pslxLoad(words); } else { errAbort("Bad line %d of %s wordCount is %d instead of 21 or 23\n", lf->lineIx, lf->fileName, wordCount); return NULL; } } struct psl *pslxLoadLm(char **row, struct lm *lm) /* Load row into local memory pslx. */ { struct psl *ret = pslLoadLm(row, lm); ret->qSequence = lmAlloc(lm, sizeof(ret->qSequence[0]) * ret->blockCount); sqlStringArray(lmCloneString(lm,row[21]),ret->qSequence, ret->blockCount); ret->tSequence = lmAlloc(lm, sizeof(ret->tSequence[0]) * ret->blockCount); sqlStringArray(lmCloneString(lm,row[22]),ret->tSequence, ret->blockCount); return ret; } struct psl *pslLoadLm(char **row, struct lm *lm) /* Load row into local memory psl. */ { struct psl *ret; ret = lmAlloc(lm, sizeof(*ret)); ret->blockCount = sqlUnsigned(row[17]); ret->match = sqlUnsigned(row[0]); ret->misMatch = sqlUnsigned(row[1]); ret->repMatch = sqlUnsigned(row[2]); ret->nCount = sqlUnsigned(row[3]); ret->qNumInsert = sqlUnsigned(row[4]); ret->qBaseInsert = sqlSigned(row[5]); ret->tNumInsert = sqlUnsigned(row[6]); ret->tBaseInsert = sqlSigned(row[7]); strcpy(ret->strand, row[8]); ret->qName = lmCloneString(lm,row[9]); ret->qSize = sqlUnsigned(row[10]); ret->qStart = sqlUnsigned(row[11]); ret->qEnd = sqlUnsigned(row[12]); ret->tName = lmCloneString(lm, row[13]); ret->tSize = sqlUnsigned(row[14]); ret->tStart = sqlUnsigned(row[15]); ret->tEnd = sqlUnsigned(row[16]); ret->blockSizes = lmAlloc(lm, sizeof(ret->blockSizes[0]) * ret->blockCount); sqlUnsignedArray(row[18], ret->blockSizes, ret->blockCount); ret->qStarts = lmAlloc(lm, sizeof(ret->qStarts[0]) * ret->blockCount); sqlUnsignedArray(row[19], ret->qStarts, ret->blockCount); ret->tStarts = lmAlloc(lm, sizeof(ret->tStarts[0]) * ret->blockCount); sqlUnsignedArray(row[20], ret->tStarts, ret->blockCount); return ret; } boolean pslIsProtein(const struct psl *psl) /* is psl a protein psl (are it's blockSizes and scores in protein space) */ { int lastBlock = psl->blockCount - 1; return (((psl->strand[1] == '+' ) && (psl->tEnd == psl->tStarts[lastBlock] + 3*psl->blockSizes[lastBlock])) || ((psl->strand[1] == '-') && (psl->tStart == (psl->tSize-(psl->tStarts[lastBlock] + 3*psl->blockSizes[lastBlock]))))); } int pslCalcMilliBad(struct psl *psl, boolean isMrna) /* Calculate badness in parts per thousand. */ { int sizeMul = pslIsProtein(psl) ? 3 : 1; int qAliSize, tAliSize, aliSize; int milliBad = 0; int sizeDif; int insertFactor; int total; qAliSize = sizeMul * (psl->qEnd - psl->qStart); tAliSize = psl->tEnd - psl->tStart; aliSize = min(qAliSize, tAliSize); if (aliSize <= 0) return 0; sizeDif = qAliSize - tAliSize; if (sizeDif < 0) { if (isMrna) sizeDif = 0; else sizeDif = -sizeDif; } insertFactor = psl->qNumInsert; if (!isMrna) insertFactor += psl->tNumInsert; total = (sizeMul * (psl->match + psl->repMatch + psl->misMatch)); if (total != 0) milliBad = (1000 * (psl->misMatch*sizeMul + insertFactor + round(3*log(1+sizeDif)))) / total; return milliBad; } int pslScore(const struct psl *psl) /* Return score for psl. */ { int sizeMul = pslIsProtein(psl) ? 3 : 1; return sizeMul * (psl->match + ( psl->repMatch>>1)) - sizeMul * psl->misMatch - psl->qNumInsert - psl->tNumInsert; } int pslCmpScoreDesc(const void *va, const void *vb) /* Compare to sort based on score. */ { const struct psl *a = *((struct psl **)va); const struct psl *b = *((struct psl **)vb); return pslScore(b) - pslScore(a); } struct ffAli *pslToFakeFfAli(struct psl *psl, DNA *needle, DNA *haystack) /* Convert from psl to ffAli format. In some cases you can pass NULL * for needle and haystack - depending what the post-processing is going * to be. */ { struct ffAli *ffList = NULL, *ff; int blockCount = psl->blockCount; unsigned *blockSizes = psl->blockSizes; unsigned *qStarts = psl->qStarts; unsigned *tStarts = psl->tStarts; int size; int i; for (i=0; ileft = ffList; ffList = ff; ff->nStart = ff->nEnd = needle + qStarts[i]; ff->nEnd += size; ff->hStart = ff->hEnd = haystack + tStarts[i]; ff->hEnd += size; } ffList = ffMakeRightLinks(ffList); return ffList; } struct psl *pslFromFakeFfAli(struct ffAli *ff, DNA *needle, DNA *haystack, char strand, char *qName, int qSize, char *tName, int tSize) /* This will create a basic psl structure from a sorted series of ffAli * blocks. The fields that would need actual sequence to be filled in * are left zero however - fields including match, repMatch, mismatch. */ { struct psl *psl; unsigned *blockSizes; unsigned *qStarts; unsigned *tStarts; int blockCount; int i; int nStart, hStart; int nEnd, hEnd; AllocVar(psl); psl->blockCount = blockCount = ffAliCount(ff); psl->blockSizes = AllocArray(blockSizes, blockCount); psl->qStarts = AllocArray(qStarts, blockCount); psl->tStarts = AllocArray(tStarts, blockCount); psl->qName = cloneString(qName); psl->qSize = qSize; psl->tName = cloneString(tName); psl->tSize = tSize; psl->strand[0] = strand; for (i=0; inStart - needle; nEnd = ff->nEnd - needle; hStart = ff->hStart - haystack; hEnd = ff->hEnd - haystack; blockSizes[i] = nEnd - nStart; qStarts[i] = nStart; tStarts[i] = hStart; if (i == 0) { psl->qStart = nStart; psl->tStart = hStart; } if (i == blockCount-1) { psl->qEnd = nEnd; psl->tEnd = hEnd; } ff = ff->right; } if (strand == '-') { reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize); } return psl; } struct ffAli *pslToFfAli(struct psl *psl, struct dnaSeq *query, struct dnaSeq *target, int targetOffset) /* Convert from psl to ffAli format. Clip to parts that we actually * have sequence for. */ { struct ffAli *ffList = NULL, *ff; DNA *needle = query->dna; DNA *haystack = target->dna; int blockCount = psl->blockCount; unsigned *blockSizes = psl->blockSizes; unsigned *qStarts = psl->qStarts; unsigned *tStarts = psl->tStarts; int size; int i; int tMin = targetOffset; int tMax = targetOffset + target->size; int tStart, tEnd; int clipStart, clipEnd, clipOffset, clipSize; for (i=0; i tMin) { if (clipStart < tMin) clipStart = tMin; if (clipEnd > tMax) clipEnd = tMax; clipOffset = clipStart - tStart; clipSize = clipEnd - clipStart; AllocVar(ff); ff->left = ffList; ffList = ff; ff->nStart = ff->nEnd = needle + qStarts[i] + clipOffset; ff->nEnd += clipSize; ff->hStart = ff->hEnd = haystack + clipStart - targetOffset; ff->hEnd += clipSize; } } ffList = ffMakeRightLinks(ffList); ffCountGoodEnds(ffList); return ffList; } int pslOrientation(struct psl *psl) /* Translate psl strand + or - to orientation +1 or -1 */ { if (psl->strand[1] != '\0') { /* translated blat */ if (psl->strand[0] != psl->strand[1]) return -1; else return 1; } else { if (psl->strand[0] == '-') return -1; else return 1; } } int pslWeightedIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset) /* Return >0 if introns make it look like alignment is on + strand, * <0 if introns make it look like alignment is on - strand, * 0 if can't tell. The absolute value of the return indicates * how many splice sites we've seen supporting the orientation. * Sequence should NOT be reverse complemented. */ { int intronDir = 0; int oneDir; int i; DNA *dna = genoSeq->dna; /* code below doesn't support negative target strand (translated blat) */ if (psl->strand[1] == '-') errAbort("pslWeightedIntronOrientation doesn't support a negative target strand"); for (i=1; iblockCount; ++i) { int iStart, iEnd, blockSize = psl->blockSizes[i-1]; if (psl->qStarts[i-1] + blockSize == psl->qStarts[i]) { iStart = psl->tStarts[i-1] + psl->blockSizes[i-1] - offset; iEnd = psl->tStarts[i] - offset; oneDir = intronOrientation(dna+iStart, dna+iEnd); intronDir += oneDir; } } return intronDir; } int pslIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset) /* Return 1 if introns make it look like alignment is on + strand, * -1 if introns make it look like alignment is on - strand, * 0 if can't tell. * Sequence should NOT be reverse complemented. */ { int intronDir = pslWeightedIntronOrientation(psl, genoSeq, offset); if (intronDir < 0) intronDir = -1; else if (intronDir > 0) intronDir = 1; return intronDir; } boolean pslHasIntron(struct psl *psl, struct dnaSeq *seq, int seqOffset) /* Return TRUE if there's a probable intron. Sequence should NOT be * reverse complemented.*/ { int blockCount = psl->blockCount, i; unsigned *tStarts = psl->tStarts; unsigned *blockSizes = psl->blockSizes; unsigned *qStarts = psl->qStarts; int blockSize, start, end; DNA *dna = seq->dna; for (i=1; istrand[1] == '-') reverseIntRange(&start, &end, psl->tSize); start -= seqOffset; end -= seqOffset; if (intronOrientation(dna+start, dna+end) != 0) return TRUE; } } return FALSE; } void pslTailSizes(struct psl *psl, int *retStartTail, int *retEndTail) /* Find the length of "tails" (rather than extensions) implied by psl. */ { int orientation = pslOrientation(psl); int qFloppyStart, qFloppyEnd; int tFloppyStart, tFloppyEnd; if (orientation > 0) { qFloppyStart = psl->qStart; qFloppyEnd = psl->qSize - psl->qEnd; } else { qFloppyStart = psl->qSize - psl->qEnd; qFloppyEnd = psl->qStart; } tFloppyStart = psl->tStart; tFloppyEnd = psl->tSize - psl->tEnd; *retStartTail = min(qFloppyStart, tFloppyStart); *retEndTail = min(qFloppyEnd, tFloppyEnd); } static void rcSeqs(char **seqs, unsigned blockCount, unsigned *blockSizes) /* reverses complement sequences in list, maintain property that all strings * are in one malloc block. blockSizes should already be reversed. */ { char *buf, *next; int i, memSz = 0; /* get a new memory block for strings */ for (i = 0; i < blockCount; i++) memSz += blockSizes[i]+1; next = buf = needLargeMem(memSz); /* reverse compliment and copy to new memory block */ for (i = blockCount-1; i >= 0; i--) { int len = strlen(seqs[i]); reverseComplement(seqs[i], len); memcpy(next, seqs[i], len+1); next += len+1; } /* swap memory and update pointers */ freeMem(seqs[0]); seqs[0] = buf; next = buf; for (i = 0; i < blockCount; i++) { seqs[i] = next; next += blockSizes[i]+1; } } void pslRc(struct psl *psl) /* Reverse-complement a PSL alignment. This makes the target strand explicit. */ { unsigned tSize = psl->tSize, qSize = psl->qSize; unsigned blockCount = psl->blockCount, i; unsigned *tStarts = psl->tStarts, *qStarts = psl->qStarts, *blockSizes = psl->blockSizes; /* swap strand, forcing target to have an explict strand */ psl->strand[0] = (psl->strand[0] != '-') ? '-' : '+'; psl->strand[1] = (psl->strand[1] != '-') ? '-' : '+'; psl->strand[2] = 0; for (i=0; iqSequence != NULL) { rcSeqs(psl->qSequence, blockCount, blockSizes); rcSeqs(psl->tSequence, blockCount, blockSizes); } } /* macro to swap to variables */ #define swapVars(a, b, tmp) ((tmp) = (a), (a) = (b), (b) = (tmp)) static void swapBlocks(struct psl *psl) /* Swap the blocks in a psl without reverse complementing them. */ { int i; unsigned utmp; char *stmp; for (i = 0; i < psl->blockCount; i++) { swapVars(psl->qStarts[i], psl->tStarts[i], utmp); if (psl->qSequence != NULL) swapVars(psl->qSequence[i], psl->tSequence[i], stmp); } } static void swapRCBlocks(struct psl *psl) /* Swap and reverse complement blocks in a psl. Other psl fields must * be modified first */ { int i; unsigned *uatmp; char **satmp; reverseUnsigned(psl->tStarts, psl->blockCount); reverseUnsigned(psl->qStarts, psl->blockCount); reverseUnsigned(psl->blockSizes, psl->blockCount); swapVars(psl->tStarts, psl->qStarts, uatmp); /* qSize and tSize have already been swapped */ for (i = 0; i < psl->blockCount; i++) { psl->qStarts[i] = psl->qSize - (psl->qStarts[i] + psl->blockSizes[i]); psl->tStarts[i] = psl->tSize - (psl->tStarts[i] + psl->blockSizes[i]); } if (psl->qSequence != NULL) { /* note: all block sequences are stored in one malloc block, which is * entry zero */ rcSeqs(psl->qSequence, psl->blockCount, psl->blockSizes); rcSeqs(psl->tSequence, psl->blockCount, psl->blockSizes); swapVars(psl->qSequence, psl->tSequence, satmp); } } void pslSwap(struct psl *psl, boolean noRc) /* swap query and target in psl. If noRc is TRUE, don't reverse-complement * PSL if needed, instead make target strand explict. */ { int itmp; unsigned utmp; char ctmp, *stmp; swapVars(psl->qBaseInsert, psl->tBaseInsert, utmp); swapVars(psl->tNumInsert, psl->qNumInsert, utmp); swapVars(psl->qName, psl->tName, stmp); swapVars(psl->qSize, psl->tSize, utmp); swapVars(psl->qStart, psl->tStart, itmp); swapVars(psl->qEnd, psl->tEnd, itmp); /* handle strand and block copy */ if (psl->strand[1] != '\0') { /* translated */ swapVars(psl->strand[0], psl->strand[1], ctmp); swapBlocks(psl); } else if (noRc) { /* untranslated with no reverse complement */ psl->strand[1] = psl->strand[0]; psl->strand[0] = '+'; swapBlocks(psl); } else { /* untranslated */ if (psl->strand[0] == '+') swapBlocks(psl); else swapRCBlocks(psl); } } void pslTargetOffset(struct psl *psl, int offset) /* Add offset to target positions in psl. */ { int i, blockCount = psl->blockCount; unsigned *tStarts = psl->tStarts; psl->tStart += offset; psl->tEnd += offset; for (i=0; i\n"); fprintf(f, "psl %s:%d-%d %s %s:%d-%d %d\n", psl->qName, psl->qStart, psl->qEnd, psl->strand, psl->tName, psl->tStart, psl->tEnd, psl->blockCount); for (i=0; iblockCount; ++i) fprintf(f, " size %d, qStart %d, tStart %d\n", psl->blockSizes[i], psl->qStarts[i], psl->tStarts[i]); fprintf(f, ""); } static void pslRecalcBounds(struct psl *psl) /* Calculate qStart/qEnd tStart/tEnd at top level to be consistent * with blocks. */ { int qStart, qEnd, tStart, tEnd, size; int last = psl->blockCount - 1; qStart = psl->qStarts[0]; tStart = psl->tStarts[0]; size = psl->blockSizes[last]; qEnd = psl->qStarts[last] + size; tEnd = psl->tStarts[last] + size; if (psl->strand[0] == '-') reverseIntRange(&qStart, &qEnd, psl->qSize); if (psl->strand[1] == '-') reverseIntRange(&tStart, &tEnd, psl->tSize); psl->qStart = qStart; psl->qEnd = qEnd; psl->tStart = tStart; psl->tEnd = tEnd; } struct psl *pslTrimToTargetRange(struct psl *oldPsl, int tMin, int tMax) /* Return psl trimmed to fit inside tMin/tMax. Note this does not * update the match/misMatch and related fields. */ { int newSize; int oldBlockCount = oldPsl->blockCount; boolean tIsRc = (oldPsl->strand[1] == '-'); int newBlockCount = 0, completeBlockCount = 0; int i; struct psl *newPsl = NULL; int tMn = tMin, tMx = tMax; /* tMin/tMax adjusted for strand. */ /* Deal with case where we're completely trimmed out quickly. */ newSize = rangeIntersection(oldPsl->tStart, oldPsl->tEnd, tMin, tMax); if (newSize <= 0) return NULL; if (tIsRc) reverseIntRange(&tMn, &tMx, oldPsl->tSize); /* Count how many blocks will survive trimming. */ oldBlockCount = oldPsl->blockCount; for (i=0; itStarts[i]; int e = s + oldPsl->blockSizes[i]; int sz = e - s; int overlap; if ((overlap = rangeIntersection(s, e, tMn, tMx)) > 0) ++newBlockCount; if (overlap == sz) ++completeBlockCount; } if (newBlockCount == 0) return NULL; /* Allocate new psl and fill in what we already know. */ AllocVar(newPsl); strcpy(newPsl->strand, oldPsl->strand); newPsl->qName = cloneString(oldPsl->qName); newPsl->qSize = oldPsl->qSize; newPsl->tName = cloneString(oldPsl->tName); newPsl->tSize = oldPsl->tSize; newPsl->blockCount = newBlockCount; AllocArray(newPsl->blockSizes, newBlockCount); AllocArray(newPsl->qStarts, newBlockCount); AllocArray(newPsl->tStarts, newBlockCount); /* Fill in blockSizes, qStarts, tStarts with real data. */ newBlockCount = completeBlockCount = 0; for (i=0; iblockSizes[i]; int sz = oldSz; int tS = oldPsl->tStarts[i]; int tE = tS + sz; int qS = oldPsl->qStarts[i]; int qE = qS + sz; if (rangeIntersection(tS, tE, tMn, tMx) > 0) { int diff; if ((diff = (tMn - tS)) > 0) { tS += diff; qS += diff; sz -= diff; } if ((diff = (tE - tMx)) > 0) { tE -= diff; qE -= diff; sz -= diff; } newPsl->qStarts[newBlockCount] = qS; newPsl->tStarts[newBlockCount] = tS; newPsl->blockSizes[newBlockCount] = sz; ++newBlockCount; if (sz == oldSz) ++completeBlockCount; } } pslRecalcBounds(newPsl); return newPsl; } struct psl *pslTrimToQueryRange(struct psl *oldPsl, int qMin, int qMax) /* Return psl trimmed to fit inside qMin/qMax. Note this does not * update the match/misMatch and related fields. */ { int newSize; int oldBlockCount = oldPsl->blockCount; boolean qIsRc = (oldPsl->strand[0] == '-'); int newBlockCount = 0, completeBlockCount = 0; int i; struct psl *newPsl = NULL; int qMn = qMin, qMx = qMax; /* qMin/qMax adjusted for strand. */ /* Deal with case where we're completely trimmed out quickly. */ newSize = rangeIntersection(oldPsl->qStart, oldPsl->qEnd, qMin, qMax); if (newSize <= 0) return NULL; if (qIsRc) reverseIntRange(&qMn, &qMx, oldPsl->qSize); /* Count how many blocks will survive trimming. */ oldBlockCount = oldPsl->blockCount; for (i=0; iqStarts[i]; int e = s + oldPsl->blockSizes[i]; int sz = e - s; int overlap; if ((overlap = rangeIntersection(s, e, qMn, qMx)) > 0) ++newBlockCount; if (overlap == sz) ++completeBlockCount; } if (newBlockCount == 0) return NULL; /* Allocate new psl and fill in what we already know. */ AllocVar(newPsl); strcpy(newPsl->strand, oldPsl->strand); newPsl->qName = cloneString(oldPsl->qName); newPsl->qSize = oldPsl->qSize; newPsl->tName = cloneString(oldPsl->tName); newPsl->tSize = oldPsl->tSize; newPsl->blockCount = newBlockCount; AllocArray(newPsl->blockSizes, newBlockCount); AllocArray(newPsl->qStarts, newBlockCount); AllocArray(newPsl->tStarts, newBlockCount); /* Fill in blockSizes, qStarts, tStarts with real data. */ newBlockCount = completeBlockCount = 0; for (i=0; iblockSizes[i]; int sz = oldSz; int qS = oldPsl->qStarts[i]; int qE = qS + sz; int tS = oldPsl->tStarts[i]; int tE = tS + sz; if (rangeIntersection(qS, qE, qMn, qMx) > 0) { int diff; if ((diff = (qMn - qS)) > 0) { tS += diff; qS += diff; sz -= diff; } if ((diff = (qE - qMx)) > 0) { tE -= diff; qE -= diff; sz -= diff; } newPsl->qStarts[newBlockCount] = qS; newPsl->tStarts[newBlockCount] = tS; newPsl->blockSizes[newBlockCount] = sz; ++newBlockCount; if (sz == oldSz) ++completeBlockCount; } } pslRecalcBounds(newPsl); return newPsl; } char* pslGetCreateSql(char* table, unsigned options, int tNameIdxLen) /* Get SQL required to create PSL table. Options is a bit set consisting * of PSL_TNAMEIX, PSL_WITH_BIN, and PSL_XA_FORMAT. tNameIdxLen is * the number of characters in target name to index. If greater than * zero, must specify PSL_TNAMEIX. If zero and PSL_TNAMEIX is specified, * to will default to 8. */ { struct dyString *sqlCmd = newDyString(2048); char *sqlCmdStr; char binIx[32]; binIx[0] = '\0'; /* check and default tNameIdxLen */ if ((tNameIdxLen > 0) && !(options & PSL_TNAMEIX)) errAbort("pslGetCreateSql: must specify PSL_TNAMEIX with tNameIdxLen > 0"); if ((options & PSL_TNAMEIX) && (tNameIdxLen == 0)) tNameIdxLen = 8; /* setup tName and bin index fields */ if (options & PSL_WITH_BIN) { if (options & PSL_TNAMEIX) safef(binIx, sizeof(binIx), "INDEX(tName(%d),bin),\n", tNameIdxLen); else safef(binIx, sizeof(binIx), "INDEX(bin),\n"); } else if (options & PSL_TNAMEIX) safef(binIx, sizeof(binIx), "INDEX(tName(%d)),\n", tNameIdxLen); dyStringPrintf(sqlCmd, createString, table, ((options & PSL_WITH_BIN) ? "bin smallint unsigned not null,\n" : "")); if (options & PSL_XA_FORMAT) { dyStringPrintf(sqlCmd, "qSeq longblob not null,\n"); dyStringPrintf(sqlCmd, "tSeq longblob not null,\n"); } dyStringPrintf(sqlCmd, indexString, binIx); sqlCmdStr = cloneString(sqlCmd->string); dyStringFree(&sqlCmd); return sqlCmdStr; } static void printPslDesc(char* pslDesc, FILE* out, struct psl* psl) /* print description of a PSL on first error */ { fprintf(out, "Error: invalid PSL: %s:%u-%u %s:%u-%u %s %s\n", psl->qName, psl->qStart, psl->qEnd, psl->tName, psl->tStart, psl->tEnd, psl->strand, pslDesc); } static void chkError(char* pslDesc, FILE* out, struct psl* psl, int* errCount, char* format, ...) /* forward needed to specify printf signature for gcc checking */ #if defined(__GNUC__) __attribute__((format(printf, 5, 6))) #endif ; static void chkError(char* pslDesc, FILE* out, struct psl* psl, int* errCount, char* format, ...) /* error handling on an pslCheck error, counting error and issuing description * of PSL on the first error. */ { if (*errCount == 0) printPslDesc(pslDesc, out, psl); va_list args; va_start(args, format); vfprintf(out, format, args); va_end(args); (*errCount)++; } static void chkBlkRanges(char* pslDesc, FILE* out, struct psl* psl, char* pName, char* pLabel, char pCLabel, char pStrand, unsigned pSize, unsigned pStart, unsigned pEnd, unsigned iBlk, unsigned* blockSizes, unsigned* pBlockStarts, int* errCount) /* check the target or query ranges in a PSL incrementing errorCnt */ { unsigned blkStart = pBlockStarts[iBlk]; unsigned blkEnd = blkStart+blockSizes[iBlk]; /* translate stand to genomic coords */ unsigned gBlkStart = (pStrand == '+') ? blkStart : (pSize - blkEnd); unsigned gBlkEnd = (pStrand == '+') ? blkEnd : (pSize - blkStart); if ((pSize > 0) && (blkEnd > pSize)) chkError(pslDesc, out, psl, errCount, "\t%s %s block %u end %u > %cSize %u\n", pName, pLabel, iBlk, blkEnd, pCLabel, pSize); if (gBlkStart < pStart) chkError(pslDesc, out, psl, errCount, "\t%s %s block %u start %u < %cStart %u\n", pName, pLabel, iBlk, gBlkStart, pCLabel, pStart); if (gBlkStart >= pEnd) chkError(pslDesc, out, psl, errCount, "\t%s %s block %u start %u >= %cEnd %u\n", pName, pLabel, iBlk, gBlkStart, pCLabel, pEnd); if (gBlkEnd < pStart) chkError(pslDesc, out, psl, errCount, "\t%s %s block %u end %u < %cStart %u\n", pName, pLabel, iBlk, gBlkEnd, pCLabel, pStart); if (gBlkEnd > pEnd) chkError(pslDesc, out, psl, errCount, "\t%s %s block %u end %u > %cEnd %u\n", pName, pLabel, iBlk, gBlkEnd, pCLabel, pEnd); if (iBlk > 0) { unsigned prevBlkEnd = pBlockStarts[iBlk-1]+blockSizes[iBlk-1]; if (blkStart < prevBlkEnd) chkError(pslDesc, out, psl, errCount, "\t%s %s block %u start %u < previous block end %u\n", pName, pLabel, iBlk, blkStart, prevBlkEnd); } } static void chkRanges(char* pslDesc, FILE* out, struct psl* psl, char* pName, char* pLabel, char pCLabel, char pStrand, unsigned pSize, unsigned pStart, unsigned pEnd, unsigned blockCount, unsigned* blockSizes, unsigned* pBlockStarts, int blockSizeMult, int* errCount) /* check the target or query ranges in a PSL, increment errorCnt */ { unsigned iBlk; if (pStart >= pEnd) chkError(pslDesc, out, psl, errCount, "\t%s %cStart %u >= %cEnd %u\n", pName, pCLabel, pStart, pCLabel, pEnd); if (pEnd > pSize) chkError(pslDesc, out, psl, errCount, "\t%s %cEnd %u >= %cSize %u\n", pName, pCLabel, pEnd, pCLabel, pSize); // check that block start/end matches overall start end unsigned pStartStrand = pStart, pEndStrand = pEnd; if (pStrand != '+') reverseUnsignedRange(&pStartStrand, &pEndStrand, pSize); unsigned lastBlkEnd = pBlockStarts[blockCount-1] + (blockSizeMult * blockSizes[blockCount-1]); if ((pStartStrand != pBlockStarts[0]) || (pEndStrand != lastBlkEnd)) chkError(pslDesc, out, psl, errCount, "\t%s strand \"%c\" adjusted %cStart-%cEnd range %u-%u != block range %u-%u\n", pName, pStrand, pCLabel, pCLabel, pStartStrand, pEndStrand, pBlockStarts[0], lastBlkEnd); for (iBlk = 0; iBlk < blockCount; iBlk++) chkBlkRanges(pslDesc, out, psl, pName, pLabel, pCLabel, pStrand, pSize, pStart, pEnd, iBlk, blockSizes, pBlockStarts, errCount); } int pslCheck(char *pslDesc, FILE* out, struct psl* psl) /* Validate a PSL for consistency. pslDesc is printed the error messages * to file out (open /dev/null to discard). Return count of errors. */ { static char* VALID_STRANDS[] = { "+", "-", "++", "+-", "-+", "--", NULL }; int i, errCount = 0; int tBlockSizeMult = pslIsProtein(psl) ? 3 : 1; /* check strand value */ for (i = 0; VALID_STRANDS[i] != NULL; i++) { if (strcmp(psl->strand, VALID_STRANDS[i]) == 0) break; } if (VALID_STRANDS[i] == NULL) chkError(pslDesc, out, psl, &errCount, "\tinvalid PSL strand: \"%s\"\n", psl->strand); /* check target */ chkRanges(pslDesc, out, psl, psl->tName, "target", 't', pslTStrand(psl), psl->tSize, psl->tStart, psl->tEnd, psl->blockCount, psl->blockSizes, psl->tStarts, tBlockSizeMult, &errCount); /* check query */ chkRanges(pslDesc, out, psl, psl->qName, "query", 'q', pslQStrand(psl), psl->qSize, psl->qStart, psl->qEnd, psl->blockCount, psl->blockSizes, psl->qStarts, 1, &errCount); return errCount; } struct hash *readPslToBinKeeper(char *sizeFileName, char *pslFileName) /* read a list of psls and return results in hash of binKeeper structure for fast query*/ { struct binKeeper *bk; struct psl *psl; struct lineFile *sf = lineFileOpen(sizeFileName, TRUE); struct lineFile *pf = lineFileOpen(pslFileName , TRUE); struct hash *hash = newHash(0); char *chromRow[2]; char *row[21] ; while (lineFileRow(sf, chromRow)) { char *name = chromRow[0]; int size = lineFileNeedNum(sf, chromRow, 1); if (hashLookup(hash, name) != NULL) warn("Duplicate %s, ignoring all but first\n", name); else { bk = binKeeperNew(0, size); assert(size > 1); hashAdd(hash, name, bk); } } while (lineFileNextRow(pf, row, ArraySize(row))) { psl = pslLoad(row); bk = hashMustFindVal(hash, psl->tName); binKeeperAdd(bk, psl->tStart, psl->tEnd, psl); } lineFileClose(&pf); lineFileClose(&sf); return hash; } static boolean isDelChar(char c) /* is this a indel character? */ { return (c == '-') || (c == '.') || (c == '=') || (c == '_'); } static void trimAlignment(struct psl* psl, char** qStrPtr, char** tStrPtr, int* aliSizePtr) /* remove leading or trailing indels from alignment */ { char* qStr = *qStrPtr; char* tStr = *tStrPtr; int aliSize = *aliSizePtr; // skip lending indels while ((aliSize > 0) && (isDelChar(*qStr) || isDelChar(*tStr))) { if (!isDelChar(*qStr)) psl->qStart++; else if (!isDelChar(*tStr)) psl->tStart++; qStr++; tStr++; aliSize--; } // skip trailing indels while ((aliSize > 0) && (isDelChar(qStr[aliSize-1]) || isDelChar(tStr[aliSize-1]))) { if (!isDelChar(qStr[aliSize-1])) psl->qEnd--; else if (!isDelChar(tStr[aliSize-1])) psl->tEnd--; aliSize--; } *qStrPtr = qStr; *tStrPtr = tStr; *aliSizePtr = aliSize; } static void addBlock(struct psl* psl, int qs, int qe, int ts, int te, int *blockSpace) /* add a block to the psl, growing if necessary */ { assert((qe-qs) == (te-ts)); /* same lengths? */ if (psl->blockCount == *blockSpace) pslGrow(psl, blockSpace); psl->blockSizes[psl->blockCount] = qe - qs; psl->qStarts[psl->blockCount] = qs; psl->tStarts[psl->blockCount] = ts; psl->blockCount++; } static void accumCounts(struct psl *psl, char prevQ, char prevT, char q, char t, unsigned options) /* accumulate block and base counts */ { if (!isDelChar(q) && !isDelChar(t)) { /* aligned column. */ char qu = toupper(q); char tu = toupper(t); if ((q == 'N') || (t == 'N')) psl->nCount++; else if (qu == tu) { if ((options & PSL_IS_SOFTMASK) && ((qu != q) || (tu != t))) psl->repMatch++; else psl->match++; } else psl->misMatch++; } else if (isDelChar(q) && !isDelChar(t)) { /* target insert */ psl->tBaseInsert++; if (!isDelChar(prevQ)) psl->tNumInsert++; } else if (isDelChar(t) && !isDelChar(q)) { /* query insert */ psl->qBaseInsert++; if (!isDelChar(prevT)) psl->qNumInsert++; } } struct psl* pslFromAlign(char *qName, int qSize, int qStart, int qEnd, char *qString, char *tName, int tSize, int tStart, int tEnd, char *tString, char* strand, unsigned options) /* Create a PSL from an alignment. Options PSL_IS_SOFTMASK if lower case * bases indicate repeat masking. Returns NULL if alignment is empty after * triming leading and trailing indels.*/ { /* Note, the unit tests for these programs exercise this function: * hg/embossToPsl * hg/mouseStuff/axtToPsl * hg/spideyToPsl * utils/est2genomeToPsl */ int blockSpace = 16; struct psl* psl = NULL; int aliSize = strlen(qString); boolean eitherInsert = FALSE; /* True if either in insert state. */ int i, qs,qe,ts,te; char prevQ = '\0', prevT = '\0'; AllocVar(psl); if (strlen(tString) != aliSize) errAbort("query and target alignment strings are different lengths"); psl = pslNew(qName, qSize, qStart, qEnd, tName, tSize, tStart, tEnd, strand, blockSpace, 0); trimAlignment(psl, &qString, &tString, &aliSize); /* Don't create if either query or target is zero length after trim */ if ((psl->qStart == psl->qEnd) || (psl->tStart == psl->tEnd)) { pslFree(&psl); return NULL; } /* Get range of alignment in strand-specified coordinates */ qs = psl->qStart; qe = psl->qEnd; if (strand[0] == '-') reverseIntRange(&qs, &qe, psl->qSize); ts = psl->tStart; te = psl->tEnd; if (strand[1] == '-') reverseIntRange(&ts, &te, psl->tSize); eitherInsert = FALSE; qe = qs; /* current block coords */ te = ts; for (i=0; i 0); psl->qName = cloneString(qName); psl->qSize = qSize; psl->qStart = qStart; psl->qEnd = qEnd; psl->tName = cloneString(tName); psl->tSize = tSize; psl->tStart = tStart; psl->tEnd = tEnd; strncpy(psl->strand, strand, 2); AllocArray(psl->blockSizes, blockSpace); AllocArray(psl->qStarts, blockSpace); AllocArray(psl->tStarts, blockSpace); if (opts & PSL_XA_FORMAT) { AllocArray(psl->qSequence, blockSpace); AllocArray(psl->tSequence, blockSpace); } return psl; } void pslGrow(struct psl *psl, int *blockSpacePtr) /* Increase memory allocated to a psl to hold more blocks. blockSpacePtr * should point the the current maximum number of blocks and will be * updated to with the new amount of space. */ { int blockSpace = *blockSpacePtr; int newSpace = 2 * blockSpace; ExpandArray(psl->blockSizes, blockSpace, newSpace); ExpandArray(psl->qStarts, blockSpace, newSpace); ExpandArray(psl->tStarts, blockSpace, newSpace); if (psl->qSequence != NULL) { ExpandArray(psl->qSequence, blockSpace, newSpace); ExpandArray(psl->tSequence, blockSpace, newSpace); } *blockSpacePtr = newSpace; } static boolean getNextCigarOp(char **ptr, char *op, int *size) /* parts the next cigar op */ { char *str = *ptr; if ((str == NULL) || (*str == 0)) return FALSE; char *end = strchr(str, '+'); if (end) { *end = 0; *ptr = end + 1; } else *ptr = NULL; *op = *str++; *size = atoi(str); return TRUE; } struct psl* pslFromGff3Cigar(char *qName, int qSize, int qStart, int qEnd, char *tName, int tSize, int tStart, int tEnd, char* strand, char *cigar) /* create a PSL from a GFF3-style cigar formatted alignment */ { // this is currently tested by the gff3ToPsl int blocksAlloced = 4; struct psl *psl = pslNew(qName, qSize, qStart, qEnd, tName, tSize, tStart, tEnd, strand, blocksAlloced, 0); char cigarSpec[strlen(cigar+1)]; // copy since parsing is destructive strcpy(cigarSpec, cigar); char *cigarNext = cigarSpec; char op; int size; int qNext = qStart, qBlkEnd = qEnd; if (strand[0] == '-') reverseIntRange(&qNext, &qBlkEnd, qSize); int tNext = tStart, tBlkEnd = tEnd; if (strand[1] == '-') reverseIntRange(&tNext, &tBlkEnd, tSize); while(getNextCigarOp(&cigarNext, &op, &size)) { switch (op) { case 'M': // match or mismatch (gapless aligned block) if (psl->blockCount == blocksAlloced) pslGrow(psl, &blocksAlloced); psl->blockSizes[psl->blockCount] = size; psl->qStarts[psl->blockCount] = qNext; psl->tStarts[psl->blockCount] = tNext; psl->blockCount++; tNext += size; qNext += size; break; case 'I': // inserted in target tNext += size; break; case 'D': // deleted from target qNext += size; break; default: errAbort("unrecognized CIGAR op %c in %s", op, cigar); } } assert(qNext == qBlkEnd); assert(tNext == tBlkEnd); return psl; } int pslRangeTreeOverlap(struct psl *psl, struct rbTree *rangeTree) /* Return amount that psl overlaps (on target side) with rangeTree. */ { int i; int overlap = 0; boolean isRc = (psl->strand[1] == '-'); for (i=0; iblockCount; ++i) { int start = psl->tStarts[i]; int end = start + psl->blockSizes[i]; if (isRc) reverseIntRange(&start, &end, psl->tSize); overlap += rangeTreeOverlapSize(rangeTree, start, end); } return overlap; } float pslIdent(struct psl *psl) /* computer fraction identity */ { float aligned = psl->match + psl->misMatch + psl->repMatch; if (aligned == 0.0) return 0.0; else return ((float)(psl->match + psl->repMatch))/aligned; } float pslQueryAligned(struct psl *psl) /* compute fraction of query that was aligned */ { float aligned = psl->match + psl->misMatch + psl->repMatch; return aligned/(float)psl->qSize; }