/* vcf - stuff to handle VCF stuff in table browser. */ #include "common.h" #include "hgTables.h" #include "asFilter.h" #include "hubConnect.h" #include "asParse.h" #include "hgBam.h" #include "linefile.h" #include "localmem.h" #include "obscure.h" #include "vcf.h" #include "web.h" #if (defined USE_TABIX && defined KNETFILE_HOOKS) #include "knetUdc.h" #include "udc.h" #endif//def USE_TABIX && KNETFILE_HOOKS #define VCFDATALINE_NUM_COLS 10 boolean isVcfTable(char *table) /* Return TRUE if table corresponds to a VCF file. */ { struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table); if (tdb) return tdbIsVcf(tdb); else return trackIsType(database, table, curTrack, "vcfTabix", ctLookupName); } struct hTableInfo *vcfToHti(char *table) /* Get standard fields of VCF into hti structure. */ { struct hTableInfo *hti; AllocVar(hti); hti->rootName = cloneString(table); hti->isPos= TRUE; strcpy(hti->chromField, "chrom"); strcpy(hti->startField, "pos"); strcpy(hti->nameField, "id"); hti->type = cloneString("vcfTabix"); return hti; } struct slName *vcfGetFields(char *table) /* Get fields of vcf as simple name list. */ { struct asObject *as = vcfAsObj(); struct slName *names = asColNames(as); return names; } struct sqlFieldType *vcfListFieldsAndTypes() /* Get fields of bigBed as list of sqlFieldType. */ { struct asObject *as = vcfAsObj(); struct sqlFieldType *list = sqlFieldTypesFromAs(as); return list; } void dyJoin(struct dyString *dy, char *delim, char **words, int wordCount) /* Clear dy and append wordCount words interspersed with delim. */ { dyStringClear(dy); int i; for (i = 0; i < wordCount; i++) { if (i > 0) dyStringAppend(dy, delim); dyStringAppend(dy, words[i]); } } void vcfInfoElsToString(struct dyString *dy, struct vcfFile *vcff, struct vcfRecord *rec) /* Unpack rec's typed infoElements to semicolon-sep'd string in dy.*/ { dyStringClear(dy); if (rec->infoCount == 0) dyStringAppendC(dy, '.'); int i; for (i = 0; i < rec->infoCount; i++) { if (i > 0) dyStringAppendC(dy, ';'); const struct vcfInfoElement *el = &(rec->infoElements[i]); const struct vcfInfoDef *def = vcfInfoDefForKey(vcff, el->key); enum vcfInfoType type = def? def->type : vcfInfoNoType; dyStringAppend(dy, el->key); if (el->count > 0) dyStringAppendC(dy, '='); int j; for (j = 0; j < el->count; j++) { if (j > 0) dyStringAppendC(dy, ','); if (el->missingData[j]) { dyStringAppend(dy, "."); continue; } union vcfDatum dat = el->values[j]; switch (type) { case vcfInfoInteger: dyStringPrintf(dy, "%d", dat.datInt); break; case vcfInfoFloat: { // use big precision and erase trailing zeros: char fbuf[64]; safef(fbuf, sizeof(fbuf), "%.16lf", dat.datFloat); int i; for (i = strlen(fbuf) - 1; i > 0; i--) if (fbuf[i] == '0') fbuf[i] = '\0'; else break; dyStringAppend(dy, fbuf); } break; case vcfInfoCharacter: dyStringAppendC(dy, dat.datChar); break; case vcfInfoFlag: // Flags could have values in older VCF case vcfInfoNoType: case vcfInfoString: dyStringAppend(dy, dat.datString); break; default: errAbort("Invalid vcfInfoType %d (how did this get past parser?", type); } } } } #define VCF_NUM_BUF_SIZE 256 void vcfRecordToRow(struct vcfRecord *rec, char *chrom, char *numBuf, struct dyString *dyAlt, struct dyString *dyFilter, struct dyString *dyInfo, struct dyString *dyGt, char *row[VCFDATALINE_NUM_COLS]) /* Convert vcfRecord to array of strings, using numBuf to store ascii * versions of numbers and dyStrings to store stringified arrays. */ { struct vcfFile *vcff = rec->file; char *numPt = numBuf; row[0] = chrom; // VCF files' chrom field often lacks "chr" -- use region->chrom from caller. row[1] = numPt; numPt += safef(numPt, VCF_NUM_BUF_SIZE - (numPt-numBuf), "%u", rec->chromStart+1); numPt++; row[2] = rec->name; row[3] = rec->alleles[0]; dyJoin(dyAlt, ",", &(rec->alleles[1]), rec->alleleCount-1); row[4] = dyAlt->string; row[5] = rec->qual; dyJoin(dyFilter, ";", rec->filters, rec->filterCount); row[6] = dyFilter->string; vcfInfoElsToString(dyInfo, vcff, rec); row[7] = dyInfo->string; if (vcff->genotypeCount > 0) { row[8] = rec->format; dyJoin(dyGt, "\t", rec->genotypeUnparsedStrings, vcff->genotypeCount); row[9] = dyGt->string; } else row[8] = row[9] = ""; // compatible with localmem usage } void vcfTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f) /* Print out selected fields from VCF. If fields is NULL, then print out all fields. */ { struct hTableInfo *hti = NULL; hti = getHti(db, table, conn); struct hash *idHash = NULL; char *idField = getIdField(db, curTrack, table, hti); int idFieldNum = 0; /* if we know what field to use for the identifiers, get the hash of names */ if (idField != NULL) idHash = identifierHash(db, table); if (f == NULL) f = stdout; /* Convert comma separated list of fields to array. */ int fieldCount = chopByChar(fields, ',', NULL, 0); char **fieldArray; AllocArray(fieldArray, fieldCount); chopByChar(fields, ',', fieldArray, fieldCount); /* Get list of all fields in big bed and turn it into a hash of column indexes keyed by * column name. */ struct hash *fieldHash = hashNew(0); struct slName *bb, *bbList = vcfGetFields(table); int i; for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i) { /* if we know the field for identifiers, save it away */ if ((idField != NULL) && sameString(idField, bb->name)) idFieldNum = i; hashAddInt(fieldHash, bb->name, i); } /* Create an array of column indexes corresponding to the selected field list. */ int *columnArray; AllocArray(columnArray, fieldCount); for (i=0; i 0); region = region->next) { char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, region->chrom); struct vcfFile *vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end, 100, maxOut); if (vcff == NULL) noWarnAbort(); // If we are outputting all fields, but this VCF has no genotype info, omit the // genotype columns from output: if (allFields && vcff->genotypeCount == 0) fieldCount = VCFDATALINE_NUM_COLS - 2; if (!printedHeader) { fprintf(f, "%s", vcff->headerString); if (filter) fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList)); if (!allFields) { fprintf(f, "#%s", fieldArray[0]); for (i=1; irecords; rec != NULL && (maxOut > 0); rec = rec->next) { vcfRecordToRow(rec, region->chrom, numBuf, dyAlt, dyFilter, dyInfo, dyGt, row); if (asFilterOnRow(filter, row)) { /* if we're looking for identifiers, check if this matches */ if ((idHash != NULL) && (hashLookup(idHash, row[idFieldNum]) == NULL)) continue; // All fields output: after asFilter'ing, preserve original VCF chrom if (allFields && !sameString(rec->chrom, region->chrom)) row[0] = rec->chrom; int i; fprintf(f, "%s", row[columnArray[0]]); for (i=1; ichrom, region->start, region->end, 100, *pMaxOut); if (vcff == NULL) noWarnAbort(); struct lm *lm = lmInit(0); char *row[VCFDATALINE_NUM_COLS]; char numBuf[VCF_NUM_BUF_SIZE]; // Temporary storage for row-ification: struct dyString *dyAlt = newDyString(1024); struct dyString *dyFilter = newDyString(1024); struct dyString *dyInfo = newDyString(1024); struct dyString *dyGt = newDyString(1024); struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) { vcfRecordToRow(rec, region->chrom, numBuf, dyAlt, dyFilter, dyInfo, dyGt, row); if (asFilterOnRow(filter, row)) { if ((idHash != NULL) && (hashLookup(idHash, rec->name) == NULL)) continue; struct bed *bed; lmAllocVar(bedLm, bed); bed->chrom = lmCloneString(bedLm, region->chrom); bed->chromStart = rec->chromStart; bed->chromEnd = rec->chromEnd; bed->name = lmCloneString(bedLm, rec->name); slAddHead(pBedList, bed); } (*pMaxOut)--; if (*pMaxOut <= 0) break; } dyStringFree(&dyAlt); dyStringFree(&dyFilter); dyStringFree(&dyInfo); dyStringFree(&dyGt); lmCleanup(&lm); vcfFileFree(&vcff); } struct bed *vcfGetFilteredBedsOnRegions(struct sqlConnection *conn, char *db, char *table, struct region *regionList, struct lm *lm, int *retFieldCount) /* Get list of beds from VCF, in all regions, that pass filtering. */ { int maxOut = bigFileMaxOutput(); /* Figure out vcf file name get column info and filter. */ struct asObject *as = vcfAsObj(); struct asFilter *filter = asFilterFromCart(cart, db, table, as); struct hash *idHash = identifierHash(db, table); /* Get beds a region at a time. */ struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table); struct bed *bedList = NULL; struct region *region; for (region = regionList; region != NULL; region = region->next) { char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, region->chrom); if (fileName == NULL) continue; addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash, &maxOut); freeMem(fileName); if (maxOut <= 0) { warn("Reached output limit of %d data values, please make region smaller,\n" "\tor set a higher output line limit with the filter settings.", bigFileMaxOutput()); break; } } slReverse(&bedList); return bedList; } struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count) /* Return some semi-random IDs from a VCF file. */ { /* Read 10000 items from vcf file, or if they ask for a big list, then 4x what they ask for. */ struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table); char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, hDefaultChrom(database)); struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE); if (lf == NULL) noWarnAbort(); int orderedCount = count * 4; if (orderedCount < 100) orderedCount = 100; struct slName *idList = NULL; char *words[4]; int i; for (i = 0; i < orderedCount && lineFileChop(lf, words); i++) { // compress runs of identical ID, in case most are placeholder if (i == 0 || !sameString(words[2], idList->name)) slAddHead(&idList, slNameNew(words[2])); } lineFileClose(&lf); /* Shuffle list and trim it to count if necessary. */ shuffleList(&idList, 1); struct slName *sl; for (sl = idList, i = 0; sl != NULL; sl = sl->next, i++) { if (i+1 >= count) { slNameFreeList(&(sl->next)); break; } } freez(&fileName); return idList; } #define VCF_MAX_SCHEMA_COLS 20 void showSchemaVcf(char *table, struct trackDb *tdb) /* Show schema on vcf. */ { struct sqlConnection *conn = hAllocConn(database); char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, hDefaultChrom(database)); struct asObject *as = vcfAsObj(); hPrintf("Database: %s", database); hPrintf("    Primary Table: %s
", table); hPrintf("VCF File: %s", fileName); hPrintf("
\n"); hPrintf("Format description: %s
", as->comment); hPrintf("See the Variant Call Format specification for more details
\n", "http://www.1000genomes.org/wiki/analysis/vcf4.0"); /* Put up table that describes fields. */ hTableStart(); hPrintf("field"); hPrintf("description "); puts("\n"); struct asColumn *col; int colCount = 0; for (col = as->columnList; col != NULL; col = col->next) { hPrintf("%s", col->name); hPrintf("%s", col->comment); ++colCount; } hTableEnd(); /* Put up another section with sample rows. */ webNewSection("Sample Rows"); hTableStart(); /* Fetch sample rows. */ struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE); if (lf == NULL) noWarnAbort(); char *row[VCF_MAX_SCHEMA_COLS]; int i; for (i = 0; i < 10; i++) { int colCount = lineFileChop(lf, row); int colIx; if (i == 0) { // Print field names as column headers, using colCount to compute genotype span hPrintf(""); for (colIx = 0, col = as->columnList; col != NULL && colIx < colCount; colIx++, col = col->next) { if (sameString("genotypes", col->name) && colCount > colIx+1) hPrintf("%s", colCount - colIx, col->name); else hPrintf("%s", col->name); } hPrintf("\n"); } hPrintf(""); for (colIx=0; colIx < colCount; ++colIx) { if (colCount > VCFDATALINE_NUM_COLS && colIx == colCount - 1) hPrintf("..."); else writeHtmlCell(row[colIx]); } hPrintf("\n"); } hTableEnd(); printTrackHtml(tdb); /* Clean up and go home. */ lineFileClose(&lf); freeMem(fileName); hFreeConn(&conn); }