/* txGeneXref - Make kgXref type table for genes.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "jksql.h" #include "spDb.h" #include "cdsPick.h" #include "txInfo.h" #include "txRnaAccs.h" void usage() /* Explain usage and exit. */ { errAbort( "txGeneXref - Make kgXref type table for genes.\n" "usage:\n" " txGeneXref genomeDb tempDb uniProtDb gene.gp gene.info genes.picks genes.ev output.xref\n" "where:\n" " genomeDb - a browser database (ex. hg19)\n" " tempDb - the database where the gene data is getting built" " uniProtDb - UniProt database containing data on the relevant proteins" " gene.gp - the gene prediction file, containing the gene structures" " gene.info - summary information on the gene structures" " gene.picks - information on the CDS region picked for the gene, if any" " gene.ev - evidence on which txWalk based the gene structure" " output.xref - the output file" "options:\n" " -xxx=XXX\n" ); } static struct optionSpec options[] = { {NULL, 0}, }; void removePickVersions(struct cdsPick *pick) /* Remove version suffixes from various pick fields. */ { chopSuffix(pick->refProt); chopSuffix(pick->refSeq); } struct hash *makeGeneToProtHash(char *fileName) /* Create hash that links gene name to protein name. * Feed this in extended gene pred.*/ { struct hash *hash = newHash(18); char *row[11]; struct lineFile *lf = lineFileOpen(fileName, TRUE); while (lineFileRowTab(lf, row)) hashAdd(hash, row[0], cloneString(row[10])); lineFileClose(&lf); return hash; } boolean isTrna(char *sourceAcc) /* isTrna - determine if the gene is a tRNA gene predictioon */ { /* If the gene is a tRNA gene prediction from the Lowe lab, * then its name will contain the word 'tRNA' immediately after * a period. */ if (stringIn(".tRNA", sourceAcc)) return(TRUE); else return(FALSE); } boolean isRfam(char *sourceAcc) /* isRfam - determine if the sequence comes from Rfam */ { /* If the sequence comes from Rfam, its source accession should begin * with the letters RF, and then should have a string of digits * followed by a semicolon (and subsequent characters). No other * known accessions match this pattern. */ /* Note: At this time (9/15/11), Rfam accessions include five digits after * the RF. But I wouldn't want to bet on that never changing... */ int ii; if (strlen(sourceAcc) >= 3) { if (sourceAcc[0] == 'R' && sourceAcc[1] == 'F' && isdigit(sourceAcc[2])) { for (ii = 3; ii < strlen(sourceAcc); ii++) { if (!isdigit(sourceAcc[ii]) && sourceAcc[ii] != ';') { break; } if (sourceAcc[ii] == ';') { return(TRUE); } } } } return(FALSE); } void getNameDescrFromRefSeq(char *refseq, struct sqlConnection *gConn, char **geneSymbol, char **description) /* Given a RefSeq accession, get the gene symbol and description */ { char query[256]; struct sqlResult *sr; safef(query, sizeof(query), "select name,product from refLink where mrnaAcc='%s'", refseq); sr = sqlGetResult(gConn, query); char **row = sqlNextRow(sr); if (row != NULL) *geneSymbol = cloneString(row[0]); sqlFreeResult(&sr); safef(query, sizeof(query), "select d.name from gbCdnaInfo g, description d where g.description = d.id and g.acc = '%s'", refseq); sr = sqlGetResult(gConn, query); row = sqlNextRow(sr); if (row != NULL) *description = cloneString(row[0]); sqlFreeResult(&sr); } void txGeneXref(char *genomeDb, char *tempDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, char *evFile, char *outFile) /* txGeneXref - Make kgXref type table for genes.. */ { /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct hash *geneToProtHash = makeGeneToProtHash(genePredFile); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(pickFile, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); removePickVersions(pick); hashAdd(pickHash, pick->name, pick); } /* Load evidence into hash */ struct hash *evHash = newHash(18); struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile); for (ev = evList; ev != NULL; ev = ev->next) hashAdd(evHash, ev->name, ev); /* Open connections to our databases */ struct sqlConnection *gConn = sqlConnect(genomeDb); struct sqlConnection *tConn = sqlConnect(tempDb); struct sqlConnection *uConn = sqlConnect(uniProtDb); /* Read in info file, and loop through it to make out file. */ struct txInfo *info, *infoList = txInfoLoadAll(infoFile); FILE *f = mustOpen(outFile, "w"); for (info = infoList; info != NULL; info = info->next) { char *kgID = info->name; char *mRNA = ""; char *spID = ""; char *spDisplayID = ""; char *geneSymbol = NULL; char *refseq = ""; char *protAcc = ""; char *description = NULL; char *rfamAcc = ""; char *tRnaName = ""; char query[256]; char *proteinId = hashMustFindVal(geneToProtHash, info->name); boolean isAb = sameString(info->category, "antibodyParts"); pick = hashFindVal(pickHash, info->name); ev = hashFindVal(evHash, info->name); /* Fill in gene symbol and description from an overlapping refseq if possible. */ struct sqlResult *sr; safef(query, sizeof(query), "select value from knownToRefSeq where name='%s'", kgID); sr = sqlGetResult(tConn, query); char **row = sqlNextRow(sr); if (row != NULL) { getNameDescrFromRefSeq(row[0], gConn, &geneSymbol, &description); } sqlFreeResult(&sr); if (pick != NULL) { /* Fill in the relatively straightforward fields. */ refseq = pick->refSeq; if (info->orfSize > 0) { protAcc = pick->refProt; spID = proteinId; if (sameString(protAcc, spID)) spID = pick->uniProt; if (spID[0] != 0) spDisplayID = spAnyAccToId(uConn, spID); } if (refseq[0] != 0 && (geneSymbol == NULL || description == NULL)) { getNameDescrFromRefSeq(refseq, gConn, &geneSymbol, &description); } /* If need be try uniProt for gene symbol and description. */ if (spID[0] != 0 && (geneSymbol == NULL || description == NULL)) { char *acc = spLookupPrimaryAcc(uConn, spID); if (description == NULL) description = spDescription(uConn, acc); if (geneSymbol == NULL) { struct slName *nameList = spGenes(uConn, acc); if (nameList != NULL) geneSymbol = cloneString(nameList->name); slFreeList(&nameList); } } } /* If it's an Rfam, split out the tokens from the semicolon-delimited * sourceAcc. The first token will be the Rfam accession, the second * token will be used as the gene symbol, and the description will be * derived from the third symbol (contig ID) and Rfam accession */ if (isRfam(info->sourceAcc)) { struct slName *accessionTokens = slNameListFromString(info->sourceAcc, ';'); assert(accessionTokens != NULL); rfamAcc = cloneString(accessionTokens->name); assert(accessionTokens->next != NULL); char *accession = replaceChars(accessionTokens->next->name, "-", "_"); geneSymbol = replaceChars(accession, "Alias=", ""); freeMem(accession); if (isalpha(*geneSymbol)) *geneSymbol = toupper(*geneSymbol); assert(accessionTokens->next->next != NULL); char *contigCoordinates = replaceChars(accessionTokens->next->next->name, "Note=", ""); struct dyString *dyTmp = dyStringCreate("Rfam model %s hit found at contig region %s", rfamAcc, contigCoordinates); description = dyStringCannibalize(&dyTmp); freeMem(contigCoordinates); slFreeList(&accessionTokens); } /* If it's a tRNA from the tRNA track, sourceAcc will have the following * form: chr.tRNA- where C identifies the chromosome, N * identifies the tRNA gene on the chromosome (so C and N together identify * the tRNA track item), AA identifies the amino acid (or is Pseudo in the * case of a predicted pseudogene), and AC is the anticodon. Use "TRNA_" * as the gene symbol and a gene description of "transfer RNA (anticodon ), * and put the sourceAcc in the tRnaName field */ else if (isTrna(info->sourceAcc)) { tRnaName = cloneString(info->sourceAcc); description = (char *) needMem(50); if (stringIn("Pseudo", tRnaName)) { char antiCodon[4]; (void) strncpy(antiCodon, strchr(tRnaName, '-') + strlen("Pseudo") + 1,3); geneSymbol = cloneString("TRNA_Pseudo"); sprintf(description, "transfer RNA pseudogene (anticodon %s)", &antiCodon[0]); } else { char aminoAcid[4], antiCodon[4]; (void) strncpy(antiCodon, strchr(tRnaName, '-') + 4, 3); (void) strncpy(aminoAcid, strchr(tRnaName, '-') + 1, 3); (void) strncpy(antiCodon, strchr(tRnaName, '-') + 4, 3); geneSymbol = catTwoStrings("TRNA_", aminoAcid); sprintf(description, "transfer RNA %s (anticodon %s)", &aminoAcid[0], &antiCodon[0]); } } else { /* If it's an antibody fragment use that as name. */ if (isAb) { geneSymbol = cloneString("abParts"); description = cloneString("Parts of antibodies, mostly variable regions."); isAb = TRUE; } if (ev == NULL) { mRNA = cloneString(""); if (!isAb) { errAbort("%s is %s but not %s\n", info->name, infoFile, evFile); } } else { mRNA = cloneString(ev->primary); chopSuffix(mRNA); } /* Still no joy? Try genbank RNA records. * First, try to get the symbol and description from * the same record. Don't do any of this if there's a RefSeq. * GenBank records are last resort, and if there's a RefSeq with * a blank description, going to GenBank records for the description * involves too much risk of a gene symbol and description that * contradict each other. */ if (geneSymbol == NULL && description == NULL && strcmp(refseq, "") != 0) { if (ev != NULL) { int i; for (i=0; iaccCount; ++i) { char *acc = ev->accs[i]; chopSuffix(acc); if (geneSymbol == NULL && description == NULL) { safef(query, sizeof(query), "select geneName.name from gbCdnaInfo,geneName " "where geneName.id=gbCdnaInfo.geneName " "and geneName.name != 'n/a'" "and gbCdnaInfo.description > 0 " "and gbCdnaInfo.acc = '%s'", acc); geneSymbol = sqlQuickString(gConn, query); if (geneSymbol != NULL) { safef(query, sizeof(query), "select description.name " "from gbCdnaInfo,description " "where description.id=gbCdnaInfo.description " "and gbCdnaInfo.acc = '%s'", acc); description = sqlQuickString(gConn, query); if (description != NULL) { if (sameString(description, "n/a")) description = NULL; } } } } } } } /* And if that failed too, try getting them from different records * and HOPING that they match... */ if (geneSymbol == NULL || description == NULL) { if (ev != NULL) { int i; for (i=0; iaccCount; ++i) { char *acc = ev->accs[i]; chopSuffix(acc); if (geneSymbol == NULL) { safef(query, sizeof(query), "select geneName.name from gbCdnaInfo,geneName " "where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc); geneSymbol = sqlQuickString(gConn, query); if (geneSymbol != NULL) { if (sameString(geneSymbol, "n/a")) geneSymbol = NULL; } } if (description == NULL) { safef(query, sizeof(query), "select description.name from gbCdnaInfo,description " "where description.id=gbCdnaInfo.description " "and gbCdnaInfo.acc = '%s'", acc); description = sqlQuickString(gConn, query); if (description != NULL) { if (sameString(description, "n/a")) description = NULL; } } } } } if (geneSymbol == NULL) geneSymbol = mRNA; if (description == NULL) description = mRNA; /* Get rid of some characters that will cause havoc downstream. */ stripChar(geneSymbol, '\''); subChar(geneSymbol, '<', '['); subChar(geneSymbol, '>', ']'); /* Abbreviate geneSymbol if too long */ if (strlen(geneSymbol) > 40) strcpy(geneSymbol+37, "..."); fprintf(f, "%s\t", kgID); fprintf(f, "%s\t", mRNA); fprintf(f, "%s\t", spID); fprintf(f, "%s\t", spDisplayID); fprintf(f, "%s\t", geneSymbol); fprintf(f, "%s\t", refseq); fprintf(f, "%s\t", protAcc); fprintf(f, "%s\t", description); fprintf(f, "%s\t", rfamAcc); fprintf(f, "%s\n", tRnaName); } carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 9) usage(); txGeneXref(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8]); return 0; }