/* hgFlyBase - Parse FlyBase genes.txt file and turn it into a couple of * tables. See http://flybase.org/.data/docs/refman/refman-B.html for a * description of these files. We don't try too hard to extract and * normalize every piece of data here, just the particularly valuable parts * we can't find elsewhere. In particular we want to get information on * wild-type function, phenotypes, and synonyms. * * Flybase records are separated by a line with a '#'. The fields * are line separated and each begin with a two character code that * defines the line type. Most fields start with '*' and a letter. * The first field is always '*a' and contains the gene symbol. * A particularly important and complex field is the '*E' field. * This describes a literature reference and data derived from it. * Subfields of the *E field start with % instead of * and are followed * by a single letter which means the same as when the field is * standalone. * * The '*A' field, indicating an allele, is also important and * constains subfields. This subfields start with '$'. If an * *A field has a $E subfield, the $E's sub-sub-fields start with * a '&'. * * From this we want to extract the following field types: * z - flybase ID * a - gene symbol * d - biological role of gene product * e - gene name * f - cellular compartment of which gene product is a component * i - synonyms (just for genes, not for alleles) * r - wild-type biological role * p - phenotypes of genes * A - allele with subfields * E - reference with subfields * F - function of gene product * * We'll make this into the following tables: * fbGene * fbSynonym (This includes gene symbol and name) * fbAllele * fbRef (These are not cleaned up from flybase) * fbRole * The allele or reference ID's may be zero indicating * the annotation belongs the gene as a whole or flybase * respectively. * fbPhenotype * fbGo * Note that we are ignoring the GO terms since we'll get them * from SwissProt, where the data is in a somewhat more regular * format. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dystring.h" #include "portable.h" #include "hdb.h" #include "hgRelate.h" char *tabDir = "."; boolean doLoad; char *geneTable = "bdgpGene"; boolean doTranscript = TRUE; void usage() /* Explain usage and exit. */ { errAbort( "hgFlyBase - Parse FlyBase genes.txt file and turn it into a couple of tables\n" "usage:\n" " hgFlyBase database genes.txt\n" "options:\n" " -geneTable=tbl - Use tbl instead of default %s\n" " -tab=dir - Output tab-separated files to directory.\n" " -noLoad - If true don't load database and don't clean up tab files\n" , geneTable ); } static struct optionSpec options[] = { {"tab", OPTION_STRING}, {"noLoad", OPTION_BOOLEAN}, {"geneTable", OPTION_STRING}, {NULL, 0}, }; char *ungreekTable[] = { "agr", "alpha", "bgr", "beta", "dgr", "delta", "egr", "epsilon", "eegr", "eta", "ggr", "gamma", "igr", "iota", "ohgr", "omega", "thgr", "theta", "zgr", "zeta", }; char *hideTagTable[] = { "up", "/up", "down", "/down", }; char *spelledGreek(char *abbr) /* Convert from 'agr;' to 'alpha' or return NULL */ { int i; for (i=0; i'); if (e != NULL) { int size = e - s; if (size < 8); { int i; char tag[8]; memcpy(tag, s, size); tag[size] = 0; for (i=0; istring); dyStringFree(&dy); return result; } struct dyString *suckSameLines(struct lineFile *lf, char *line) /* Suck up lines concatenating as long as they begin with the same * first two characters as initial line. */ { struct dyString *dy = dyStringNew(0); char c1 = line[0], c2 = line[1]; dyStringAppend(dy, line+3); while (lineFileNext(lf, &line, NULL)) { if (line[0] != c1 || line[1] != c2) { lineFileReuse(lf); break; } dyStringAppend(dy, line+2); } return dy; } struct ref /* Keep track of reference. */ { int id; /* Reference ID. */ }; void remakeTables(struct sqlConnection *conn) /* Remake all our tables. */ { sqlRemakeTable(conn, "fbGene", "#Links FlyBase IDs, gene symbols and gene names\n" "CREATE TABLE fbGene (\n" " geneId varchar(255) not null, # FlyBase ID\n" " geneSym varchar(255) not null, # Short gene symbol\n" " geneName varchar(255) not null, # Gene name - up to a couple of words\n" " #Indices\n" " PRIMARY KEY(geneId(11)),\n" " INDEX(geneSym(8)),\n" " INDEX(geneName(12))\n" ")\n"); sqlRemakeTable(conn, "fbTranscript", "#Links FlyBase gene IDs and BDGP transcripts\n" "CREATE TABLE fbTranscript (\n" " geneId varchar(255) not null, # FlyBase ID\n" " transcriptId varchar(255) not null, # BDGP Transcript ID\n" " #Indices\n" " PRIMARY KEY(transcriptId(11)),\n" " INDEX(transcriptId(11))\n" ")\n"); sqlRemakeTable(conn, "fbSynonym", "#Links all the names we call a gene to it's flybase ID\n" "CREATE TABLE fbSynonym (\n" " geneId varchar(255) not null, # FlyBase ID\n" " name varchar(255) not null, # A name (synonym or real\n" " #Indices\n" " INDEX(geneId(11)),\n" " INDEX(name(12))\n" ")\n"); sqlRemakeTable(conn, "fbAllele", "#The alleles of a gene\n" "CREATE TABLE fbAllele (\n" " id int not null, # Allele ID\n" " geneId varchar(255) not null, # Flybase ID of gene\n" " name varchar(255) not null, # Allele name\n" " #Indices\n" " PRIMARY KEY(id),\n" " INDEX(geneId(11))\n" ")\n"); sqlRemakeTable(conn, "fbRef", "#A literature or sometimes database reference\n" "CREATE TABLE fbRef (\n" " id int not null, # Reference ID\n" " text longblob not null, # Usually begins with flybase ref ID, but not always\n" " #Indices\n" " PRIMARY KEY(id)\n" ")\n"); sqlRemakeTable(conn, "fbRole", "#Role of gene in wildType\n" "CREATE TABLE fbRole (\n" " geneId varchar(255) not null, # Flybase Gene ID\n" " fbAllele int not null, # ID in fbAllele table or 0 if not allele-specific\n" " fbRef int not null, # ID in fbRef table\n" " text longblob not null, # Descriptive text\n" " #Indices\n" " INDEX(geneId(11))\n" ")\n"); sqlRemakeTable(conn, "fbPhenotype", "#Observed phenotype in mutant. Sometimes contains gene function info\n" "CREATE TABLE fbPhenotype (\n" " geneId varchar(255) not null, # Flybase Gene ID\n" " fbAllele int not null, # ID in fbAllele table or 0 if not allele-specific\n" " fbRef int not null, # ID in fbRef table\n" " text longblob not null, # Descriptive text\n" " #Indices\n" " INDEX(geneId(11))\n" ")\n"); sqlRemakeTable(conn, "fbGo", "#Links FlyBase gene IDs and GO IDs/aspects\n" "CREATE TABLE fbGo (\n" " geneId varchar(255) not null, # FlyBase ID\n" " goId varchar(255) not null, # GO ID\n" " aspect varchar(255) not null, # P (process), F (function) or C (cellular component)" " #Indices\n" " INDEX(geneId(11)),\n" " INDEX(goId(10))\n" ")\n"); sqlRemakeTable(conn, "fbUniProt", "#Links FlyBase gene IDs and UniProt IDs/aspects\n" "CREATE TABLE fbUniProt (\n" " geneId varchar(255) not null, # FlyBase ID\n" " uniProtId varchar(255) not null, # UniProt ID\n" " #Indices\n" " INDEX(geneId(11)),\n" " INDEX(uniProtId(6))\n" ")\n"); } struct geneAlt /* Gene and list of isoforms. */ { struct geneAlt *next; char *fbName; /* Flybase gene name. */ char *bdgpName; /* BDGP Gene name. */ struct slName *isoformList; /* List of BDGP isoforms. */ }; void getAllSplices(char *database, FILE *f) /* Write out table linking flybase genes with BDGP transcripts -- * unfortunately bdgpGeneInfo lacks -R* transcript/isoform identifiers, * so strip those off of bdgpGene.name. * This is not necessary with flyBaseGene/flyBase2004Xref where -R*'s * are preserved. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char query[256], **row; struct geneAlt *altList = NULL, *alt; struct hash *bdgpHash = newHash(16); /* Keyed by bdgp gene id. */ struct slName *n; /* First build up list of all genes with flybase and bdgp ids. */ safef(query, sizeof(query), "select bdgpName,flyBaseId from bdgpGeneInfo"); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { AllocVar(alt); alt->bdgpName = cloneString(row[0]); alt->fbName = cloneString(row[1]); slAddHead(&altList, alt); hashAdd(bdgpHash, alt->bdgpName, alt); } sqlFreeResult(&sr); slReverse(&altList); /* Now associate splicing variants. */ safef(query, sizeof(query), "select name from %s", geneTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *s = row[0]; char *e = rStringIn("-R", s); int size = e ? (e - s) : strlen(s); char bdgpGene[16]; if (size >= sizeof(bdgpGene)) errAbort("'%s' too big", s); memcpy(bdgpGene, s, size); bdgpGene[size] = 0; alt = hashMustFindVal(bdgpHash, bdgpGene); n = slNameNew(s); slAddTail(&alt->isoformList, n); } sqlFreeResult(&sr); sqlDisconnect(&conn); for (alt = altList; alt != NULL; alt = alt->next) { for (n = alt->isoformList; n != NULL; n = n->next) fprintf(f, "%s\t%s\n", alt->fbName, n->name); } freeHash(&bdgpHash); } void hgFlyBase(char *database, char *genesFile) /* hgFlyBase - Parse FlyBase genes.txt file and turn it into a couple of * tables. */ { char *tGene = "fbGene"; char *tSynonym = "fbSynonym"; char *tAllele = "fbAllele"; char *tRef = "fbRef"; char *tRole = "fbRole"; char *tPhenotype = "fbPhenotype"; char *tTranscript = "fbTranscript"; char *tGo = "fbGo"; char *tUniProt = "fbUniProt"; FILE *fGene = hgCreateTabFile(tabDir, tGene); FILE *fSynonym = hgCreateTabFile(tabDir, tSynonym); FILE *fAllele = hgCreateTabFile(tabDir, tAllele); FILE *fRef = hgCreateTabFile(tabDir, tRef); FILE *fRole = hgCreateTabFile(tabDir, tRole); FILE *fPhenotype = hgCreateTabFile(tabDir, tPhenotype); FILE *fTranscript = NULL; FILE *fGo = hgCreateTabFile(tabDir, tGo); FILE *fUniProt = hgCreateTabFile(tabDir, tUniProt); struct lineFile *lf = lineFileOpen(genesFile, TRUE); struct hash *refHash = newHash(19); int nextRefId = 0; int nextAlleleId = 0; char *line, sub, type, *rest, *s; char *geneSym = NULL, *geneName = NULL, *geneId = NULL; int recordCount = 0; struct slName *synList = NULL, *syn; int curAllele = 0, curRef = 0; struct ref *ref = NULL; struct sqlConnection *conn; struct hash *goUniqHash = newHash(18); /* Make table from flybase genes to BGDP transcripts. */ if (doTranscript) { fTranscript = hgCreateTabFile(tabDir, tTranscript); getAllSplices(database, fTranscript); } /* Make dummy reference for flybase itself. */ fprintf(fRef, "0\tFlyBase\n"); /* Loop through parsing and writing tab files. */ while (lineFileNext(lf, &line, NULL)) { sub = line[0]; if (sub == '#') { /* End of record. */ ++recordCount; if (geneId == NULL) errAbort("Record without *z line ending line %d of %s", lf->lineIx, lf->fileName); /* Write out synonyms. */ s = naForNull(geneSym); geneSym = ungreek(s); freeMem(s); s = naForNull(geneName); geneName = ungreek(s); if (! sameString(s, "n/a")) freeMem(s); if (geneSym != NULL && !sameString(geneSym, "n/a")) slNameStore(&synList, geneSym); if (geneName != NULL && !sameString(geneName, "n/a")) slNameStore(&synList, geneName); for (syn = synList; syn != NULL; syn = syn->next) { s = ungreek(syn->name); fprintf(fSynonym, "%s\t%s\n", geneId, s); freeMem(s); } /* Write out gene record. */ fprintf(fGene, "%s\t%s\t%s\n", geneId, geneSym, geneName); /* Clean up. */ freez(&geneSym); freez(&geneName); freez(&geneId); slFreeList(&synList); ref = NULL; curRef = curAllele = 0; continue; } else if (sub == 0) errAbort("blank line %d of %s, not allowed in gene.txt", lf->lineIx, lf->fileName); else if (isalnum(sub)) errAbort("line %d of %s begins with %c, not allowed", lf->lineIx, lf->fileName, sub); type = line[1]; rest = trimSpaces(line+2); if (sub == '*' && type == 'a') geneSym = cloneString(rest); else if (sub == '*' && type == 'e') geneName = cloneString(rest); else if (sub == '*' && type == 'z') { geneId = cloneString(rest); if (!startsWith("FBgn", geneId)) errAbort("Bad FlyBase gene ID %s line %d of %s", geneId, lf->lineIx, lf->fileName); } else if (type == 'i' && (sub == '*' || sub == '$')) { if (strlen(rest) > 2) /* Avoid short useless ones. */ slNameStore(&synList, rest); } else if (sub == '*' && type == 'A') { if (geneId == NULL) errAbort("Allele before geneId line %d of %s", lf->lineIx, lf->fileName); curAllele = ++nextAlleleId; fprintf(fAllele, "%d\t%s\t%s\n", curAllele, geneId, rest); if (!sameString(rest, "classical") && !sameString(rest, "in vitro") && !sameString(rest, "wild-type") ) { slNameStore(&synList, rest); } } else if (sub == '*' && type == 'm') { if (geneId == NULL) errAbort("*m protein ID before geneId line %d of %s", lf->lineIx, lf->fileName); if (startsWith("UniProt", rest)) { char *ptr = strchr(rest, ':'); if (ptr != NULL) ptr++; else errAbort("Trouble parsing UniProt ID %s like %d of %s", rest, lf->lineIx, lf->fileName); fprintf(fUniProt, "%s\t%s\n", geneId, ptr); } } else if (type == 'E') { ref = hashFindVal(refHash, rest); if (ref == NULL) { AllocVar(ref); ref->id = ++nextRefId; hashAdd(refHash, rest, ref); subChar(rest, '\t', ' '); fprintf(fRef, "%d\t%s\n", ref->id, rest); } curRef = ref->id; } else if ((type == 'k' || type == 'r' || type == 'p') && sub != '@') { FILE *f = (type == 'r' ? fRole : fPhenotype); struct dyString *dy = suckSameLines(lf, line); subChar(dy->string, '\t', ' '); if (geneId == NULL) errAbort("Expecting *z in record before line %d of %s", lf->lineIx, lf->fileName); fprintf(f, "%s\t%d\t%d\t%s\n", geneId, curAllele, curRef, dy->string); dyStringFree(&dy); } else if (type == 'd' || type == 'f' || type == 'F') { FILE *f = fGo; char aspect = (type == 'd') ? 'P' : (type == 'f') ? 'C' : 'F'; char *goId = rest; char *p = strstr(goId, " ; "); char assoc[128]; if (p == NULL) continue; else goId = firstWordInLine(p + 3); safef(assoc, sizeof(assoc), "%s.%s", geneId, goId); if (hashLookup(goUniqHash, assoc) == NULL) { hashAddInt(goUniqHash, assoc, 1); fprintf(f, "%s\t%s\t%c\n", geneId, goId, aspect); } } } printf("Processed %d records in %d lines\n", recordCount, lf->lineIx); lineFileClose(&lf); conn = sqlConnect(database); remakeTables(conn); if (doLoad) { printf("Loading %s\n", tGene); hgLoadTabFile(conn, tabDir, tGene, &fGene); if (doTranscript) { printf("Loading %s\n", tTranscript); hgLoadTabFile(conn, tabDir, tTranscript, &fTranscript); } printf("Loading %s\n", tSynonym); hgLoadTabFile(conn, tabDir, tSynonym, &fSynonym); printf("Loading %s\n", tAllele); hgLoadTabFile(conn, tabDir, tAllele, &fAllele); printf("Loading %s\n", tRef); hgLoadTabFile(conn, tabDir, tRef, &fRef); printf("Loading %s\n", tRole); hgLoadTabFile(conn, tabDir, tRole, &fRole); printf("Loading %s\n", tPhenotype); hgLoadTabFile(conn, tabDir, tPhenotype, &fPhenotype); printf("Loading %s\n", tGo); hgLoadTabFile(conn, tabDir, tGo, &fGo); printf("Loading %s\n", tUniProt); hgLoadTabFile(conn, tabDir, tUniProt, &fUniProt); hgRemoveTabFile(tabDir, tGene); if (doTranscript) hgRemoveTabFile(tabDir, tTranscript); hgRemoveTabFile(tabDir, tSynonym); hgRemoveTabFile(tabDir, tAllele); hgRemoveTabFile(tabDir, tRef); hgRemoveTabFile(tabDir, tRole); hgRemoveTabFile(tabDir, tPhenotype); hgRemoveTabFile(tabDir, tGo); hgRemoveTabFile(tabDir, tUniProt); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); doLoad = !optionExists("noLoad"); if (optionExists("tab")) { tabDir = optionVal("tab", tabDir); makeDir(tabDir); } geneTable = optionVal("geneTable", geneTable); doTranscript = sameString(geneTable, "bdgpGene"); hgFlyBase(argv[1], argv[2]); return 0; }