/* hgNetDist - Gene/Interaction Network Distance path length calculator for GS Galt Barber 2005-05-08 This code assumes the network is bi-directional (undirected). */ #include #include "common.h" #include "linefile.h" #include "hash.h" #include "math.h" #include "options.h" #include "hdb.h" boolean skipFirst=FALSE; boolean weighted=FALSE; float threshold=2.0; static struct optionSpec options[] = { {"skipFirst", OPTION_BOOLEAN}, {"weighted", OPTION_BOOLEAN}, {"threshold", OPTION_FLOAT}, {NULL, 0}, }; void usage() /* Explain usage and exit. */ { errAbort( "hgNetDist - GS loader for gene/protein interaction network distances.\n" "usage:\n" " hgNetDist input.tab output.tab\n" "This will read the input.tab file and calculate the distances\n" "between all the genes/proteins in the (interaction) network and store\n" "the results in the output.tab specified, for use with hgLoadNetDist.\n" "Input .tab file format is 3 columns: gene1 gene2 value.\n" "The value is the edge-weight/distance but is ignored by default, treated as 1.0.\n" "Output .tab file format is 3 columns: gene1 gene2 pathLength.\n" "options:\n" " -skipFirst - skip the first header row in input.\n" " -weighted - count the 3rd column value as network edge weight.\n" " -threshold=9.9 - maximum network distance allowed, default=%f.\n" , threshold ); } int numEdges = 0; int numGenes = 0; char **geneIds; /* dynamically allocated array of strings */ struct hash *geneHash = NULL; struct hash *aliasHash = NULL; struct hash *missingHash = NULL; /* Floyd-Warshall dyn prg arrays */ float *d; /* previous d at step k */ float *dd; /* current d at step k+1 */ #define INFINITE -1.0 /* --- the fields needed from input tab file ---- */ struct edge /* Information about gene/protein network interaction This code assumes the edges are bi-directional. */ { struct edge *next; /* Next in singly linked list. */ char *geneA; /* geneId */ char *geneB; /* geneId */ float weight; /* distance or weight (default value is 1) */ }; int slEdgeCmp(const void *va, const void *vb) /* Compare two slNames. */ { const struct edge *a = *((struct edge **)va); const struct edge *b = *((struct edge **)vb); return strcmp(a->geneA, b->geneA); } int slEdgeCmpWeight(const void *va, const void *vb) /* Compare two slNames. */ { const struct edge *a = *((struct edge **)va); const struct edge *b = *((struct edge **)vb); if (a->weight > b->weight) return 1; if (a->weight < b->weight) return -1; return 0; } struct edge *edgeLoad(char **row) /* Load an edge from row fetched from linefile * Dispose of this with edgeFree(). */ { struct edge *ret; AllocVar(ret); ret->geneA = cloneString(row[0]); ret->geneB = cloneString(row[1]); ret->weight = weighted ? atof(row[2]) : 1.0; return ret; } void edgeFree(struct edge **pEl) /* Free an edge */ { struct edge *el; if ((el = *pEl) == NULL) return; freeMem(el->geneA); freeMem(el->geneB); freez(pEl); } void edgeFreeList(struct edge **pList) /* Free a list of dynamically allocated edges */ { struct edge *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; edgeFree(&el); } *pList = NULL; } struct edge *readEdges(char *fileName) /* read edges from file */ { struct lineFile *lf=NULL; struct edge *list=NULL, *el; char *row[3]; int rowCount=3; lf=lineFileOpen(fileName, TRUE); if (lf == NULL) { return NULL; } if (skipFirst) { /* skip first header row */ lineFileNextRowTab(lf, row, rowCount); } while (lineFileNextRowTab(lf, row, rowCount)) { el = edgeLoad(row); slAddHead(&list,el); } lineFileClose(&lf); return list; } void hgNetDist(char *inTab, char *outTab) { struct edge *edges = NULL; struct edge *edge = NULL; int k=0, i=0, j=0; float *dij=NULL, *dik=NULL, *dkj=NULL, *ddij=NULL, *tempswap=NULL; FILE *f=NULL; verbose(2,"inTab=%s outTab=%s \n", inTab, outTab); verbose(2,"reading edges %s \n",inTab); edges = readEdges(inTab); if (edges == NULL) { errAbort("readEdges returned NULL for %s.\n",inTab); } numEdges = slCount(edges); verbose(2,"slCount(edges)=%d for %s \n",numEdges,inTab); if (weighted) { slSort(&edges, slEdgeCmpWeight); verbose(2,"slSort done for %s \n",inTab); } /* for now, default to using numEdges for size estimate since we don't know numGenes at this point */ /* estimate numGenes worst case as numEdges*2 as all disjoint pairs edges */ geneHash = newHash(logBase2((numEdges*2))); geneIds = needMem((numEdges*2)*sizeof(char*)); verbose(2,"beginning processing data %s ... \n",inTab); for(edge = edges; edge; edge = edge->next) { int n = 0; n = hashIntValDefault(geneHash,edge->geneA,-1); if (n == -1) { geneIds[numGenes] = edge->geneA; hashAddInt(geneHash, edge->geneA, numGenes++); } n = hashIntValDefault(geneHash,edge->geneB,-1); if (n == -1) { geneIds[numGenes] = edge->geneB; hashAddInt(geneHash, edge->geneB, numGenes++); } } verbose(2,"number of nodes=%d \n",numGenes); /* allocate arrays for Floyd-Warshall */ d = (float *) needHugeMem(numGenes*numGenes*sizeof(float)); dd = (float *) needHugeMem(numGenes*numGenes*sizeof(float)); /* initialize first array d */ /* clear values to "INFINITE" */ for(i=0;inext) { int ia = 0, ib = 0; ia = hashIntVal(geneHash,edge->geneA); ib = hashIntVal(geneHash,edge->geneB); dij = d+(ia*numGenes)+ib; /* filter - initialize each unique pair only once, * priority least weighted distance due to sort above */ if ((*dij == INFINITE) || (*dij==0.0 && ia==ib)) { *dij = edge->weight; /* repeat b/c non-directional undirected graph assumption */ dij = d+(ib*numGenes)+ia; *dij = edge->weight; } } /* repeat k times through the Floyd-Warshall dyn prg algorithm */ for(k=0;k= 0.0) && (*dij <= threshold)) { char *gi=NULL, *gj=NULL; gi=geneIds[i]; gj=geneIds[j]; fprintf(f,"%s\t%s\t%f\n",gi,gj,*dij); } } } carefulClose(&f); edgeFreeList(&edges); freeMem(d); freeMem(dd); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); skipFirst = optionExists("skipFirst"); weighted = optionExists("weighted"); threshold = optionFloat("threshold", threshold); hgNetDist(argv[1],argv[2]); verbose(2,"\ndone.\n"); return 0; }