/* probeMatcher - Program to sort out which probes belong to which splicing isoform. */ #include "common.h" #include "bed.h" #include "altGraphX.h" #include "hdb.h" #include "options.h" #include "affyProbe.h" void usage() { errAbort("probeMatcher - Program to associate probes with splicing isoforms.\n" "usage:\n" " probeMatcher \n"); } struct hash *constHash = NULL; struct agProbeSets /* Little info about the probe sets for altGraphX. */ { char *chrom; /* Chromosome. */ int chromStart; /* Chromosome Start. */ int chromEnd; /* Chromosome End. */ int type; /* Type of splicing event 2==cassette. */ char strand[3]; /* Genomic Strand. */ char *name; /* Name from altSplice clustering. */ int maxCounts; /* Maximum possible counts, used for * memory management. */ int contProbeCount; /* Number of constitutive probe sets. */ char **contProbeSets; /* Names of constitutive probe sets. */ int alt1ProbeCount; /* Number of alternate1 probe sets. */ char **alt1ProbeSets; /* Names of alternate1 probe sets. */ int alt2ProbeCount; /* Number of alternate2 probe sets. */ char **alt2ProbeSets; /* Names of alternate2 probe sets. */ int transcriptCount; /* Number of altMerge transcripts involved. */ char **transcriptNames; /* Names of altMerge transcripts. */ }; void occassionalDot() /* Write out a dot every 20 times this is called. */ { static int dotMod = 100; static int dot = 100; if (--dot <= 0) { putc('.', stdout); fflush(stdout); dot = dotMod; } } struct hash *makeBedHash(char *bedFile) { struct bed *bed=NULL, *bedNext = NULL, *bedList = NULL; struct bed *item = NULL; struct hash *bedHash = newHash(12); bedList = bedLoadAll(bedFile); for(bed = bedList; bed != NULL; bed = bedNext) { bedNext = bed->next; if(bed->score != 6) continue; item = hashFindVal(bedHash, bed->name); bed->next = NULL; if(item == NULL) { hashAdd(bedHash, bed->name, bed); } else { slAddTail(&item, bed); } } return bedHash; } struct agProbeSets *newAgP(struct altGraphX *ag) /* Allocated and fill in some initial values for agProbeSets. */ { struct agProbeSets *agp = NULL; AllocVar(agp); agp->chrom = cloneString(ag->tName); agp->chromStart = ag->tStart; agp->chromEnd = ag->tEnd; agp->type = ag->id; agp->strand[0] = ag->strand[0]; agp->name = cloneString(ag->name); agp->maxCounts = 10; AllocArray(agp->contProbeSets, agp->maxCounts); AllocArray(agp->alt1ProbeSets, agp->maxCounts); AllocArray(agp->alt2ProbeSets, agp->maxCounts); agp->transcriptCount = ag->mrnaRefCount; agp->transcriptNames = ag->mrnaRefs; return agp; } char ** expandAgp(struct agProbeSets *agp, char **probeSets ) { char **cT=agp->contProbeSets, **a1=agp->alt1ProbeSets, **a2 = agp->alt1ProbeSets; ExpandArray(agp->contProbeSets, agp->maxCounts, 2*agp->maxCounts); ExpandArray(agp->alt1ProbeSets, agp->maxCounts, 2*agp->maxCounts); ExpandArray(agp->alt2ProbeSets, agp->maxCounts, 2*agp->maxCounts); agp->maxCounts *= 2; if(probeSets == cT) return agp->contProbeSets; else if(probeSets == a1) return agp->alt1ProbeSets; else if(probeSets == a2) return agp->alt2ProbeSets; errAbort("Not sure which probeSet was being used."); return NULL; } void addProbeSets(struct altGraphX *ag, enum ggEdgeType type, int chromStart, int chromEnd, struct sqlConnection *conn, char *junctionTable, char *nonJunctionTable, struct agProbeSets *agp, int *count, char **probeSets) /* Add the relevant probe sets to probeSets for edge in ag. */ { char where[256]; struct sqlResult *sr = NULL; char **row = NULL; int *vPos = ag->vPositions; int *starts = ag->edgeStarts; int *ends = ag->edgeEnds; char *table = NULL; int rowOffset = 0; boolean unique = TRUE; struct bed *bedList = NULL, *bed = NULL; int i=0; struct affyProbe *affyProbe=NULL, *affyProbeList=NULL; /* Junctions have to be handled a bit differently. The numbers we need are going to be inside the data type and difficult to select for. have to get everything and sort it out afterwards. */ if(type == ggSJ) { safef(where, sizeof(where), " chromStart >= %d and chromEnd <= %d and blockCount = 2 ", chromStart-25, chromEnd+25); table = junctionTable; sr = hRangeQuery(conn, table, ag->tName, chromStart, chromEnd, where, &rowOffset); while((row = sqlNextRow(sr)) != NULL) { affyProbe = affyProbeLoad(row+rowOffset); slAddHead(&affyProbeList, affyProbe); } sqlFreeResult(&sr); for(affyProbe=affyProbeList; affyProbe != NULL; affyProbe = affyProbe->next) { /* If it is a juction it must line up with the starts and stops. */ if(type == ggSJ && ((affyProbe->chromStart+affyProbe->chromStarts[0]+affyProbe->blockSizes[0] != chromStart) || (affyProbe->chromStart+affyProbe->chromStarts[1] != chromEnd))) continue; safef(where, sizeof(where), "%s:%s", affyProbe->psName, affyProbe->seq); for(i=0; i<*count; i++) { if(sameString(probeSets[i], where)) unique = FALSE; } if(unique) { if(*count+1 > agp->maxCounts) probeSets = expandAgp(agp, probeSets); probeSets[*count] = cloneString(where); (*count)++; } } affyProbeFreeList(&affyProbe); } else { table = nonJunctionTable; sr = hRangeQuery(conn, table, ag->tName, chromStart, chromEnd, NULL, &rowOffset); while((row = sqlNextRow(sr)) != NULL) { affyProbe = affyProbeLoad(row+rowOffset); slAddHead(&affyProbeList, affyProbe); } sqlFreeResult(&sr); for(affyProbe=affyProbeList; affyProbe != NULL; affyProbe = affyProbe->next) { safef(where, sizeof(where), "%s:%s", affyProbe->psName, affyProbe->seq); for(i=0; i<*count; i++) { if(sameString(probeSets[i], where)) unique = FALSE; } if(unique) { if(*count+1 > agp->maxCounts) probeSets = expandAgp(agp, probeSets); probeSets[*count] = cloneString(where); (*count)++; } } affyProbeFreeList(&affyProbe); } } void agpTabOut(struct agProbeSets *agp, FILE *out) /*Write out tab delimited. */ { int i=0; fprintf(out, "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t", agp->chrom, agp->chromStart, agp->chromEnd, agp->type, agp->strand, agp->name, agp->maxCounts, agp->contProbeCount); for(i=0; icontProbeCount; i++) fprintf(out, "%s,", agp->contProbeSets[i]); fprintf(out, "\t"); fprintf(out, "%d\t", agp->alt1ProbeCount); for(i=0; ialt1ProbeCount; i++) fprintf(out, "%s,", agp->alt1ProbeSets[i]); fprintf(out, "\t"); fprintf(out, "%d\t", agp->alt2ProbeCount); for(i=0; ialt2ProbeCount; i++) fprintf(out, "%s,", agp->alt2ProbeSets[i]); fprintf(out, "\t"); fprintf(out, "%d\t", agp->transcriptCount); for(i=0; itranscriptCount; i++) fprintf(out, "%s,", agp->transcriptNames[i]); fprintf(out, "\n"); } void findProbesForAg(struct altGraphX *ag, struct sqlConnection *conn, char *junctionTable, char *nonJunctionTable, FILE *out) /* Dump out the altGraphX's constitutive, alternative and alternative2 probe sets. */ { struct agProbeSets *agp = newAgP(ag); int i; struct bed *bed=NULL, *bedList = NULL; int *starts = ag->edgeStarts; int *ends = ag->edgeEnds; int *vPos = ag->vPositions; for(i=0; iedgeCount; i++) { enum ggEdgeType type = getSpliceEdgeType(ag, i); switch (ag->edgeTypes[i]) { case 0 : addProbeSets(ag, type, vPos[starts[i]], vPos[ends[i]], conn, junctionTable, nonJunctionTable, agp, &agp->contProbeCount, agp->contProbeSets); break; case 1 : addProbeSets(ag, type, vPos[starts[i]], vPos[ends[i]], conn, junctionTable, nonJunctionTable, agp, &agp->alt1ProbeCount, agp->alt1ProbeSets); break; case 2 : addProbeSets(ag, type, vPos[starts[i]], vPos[ends[i]], conn, junctionTable, nonJunctionTable, agp, &agp->alt2ProbeCount, agp->alt2ProbeSets); break; default : errAbort("probeMatcher::findProbesForAg() - Don't recognize type %d", ag->id); } } bedList = hashFindVal(constHash, ag->name); for(bed = bedList; bed != NULL; bed = bed->next) { addProbeSets(ag, ggExon, bed->chromStart, bed->chromEnd, conn, junctionTable, nonJunctionTable, agp, &agp->contProbeCount, agp->contProbeSets); } agpTabOut(agp, out); } void matchProbes(struct altGraphX *agList, char *junctionTable, char *nonJunctionTable, FILE *out) /* Run through the agList and find the probes for the constitutive and alternative edges. */ { struct sqlConnection *conn = hAllocConn(); struct altGraphX *ag = NULL; for(ag=agList; ag != NULL; ag = ag->next) { occassionalDot(); findProbesForAg(ag, conn, junctionTable, nonJunctionTable, out); } hFreeConn(&conn); } int main(int argc, char *argv[]) { struct altGraphX *agList = NULL; FILE *out = NULL; char *bedName = NULL; optionHash(&argc, argv); if(argc != 6) usage(); hSetDb(argv[2]); bedName = optionVal("bedName", NULL); if(bedName != NULL) { warn("Creating bed hash."); constHash = makeBedHash(bedName); warn(""); } else constHash = newHash(2); warn("Loading altGraphX records."); agList = altGraphXLoadAll(argv[1]); out = mustOpen(argv[5], "w"); warn("Looking for probe sets."); matchProbes(agList, argv[3], argv[4], out); carefulClose(&out); warn("\nDone"); return 0; }