/* encodeMergeReplicatesBatch - Create a script that merges a bunch of replicates.. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "dystring.h"
#include "ra.h"

void usage()
/* Explain usage and exit. */
{
errAbort(
  "encodeMergeReplicatesBatch - Create a script that merges a bunch of replicates.\n"
  "usage:\n"
  " encodeMergeReplicatesBatch input.ra inDir output.sh output.ra outDir\n"
  "where input.ra is generated by mdbQuery, and includes the fields replicate,\n"
  "fileName, and enough metadata to distinguish the experiments by grouping together\n"
  "on all matching metadata (other than replicate and fileName).\n"
  "The inDir is the directory where the input files live. The outDir is where to put\n"
  "output wiggles and the like. The output.sh is the script to run to do the merging.\n"
  "The output.ra is similar to input.ra but with the new file name substituted in and\n"
  "the replicate field removed.\n"
  "options:\n"
  " -xxx=XXX\n"
  );
}

/* Command line options.  None are currently defined; the table holds only
 * its terminator. */
static struct optionSpec options[] = {
   {NULL, 0},
};

struct replicate
/* Info on a single replicate. */
    {
    struct replicate *next;	/* Next in list. */
    char *fileName;		/* Name of file. */
    char *replicate;		/* Replicate id. */
    struct slPair *tagList;	/* name/value pairs. */
    };

struct repBundle
/* A bundle of replicates. */
    {
    struct repBundle *next;	/* Next in list. */
    char *hashName;		/* Name constructed by metadata. Allocated in hash. */
    struct replicate *repList;	/* List of associated replicates. */
    };

char *mustFindInPairList(struct slPair *list, char *name, struct lineFile *lf)
/* Look through list for one with given name, and return associated value.  Use
 * lf for error reporting if not found: aborts with a message naming the tag,
 * the current line of lf, and lf's file name. */
{
char *val = slPairFindVal(list, name);
if (val == NULL)
    errAbort("Could not find required tag '%s' in stanza ending line %d of %s",
	name, lf->lineIx, lf->fileName);
return val;
}

struct replicate *readRaAsReplicates(char *fileName)
/* Read RA file and return it as a list of replicates, in file order.  Each
 * stanza must have both a fileName and a replicate tag, otherwise we abort.
 * The fileName/replicate fields point at values inside the stanza's tagList
 * rather than at copies. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct replicate *rep, *repList = NULL;
struct slPair *tagList;
while ((tagList = raNextRecordAsSlPairList(lf)) != NULL)
    {
    AllocVar(rep);
    rep->fileName = mustFindInPairList(tagList, "fileName", lf);
    rep->replicate = mustFindInPairList(tagList, "replicate", lf);
    rep->tagList = tagList;
    slAddHead(&repList, rep);
    }
/* Built by adding to the head, so flip to restore file order. */
slReverse(&repList);
lineFileClose(&lf);
return repList;
}

struct repBundle *bundleReplicates(struct replicate **pRepList)
/* Bundle together replicates, eating *pRepList in the process.  Two replicates
 * land in the same bundle when every tag other than replicate and fileName
 * matches (same names, same values, same order).  Bundles are returned in order
 * of first appearance; on return *pRepList is NULL because its nodes now
 * belong to the bundles. */
{
struct replicate *rep, *nextRep;
struct repBundle *bundleList = NULL, *bundle;
struct dyString *hashName = dyStringNew(0);
struct hash *bundleHash = hashNew(0);
for (rep = *pRepList; rep != NULL; rep = nextRep)
    {
    /* Going to clobber ->next field when moving rep onto the bundle's list,
     * so remember the successor first. */
    nextRep = rep->next;

    /* Build the bundle key out of all the tags that are not replicate-specific. */
    dyStringClear(hashName);
    struct slPair *tag;
    for (tag = rep->tagList; tag != NULL; tag = tag->next)
	{
	if (!sameString(tag->name, "replicate") && !sameString(tag->name, "fileName"))
	    dyStringPrintf(hashName, "%s=%s;", tag->name, (char *)(tag->val));
	}

    bundle = hashFindVal(bundleHash, hashName->string);
    if (bundle == NULL)
	{
	AllocVar(bundle);
	slAddHead(&bundleList, bundle);
	hashAddSaveName(bundleHash, hashName->string, bundle, &bundle->hashName);
	}
    slAddTail(&bundle->repList, rep);
    }
slReverse(&bundleList);
*pRepList = NULL;

/* The scratch key buffer is no longer needed.  The hash itself is deliberately
 * kept alive, since each bundle->hashName is allocated in it. */
dyStringFree(&hashName);
return bundleList;
}

static char *findRepMarker(char *fileName, int *retDigitCount)
/* Return pointer to the first "Rep" in fileName that is immediately followed
 * by at least one digit, and store the number of those digits in
 * *retDigitCount.  Returns NULL (leaving *retDigitCount alone) if there is no
 * such marker.  Skipping digit-less matches keeps names that contain "Rep" as
 * part of another word (for instance "RepliSeq") from hiding a later Rep#. */
{
char *s;
for (s = strstr(fileName, "Rep"); s != NULL; s = strstr(s + 1, "Rep"))
    {
    int digitCount = countLeadingDigits(s + strlen("Rep"));
    if (digitCount > 0)
	{
	*retDigitCount = digitCount;
	return s;
	}
    }
return NULL;
}

struct dyString *unrepFileName(char *fileName, boolean isSingle)
/* Return string with Rep# in fileName replaced with "Merged".
 *   fileName - input file name, expected to contain Rep followed by digits.
 *   isSingle - TRUE if this is the only replicate in its bundle.
 * If fileName has no "Rep" at all then a single replicate keeps its name
 * unchanged, while a multi-replicate bundle is an error.  If "Rep" occurs but
 * is never followed by digits we abort.  For multi-replicate bundles a trailing
 * ".gz" is dropped from the result; for single replicates it is kept.
 * Caller owns the returned dyString. */
{
struct dyString *dy = dyStringNew(0);
if (strstr(fileName, "Rep") == NULL)
    {
    if (isSingle)
	dyStringAppend(dy, fileName);
    else
	errAbort("No 'Rep' in fileName %s", fileName);
    }
else
    {
    int digitCount = 0;
    char *s = findRepMarker(fileName, &digitCount);
    if (s == NULL)
	errAbort("No digits after 'Rep' in filename %s", fileName);
    char *pastRep = s + strlen("Rep") + digitCount;

    /* Everything before the marker, then "Merged", then whatever follows the digits. */
    dyStringAppendN(dy, fileName, s-fileName);
    dyStringAppend(dy, "Merged");
    int len = strlen(pastRep);
    if (!isSingle && endsWith(pastRep, ".gz"))
	len -= strlen(".gz");
    dyStringAppendN(dy, pastRep, len);
    }
return dy;
}

void encodeMergeReplicatesBatch(char *inRa, char *inDir, char *outSh, char *outRa, char *outDir)
/* encodeMergeReplicatesBatch - Create a script that merges a bunch of replicates..
 *   inRa   - input .ra file describing the replicates.
 *   inDir  - directory prefix used for input files in the script.
 *   outSh  - script file to write; one command line per bundle.
 *   outRa  - .ra file to write; one stanza per bundle.
 *   outDir - directory prefix used for output files in the script.
 * Bundles with more than one replicate get an encodeMergeReplicates command,
 * bundles with a single replicate get a plain cp. */
{
/* Read in input.ra and bundle together replicates. */
struct replicate *repList = readRaAsReplicates(inRa);
verbose(2, "Got %d replicates\n", slCount(repList));
struct repBundle *bundle, *bundleList = bundleReplicates(&repList);
verbose(2, "Got %d bundles\n", slCount(bundleList));

/* Create output . */
FILE *fSh = mustOpen(outSh, "w");
FILE *fRa = mustOpen(outRa, "w");
fprintf(fSh, "#!/bin/tcsh -efx\n");
for (bundle = bundleList; bundle != NULL; bundle = bundle->next)
    {
    /* Create output file name from the first replicate of the bundle. */
    struct replicate *firstRep = bundle->repList;
    struct replicate *rep;
    int count = slCount(firstRep);
    verbose(2, "%d %s\n", count, bundle->hashName);
    struct dyString *outName = unrepFileName(firstRep->fileName, (count==1));

    /* Write out shell line. */
    if (count > 1)
	{
	fprintf(fSh, "encodeMergeReplicates -add -maxMin -uniqueName ");
	for (rep = firstRep; rep != NULL; rep = rep->next)
	    fprintf(fSh, " %s/%s", inDir, rep->fileName);
	fprintf(fSh, " %s/%s\n", outDir, outName->string);
	}
    else
	fprintf(fSh, "cp %s/%s %s/%s\n", inDir, firstRep->fileName, outDir, outName->string);

    /* Write out ra stanza: the first replicate's tags with fileName swapped
     * for the output name and the replicate tag left out. */
    struct slPair *tag;
    for (tag = firstRep->tagList; tag != NULL; tag = tag->next)
	{
	if (sameString(tag->name, "fileName"))
	    fprintf(fRa, "%s %s\n", tag->name, outName->string);
	else if (sameString(tag->name, "replicate"))
	    ;	// do nothing
	else
	    {
	    fprintf(fRa, "%s %s\n", tag->name, (char *)(tag->val));
	    }
	}
    fprintf(fRa, "\n");

    /* Output name is per-bundle scratch; release it before the next round. */
    dyStringFree(&outName);
    }
carefulClose(&fSh);
carefulClose(&fRa);
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 6)
    usage();
encodeMergeReplicatesBatch(argv[1], argv[2], argv[3], argv[4], argv[5]);
return 0;
}