/* encode2MakeEncode3 - Create a makefile that will reformat and copy encode2 files into
 * a parallel directory of encode3 files. */

#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "md5.h"
#include "portable.h"
#include "obscure.h"
#include "sqlNum.h"
#include "encode3/encode2Manifest.h"

/* Directory searched for <assembly>/chrom.sizes and as/<type>.as; overridden by -dataDir. */
char *dataDir = "/scratch/kent/encValData";

/* Directory in which the generated recipes put their scratch files; overridden by -tmpDir. */
char *tempDir = "/tmp";

void usage()
/* Explain usage and exit. */
{
errAbort(
  "encode2MakeEncode3 - Create a makefile that will reformat and copy encode2 files into\n"
  "a parallel directory of encode3 files.\n"
  " encode2MakeEncode3 sourceDir sourceManifest destDir out.make destManifest\n"
  "options:\n"
  " -dataDir=/path/to/encode/asFilesAndChromSizesEtc\n"
  " -tmpDir=/tmp\n"
  );
}

/* Command line validation table.  Every option mentioned in usage() must be listed here. */
static struct optionSpec options[] = {
   {"dataDir", OPTION_STRING},
   {"tmpDir", OPTION_STRING},
   {NULL, 0},
};

void makeDirOnlyOnce(char *dir, struct hash *hash)
/* Create dir (via makeDirs) the first time it is seen, and remember it in hash so
 * that later calls with the same dir do nothing. */
{
if (hashLookup(hash, dir) != NULL)
    return;
verbose(2, "make dir %s\n", dir);
hashAdd(hash, dir, NULL);
makeDirs(dir);
}

char *veryTempName(char *dir, char *base, char *suffix)
/* Return a freshly cloned temp file name in dir.  A counter that increases on every
 * call is prepended to base so that names handed out within one run differ. */
{
static int serial = 0;
char numberedBase[128];
serial += 1;
safef(numberedBase, sizeof(numberedBase), "%d_%s", serial, base);
return cloneString(rTempName(dir, numberedBase, suffix));
}

boolean needsBedDoctor(char *fileName)
/* Returns TRUE if we should run encode2BedDoctor on file, which is decided purely
 * from the prefix of fileName. */
{
static char *doctorPrefixes[] = {
    "wgEncodeGisRnaPet",
    "wgEncodeCshlLongRnaSeq",
    "wgEncodeCshlShortRnaSeq",
    "wgEncodeRikenCage",
    "wgEncodeUwRepliSeq",
    };
int count = sizeof(doctorPrefixes) / sizeof(doctorPrefixes[0]);
int i;
for (i = 0; i < count; ++i)
    {
    if (startsWith(doctorPrefixes[i], fileName))
        return TRUE;
    }
return FALSE;
}

static void manifestOutRenamed(struct encode2Manifest *mi, boolean chopOldSuffix,
    char *newSuffix, FILE *manF)
/* Write mi to manF with its fileName rewritten: the last suffix is optionally chopped
 * and newSuffix is appended.  mi->fileName is put back before returning, so the
 * manifest item never keeps a pointer into this function's stack frame. */
{
char base[PATH_LEN];
safef(base, sizeof(base), "%s", mi->fileName);
if (chopOldSuffix)
    chopSuffix(base);

char renamed[PATH_LEN+16];    /* a little extra for the new suffix */
safef(renamed, sizeof(renamed), "%s%s", base, newSuffix);

char *savedName = mi->fileName;
mi->fileName = renamed;
encode2ManifestTabOut(mi, manF);
mi->fileName = savedName;
}

void doGzippedBedToBigBed(struct encode2Manifest *mi, char *bedFile, char *assembly,
    char *asType, char *bedType,
    char *destDir, char *destFileName,
    struct slName **pTargetList, FILE *f, FILE *manF)
/* Convert some gzipped bed file to a bigBed file, possibly using an as file.
 *   mi            - manifest item for the source file; written to manF under a .bigBed name
 *   bedFile       - path of the gzipped bed, used as the make prerequisite
 *   assembly      - selects dataDir/<assembly>/chrom.sizes
 *   asType        - name of dataDir/as/<asType>.as, or NULL for plain bed
 *   bedType       - value for bedToBigBed -type=, or NULL to omit the flag
 *   destDir, destFileName - concatenated (plus ".bigBed") to form the make target
 *   pTargetList   - the target path is added to the head of this list
 *   f             - makefile being written */
{
/* Figure out name of bigBed file we will output and write it as a make target. */
char outPath[PATH_LEN];
safef(outPath, sizeof(outPath), "%s%s%s", destDir, destFileName, ".bigBed");
fprintf(f, "%s: %s\n", outPath, bedFile);

/* Unpack gzipped bed, drop track lines, and clip to chromosome sizes. */
char *tempNameRoot = "b2bb";
char *clippedBed = veryTempName(tempDir, tempNameRoot, ".clipped");
if (sameOk(asType, "peptideMapping"))
    {
    /* Special cleanups: turn carriage returns into newlines and keep only 10 columns. */
    fprintf(f, "\tzcat %s | tr '\\r' '\\n' | grep -v '^track' | cut -f 1-10 "
               "| bedClip stdin %s/%s/chrom.sizes %s\n",
               bedFile, dataDir, assembly, clippedBed);
    }
else
    {
    fprintf(f, "\tzcat %s | grep -v '^track' | bedClip stdin %s/%s/chrom.sizes %s\n",
        bedFile, dataDir, assembly, clippedBed);
    }

/* Sort it. */
char *sortedBed = veryTempName(tempDir, tempNameRoot, ".sorted.bed");
fprintf(f, "\tsort -k1,1 -k2,2n %s > %s\n", clippedBed, sortedBed);
fprintf(f, "\trm %s\n", clippedBed);

/* Figure out if it's one we need to doctor up, and if so emit that code. */
char *bigBedSource = sortedBed;
if (needsBedDoctor(destFileName))
    {
    char *doctoredBed = veryTempName(tempDir, tempNameRoot, ".doctored.bed");
    if (asType == NULL)
        fprintf(f, "\tencode2BedDoctor %s %s\n", sortedBed, doctoredBed);
    else
        fprintf(f, "\tencode2BedPlusDoctor %s %s/as/%s.as %s\n",
            sortedBed, dataDir, asType, doctoredBed);
    fprintf(f, "\trm %s\n", sortedBed);
    bigBedSource = doctoredBed;
    }

/* Write bigBed, initially to a temp name, and then move it to the real name if all
 * went well, so a failed recipe never leaves a partial file at the target path. */
char tempBigBed[PATH_LEN];
safef(tempBigBed, sizeof(tempBigBed), "%s.tmp", outPath);
fprintf(f, "\tbedToBigBed ");
if (bedType != NULL)
    fprintf(f, " -type=%s", bedType);
if (asType != NULL)
    fprintf(f, " -as=%s/as/%s.as", dataDir, asType);
fprintf(f, " %s %s/%s/chrom.sizes %s\n", bigBedSource, dataDir, assembly, tempBigBed);
fprintf(f, "\tmv %s %s\n", tempBigBed, outPath);
fprintf(f, "\trm %s\n", bigBedSource);

/* Add to target list. */
slNameAddHead(pTargetList, outPath);

/* Print out info about bigBed we made to new manifest file.
 * NOTE(review): for plain .bed.gz inputs the caller strips ".bed" from destFileName,
 * so the target looks like foo.bigBed while this entry keeps the inner suffix
 * (presumably foo.bed.bigBed) - confirm which name downstream tools expect. */
manifestOutRenamed(mi, TRUE, ".bigBed", manF);
}

boolean justCopySuffix(char *fileName)
/* Return TRUE if fileName has a suffix that indicates we just copy it rather than
 * transform it.
 * NOTE(review): the loop below was rebuilt from a source whose text after "i" in the
 * loop condition had been stripped; confirm against the repository copy. */
{
static char *copySuffixes[] = {".fastq.gz", ".bigWig", ".bigBed", ".fasta.gz", ".bam",
    ".spikeins", ".pdf.gz", ".pdf", ".pair.tar.gz", ".tar.gz", ".tab.gz",
    ".csfasta.gz", ".csfastq.gz", ".csqual.gz", "Validation.tgz", ".doc.tgz",
    ".matrix.gz", "PrimerPeaks.peaks.gz", ".matrix.tgz", ".CEL.gz",
    ".spikeins.CL.bam.gz", ".document.tgz"
    };
int count = sizeof(copySuffixes) / sizeof(copySuffixes[0]);
int i;
for (i = 0; i < count; ++i)
    {
    if (endsWith(fileName, copySuffixes[i]))
        return TRUE;
    }
return FALSE;
}

void doGzippedSomethingToBigBed(struct encode2Manifest *mi, char *sourcePath, char *assembly,
    char *destDir, char *destFileName,
    char *bedConverter, char *tempNameRoot,
    struct slName **pTargetList, FILE *f, FILE *manF)
/* Convert something that has a bed-converter program to bigBed.
 * NOTE(review): the signature and the statements up to the converter recipe were
 * rebuilt from the single call site and from the surviving tail of this function;
 * the exact converter command line in particular should be confirmed against the
 * repository copy. */
{
char bigBedName[PATH_LEN];
safef(bigBedName, sizeof(bigBedName), "%s%s%s", destDir, destFileName, ".bigBed");
char tempBigBed[PATH_LEN];
safef(tempBigBed, sizeof(tempBigBed), "%s.tmp", bigBedName);
char *tempBed = veryTempName(tempDir, tempNameRoot, ".bed");
char *sortedTempBed = veryTempName(tempDir, tempNameRoot, ".sorted");

/* Target line, then convert to bed and sort. */
fprintf(f, "%s: %s\n", bigBedName, sourcePath);
fprintf(f, "\t%s %s %s\n", bedConverter, sourcePath, tempBed);
fprintf(f, "\tsort -k1,1 -k2,2n %s > %s\n", tempBed, sortedTempBed);
fprintf(f, "\trm %s\n", tempBed);

/* Clip to chromosome sizes, build the bigBed under a temp name, then move into place. */
char *clippedTempBed = veryTempName(tempDir, tempNameRoot, ".clipped");
fprintf(f, "\tbedClip %s %s/%s/chrom.sizes %s\n",
    sortedTempBed, dataDir, assembly, clippedTempBed);
fprintf(f, "\trm %s\n", sortedTempBed);
fprintf(f, "\tbedToBigBed %s %s/%s/chrom.sizes %s\n",
    clippedTempBed, dataDir, assembly, tempBigBed);
fprintf(f, "\trm %s\n", clippedTempBed);
fprintf(f, "\tmv %s %s\n", tempBigBed, bigBedName);
slNameAddHead(pTargetList, bigBedName);

/* Print out info about bigBed we made to new manifest file. */
manifestOutRenamed(mi, TRUE, ".bigBed", manF);
}

void doGzippedGffToBigBed(struct encode2Manifest *mi, char *sourcePath, char *destPath,
    char *assembly, char *destDir, char *destFileName,
    struct slName **pTargetList, FILE *f, FILE *manF)
/* Do both copy and conversion to bigBed.  Also do some doctoring.  Two make targets
 * and two manifest lines are produced: one for the symlinked original and one for
 * the bigBed derived from it. */
{
/* First handle the straight up copy (a symlink). */
fprintf(f, "%s: %s\n", destPath, sourcePath);
fprintf(f, "\tln -s %s %s\n", sourcePath, destPath);
slNameAddHead(pTargetList, destPath);
encode2ManifestTabOut(mi, manF);

/* Now convert to big bed.  Temp names are requested in a fixed order because
 * veryTempName numbers them sequentially. */
char *tempNameRoot = "gff2bb";
char bigBedName[PATH_LEN];
safef(bigBedName, sizeof(bigBedName), "%s%s%s", destDir, destFileName, ".bigBed");
char tempBigBed[PATH_LEN];
safef(tempBigBed, sizeof(tempBigBed), "%s.tmp", bigBedName);
char *fixedGff = veryTempName(tempDir, tempNameRoot, ".gff");
char *tempBed = veryTempName(tempDir, tempNameRoot, ".bed");
char *sortedTempBed = veryTempName(tempDir, tempNameRoot, ".sorted");
char *clippedTempBed = veryTempName(tempDir, tempNameRoot, ".clipped");

fprintf(f, "%s: %s\n", bigBedName, sourcePath);
fprintf(f, "\tencode2GffDoctor %s %s\n", sourcePath, fixedGff);
fprintf(f, "\tgffToBed %s %s\n", fixedGff, tempBed);
fprintf(f, "\trm %s\n", fixedGff);
fprintf(f, "\tsort -k1,1 -k2,2n %s > %s\n", tempBed, sortedTempBed);
fprintf(f, "\trm %s\n", tempBed);
fprintf(f, "\tbedClip %s %s/%s/chrom.sizes %s\n",
    sortedTempBed, dataDir, assembly, clippedTempBed);
fprintf(f, "\trm %s\n", sortedTempBed);
fprintf(f, "\tbedToBigBed %s %s/%s/chrom.sizes %s\n",
    clippedTempBed, dataDir, assembly, tempBigBed);
fprintf(f, "\trm %s\n", clippedTempBed);
fprintf(f, "\tmv %s %s\n", tempBigBed, bigBedName);
slNameAddHead(pTargetList, bigBedName);

/* Print out info about bigBed we made to new manifest file. */
manifestOutRenamed(mi, TRUE, ".bigBed", manF);
}

struct bedRule
/* Associates a file name ending with the arguments handed to doGzippedBedToBigBed. */
    {
    char *ending;       /* Matched against the end of the manifest file name. */
    char *asType;       /* as file base name, or NULL. */
    char *bedType;      /* bedToBigBed -type value, or NULL. */
    };

/* Checked in order; first match wins.  Note "bedRrbs.gz" has no leading dot. */
static struct bedRule bedRules[] = {
    {".narrowPeak.gz", "narrowPeak", "bed6+4"},
    {".broadPeak.gz", "broadPeak", "bed6+3"},
    {".bedRnaElements.gz", "bedRnaElements", "bed6+3"},
    {".bedLogR.gz", "bedLogR", "bed9+1"},
    {"bedRrbs.gz", "bedRrbs", "bed9+2"},
    {".peptideMapping.gz", "peptideMapping", "bed6+4"},
    {".shortFrags.gz", "shortFrags", "bed6+21"},
    {".bedClusters.gz", NULL, NULL},
    {".bedCluster.gz", NULL, NULL},
};

static struct bedRule *findBedRule(char *fileName)
/* Return first entry of bedRules whose ending matches fileName, or NULL if none does. */
{
int count = sizeof(bedRules) / sizeof(bedRules[0]);
int i;
for (i = 0; i < count; ++i)
    {
    if (endsWith(fileName, bedRules[i].ending))
        return &bedRules[i];
    }
return NULL;
}

void processManifestItem(int itemNo, struct encode2Manifest *mi, char *sourceRoot,
    char *destRoot, struct slName **pTargetList, FILE *f, FILE *manF)
/* Process a line from the manifest.  Write section of make file needed to transform/copy it.
 * Record name of this target file in pTargetList.
 * The transformations are:
 *   o - Many files are just copied (symlinked).
 *   o - Files that are bed variants are turned into bigBed variants.
 *   o - Files that are tgz's of multiple fastqs are split into individual fastq.gz's inside
 *       a directory named after the archive.
 * A missing source file produces a warning and is skipped; an unrecognized assembly
 * prefix or file type aborts. */
{
fprintf(manF, "# from %s:\n", mi->fileName);

/* Make up bunches of components for file names. */
char *fileName = mi->fileName;
char sourcePath[PATH_LEN];
safef(sourcePath, sizeof(sourcePath), "%s/%s", sourceRoot, fileName);
char destPath[PATH_LEN];
char destDir[PATH_LEN], destFileName[FILENAME_LEN], destExtension[FILEEXT_LEN];
safef(destPath, sizeof(destPath), "%s/%s", destRoot, fileName);
splitPath(destPath, destDir, destFileName, destExtension);

/* See if source file exists.  If not warn and skip. */
if (!fileExists(sourcePath))
    {
    warn("%s doesn't exist", sourcePath);
    return;
    }

/* Figure out whether we are on assembly hg19, mm9, or something we don't understand. */
char *assembly = NULL;
if (startsWith("hg19/", fileName))
    assembly = "hg19";
else if (startsWith("mm9/", fileName))
    assembly = "mm9";
else
    errAbort("Don't recognize assembly for %s", fileName);

verbose(2, "processing %s\t%s\n", fileName, mi->format);

struct bedRule *rule = NULL;
if (endsWith(fileName, ".fastq.tgz"))
    {
    /* Unpack into <dest>.tmp, flatten and gzip there, then rename to <dest>.dir so
     * the target only appears once the whole recipe has succeeded. */
    char outDir[PATH_LEN];
    safef(outDir, sizeof(outDir), "%s.dir", destPath);
    verbose(2, "Unpacking %s into %s\n", sourcePath, outDir);
    fprintf(f, "%s: %s\n", outDir, sourcePath);
    char unpackDir[PATH_LEN];
    safef(unpackDir, sizeof(unpackDir), "%s.tmp", destPath);
    fprintf(f, "\tmkdir %s\n", unpackDir);
    fprintf(f, "\tcd %s; tar -zxf %s\n", unpackDir, sourcePath);
    fprintf(f, "\tencode2FlattenFastqSubdirs %s\n", unpackDir);
    fprintf(f, "\tcd %s; gzip -4 *\n", unpackDir);
    fprintf(f, "\tmv %s %s\n", unpackDir, outDir);
    slNameAddHead(pTargetList, outDir);

    /* Write out revised manifest info: same name with .dir appended. */
    manifestOutRenamed(mi, FALSE, ".dir", manF);
    }
else if ((rule = findBedRule(fileName)) != NULL)
    {
    doGzippedBedToBigBed(mi, sourcePath, assembly, rule->asType, rule->bedType,
        destDir, destFileName, pTargetList, f, manF);
    }
else if (endsWith(fileName, ".bed.gz") || endsWith(fileName, ".bed9.gz"))
    {
    /* Generic bed names: the directory the file lives in decides the format. */
    if (stringIn("wgEncodeHaibMethylRrbs/", fileName))
        {
        doGzippedBedToBigBed(mi, sourcePath, assembly, "bedRrbs", "bed9+2",
            destDir, destFileName, pTargetList, f, manF);
        }
    else if (stringIn("wgEncodeOpenChromSynth/", fileName))
        {
        doGzippedBedToBigBed(mi, sourcePath, assembly, "openChromCombinedPeaks", "bed9+12",
            destDir, destFileName, pTargetList, f, manF);
        }
    else
        {
        chopSuffix(destFileName);    /* remove .bed */
        doGzippedBedToBigBed(mi, sourcePath, assembly, NULL, NULL,
            destDir, destFileName, pTargetList, f, manF);
        }
    }
else if (endsWith(fileName, ".gp.gz"))
    {
    doGzippedSomethingToBigBed(mi, sourcePath, assembly, destDir, destFileName,
        "genePredToBed", "gp2bb", pTargetList, f, manF);
    }
else if (endsWith(fileName, ".gtf.gz") || endsWith(fileName, ".gff.gz"))
    {
    doGzippedGffToBigBed(mi, sourcePath, destPath, assembly, destDir, destFileName,
        pTargetList, f, manF);
    }
else if (justCopySuffix(fileName))
    {
    fprintf(f, "%s: %s\n", destPath, sourcePath);
    fprintf(f, "\tln -s %s %s\n", sourcePath, destPath);
    slNameAddHead(pTargetList, destPath);
    encode2ManifestTabOut(mi, manF);
    }
else
    {
    errAbort("Don't know what to do with item %d %s in %s line %d",
        itemNo, fileName, __FILE__, __LINE__);
    }
}

void encode2MakeEncode3(char *sourceDir, char *sourceManifest, char *destDir, char *outMake,
    char *outManifest)
/* encode2MakeEncode3 - Copy files in encode2 manifest and in case of tar'd files rezip them
 * independently.  Writes the makefile to outMake and the revised manifest to outManifest,
 * and creates destDir plus every destination subdirectory up front. */
{
struct encode2Manifest *fileList = encode2ManifestLoadAll(sourceManifest);
verbose(2, "Loaded information on %d files from %s\n", slCount(fileList), sourceManifest);
verboseTimeInit();
FILE *f = mustOpen(outMake, "w");
FILE *manF = mustOpen(outManifest, "w");
struct hash *destDirHash = hashNew(0);
makeDirOnlyOnce(destDir, destDirHash);

/* Print first dependency in makefile - the one that causes all files to be made. */
fprintf(f, "startHere: all\n\techo all done\n\n");

/* Write out each file target, and save also list of all targets. */
struct slName *targetList = NULL;
int itemNo = 0;
struct encode2Manifest *mi;
for (mi = fileList; mi != NULL; mi = mi->next)
    {
    /* Make destination dir. */
    char localDir[PATH_LEN];
    splitPath(mi->fileName, localDir, NULL, NULL);
    char destSubDir[PATH_LEN];
    safef(destSubDir, sizeof(destSubDir), "%s/%s", destDir, localDir);
    makeDirOnlyOnce(destSubDir, destDirHash);

    itemNo += 1;
    processManifestItem(itemNo, mi, sourceDir, destDir, &targetList, f, manF);
    }

/* Targets were pushed on the head of the list; flip back to manifest order and
 * emit the "all" rule that depends on every one of them. */
slReverse(&targetList);
fprintf(f, "all:");
struct slName *target;
for (target = targetList; target != NULL; target = target->next)
    fprintf(f, " %s", target->name);
fprintf(f, "\n");

carefulClose(&manF);
carefulClose(&f);
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
dataDir = optionVal("dataDir", dataDir);
tempDir = optionVal("tmpDir", tempDir);
if (argc != 6)
    usage();
encode2MakeEncode3(argv[1], argv[2], argv[3], argv[4], argv[5]);
return 0;
}