/* chromToPos - Given list of chromosomes and sizes, create files that index them by position * in genome via a b+ tree. * This module is as much about experimenting with b+ tree implementation as anything, * though once that is stable, it'll probably be refactored into a b+ tree library module * with this being just a client. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "sig.h" #include "net.h" int blockSize = 100; void usage() /* Explain usage and exit. */ { errAbort( "chromToPos - Given list of chromosomes and sizes, create files that index them by position\n" "via a b+ tree.\n" "usage:\n" " chromToPos chromSizes.txt chromPos.bin chromPos.ix\n" "options:\n" " -blockSize=N (default %d) Size of block for index purposes\n" , blockSize); } static struct optionSpec options[] = { {"blockSize", OPTION_INT}, {NULL, 0}, }; struct chromInfo /* A chromosome name and a chromosome size. */ { struct chromInfo *next; /* Next in list. */ char *name; /* Chromosome name. */ bits32 size; /* Size of chromosome. */ bits32 genomeOffset; /* Offset of chromosome in genome coordinates. */ bits32 binFileOffset; /* Offset of record in binary file. */ }; struct chromInfo *readChromSizes(char *fileName) /* Create list of chromInfos based on a two column file */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[2]; struct chromInfo *list = NULL, *el; bits64 maxTotal = (1LL << 32) - 1; bits64 total = 0; int chromCount = 0; struct hash *uniqHash = hashNew(16); while (lineFileChop(lf, row)) { char *name = row[0]; if (hashLookup(uniqHash, name)) errAbort("Duplicate chromosome or contig name %s line %d of %s", name, lf->lineIx, lf->fileName); hashAdd(uniqHash, name, NULL); AllocVar(el); el->name = cloneString(name); el->size = lineFileNeedNum(lf, row, 1); el->genomeOffset = total; total += el->size; if (total > maxTotal) errAbort("Too many bases line %d of %s. Max is %lld, total so far is %lld", lf->lineIx, lf->fileName, maxTotal, total); slAddHead(&list, el); ++chromCount; } hashFree(&uniqHash); lineFileClose(&lf); slReverse(&list); verbose(1, "Read %d chroms totalling %lld bases in %s\n", chromCount, total, fileName); return list; } int countLevels(int maxBlockSize, int itemCount) /* Count up number of levels needed in tree of given maximum block size. */ { int levels = 1; while (itemCount > maxBlockSize) { itemCount = (itemCount + maxBlockSize - 1) / blockSize; levels += 1; } return levels; } void writeBinSaveOffsets(struct chromInfo *chromList, int chromCount, char *fileName) /* Save chromosome info as a binary file, and save offsets of each chromosome within file. */ { bits32 magic = chromSizeBinSig; bits32 count = chromCount; bits32 reserved = 0; FILE *f = mustOpen(fileName, "wb"); writeOne(f, magic); writeOne(f, chromCount); writeOne(f, reserved); writeOne(f, reserved); struct chromInfo *chrom; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { chrom->binFileOffset = ftell(f); writeOne(f, chrom->genomeOffset); mustWrite(f, chrom->name, strlen(chrom->name)+1); } carefulClose(&f); } int xToY(int x, unsigned y) /* Return x to the Y power, with y usually small. */ { int i, val = 1; for (i=0; i blockSize) countOne = blockSize; /* Write block header. */ writeOne(f, isLeaf); writeOne(f, reserved); writeOne(f, countOne); int slotsUsed = 0; int endIx = i + nodeSizePer; if (endIx > chromCount) endIx = chromCount; for (j=i; jgenomeOffset); writeOne(f, nextChild); nextChild += bytesInBlock; ++slotsUsed; } assert(slotsUsed == countOne); for (j=countOne; j blockSize) countOne = blockSize; else countOne = countLeft; writeOne(f, isLeaf); writeOne(f, reserved); writeOne(f, countOne); /* Write out position in genome and in file for each chrom. */ for (j=0; jgenomeOffset); writeOne(f, chrom->binFileOffset); } /* Pad out any unused bits of last block with zeroes. */ for (j=countOne; jnext) chromArray[i] = chrom; /* Figure out how many levels in B tree, and number of chroms between items at highest level. */ int levels = countLevels(blockSize, chromCount); verbose(1, "%d levels with blockSize %d covers %d items\n", levels, blockSize, chromCount); /* Write non-leaf nodes. */ for (i=levels-1; i > 0; --i) { bits32 endLevelOffset = writeIndexLevel(chromArray, chromCount, indexOffset, i, f); indexOffset = ftell(f); if (endLevelOffset != indexOffset) errAbort("internal err: mismatch endLevelOffset=%u vs indexOffset=%u", endLevelOffset, indexOffset); } /* Write leaf nodes */ writeLeafLevel(chromArray, chromCount, f); /* Clean up and go home. */ freez(&chromArray); carefulClose(&f); } void chromToPos(char *sizesText, char *posBin, char *posIndex) /* chromToPos - Given list of chromosomes and sizes, create files that index them by position via * a b+ tree. */ { struct chromInfo *chrom, *chromList = readChromSizes(sizesText); int chromCount = slCount(chromList); writeBinSaveOffsets(chromList, chromCount, posBin); writeIndex(chromList, chromCount, posIndex); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); blockSize = optionInt("blockSize", blockSize); int minBlockSize = 2, maxBlockSize = (1L << 16) - 1; if (blockSize < minBlockSize || blockSize > maxBlockSize) errAbort("Block size (%d) not in range, must be between %d and %d", blockSize, minBlockSize, maxBlockSize); chromToPos(argv[1], argv[2], argv[3]); return 0; }