/* splix - splat (speedy local alignment tool) index. Index that helps map short reads * quickly to the genome. */ /* This file is copyright 2008 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #ifndef SPLIX_H #define SPLIX_H struct splixFileHeader /* Short read index file binary file header. A splix file starts with this fixed 128 byte * structure. It is followed by the following sections: * chromosome name strings - zero terminated * chromosome sizes (32 bits each) * chromosome DNA - one byte per base lower case. A zero between each chrom. * indexSlotSizes (4^^12 32 bit words containing size of each index slot * indexSlots - explained more below * Each of these sections is padded with zeroes to end on an 8 byte (64 bit) boundary. * The index section consists of 4^^12 (16 million roughly) index slots. Each slot * corresponds to a DNA 12-mer. The format of a slot is: * hexesBefore1 - size # of 16 bit words, each containing 6 bases of DNA 2 bits/base * and 4 bits of zero (most significant bits are zero). These represent * the sixmers found before the 12-mer. They are sorted numerically. * hexesBefore2 - as hexesBefore, but contains sixmer six before the 12-mer. * hexesAfter1 - sixmers after the 12-mer. * hexesAfter2 - sixmers six after the 12-mer. * offsetsBefore1 - 32 bit offsets into indexed DNA corresponding with hexBefore1 * offsetsBefore2 - 32 bit offsets into indexed DNA corresponding with hexBefore2 * offsetsAfter1 - 32 bit offsets corresponding ith hexesAfter1 * offsetsAfter2 - 32 bit offsets corresponding ith hexesAfter2 * The splix files are structured so that they can be memory mapped relatively easily, * and so that on program load, and for a particular read, most of the action happens * in a few isolated piece of memory rather than scattered all over. */ { bits32 magic; /* Always SPLIX_MAGIC */ bits16 majorVersion; /* This version changes when backward compatibility breaks. */ bits16 minorVersion; /* This version changes whenever a feature is added. */ bits64 size; /* Total size to memmap, including header. */ bits32 chromCount; /* Total count of chromosomes/contigs in file. */ bits32 chromNamesSize; /* Size of names of all contigs (including zeroes at end), padded to 8 byte boundary as needed). */ bits64 basesIndexed;/* Total number of bases actually indexed (non-N, unmasked). */ bits64 dnaDiskSize; /* Size of DNA on disk including zero separators and 8 byte padding */ bits64 reserved[11];/* All zeroes for now. */ }; struct splix /* Short read index in memory */ { struct splix *next; boolean isMapped; /* True if memory mapped. */ struct splixFileHeader *header; /* File header. */ char **chromNames; /* Name of each chromosome. */ bits32 *chromSizes; /* Size of each chromosome. No deallocation required (in memmap) */ bits32 *chromOffsets; /* Offset of each chromosome's DNA */ char *allDna; /* All DNA from each contig/chromosome with zero separators. */ bits32 *slotSizes; /* 4^^12 array of slot sizes. No deallocation required (in memmap) */ char **slots; /* 16 M slots corresponding to 12 bases. Actual format of slot is * explained in indexSlots section of splixFileHeader */ }; #define splixSlotCount (1<<(12*2)) #define splixMinQuerySize 24 struct splix *splixRead(char *fileName, boolean memoryMap); /* Read in a splix from a file. Does this via memory mapping if you like, * which will be faster typically for about 100 reads, and slower for more * than that (_much_ slower for thousands of reads and more). */ void splixFree(struct splix **pSplix); /* Free up resources associated with index. */ int splixOffsetToChromIx(struct splix *splix, bits32 tOffset); /* Figure out index of chromosome containing tOffset */ /** Stuff to define SPLIX files **/ #define SPLIX_MAGIC 0x5616A283 /* Magic number at start of SPLIX file */ #define SPLIX_MAJOR_VERSION 0 #define SPLIX_MINOR_VERSION 0 #endif /* SPLIX_H */