/* crTree chromosome r tree. This module creates and uses a disk-based index that can find items * that overlap with a chromosome range - something of the form chrN:start-end - with a * minimum of disk access. It is implemented with a combination of bPlusTrees and r-trees. * The items being indexed can overlap with each other. * * There's two main sides to using this module - creating an index, and using it. * * The first step of index creation is actually to insure that the file being indexed * is ordered by chromosome,start,end. For a .bed file you can insure this * with the command: * sort -k1,1 -k2,2n -k3,3n unsorted.bed > sorted.bed * Note that the the chromosome field is sorted alphabetically and the start and end * fields are sorted numerically. * * Once this is done then the index creation program scans the input file, and * makes a list of crTreeItems, one for each item in the file. A crTreeItem just * contains the chromosome range and file offset for an item. In the process of scanning, * build up a hash containing the chromosome names (using hashStoreName) rather than allocating * a separate string for each chromosome name in each item. Pass the list and the * hash to crTreeFileCreate. * * Using an index is done in two steps. First you open the index with crTreeFileOpen, * and then you use crTreeFindOverlappingBlocks to find parts of the file the overlap * with your query range. The result of a crTreeFindOverlappingBlocks call is a list * of regions in the file. These regions typically include some non-overlapping items * as well. It is up to the caller to parse through the resulting region list to * convert it from just bytes on disk into the memory data structure. During this * parsing you should ignore items that don't overlap your range of interest. * * The programs crTreeIndexBed and crTreeSearchBed create and search a crTree index * for a bed file, and are useful examples to view for other programs that want to * use the crTree system. */ #ifndef CRTREE_H #define CRTREE_H struct crTreeFile /* Chromosome R tree index file handle. */ { struct crTreeFile *next; /* Next in list of index files if any. */ char *fileName; /* Name of file - for better error reporting. */ struct udcFile *udc; /* Open file handle. */ struct bptFile *chromBpt; /* Index of chromosomes. */ struct cirTreeFile *cir; /* Index of ranges. */ boolean isSwapped; /* If TRUE need to byte swap everything. */ bits64 chromOffset; /* Offset of chromosome index. */ bits64 cirOffset; /* Offset of range index. */ }; struct crTreeFile *crTreeFileOpen(char *fileName); /* Open up r-tree index file - reading headers and verifying things. */ void crTreeFileClose(struct crTreeFile **pCrt); /* Close and free up crTree file opened with crTreeFileAttach. */ struct fileOffsetSize *crTreeFindOverlappingBlocks(struct crTreeFile *crt, char *chrom, bits32 start, bits32 end); /* Return list of file blocks that between them contain all items that overlap * start/end on chromIx. Also there will be likely some non-overlapping items * in these blocks too. When done, use slListFree to dispose of the result. */ struct crTreeItem /* A chromosome and an interval inside it. */ { struct crTreeItem *next; /* Next in singly linked list. */ char *chrom; /* Name of chromosome not allocated here. */ bits32 start; /* Start position in chromosome. */ bits32 end; /* One past last base in interval in chromosome. */ bits64 fileOffset; /* Offset of item in file we are indexing. */ }; void crTreeFileCreateInputCheck(struct crTreeItem *itemList, struct hash *chromHash, bits32 blockSize, bits32 itemsPerSlot, bits64 endPosition, char *fileName); /* Do sanity checking on itemList and chromHash and endPosition. Make sure that itemList is * sorted properly mostly. */ void crTreeFileCreate( struct crTreeItem *itemList, /* List of all items - sorted here and in underlying file. */ struct hash *chromHash, /* Hash of all chromosome names. */ bits32 blockSize, /* R tree block size - # of children for each node. 1024 is good. */ bits32 itemsPerSlot, /* Number of items to put in each index slot at lowest level. * Typically either blockSize/2 or 1. */ bits64 endPosition, /* File offset after have read all items in file. */ char *fileName); /* Name of output file. */ /* Create a cr tree index of file. The itemList contains the position of each item in the * chromosome and in the file being indexed. Both the file and the itemList must be sorted * by chromosome (alphabetic), start (numerical), end (numerical). */ #endif /* CRTREE_H */