/* genoFind.h - Interface to modules for fast finding of sequence * matches. */ /* Copyright 2001-2002 Jim Kent. All rights reserved. */ #ifndef GENOFIND_H #define GENOFIND_H #ifndef DNASEQ_H #include "dnaseq.h" #endif #ifndef FUZZYFIND_H #include "fuzzyFind.h" #endif #ifndef HASH_H #include "hash.h" #endif #ifndef ALITYPE_H #include "aliType.h" #endif #ifndef LOCALMEM_H #include "localmem.h" #endif #ifndef BITS_H #include "bits.h" #endif #ifndef AXT_H #include "axt.h" #endif enum gfConstants { gfMinMatch = 2, gfMaxGap = 2, gfTileSize = 11, gfMaxTileUse = 1024, gfPepMaxTileUse = 30000, }; struct gfSeqSource /* Where a block of sequence comes from. */ { struct gfSeqSource *next; char *fileName; /* Name of file. */ bioSeq *seq; /* Sequences. Usually either this or fileName is NULL. */ bits32 start,end; /* Position within merged sequence. */ Bits *maskedBits; /* If non-null contains repeat-masking info. */ }; struct gfHit /* A genoFind hit. */ { struct gfHit *next; bits32 qStart; /* Where it hits in query. */ bits32 tStart; /* Where it hits in target. */ bits32 diagonal; /* tStart + qSize - qStart. */ }; /* gfHits are free'd with simple freeMem or slFreeList. */ struct gfClump /* A clump of hits. */ /* Note: for clumps from regular (blat) queries, tStart and tEnd include * target->start, but for clumps from gfPcrClumps(), tStart and tEnd have * already had target->start subtracted. So tStart and tEnd in PCR clumps * are relative to that target sequence (not the collection of all target * sequences). */ { struct gfClump *next; /* Next clump. */ bits32 qStart, qEnd; /* Position in query. */ struct gfSeqSource *target; /* Target source sequence. */ bits32 tStart, tEnd; /* Position in target. */ int hitCount; /* Number of hits. */ struct gfHit *hitList; /* List of hits. Not allocated here. */ int queryCoverage; /* Number of bases covered in query (thx AG!) */ }; void gfClumpFree(struct gfClump **pClump); /* Free a single clump. */ void gfClumpFreeList(struct gfClump **pList); /* Free a list of dynamically allocated gfClump's */ struct genoFind /* An index of all K-mers in the genome. */ { int maxPat; /* Max # of times pattern can occur * before it is ignored. */ int minMatch; /* Minimum number of tile hits needed * to trigger a clump hit. */ int maxGap; /* Max gap between tiles in a clump. */ int tileSize; /* Size of each N-mer. */ int stepSize; /* Spacing between N-mers. */ int tileSpaceSize; /* Number of N-mer values. */ int tileMask; /* 1-s for each N-mer. */ int sourceCount; /* Count of source files. */ struct gfSeqSource *sources; /* List of sequence sources. */ bool isPep; /* Is a peptide. */ bool allowOneMismatch; /* Allow a single mismatch? */ int segSize; /* Index is segmented if non-zero. */ bits32 totalSeqSize; /* Total size of all sequences. */ bits32 *listSizes; /* Size of list for each N-mer */ void *allocated; /* Storage space for all lists. */ bits32 **lists; /* A list for each N-mer. Used if * isSegmented is false. */ bits16 **endLists; /* A more complex list for each N-mer. * Used if isSegmented is true. * Values come in groups of threes. * The first is the packed last few * letters of the tile. The next two * are the offset in the genome. This * would be a struct but that would take * 8 bytes instead of 6, or nearly an * extra gigabyte of RAM. */ }; void genoFindFree(struct genoFind **pGenoFind); /* Free up a genoFind index. */ struct gfSeqSource *gfFindNamedSource(struct genoFind *gf, char *name); /* Find target of given name. Return NULL if none. */ /* --- Stuff for saving results ---- */ struct gfOutput /* A polymorphic object to help us write many file types. */ { struct gfOutput *next; void *data; /* Type-specific data pointer. Must be freeMem'able */ void (*out)(char *chromName, int chromSize, int chromOffset, struct ffAli *ali, bioSeq *tSeq, struct hash *t3Hash, bioSeq *qSeq, boolean qIsRc, boolean tIsRc, enum ffStringency stringency, int minMatch, struct gfOutput *out); /* This is the type of a client provided function to save an alignment. * The parameters are: * chromName - name of target (aka genomic or database) sequence. * chromSize - size of target sequence. * chromOffset - offset of genoSequence in target. * ffAli - alignment with pointers into tSeq/qSeq or in * translated target case, into t3Hash. * tSeq - part of target sequence in normal case. In translated * target case look at t3Hash instead. * t3Hash - used only in translated target case. A hash keyed by * target sequence name with values *lists* of trans3 structures. * This hash can be searched to find both the translated and * untranslated versions of the bits of the target that are in * memory. (You can assume at this point all parts needed for * output are indeed in memory.) * qSeq - query sequence (this isn't segmented at all). * isRc - True if query is reverse complemented. * stringency - ffCdna, etc. I'm hoping to move this elsewhere. * minMatch - minimum score to output. Also should be moved elsewhere. * outputData - custom data for specific output function. * The interface is a bit complex - partly from the demands of translated * output, and partly from trying not to have the entire target sequence in * memory. */ void (*queryOut)(struct gfOutput *out, FILE *f); /* Called for each query */ void (*fileHead)(struct gfOutput *out, FILE *f); /* Write file header if any */ boolean reportTargetStrand; /* Report target as well as query strand? */ struct hash *maskHash; /* associates target sequence name and mask. */ int minGood; /* Minimum sequence identity in thousandths. */ boolean qIsProt; /* Query is peptide. */ boolean tIsProt; /* Target is peptide. */ int queryIx; /* Index of query */ boolean includeTargetFile; /* Prefix file: to target sequence name. */ }; struct gfOutput *gfOutputAny(char *format, int goodPpt, boolean qIsProt, boolean tIsProt, boolean noHead, char *databaseName, int databaseSeqCount, double databaseLetters, double minIdentity, FILE *f); /* Initialize output in a variety of formats in file or memory. * Parameters: * format - either 'psl', 'pslx', 'blast', 'wublast', 'axt' * goodPpt - minimum identity of alignments to output in parts per thousand * qIsProt - true if query side is a protein. * tIsProt - true if target (database) side is a protein. * noHead - if true suppress header in psl/pslx output. * databaseName - name of database. Only used for blast output * databaseSeq - number of sequences in database - only for blast * databaseLetters - number of bases/aas in database - only blast * FILE *f - file. */ struct gfOutput *gfOutputPsl(int goodPpt, boolean qIsProt, boolean tIsProt, FILE *f, boolean saveSeq, boolean noHead); /* Set up psl/pslx output */ struct gfOutput *gfOutputAxt(int goodPpt, boolean qIsProt, boolean tIsProt, FILE *f); /* Setup output for axt format. */ struct gfOutput *gfOutputAxtMem(int goodPpt, boolean qIsProt, boolean tIsProt); /* Setup output for in memory axt output. */ struct gfOutput *gfOutputBlast(int goodPpt, boolean qIsProt, boolean tIsProt, char *databaseName, int databaseSeqCount, double databaseLetters, char *blastType, /* blast, blast8, blast9, wublast, or xml */ double minIdentity, FILE *f); /* Setup output for blast/wublast format. */ void gfOutputQuery(struct gfOutput *out, FILE *f); /* Finish writing out results for a query to file. */ void gfOutputHead(struct gfOutput *out, FILE *f); /* Write out header if any. */ void gfOutputFree(struct gfOutput **pOut); /* Free up output. */ /* -------- Routines to build up index ------------ */ void gfCheckTileSize(int tileSize, boolean isPep); /* Check that tile size is legal. Abort if not. */ struct genoFind *gfIndexSeq(bioSeq *seqList, int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean isPep, boolean allowOneMismatch, boolean maskUpper, int stepSize); /* Make index for all seqs in list. * minMatch - minimum number of matching tiles to trigger alignments * maxGap - maximum deviation from diagonal of tiles * tileSize - size of tile in nucleotides * maxPat - maximum use of tile to not be considered a repeat * oocFile - .ooc format file that lists repeat tiles. May be NULL. * isPep - TRUE if indexing proteins, FALSE for DNA. * maskUpper - Mask out upper case sequence (currently only for nucleotides). * stepSize - space between tiles. Zero means default (which is tileSize). * For DNA sequences upper case bits will be unindexed. */ struct genoFind *gfIndexNibsAndTwoBits(int fileCount, char *fileNames[], int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean allowOneMismatch, int stepSize); /* Make index for all .nib and .2bits in list. * minMatch - minimum number of matching tiles to trigger alignments * maxGap - maximum deviation from diagonal of tiles * tileSize - size of tile in nucleotides * maxPat - maximum use of tile to not be considered a repeat * oocFile - .ooc format file that lists repeat tiles. May be NULL. * allowOneMismatch - allow one mismatch in a tile. * stepSize - space between tiles. Zero means default (which is tileSize). */ void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3], int fileCount, char *fileNames[], int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean allowOneMismatch, boolean mask, int stepSize); /* Make translated (6 frame) index for all .nib and .2bit files. */ /* -------- Routines to scan index for homolgous areas ------------ */ struct gfClump *gfFindClumps(struct genoFind *gf, struct dnaSeq *seq, struct lm *lm, int *retHitCount); /* Find clumps associated with one sequence. */ struct gfClump *gfFindClumpsWithQmask(struct genoFind *gf, bioSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount); /* Find clumps associated with one sequence soft-masking seq according to qMaskBits */ struct gfHit *gfFindHitsInRegion(struct genoFind *gf, bioSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, struct gfSeqSource *target, int tMin, int tMax); /* Find hits restricted to one particular region. * The hits returned by this will be in target sequence * coordinates rather than concatenated whole genome * coordinates as hits inside of clumps usually are. */ void gfTransFindClumps(struct genoFind *gfs[3], aaSeq *seq, struct gfClump *clumps[3], struct lm *lm, int *retHitCount); /* Find clumps associated with one sequence in three translated reading frames. */ void gfTransTransFindClumps(struct genoFind *gfs[3], aaSeq *seqs[3], struct gfClump *clumps[3][3], struct lm *lm, int *retHitCount); /* Find clumps associated with three sequences in three translated * reading frames. Used for translated/translated protein comparisons. */ void gfClumpDump(struct genoFind *gf, struct gfClump *clump, FILE *f); /* Print out info on clump. This routine subtracts clump->target->start * from clump->tStart and from clump->tEnd for printing, so that printed * coords are relative to that target sequence. */ void gfAlignAaClumps(struct genoFind *gf, struct gfClump *clumpList, aaSeq *seq, boolean isRc, int minMatch, struct gfOutput *out); /* Convert gfClumps to an actual alignment that gets saved via * outFunction/outData. */ void gfFindAlignAaTrans(struct genoFind *gfs[3], aaSeq *qSeq, struct hash *t3Hash, boolean tIsRc, int minMatch, struct gfOutput *out); /* Look for qSeq alignment in three translated reading frames. Save alignment * via outFunction/outData. */ /* --- Some routines for dealing with gfServer at a low level ---- */ char *gfSignature(); /* Return signature that starts each command to gfServer. Helps defend * server from confused clients. */ void gfCatchPipes(); /* Set up to catch broken pipe signals. */ int gfReadMulti(int sd, void *vBuf, size_t size); /* Read in until all is read or there is an error. */ /* --- Some routines for dealing with gfServer at a high level ---- */ struct hash *gfFileCacheNew(); /* Create hash for storing info on .nib and .2bit files. */ void gfFileCacheFree(struct hash **pCache); /* Free up resources in cache. */ void gfAlignStrand(int *pConn, char *nibDir, struct dnaSeq *seq, boolean isRc, int minMatch, struct hash *tFileCache, struct gfOutput *out); /* Search genome on server with one strand of other sequence to find homology. * Then load homologous bits of genome locally and do detailed alignment. * Call 'outFunction' with each alignment that is found. gfSavePsl is a handy * outFunction to use. */ void gfAlignTrans(int *pConn, char *nibDir, aaSeq *seq, int minMatch, struct hash *tFileHash, struct gfOutput *out); /* Search indexed translated genome on server with an amino acid sequence. * Then load homologous bits of genome locally and do detailed alignment. * Call 'outFunction' with each alignment that is found. */ void gfAlignTransTrans(int *pConn, char *nibDir, struct dnaSeq *seq, boolean qIsRc, int minMatch, struct hash *tFileCache, struct gfOutput *out, boolean isRna); /* Search indexed translated genome on server with an dna sequence. Translate * this sequence in three frames. Load homologous bits of genome locally * and do detailed alignment. Call 'outFunction' with each alignment * that is found. */ int gfConnect(char *hostName, char *portName); /* Set up our network connection to server. */ int gfDefaultRepMatch(int tileSize, int stepSize, boolean protTiles); /* Figure out appropriate step repMatch value. */ void gfMakeOoc(char *outName, char *files[], int fileCount, int tileSize, bits32 maxPat, enum gfType tType); /* Count occurences of tiles in seqList and make a .ooc file. */ void gfLongDnaInMem(struct dnaSeq *query, struct genoFind *gf, boolean isRc, int minScore, Bits *qMaskBits, struct gfOutput *out, boolean fastMap, boolean band); /* Chop up query into pieces, align each, and stitch back * together again. */ void gfLongTransTransInMem(struct dnaSeq *query, struct genoFind *gfs[3], struct hash *t3Hash, boolean qIsRc, boolean tIsRc, boolean qIsRna, int minScore, struct gfOutput *out); /* Chop up query into pieces, align each in translated space, and stitch back * together again as nucleotides. */ struct gfClump *gfPcrClumps(struct genoFind *gf, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize, int minDistance, int maxDistance); /* Find possible PCR hits. The fPrimer and rPrimer are on opposite strands. * Note: unlike clumps from other query functions, PCR clumps from this * function have already had clump->target->start subtracted from * clump->tStart and clump->tEnd so that the coords are relative to that * target sequence (not the collection of all target sequences). */ #define MAXSINGLEPIECESIZE 5000 /* maximum size of a single piece */ #define gfVersion "35x1" /* Current BLAT version number */ #endif /* GENOFIND_H */