/* genePred.h was originally generated by the autoSql program, which also * generated genePred.c and genePred.sql. This header links the database and the RAM * representation of objects. */ #ifndef GENEPRED_H #define GENEPRED_H struct gff; struct gffFile; struct gffGroup; struct psl; struct genbankCds; struct rbTree; enum cdsStatus /* value to indicate status of CDS annotation at either start or end */ { cdsNone, /* "none" - No CDS (non-coding) */ cdsUnknown, /* "unk" - CDS is unknown (coding, but not known) */ cdsIncomplete, /* "incmpl" - CDS is not complete at this end */ cdsComplete, /* "cmpl" - CDS is complete at this end */ }; enum genePredCreateOpts /* bit set of options for genePredGetCreateSql */ { genePredBasicSql = 0x00, /* used if nothing special */ genePredWithBin = 0x01 /* create bin column */ }; enum genePredFromPslOpts /* bit set of options for genePredFromPsl3 */ { genePredPslDefaults = 0x00, /* used if nothing special */ genePredPslCdsMod3 = 0x01 /* only merge gaps in CDS if mod 3 */ }; enum genePredFromGxfOpts /* bit set of options for genePredFromGroupedGff/genePredFromGroupedGtf */ { genePredGxfDefaults = 0x00, /* used if nothing special */ genePredGxfImpliedStopAfterCds = 0x01, /* stop codon is implied outside of * the annotated CDS bounds */ genePredGxfGeneNameAsName2 = 0x02 /* use gene_name instead of gene_id * for name2 */ }; enum genePredFields /* Bit set to indicate which optional fields are used. * N.B. value order must match order in genePred */ { genePredNoOptFld = 0x00, /* use for no opt fields */ genePredScoreFld = 0x01, /* score field */ genePredName2Fld = 0x02, /* name2 field */ genePredCdsStatFld = 0x04, /* cdsStart/EndStat fields */ genePredExonFramesFld = 0x08, /* exonFrames field */ genePredAllFlds = 0xFF /* include all extended fields */ }; struct genePred /* A gene prediction, with optional fields. */ { struct genePred *next; /* Next in singly linked list. */ char *name; /* Name of loci, transcript, mRNA, etc */ char *chrom; /* Chromosome name */ char strand[2]; /* + or - for strand */ unsigned txStart; /* Transcription start position */ unsigned txEnd; /* Transcription end position */ unsigned cdsStart; /* Coding region start */ unsigned cdsEnd; /* Coding region end */ unsigned exonCount; /* Number of exons */ unsigned *exonStarts; /* Exon start positions */ unsigned *exonEnds; /* Exon end positions */ /* optional fields */ unsigned optFields; /* which optional fields are used (not in * database) */ int score; /* score */ char *name2; /* Secondary name. (e.g. name of gene), or * empty if none, NULL if field not * requested */ enum cdsStatus cdsStartStat; /* Status of cdsStart annotation */ enum cdsStatus cdsEndStat; /* Status of cdsEnd annotation */ int *exonFrames; /* List of frame for each exon, or -1 * if no frame or not known. NULL if not * available. */ }; /* Standard value to use for insertMergeSize when creating genePred. * Set to 8 due to microdeletions. */ #define genePredStdInsertMergeSize 8 #define GENEPRED_NUM_COLS 10 /* number of columns in a genePred */ #define GENEPREDX_NUM_COLS 15 /* max number of columns in extended genePred */ struct genePred *genePredLoad(char **row); /* Load a genePred from row fetched with select * from genePred * from database. Dispose of this with genePredFree(). * NOTE: cannabalizes the row argument */ struct genePred *genePredLoadAll(char *fileName); /* Load all genePred from whitespace-separated file. * Dispose of this with genePredFreeList(). */ struct genePred *genePredLoadAllByChar(char *fileName, char chopper); /* Load all genePred from chopper separated file. * Dispose of this with genePredFreeList(). */ #define genePredLoadAllByTab(a) genePredLoadAllByChar(a, '\t'); /* Load all genePred from tab separated file. * Dispose of this with genePredFreeList(). */ struct genePred *genePredCommaIn(char **pS, struct genePred *ret); /* Create a genePred out of a comma separated string. * This will fill in ret if non-null, otherwise will * return a new genePred */ void genePredFree(struct genePred **pEl); /* Free a single dynamically allocated genePred such as created * with genePredLoad(). */ void genePredFreeList(struct genePred **pList); /* Free a list of dynamically allocated genePred's */ void genePredOutput(struct genePred *el, FILE *f, char sep, char lastSep); /* Print out genePred. Separate fields with sep. Follow last field with lastSep. */ #define genePredTabOut(el,f) genePredOutput(el,f,'\t','\n') /* Print out genePred as a line in a tab-separated file. */ #define genePredCommaOut(el,f) genePredOutput(el,f,',',',') /* Print out genePred as a comma separated list including final comma. */ /* --------- Start of hand generated code. ---------------------------- */ struct genePred *genePredExtLoad(char **row, int numCols); /* Load a genePred with from a row, with optional fields. The row must * contain columns in the order in the struct, and they must be present up to * the last specfied optional field. Missing intermediate fields must have * zero or empty columns, they may not be omitted. Fields at the end can be * omitted. Dispose of this with genePredFree(). */ struct genePred *genePredExtLoadAll(char *fileName); /* Load all genePreds with from tab-separated file, possibly with optional * fields. Dispose of this with genePredFreeList(). */ char *genePredCdsStatStr(enum cdsStatus stat); /* get string value of a cdsStatus */ void genePredAddGenbankCds(struct psl *psl, struct genbankCds* cds, struct genePred *gene); /* Convert cdsStart/End from mrna to genomic coordinates. * Note that the genePred blocks need not be filled in before * this call. */ int genePredCmp(const void *va, const void *vb); /* Compare to sort based on chromosome, txStart. */ int genePredNameCmp(const void *va, const void *vb); /* Compare to sort based on name, then chromosome, txStart. */ struct genePred *genePredFromGroupedGff(struct gffFile *gff, struct gffGroup *group, char *name, char *exonSelectWord, unsigned optFields, unsigned options); /* Convert gff->groupList to genePred list. Only put lines where feature type matches * exonSelectWord into the gene. (If exonSelectWord is NULL, all go in) * If optFields contains the bit set of optional fields to add to the genePred. * If genePredCdsStatFld is set, then the CDS status information is * set based on the presences of start_codon, stop_codon, and CDS features. * If genePredExonFramesFld is set, then frame is set as specified in the GTF. * Options are from genePredFromGxfOpts. If genePredGxfImpliedStopAfterCds * is specified, it is treated as if a stop_codon annotation was found, * if there isn't one. If genePredGxfGeneNameAsName2 is specified, use * gene_name for the name2 field otherwise gene_id. */ struct genePred *genePredFromGroupedGtf(struct gffFile *gff, struct gffGroup *group, char *name, unsigned optFields, unsigned options); /* Convert gff->groupList to genePred list, using GTF feature conventions; * including the stop codon in the 3' UTR, not the CDS (grr). Assumes * gffGroup is sorted in assending coords, with overlaping starts sorted by * end coords, which is true if it was created by gffGroupLines(). If * optFields contains the bit set of optional fields to add to the genePred. * If genePredName2Fld is specified, then the gene_id is used for the name2 * field. If genePredCdsStatFld is set, then the CDS status information is * set based on the presences of start_codon, stop_codon, and CDS features. * If genePredExonFramesFld is set, then frame is set as specified in the GTF. * Options are from genePredFromGxfOpts. If genePredGxfImpliedStopAfterCds * is specified, it is treated as if a stop_codon annotation was found, * if there isn't one. */ struct genePred *genePredFromPsl3(struct psl *psl, struct genbankCds* cds, unsigned optFields, unsigned options, int cdsMergeSize, int utrMergeSize); /* Convert a PSL of an mRNA alignment to a genePred, converting a genbank CDS * specification string to genomic coordinates. Small genomic inserts are * merged based on the mergeSize parameters. Gaps no larger than the * specified merge sizes result in the adjacent blocks being merged into a * single exon. Gaps in CDS use cdsMergeSize, in UTR use utrMergeSize. If * the genePredPslCdsMod3 option is specified, then CDS gaps are only merged * if a multiple of three. A negative merge sizes disables merging of blocks. * This differs from specifying zero in that adjacent blocks will not be * merged. The optfields field is a set from genePredFields, indicated what * fields to create. Zero-length CDS, or null cds, creates without CDS * annotation. If cds is null, it will set status fields to cdsNone. */ struct genePred *genePredFromPsl2(struct psl *psl, unsigned optFields, struct genbankCds* cds, int insertMergeSize); /* Compatibility function, genePredFromPsl3 is prefered. See that function's * documentation for details. This calls genePredFromPsl3 with no options * and insertMergeSize set for CDS and UTR. */ struct genePred *genePredFromPsl(struct psl *psl, int cdsStart, int cdsEnd, int insertMergeSize); /* Compatibility function, genePredFromPsl3 is prefered. See that function's * documentation for details. This calls genePredFromPsl3 with no options. */ char* genePredGetCreateSql(char* table, unsigned optFields, unsigned options, int chromIndexLen); /* Get SQL required to create a genePred table. optFields is a bit set * consisting of the genePredFields values. Options are a bit set of * genePredCreateOpts. Returned string should be freed. This will create all * optional fields that preceed the highest optFields column. chromIndexLen * is now ignored.. */ struct genePred *getOverlappingGene(char *db, struct genePred **list, char *table, char *chrom, int cStart, int cEnd, char *name, int *retOverlap); /* read all genes from a table find the gene with the biggest overlap. * Cache the list of genes to so we only read it once. * If there are multiple hits and the name that matches exactly, * this overrides the biggest overlap */ int genePredBases(struct genePred *gp); /* count coding and utr bases in a gene prediction */ int genePredCodingBases(struct genePred *gp); /* Count up the number of coding bases in gene prediction. */ boolean genePredCdsExon(struct genePred *gp, int iExon, int *startPtr, int *endPtr); /* Get the CDS range in an exon. If there is no CDS, return FALSE and then * set start == end */ int genePredCheck(char *desc, FILE* out, int chromSize, struct genePred* gp); /* Validate a genePred for consistency. desc is printed the error messages * to file out (open /dev/null to discard). chromSize should contain * size of chromosome, or 0 if chrom is not valid, or -1 to not check * chromosome bounds. Returns count of errors. */ boolean genePredNmdTarget(struct genePred *gp); /* Return TRUE if cds end is more than 50bp upstream of last intron. */ void genePredAddExonFrames(struct genePred *gp); /* Add exonFrames array to a genePred that doesn't have it. Frame is assumed * to be contiguous. */ void genePredRc(struct genePred *gp, int chromSize); /* Reverse complement a genePred (project it to the opposite strand). Useful * when doing analysis that is simplified by having things on the same strand. */ int genePredCdsSize(struct genePred *gp); /* compute the number of bases of CDS */ struct genePred *genePredNew(char *name, char *chrom, char strand, unsigned txStart, unsigned txEnd, unsigned cdsStart, unsigned cdsEnd, unsigned optFields, unsigned exonSpace); /* create a new gene with space for the specified number of exons allocated. * genePredGrow maybe used to expand this space if needed. */ void genePredGrow(struct genePred *gp, unsigned *exonSpacePtr); /* Increase memory allocated to a psl to hold more exons. exonSpacePtr * should point the the current maximum number of exons and will be * updated to with the new amount of space. */ struct rbTree *genePredToRangeTree(struct genePred *gp, boolean cdsOnly); /* Convert genePred into a range tree. */ void gpPartOutAsBed(struct genePred *gp, int start, int end, FILE *f, char *type, int id, int minSize); /* Write out part of gp as bed12. */ boolean codonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd); // map 1-based codon to genomic coordinates. If the codon crosses an exon junction, we return just the beginning (LHS) of the codon. // Returns true if we find the codon in given gene predition; chromStart and chromEnd are set to appropriate three base region. boolean exonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd); // map 1-based exon number to genomic coordinates. // Returns true if we find the exon in given gene predition; chromStart and chromEnd are set to appropriate region. #endif /* GENEPRED_H */