/* Some stuff that you'll likely need in any program that works with
 * DNA.  Includes stuff for amino acids as well. 
 *
 * Assumes that DNA is stored as a character.
 * The DNA it generates will include the bases 
 * as lowercase tcag.  It will generally accept
 * uppercase as well, and also 'n' or 'N' or '-'
 * for unknown bases. 
 *
 * Amino acids are stored as single character upper case. 
 *
 * This file is copyright 2002 Jim Kent, but license is hereby
 * granted for all use - public, private or commercial. */


#ifndef DNAUTIL_H
#define DNAUTIL_H

void dnaUtilOpen(); /* Good idea to call this before using any arrays
		     * here.  */

/* Numerical values for bases. */
#define T_BASE_VAL 0
#define U_BASE_VAL 0
#define C_BASE_VAL 1
#define A_BASE_VAL 2
#define G_BASE_VAL 3
#define N_BASE_VAL 4   /* Used in 1/2 byte representation. */

typedef char DNA;
typedef char AA;
typedef char BIOPOL;	/* Biological polymer. */

/* A little array to help us decide if a character is a 
 * nucleotide, and if so convert it to lower case. 
 * Contains zeroes for characters that aren't used
 * in DNA sequence. */
extern DNA ntChars[256];
extern AA aaChars[256];

/* An array that converts alphabetical DNA representation
 * to numerical one: X_BASE_VAL as above.  For charaters
 * other than [atgcATGC], has -1. */
extern int ntVal[256];
extern int aaVal[256];
extern int ntValLower[256];	/* NT values only for lower case. */
extern int ntValUpper[256];	/* NT values only for upper case. */

/* Like ntVal, but with T_BASE_VAL in place of -1 for nonexistent nucleotides. */
extern int ntValNoN[256];     

/* Like ntVal but with N_BASE_VAL in place of -1 for 'n', 'x', '-', etc. */
extern int ntVal5[256];

/* Inverse array - takes X_BASE_VAL int to a DNA char
 * value. */
extern DNA valToNt[];

/* Similar array that doesn't convert to lower case. */
extern DNA ntMixedCaseChars[256];

/* Another array to help us do complement of DNA  */
extern DNA ntCompTable[256];

/* Arrays to convert between lower case indicating repeat masking, and
 * a 1/2 byte representation where the 4th bit indicates if the characeter
 * is masked. Uses N_BASE_VAL for `n', `x', etc.
*/
#define MASKED_BASE_BIT 8
extern int ntValMasked[256];
extern DNA valToNtMasked[256];

/*Complement DNA (not reverse)*/
void complement(DNA *dna, long length);

/* Reverse complement DNA. */
void reverseComplement(DNA *dna, long length);


/* Reverse offset - return what will be offset (0 based) to
 * same member of array after array is reversed. */
long reverseOffset(long offset, long arraySize);

/* Switch start/end (zero based half open) coordinates
 * to opposite strand. */
void reverseIntRange(int *pStart, int *pEnd, int size);

/* Switch start/end (zero based half open) coordinates
 * to opposite strand. */
void reverseUnsignedRange(unsigned *pStart, unsigned *pEnd, int size); 

enum dnaCase {dnaUpper,dnaLower,dnaMixed,};
/* DNA upper, lower, or mixed case? */

/* Convert T's to U's */
void toRna(DNA *dna);

int cmpDnaStrings(DNA *a, DNA *b);
/* Compare using screwy non-alphabetical DNA order TCGA */

typedef char Codon; /* Our codon type. */

/* Return single letter code (upper case) for protein.
 * Returns X for bad input, 0 for stop codon.
 * The "Standard" Code */
AA lookupCodon(DNA *dna); 

boolean isStopCodon(DNA *dna);
/* Return TRUE if it's a stop codon. */

boolean isKozak(char *dna, int dnaSize, int pos);
/* Return TRUE if it's a Kozak compatible start, using a relatively
 * weak definition (either A/G 3 bases before or G after) . */

boolean isReallyStopCodon(char *dna, boolean selenocysteine);
/* Return TRUE if it's really a stop codon, even considering
 * possibilility of selenocysteine. */

/* Returns one letter code for protein, 
 * 0 for stop codon or X for bad input,
 * Vertebrate Mitochondrial Code */
AA lookupMitoCodon(DNA *dna);

Codon codonVal(DNA *start);
/* Return value from 0-63 of codon starting at start. 
 * Returns -1 if not a codon. */

DNA *valToCodon(int val);
/* Return  codon corresponding to val (0-63) */

extern char *aaAbbr(int i);
/* return pointer to AA abbrevation */

extern char aaLetter(int i);
/* return AA letter */

void dnaTranslateSome(DNA *dna, char *out, int outSize);
/* Translate DNA upto a stop codon or until outSize-1 amino acids, 
 * whichever comes first. Output will be zero terminated. */

char *skipIgnoringDash(char *a, int size, bool skipTrailingDash);
/* Count size number of characters, and any 
 * dash characters. */

int countNonDash(char *a, int size);
/* Count number of non-dash characters. */

int nextPowerOfFour(long x);
/* Return next power of four that would be greater or equal to x.
 * For instance if x < 4, return 1, if x < 16 return 2.... 
 * (From biological point of view how many bases are needed to
 * code this number.) */

long dnaFilteredSize(char *rawDna);
/* Return how long DNA will be after non-DNA is filtered out. */

long aaFilteredSize(char *rawDna);
/* Return how long peptide will be after non-peptide is filtered out. */

void dnaFilter(char *in, DNA *out);
/* Filter out non-DNA characters. */

void aaFilter(char *in, DNA *out);
/* Filter out non-peptide characters. */

void dnaFilterToN(char *in, DNA *out);
/* Change all non-DNA characters to N. */

void upperToN(char *s, int size);
/* Turn upper case letters to N's. */

void lowerToN(char *s, int size);
/* Turn lower case letters to N's. */

void dnaBaseHistogram(DNA *dna, int dnaSize, int histogram[4]);
/* Count up frequency of occurance of each base and store 
 * results in histogram. Use X_BASE_VAL to index histogram. */

void dnaMixedCaseFilter(char *in, DNA *out);
/* Filter out non-DNA characters but leave case intact. */

bits64 basesToBits64(char *dna, int size);
/* Convert dna of given size (up to 32) to binary representation */

bits32 packDna16(DNA *in);
/* pack 16 bases into a word */

bits16 packDna8(DNA *in);
/* Pack 8 bases into a short word */

UBYTE packDna4(DNA *in);
/* Pack 4 bases into a UBYTE */

void unpackDna(bits32 *tiles, int tileCount, DNA *out);
/* Unpack DNA. Expands to 16x tileCount in output. */

void unpackDna4(UBYTE *tiles, int byteCount, DNA *out);
/* Unpack DNA. Expands to 4x byteCount in output. */

void unalignedUnpackDna(bits32 *tiles, int start, int size, DNA *unpacked);
/* Unpack into out, even though not starting/stopping on tile 
 * boundaries. */

int intronOrientationMinSize(DNA *iStart, DNA *iEnd, int minIntronSize);
/* Given a gap in genome from iStart to iEnd, return 
 * Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no
 * intron. */

int intronOrientation(DNA *iStart, DNA *iEnd);
/* Given a gap in genome from iStart to iEnd, return 
 * Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no
 * intron.  Assumes minIntronSize of 32. */

int dnaScore2(DNA a, DNA b);
/* Score match between two bases (relatively crudely). */

int dnaScoreMatch(DNA *a, DNA *b, int size);
/* Compare two pieces of DNA base by base. Total mismatches are
 * subtracted from total matches and returned as score. 'N's 
 * neither hurt nor help score. */

int aaScore2(AA a, AA b);
/* Score match between two bases (relatively crudely). */

int aaScoreMatch(AA *a, AA *b, int size);
/* Compare two peptides aa by aa. */

int  dnaOrAaScoreMatch(char *a, char *b, int size, int matchScore, int mismatchScore, 
	char ignore);
/* Compare two sequences (without inserts or deletions) and score. */

void writeSeqWithBreaks(FILE *f, char *letters, int letterCount, int maxPerLine);
/* Write out letters with newlines every maxLine. */

int tailPolyASizeLoose(DNA *dna, int size);
/* Return size of PolyA at end (if present).  This allows a few non-A's as 
 * noise to be trimmed too, but skips first two aa for taa stop codon. 
 * It is less conservative in extending the polyA region than maskTailPolyA. */

int headPolyTSizeLoose(DNA *dna, int size);
/* Return size of PolyT at start (if present).  This allows a few non-T's as 
 * noise to be trimmed too, but skips last two tt for revcomp'd taa stop 
 * codon.  
 * It is less conservative in extending the polyA region than maskHeadPolyT. */

int maskTailPolyA(DNA *dna, int size);
/* Convert PolyA at end to n.  This allows a few non-A's as noise to be 
 * trimmed too.  Returns number of bases trimmed.  */

int maskHeadPolyT(DNA *dna, int size);
/* Convert PolyT at start.  This allows a few non-T's as noise to be 
 * trimmed too.  Returns number of bases trimmed.  */

boolean isDna(char *poly, int size);
/* Return TRUE if letters in poly are at least 90% ACGTU */

boolean isAllDna(char *poly, int size);
/* Return TRUE if letters in poly are 100% ACGTU */

#endif /* DNAUTIL_H */