/* Example program for Alessandro Guffanti that shows how
 * to use PatSpace and FuzzyFinder to make  EST/genomic 
 * alignments. */

/* This program finds an alignment between a short and a long
 * .fa file and and prints the result as a .html file. */

#include "common.h"         /* String, list, file utilities. */
#include "dnautil.h"       /* Handy utilities on DNA. */
#include "dnaseq.h"         /* DNA sequence - byte per nucleotide. */
#include "fa.h"             /* Read/write fasta files. */
#include "fuzzyFind.h"      /* Local aligner for similar sequences. */
#include "patSpace.h"       /* Fast homology finder for similar sequences. */

void usage()
/* Print usage instructions and exit. */
{
    errAbort("forAg - Aligns a series of EST sequences to a genomic sequence.\n"
             "Usage:\n"
             "    forAg EST.fa genomic.fa celegans.ooc\n"
             "The EST.fa will ideally have a lot of ESTs in it.  The\n"
             "genomic .fa file can have several clones.  It is sadly\n"
             "limited to 8 million bases long.  You'll have to break\n"
             "up larger contigs.   celegans.ooc contains a list of\n"
             "11-mers which occur too frequently in the C. elegans\n"
             "genome to be useful for locating things.");
}

int totalSequenceSize(struct dnaSeq *seqList)
/* Returns total size of all sequences in list. */
{
struct dnaSeq *seq;
int total = 0;

for (seq = seqList; seq != NULL; seq = seq->next)
    total += seq->size;
return total;
}

void freeSeqList(struct dnaSeq **pSeqList)
/* Free an entire list of sequences */
{
struct dnaSeq *seq, *next;
for (seq = *pSeqList; seq != NULL; seq = next)
    {
    next = seq->next;
    freeDnaSeq(&seq);
    }
*pSeqList = NULL;
}

int main(int argc, char *argv[])
{
char *estName, *targetName, *oocName;
FILE *estFile;
struct dnaSeq *target;
struct dnaSeq *est;
struct patSpace *ps;
struct patClump *clumpList, *clump;
int estIx = 0;

/* Check command line arguments and assign to local variables. */
if (argc != 4)
    usage();
estName = argv[1];
estFile = mustOpen(estName, "rb");
targetName = argv[2];
oocName = argv[3];

/* Read in target DNA from fasta files and check not too big. */
fprintf(stderr, "Reading %s\n", targetName);
target = faReadAllDna(targetName);
if (totalSequenceSize(target) > 8000000)
    {
    errAbort("Can only handle 8000000 bases of genomic sequence at once, %s has %d.", 
        targetName, totalSequenceSize(target));
    }

/* Make a pattern space index structure. */
fprintf(stderr, "Making Pattern Space index\n");
ps = makePatSpace(&target, 1, oocName, 4, 32000);

/* Loop through each EST in query list. */
printf("Searching for hits\n\n");
while (faReadNext(estFile, NULL, TRUE, NULL, &est))
    {
    boolean isRc;   /* Reverse complemented? */

    if (++estIx % 5000 == 0)
        fprintf(stderr, "Processing EST %d\n", estIx);
    if (est->size > 20000)
        {
        warn("Very large EST sequence %s.\n"
             "Maybe you mixed up the EST and genomic parameters?", est->name);
        usage();
        }
    
    for (isRc = 0; isRc <= 1; ++isRc)   /* Search both strands. */
        {
        if (isRc)
            reverseComplement(est->dna, est->size);
        clumpList = patSpaceFindOne(ps, est->dna, est->size);

        /* For each homology clump patSpace finds, do a fuzzyFinder
         * alignment of it and print the results. */
        for (clump = clumpList; clump != NULL; clump = clump->next)
            {
            struct ffAli *ali, *a;
            boolean isRc;
            int score;
            struct dnaSeq *t = clump->seq;
            DNA *tStart = t->dna + clump->start;

            ali = ffFind(est->dna, est->dna+est->size, tStart, tStart + clump->size, ffCdna);
            if (ali != NULL)
                {
                score = ffScoreCdna(ali);
                printf("%s hits %s strand %c score %d\n", 
                    est->name, t->name, (isRc ? '+' : '-'), score);
                for (a = ali; a != NULL; a = a->right)
                    {
                    printf("  Q %4d - %4d\t T %4d -%4d\n", 
                        a->nStart - est->dna, a->nEnd - est->dna,
                        a->hStart - t->dna, a->hEnd - t->dna);
                    }
                printf("\n");
                ffFreeAli(&ali);
                }
            else
                {
                printf("Couldn't align clump at %s %d-%d\n",
                    t->name, clump->start, clump->start + clump->size);
                }
            }
        slFreeList(&clumpList);
        }
    freeDnaSeq(&est);
    }
/* Clean up time. */
freePatSpace(&ps);
freeSeqList(&target);
return 0;
}