/* regStartSampleEmbl - Make up a EMBL format file (because it's an easy way to do 
 * structured multiline text) with a sample of genes to annotate.. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "basicBed.h"


void usage()
/* Explain usage and exit. */
{
errAbort(
  "regStartSampleEmbl - Make up a EMBL format file (because it's an easy way to do structured multiline text) with a sample of genes to annotate.\n"
  "usage:\n"
  "   regStartSampleEmbl db count output.embl\n"
  "options:\n"
  "   -xxx=XXX\n"
  );
}

static struct optionSpec options[] = {
   {NULL, 0},
};

int getStartOfGeneBefore(struct sqlConnection *conn, struct bed4 *gene)
/* Get the start of the previous non-overlapping gene */
{
char query[256];
safef(query, sizeof(query), 
    "select max(txStart) from knownGene where chrom='%s' and txEnd < %d", 
    gene->chrom, gene->chromStart);
return sqlQuickNum(conn, query);
}

int getEndOfGeneAfter(struct sqlConnection *conn, struct bed4 *gene)
/* Get the end of the next non-overlapping gene */
{
char query[256];
safef(query, sizeof(query), 
    "select min(txEnd) from knownGene where chrom='%s' and txStart > %d", 
    gene->chrom, gene->chromEnd);
return sqlQuickNum(conn, query);
}

void regStartSampleEmbl(char *db, char *countString, char *outFile)
/* regStartSampleEmbl - Make up a EMBL format file (because it's an easy way to do 
 * structured multiline text) with a sample of genes to annotate.. */
{
int count = atoi(countString);
struct sqlConnection *conn = sqlConnect(db);
FILE *f = mustOpen(outFile, "w");

/* Get list of random genes (canonical isoform) into bed4 format. */
struct bed4 *gene, *geneList = NULL;
char query[512];
safef(query, sizeof(query), 
	"select chrom,chromStart,chromEnd,transcript from knownCanonical,kgTxInfo "
	"where knownCanonical.transcript = kgTxInfo.name "
	"and chrom not like '%%hap%%' "
	"and category='coding' order by rand() limit %d", count);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row;
while ((row = sqlNextRow(sr)) != NULL)
    {
    AllocVar(gene);
    gene->chrom = cloneString(row[0]);
    gene->chromStart = sqlUnsigned(row[1]);
    gene->chromEnd = sqlUnsigned(row[2]);
    gene->name = cloneString(row[3]);
    slAddHead(&geneList, gene);
    }
slReverse(&geneList);

int ix = 0;
for (gene = geneList; gene != NULL; gene = gene->next)
    {
    /* Print basic information on gene. */
    safef(query, sizeof(query), "select geneSymbol,description from kgXref where kgId = '%s'"
    	,gene->name);
    struct sqlResult *sr = sqlGetResult(conn, query);
    char **row = sqlNextRow(sr);
    fprintf(f, "GENE %s\n", row[0]);
    fprintf(f, "DESC %s\n", row[1]);
    sqlFreeResult(&sr);
    fprintf(f, "UCSC %s\n", gene->name);
    fprintf(f, "NUMB %d\n", ++ix);

    /* Print out number of splicing isoforms. */
    safef(query, sizeof(query), "select clusterId from knownCanonical where transcript='%s'"
    	,gene->name);
    int clusterId = sqlQuickNum(conn, query);
    safef(query, sizeof(query), "select count(*) from knownIsoforms where clusterId=%d"
    	,clusterId);
    int isoformCount = sqlQuickNum(conn, query);
    fprintf(f, "ISOF %d\n", isoformCount);

    /* Get gene neighborhood. */
    safef(query, sizeof(query), "select chrom from knownGene whre name = '%s'", gene->name);
    int start = getStartOfGeneBefore(conn, gene);
    int end = getEndOfGeneAfter(conn, gene);
    fprintf(f, "NBHD %s:%d-%d\n", gene->chrom, start+1, end);
    sqlFreeResult(&sr);

    /* Now print some lines we need to fill in by hand. */
    fprintf(f, "TRANSC \n");
    fprintf(f, "BIPROM \n");
    fprintf(f, "DNAPRO \n");
    fprintf(f, "ME3PRO \n");
    fprintf(f, "ME1PRO \n");
    fprintf(f, "ME1BEF \n");
    fprintf(f, "ME1INT \n");
    fprintf(f, "NOTES \n");

    fprintf(f, "//\n");
    }
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 4)
    usage();
regStartSampleEmbl(argv[1], argv[2], argv[3]);
return 0;
}