/* txPaperPfam - Do stuff for pfam calcs.. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "obscure.h"
#include "portable.h"
#include "txInfo.h"
#include "protFeat.h"


void usage()
/* Explain usage and exit. */
{
errAbort(
  "txPaperPfam - Do stuff for pfam calcs.\n"
  "usage:\n"
  "   txPaperPfam genes.info pfam.tab pfamDesc.tab canonical.lst outDir\n"
  "options:\n"
  "   -xxx=XXX\n"
  );
}

static struct optionSpec options[] = {
   {NULL, 0},
};


FILE *mustCreateInDir(char *dir, char *file)
/* Create a file to write in dir. */
{
char path[PATH_LEN];
safef(path, sizeof(path), "%s/%s", dir, file);
return mustOpen(path, "w");
}

struct featCount
/* Count up number of features. */
    {
    struct featCount *next;
    char *feature;	/* Name of feature. */
    int count;		/* Number of instances. */
    };

int featCountCmp(const void *va, const void *vb)
/* Compare to sort based on count, most numerous first. */
{
const struct featCount *a = *((struct featCount **)va);
const struct featCount *b = *((struct featCount **)vb);
return b->count - a->count;
}

void countFeature(char *protName, struct hash *featHash, struct hash *countHash, struct featCount **pCountList)
/* Look up protName in featHash, and update countHash with features seen for this
 * protein.  Do not count a feature more than once per protein. */
{
struct hash *uniqHash = hashNew(8);
struct hashEl *el;
for (el = hashLookup(featHash, protName); el != NULL; el = hashLookupNext(el))
    {
    struct protFeat *feat = el->val;
    char *featName = feat->feature;
    if (!hashLookup(uniqHash, featName))
        {
	hashAdd(uniqHash, featName, NULL);
	struct featCount *count = hashFindVal(countHash, featName);
	if (count == NULL)
	    {
	    AllocVar(count);
	    hashAddSaveName(countHash, featName, count, &count->feature);
	    slAddHead(pCountList, count);
	    }
	count->count += 1;
	}
    }
}

void featCountStats(struct featCount **pList, char *type, struct hash *descHash, FILE *f)
/* Sort list of feature and print out top ten, and percentages. */
{
int total = 0;
int i;
struct featCount *count;
slSort(pList, featCountCmp);
for (count = *pList; count != NULL; count = count->next)
    total += count->count;
fprintf(f, "%s total unique domains in proteins: %d\n", type, total);
for (count = *pList, i=1; i<=25 && count != NULL; ++i, count = count->next)
    {
    fprintf(f, "   %4.2f%% (%d) %s \"%s\"\n", 100.0 * count->count / total, count->count,
    	count->feature, naForNull(hashFindVal(descHash, count->feature)));
    }
}

void txPaperPfam(char *infoFile, char *pfamFile, char *pfamDescFile, char *canonicalFile, char *outDir)
/* txPaperPfam - Do stuff for pfam calcs.. */
{
/* Load in txInfo. */
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
struct hash *infoHash = hashNew(18);
for (info = infoList; info != NULL; info = info->next)
    hashAdd(infoHash, info->name, info);

/* Load in pfam features. */
struct protFeat *pfam, *pfamList = protFeatLoadAll(pfamFile);
struct hash *pfamHash = hashNew(18);
for (pfam = pfamList; pfam != NULL; pfam = pfam->next)
    hashAdd(pfamHash, pfam->protein, pfam);

/* Load in pfam descriptions into hash */
struct hash *pfamDescHash = hashNew(0);
struct lineFile *lf = lineFileOpen(pfamDescFile, TRUE);
char *row[3];
while (lineFileRowTab(lf, row))
    hashAdd(pfamDescHash, row[1], cloneString(row[2]));
lineFileClose(&lf);

struct hash *canonicalHash = hashWordsInFile(canonicalFile, 16);

struct hash *refCountHash = hashNew(16);
struct hash *nonrefCountHash = hashNew(16);
struct featCount *refCountList = NULL, *nonrefCountList = NULL;

/* Figure out what percentage of genes of various types are
 * covered by pfam feature. */
makeDir(outDir);
FILE *f = mustCreateInDir(outDir, "summary");
int codingTxCount = 0, codingGeneCount = 0;
int refTxCount = 0, nonrefTxCount = 0, refGeneCount = 0, nonrefGeneCount = 0;
int refTxPfam = 0, nonrefTxPfam = 0, refGenePfam = 0, nonrefGenePfam = 0;
for (info = infoList; info != NULL; info = info->next)
    {
    if (sameString(info->category, "coding"))
        {
	boolean gotPfam = (hashLookup(pfamHash, info->name) != NULL);
	boolean isRef = info->isRefSeq;
	boolean isCanonical = (hashLookup(canonicalHash, info->name) != NULL);
	++codingTxCount;
	if (isCanonical)
	    ++codingGeneCount;
	if (isRef)
	    {
	    ++refTxCount;
	    if (isCanonical)
		{
		++refGeneCount;
		if (gotPfam)
		    {
		    ++refGenePfam;
		    countFeature(info->name, pfamHash, refCountHash, &refCountList);
		    }
		}
	    if (gotPfam)
		++refTxPfam;
	    }
	else
	    {
	    ++nonrefTxCount;
	    if (isCanonical)
	        {
		++nonrefGeneCount;
		if (gotPfam)
		    {
		    ++nonrefGenePfam;
		    countFeature(info->name, pfamHash, nonrefCountHash, &nonrefCountList);
		    }
		}
	    if (gotPfam)
	        ++nonrefTxPfam;
	    }
	}
    }
fprintf(f, "Coding transcripts: %d\n", codingTxCount);
fprintf(f, "Coding genes: %d\n", codingGeneCount);
fprintf(f, "RefSeq transcripts: %d (%4.2f%%)\n", refTxCount, 100.0 * refTxCount / codingTxCount);
fprintf(f, "RefSeq genes: %d (%4.2f%%)\n", refGeneCount, 100.0 * refGeneCount / codingGeneCount);
fprintf(f, "non-RefSeq transcripts: %d (%4.2f%%)\n", nonrefTxCount, 100.0 * nonrefTxCount / codingTxCount);
fprintf(f, "non-RefSeq genes: %d (%4.2f%%)\n", nonrefGeneCount, 100.0 * nonrefGeneCount / codingGeneCount);
fprintf(f, "RefSeq transcripts with pfam: %d (%4.2f%%)\n", refTxPfam, 100.0 * refTxPfam / refTxCount);
fprintf(f, "RefSeq genes with pfam: %d (%4.2f%%)\n", refGenePfam, 100.0 * refGenePfam / refGeneCount);
fprintf(f, "non-RefSeq transcripts with pfam: %d (%4.2f%%)\n", nonrefTxPfam, 100.0 * nonrefTxPfam / nonrefTxCount);
fprintf(f, "non-RefSeq genes with pfam: %d (%4.2f%%)\n", nonrefGenePfam, 100.0 * nonrefGenePfam / nonrefGeneCount);

featCountStats(&refCountList, "RefSeq", pfamDescHash, f);
featCountStats(&nonrefCountList, "non-RefSeq", pfamDescHash, f);
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 6)
    usage();
txPaperPfam(argv[1], argv[2], argv[3], argv[4], argv[5]);
return 0;
}