/* hgKnownGeneList - Generate Known Genes List HTML pages to be indexed by Google. */
#include "common.h"
#include "hdb.h"
#include "dbDb.h"
#include "hCommon.h"
#include "web.h"
#define LINKSPERPAGE 30
#define MAXPAGES 3000
#define MAXSUBDIR 50
#define MAXTOP 200
#define TESTSIZE 2600
/* global variables */
char *genome, *genomeDesc;
char command[255];
char *database;
char startSymbol[MAXPAGES][20];
char endSymbol[MAXPAGES][20];
char pageStartSymbol[MAXSUBDIR][20];
char pageEndSymbol[MAXSUBDIR][20];
char topStartSymbol[MAXTOP][20];
char topEndSymbol[MAXTOP][20];
int currentPage;
char emptyString[10] = {" "};
void usage()
/* Report the command-line synopsis through errAbort. */
{
/* The whole message is kept in one static buffer and handed to errAbort unchanged. */
static char synopsis[] =
    "hgKnownGeneList - Generate Known Genes List HTML pages to be indexed by Google\n"
    "usage:\n"
    " hgKnownGeneList db\n"
    " db is the genome database\n"
    "example:\n"
    " hgKnownGeneList hg17\n";

errAbort(synopsis);
}
void printHtmlHead(FILE *outf)
/* Write the opening lines of a generated page to outf: the page title text and the
 * output of webIncludeResourcePrintToFile for HGStyle.css (presumably a style sheet
 * reference -- confirm against that helper). */
{
/* NOTE(review): the literals below look as if their HTML tags were stripped before this
 * file was stored; the empty and newline-only strings are kept as placeholders so the
 * markup can be restored from the upstream source.  The second literal used to contain
 * an unescaped line break, which is not valid C; it is spelled "\n" here so the same
 * byte is written. */
fprintf(outf, "");
fprintf(outf, "\n");
fprintf(outf, "\n");
fprintf(outf, "\n");
fprintf(outf, "UCSC Known Genes Description and Page Index\n");
webIncludeResourcePrintToFile(outf,"HGStyle.css");
fprintf(outf, "\n");
}
void printHtmlEnd(FILE *outf)
/* Terminate a generated page: append a single newline to outf and flush the stream. */
{
fputc('\n', outf);
fflush(outf);
}
int main(int argc, char *argv[])
{
struct sqlConnection *conn, *conn2, *conn3;
struct sqlConnection *connCentral = hConnectCentral();
char query[256], query2[256], query3[256];
struct sqlResult *sr, *sr2;
char **row, **row2;
char buf[128];
char *answer;
char *kgID, *chrom, *txStart, *txEnd;
char *mRNA;
int i;
int geneCnt = 0;
int pageNum = 0;
int topLevel = 1;
char *geneSymbol, *proteinID, *spID, *desc;
FILE *outf, *outf2;
char fileName[255];
database = strdup("hg17");
boolean newPage;
int totalKgId, totalKgCnt;
int totalKgPage;
int kgIdCnt = 0;
if (argc != 2) usage();
database = argv[1];
sprintf(query, "select genome from dbDb where name = '%s'", database);
answer = sqlQuickQuery(connCentral, query, buf, sizeof(buf));
if (answer == NULL)
{
fprintf(stderr,"'%s' is not a valid genome database name.", database);
exit(1);
}
else
{
genome = strdup(answer);
}
if (!hTableExists(database, "knownGene"))
{
fprintf(stderr,"Database %s currently does not have UCSC Known Genes.", database);
exit(1);
}
sprintf(query, "select description from dbDb where name = '%s'", database);
genomeDesc = strdup(sqlQuickQuery(connCentral, query, buf, sizeof(buf)));
hDisconnectCentral(&connCentral);
/* create first top level subdirectory */
safef(command, sizeof(command), "mkdir -p knownGeneList/%s/%d", database, topLevel);
mustSystem(command);
conn = hAllocConn(database);
conn2= hAllocConn(database);
conn3= hAllocConn(database);
newPage = TRUE;
currentPage = 0;
/* put this in to avoid compiler complaining */
outf = NULL;
geneSymbol = NULL;
char *protAcc = NULL;
/* figure out how many pages in total */
safef(query2, sizeof(query2), "select count(k.name) from %s.knownGene k, %s.kgXref x where k.name=x.kgId and geneSymbol != ''", database, database);
sr2 = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
totalKgCnt = atoi(row2[0]);
sqlFreeResult(&sr2);
/* figure out how many KG IDs in total */
safef(query2, sizeof(query2), "select count(*) from %s.kgXref where geneSymbol !=''", database);
sr2 = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
totalKgId = atoi(row2[0]);
sqlFreeResult(&sr2);
totalKgPage = totalKgId/LINKSPERPAGE + 1;
safef(query2, sizeof(query2),
"select kgID, geneSymbol, description from %s.kgXref where geneSymbol!= '' order by geneSymbol",
database);
/* for debugging */
/* "select kgID, geneSymbol, description from %s.kgXref order by geneSymbol limit %d",
database, TESTSIZE);*/
sr2 = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
/* for debugging */
/* while (kgIdCnt < TESTSIZE) */
while (kgIdCnt < totalKgId)
{
kgIdCnt++;
kgID = row2[0];
geneSymbol = strdup(row2[1]);
desc = row2[2];
safef(query, sizeof(query),
"select chrom,txSTart,txEnd,proteinID from %s.knownGene where name='%s'", database, kgID);
sr = sqlMustGetResult(conn, query);
row = sqlNextRow(sr);
if (row != NULL)
{
geneCnt++;
chrom = row[0];
txStart = row[1];
txEnd = row[2];
proteinID = row[3];
if (newPage)
{
/* create a KG links page */
pageNum++;
currentPage++;
/* use mkdir -p to make sure the subdirectory exists */
safef(command, sizeof(command), "mkdir -p knownGeneList/%s/%d", database, topLevel);
mustSystem(command);
safef(fileName, sizeof(fileName),
"knownGeneList/%s/%d/kgList%d.html", database, topLevel, pageNum);
outf = fopen(fileName, "w");
printHtmlHead(outf);
fprintf(outf,"UCSC %s Known Genes List (page %d of %d)
\n",
genome, pageNum, totalKgPage);
fprintf(outf, "\n");
fprintf(outf,
"
Gene Symbol | Known Gene ID | mRNA | UniProt | RefSeq Protein | Description | \n");
strcpy(startSymbol[pageNum], geneSymbol);
strcpy(pageStartSymbol[currentPage], geneSymbol);
newPage = FALSE;
}
fprintf(outf,"
---|
");
fprintf(outf,"%s | ", geneSymbol);
/*fprintf(outf,"%d:%s | ", geneCnt, geneSymbol);*/
fprintf(outf,"");
fprintf(outf,"", chrom, txStart, txEnd);
fprintf(outf,"%s", kgID);
fprintf(outf,"");
fprintf(outf," | \n");
safef(query3,sizeof(query3),"select spID from %s.kgXref where kgID = '%s'", database, kgID);
spID = cloneString(sqlQuickQuery(conn3, query3, buf, sizeof(buf)));
if (spID == NULL)
{
spID = emptyString;
}
else
{
if (sameWord(spID,"")) spID = emptyString;
}
safef(query3,sizeof(query3),"select mRNA from %s.kgXref where kgID = '%s'", database, kgID);
mRNA = cloneString(sqlQuickQuery(conn3, query3, buf, sizeof(buf)));
if (mRNA == NULL)
{
mRNA = emptyString;
}
else
{
if (sameWord(mRNA,"")) mRNA = emptyString;
}
safef(query3,sizeof(query3),"select protAcc from %s.kgXref where kgID = '%s'", database, kgID);
protAcc = sqlQuickQuery(conn3, query3, buf, sizeof(buf));
if (protAcc == NULL)
{
protAcc = emptyString;
}
else
{
if (sameWord(protAcc,"")) protAcc = emptyString;
}
fprintf(outf,"%s | ", mRNA);
fprintf(outf,"%s | ", spID);
fprintf(outf,"%s | ", protAcc);
fprintf(outf,"%s | ", desc );
fprintf(outf,"
\n");
if ((geneCnt % LINKSPERPAGE) == 0)
{
/* flush out and close the page if a page is filled, and start a new page */
fprintf(outf,"
");
strcpy(endSymbol[pageNum], geneSymbol);
strcpy(pageEndSymbol[currentPage], endSymbol[pageNum]);
fprintf(outf, "
");
fprintf(outf, "",
database, topLevel,topLevel);
fprintf(outf, "Up");
fprintf(outf,"
\n");
printHtmlEnd(outf);
newPage = TRUE;
fclose(outf);
outf = NULL;
if ((pageNum % LINKSPERPAGE) == 0 )
{
printf("Processing topLevel %d ...\n", topLevel);fflush(stdout);
safef(fileName, sizeof(fileName),
"knownGeneList/%s/%d/kgIndex%d.html", database, topLevel, topLevel);
outf2 = fopen(fileName, "w");
printHtmlHead(outf2);
//fprintf(outf2,"UCSC %s Known Genes List
\n", genome);
fprintf(outf2,"UCSC %s Known Genes List (Group %d)
\n", genome, topLevel);
for (i=1; i<= currentPage; i++)
{
fprintf(outf2, "Page %d: ", (topLevel-1)*LINKSPERPAGE+i);
fprintf(outf2,
"",
database, topLevel, (topLevel-1)*LINKSPERPAGE+i);
fprintf(outf2, "%s to %s", pageStartSymbol[i], pageEndSymbol[i]);
fprintf(outf2,"
\n");
}
fprintf(outf2, "
");
fprintf(outf2, "",database);
fprintf(outf2, "Up");
fprintf(outf2,"
\n");
printHtmlEnd(outf2);
fclose(outf2);
strcpy(topStartSymbol[topLevel], pageStartSymbol[1]);
strcpy( topEndSymbol[topLevel], pageEndSymbol[currentPage]);
currentPage = 0;
topLevel++;
}
}
row = sqlNextRow(sr);
}
sqlFreeResult(&sr);
row2 = sqlNextRow(sr2);
}
sqlFreeResult(&sr2);
/* flush out and close the last list page */
if (outf != NULL)
{
fprintf(outf,"");
strcpy(endSymbol[pageNum], geneSymbol);
strcpy(pageEndSymbol[currentPage], endSymbol[pageNum]);
fprintf(outf, "
");
fprintf(outf, "",
database, topLevel,topLevel);
fprintf(outf, "Up");
fprintf(outf,"
\n");
printHtmlEnd(outf);
fclose(outf);
}
/* generate the last index page */
safef(command, sizeof(command), "mkdir -p knownGeneList/%s/%d", database, topLevel);
mustSystem(command);
safef(fileName, sizeof(fileName),
"knownGeneList/%s/%d/kgIndex%d.html", database, topLevel, topLevel);
outf2 = fopen(fileName, "w");
printHtmlHead(outf2);
fprintf(outf2,"UCSC %s Known Genes List (Group %d)
\n", genome, topLevel);
for (i=1; i<= currentPage; i++)
{
fprintf(outf2, "Page %d: ", (topLevel-1)*LINKSPERPAGE+i);
fprintf(outf2, "",
database, topLevel, (topLevel-1)*LINKSPERPAGE+i);
fprintf(outf2, "%s to %s", pageStartSymbol[i], pageEndSymbol[i]);
fprintf(outf2,"
\n");
fflush(outf2);
}
fprintf(outf2, "
");
fprintf(outf2, "",database);
fprintf(outf2, "Up");
fprintf(outf2,"
\n");
strcpy(topStartSymbol[topLevel], pageStartSymbol[1]);
strcpy( topEndSymbol[topLevel], pageEndSymbol[currentPage]);
fclose(outf2);
currentPage = 0;
/* generate the top HTML page */
safef(fileName, sizeof(fileName), "knownGeneList/%s/top.html", database);
outf2 = fopen(fileName, "w");
printHtmlHead(outf2);
fprintf(outf2,"UCSC %s Known Genes List
\n", genome);
for (i=1; i<= topLevel; i++)
{
fprintf(outf2, "Group %d: ", i);
fprintf(outf2, "", database, i, i);
fprintf(outf2, " %s to %s", topStartSymbol[i], topEndSymbol[i]);
fprintf(outf2,"
\n");
fflush(outf2);
}
fprintf(outf2, "
");
fprintf(outf2, "");
fprintf(outf2, "Up");
fprintf(outf2,"
\n");
printHtmlEnd(outf2);
printHtmlEnd(outf2);
fclose(outf2);
return(0);
}