/* kgBestMrna - driver program to call blat to select best mRNA for each protein */ #include #include "common.h" #include "hCommon.h" #include "hdb.h" char proteinName[20], mrnaName[20]; char mrnaNames[500][20]; int mrnaScore[500]; char proteinNameOld[20] = {""}; struct dnaSeq *seq; HGID id; bioSeq *mSeq, qSeq, *pSeq; extern int answer_for_kg; struct dnaSeq *untransList; char line[2000]; char line2[2000]; int mrnaCount; int proteinCount; char mrnaNames[500][20]; char mrnaDates[500][20]; int mrnaScore[500]; int diffIdent[500]; FILE *o3, *o7; char *proteinDataDate; char *genomeRelease; char *genomeReadOnly; char *genomeDBname; char proteinsDB[100]; char spDB[100]; char gbTempDB[100]; /* Explain usage and exit. */ void usage() { errAbort( "usage:\tkgResultBestMrna YYMMDD db ro_db> BestResult.out\n" "\tYYMMDD is the release date of SWISS-PROT data, eg: 031117\n" "\tdb is the genome under construction, eg: kgDB\n" "\tro_db is the actual target genome, e.g.: mm4\n" "kgResultBestMrna - after the cluster run is done, this reads the\n" "\tresults and produces a best.lis listing.\n" "\tExpects to find ./clusterRun and ./out directories in the\n" "\tcurrent working directory."); } int cal_months(char *date) { int year, month, day; int months; sscanf(date, "%d-%d-%d", &year, &month, &day); months = (year - 1970)*12 + month - 1; return(months); } void calScores(char *proteinID, int mrnaCount) { int ixm, maxixm; // index for mRNA int maxScore; int i, ii; char proteinName[20], mrnaName[20]; int diffs[500]; int monthss[500]; int mrnalens[500]; char line[2000]; char mrnaDate[20]; int months; int diff; int mrnalen; char *temp_str; struct dnaSeq *seq; struct sqlConnection *connR; connR = hAllocConn(); ixm = 0; maxScore = 0; strcpy(proteinName, proteinID); maxixm = -1; for (ii=0; iidna); months = cal_months(mrnaDate); diffs[ii] = diffIdent[ii]; mrnalens[ii] = mrnalen; monthss[ii] = months; mrnaScore[ii] = mrnalen + months*2 - diffs[ii]*50; if (mrnaScore[ii] > maxScore) { maxScore = mrnaScore[ii]; maxixm = ii; } } for (i=0; i%s\n", proteinID); fflush(stdout); sprintf(cond_str, "val='%s'", proteinID); accession = sqlGetField(conn3, spDB, "displayId","acc", cond_str); sprintf(cond_str, "acc='%s'", accession); aaSeq = sqlGetField(conn3, spDB, "protein","val", cond_str); if (aaSeq == NULL) { printf("no seq found for %s\n", proteinID); fflush(stdout); exit(1); } if ( 0 == (proteinCount % 2000) ) { snprintf(dirName, (size_t) sizeof(dirName), "prot%05d", proteinCount ); snprintf(outDir, (size_t) sizeof(outDir), "prot%05d", proteinCount ); } sprintf(query2,"select mrnaID from %sTemp.spMrna where spID='%s';",genomeRelease, proteinID); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); imrna = 0; while (row2 != NULL) { mrnaID = row2[0]; strcpy(mrnaNames[imrna], mrnaID); sprintf(cond_str, "name='%s'", mrnaID); mrnaSeq = sqlGetField(conn3,gbTempDB,"refMrna","seq", cond_str); row2 = sqlNextRow(sr2); imrna++; } mrnaCount = imrna; sqlFreeResult(&sr2); if ( ((char *) NULL) == getcwd(cwd, (size_t) sizeof(cwd)) ) errAbort("ERROR: Can not get current working directory"); snprintf(outName, (size_t) sizeof(outName),"%s/out/%s/b%05d.out", cwd, outDir, proteinCount); inf2 = fopen(outName, "r"); if ((FILE *) NULL == inf2) errAbort("ERROR: Can not open result file: %s, errno: %d", outName, errno); for (i=0; i') { newMrna = 1; line2[strlen(line2)-1] = '\0'; chp = line2 + 1; ii=-1; for (i=0; i