/* pbCalResStd- Calculate the avg frequency and standard deviation of each AA residue for the proteins in a specific genome*/ #define MAXN 1000000 #define MAXRES 23 #include "common.h" #include "hCommon.h" #include "hdb.h" #include "spDb.h" #include "math.h" void usage() /* Explain usage and exit. */ { errAbort( "pbCalResStd calculates the avg frequency and standard deviation of every AA residues of the proteins in a specific genome\n" "usage:\n" " pbCalResStd spDb taxNum gDb\n" " spDb is the SWISS-PROT database name\n" " taxNumis the taxnomy number of the genome\n" " gDb is the genome database name\n" "Example: pbCalResStd sp050415 9606 hg17\n" ); } double measure[MAXN]; double freq[MAXN][MAXRES]; double avg[MAXRES]; double sumJ[MAXRES]; double sigma[MAXRES]; double sumJ[MAXRES]; int recordCnt; double recordCntDouble; double lenDouble; double sum; int calDist(double *measure, int nInput, int nDist, double xMin, double xDelta, char *oFileName) /* calculate histogram distribution of a double array of nInput elements */ { int distCnt[1000]; double xDist[1000]; FILE *o3; int i,j; int highestCnt, totalCnt; assert(nDist < ArraySize(distCnt)); o3 = mustOpen(oFileName, "w"); for (j=0; j<=nDist; j++) { distCnt[j] = 0; xDist[j] = xMin + xDelta * (double)j; } for (i=0; i xDist[j-1]) && (measure[i] <= xDist[j])) { distCnt[j]++; } } if (measure[i] > xDist[nDist-1]) { distCnt[nDist]++; } } highestCnt = 0; totalCnt = 0; for (j=0; j<=nDist; j++) { if (distCnt[j] > highestCnt) highestCnt = distCnt[j]; totalCnt = totalCnt + distCnt[j]; } if (totalCnt != nInput) errAbort("nInput %d is not equal totalCnt %d, aborting ...\n", nInput, totalCnt); /* do not print out count of the last inteval, which is everything beyond xMax */ for (j=0; j= MAXN) errAbort("Too many proteins - please set MAXN to be more than %d\n", MAXN); skip: row2 = sqlNextRow(sr2); } recordCnt = icnt; recordCntDouble = (double)recordCnt; for (j=0; j<20; j++) { carefulClose(&(fh[j])); } sqlFreeResult(&sr2); hFreeConn(&conn); hFreeConn(&conn2); for (j=0; j %c.srt", aaAlphabet[j], aaAlphabet[j]); mustSystem(temp_str); /* figure out how many unique entries */ safef(temp_str, sizeof(temp_str), "wc %c.srt > %c.tmp", aaAlphabet[j], aaAlphabet[j]); mustSystem(temp_str); safef(temp_str, sizeof(temp_str), "%c.tmp", aaAlphabet[j]); o3 = mustOpen(temp_str, "r"); mustGetLine(o3, temp_str, 1000); chp = temp_str; while (*chp == ' ') chp++; while (*chp != ' ') chp++; *chp = '\0'; sscanf(temp_str, "%d", &sortedCnt); safef(temp_str, sizeof(temp_str), "rm %c.tmp", aaAlphabet[j]); mustSystem(temp_str); /* cal hi and low cutoff threshold */ ilow = (int)((float)sortedCnt * 0.025); ihi = (int)((float)sortedCnt * 0.975); safef(temp_str, sizeof(temp_str), "%c.srt", aaAlphabet[j]); o2 = mustOpen(temp_str, "r"); i=0; for (i=0; i