/* pbCalResStdGlobal- Calculate the avg frequency and standard deviation of each AA residue for the proteins for all proteins */ #include "common.h" #include "hCommon.h" #include "hdb.h" #include "spDb.h" #include "math.h" #define MAXN 20000000 #define MAXRES 23 void usage() /* Explain usage and exit. */ { errAbort( "pbCalResStd calculates the avg frequency and standard deviation of every AA residues of the proteins in a protein database\n" "usage:\n" " pbCalResStdGlobal YYMMDD\n" " YYMMDD is the date in sp and proteins database names\n" "Example: pbCalResStdGlobal 040515\n" ); } double measure[MAXN]; double (*freq)[MAXRES]; // Dynamically allocated to be same as freq[MAXN][MAXRES]; double avg[MAXRES]; double sumJ[MAXRES]; double sigma[MAXRES]; double sumJ[MAXRES]; int recordCnt; double recordCntDouble; double lenDouble; double sum; int calDist(double *measure, int nInput, int nDist, double xMin, double xDelta, char *oFileName) /* calculate histogram distribution of a double array of nInput elements */ { int distCnt[1000]; double xDist[1000]; FILE *o3; int i,j; int highestCnt, totalCnt; assert(nDist < ArraySize(distCnt)); o3 = mustOpen(oFileName, "w"); for (j=0; j<=nDist; j++) { distCnt[j] = 0; xDist[j] = xMin + xDelta * (double)j; } for (i=0; i xDist[j-1]) && (measure[i] <= xDist[j])) { distCnt[j]++; } } if (measure[i] > xDist[nDist-1]) { distCnt[nDist]++; } } highestCnt = 0; totalCnt = 0; for (j=0; j<=nDist; j++) { if (distCnt[j] > highestCnt) highestCnt = distCnt[j]; totalCnt = totalCnt + distCnt[j]; } if (totalCnt != nInput) errAbort("nInput %d is not equal totalCnt %d, aborting ...\n", nInput, totalCnt); /* do not print out count of the last inteval, which is everything beyond xMax */ for (j=0; j= MAXN) errAbort("too many proteins, please increase value of MAXN beyond %d", MAXN); accession = row2[0]; aaSeq = row2[1]; len = strlen(aaSeq); if (len < 100) goto skip; lenDouble = (double)len; for (j=0; j= MAXN) errAbort("Too many proteins - please set MAXN to be more than %d\n", MAXN); skip: row2 = sqlNextRow(sr2); } recordCnt = icnt; recordCntDouble = (double)recordCnt; for (j=0; j<20; j++) { carefulClose(&(fh[j])); } sqlFreeResult(&sr2); hFreeConn(&conn2); for (j=0; j %c.srt", aaAlphabet[j], aaAlphabet[j]); mustSystem(temp_str); /* figure out how many unique entries */ safef(temp_str, sizeof(temp_str), "wc %c.srt > %c.tmp", aaAlphabet[j], aaAlphabet[j]); mustSystem(temp_str); safef(temp_str, sizeof(temp_str), "%c.tmp", aaAlphabet[j]); o3 = mustOpen(temp_str, "r"); mustGetLine(o3, temp_str, 1000); chp = temp_str; while (*chp == ' ') chp++; while (*chp != ' ') chp++; *chp = '\0'; sscanf(temp_str, "%d", &sortedCnt); safef(temp_str, sizeof(temp_str), "rm %c.tmp", aaAlphabet[j]); mustSystem(temp_str); /* cal hi and low cutoff threshold */ ilow = (int)((float)sortedCnt * 0.025); ihi = (int)((float)sortedCnt * 0.975); safef(temp_str, sizeof(temp_str), "%c.srt", aaAlphabet[j]); o2 = mustOpen(temp_str, "r"); i=0; for (i=0; i