#include #include #include #include"cl.h" #include"cmp.h" typedef struct { int len; int *chr; int *pos; double *score; } SERIES; typedef struct { int start; int end; double score; } SEGMENT; typedef struct { int idx; double score; } IDX; SERIES *new_series( int len, SERIES *series ) { if ( series == NULL ) { series = (SERIES*)calloc(1,sizeof(SERIES)); series->len = len; series->chr = (int*)calloc(len,sizeof(int)); series->pos = (int*)calloc(len,sizeof(int)); series->score = (double*)calloc(len,sizeof(double)); } else { series->len = len; series->chr = (int*)realloc(series->chr, len*sizeof(int)); series->pos = (int*)realloc(series->pos, len*sizeof(int)); series->score = (double*)realloc(series->score, len*sizeof(double)); } return(series); } SERIES *read_series( char *filename ) { FILE *fp = fopen(filename,"r"); if ( fp != NULL ) { int maxlen = 1000000; int k=0; SERIES *series = new_series( maxlen, NULL ); printf( "reading from %s\n", filename ); while( fscanf(fp, "%d %d %lf", &(series->chr[k]), &(series->pos[k]), &(series->score[k++])) == 3) { if ( k > maxlen-2 ) { maxlen *= 2; series = new_series( maxlen, series ); } } printf( "%d data points\n", k ); return(series); } printf( "ERROR could not open %s\n", filename ); return(NULL); } SEGMENT *find_segments( SERIES *series, double offset, double threshold, int *Nseg ) { int maxseg=100000; int k=0; SEGMENT *segments =(SEGMENT*)calloc(maxseg,sizeof(SEGMENT)); while ( max_segment( series, offset, &(segments[k]))) { if ( k > maxseg - 2 ) { maxseg *=2; segments = (SEGMENT*)realloc( segments, maxseg*sizeof(SEGMENT)); } k++; if ( segments[k-1].score < threshold ) break; } return(segments); } int max_segment ( SERIES *series, double offset, SEGMENT *seg ) { int max_start =0; int max_end = 0; double max_score = 0.0; int start = 0; int end = 0; int chr = -1; double score = 0.0; int k; for(k=0;klen;k++) { if ( series->chr[k] != chr ) { score = 0.0; start = end = k; } score += series->score[k]-offset; chr = series->chr[k]; if ( score < 0.0 ) { score = 0.0; start = end = k; } if ( score > max_score ) { max_score = score; max_start = start; max_end = k; } } if ( max_score > 0 ) { for( k=max_start;k<=max_end;k++) { series->score[k] = -1.0e10; } seg->start = max_start; seg->end = max_end; seg->score = max_score; printf( "%lf %d %d %d %d %d %d %d\n", max_score, max_start, max_end, max_end-max_start+1, series->chr[max_start], series->pos[max_start], series->pos[max_end], series->pos[max_end]- series->pos[max_start]+1 ); return(1); } return(0); } int idx_cmp( const void *A, const void *B ) { IDX *a = (IDX*)A; IDX *b = (IDX*)B; return( a->idx - b->idx ); } double get_threshold( SERIES *series, double offset, int nperm ) { IDX *buf = (IDX*)calloc(series->len, sizeof(IDX)); double *tmp = (double*)calloc(series->len, sizeof(double)); double *score; double *x = (double*)calloc( nperm, sizeof(double)); int n, i, k=0; SEGMENT *segments; double threshold; segments = (SEGMENT*)calloc(nperm,sizeof(SEGMENT)); score = series->score; series->score = tmp; for( i=0;ilen;n++) { buf[n].idx = rand(); buf[n].score = score[n]; } qsort( buf, series->len, sizeof(IDX), idx_cmp ); for(n=0;nlen;n++) { tmp[n] = buf[n].score; } max_segment( series, offset, &(segments[i])); x[i] = segments[i].score; printf( "perm %d %lf\n", i, x[i]); } qsort(x,nperm,sizeof(double), lfcmp); threshold = x[nperm/2]; series->score = score; free(buf); free(tmp); free(segments); free(x); return(threshold); } double get_offset( SERIES *series, double percentile ) { double median=0; int Nsample=10000; int n; double *data = (double*)calloc(Nsample, sizeof(double)); for(n=0;nscore[j]; } qsort( data, Nsample, sizeof(double), lfcmp ); return( data[(int)(percentile*Nsample)]); } int main( int argc, char **argv) { char filename[256]; double offset; double percentile=0.8; double threshold; int Nseg=0; int nperm=4; if ( argc >=1 ) { SERIES *series = read_series( argv[1] ); if ( series != NULL ) { offset = get_offset( series, percentile ); offset = 0.2l; //use 0.2mad is the best printf("offset %lf\n", offset ); threshold = get_threshold( series, offset, nperm ); printf("threshold %lf\n", threshold ); SEGMENT *segments = find_segments( series, offset, threshold, &Nseg ); } } }