#include #include #include #include #include #include #include "main.h" #include "seq.h" #include "genran.h" #include "const.h" #include "maqmap.h" #ifndef MAX_READLEN #define MAX_READLEN 128 #endif #define MAX_QUAL 63 typedef struct { int l_min, l_max, q_max; bit64_t n; bit64_t comp_count[MAX_READLEN][5]; bit64_t qual_count[MAX_READLEN][MAX_QUAL+1]; double fqual_count[MAX_READLEN][MAX_QUAL+1]; bit64_t markov_count[MAX_READLEN][MAX_QUAL+1][MAX_QUAL+1]; double fmarkov_count[MAX_READLEN][MAX_QUAL+1][MAX_QUAL+1]; } fqc_t; fqc_t *fqc_collect(FILE *fp_fq) { seq_t seq, qual; char name[256]; int i, l, q1, q2; fqc_t *fqc = (fqc_t*)calloc(1, sizeof(fqc_t)); fqc->l_min = 1<<30; INIT_SEQ(seq); INIT_SEQ(qual); seq_set_block_size(256); while ((l = seq_read_fastq(fp_fq, &seq, &qual, name)) >= 0) { bit8_t t, tt, prev = 0; if (l < fqc->l_min) fqc->l_min = l; if (l > fqc->l_max) fqc->l_max = l; for (i = 0; i != l; ++i) { t = nst_nt4_table[seq.s[i]]; if (t > 4) t = 4; tt = (qual.s[i] > MAX_QUAL)? MAX_QUAL : qual.s[i]; if (tt > fqc->q_max) fqc->q_max = tt; ++fqc->comp_count[i][t]; ++fqc->qual_count[i][tt]; if (i > 0) ++fqc->markov_count[i][prev][tt]; prev = tt; } ++fqc->n; } free(seq.s); free(qual.s); assert(fqc->l_min == fqc->l_max); // this should not happen for (i = 0; i < fqc->l_max; ++i) { // normalize to 1 bit64_t sum = 0; for (q1 = 0; q1 <= fqc->q_max; ++q1) sum += fqc->qual_count[i][q1]; for (q1 = 0; q1 <= fqc->q_max; ++q1) fqc->fqual_count[i][q1] = (double)fqc->qual_count[i][q1] / sum; for (q1 = 0; q1 <= fqc->q_max; ++q1) { sum = 0; for (q2 = 0; q2 <= fqc->q_max; ++q2) sum += fqc->markov_count[i][q1][q2]; for (q2 = 0; q2 <= fqc->q_max; ++q2) fqc->fmarkov_count[i][q1][q2] = (double)fqc->markov_count[i][q1][q2] / sum; } } return fqc; } int maq_simutrain(int argc, char *argv[]) { fqc_t *fqc; FILE *fp; gzFile *fpout; if (argc < 3) { fprintf(stderr, "Usage: maq simutrain \n"); return 1; } fp = (strcmp(argv[2], "-") == 0)? stdin : fopen(argv[2], "r"); fpout = gzopen(argv[1], "w"); assert(fp && fpout); fqc = fqc_collect(fp); gzwrite(fpout, fqc, sizeof(fqc_t)); gzclose(fpout); fclose(fp); free(fqc); return 0; } static void gen_qual(fqc_t *fqc, char str[], int size) { double ran = ran_uniform(); double *p = fqc->fqual_count[0]; double sum; int i, k, prev; for (k = 0, sum = 0.0; k <= fqc->q_max; ++k) { sum += p[k]; // precalculate this sum will be more effficient, but this is not critical... if (sum >= ran) break; } str[0] = 33 + k; prev = k; for (i = 1; i != size; ++i) { double ran = ran_uniform(); p = fqc->fmarkov_count[i][prev]; for (k = 0, sum = 0.0; k <= fqc->q_max; ++k) { sum += p[k]; if (sum >= ran) break; } str[i] = 33 + k; prev = k; } str[size] = 0; // end of a string } static double MUT_RATE = 0.001; static double INDEL_FRAC = 0.1; void maq_mut_diref(const seq_t *seq, int is_hap, seq_t *hap1, seq_t *hap2) { int i; seq_t *ret[2]; ret[0] = hap1; ret[1] = hap2; ret[0]->l = seq->l; ret[1]->l = seq->l; ret[0]->m = seq->m; ret[1]->m = seq->m; ret[0]->s = (bit8_t*)calloc(seq->m, 1); ret[1]->s = (bit8_t*)calloc(seq->m, 1); for (i = 0; i != seq->l; ++i) { int c; c = ret[0]->s[i] = ret[1]->s[i] = 0x50 | nst_nt4_table[(int)seq->s[i]]; if ((c&0xf) < 4 && ran_uniform() < MUT_RATE) { // mutation if (ran_uniform() >= INDEL_FRAC) { // substitution double r = ran_uniform(); c = (c + (int)(r * 3.0 + 1)) & 3; if (is_hap || ran_uniform() < 0.333333) { // hom ret[0]->s[i] = ret[1]->s[i] = 0x50|c; } else { // het ret[ran_uniform()<0.5?0:1]->s[i] = 0x50|c; } } else { // indel if (ran_uniform() < 0.5) { // deletion if (is_hap || ran_uniform() < 0.333333) { // hom-del ret[0]->s[i] = ret[1]->s[i] = 0x55; } else { // het-del ret[ran_uniform()<0.5?0:1]->s[i] = 0x55; } } else { // insersion int ins = (int)(ran_uniform() * 4.0); if (is_hap || ran_uniform() < 0.333333) { // hom-ins ret[0]->s[i] = ret[1]->s[i] = ins << 4 | (c&0xf); } else { // het-ins ret[ran_uniform()<0.5?0:1]->s[i] = ins << 4 | (c&0xf); } } } } } } void maq_print_mutref(const char *name, const seq_t *seq, seq_t *hap1, seq_t *hap2) { int i; for (i = 0; i != seq->l; ++i) { int c[3]; c[0] = nst_nt4_table[(int)seq->s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; if (c[0] >= 4) continue; if (c[1] != (c[0]|0x50) || c[2] != (c[0]|0x50)) { printf("%s\t%d\t", name, i+1); if (c[1] == c[2]) { // hom if ((c[1]&0xf) < 5 && c[1]>>4 == 5) { // substitution printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]); } else if (c[1]>>4 < 5) { // ins printf("-\t%c\t-\n", "ACGTN"[c[1]>>4&0xf]); } else if ((c[1]&0xf) == 5) { // del printf("%c\t-\t-\n", "ACGTN"[c[0]]); } else assert(0); } else { // het if ((c[1]&0xf) < 5 && c[1]>>4 == 5 && (c[2]&0xf) < 5 && c[2]>>4 == 5) { // substitution printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0xf)|1<<(c[2]&0xf)]); } else if (c[1]>>4 < 5) { // ins1 printf("-\t%c\t+\n", "ACGTN"[c[1]>>4&0xf]); } else if (c[2]>>4 < 5) { // ins2 printf("-\t%c\t+\n", "ACGTN"[c[2]>>4&0xf]); } else if ((c[1]&0xf) == 5) { printf("%c\t-\t+\n", "ACGTN"[c[0]]); } else if ((c[2]&0xf) == 5) { printf("%c\t-\t+\n", "ACGTN"[c[0]]); } else assert(0); } } } } void maq_simulate_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, gzFile fp_par, int is_hap, bit64_t N, int dist, int std_dev, int size_l, int size_r) { fqc_t *fqc; seq_t seq, rseq[2]; bit64_t tot_len, ii; int i, l, n_ref, k; char name[256], qstr[256], str[1024]; double Q_table[MAX_QUAL+1]; int size[2]; bit8_t tmp_seq[2][256], *target; fqc = (fqc_t*)malloc(sizeof(fqc_t)); l = gzread(fp_par, fqc, sizeof(fqc_t)); for (i = 0; i <= MAX_QUAL; ++i) Q_table[i] = exp(-log(10.0) * i / 10.0); INIT_SEQ(seq); ran_seed(); seq_set_block_size(0x1000000); assert(size_l <= fqc->l_max && size_r <= fqc->l_max); if (size_l == 0) size_l = fqc->l_max; if (size_r == 0) size_r = fqc->l_max; size[0] = size_l; size[1] = size_r; tot_len = n_ref = 0; while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { tot_len += l; ++n_ref; } fprintf(stderr, "-- %d sequences, total length: %llu\n", n_ref, tot_len); rewind(fp_fa); while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { bit64_t n_pairs = (bit64_t)((long double)l / tot_len * N + 0.5); if (l < dist + 2 * std_dev) { fprintf(stderr, "[maq_simulate_core] Each reference sequence should be longer than dist+2*stddev. Abort!\n"); exit(1); } maq_mut_diref(&seq, is_hap, rseq, rseq+1); maq_print_mutref(name, &seq, rseq, rseq+1); for (ii = 0; ii != n_pairs; ++ii) { double ran; int d, pos, s[2], begin, end, is_flip = 0; FILE *fpo[2]; do { ran = ran_normal(); ran = ran * std_dev + dist; if (ran < fqc->l_max) ran = fqc->l_max; d = (int)(ran + 0.5); pos = (int)((l - d + 1) * ran_uniform()); } while (pos < 0 || pos >= seq.l); // flip or not if (ran_uniform() < 0.5) { fpo[0] = fpout1; fpo[1] = fpout2; s[0] = size[0]; s[1] = size[1]; } else { fpo[1] = fpout1; fpo[0] = fpout2; s[1] = size[0]; s[0] = size[1]; is_flip = 1; } // generate the read sequences target = rseq[ran_uniform()<0.5?0:1].s; // haploid from which the reads are generated for (i = pos, k = 0, begin = 0; i < seq.l; ++i) { int c = target[i]; if ((c&0xf) == 5) continue; // deletion if (begin == 0) { begin = i; if ((c&0xf0) < 4) continue; // skip ins at the first base, FIXME! } if ((c>>4) < 4) { tmp_seq[0][k++] = c>>4; if (k == s[0]) break; } tmp_seq[0][k++] = c&0xf; if (k == s[0]) break; } for (i = pos + d - 1, k = 0, end = 0; i >= 0; --i) { int c = target[i]; if ((c&0xf) == 5) continue; // deletion if (end == 0) end = i; tmp_seq[1][k++] = c&0xf; if (k == s[1]) break; if ((c>>4) < 4) { tmp_seq[1][k++] = c>>4; if (k == s[1]) break; } } // start to print out if (!is_flip) sprintf(str, "%s_%u_%u_%llx", name, begin+1, end+1, ii); else sprintf(str, "%s_%u_%u_%llx", name, begin+1, end+1, ii); // print forward read fprintf(fpo[0], "@%s/%d\n", str, is_flip+1); gen_qual(fqc, qstr, s[0]); // generate qual for (i = 0; i < s[0]; ++i) { int c = tmp_seq[0][i]; if (c > 4) c = 4; if (c < 4) { if (ran_uniform() < Q_table[qstr[i]-33]) c = (c + (int)(ran_uniform() * 3.0 + 1)) & 3; } fputc("ACGTN"[c], fpo[0]); } fprintf(fpo[0], "\n+\n%s\n", qstr); // print reverse read fprintf(fpo[1], "@%s/%d\n", str, 2-is_flip); gen_qual(fqc, qstr, s[1]); for (i = 0; i < s[1]; ++i) { int c = tmp_seq[1][i]; if (c > 4) c = 4; if (c < 4) { c = 3 - c; // complement if (ran_uniform() < Q_table[qstr[i]-33]) c = (c + (int)(ran_uniform() * 3.0 + 1)) & 3; } fputc("ACGTN"[c], fpo[1]); } fprintf(fpo[1], "\n+\n%s\n", qstr); } } free(seq.s); free(fqc); } static int simu_usage() { fprintf(stderr, "\n"); fprintf(stderr, "Usage: maq simulate [options] \n\n"); fprintf(stderr, "Options: -d INT outer distance between the two ends [170]\n"); fprintf(stderr, " -s INT standard deviation [20]\n"); fprintf(stderr, " -N INT number of read pairs [1000000]\n"); fprintf(stderr, " -1 INT length of the first read\n"); fprintf(stderr, " -2 INT length of the second read\n"); fprintf(stderr, " -r FLOAT rate of mutations [0.001]\n"); fprintf(stderr, " -R FLOAT fraction of 1bp indels [0.1]\n"); fprintf(stderr, " -h haploid mode\n"); fprintf(stderr, "\n"); return 1; } int maq_simulate(int argc, char *argv[]) { bit64_t N; int dist, std_dev, c, size_l, size_r, is_hap = 0; FILE *fpout1, *fpout2, *fp_fa; gzFile fp_par; N = 1000000; dist = 170; std_dev = 20; size_l = size_r = 0; while ((c = getopt(argc, argv, "d:s:N:1:2:r:R:h")) >= 0) { switch (c) { case 'd': dist = atoi(optarg); break; case 's': std_dev = atoi(optarg); break; case 'N': N = atoi(optarg); break; case '1': size_l = atoi(optarg); break; case '2': size_r = atoi(optarg); break; case 'r': MUT_RATE = atof(optarg); break; case 'R': INDEL_FRAC = atof(optarg); break; case 'h': is_hap = 1; break; } } if (argc - optind < 4) return simu_usage(); fpout1 = fopen(argv[optind+0], "w"); fpout2 = fopen(argv[optind+1], "w"); fp_fa = (strcmp(argv[optind+2], "-") == 0)? stdin : fopen(argv[optind+2], "r"); fp_par = gzopen(argv[optind+3], "r"); if (fpout1 == 0) fprintf(stderr, "[maq_simulate] cannot write to file '%s'", argv[optind+0]); if (fpout2 == 0) fprintf(stderr, "[maq_simulate] cannot write to file '%s'", argv[optind+1]); if (fp_fa == 0) fprintf(stderr, "[maq_simulate] cannot open reference sequence file '%s'", argv[optind+2]); if (fp_par == 0) fprintf(stderr, "[maq_simulate] cannot open parameter file '%s'\n", argv[optind+3]); assert(fpout1 && fpout2 && fp_fa && fp_par); maq_simulate_core(fpout1, fpout2, fp_fa, fp_par, is_hap, N, dist, std_dev, size_l, size_r); fclose(fpout1);fclose(fpout2); fclose(fp_fa); gzclose(fp_par); return 0; } /* simustat */ static void simustat_core(gzFile fp, int Q_thres) { static int n_valid_abflags = 5; static int valid_abflags[5] = {PAIRFLAG_FF, PAIRFLAG_FR, PAIRFLAG_RF, PAIRFLAG_RR, PAIRFLAG_DIFFCHR}; bit32_t wc_single[10], wc_pair[10], abpair[256][10], tot_single[10], tot_pair[10]; maqmap_t *m = maqmap_read_header(fp); maqmap1_t *m1, mm1; char str[256]; int i, k, tc[2][2]; m1 = &mm1; memset(wc_single, 0, 40); memset(tot_single, 0, 40); memset(wc_pair, 0, 40); memset(tot_pair, 0, 40); memset(abpair, 0, 4 * 256 * 10); memset(tc[2], 0, 4 * sizeof(int)); while (maqmap_read1(fp, m1)) { int is_correct; bit32_t p1, p2; if (m1->info2 > 60) continue; p1 = p2 = 0xffffffffu; strcpy(str, m1->name); for (k = 0, i = strlen(str) - 1; i >= 0; --i) { if (str[i] == '_') { if (k == 0) { ++k; str[i] = 0; } else if (k == 1) { p2 = atoi(str + i + 1); ++k; str[i] = 0; } else if (k == 2) { p1 = atoi(str + i + 1); ++k; str[i] = 0; } } } assert(p1 != 0xffffffffu && p2 != 0xffffffffu); is_correct = 1; if (strcmp(str, m->ref_name[m1->seqid])) is_correct = 0; else { if (p1 < p2) { --p1; // zero based if (m1->pos&1) { // reverse if (abs((m1->pos>>1) - (p2 - m1->size)) > 1) is_correct = 0; } else { // forward if (abs((m1->pos>>1) - p1) > 1) is_correct = 0; } } else { --p2; // zero based if (m1->pos&1) { // reverse if (abs((m1->pos>>1) - (p1 - m1->size)) > 1) is_correct = 0; } else { // forward if (abs((m1->pos>>1) - p2) > 1) is_correct = 0; } } } k = m1->seq[MAX_READLEN-1] / 10; if (k > 9) k = 9; ++tot_single[k]; if (!is_correct) ++wc_single[k]; k = m1->map_qual / 10; if (k > 9) k = 0; ++tot_pair[k]; // ++tc[is_correct][m1->info1>>4&1]; // debug only if (!is_correct) ++wc_pair[k]; if (!is_correct && m1->map_qual >= Q_thres) { fprintf(stderr, "%s\t%u\t%c\t%d\t%d\t%d\t%d\n", m1->name, m1->pos>>1, (m1->pos&1)? '-' : '+', m1->map_qual, m1->seq[MAX_READLEN-1], m1->c[0], m1->c[1]); } // count abnormal pairs if (m1->flag != (PAIRFLAG_FR | PAIRFLAG_PAIRED) && m1->flag != PAIRFLAG_NOMATCH) { k = m1->alt_qual / 10; if (k > 9) k = 9; ++abpair[m1->flag][k]; } } // print out // fprintf(stderr, "%d,%d,%d,%d\n", tc[0][0], tc[0][1], tc[1][0], tc[1][1]); printf("%2s%12s / %-12s%12s / %-12s", "Q", "SE_wrong", "SE_tot", "PE_wrong", "PE_tot"); for (i = 0; i != n_valid_abflags; ++i) printf("%12d", valid_abflags[i]); printf("\n"); for (k = 0; k <= 9; ++k) { printf("%dx%12d / %-12d%12d / %-12d", k, wc_single[k], tot_single[k], wc_pair[k], tot_pair[k]); for (i = 0; i != n_valid_abflags; ++i) printf("%12d", abpair[valid_abflags[i]][k]); printf("\n"); } maq_delete_maqmap(m); } int maq_simustat(int argc, char *argv[]) { gzFile fp; int Q = 100; if (argc < 2) { fprintf(stderr, "Usage: maq simustat [=100]\n"); return 1; } fp = gzopen(argv[1], "r"); if (argc >= 3) Q = atoi(argv[2]); assert(fp); simustat_core(fp, Q); gzclose(fp); return 0; }