Finished reading input. ++ echo CB6633.3/./s_4_1_sequence_read1.fastq ++ tr , ' ' + fastq_file_set1=CB6633.3/./s_4_1_sequence_read1.fastq ++ cd /data/maqgene/reads ++ cat CB6633.3/./s_4_1_sequence_read1.fastq ++ wc -l ++ cut -f 1 -d ' ' + TOTAL_LINES=252530668 ++ echo '(252530668 / 5000000) + (252530668 % 5000000 != 0)' ++ bc + num_chunks=51 + set +x make -j 14 -l 14 -C /data/maqgene/work --warn-undefined-variables -I /data/maqgene -f /data/maqgene/makefile BFA_FILE=elegans.bfa MAQGENE_GENOME_DIR=/data/maqgene/genomes reads_cksum=3884200371 map_cksum=3736054160 pileup_cksum=1242175186 cns_cksum=2320895160 umdd=50000 dmdd=1000 umid=1000 dmid=1000 max_gene_radius=50000 full_results_dir=/data/maqgene/out/example_user/CB6633_3 outfile_basename=CB6633_3 map_parameters=" -m 0.00001 -C 250 -n 2 -2 0 -a 500 -e 100 -A 500 -1 0" assemble_parameters=" -N 2 -q 0 -r 0.0 -m 7 -Q 100 -p " pileup_parameters=" -Q 100 -p -m 7 -q 0" CHUNK_SIZE=5000000 NUM_CHUNKS=51 Will process 252530668 lines of input in 51 chunks make: Entering directory `/data/maqgene/work' # Fri Nov 25 10:37:22 GMT 2011: Removing frontend files in case this is a duplicate run ... rm -f /data/maqgene/out/example_user/CB6633_3/CB6633_3_{grouped.txt,flat.txt} rm -f /data/maqgene/out/example_user/CB6633_3/CB6633_3_uncovered.txt /data/maqgene/out/example_user/CB6633_3/CB6633_3_coverage.txt /data/maqgene/out/example_user/CB6633_3/CB6633_3_pileup.txt /data/maqgene/out/example_user/CB6633_3/CB6633_3_log.txt /data/maqgene/out/example_user/CB6633_3/CB6633_3_check.txt /data/maqgene/out/example_user/CB6633_3/CB6633_3_unmapped.txt make: Leaving directory `/data/maqgene/work' make: Entering directory `/data/maqgene/work' /data/maqgene/makefile:126: warning: undefined variable `mapcheck_parameters' # Fri Nov 25 10:37:22 GMT 2011: Generating consensus ... /data/maqgene/bin/maq assemble -N 2 -q 0 -r 0.0 -m 7 -Q 100 -p 2320895160.cns \ /data/maqgene/genomes/elegans.bfa 3736054160.map 2> 2320895160_log.txt # Fri Nov 25 10:48:29 GMT 2011: Linking backend file 3736054160_unmapped.txt to /data/maqgene/out/example_user/CB6633_3/CB6633_3_unmapped.txt ln -fs /data/maqgene/work/3736054160_unmapped.txt /data/maqgene/out/example_user/CB6633_3/CB6633_3_unmapped.txt # Fri Nov 25 10:48:29 GMT 2011: Linking backend file 2320895160_log.txt to /data/maqgene/out/example_user/CB6633_3/CB6633_3_log.txt ln -fs /data/maqgene/work/2320895160_log.txt /data/maqgene/out/example_user/CB6633_3/CB6633_3_log.txt # Fri Nov 25 10:48:29 GMT 2011: Running 'mapcheck' ... /data/maqgene/bin/maq mapcheck /data/maqgene/genomes/elegans.bfa 3736054160.map > 3736054160_check.txt \ 2>/dev/null # Fri Nov 25 10:55:48 GMT 2011: Creating pileup ... /data/maqgene/bin/maq pileup -Q 100 -p -m 7 -q 0 /data/maqgene/genomes/elegans.bfa 3736054160.map > 1242175186_pileup.txt # Fri Nov 25 11:05:30 GMT 2011: Linking backend file 3736054160_check.txt to /data/maqgene/out/example_user/CB6633_3/CB6633_3_check.txt ln -fs /data/maqgene/work/3736054160_check.txt /data/maqgene/out/example_user/CB6633_3/CB6633_3_check.txt # Fri Nov 25 11:05:30 GMT 2011: Linking backend file 1242175186_pileup.txt to /data/maqgene/out/example_user/CB6633_3/CB6633_3_pileup.txt ln -fs /data/maqgene/work/1242175186_pileup.txt /data/maqgene/out/example_user/CB6633_3/CB6633_3_pileup.txt # Fri Nov 25 11:05:30 GMT 2011: Making coverage histogram ... (echo -en "sequencing_depth\tnumber_of_bases\n"; \ cut -f 4 1242175186_pileup.txt \ | /data/maqgene/bin/pileup_histogram 100) > 1242175186_coverage.txt # Fri Nov 25 11:07:26 GMT 2011: Getting uncovered regions ... cut -f 1,2,4 1242175186_pileup.txt \ | /data/maqgene/bin/get_uncovered_regions 50 \ > 1242175186_uncovered.txt # Fri Nov 25 11:09:37 GMT 2011: Linking backend file 1242175186_coverage.txt to /data/maqgene/out/example_user/CB6633_3/CB6633_3_coverage.txt ln -fs /data/maqgene/work/1242175186_coverage.txt /data/maqgene/out/example_user/CB6633_3/CB6633_3_coverage.txt # Fri Nov 25 11:09:37 GMT 2011: Linking backend file 1242175186_uncovered.txt to /data/maqgene/out/example_user/CB6633_3/CB6633_3_uncovered.txt ln -fs /data/maqgene/work/1242175186_uncovered.txt /data/maqgene/out/example_user/CB6633_3/CB6633_3_uncovered.txt /data/maqgene/bin/filter_matching_lines 1242175186_pileup.txt "%s\t%i\t" \ <(/usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile -e "select distinct dna, start + 1 from elegans_features where feature = 'SNP' order by dna, start") "%s\t%i\n" si \ | /data/maqgene/bin/filter_maq_pileup 'AaCcGgTt,.' 0 \ > 1242175186_known_snps # Fri Nov 25 11:11:57 GMT 2011: Extracting point mutants from consensus ... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 2320895160_snps; create table 2320895160_snps ( id int(10) NOT NULL auto_increment, dna char(40) NOT NULL, start int(10) NOT NULL, end int(10) NOT NULL, indel_size int(5) NOT NULL, variant_type enum('indel', 'point') NOT NULL, ref_base char NOT NULL, sample_base char NOT NULL, snp_score int(5) NOT NULL, read_depth int(5) NOT NULL, loci_multiplicity double(5,2) NOT NULL, mapping_quality int(5) NOT NULL, neighbor_quality int(5) NOT NULL, primary key (id), unique key (dna, start), key (indel_size) )" /data/maqgene/bin/maq cns2snp 2320895160.cns \ | cat -b | /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "load data local infile '/dev/stdin' into table 2320895160_snps (id, dna, @start, ref_base, sample_base, snp_score, read_depth, loci_multiplicity, mapping_quality, neighbor_quality) set start = @start - 1, end = @start, indel_size = 0, variant_type = 'point'; flush table 2320895160_snps;" # Fri Nov 25 11:12:21 GMT 2011: Extracting indels from consensus ... /data/maqgene/bin/maq indelsoa /data/maqgene/genomes/elegans.bfa 3736054160.map \ | awk '{ if ($5+$6-$4 >= 5 && $4 <= 1) { print $0 } }' \ | /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "load data local infile '/dev/stdin' into table 2320895160_snps (dna, @start, indel_size, @num_spanning_reads, @num_left_reads, @num_right_reads, @junk) set id = NULL, start = @start - 1, end = @start - 1 + if (indel_size > 0, 0, -indel_size), ref_base = 'X', sample_base = 'X', variant_type = 'indel', snp_score = -1000, read_depth = if (@num_left_reads < @num_right_reads, @num_left_reads, @num_right_reads), loci_multiplicity = -1000, mapping_quality = -1000, neighbor_quality = -1000; flush table 2320895160_snps;" # Fri Nov 25 11:17:58 GMT 2011: Adding placeholders for known SNPs. cat 1242175186_known_snps \ | /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "load data local infile '/dev/stdin' ignore into table 2320895160_snps (dna, @start, ref_base, read_depth, @junk) set id = NULL, start = @start - 1, end = @start - 1, variant_type = 'point', snp_score = -1000, loci_multiplicity = -1000, mapping_quality = -1000, neighbor_quality = -1000; flush table 2320895160_snps;" Found 160198 variants. touch 2320895160_snps # Fri Nov 25 11:18:24 GMT 2011: Filtering and loading pileup for analysis ... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 1242175186_pileup; create table 1242175186_pileup ( dna enum('I','II','III','IV','MtDNA','V','X') NOT NULL, start int(10) NOT NULL, ref_base char(1) NOT NULL, read_depth int(5) NOT NULL, sample_reads varchar(100) NOT NULL, A_fwd int(5) NOT NULL, A_rev int(5) NOT NULL, C_fwd int(5) NOT NULL, C_rev int(5) NOT NULL, G_fwd int(5) NOT NULL, G_rev int(5) NOT NULL, T_fwd int(5) NOT NULL, T_rev int(5) NOT NULL, wt_fwd int(5) NOT NULL, wt_rev int(5) NOT NULL, primary key (dna, start) );" cat 1242175186_pileup.txt \ | /data/maqgene/bin/filter_maq_pileup 'AaCcGgTt,.' 2 \ | awk 'BEGIN { OFS="\t" } { $5=substr($5,1,100); print $0 }' \ | /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "load data local infile '/dev/stdin' into table 1242175186_pileup (dna, @start, ref_base, read_depth, sample_reads, A_fwd, A_rev, C_fwd, C_rev, G_fwd, G_rev, T_fwd, T_rev, wt_fwd, wt_rev) set start = @start - 1; flush table 1242175186_pileup" cat 1242175186_known_snps \ | awk 'BEGIN { OFS="\t" } { $5=substr($5,1,100); print $0 }' \ | /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "load data local infile '/dev/stdin' into table 1242175186_pileup (dna, @start, ref_base, read_depth, sample_reads, A_fwd, A_rev, C_fwd, C_rev, G_fwd, G_rev, T_fwd, T_rev, wt_fwd, wt_rev) set start = @start - 1; flush table 1242175186_pileup" touch 1242175186_pileup # Fri Nov 25 11:21:07 GMT 2011: Getting masking regions /data/maqgene/bin/associate_regions -b -1 -c -1000000 -m 2 -o >(/usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile -e "drop table if exists 2320895160_masked_ids; create table 2320895160_masked_ids (query_region_id int(10) NOT NULL, primary key (query_region_id)); load data local infile '/dev/stdin' ignore into table 2320895160_masked_ids (@j1, query_region_id, @j3, @j4, @j5, @j6, @j7)") -d /data/maqgene/genomes/elegans.dnas -p /data/maqgene/genomes -q <(/usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile -e "select id, 'elegans', dna, start, end, '+' from 2320895160_snps") -t <(/usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile -e "select id,'elegans',dna,start,end,strand from elegans_features feature where feature.feature not in ('mRNA', 'intron')") 1>/dev/null; touch 2320895160_masked_ids # Fri Nov 25 11:21:11 GMT 2011: Writing snp read counts ... /usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile --column-names -e "select f.id, f.dna, f.start + 1 start, e.snp_name, e.snp_base, if(e.snp_base = 'a', p.A_fwd + p.A_rev, if (e.snp_base = 'c', p.C_fwd + p.C_rev, if (e.snp_base = 'g', p.G_fwd + p.G_rev, if (e.snp_base = 't', p.T_fwd + p.T_rev, -1)))) snp_reads, p.ref_base, p.wt_fwd + p.wt_rev wt_reads, p.read_depth, p.sample_reads from elegans_features f join elegans_snp_changes e on (f.attribute = e.snp_name) join 1242175186_pileup p using (dna,start)" > /data/maqgene/out/example_user/CB6633_3/CB6633_3_snp_read_counts.txt 0 snp read count lines written. # Fri Nov 25 11:21:11 GMT 2011: Finding all genomic features overlapping variants... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 2320895160_rel_snps; CREATE TABLE 2320895160_rel_snps ( association_id INT(10) NOT NULL, query_region_id INT(10) unsigned NOT NULL, target_region_id INT(10) NOT NULL, distance INT(10) NOT NULL, overlap INT(10) NOT NULL, same_strand enum('SAME', 'OPP') NOT NULL, num_regions_between INT(3) NOT NULL, PRIMARY KEY (association_id), KEY (query_region_id), KEY (target_region_id) )" /data/maqgene/bin/associate_regions -b -1 -c -1000000 -m 0 -o >(/usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "load data local infile '/dev/stdin' into table 2320895160_rel_snps; flush table 2320895160_rel_snps") -d /data/maqgene/genomes/elegans.dnas -p /data/maqgene/genomes -q <(/usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile -e "select id, 'elegans', dna, start, end, '+' from 2320895160_snps") -t <(/usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile -e "select id, 'elegans', dna,start,end,strand from elegans_features where feature in ('SNP','exon','five_prime_UTR','intron','mRNA','three_prime_UTR','ncRNA')") 1>/dev/null touch 2320895160_rel_snps # Fri Nov 25 11:21:19 GMT 2011: Loading uncovered regions into table ... cat 1242175186_uncovered.txt \ | /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 1242175186_uncovered; create table 1242175186_uncovered ( id int(10) NOT NULL auto_increment, dna char(40) NOT NULL, start int(10) NOT NULL, end int(10) NOT NULL, primary key (id) ) auto_increment = 160199; load data local infile '/dev/stdin' into table 1242175186_uncovered (dna, start, end)" Uncovered region statistics: chromosome number_uncovered_regions total_uncovered_length I 12 1749 II 14 2047 III 7 825 IV 19 5480 V 9 1148 X 9 10639822 touch 1242175186_uncovered # Fri Nov 25 11:21:20 GMT 2011: Getting intergenic regions /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 2320895160_rel_intergenic; create table 2320895160_rel_intergenic like 2320895160_rel_snps"; /data/maqgene/bin/associate_regions -b 50000 -c -1000000 -m 2 -o >(/usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "load data local infile '/dev/stdin' into table 2320895160_rel_intergenic; flush table 2320895160_rel_intergenic") -d /data/maqgene/genomes/elegans.dnas -p /data/maqgene/genomes -q <(/usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile -e "select id, 'elegans', dna, start, end, '+' from 2320895160_snps") -t <(/usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile -e "select id, 'elegans',dna,start,end,strand from elegans_features feature where feature.feature = 'mRNA'") 1>/dev/null; # Fri Nov 25 11:21:27 GMT 2011: Filtering masked intergenic regions /usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile -e "delete i.* from 2320895160_rel_intergenic i join 2320895160_masked_ids m using (query_region_id)" touch 2320895160_rel_intergenic # Fri Nov 25 11:21:34 GMT 2011: Finding mRNA features overlapping uncovered regions ... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 1242175186_uncovered_rel; CREATE TABLE 1242175186_uncovered_rel ( association_id INT(10) NOT NULL, query_region_id INT(10) unsigned NOT NULL, target_region_id INT(10) NOT NULL, distance INT(10) NOT NULL, overlap INT(10) NOT NULL, same_strand enum('SAME', 'OPP') NOT NULL, num_regions_between INT(3) NOT NULL, PRIMARY KEY (association_id), KEY (query_region_id), KEY (target_region_id) )" /data/maqgene/bin/associate_regions -b -1 -c -1000000 -m 0 -o >(/usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "load data local infile '/dev/stdin' into table 1242175186_uncovered_rel; flush table 1242175186_uncovered_rel") -d /data/maqgene/genomes/elegans.dnas -p /data/maqgene/genomes -q <(/usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile -e "select id, 'elegans', dna, start, end, '+' from 1242175186_uncovered") -t <(/usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile -e "select id, 'elegans', dna,start,end,strand from elegans_features where feature in ('SNP','exon','five_prime_UTR','intron','mRNA','three_prime_UTR','ncRNA')") 1>/dev/null touch 1242175186_uncovered_rel # Fri Nov 25 11:22:06 GMT 2011: Classifying intergenic relations /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 2320895160_intergenic_assoc; create table 2320895160_intergenic_assoc ( id INT(10) NOT NULL, target_region_id INT(10) NOT NULL, relation enum ('upstream', 'downstream', 'into') NOT NULL, distance INT(10) NOT NULL, num_regions_between INT(3) NOT NULL, attribute varchar(50) NOT NULL, primary key (id, target_region_id), key (id, attribute) ); insert into 2320895160_intergenic_assoc select snp.id, rel.target_region_id, if (rel.overlap > 0, 'into', if (feature.strand = '+', if (snp.end < feature.start, 'upstream', 'downstream'), if (feature.end < snp.start, 'upstream', 'downstream') ) ) relation, rel.distance, rel.num_regions_between, feature.attribute from 2320895160_snps snp join 2320895160_rel_intergenic rel on (snp.id = rel.query_region_id) join elegans_features feature on (rel.target_region_id = feature.id)" touch 2320895160_intergenic_assoc # Fri Nov 25 11:22:26 GMT 2011: Calculating translation start offsets for uncovered regions... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 1242175186_offsets_uncovered; create table 1242175186_offsets_uncovered ( id int(10) NOT NULL, gene varchar(50) NOT NULL, 5prime_offset int(10) NOT NULL, 3prime_offset int(10) NOT NULL, total_length int(10) NOT NULL, boundary_type char(20) NOT NULL, primary key (id, gene, boundary_type) );" /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "insert into 1242175186_offsets_uncovered select reg.id, gene.attribute gene, if(exon.strand = '+', if(reg.start < min(exon.start), reg.start - min(exon.start), sum(if(reg.start < exon.end, reg.start, exon.end) - if(reg.start < exon.start, reg.start, exon.start))), if(max(exon.end) < reg.start, max(exon.end) - reg.start, sum(if(exon.end < reg.start, reg.start, exon.end) - if(exon.start < reg.start, reg.start, exon.start))) ), if(exon.strand = '+', if(max(exon.end) < reg.start, max(exon.end) - reg.start, sum(if(exon.end < reg.start, reg.start, exon.end) - if(exon.start < reg.start, reg.start, exon.start))), if(reg.start < min(exon.start), reg.start - min(exon.start), sum(if(reg.start < exon.end, reg.start, exon.end) - if(reg.start < exon.start, reg.start, exon.start)))), sum(exon.end - exon.start) total_length, 'start' boundary_type from 1242175186_uncovered reg join 1242175186_uncovered_rel rel on (reg.id = rel.query_region_id) join elegans_features gene on (rel.target_region_id = gene.id) join elegans_features exon on (exon.attribute = gene.attribute) where exon.feature = 'exon' and gene.feature = 'mRNA' group by reg.id, gene.attribute" /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "insert into 1242175186_offsets_uncovered select reg.id, gene.attribute gene, if(exon.strand = '+', if(reg.end < min(exon.start), reg.end - min(exon.start), sum(if(reg.end < exon.end, reg.end, exon.end) - if(reg.end < exon.start, reg.end, exon.start))), if(max(exon.end) < reg.end, max(exon.end) - reg.end, sum(if(exon.end < reg.end, reg.end, exon.end) - if(exon.start < reg.end, reg.end, exon.start))) ), if(exon.strand = '+', if(max(exon.end) < reg.end, max(exon.end) - reg.end, sum(if(exon.end < reg.end, reg.end, exon.end) - if(exon.start < reg.end, reg.end, exon.start))), if(reg.end < min(exon.start), reg.end - min(exon.start), sum(if(reg.end < exon.end, reg.end, exon.end) - if(reg.end < exon.start, reg.end, exon.start)))), sum(exon.end - exon.start) total_length, 'end' boundary_type from 1242175186_uncovered reg join 1242175186_uncovered_rel rel on (reg.id = rel.query_region_id) join elegans_features gene on (rel.target_region_id = gene.id) join elegans_features exon on (exon.attribute = gene.attribute) where exon.feature = 'exon' and gene.feature = 'mRNA' group by reg.id, gene.attribute" touch 1242175186_offsets_uncovered # Fri Nov 25 11:22:27 GMT 2011: Calculating translation start offsets... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 2320895160_offsets_snps; create table 2320895160_offsets_snps ( id int(10) NOT NULL, gene varchar(50) NOT NULL, 5prime_offset int(10) NOT NULL, 3prime_offset int(10) NOT NULL, total_length int(10) NOT NULL, boundary_type char(20) NOT NULL, primary key (id, gene, boundary_type) );" /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "insert into 2320895160_offsets_snps select reg.id, gene.attribute gene, if(exon.strand = '+', if(reg.start < min(exon.start), reg.start - min(exon.start), sum(if(reg.start < exon.end, reg.start, exon.end) - if(reg.start < exon.start, reg.start, exon.start))), if(max(exon.end) < reg.start, max(exon.end) - reg.start, sum(if(exon.end < reg.start, reg.start, exon.end) - if(exon.start < reg.start, reg.start, exon.start))) ), if(exon.strand = '+', if(max(exon.end) < reg.start, max(exon.end) - reg.start, sum(if(exon.end < reg.start, reg.start, exon.end) - if(exon.start < reg.start, reg.start, exon.start))), if(reg.start < min(exon.start), reg.start - min(exon.start), sum(if(reg.start < exon.end, reg.start, exon.end) - if(reg.start < exon.start, reg.start, exon.start)))), sum(exon.end - exon.start) total_length, 'start' boundary_type from 2320895160_snps reg join 2320895160_rel_snps rel on (reg.id = rel.query_region_id) join elegans_features gene on (rel.target_region_id = gene.id) join elegans_features exon on (exon.attribute = gene.attribute) where exon.feature = 'exon' and gene.feature = 'mRNA' group by reg.id, gene.attribute" touch 2320895160_offsets_snps # Fri Nov 25 11:22:59 GMT 2011: Getting codons affected by point mutations... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 2320895160_codons; create table 2320895160_codons ( id int(10) NOT NULL, gene varchar(50) NOT NULL, ref_codon char(4) NOT NULL, sam_codon char(4) NOT NULL, start char(1) NOT NULL, primary key (id, gene) ); insert into 2320895160_codons select snp.id, off.gene, @ref_codon := substr(gene_dna.sequence, (floor(if(feature.strand = '+', off.5prime_offset, off.5prime_offset - 1 ) / 3 ) * 3 ) + 1, 3 ) ref_codon, insert(@ref_codon, (if(feature.strand = '+', off.5prime_offset, off.5prime_offset - 1) % 3) + 1, 1, if(feature.strand = '+', snp.sample_base, substr('ACGTTGCA', instr('ACGTTGCA', snp.sample_base)+4, 1) ) ) sam_codon, if((feature.strand = '+' && off.5prime_offset < 3) || (feature.strand = '-' && off.5prime_offset - 1 < 3), 'Y', 'N') start from 2320895160_offsets_snps off join 2320895160_snps snp on (off.id = snp.id) join elegans_coding_dna gene_dna using (gene) join elegans_features feature on (gene_dna.gene = feature.attribute and feature.start <= snp.start and feature.end >= snp.end) where snp.variant_type = 'point' and feature.feature = 'exon' and off.boundary_type = 'start' and off.5prime_offset >= 0 and off.3prime_offset >= 0;" touch 2320895160_codons /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 2320895160_marked; create table 2320895160_marked ( id int(10) NOT NULL, class varchar(100) NOT NULL, description varchar(100) NOT NULL, parent_feature varchar(100) NOT NULL, primary key (id, class, parent_feature) );" # Fri Nov 25 11:23:13 GMT 2011: Finding exonic indels ... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "insert into 2320895160_marked select snp.id, if (snp.indel_size < 0, if (rel.overlap % 3 = 0, 'inframe', 'frameshift'), if (snp.indel_size % 3 = 0, 'inframe', 'frameshift')) class, if (rel.overlap != snp.end - snp.start, 'exon_boundary', 'none') description, feature.attribute parent_feature from 2320895160_snps snp join 2320895160_rel_snps rel on (snp.id = rel.query_region_id) join elegans_features feature on (rel.target_region_id = feature.id) where feature.feature = 'exon' and snp.variant_type = 'indel';" # Fri Nov 25 11:23:14 GMT 2011: Finding coding variants ... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "insert into 2320895160_marked select trans.id, if (reference_codons.amino = sample_codons.amino, 'silent', if (reference_codons.amino3 = 'stop', 'readthrough', if (sample_codons.amino3 = 'stop', 'premature_stop', if (trans.start = 'Y' and sample_codons.is_start != 'Y', 'non_start', 'missense') ) ) ) class, concat(ref_codon,'->',sam_codon,'[', reference_codons.amino3,'->',sample_codons.amino3,']') description, trans.gene from 2320895160_codons trans join elegans_genetic_code reference_codons on (trans.ref_codon = reference_codons.codon) join elegans_genetic_code sample_codons on (trans.sam_codon = sample_codons.codon);" # Fri Nov 25 11:23:14 GMT 2011: Finding noncoding variants ... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "insert into 2320895160_marked select snp.id, feature.feature class, 'none' description, feature.attribute parent_feature from 2320895160_snps snp join 2320895160_rel_snps rel on (snp.id = rel.query_region_id) join elegans_features feature on (rel.target_region_id = feature.id) where feature.feature not in ('exon', 'intron', 'mRNA');" # Fri Nov 25 11:23:15 GMT 2011: Finding splice site variants ... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "insert into 2320895160_marked select snp.id, if (feature.strand = '+', if (snp.start - feature.start < 2, 'splice_donor', 'splice_acceptor'), if (snp.start - feature.start < 2, 'splice_acceptor', 'splice_donor')) class, 'none' description, feature.attribute parent_feature from 2320895160_snps snp join 2320895160_rel_snps rel on (snp.id = rel.query_region_id) join elegans_features feature on (rel.target_region_id = feature.id) where feature.feature = 'intron' and rel.distance > -2;" # Fri Nov 25 11:23:16 GMT 2011: Finding intergenic and intronic variants ... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "insert into 2320895160_marked select id, 'nongenic' class, concat(if (relation in ('upstream','into'), - distance, distance),' ',relation) description, attribute from 2320895160_intergenic_assoc where ((relation = 'upstream' and num_regions_between = 0 and distance < 50000) or (relation = 'upstream' and num_regions_between < 2 and distance < 1000) or (relation = 'downstream' and num_regions_between = 0 and distance < 1000) or (relation = 'downstream' and num_regions_between < 2 and distance < 1000) or relation = 'into' );" # Fri Nov 25 11:23:21 GMT 2011: Finding uncovered regions in genes ... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "insert into 2320895160_marked select b.id, 'uncovered', if(b.5prime_offset = e.5prime_offset, concat('non-exonic, ',unc.start - gene.start,' to ',unc.end - gene.start,' bp into'), concat( if(b.5prime_offset < 0, concat(-b.5prime_offset, ' bp upstream'), if(b.3prime_offset < 0, concat(-b.3prime_offset, ' bp downstream'), concat('codon ',floor(b.5prime_offset / 3) + 1) ) ), ' to ', if(e.3prime_offset < 0, concat(-e.3prime_offset, ' bp downstream'), if(e.5prime_offset < 0, concat(-e.5prime_offset, ' bp upstream'), concat('codon ',ceiling(e.5prime_offset / 3) + 1) ) ), ' (', format(100 * (if(e.5prime_offset < 0,0,e.5prime_offset) - (if(b.5prime_offset < 0,0,b.5prime_offset)) ) / b.total_length,0 ), ' % of ', floor(b.total_length/3), ' codons)' ) ) description, b.gene from 1242175186_uncovered unc join 1242175186_offsets_uncovered b using (id) join 1242175186_offsets_uncovered e using (id, gene) join elegans_features gene on (e.gene = gene.attribute) where b.boundary_type = 'start' and e.boundary_type = 'end' and gene.strand = '+' and gene.feature = 'mRNA' union select b.id, 'uncovered', if(b.5prime_offset = e.5prime_offset, concat('non-exonic, ',gene.end - unc.end,' to ',gene.end - unc.start,' bp into'), concat( if(e.5prime_offset < 0, concat(-e.5prime_offset, ' bp upstream'), if(e.3prime_offset < 0, concat(-e.3prime_offset, ' bp downstream'), concat('codon ',floor(e.5prime_offset / 3) + 1) ) ), ' to ', if(b.3prime_offset < 0, concat(-b.3prime_offset, ' bp downstream'), if(b.5prime_offset < 0, concat(-b.5prime_offset, ' bp upstream'), concat('codon ',ceiling(b.5prime_offset / 3) + 1) ) ), ' (', format(100 * (if(b.5prime_offset < 0,0,b.5prime_offset) - (if(e.5prime_offset < 0,0,e.5prime_offset)) ) / b.total_length,0 ), ' % of ', floor(b.total_length/3), ' codons)' ) ) description, b.gene from 1242175186_uncovered unc join 1242175186_offsets_uncovered b using (id) join 1242175186_offsets_uncovered e using (id, gene) join elegans_features gene on (e.gene = gene.attribute) where b.boundary_type = 'start' and e.boundary_type = 'end' and gene.strand = '-' and gene.feature = 'mRNA'" # Fri Nov 25 11:23:21 GMT 2011: Variants found: +-----------------+-----------------+ | class | number_variants | +-----------------+-----------------+ | five_prime_UTR | 1300 | | frameshift | 77 | | inframe | 25 | | missense | 6092 | | ncRNA | 854 | | nongenic | 123843 | | non_start | 9 | | premature_stop | 686 | | readthrough | 19 | | silent | 7382 | | SNP | 970 | | splice_acceptor | 45 | | splice_donor | 45 | | three_prime_UTR | 3545 | | uncovered | 24 | +-----------------+-----------------+ touch 2320895160_marked # Fri Nov 25 11:23:22 GMT 2011: Combining results ... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 2320895160_combined; create table 2320895160_combined (primary key (id, class, description), key (dna, start)) as select snp.id, 'CB6633_3' mutant_strain, snp.dna, snp.start + 1 start, snp.end - snp.start length, snp.ref_base reference_base, snp.sample_base sample_base, snp.snp_score consensus_score, snp.loci_multiplicity, snp.mapping_quality, snp.neighbor_quality, (pile.wt_fwd + pile.wt_rev) number_wildtype_reads, (snp.read_depth - (pile.wt_fwd + pile.wt_rev)) number_variant_reads, snp.read_depth sequencing_depth, pile.sample_reads, snp.variant_type, snp.indel_size, class.class, class.description, group_concat(class.parent_feature order by class.parent_feature) parent_features from 2320895160_snps snp join 1242175186_pileup pile using (dna,start) join 2320895160_marked class on (snp.id = class.id) left join elegans_per_locus using (dna,start) where (snp.variant_type = 'point' and (snp.snp_score >= 3 and snp.loci_multiplicity <= 10 and snp.neighbor_quality >= 0 and pile.read_depth >= 4 and ((pile.read_depth - pile.wt_fwd - pile.wt_rev) / pile.read_depth) >= 0.5) or snp.variant_type = 'indel' ) group by id, class, description union select unc.id id, 'CB6633_3' mutant_strain, unc.dna, unc.start + 1 start, unc.end - unc.start length, '-' reference_base, '-' sample_base, 0 consensus_score, 0, 0, 0, 0 number_wildtype_reads, 0 number_variant_reads, 0 sequencing_depth, '' sample_reads, 'uncovered', 0, class.class, class.description, group_concat(class.parent_feature order by class.parent_feature) parent_features from 1242175186_uncovered unc join 2320895160_marked class on (unc.id = class.id) left join elegans_per_locus using (dna,start) group by id, class, description; alter table 2320895160_combined order by dna, start, length" touch 2320895160_combined # Fri Nov 25 11:23:33 GMT 2011: Writing results to grouped file ... /usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile --column-names -e "select id, mutant_strain, dna, start, length, reference_base, sample_base, consensus_score, loci_multiplicity, mapping_quality, neighbor_quality, number_wildtype_reads, number_variant_reads, sequencing_depth, sample_reads, variant_type, indel_size, group_concat(class) classes, group_concat(description) descriptions, group_concat(concat('{',parent_features,'}')) parent_features from 2320895160_combined group by id order by dna, start;" > /data/maqgene/out/example_user/CB6633_3/CB6633_3_grouped.txt 1860 lines written. # Fri Nov 25 11:23:33 GMT 2011: Writing results to flat file ... /usr/bin/mysql -Dmaqgenedb -umaqgene --skip-column-names --batch --raw --local-infile --column-names -e "select * from 2320895160_combined" > /data/maqgene/out/example_user/CB6633_3/CB6633_3_flat.txt 3168 lines written. make: Leaving directory `/data/maqgene/work' make: Entering directory `/data/maqgene/work' # Fri Nov 25 11:23:34 GMT 2011: Cleaning intermediate data ... /usr/bin/mysql -umaqgene -Dmaqgenedb --local-infile -e "drop table if exists 2320895160_snps,2320895160_rel,2320895160_masked_ids,2320895160_rel_intergenic,2320895160_intergenic_assoc,2320895160_offsets,2320895160_codons,2320895160_marked,2320895160_combined,1242175186_pileup" rm -f 2320895160_snps 2320895160_rel 2320895160_masked_ids 2320895160_rel_intergenic 2320895160_intergenic_assoc 2320895160_offsets 2320895160_codons 2320895160_marked 2320895160_combined 1242175186_pileup make: Leaving directory `/data/maqgene/work'