# The makefile for a performing a MAQGene run SHELL = /bin/bash # $(info from make: $(MYSQL_BATCH), $(shell echo "from the shell: $$MYSQL_BATCH")) include commands.mk outfile_prefix_path := $(full_results_dir)/$(outfile_basename) linked_txt_file_stubs = uncovered coverage pileup log check unmapped linked_txt_file_list := $(patsubst %,$(outfile_prefix_path)_%.txt,$(linked_txt_file_stubs)) # meta script to determine whether snp_changes table exists #has_snp_changes_table := $(shell $(MYSQL_BATCH) -e "$(strip $(call has_snp_changes_table))") has_snp_changes_table := 1 .PHONY: all all: $(full_results_dir)/$(outfile_basename)_grouped.txt all: $(full_results_dir)/$(outfile_basename)_flat.txt all: $(linked_txt_file_list) ifeq ($(has_snp_changes_table),1) all: snp_read_counts else endif .PHONY: clean_frontend_files clean_frontend_files: @echo "# $$(date): Removing frontend files in case this is a duplicate run ..." rm -f ${outfile_prefix_path}_{grouped.txt,flat.txt} rm -f $(linked_txt_file_list) #make TOTAL_LINES a target split_extensions := $(strip $(shell for e in $$(seq 0 $$(($(NUM_CHUNKS) - 1))); \ do printf "%05i " $$e; done)) map_files := $(patsubst %,$(map_cksum).%.map,$(split_extensions)) define per_locus_column_def ifnull($(PER_LOCUS_TABLE).$(col),'-missing-') per_locus_$(col), endef locus_extra_columns := $(strip $(shell $(MYSQL_BATCH) -e "$(strip $(call get_locus_extra_columns))")) per_locus_combined := $(foreach col,$(locus_extra_columns),$(per_locus_column_def)) # split first or second reads into sized chunks define rename_fastq for stem in $(split_extensions); do \ mv $(map_cksum).$(1).fastq.$$stem $(map_cksum).$$stem.$(1).fastq; \ done endef define FASTQ_SPLIT_TEMPLATE $(reads_cksum)_split$(1): $(addprefix $(MAQGENE_READS_DIR)/, $(fastq_file_set$(1))) @echo "# $$(date): Regrouping fastq reads into chunks of size $(CHUNK_SIZE)." split -l $(CHUNK_SIZE) -a 5 -d <(cat $$^) $(map_cksum).$(1).fastq. $$(call rename_fastq,$(1)) touch $$@ fastq_merge_set$(1) := $(patsubst %,$(map_cksum).%.$(1).fastq,$(split_extensions)) $$(fastq_merge_set$(1)) : $(reads_cksum)_split$(1) endef ifndef fastq_file_list2 fastq_file_set1 := $(subst $(comma),$(space),$(fastq_file_list1)) $(eval $(call FASTQ_SPLIT_TEMPLATE,1)) else fastq_file_set1 := $(subst $(comma),$(space),$(fastq_file_list1)) $(eval $(call FASTQ_SPLIT_TEMPLATE,1)) fastq_file_set2 := $(subst $(comma),$(space),$(fastq_file_list2)) $(eval $(call FASTQ_SPLIT_TEMPLATE,2)) endif .INTERMEDIATE : $(patsubst %,$(map_cksum).%.map,$(split_extensions)) .INTERMEDIATE : $(patsubst %,$(map_cksum).%.mismatch,$(split_extensions)) .INTERMEDIATE : $(patsubst %,$(map_cksum).%.unmapped,$(split_extensions)) .INTERMEDIATE : $(patsubst %,$(map_cksum).%.1.fastq,$(split_extensions)) .INTERMEDIATE : $(patsubst %,$(map_cksum).%.2.fastq,$(split_extensions)) .INTERMEDIATE : $(reads_cksum)_split1 $(reads_cksum)_split2 # convert to bfq $(map_cksum).%.bfq: $(map_cksum).%.fastq @echo "# $$(date): Converting fastq files to bfq ..." $(MAQGENE_EXE_DIR)/maq sol2sanger $< /dev/stdout | \ $(MAQGENE_EXE_DIR)/maq fastq2bfq /dev/stdin $@ # map ifdef fastq_file_list2 $(addprefix $(map_cksum).%.,map unmapped mismatch): $(map_cksum).%.1.bfq $(map_cksum).%.2.bfq else $(addprefix $(map_cksum).%.,map unmapped mismatch): $(map_cksum).%.1.bfq endif @echo "# $$(date): Mapping file(s) $^" $(MAQGENE_EXE_DIR)/maq map $(map_parameters) -u $(<:1.bfq=unmapped) -H $(<:1.bfq=mismatch) \ $(map_cksum).$*.map $(reference_genome) $^ 2> /dev/null # merge map files $(map_cksum).map: $(map_files) @echo "# $$(date): Merging all maps ..." $(MAQGENE_EXE_DIR)/maq mapmerge $@ $^ # merge *.unmapped files $(map_cksum)_unmapped.txt: $(map_files:map=unmapped) @echo "# $$(date): Merging all *.unmapped files ..." cat $^ > $@ # merge *.mismatch files $(map_cksum).mismatch: $(map_files:map=mismatch) @echo "# $$(date): Merging all *.mismatch files ..." cat $^ > $@ @echo $(wc -l $@) " lines." # mapcheck $(map_cksum)_check.txt: $(map_cksum).map @echo "# $$(date): Running 'mapcheck' ..." $(MAQGENE_EXE_DIR)/maq mapcheck $(mapcheck_parameters) $(reference_genome) $< > $@ \ 2>/dev/null # generate consensus $(cns_cksum)_log.txt: $(cns_cksum).cns $(cns_cksum).cns: $(map_cksum).map @echo "# $$(date): Generating consensus ..." $(MAQGENE_EXE_DIR)/maq assemble $(assemble_parameters) $@ \ $(reference_genome) $< 2> $(cns_cksum)_log.txt $(cns_cksum)_cq.txt: $(cns_cksum).cns @echo "# $$(date): Running maq command cns2fq ..." $(MAQGENE_EXE_DIR)/maq cns2fq $< > $@ $(cns_cksum)_view.txt: $(cns_cksum).cns @echo "# $$(date): Running maq command cns2view ..." $(MAQGENE_EXE_DIR)/maq cns2view $< > $@ $(cns_cksum)_aref.txt: $(cns_cksum).cns @echo "# $$(date): Running maq command cns2ref ..." $(MAQGENE_EXE_DIR)/maq cns2ref $< > $@ .PHONY: extra_files extra_files: $(patsubst %,$(outfile_prefix_path)_%.txt,cq view aref) # link backend map or cns files to front $(outfile_prefix_path)%.txt: $(map_cksum)%.txt @echo "# $$(date): Linking backend file $< to $@" ln -fs $(MAQGENE_WORK_DIR)/$< $@ $(outfile_prefix_path)%.txt: $(cns_cksum)%.txt @echo "# $$(date): Linking backend file $< to $@" ln -fs $(MAQGENE_WORK_DIR)/$< $@ $(outfile_prefix_path)%.txt: $(pileup_cksum)%.txt @echo "# $$(date): Linking backend file $< to $@" ln -fs $(MAQGENE_WORK_DIR)/$< $@ ####################################################################### # pileup pileup_symbol_order := 'AaCcGgTt,.' $(pileup_cksum)_pileup.txt: $(map_cksum).map @echo "# $$(date): Creating pileup ..." $(MAQGENE_EXE_DIR)/maq pileup $(pileup_parameters) $(reference_genome) $< > $@ define awk_print_filter BEGIN { OFS="\t" } { $$5=substr($$5,1,$(PILEUP_MAX_DISPLAY_DEPTH)); print $$0 } endef $(pileup_cksum)_known_snps: $(pileup_cksum)_pileup.txt $(MAQGENE_EXE_DIR)/filter_matching_lines $< "%s\t%i\t" \ <($(MYSQL_BATCH) -e "$(strip $(call get_known_snp_coords))") "%s\t%i\n" si \ | $(MAQGENE_EXE_DIR)/filter_maq_pileup $(pileup_symbol_order) 0 \ > $@ $(pileup_cksum)_pileup: $(pileup_cksum)_pileup.txt $(pileup_cksum)_known_snps @echo "# $$(date): Filtering and loading pileup for analysis ..." $(MYSQL_CMD) -e "$(strip $(call make_pileup_table_cmd,$@))" cat $< \ | $(MAQGENE_EXE_DIR)/filter_maq_pileup $(pileup_symbol_order) $(min_polymorphic_reads) \ | awk 'BEGIN { OFS="\t" } { $$5=substr($$5,1,$(PILEUP_MAX_DISPLAY_DEPTH)); print $$0 }' \ | $(MYSQL_CMD) -e "$(strip $(call load_pileup_table_cmd,$@))" cat $(word 2,$^) \ | awk 'BEGIN { OFS="\t" } { $$5=substr($$5,1,$(PILEUP_MAX_DISPLAY_DEPTH)); print $$0 }' \ | $(MYSQL_CMD) -e "$(strip $(call load_pileup_table_cmd,$@))" touch $@ ####################################################################### # Uncovered regions ####################################################################### $(pileup_cksum)_uncovered.txt: $(pileup_cksum)_pileup.txt @echo "# $$(date): Getting uncovered regions ..." cut -f 1,2,4 $< \ | $(MAQGENE_EXE_DIR)/get_uncovered_regions $(uncovered_region_window) \ > $@ $(pileup_cksum)_uncovered: $(pileup_cksum)_uncovered.txt $(cns_cksum)_snps @echo "# $$(date): Loading uncovered regions into table ..." cat $< \ | $(MYSQL_CMD) -e "$(strip $(call load_uncovered_table,$@,$(call get_next_id,id,$(word 2,$^))))" @echo "Uncovered region statistics:" @echo "$$($(MYSQL_CMD) --batch -e \ 'select dna chromosome, count(*) number_uncovered_regions, sum(end - start) total_uncovered_length \ from $@ group by dna')" touch $@ $(pileup_cksum)_uncovered_rel: $(pileup_cksum)_uncovered @echo "# $$(date): Finding mRNA features overlapping uncovered regions ..." $(MYSQL_CMD) -e "$(strip $(call make_rel_position_table,$@))" $(strip $(call get_containing_regions,$@,$<,$(genomic_feature_list))) touch $@ $(pileup_cksum)_offsets_uncovered: $(pileup_cksum)_uncovered $(pileup_cksum)_uncovered_rel @echo "# $$(date): Calculating translation start offsets for uncovered regions..." $(MYSQL_CMD) -e "$(strip $(call make_offsets_table,$@))" $(MYSQL_CMD) -e "$(strip $(call calc_tr_offsets,$@,$<,$(word 2,$^),start,id))" $(MYSQL_CMD) -e "$(strip $(call calc_tr_offsets,$@,$<,$(word 2,$^),end,id))" touch $@ $(pileup_cksum)_coverage.txt: $(pileup_cksum)_pileup.txt @echo "# $$(date): Making coverage histogram ..." (echo -en "sequencing_depth\tnumber_of_bases\n"; \ cut -f 4 $< \ | $(MAQGENE_EXE_DIR)/pileup_histogram $(HISTOGRAM_MAX_DEPTH)) > $@ $(cns_cksum)_snps: $(cns_cksum).cns $(map_cksum).map $(pileup_cksum)_known_snps @echo "# $$(date): Extracting point mutants from consensus ..." $(MYSQL_CMD) -e "$(strip $(call make_snp_table,$@))" $(MAQGENE_EXE_DIR)/maq cns2snp $< \ | cat -b | $(MYSQL_CMD) -e "$(strip $(call load_snp_table,$@))" @echo "# $$(date): Extracting indels from consensus ..." $(MAQGENE_EXE_DIR)/maq indelsoa $(reference_genome) $(word 2,$^) \ | awk '{ if ($$5+$$6-$$4 >= $(INDEL_READS_MIN_DIF) && $$4 <= 1) { print $$0 } }' \ | $(MYSQL_CMD) -e "$(strip $(call load_indels,$@))" @echo "# $$(date): Adding placeholders for known SNPs." cat $(word 3,$^) \ | $(MYSQL_CMD) -e "$(strip $(call load_snp_placeholders,$@))" @echo "Found $$($(MYSQL_CMD) --raw --skip-column-names -e 'select count(*) from $@') variants." touch $@ $(cns_cksum)_rel_snps: $(cns_cksum)_snps @echo "# $$(date): Finding all genomic features overlapping variants..." $(MYSQL_CMD) -e "$(strip $(call make_rel_position_table,$@))" $(strip $(call get_containing_regions,$@,$<,$(genomic_feature_list))) touch $@ $(cns_cksum)_masked_ids: $(cns_cksum)_snps @echo "# $$(date): Getting masking regions" $(strip $(call get_masking_regions,$@,$<)) touch $@ $(cns_cksum)_rel_intergenic: $(cns_cksum)_rel_snps $(cns_cksum)_snps $(cns_cksum)_masked_ids @echo "# $$(date): Getting intergenic regions" $(strip $(call get_intergenic_regions,$@,$<,$(word 2,$^))) @echo "# $$(date): Filtering masked intergenic regions" $(MYSQL_BATCH) -e "$(call filter_masked_regions,$@,$(word 3,$^))" touch $@ $(cns_cksum)_intergenic_assoc: $(cns_cksum)_snps $(cns_cksum)_rel_intergenic @echo "# $$(date): Classifying intergenic relations" $(MYSQL_CMD) -e "$(strip $(call classify_intergenic_relations,$@,$<,$(word 2,$^)))" touch $@ $(cns_cksum)_offsets_snps: $(cns_cksum)_snps $(cns_cksum)_rel_snps @echo "# $$(date): Calculating translation start offsets..." $(MYSQL_CMD) -e "$(strip $(call make_offsets_table,$@))" $(MYSQL_CMD) -e "$(strip $(call calc_tr_offsets,$@,$<,$(word 2,$^),start,id))" touch $@ $(cns_cksum)_codons : $(cns_cksum)_offsets_snps $(cns_cksum)_snps @echo "# $$(date): Getting codons affected by point mutations..." $(MYSQL_CMD) -e "$(strip $(call get_snp_codons,$@,$<,$(word 2,$^)))" touch $@ # All of these commands insert into 'marked' table. And since MySQL # inserts are sequential, there is no point in dividing up these commands for # parallelization. $(cns_cksum)_marked: $(cns_cksum)_snps $(cns_cksum)_rel_snps \ $(cns_cksum)_codons $(cns_cksum)_rel_intergenic $(cns_cksum)_intergenic_assoc \ $(pileup_cksum)_uncovered $(pileup_cksum)_offsets_uncovered $(MYSQL_CMD) -e "$(strip $(call make_annotations_table,$@))" @echo "# $$(date): Finding exonic indels ..." $(MYSQL_CMD) -e "$(strip $(call load_exonic_indels,$@,$<,$(word 2,$^)))" @echo "# $$(date): Finding coding variants ..." $(MYSQL_CMD) -e "$(strip $(call load_coding_annotations,$@,$(word 3,$^)))" @echo "# $$(date): Finding noncoding variants ..." $(MYSQL_CMD) -e "$(strip $(call load_noncoding_annotations,$@,$<,$(word 2,$^)))" @echo "# $$(date): Finding splice site variants ..." $(MYSQL_CMD) -e "$(strip $(call load_splice_annotations,$@,$<,$(word 2,$^)))" @echo "# $$(date): Finding intergenic and intronic variants ..." $(MYSQL_CMD) -e "$(strip $(call load_intergenic_annotations,$@,$(word 5,$^)))" @echo "# $$(date): Finding uncovered regions in genes ..." $(MYSQL_CMD) -e "$(strip $(call load_uncovered_gene_annotations,$@,$(word 6,$^),$(word 7,$^)))" @echo "# $$(date): Variants found:" @echo "$$($(MYSQL_CMD) --table -e 'select class, count(distinct id) number_variants from $@ group by class')" touch $@ $(cns_cksum)_combined: $(cns_cksum)_snps $(pileup_cksum)_uncovered \ $(pileup_cksum)_pileup $(cns_cksum)_marked @echo "# $$(date): Combining results ..." $(MYSQL_CMD) -e "$(strip $(call combine_all,$(per_locus_combined),$@,$<,$(word 3,$^),$(word 4,$^),$(word 2,$^)))" touch $@ $(full_results_dir)/$(outfile_basename)_grouped.txt: $(cns_cksum)_combined @echo "# $$(date): Writing results to grouped file ..." $(MYSQL_BATCH) --column-names -e "$(strip $(call retrieve_grouped,$<))" > $@ @echo " $$(wc -l $@ | cut -f 1 -d ' ') lines written." $(full_results_dir)/$(outfile_basename)_flat.txt: $(cns_cksum)_combined @echo "# $$(date): Writing results to flat file ..." $(MYSQL_BATCH) --column-names -e "select * from $<" > $@ @echo " $$(wc -l $@ | cut -f 1 -d ' ') lines written." .PHONY: snp_read_counts snp_read_counts: $(full_results_dir)/$(outfile_basename)_snp_read_counts.txt $(full_results_dir)/$(outfile_basename)_snp_read_counts.txt: $(pileup_cksum)_pileup @echo "# $$(date): Writing snp read counts ..." $(MYSQL_BATCH) --column-names -e "$(strip $(call known_snp_mixin_ratio_table,$<))" > $@ @echo " $$(wc -l $@ | cut -f 1 -d ' ') snp read count lines written." drop_tables := $(addprefix $(cns_cksum)_,snps rel masked_ids rel_intergenic intergenic_assoc \ offsets codons marked combined) drop_tables := $(drop_tables) $(pileup_cksum)_pileup drop_tables_list := $(subst $(space),$(comma),$(drop_tables)) .PHONY: clean_tables clean_tables: @echo "# $$(date): Cleaning intermediate data ..." $(MYSQL_CMD) -e "drop table if exists $(drop_tables_list)" rm -f $(drop_tables)