# The makefile for a performing a MAQGene run

SHELL = /bin/bash

# $(info from make: $(MYSQL_BATCH),  $(shell echo "from the shell: $$MYSQL_BATCH"))

include commands.mk

outfile_prefix_path := $(full_results_dir)/$(outfile_basename)
linked_txt_file_stubs = uncovered coverage pileup log check unmapped
linked_txt_file_list := $(patsubst %,$(outfile_prefix_path)_%.txt,$(linked_txt_file_stubs))


# meta script to determine whether snp_changes table exists
#has_snp_changes_table := $(shell $(MYSQL_BATCH) -e "$(strip $(call has_snp_changes_table))")
has_snp_changes_table := 1

.PHONY: all
all: $(full_results_dir)/$(outfile_basename)_grouped.txt
all: $(full_results_dir)/$(outfile_basename)_flat.txt
all: $(linked_txt_file_list)
ifeq ($(has_snp_changes_table),1)
all: snp_read_counts
else
endif

.PHONY: clean_frontend_files
clean_frontend_files:
	@echo "# $$(date): Removing frontend files in case this is a duplicate run ..."
	rm -f ${outfile_prefix_path}_{grouped.txt,flat.txt}
	rm -f $(linked_txt_file_list)


#make TOTAL_LINES a target

split_extensions := $(strip $(shell for e in $$(seq 0 $$(($(NUM_CHUNKS) - 1))); \
	do printf "%05i " $$e; done))

map_files := $(patsubst %,$(map_cksum).%.map,$(split_extensions))

define per_locus_column_def
ifnull($(PER_LOCUS_TABLE).$(col),'-missing-') per_locus_$(col),
endef

locus_extra_columns := $(strip $(shell $(MYSQL_BATCH) -e "$(strip $(call get_locus_extra_columns))"))
per_locus_combined := $(foreach col,$(locus_extra_columns),$(per_locus_column_def))

# split first or second reads into sized chunks

define rename_fastq
	for stem in $(split_extensions); do \
	mv $(map_cksum).$(1).fastq.$$stem $(map_cksum).$$stem.$(1).fastq; \
	done
endef


define FASTQ_SPLIT_TEMPLATE
$(reads_cksum)_split$(1): $(addprefix $(MAQGENE_READS_DIR)/, $(fastq_file_set$(1)))

	@echo "# $$(date): Regrouping fastq reads into chunks of size $(CHUNK_SIZE)."
	split -l $(CHUNK_SIZE) -a 5 -d <(cat $$^) $(map_cksum).$(1).fastq.
	$$(call rename_fastq,$(1))
	touch $$@

fastq_merge_set$(1) := $(patsubst %,$(map_cksum).%.$(1).fastq,$(split_extensions))

$$(fastq_merge_set$(1)) : $(reads_cksum)_split$(1)
endef

ifndef fastq_file_list2
fastq_file_set1 := $(subst $(comma),$(space),$(fastq_file_list1))
$(eval $(call FASTQ_SPLIT_TEMPLATE,1))
else
fastq_file_set1 := $(subst $(comma),$(space),$(fastq_file_list1))
$(eval $(call FASTQ_SPLIT_TEMPLATE,1))
fastq_file_set2 := $(subst $(comma),$(space),$(fastq_file_list2))
$(eval $(call FASTQ_SPLIT_TEMPLATE,2))
endif

.INTERMEDIATE : $(patsubst %,$(map_cksum).%.map,$(split_extensions))
.INTERMEDIATE : $(patsubst %,$(map_cksum).%.mismatch,$(split_extensions))
.INTERMEDIATE : $(patsubst %,$(map_cksum).%.unmapped,$(split_extensions))
.INTERMEDIATE : $(patsubst %,$(map_cksum).%.1.fastq,$(split_extensions))
.INTERMEDIATE : $(patsubst %,$(map_cksum).%.2.fastq,$(split_extensions))
.INTERMEDIATE : $(reads_cksum)_split1 $(reads_cksum)_split2

# convert to bfq
$(map_cksum).%.bfq: $(map_cksum).%.fastq
	@echo "# $$(date): Converting fastq files to bfq ..."
	$(MAQGENE_EXE_DIR)/maq sol2sanger $< /dev/stdout | \
	$(MAQGENE_EXE_DIR)/maq fastq2bfq /dev/stdin $@


# map
ifdef fastq_file_list2
$(addprefix $(map_cksum).%.,map unmapped mismatch): $(map_cksum).%.1.bfq $(map_cksum).%.2.bfq
else
$(addprefix $(map_cksum).%.,map unmapped mismatch): $(map_cksum).%.1.bfq
endif
	@echo "# $$(date): Mapping file(s) $^"
	$(MAQGENE_EXE_DIR)/maq map $(map_parameters) -u $(<:1.bfq=unmapped) -H $(<:1.bfq=mismatch) \
	$(map_cksum).$*.map $(reference_genome) $^ 2> /dev/null


# merge map files
$(map_cksum).map: $(map_files)
	@echo "# $$(date): Merging all maps ..."
	$(MAQGENE_EXE_DIR)/maq mapmerge $@ $^


# merge *.unmapped files
$(map_cksum)_unmapped.txt: $(map_files:map=unmapped)
	@echo "# $$(date): Merging all *.unmapped files ..."
	cat $^ > $@


# merge *.mismatch files
$(map_cksum).mismatch: $(map_files:map=mismatch)
	@echo "# $$(date): Merging all *.mismatch files ..."
	cat $^ > $@
	@echo $(wc -l $@) " lines."

# mapcheck
$(map_cksum)_check.txt: $(map_cksum).map
	@echo "# $$(date): Running 'mapcheck' ..."
	$(MAQGENE_EXE_DIR)/maq mapcheck $(mapcheck_parameters) $(reference_genome) $< > $@ \
	2>/dev/null


# generate consensus
$(cns_cksum)_log.txt: $(cns_cksum).cns

$(cns_cksum).cns: $(map_cksum).map
	@echo "# $$(date): Generating consensus ..."
	$(MAQGENE_EXE_DIR)/maq assemble $(assemble_parameters) $@ \
	$(reference_genome) $< 2> $(cns_cksum)_log.txt


$(cns_cksum)_cq.txt: $(cns_cksum).cns
	@echo "# $$(date): Running maq command cns2fq ..."
	$(MAQGENE_EXE_DIR)/maq cns2fq $< > $@


$(cns_cksum)_view.txt: $(cns_cksum).cns
	@echo "# $$(date): Running maq command cns2view ..."
	$(MAQGENE_EXE_DIR)/maq cns2view $< > $@


$(cns_cksum)_aref.txt: $(cns_cksum).cns
	@echo "# $$(date): Running maq command cns2ref ..."
	$(MAQGENE_EXE_DIR)/maq cns2ref $< > $@


.PHONY: extra_files
extra_files: $(patsubst %,$(outfile_prefix_path)_%.txt,cq view aref)


# link backend map or cns files to front
$(outfile_prefix_path)%.txt: $(map_cksum)%.txt
	@echo "# $$(date): Linking backend file $< to $@"
	ln -fs $(MAQGENE_WORK_DIR)/$< $@

$(outfile_prefix_path)%.txt: $(cns_cksum)%.txt
	@echo "# $$(date): Linking backend file $< to $@"
	ln -fs $(MAQGENE_WORK_DIR)/$< $@

$(outfile_prefix_path)%.txt: $(pileup_cksum)%.txt
	@echo "# $$(date): Linking backend file $< to $@"
	ln -fs $(MAQGENE_WORK_DIR)/$< $@


#######################################################################
# pileup

pileup_symbol_order := 'AaCcGgTt,.'

$(pileup_cksum)_pileup.txt: $(map_cksum).map
	@echo "# $$(date): Creating pileup ..."
	$(MAQGENE_EXE_DIR)/maq pileup $(pileup_parameters) $(reference_genome) $< > $@


define awk_print_filter
BEGIN { OFS="\t" } { $$5=substr($$5,1,$(PILEUP_MAX_DISPLAY_DEPTH)); print $$0 }
endef

$(pileup_cksum)_known_snps: $(pileup_cksum)_pileup.txt
	$(MAQGENE_EXE_DIR)/filter_matching_lines $< "%s\t%i\t" \
	<($(MYSQL_BATCH) -e "$(strip $(call get_known_snp_coords))") "%s\t%i\n" si \
	| $(MAQGENE_EXE_DIR)/filter_maq_pileup $(pileup_symbol_order) 0 \
	> $@


$(pileup_cksum)_pileup: $(pileup_cksum)_pileup.txt $(pileup_cksum)_known_snps
	@echo "# $$(date): Filtering and loading pileup for analysis ..."
	$(MYSQL_CMD) -e "$(strip $(call make_pileup_table_cmd,$@))"
	cat $< \
	| $(MAQGENE_EXE_DIR)/filter_maq_pileup $(pileup_symbol_order) $(min_polymorphic_reads) \
	| awk 'BEGIN { OFS="\t" } { $$5=substr($$5,1,$(PILEUP_MAX_DISPLAY_DEPTH)); print $$0 }' \
	| $(MYSQL_CMD) -e "$(strip $(call load_pileup_table_cmd,$@))"

	cat $(word 2,$^) \
	| awk 'BEGIN { OFS="\t" } { $$5=substr($$5,1,$(PILEUP_MAX_DISPLAY_DEPTH)); print $$0 }' \
	| $(MYSQL_CMD) -e "$(strip $(call load_pileup_table_cmd,$@))"
	touch $@


#######################################################################
# Uncovered regions
#######################################################################

$(pileup_cksum)_uncovered.txt: $(pileup_cksum)_pileup.txt
	@echo "# $$(date): Getting uncovered regions ..."
	cut -f 1,2,4 $< \
	| $(MAQGENE_EXE_DIR)/get_uncovered_regions $(uncovered_region_window) \
	> $@

$(pileup_cksum)_uncovered: $(pileup_cksum)_uncovered.txt $(cns_cksum)_snps
	@echo "# $$(date): Loading uncovered regions into table ..."
	cat $< \
	| $(MYSQL_CMD) -e "$(strip $(call load_uncovered_table,$@,$(call get_next_id,id,$(word 2,$^))))"

	@echo "Uncovered region statistics:"
	@echo "$$($(MYSQL_CMD) --batch -e \
	'select dna chromosome, count(*) number_uncovered_regions, sum(end - start) total_uncovered_length \
	from $@ group by dna')"

	touch $@

$(pileup_cksum)_uncovered_rel: $(pileup_cksum)_uncovered
	@echo "# $$(date): Finding mRNA features overlapping uncovered regions ..."
	$(MYSQL_CMD) -e "$(strip $(call make_rel_position_table,$@))"
	$(strip $(call get_containing_regions,$@,$<,$(genomic_feature_list)))

	touch $@


$(pileup_cksum)_offsets_uncovered: $(pileup_cksum)_uncovered $(pileup_cksum)_uncovered_rel
	@echo "# $$(date): Calculating translation start offsets for uncovered regions..."
	$(MYSQL_CMD) -e "$(strip $(call make_offsets_table,$@))"
	$(MYSQL_CMD) -e "$(strip $(call calc_tr_offsets,$@,$<,$(word 2,$^),start,id))"
	$(MYSQL_CMD) -e "$(strip $(call calc_tr_offsets,$@,$<,$(word 2,$^),end,id))"
	touch $@


$(pileup_cksum)_coverage.txt: $(pileup_cksum)_pileup.txt
	@echo "# $$(date): Making coverage histogram ..."
	(echo -en "sequencing_depth\tnumber_of_bases\n"; \
	cut -f 4 $< \
	| $(MAQGENE_EXE_DIR)/pileup_histogram $(HISTOGRAM_MAX_DEPTH)) > $@


$(cns_cksum)_snps: $(cns_cksum).cns $(map_cksum).map $(pileup_cksum)_known_snps
	@echo "# $$(date): Extracting point mutants from consensus ..."
	$(MYSQL_CMD) -e "$(strip $(call make_snp_table,$@))"
	$(MAQGENE_EXE_DIR)/maq cns2snp $< \
	| cat -b | $(MYSQL_CMD) -e "$(strip $(call load_snp_table,$@))"

	@echo "# $$(date): Extracting indels from consensus ..."
	$(MAQGENE_EXE_DIR)/maq indelsoa $(reference_genome) $(word 2,$^) \
	| awk '{ if ($$5+$$6-$$4 >= $(INDEL_READS_MIN_DIF) && $$4 <= 1) { print $$0 } }' \
	| $(MYSQL_CMD) -e "$(strip $(call load_indels,$@))"

	@echo "# $$(date): Adding placeholders for known SNPs."
	cat $(word 3,$^) \
	| $(MYSQL_CMD) -e "$(strip $(call load_snp_placeholders,$@))"

	@echo "Found $$($(MYSQL_CMD) --raw --skip-column-names -e 'select count(*) from $@') variants."
	touch $@


$(cns_cksum)_rel_snps: $(cns_cksum)_snps
	@echo "# $$(date): Finding all genomic features overlapping variants..."
	$(MYSQL_CMD) -e "$(strip $(call make_rel_position_table,$@))"

	$(strip $(call get_containing_regions,$@,$<,$(genomic_feature_list)))
	touch $@


$(cns_cksum)_masked_ids: $(cns_cksum)_snps
	@echo "# $$(date): Getting masking regions"
	$(strip $(call get_masking_regions,$@,$<))
	touch $@


$(cns_cksum)_rel_intergenic: $(cns_cksum)_rel_snps $(cns_cksum)_snps $(cns_cksum)_masked_ids
	@echo "# $$(date): Getting intergenic regions"
	$(strip $(call get_intergenic_regions,$@,$<,$(word 2,$^)))
	@echo "# $$(date): Filtering masked intergenic regions"
	$(MYSQL_BATCH) -e "$(call filter_masked_regions,$@,$(word 3,$^))"
	touch $@


$(cns_cksum)_intergenic_assoc: $(cns_cksum)_snps $(cns_cksum)_rel_intergenic
	@echo "# $$(date): Classifying intergenic relations"
	$(MYSQL_CMD) -e "$(strip $(call classify_intergenic_relations,$@,$<,$(word 2,$^)))"
	touch $@


$(cns_cksum)_offsets_snps: $(cns_cksum)_snps $(cns_cksum)_rel_snps
	@echo "# $$(date): Calculating translation start offsets..."
	$(MYSQL_CMD) -e "$(strip $(call make_offsets_table,$@))"
	$(MYSQL_CMD) -e "$(strip $(call calc_tr_offsets,$@,$<,$(word 2,$^),start,id))"
	touch $@


$(cns_cksum)_codons : $(cns_cksum)_offsets_snps $(cns_cksum)_snps
	@echo "# $$(date): Getting codons affected by point mutations..."
	$(MYSQL_CMD) -e "$(strip $(call get_snp_codons,$@,$<,$(word 2,$^)))"
	touch $@


# All of these commands insert into 'marked' table.  And since MySQL
# inserts are sequential, there is no point in dividing up these commands for
# parallelization.
$(cns_cksum)_marked: $(cns_cksum)_snps $(cns_cksum)_rel_snps \
	$(cns_cksum)_codons $(cns_cksum)_rel_intergenic $(cns_cksum)_intergenic_assoc \
    $(pileup_cksum)_uncovered $(pileup_cksum)_offsets_uncovered

	$(MYSQL_CMD) -e "$(strip $(call make_annotations_table,$@))"

	@echo "# $$(date): Finding exonic indels ..."
	$(MYSQL_CMD) -e "$(strip $(call load_exonic_indels,$@,$<,$(word 2,$^)))"

	@echo "# $$(date): Finding coding variants ..."
	$(MYSQL_CMD) -e "$(strip $(call load_coding_annotations,$@,$(word 3,$^)))"

	@echo "# $$(date): Finding noncoding variants ..."
	$(MYSQL_CMD) -e "$(strip $(call load_noncoding_annotations,$@,$<,$(word 2,$^)))"

	@echo "# $$(date): Finding splice site variants ..."
	$(MYSQL_CMD) -e "$(strip $(call load_splice_annotations,$@,$<,$(word 2,$^)))"

	@echo "# $$(date): Finding intergenic and intronic variants ..."
	$(MYSQL_CMD) -e "$(strip $(call load_intergenic_annotations,$@,$(word 5,$^)))"

	@echo "# $$(date): Finding uncovered regions in genes ..."
	$(MYSQL_CMD) -e "$(strip $(call load_uncovered_gene_annotations,$@,$(word 6,$^),$(word 7,$^)))"

	@echo "# $$(date): Variants found:"
	@echo "$$($(MYSQL_CMD) --table -e 'select class, count(distinct id) number_variants from $@ group by class')"

	touch $@


$(cns_cksum)_combined: $(cns_cksum)_snps $(pileup_cksum)_uncovered \
	$(pileup_cksum)_pileup $(cns_cksum)_marked
	@echo "# $$(date): Combining results ..."
	$(MYSQL_CMD) -e "$(strip $(call combine_all,$(per_locus_combined),$@,$<,$(word 3,$^),$(word 4,$^),$(word 2,$^)))"
	touch $@


$(full_results_dir)/$(outfile_basename)_grouped.txt: $(cns_cksum)_combined
	@echo "# $$(date): Writing results to grouped file ..."
	$(MYSQL_BATCH) --column-names -e "$(strip $(call retrieve_grouped,$<))" > $@
	@echo " $$(wc -l $@ | cut -f 1 -d ' ') lines written."


$(full_results_dir)/$(outfile_basename)_flat.txt: $(cns_cksum)_combined
	@echo "# $$(date): Writing results to flat file ..."
	$(MYSQL_BATCH) --column-names -e "select * from $<" > $@
	@echo " $$(wc -l $@ | cut -f 1 -d ' ') lines written."


.PHONY: snp_read_counts
snp_read_counts: $(full_results_dir)/$(outfile_basename)_snp_read_counts.txt


$(full_results_dir)/$(outfile_basename)_snp_read_counts.txt: $(pileup_cksum)_pileup
	@echo "# $$(date): Writing snp read counts ..."
	$(MYSQL_BATCH) --column-names -e "$(strip $(call known_snp_mixin_ratio_table,$<))" > $@
	@echo " $$(wc -l $@ | cut -f 1 -d ' ') snp read count lines written."


drop_tables := $(addprefix $(cns_cksum)_,snps rel masked_ids rel_intergenic intergenic_assoc \
	offsets codons marked combined)

drop_tables := $(drop_tables) $(pileup_cksum)_pileup

drop_tables_list := $(subst $(space),$(comma),$(drop_tables))

.PHONY: clean_tables

clean_tables:
	@echo "# $$(date): Cleaning intermediate data ..."
	$(MYSQL_CMD) -e "drop table if exists $(drop_tables_list)"
	rm -f $(drop_tables)