################################################################################ # Defines the following, which are to be called with automatic variables $@, $< # and $^ in various combinations in 'makefile' # These commands are Make macros, expanded and then evaluated by the SHELL (/bin/bash) # define make_snp_table # define load_snp_table # define load_indels # define load_pileup_table_cmd # define make_annotations_table # define load_exonic_indels # define load_coding_annotations # define load_noncoding_annotations # define load_splice_annotations # define calc_tx_offset # define get_snp_codons # define get_intergenic_regions # define load_masked_query_sql # define get_masking_regions # define filter_masked_regions # define classify_intergenic_relations # define load_intergenic_annotations # define make_rel_position_table # define get_containing_regions # define combine_all # define retrieve_grouped empty := space := $(empty) $(empty) comma := , define newline $(empty) $(empty) endef define quote_list $(subst $(space),$(comma),$(patsubst %,'%',$(strip $(1)))) endef define print_nonempty_file $(shell if [ -s $(1) ]; then echo -en $(1); fi) endef # required defined variables # BFA_FILE # MAQGENE_GENOME_DIR SPECIES := $(basename $(BFA_FILE)) FEATURE_TABLE := $(SPECIES)_features SNP_CHANGES_TABLE := $(SPECIES)_snp_changes GENE_DNA_TABLE := $(SPECIES)_coding_dna CODON_TABLE := $(SPECIES)_genetic_code DNA_BOUNDS_FILE := $(MAQGENE_GENOME_DIR)/$(SPECIES).dnas PER_LOCUS_TABLE := $(SPECIES)_per_locus GENE_NAMES_TABLE := $(SPECIES)_gene_names ################################################################################ # variant table ################################################################################ define make_snp_table drop table if exists $(1); create table $(1) ( id int(10) NOT NULL auto_increment, dna char(40) NOT NULL, start int(10) NOT NULL, end int(10) NOT NULL, indel_size int(5) NOT NULL, variant_type enum('indel', 'point') NOT NULL, ref_base char NOT NULL, sample_base char NOT NULL, snp_score int(5) NOT NULL, read_depth int(5) NOT NULL, loci_multiplicity double(5,2) NOT NULL, mapping_quality int(5) NOT NULL, neighbor_quality int(5) NOT NULL, primary key (id), unique key (dna, start), key (indel_size) ) endef define load_snp_table load data local infile '/dev/stdin' into table $(1) (id, dna, @start, ref_base, sample_base, snp_score, read_depth, loci_multiplicity, mapping_quality, neighbor_quality) set start = @start - 1, end = @start, indel_size = 0, variant_type = 'point'; flush table $(1); endef define load_indels load data local infile '/dev/stdin' into table $(1) (dna, @start, indel_size, @num_spanning_reads, @num_left_reads, @num_right_reads, @junk) set id = NULL, start = @start - 1, end = @start - 1 + if (indel_size > 0, 0, -indel_size), ref_base = 'X', sample_base = 'X', variant_type = 'indel', snp_score = -1000, read_depth = if (@num_left_reads < @num_right_reads, @num_left_reads, @num_right_reads), loci_multiplicity = -1000, mapping_quality = -1000, neighbor_quality = -1000; flush table $(1); endef define load_snp_placeholders load data local infile '/dev/stdin' ignore into table $(1) (dna, @start, ref_base, read_depth, @junk) set id = NULL, start = @start - 1, end = @start - 1, variant_type = 'point', snp_score = -1000, loci_multiplicity = -1000, mapping_quality = -1000, neighbor_quality = -1000; flush table $(1); endef dna_enum := $(call quote_list,$(shell cut -f 2 $(DNA_BOUNDS_FILE))) genomic_feature_list := $(call quote_list,$(shell $(MYSQL_CMD) --batch --skip-column-names -e \ "select distinct feature from $(FEATURE_TABLE)")) define make_pileup_table_cmd drop table if exists $(1); create table $(1) ( dna enum($(dna_enum)) NOT NULL, start int(10) NOT NULL, ref_base char(1) NOT NULL, read_depth int(5) NOT NULL, sample_reads varchar($(PILEUP_MAX_DISPLAY_DEPTH)) NOT NULL, A_fwd int(5) NOT NULL, A_rev int(5) NOT NULL, C_fwd int(5) NOT NULL, C_rev int(5) NOT NULL, G_fwd int(5) NOT NULL, G_rev int(5) NOT NULL, T_fwd int(5) NOT NULL, T_rev int(5) NOT NULL, wt_fwd int(5) NOT NULL, wt_rev int(5) NOT NULL, primary key (dna, start) ); endef define load_pileup_table_cmd load data local infile '/dev/stdin' into table $(1) (dna, @start, ref_base, read_depth, sample_reads, A_fwd, A_rev, C_fwd, C_rev, G_fwd, G_rev, T_fwd, T_rev, wt_fwd, wt_rev) set start = @start - 1; flush table $(1) endef define make_annotations_table drop table if exists $(1); create table $(1) ( id int(10) NOT NULL, class varchar(100) NOT NULL, description varchar(100) NOT NULL, parent_feature varchar(100) NOT NULL, primary key (id, class, parent_feature) ); endef ############################################################################ # Classify exonic indels as frameshift or inframe. By convention, # indel_size > 0 means insertion. Same start and end position on reference # indel_size < 0 means deletion. End - start = size of deleted piece of DNA # For deletions, classifies based on the size of the piece of exon deleted # For insertions, the size of the insertion is used in its entirety. ############################################################################ define load_exonic_indels insert into $(1) select snp.id, if (snp.indel_size < 0, if (rel.overlap % 3 = 0, 'inframe', 'frameshift'), if (snp.indel_size % 3 = 0, 'inframe', 'frameshift')) class, if (rel.overlap != snp.end - snp.start, 'exon_boundary', 'none') description, feature.attribute parent_feature from $(2) snp join $(3) rel on (snp.id = rel.query_region_id) join $(FEATURE_TABLE) feature on (rel.target_region_id = feature.id) where feature.feature = 'exon' and snp.variant_type = 'indel'; endef ######################################################################## # Load the coding annotations # 4. Classify the transition as: # Silent (AA -> same AA), missense (AA -> Stop), # nonsense (AA -> diff AA), readthrough (stop -> AA) ######################################################################## define load_coding_annotations insert into $(1) select trans.id, if (reference_codons.amino = sample_codons.amino, 'silent', if (reference_codons.amino3 = 'stop', 'readthrough', if (sample_codons.amino3 = 'stop', 'premature_stop', if (trans.start = 'Y' and sample_codons.is_start != 'Y', 'non_start', 'missense') ) ) ) class, concat(ref_codon,'->',sam_codon,'[', reference_codons.amino3,'->',sample_codons.amino3,']') description, trans.gene from $(2) trans join $(CODON_TABLE) reference_codons on (trans.ref_codon = reference_codons.codon) join $(CODON_TABLE) sample_codons on (trans.sam_codon = sample_codons.codon); endef ############################################################# # Load non-coding annotations # All snps that overlap non-coding features # three_prime_UTR, five_prime_UTR, ncRNA, known variants # introns are considered non-genic and handled in the section # 'intergenic' regions ############################################################# define load_noncoding_annotations insert into $(1) select snp.id, feature.feature class, 'none' description, feature.attribute parent_feature from $(2) snp join $(3) rel on (snp.id = rel.query_region_id) join $(FEATURE_TABLE) feature on (rel.target_region_id = feature.id) where feature.feature not in ('exon', 'intron', 'mRNA'); endef ############################################################### # Load splice site annotations ############################################################### define load_splice_annotations insert into $(1) select snp.id, if (feature.strand = '+', if (snp.start - feature.start < 2, 'splice_donor', 'splice_acceptor'), if (snp.start - feature.start < 2, 'splice_acceptor', 'splice_donor')) class, 'none' description, feature.attribute parent_feature from $(2) snp join $(3) rel on (snp.id = rel.query_region_id) join $(FEATURE_TABLE) feature on (rel.target_region_id = feature.id) where feature.feature = 'intron' and rel.distance > -2; endef define make_offsets_table drop table if exists $(1); create table $(1) ( id int(10) NOT NULL, gene varchar(50) NOT NULL, 5prime_offset int(10) NOT NULL, 3prime_offset int(10) NOT NULL, total_length int(10) NOT NULL, boundary_type char(20) NOT NULL, primary key (id, gene, boundary_type) ); endef # gives sum of exonic bases between gene boundaries and named feature bound # Arguments: # 1 output_table # 2 region_table # 3 rel_table # 4 field in region_table (start or end) # 5 id field in region_table (usually just 'id') # output_table: id, gene, 5prime_offset, 3prime_offset, total_length, boundary_type # assumes snp regions are associated to mRNA regions via 'rel' define calc_tr_offsets insert into $(1) select reg.$(5), gene.attribute gene, if(exon.strand = '+', if(reg.$(4) < min(exon.start), reg.$(4) - min(exon.start), sum(if(reg.$(4) < exon.end, reg.$(4), exon.end) - if(reg.$(4) < exon.start, reg.$(4), exon.start))), if(max(exon.end) < reg.$(4), max(exon.end) - reg.$(4), sum(if(exon.end < reg.$(4), reg.$(4), exon.end) - if(exon.start < reg.$(4), reg.$(4), exon.start))) ), if(exon.strand = '+', if(max(exon.end) < reg.$(4), max(exon.end) - reg.$(4), sum(if(exon.end < reg.$(4), reg.$(4), exon.end) - if(exon.start < reg.$(4), reg.$(4), exon.start))), if(reg.$(4) < min(exon.start), reg.$(4) - min(exon.start), sum(if(reg.$(4) < exon.end, reg.$(4), exon.end) - if(reg.$(4) < exon.start, reg.$(4), exon.start)))), sum(exon.end - exon.start) total_length, '$(4)' boundary_type from $(2) reg join $(3) rel on (reg.$(5) = rel.query_region_id) join $(FEATURE_TABLE) gene on (rel.target_region_id = gene.id) join $(FEATURE_TABLE) exon on (exon.attribute = gene.attribute) where exon.feature = 'exon' and gene.feature = 'mRNA' group by reg.$(5), gene.attribute endef #define get_next_id #100 #endef define get_next_id $(shell $(MYSQL_CMD) --skip-column-names --batch -e "select max($(1)) + 1 from $(2)") endef define load_uncovered_table drop table if exists $(1); create table $(1) ( id int(10) NOT NULL auto_increment, dna char(40) NOT NULL, start int(10) NOT NULL, end int(10) NOT NULL, primary key (id) ) auto_increment = $(2); load data local infile '/dev/stdin' into table $(1) (dna, start, end) endef # produces an annotation string for uncovered regions of protein-coding genes # uses intermediate result from 'calc_tx_offsets' # here 'startbound' is the table of offsets of the left (start) bound define load_uncovered_gene_annotations insert into $(1) select b.id, 'uncovered', if(b.5prime_offset = e.5prime_offset, concat('non-exonic, ',unc.start - gene.start,' to ',unc.end - gene.start,' bp into'), concat( if(b.5prime_offset < 0, concat(-b.5prime_offset, ' bp upstream'), if(b.3prime_offset < 0, concat(-b.3prime_offset, ' bp downstream'), concat('codon ',floor(b.5prime_offset / 3) + 1) ) ), ' to ', if(e.3prime_offset < 0, concat(-e.3prime_offset, ' bp downstream'), if(e.5prime_offset < 0, concat(-e.5prime_offset, ' bp upstream'), concat('codon ',ceiling(e.5prime_offset / 3) + 1) ) ), ' (', format(100 * (if(e.5prime_offset < 0,0,e.5prime_offset) - (if(b.5prime_offset < 0,0,b.5prime_offset)) ) / b.total_length,0 ), ' % of ', floor(b.total_length/3), ' codons)' ) ) description, b.gene from $(2) unc join $(3) b using (id) join $(3) e using (id, gene) join $(FEATURE_TABLE) gene on (e.gene = gene.attribute) where b.boundary_type = 'start' and e.boundary_type = 'end' and gene.strand = '+' and gene.feature = 'mRNA' union select b.id, 'uncovered', if(b.5prime_offset = e.5prime_offset, concat('non-exonic, ',gene.end - unc.end,' to ',gene.end - unc.start,' bp into'), concat( if(e.5prime_offset < 0, concat(-e.5prime_offset, ' bp upstream'), if(e.3prime_offset < 0, concat(-e.3prime_offset, ' bp downstream'), concat('codon ',floor(e.5prime_offset / 3) + 1) ) ), ' to ', if(b.3prime_offset < 0, concat(-b.3prime_offset, ' bp downstream'), if(b.5prime_offset < 0, concat(-b.5prime_offset, ' bp upstream'), concat('codon ',ceiling(b.5prime_offset / 3) + 1) ) ), ' (', format(100 * (if(b.5prime_offset < 0,0,b.5prime_offset) - (if(e.5prime_offset < 0,0,e.5prime_offset)) ) / b.total_length,0 ), ' % of ', floor(b.total_length/3), ' codons)' ) ) description, b.gene from $(2) unc join $(3) b using (id) join $(3) e using (id, gene) join $(FEATURE_TABLE) gene on (e.gene = gene.attribute) where b.boundary_type = 'start' and e.boundary_type = 'end' and gene.strand = '-' and gene.feature = 'mRNA' endef # load variant annotations into define get_snp_codons drop table if exists $(1); create table $(1) ( id int(10) NOT NULL, gene varchar(50) NOT NULL, ref_codon char(4) NOT NULL, sam_codon char(4) NOT NULL, start char(1) NOT NULL, primary key (id, gene) ); insert into $(1) select snp.id, off.gene, @ref_codon := substr(gene_dna.sequence, (floor(if(feature.strand = '+', off.5prime_offset, off.5prime_offset - 1 ) / 3 ) * 3 ) + 1, 3 ) ref_codon, insert(@ref_codon, (if(feature.strand = '+', off.5prime_offset, off.5prime_offset - 1) % 3) + 1, 1, if(feature.strand = '+', snp.sample_base, substr('ACGTTGCA', instr('ACGTTGCA', snp.sample_base)+4, 1) ) ) sam_codon, if((feature.strand = '+' && off.5prime_offset < 3) || (feature.strand = '-' && off.5prime_offset - 1 < 3), 'Y', 'N') start from $(2) off join $(3) snp on (off.id = snp.id) join $(GENE_DNA_TABLE) gene_dna using (gene) join $(FEATURE_TABLE) feature on (gene_dna.gene = feature.attribute and feature.start <= snp.start and feature.end >= snp.end) where snp.variant_type = 'point' and feature.feature = 'exon' and off.boundary_type = 'start' and off.5prime_offset >= 0 and off.3prime_offset >= 0; endef ############################################################### # Intergenic regions ############################################################### # Retrieve all variants in the proximity of an mRNA. 'proximity' is # defined as having an interval distance in the range [0, max] with # some mRNA. NOTE: in general there is a many-to-many relationship # between variants and mRNAs. Each association is treated # independently. # find all regions that are within a certain distance range of the span # of mRNA. intergenic_target_sql := select id, '$(SPECIES)',dna,start,end,strand \ from $(FEATURE_TABLE) feature where feature.feature = 'mRNA' define get_intergenic_regions $(MYSQL_CMD) -e "drop table if exists $(1); create table $(1) like $(2)"; $(MAQGENE_EXE_DIR)/associate_regions -b $(max_gene_radius) -c -1000000 -m $(max_intervening_genes) -o >($(MYSQL_CMD) -e "load data local infile '/dev/stdin' into table $(1); flush table $(1)") -d $(DNA_BOUNDS_FILE) -p $(MAQGENE_GENOME_DIR) -q <($(MYSQL_BATCH) -e "select id, '$(SPECIES)', dna, start, end, '+' from $(3)") -t <($(MYSQL_BATCH) -e "$(intergenic_target_sql)") 1>/dev/null; endef # Get masked query ids, load them into a table define load_masked_query_sql drop table if exists $(1); create table $(1) (query_region_id int(10) NOT NULL, primary key (query_region_id)); load data local infile '/dev/stdin' ignore into table $(1) (@j1, query_region_id, @j3, @j4, @j5, @j6, @j7) endef # Find all such SNPs that are associated with a more important feature, such as # a SNP, exon, ncRNA, or other thing. (This is basically anything that is not # mRNA or intron. These will be filtered out from the 'intergenic associations # as found by 'intergenic_target_sql' masked_target_sql := select id,'$(SPECIES)',dna,start,end,strand from \ $(FEATURE_TABLE) feature where feature.feature not in ('mRNA', 'intron') define get_masking_regions $(MAQGENE_EXE_DIR)/associate_regions -b -1 -c -1000000 -m $(max_intervening_genes) -o >($(MYSQL_BATCH) -e "$(call load_masked_query_sql,$(1))") -d $(DNA_BOUNDS_FILE) -p $(MAQGENE_GENOME_DIR) -q <($(MYSQL_BATCH) -e "select id, '$(SPECIES)', dna, start, end, '+' from $(2)") -t <($(MYSQL_BATCH) -e "$(masked_target_sql)") 1>/dev/null; endef # Filter masked query ids from intergenic relative position table intergenic_rel_position_table := $(cns_cksum)_rel_intergenic define filter_masked_regions delete i.* from $(1) i join $(2) m using (query_region_id) endef # Classify intergenic relations intergenic_associations_table := $(cns_cksum)_intergenic define classify_intergenic_relations drop table if exists $(1); create table $(1) ( id INT(10) NOT NULL, target_region_id INT(10) NOT NULL, relation enum ('upstream', 'downstream', 'into') NOT NULL, distance INT(10) NOT NULL, num_regions_between INT(3) NOT NULL, attribute varchar(50) NOT NULL, primary key (id, target_region_id), key (id, attribute) ); insert into $(1) select snp.id, rel.target_region_id, if (rel.overlap > 0, 'into', if (feature.strand = '+', if (snp.end < feature.start, 'upstream', 'downstream'), if (feature.end < snp.start, 'upstream', 'downstream') ) ) relation, rel.distance, rel.num_regions_between, feature.attribute from $(2) snp join $(3) rel on (snp.id = rel.query_region_id) join $(FEATURE_TABLE) feature on (rel.target_region_id = feature.id) endef define conventionalize_genic_distance update $(1) set distance = -distance where relation in ('upstream','into') endef # Load intergenic annotations define load_intergenic_annotations insert into $(1) select id, 'nongenic' class, concat(if (relation in ('upstream','into'), - distance, distance),' ',relation) description, attribute from $(2) where ((relation = 'upstream' and num_regions_between = 0 and distance < $(umdd)) or (relation = 'upstream' and num_regions_between < $(max_intervening_genes) and distance < $(umid)) or (relation = 'downstream' and num_regions_between = 0 and distance < $(dmdd)) or (relation = 'downstream' and num_regions_between < $(max_intervening_genes) and distance < $(dmid)) or relation = 'into' ); endef define make_rel_position_table drop table if exists $(1); CREATE TABLE $(1) ( association_id INT(10) NOT NULL, query_region_id INT(10) unsigned NOT NULL, target_region_id INT(10) NOT NULL, distance INT(10) NOT NULL, overlap INT(10) NOT NULL, same_strand enum('SAME', 'OPP') NOT NULL, num_regions_between INT(3) NOT NULL, PRIMARY KEY (association_id), KEY (query_region_id), KEY (target_region_id) ) endef ################################################################### # produces a 'rel' table associating all query regions that overlap features ################################################################### define get_containing_regions $(MAQGENE_EXE_DIR)/associate_regions -b -1 -c -1000000 -m 0 -o >($(MYSQL_CMD) -e "load data local infile '/dev/stdin' into table $(1); flush table $(1)") -d $(DNA_BOUNDS_FILE) -p $(MAQGENE_GENOME_DIR) -q <($(MYSQL_BATCH) -e "select id, '$(SPECIES)', dna, start, end, '+' from $(2)") -t <($(MYSQL_BATCH) -e "select id, '$(SPECIES)', dna,start,end,strand from $(FEATURE_TABLE) \ where feature in ($(3))") 1>/dev/null endef define get_locus_extra_columns select column_name from INFORMATION_SCHEMA.columns where table_name = '$(PER_LOCUS_TABLE)' and column_name not in ('dna','start') endef # has primary key (id, class, parent_feature) i.e. there should be at # most one relationship between a given id for a class, parent_feature # combination define combine_all drop table if exists $(2); create table $(2) (primary key (id, class, description), key (dna, start)) as select snp.id, '$(outfile_basename)' mutant_strain, snp.dna, snp.start + 1 start, snp.end - snp.start length, $(1) snp.ref_base reference_base, snp.sample_base sample_base, snp.snp_score consensus_score, snp.loci_multiplicity, snp.mapping_quality, snp.neighbor_quality, (pile.wt_fwd + pile.wt_rev) number_wildtype_reads, (snp.read_depth - (pile.wt_fwd + pile.wt_rev)) number_variant_reads, snp.read_depth sequencing_depth, pile.sample_reads, snp.variant_type, snp.indel_size, class.class, class.description, group_concat(class.parent_feature order by class.parent_feature) parent_features from $(3) snp join $(4) pile using (dna,start) join $(5) class on (snp.id = class.id) left join $(PER_LOCUS_TABLE) using (dna,start) where (snp.variant_type = 'point' and (snp.snp_score >= $(min_snp_score) and snp.loci_multiplicity <= $(max_loci_multiplicity) and snp.neighbor_quality >= $(min_neighborhood_quality) and pile.read_depth >= $(min_total_depth) and ((pile.read_depth - pile.wt_fwd - pile.wt_rev) / pile.read_depth) >= $(min_fraction_polymorphic_reads)) or snp.variant_type = 'indel' ) group by id, class, description union select unc.id id, '$(outfile_basename)' mutant_strain, unc.dna, unc.start + 1 start, unc.end - unc.start length, $(1) '-' reference_base, '-' sample_base, 0 consensus_score, 0, 0, 0, 0 number_wildtype_reads, 0 number_variant_reads, 0 sequencing_depth, '' sample_reads, 'uncovered', 0, class.class, class.description, group_concat(class.parent_feature order by class.parent_feature) parent_features from $(6) unc join $(5) class on (unc.id = class.id) left join $(PER_LOCUS_TABLE) using (dna,start) group by id, class, description; alter table $(2) order by dna, start, length endef define ungrouped_column_list select concat(column_name,',') from INFORMATION_SCHEMA.columns where table_name = '$(1)' and column_name not in ('class','description','parent_features') endef define retrieve_grouped select $(shell $(MYSQL_BATCH) -e "$(call ungrouped_column_list,$(1))") group_concat(class) classes, group_concat(description) descriptions, group_concat(concat('{',parent_features,'}')) parent_features from $(1) group by id order by dna, start; endef define has_snp_changes_table select count(*) from INFORMATION_SCHEMA.tables where table_schema = '$(MYSQL_DB)' and TABLE_NAME = '$(SPECIES)_snp_changes' endef define known_snp_mixin_ratio_table select f.id, f.dna, f.start + 1 start, e.snp_name, e.snp_base, if(e.snp_base = 'a', p.A_fwd + p.A_rev, if (e.snp_base = 'c', p.C_fwd + p.C_rev, if (e.snp_base = 'g', p.G_fwd + p.G_rev, if (e.snp_base = 't', p.T_fwd + p.T_rev, -1)))) snp_reads, p.ref_base, p.wt_fwd + p.wt_rev wt_reads, p.read_depth, p.sample_reads from $(FEATURE_TABLE) f join $(SNP_CHANGES_TABLE) e on (f.attribute = e.snp_name) join $(1) p using (dna,start) endef define get_known_snp_coords select distinct dna, start + 1 from $(FEATURE_TABLE) where feature = 'SNP' order by dna, start endef