=pod 

=head1 NAME

Bio::EnsEMBL::Hive::RunnableDB::Funcgen::AnnotateRegulatoryFeatures

=head1 DESCRIPTION

'AnnotateRegulatoryFeatures'
 implements all Damian's regulatory annotation scripts in one single process

=cut

package Bio::EnsEMBL::Funcgen::RunnableDB::AnnotateRegulatoryFeatures;

use warnings;
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; 
use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);

use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Annotation');

use constant  NO_ROWS => '0E0';

sub fetch_input {   # fetch parameters...
  my $self = shift @_;
  
  $self->SUPER::fetch_input();

  my $cell_type = $self->param('cell_type') || throw "No cell type defined";
  $self->_cell_type($cell_type);

  return 1;
}

sub run {   # Check parameters and do appropriate database/file operations... 
  my $self = shift @_;

  #Create a database for this cell_type... maybe add this step to setup?
  my %wparams = %{$self->_workdb_params};
  my $dbparams = "-h".$wparams{'-host'}." -u".$wparams{'-user'}." -P".$wparams{'-port'}." -p".$wparams{'-pass'}; 
  #my $dbname = "annotation_".$self->_species."_".$self->_release."_".$self->_cell_type;
  my $ct = $self->_cell_type;
  $ct =~ s/\-/_/g;
  my $dbname = "annotation_".$self->_efgdba->dbc->dbname."_".$ct;

  system("mysql $dbparams -e 'drop database if exists ".$dbname."; create database ".$dbname."';") && throw "Could not create work database $dbname";
  $wparams{'-dbname'} = $dbname;

  eval{
       $self->_workdb(Bio::EnsEMBL::DBSQL::DBConnection->new( %wparams ));
  };
  if($@) { throw "Error creating the Work DB Connection: $@";  }    
  if(!$self->_workdb()){ throw "Could not create Work DB server connection"; }

  my $workdb = $self->_workdb;

  my $dump_dir = $self->_output_dir."/tmp_".$self->_cell_type."_".int(rand(100));

  open(REPORT,">".$self->_output_dir."/".$self->_cell_type.".report");

  _copy_data($self->_dnadba->dbc,$self->_efgdba->dbc,$workdb,$dump_dir);

  #Perhaps make a feature factory?... Each feature knows which data it needs...
  # ORDER MATTERS!!??
  _create_transcript_features($workdb);
  _create_repeat_features($workdb);
  _create_gene_features($workdb);
  _create_PolIII_features($workdb);
  _create_exon_features($workdb);
  _create_intergenic_features($workdb);
  _cage_ditag_transcript_tss($workdb);

  #Not in use at the moment...
  #&karyotype_features($dbh,$dbu);
  #&cpg_features($dbh,$dbu);

  #Process 
  _process_regulatory_features($self->_dnadba->dbc,$self->_efgdba->dbc,$workdb,$self->_cell_type);

  _generate_random_frags($workdb);

  #read from file or parameters...
  my @gen_feats;
  push @gen_feats,  "protein_coding_exon1_plus_enhancer";
  push @gen_feats,  "protein_coding_transcript_downstream_2500";
  push @gen_feats,  "protein_coding_single_exon_gene_plus_enhancer";
  push @gen_feats,  "protein_coding_intron1";
  push @gen_feats,  "protein_coding_gene_body";
  push @gen_feats,  "intergenic_2500";
  push @gen_feats,  "RNA_gene_single_exon_gene_plus_enhancer";
  # This one needs cage_ditag_transcript_tss
  push @gen_feats,  "tss_centred_5000";
  push @gen_feats,  "PolIII_transcribed_gene_plus_enhancer";

 _calculate_associations($workdb, $self->_output_dir."/".$self->_cell_type, \@gen_feats, $self->_species);

  #&clean_temp();

  close(REPORT);

  return 1;
}


sub write_output {  
  my $self = shift @_;

  return 1;

}

# Dumps data from core and other relevant databases and imports it into a working db
sub _copy_data {
  my ($dnadb, $efgdb, $workdb, $dump_dir) = (shift,shift,shift,shift);

  # Copy data from core
  system("mkdir -p ".$dump_dir) && throw "Error creating $dump_dir folder";

  my $dump_templ = "mysqldump --opt --skip-lock-tables -h ".$dnadb->host.
    " -u ".$dnadb->username.
      " -P ".$dnadb->port.
	" ".$dnadb->dbname.
	  ' %s '.
	    " > ${dump_dir}/".'%s'.".dump";
  
  my $load_templ = "mysql -h ".$workdb->host.
    " -u ".$workdb->username.
      " -P ".$workdb->port.
	" -p".$workdb->password.
	  " ".$workdb->dbname.
	    " < ${dump_dir}/".'%s'.".dump";
  
  my @core_tables = (
		     'transcript',
		     'transcript_stable_id',
		     'gene',
		     'gene_attrib',
		     'attrib_type',
		     'exon',
		     'exon_transcript',
		     'coord_system',
		     'meta_coord',
		     'seq_region',
		     'seq_region_attrib',
		     'assembly',
		     'seq_region_attrib',
		     'simple_feature',
		     'analysis',
		     'karyotype',
		     'xref',
		     'object_xref',
		     'external_db',
		     'translation',
		     'analysis_description'
		    );

  push @core_tables, ('repeat_feature','repeat_consensus');

  foreach my $table (@core_tables){
    
    #unless( $core_dbu->table_exists($table)){die ("ERROR: The core database does not contain table $table")}
    
    my $command = sprintf($dump_templ,$table,$table);
    system($command) && throw "Error running $command";
    $command = sprintf($load_templ,$table);
    system($command) && throw "Error running $command";
  }

  # copy data from funcgen
  $dump_templ = "mysqldump --opt --skip-lock-tables -h ".$efgdb->host.
    " -u ".$efgdb->username.
      " -P ".$efgdb->port;
  if($efgdb->password){
    $dump_templ .= " -p".$efgdb->password;
  }  
  $dump_templ .= " ".$efgdb->dbname.
    ' %s '.
      " > ${dump_dir}/".'%s'.".dump";
  
  my @funcgen_tables =( 'regulatory_attribute',
		     'feature_type',
		     'regulatory_feature',
		     'feature_set',
		     'data_set',
		     'supporting_set',
		     'external_feature',
		     'meta',
		     'annotated_feature'
		   );
  

  foreach my $table (@funcgen_tables){
    
    #Do some checking??
    #unless( $source_dbu->table_exists($table)){die ("ERROR: The funcgen database does not contain table $table")}
    
    my $command = sprintf($dump_templ,$table,$table);
    system($command) && throw "Error running $command";
    $command = sprintf($load_templ,$table);
    system($command) && throw "Error running $command";
  }
  
  system("rm -r -f ".$dump_dir) && throw "Error deleting $dump_dir folder";

}

sub _create_PolIII_features{
    my($dbc) = @_;

    my @sql;
    push @sql, "insert into PolIII_transcribed_gene select * from all_repeats where feature_type = 'PolIII_transcribed_repeat'";
    push @sql, "insert into PolIII_transcribed_gene select * from all_repeats where feature_type = 'tRNA_repeat'";

    push @sql, "drop table if exists PolIII_transcribed_gene_plus_enhancer";
    push @sql, "create table PolIII_transcribed_gene_plus_enhancer select  seq_region_id, seq_region_name,feature_id, 'PolIII_transcribed_gene_plus_enhancer' as feature_type,if(feature_strand = 1,feature_start-2500,feature_start) as feature_start, if(feature_strand = 1,feature_end,feature_end +2500) as feature_end, feature_strand from PolIII_transcribed_gene";

    push @sql, _col_types_and_indices('PolIII_transcribed_gene_plus_enhancer');

    _run_sql($dbc,@sql) or throw "cannot run sql";
}


sub _create_gene_features{
    my ($dbc) = @_;

    my @sql;
    push @sql,"drop table if exists protein_coding_gene";
    push @sql,"create table protein_coding_gene select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id,'protein_coding_gene' as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype in ('protein_coding')";
    push @sql,"alter table protein_coding_gene add index(seq_region_name)";
    push @sql,"alter table protein_coding_gene add index(seq_region_id)";


    push @sql,"drop table if exists pseudogene";
    push @sql,"create table pseudogene select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id,concat(biotype,'_gene') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype in ('pseudogene','repeat','retrotransposed')";
    push @sql,"alter table pseudogene add index(seq_region_name)";
    push @sql,"alter table pseudogene add index(seq_region_id)";


    push @sql,"drop table if exists RNA_gene";
    push @sql,"create table RNA_gene select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id,concat(biotype,'_gene') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype like '%RNA%' and t.biotype not like '%pseudogene%'";
    push @sql,"alter table RNA_gene add index(seq_region_name)";
    push @sql,"alter table RNA_gene add index(seq_region_id)";


    #This section is possibly not necessary...
    push @sql,"drop table if exists processed_transcript"; # used for QC
    push @sql,"create table processed_transcript select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id,concat(biotype,'_gene') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype in ('processed_transcript')";
    push @sql,"alter table processed_transcript add index(seq_region_name)";
    push @sql,"alter table processed_transcript add index(seq_region_id)";


    push @sql,"drop table if exists RNA_pseudogene";
    push @sql,"create table RNA_pseudogene select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id,concat(biotype,'_gene') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype like '%RNA%' and t.biotype like '%pseudogene%'";
    push @sql,"alter table RNA_pseudogene add index(seq_region_name)";
    push @sql,"alter table RNA_pseudogene add index(seq_region_id)";


    push @sql,"drop table if exists lincRNA_gene";
    push @sql,"create table lincRNA_gene select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id,concat(biotype,'_gene') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype = 'lincRNA' ";
    push @sql,"alter table lincRNA_gene add index(seq_region_name)";
    push @sql,"alter table lincRNA_gene add index(seq_region_id)";

    _run_sql($dbc,@sql) or throw "cannot run sql";

    @sql=();
    push @sql,"drop table if exists PolIII_transcribed_gene";
    push @sql,"create table PolIII_transcribed_gene select * from lincRNA_gene where 1=0"; # make empty table

    foreach my $desc ('Nuclear RNase P','RNase MRP','Vault','Y RNA','5S ribosomal RNA','7SK RNA',){
	push @sql,"insert into PolIII_transcribed_gene select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id, '$desc' as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.description like '$desc".'%'."'";
    }
    push @sql,"alter table PolIII_transcribed_gene add index(seq_region_name)";
    push @sql,"alter table PolIII_transcribed_gene add index(seq_region_id)";

    _run_sql($dbc,@sql) or throw "cannot run sql";

}

# creates tables storing transcript related features
sub _create_transcript_features{
    my ($dbc) = @_;

    my @sql;
    push @sql,"drop table if exists protein_coding_transcript";
    push @sql,"create table protein_coding_transcript select sr.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id,'protein_coding_transcript' as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from transcript t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype in ('protein_coding')";
    push @sql,"alter table protein_coding_transcript add index(seq_region_name)";
    push @sql,"alter table protein_coding_transcript add index(seq_region_id)";

    push @sql,"drop table if exists protein_coding_transcript_downstream_2500";
    push @sql,"create table protein_coding_transcript_downstream_2500 select sr.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id,'protein_coding_transcript_downstream_2500' as feature_type,if(seq_region_strand = 1, t.seq_region_end+1,t.seq_region_start -2500) as feature_start,if(seq_region_strand = 1,t.seq_region_end+2500,t.seq_region_start - 1) as feature_end, t.seq_region_strand as feature_strand from transcript t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype in ('protein_coding')";
    push @sql,"alter table protein_coding_transcript_downstream_2500 add index(seq_region_name)";
    push @sql,"alter table protein_coding_transcript_downstream_2500 add index(seq_region_id)";

    push @sql,"drop table if exists pseudogene_transcript";
    push @sql,"create table pseudogene_transcript select sr.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id,concat(biotype,'_transcript') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from transcript t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype in ('pseudogene','repeat','retrotransposed')";
    push @sql,"alter table pseudogene_transcript add index(seq_region_name)";
    push @sql,"alter table pseudogene_transcript add index(seq_region_id)";

    push @sql,"drop table if exists RNA_transcript";
    push @sql,"create table RNA_transcript select sr.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id,concat(biotype,'_transcript') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from transcript t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype like '%RNA%' and t.biotype not like '%pseudogene%'";
    push @sql,"alter table RNA_transcript add index(seq_region_name)";
    push @sql,"alter table RNA_transcript add index(seq_region_id)";

    push @sql,"drop table if exists RNA_pseudogene_transcript";
    push @sql,"create table RNA_pseudogene_transcript select sr.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id,concat(biotype,'_transcript') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from transcript t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype like '%RNA%' and t.biotype like '%pseudogene%'";
    push @sql,"alter table RNA_pseudogene_transcript add index(seq_region_name)";
    push @sql,"alter table RNA_pseudogene_transcript add index(seq_region_id)";

    _run_sql($dbc,@sql) or throw "Couldn't create transcript features";

}

#for repeats on top level sequence
sub _create_repeat_features{
    my($dbc)=@_;

    my @sql;
    my $temp1 = "tmp_".int(rand(1000));
    # we lose a few mappings because some mappings are on parts of the contigs
    # which are not included in the assembly
    push @sql,"drop table if exists $temp1";
    
        my $q= "create table $temp1
        select f.*,
               f.seq_region_start as chr_start,
               f.seq_region_end as chr_end,
               f.seq_region_id as chromosome_id,
               asm_sr.name as chr_name 
from repeat_feature f,
     analysis an,
     seq_region asm_sr
where f.analysis_id = an.analysis_id
and an.logic_name = 'RepeatMask'
and f.seq_region_id = asm_sr.seq_region_id ";

	$q =~ tr/\n/ /;
	push @sql, $q;

	push @sql,"alter table $temp1 add index(repeat_consensus_id)";
        # now add the repeat type 
        push @sql,"drop table if exists all_repeats";
        push @sql,"create table all_repeats select t1.chromosome_id as seq_region_id,t1.chr_name as seq_region_name,repeat_feature_id as feature_id,concat(rc.repeat_class,'_repeat') as feature_type,chr_start as feature_start,chr_end as feature_end , 0 as feature_strand from $temp1 t1, repeat_consensus rc where rc.repeat_consensus_id = t1.repeat_consensus_id ";
        # we want the U6 snRNA, 5S rRNA, 7SLRNA etc separately for pol3 transcribed regions
        push @sql,"insert into all_repeats select t1.chromosome_id as seq_region_id,t1.chr_name as seq_region_name,repeat_feature_id as feature_id,'PolIII_transcribed_repeat' as feature_type,chr_start as feature_start,chr_end as feature_end , 0 as feature_strand from $temp1 t1, repeat_consensus rc where rc.repeat_consensus_id = t1.repeat_consensus_id and rc.repeat_name in ('U6','5S','7SLRNA','7SK') ";

	push @sql, "alter table all_repeats add index(feature_type)";

        push @sql,"drop table $temp1";

    
    _run_sql($dbc,@sql) or throw "cannot run sql";

    my $aref = ['Satellite_repeat','Satellite/centr_repeat','LTR/ERV1_repeat','LTR/MaLR_repeat','SINE/Alu_repeat' ];
    @sql = ();
    foreach my $type (@$aref){
        @sql = ();
	my $table = $type;
	$table =~ tr/\//_/;
	$table =~ tr/-/_/;
	$table =~ tr/?/_/;
	push @sql,"drop table if exists $table";
	push @sql,"create table $table select * from all_repeats where feature_type = '$type'";
	push @sql,"update $table set feature_type = '$table'";
        push @sql, _col_types_and_indices("$table");
        _run_sql($dbc,@sql) or throw "cannot run sql for $type";
	#print "$table\t$table\t$table\t0\t0\t0\n";
    }

}

sub _create_exon_features{
    my($dbc)=@_;
    my @sql;

    my $temp1 = "tmp_".int(rand(1000))."_1";
    # denormalise exon, transcript and exon_transcript
    push @sql, "drop table if exists $temp1";
    push @sql, "create table $temp1 select e.*,et.rank,et.transcript_id,t.biotype from exon e, exon_transcript et, transcript t where e.exon_id = et.exon_id and et.transcript_id = t.transcript_id order by et.transcript_id,e.seq_region_start";
    push @sql, "alter table $temp1 add index(transcript_id)";

    # get rank of last exon for each transcript
    my $temp2 = "tmp_".int(rand(1000))."_2";
    push @sql, "drop table if exists $temp2";
    push @sql, "create table $temp2 select transcript_id,max(rank) as max_rank from $temp1 group by transcript_id";
    push @sql, "alter table $temp2 add index(transcript_id)";
    my $temp3 = "tmp_".int(rand(1000))."_3";
    push @sql, "drop table if exists $temp3";
    push @sql, "create table $temp3 select t1.*,t2.max_rank from $temp1 t1, $temp2 t2 where t1.transcript_id = t2.transcript_id";
    push @sql, "drop table if exists $temp2";
    push @sql, "drop table if exists $temp1";
    push @sql, "alter table $temp3 add index(transcript_id)";
    push @sql, "alter table $temp3 add index(max_rank)";
    push @sql, "alter table $temp3 add index(rank)";
    push @sql, "alter table $temp3 add index(seq_region_id)";
    

    # get first intron for each transcript
    my $temp4 =  "tmp_".int(rand(1000))."_4";
    push @sql, "create table $temp4 select t3.seq_region_id,sr.name as seq_region_name,t3.transcript_id as feature_id, concat(t3.biotype,'_intron1') as feature_type,if(t3.seq_region_strand = 1,t3.seq_region_end+1,b.seq_region_end+1) as feature_start, if(t3.seq_region_strand = 1,b.seq_region_start-1,t3.seq_region_start -1) as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, $temp3 b, seq_region sr where t3.seq_region_id = sr.seq_region_id and t3.transcript_id = b.transcript_id and t3.rank = 1 and b.rank = 2"; 
    push @sql,"drop table if exists intron1";
    push @sql,"alter table $temp4 rename as intron1";
    push @sql,_col_types_and_indices("intron1");
    push @sql,_split_by_biotype("intron1");
   _run_sql($dbc,@sql);

    # promoter defined as 500 bp upstream
    push @sql, "drop table if exists exon1_plus_promoter";
    push @sql, "create table exon1_plus_promoter select t3.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id, concat(t3.biotype,'_exon1_plus_promoter') as feature_type,if(seq_region_strand = 1,t3.seq_region_start-500,seq_region_start) as feature_start, if(seq_region_strand = 1,seq_region_end,seq_region_end +500) as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, seq_region sr where t3.seq_region_id = sr.seq_region_id and t3.rank = 1";
    push @sql,_col_types_and_indices("exon1_plus_promoter");
    push @sql,_split_by_biotype("exon1_plus_promoter");

    
    # enhancer defined as 2500 bp upstream
    push @sql, "drop table if exists exon1_plus_enhancer";
    push @sql, "create table exon1_plus_enhancer select t3.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id, concat(t3.biotype,'_exon1_plus_enhancer') as feature_type,if(seq_region_strand = 1,t3.seq_region_start-2500,seq_region_start) as feature_start, if(seq_region_strand = 1,seq_region_end,seq_region_end +2500) as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, seq_region sr where t3.seq_region_id = sr.seq_region_id and t3.rank = 1";
    push @sql,_col_types_and_indices("exon1_plus_enhancer");
    push @sql,_split_by_biotype("exon1_plus_enhancer");

    push @sql, "drop table if exists single_exon_gene";
    push @sql, "create table single_exon_gene select t3.seq_region_id,sr.name as seq_region_name,exon_id as feature_id, concat(t3.biotype, '_single_exon_gene') as feature_type,t3.seq_region_start as feature_start, t3.seq_region_end as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, seq_region sr where t3.seq_region_id = sr.seq_region_id and t3.max_rank = 1";
    push @sql,_col_types_and_indices("single_exon_gene");    
    push @sql,_split_by_biotype("single_exon_gene");

    push @sql, "drop table if exists single_exon_gene_plus_enhancer";
    push @sql, "create table single_exon_gene_plus_enhancer select  t3.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id, concat(t3.biotype,'_single_exon_gene_plus_enhancer') as feature_type,if(seq_region_strand = 1,t3.seq_region_start-2500,seq_region_start) as feature_start, if(seq_region_strand = 1,seq_region_end,seq_region_end +2500) as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, seq_region sr where t3.seq_region_id = sr.seq_region_id and t3.max_rank = 1";
    push @sql,_col_types_and_indices("single_exon_gene_plus_enhancer");    
    push @sql,_split_by_biotype("single_exon_gene_plus_enhancer");

    # gene body is all exons and introns except the first of each
    push @sql, "drop table if exists gene_body";
    push @sql, "create table gene_body select t3.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id,  concat(t3.biotype,'_gene_body') as feature_type,min(t3.seq_region_start) as feature_start, max(t3.seq_region_end) as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, seq_region sr where t3.seq_region_id = sr.seq_region_id and t3.max_rank != 1 and t3.rank != 1 group by transcript_id";
    push @sql,_col_types_and_indices("gene_body");  
    push @sql,_split_by_biotype("gene_body");


    push @sql, "drop table if exists exon_plus_flanks_500";
    push @sql, "create table exon_plus_flanks_500 select distinct t3.seq_region_id,sr.name as seq_region_name,exon_id as feature_id, concat(t3.biotype,'_exon_plus_flanks_500') as feature_type,t3.seq_region_start-500 as feature_start, seq_region_end + 500 as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, seq_region sr where t3.seq_region_id = sr.seq_region_id ";
    push @sql,_col_types_and_indices("exon_plus_flanks_500");    
    push @sql,_split_by_biotype("exon_plus_flanks_500");


    push @sql, "drop table if exists $temp3";

    _run_sql($dbc,@sql) or throw "cannot run sql";
    
}

sub _split_by_biotype{
    my($feat_name) = @_;

    my @sql;

    my $prots = "'protein_coding_$feat_name', 'IG_V_gene_$feat_name', 'IG_C_gene_$feat_name', 'IG_J_gene_$feat_name', 'IG_D_gene_$feat_name'";
    push @sql,"drop table if exists protein_coding_$feat_name";
    push @sql,"create table protein_coding_$feat_name select * from $feat_name where feature_type in ($prots)";
    push @sql,"update protein_coding_$feat_name set feature_type = 'protein_coding_$feat_name'"; 
    push @sql,_col_types_and_indices("protein_coding_$feat_name");

    push @sql,"drop table if exists RNA_gene_$feat_name";
    push @sql,"create table RNA_gene_$feat_name select * from $feat_name where feature_type like '%RNA_".$feat_name."'";
    push @sql,_col_types_and_indices("RNA_gene_$feat_name");

    if($feat_name eq 'exon1_plus_enhancer'){
	push @sql,"drop table if exists snRNA_gene_$feat_name";
	push @sql,"create table snRNA_gene_$feat_name select * from $feat_name where feature_type like '%snRNA_".$feat_name."'";
	push @sql,"update snRNA_gene_$feat_name set feature_type = 'snRNA_gene_$feat_name'";
	push @sql,_col_types_and_indices("snRNA_gene_$feat_name");

	push @sql,"drop table if exists snoRNA_gene_$feat_name";
	push @sql,"create table snoRNA_gene_$feat_name select * from $feat_name where feature_type like '%snoRNA_".$feat_name."'";
	push @sql,"update snoRNA_gene_$feat_name set feature_type = 'snoRNA_gene_$feat_name'";
	push @sql,_col_types_and_indices("snoRNA_gene_$feat_name");

	push @sql,"drop table if exists miRNA_gene_$feat_name";
	push @sql,"create table miRNA_gene_$feat_name select * from $feat_name where feature_type like '%miRNA_".$feat_name."'";
	push @sql,"update miRNA_gene_$feat_name set feature_type = 'miRNA_gene_$feat_name'";
	push @sql,_col_types_and_indices("miRNA_gene_$feat_name");

	push @sql,"drop table if exists miscRNA_gene_$feat_name";
	push @sql,"create table miscRNA_gene_$feat_name select * from $feat_name where feature_type like '%miscRNA_".$feat_name."'";
	push @sql,"update miscRNA_gene_$feat_name set feature_type = 'miscRNA_gene_$feat_name'";
	push @sql,_col_types_and_indices("miscRNA_gene_$feat_name");

	push @sql,"drop table if exists rRNA_gene_$feat_name";
	push @sql,"create table rRNA_gene_$feat_name select * from $feat_name where feature_type like '%rRNA_".$feat_name."'";
	push @sql,"update rRNA_gene_$feat_name set feature_type = 'rRNA_gene_$feat_name'";
	push @sql,_col_types_and_indices("rRNA_gene_$feat_name");
    }

    my $pseuds = "'pseudogene_$feat_name','repeat_$feat_name','retrotransposed_$feat_name'";
    push @sql,"drop table if exists pseudogene_$feat_name";
    push @sql,"create table pseudogene_$feat_name select * from $feat_name where feature_type in ($pseuds)";
    push @sql,_col_types_and_indices("pseudogene_$feat_name");

    return @sql;
}


sub _cage_ditag_transcript_tss{
    my($dbc)=@_;

    my $dbh = $dbc->db_handle;

    my @sql;
    my $temp1 = "tmp_".int(rand(1000))."_1";
    push @sql, "drop table if exists $temp1";
    push @sql, "create table $temp1 select f.seq_region_id,sr.name as seq_region_name,if(f.seq_region_strand = 1,f.seq_region_start,f.seq_region_end) as feature_start,f.seq_region_strand as feature_strand from transcript f , seq_region sr where sr.seq_region_id = f.seq_region_id and f.biotype not like '%pseudogene%' and f.biotype not in('repeat','retrotransposed')";

    if($dbh->selectrow_array("select count(*) from information_schema.tables where table_schema ='".$dbc->dbname."' and table_name='ditag_feature'") > 0){
    #if($dbu->table_exists("ditag_feature")){
        push @sql, "insert into $temp1  select f.seq_region_id,sr.name as seq_region_name,if(f.seq_region_strand = 1,f.seq_region_start,f.seq_region_end) as feature_start,f.seq_region_strand from ditag_feature f , seq_region sr where sr.seq_region_id = f.seq_region_id";
    }

    my $temp2 = "tmp_".int(rand(1000))."_2";
    push @sql, "drop table if exists $temp2";
    push @sql, "create table $temp2 select distinct * from $temp1";

    # we may want to do clustering here to reduce the total number of features

    push @sql, "drop table if exists tss_upstream_500";
    push @sql, "create table tss_upstream_500 select seq_region_id,seq_region_name,if(feature_strand = -1,feature_start,feature_start-500) as feature_start,if(feature_strand = -1,feature_start+500,feature_start) as feature_end,feature_strand,'tss_upstream_500' as feature_type from $temp2";
push @sql,"alter table tss_upstream_500 add column feature_id int(10) not null auto_increment primary key";
    push @sql,_col_types_and_indices("tss_upstream_500");


    push @sql, "drop table if exists tss_downstream_500";
    push @sql, "create table tss_downstream_500 select seq_region_id,seq_region_name,if(feature_strand = -1,feature_start-500,feature_start) as feature_start,if(feature_strand = -1,feature_start,feature_start+500) as feature_end,feature_strand,'tss_downstream_500' as feature_type from $temp2";
push @sql,"alter table tss_downstream_500 add column feature_id int(10) not null auto_increment primary key";
    push @sql,_col_types_and_indices("tss_downstream_500");


    push @sql, "drop table if exists tss_centred_500";
    push @sql, "create table tss_centred_500 select seq_region_id,seq_region_name,feature_start-250 as feature_start,feature_start+250 as feature_end,feature_strand,'tss_centred_500' as feature_type from $temp2";
push @sql,"alter table tss_centred_500 add column feature_id int(10) not null auto_increment primary key";
    push @sql,_col_types_and_indices("tss_centred_500");


    push @sql, "drop table if exists tss_centred_5000";
    push @sql, "create table tss_centred_5000 select seq_region_id,seq_region_name,feature_start-2500 as feature_start,feature_start+2500 as feature_end,feature_strand,'tss_centred_5000' as feature_type from $temp2";
push @sql,"alter table tss_centred_5000 add column feature_id int(10) not null auto_increment primary key";
    push @sql,_col_types_and_indices("tss_centred_5000");

    # 
    _run_sql($dbh,@sql) or die;

    @sql = ();
    push @sql,"drop table if exists $temp1";
    push @sql,"drop table if exists $temp2";
    _run_sql($dbh,@sql) or die;

}

sub _create_intergenic_features{
    my ($dbc) = @_;

    my $dbh = $dbc->db_handle;

    # we process one seq_region ata a time
    # if we want to include more biotypes of gene then we should select from 
    # the gene table
    my $q = "select distinct seq_region_id from protein_coding_gene";
    my $regions_aref = $dbh->selectcol_arrayref($q);
    unless(@$regions_aref > 0){ throw "no data returned by :\n$q\n" }

    my @sql;
    my $temp1 = "tmp_".int(rand(1000));
    push @sql, "drop table if exists $temp1";
    # start and end are signed so we can use -ve numbers as flags
    push @sql, "create table $temp1 (seq_region_id int(10) unsigned,feature_start int(10), feature_end int(10))";
    _run_sql($dbh,@sql) or throw "error running sql";

    # first we get the regions which lie around/between the genes
    my @intergenics;
    foreach my $id (@$regions_aref){

	@intergenics = ();

	my $region_length = $dbh->selectrow_array("select length from seq_region where seq_region_id = $id");

	$q = "select feature_start,feature_end from protein_coding_gene where seq_region_id = $id order by feature_start";
	my $genes_aaref = $dbh->selectall_arrayref($q);
        unless(defined $genes_aaref){
            throw "query failed on:\n$q\n".$dbh->errstr;
	}

	my $last_end;
        if($genes_aaref->[0]->[0] == 1){
	    $last_end = $genes_aaref->[0]->[1];
	}else{
	    my @intergenic;
            $intergenic[0] = -1; # use -ve to flag chromosome end
            $intergenic[1] = $genes_aaref->[0]->[0] -1;
	    push @intergenics,\@intergenic;
            $last_end = $genes_aaref->[0]->[1];
	}

        
        for(my $i=1;$i< @$genes_aaref;$i++){

	    #print "gene $i ".join("\t",@{$genes_aaref->[$i]})."\n";
	    my @intergenic;
	    if($genes_aaref->[$i]->[0] > $last_end+1){
                $intergenic[0] = $last_end+1;
                $intergenic[1] = $genes_aaref->[$i]->[0] -1;
 	        push @intergenics,\@intergenic;
	    }

            if($genes_aaref->[$i]->[1] > $last_end){
                $last_end = $genes_aaref->[$i]->[1];
	    }

	}

        if($last_end < $region_length){
	    my @intergenic;
            $intergenic[0] = $last_end+1;
            $intergenic[1] = - $region_length; # use -ve to flag chromosome end
 	    push @intergenics,\@intergenic;
	}

	@sql = ();
        foreach my $aref (@intergenics){
	    my $q = "insert into $temp1 values($id,";
	    $q .= join(',',@$aref);
	    $q .= ")";
	    push @sql,$q;

	}
	_run_sql($dbh,@sql) or throw "Error running sql";
	
    }

    # now we create gene-distal regions by shortening the regions in $temp1
    _intergenic_variant($dbh,$temp1,2500);
    _intergenic_variant($dbh,$temp1,5000);
    _intergenic_variant($dbh,$temp1,10000);

    _run_sql($dbh,"drop table $temp1") or throw "cannot drop $temp1 table";

}


sub _intergenic_variant{
    my($dbh,$temp1,$dist) = @_;

    my $final_table = 'intergenic_'.$dist;
    my @sql;
    push @sql,"drop table if exists $final_table";
    my $temp2 = "tmp_".int(rand(1000));
    push @sql,"drop table if exists $temp2";
    push @sql,"create table $temp2 select seq_region_id,feature_start,feature_end from $temp1 where 1=0";
    # only add/subtract $dist if we are not at the chromosome end
    push @sql,"insert into $temp2 select seq_region_id,if(feature_start> 0,feature_start +$dist,feature_start * -1) as feature_start,if(feature_end > 0,feature_end - $dist, feature_end * -1) as feature_end from $temp1";
    
    push @sql,"delete from $temp2 where feature_end <= feature_start";
    push @sql,"create table $final_table select t2.seq_region_id,sr.name as seq_region_name,'$final_table' as feature_type,t2.feature_start,t2.feature_end,'0' as feature_strand from $temp2 t2, seq_region sr where t2.seq_region_id = sr.seq_region_id";
    push @sql,"alter table $final_table add column feature_id int(10) not null auto_increment primary key";

    push @sql,_col_types_and_indices($final_table);

    _run_sql($dbh,@sql) or throw "cannot run sql";
    _run_sql($dbh,"drop table $temp2") or throw "cannot drop $temp2 table";

}


sub _col_types_and_indices{
    my $table = shift;

    my @sql;

    push @sql,"alter table $table modify column feature_start int(10) unsigned";
    push @sql,"alter table $table modify column feature_end int(10) unsigned";
    push @sql,"alter table $table modify column feature_strand tinyint(2)";
    push @sql,"alter table $table modify column feature_type varchar(45)";
    push @sql,"alter table $table add index(seq_region_id)";
    push @sql,"alter table $table add index(seq_region_name)";
    push @sql,"alter table $table add index(feature_strand)";
    push @sql,"alter table $table add index(feature_start)";
    push @sql,"alter table $table add index(feature_end)";
    return @sql;
}

sub _process_regulatory_features {
  my ($dnadb, $efgdb, $workdb, $cell_name) = @_;
      
  _copy_with_rename($efgdb->db_handle,'seq_region',$workdb->db_handle,'func_seq_region'); 

  my @sql = ();

  push @sql, "drop table if exists seq_name_lookup";
  push @sql, "create table seq_name_lookup select distinct seq_region_id,name as seq_region_name from func_seq_region";
  push @sql, "alter table seq_name_lookup add index(seq_region_id)";
  push @sql, "alter table seq_name_lookup add index(seq_region_name)";
  _run_sql($workdb,@sql) or throw "cannot create seq_name_lookup";

  #add seq_region_name to regulatory_features
  @sql = ();
  my $temp1 = "tmp_".int(rand(1000));
  push @sql, "create table $temp1 select r.*,s.seq_region_name from regulatory_feature r,seq_name_lookup s where r.seq_region_id = s.seq_region_id";
  push @sql, "drop table regulatory_feature";
  push @sql, "alter table $temp1 rename as regulatory_feature";
  push @sql, "alter table regulatory_feature add index(feature_set_id)";
  _run_sql($workdb,@sql) or throw "cannot update regulatory_feature table";

  @sql = ();
  # remove all but the current set of features
  my $feature_set_name ='RegulatoryFeatures';
  if($cell_name){
    $feature_set_name .= ':'.$cell_name;
  } 
  
  my $kh = "select feature_set_id from feature_set where name = '$feature_set_name'";
  my $feature_set_id = $workdb->db_handle->selectrow_array($kh);
  if($feature_set_id == 0){ throw "Could not find feature_set_id for $feature_set_name"; }

  push @sql, "delete  from regulatory_feature where feature_set_id != $feature_set_id";
  push @sql, "alter table regulatory_feature add index(regulatory_feature_id)";
  push @sql, "alter table regulatory_feature add index(seq_region_start)";
  push @sql, "alter table regulatory_feature add index(seq_region_end)";
  push @sql, "alter table regulatory_feature add index(seq_region_name)";
  push @sql, "alter table regulatory_feature add index(feature_type_id)";
  _run_sql($workdb,@sql) or throw "could not delete irrelevant regulatory features";
  
  # we remove features which have very long 'whiskers'
  # these tend to contain multiple focus features which results in
  # a group of regulatory features all of which have the same attributes
  @sql = ();
  my $filtered_features_table = 'regulatory_features_filtered';
  push @sql,"drop table if exists $filtered_features_table";
  push @sql,"create table $filtered_features_table select *, 'regulatory_feature' as feature_type from regulatory_feature";
  push @sql, "alter table $filtered_features_table add index(seq_region_name)";
  push @sql, "alter table $filtered_features_table add index(feature_type_id)";
  push @sql, "alter table $filtered_features_table add index(feature_set_id)";
  push @sql, "alter table $filtered_features_table add index(regulatory_feature_id)";
  push @sql, "alter table $filtered_features_table add index(seq_region_start)";
  push @sql, "alter table $filtered_features_table add index(seq_region_end)";
  
  my $temp2 = "tmp_".int(rand(1000));
  my $temp3 = "tmp_".int(rand(1000));
  
  push @sql,"create table $temp3 select ra.regulatory_feature_id , af.* from regulatory_attribute ra, annotated_feature af where ra.attribute_feature_id = af.annotated_feature_id";

  push @sql,"create table $temp2 select j.regulatory_feature_id,count(*) as n_attribs,count(distinct j.feature_set_id) as n_attrib_types,max(j.seq_region_end)-min(j.seq_region_start) as len,min(j.seq_region_start)as attribs_start,max(j.seq_region_end) as attribs_end,rf.binary_string from $temp3 j, $filtered_features_table rf where j.regulatory_feature_id=rf.regulatory_feature_id group by regulatory_feature_id ";
  push @sql,"alter table $temp2 add index(regulatory_feature_id)";
  push @sql,"alter table $temp2 add index(attribs_start)";
  push @sql,"alter table $temp2 add index(attribs_end)";

  push @sql,"delete r from $filtered_features_table r, $temp2 t2 where r.regulatory_feature_id = t2.regulatory_feature_id and t2.len > 5000";

  push @sql, "alter table $filtered_features_table add index(seq_region_name)";
  push @sql, "alter table $filtered_features_table add index(feature_type_id)";
  push @sql, "alter table $filtered_features_table add index(feature_set_id)";
  push @sql, "alter table $filtered_features_table add index(regulatory_feature_id)";
  push @sql, "alter table $filtered_features_table add index(seq_region_start)";
  push @sql, "alter table $filtered_features_table add index(seq_region_end)";
  push @sql, "alter table $filtered_features_table add index(binary_string)";
  # we want to remove features which occur in the same place and have the same
  # attributes as one another, leaving only one representative
  push @sql, "delete b from $filtered_features_table a, $filtered_features_table b, $temp2 t2a, $temp2 t2b where a.seq_region_name=b.seq_region_name and b.binary_string = a.binary_string and a.regulatory_feature_id < b.regulatory_feature_id and a.regulatory_feature_id = t2a.regulatory_feature_id and b.regulatory_feature_id = t2b.regulatory_feature_id and t2a.attribs_start=t2b.attribs_start and t2a.attribs_end = t2b.attribs_end";

  push @sql,"drop table if exists $temp2";

  _run_sql($workdb,@sql) or throw "cannot run sql";

  # reg  features which overlap centromeric repeats tend to contain too many marks# which suggests there is something wrong with the mappings.
  # so we get rid of them

  @sql = ();
  push @sql,"drop table if exists $temp3";
  
  push @sql,"create table $temp3 select f.regulatory_feature_id from $filtered_features_table f, Satellite_centr_repeat s where s.seq_region_name = f.seq_region_name and s.feature_end >= f.seq_region_start and s.feature_start <= f.seq_region_end";
  push @sql,"delete f from $filtered_features_table f,$temp3 t where f.regulatory_feature_id = t.regulatory_feature_id";


  push @sql,"drop table if exists $temp3";
  
  push @sql,"create table $temp3 select f.regulatory_feature_id from $filtered_features_table f, Satellite_repeat s where s.seq_region_name = f.seq_region_name and s.feature_end >= f.seq_region_start and s.feature_start <= f.seq_region_end";
  push @sql,"delete f from $filtered_features_table f,$temp3 t where f.regulatory_feature_id = t.regulatory_feature_id";

  push @sql,"drop table if exists $temp3";
  
  # reg feats on the mitochondrial DNA are unlikely to use the same
  # 'histone code' as the rest of the genome. they might even be artefacts.
  # so we remove them (this should not be necessary as there should not be features on MT..
  push @sql, "delete from $filtered_features_table where seq_region_name = 'MT'";
  _run_sql($workdb,@sql) or throw "cannot run sql";

  @sql=();
  # we make the col names compatible with the genomic features in the analysis
  push @sql,"alter table $filtered_features_table change column seq_region_start feature_start int(10) unsigned";
  push @sql,"alter table $filtered_features_table change column seq_region_end feature_end int(10) unsigned";
  push @sql,"alter table $filtered_features_table change column seq_region_strand feature_strand tinyint(2)";
  push @sql,"alter table $filtered_features_table modify column feature_type varchar(45)";

  _run_sql($workdb,@sql) or throw "cannot run sql";

}

sub _generate_random_frags {
  my ($workdb) = @_;
  
  my $originals_table = "regulatory_features_filtered";
  my $outtable = "mockreg_features_filtered";

  my @sql;

  push @sql,"drop table if exists banned_region_table";
  push @sql,"create table banned_region_table ( seq_region_name varchar(10), feature_start int(11), feature_end int(11))";
  my $dbh = $workdb->db_handle;
  _run_sql($dbh,@sql) or throw "Cannot run sql";

  @sql = ();

  # first we need the lengths of the seq regions so we know the max 
  # tfrag_enc_end value we can generate
  my %enc_len;
  my $q = "select distinct seq_region_name from $originals_table";
  my $col_ref = $dbh->selectcol_arrayref($q) or throw $dbh->errstr;
  my $clause = " where name in('";
  $clause .= join("','",@$col_ref);
  $clause .= "')";

  $q="select name, length from seq_region" .$clause;
  my $aaref = $dbh->selectall_arrayref($q) or throw $dbh->errstr;
  foreach my $aref (@$aaref){
    $enc_len{$aref->[0]} = $aref->[1];
  }

  my $tmp_table = "tmp_".int(rand(1000));
  push @sql, "drop table if exists $tmp_table";
  push @sql, "create table $tmp_table (regulatory_feature_id int(10) unsigned,seq_region_name char(10),feature_start int(10) unsigned,feature_end int(10) unsigned,feature_strand tinyint(2),feature_type varchar(40))"; 

  # next we need the seq_region and length of each original feature
  my %frag_len;
  $q = "select seq_region_name, feature_end - feature_start +1 as length,feature_type,feature_strand,regulatory_feature_id from $originals_table order by (feature_end - feature_start +1) desc ";
  $aaref = $dbh->selectall_arrayref($q) or throw "failed on \n$q\n".$dbh->errstr;
  
  foreach my $aref (@$aaref){
    my $enc = $aref->[0];    
    my $len = $aref->[1];
    my $type = $aref->[2];
    my $ori = $aref->[3];
    my $id =  $aref->[4]; 
    
    my($start,$end) = _get_rand_region($dbh,$enc,$len,%enc_len);

    my $q = "insert into $tmp_table values(";
    $q .= $id.",";
    $q .= "\'".$enc."\'," ;
    $q .= $start.",";
    $q .= $end.",";
    $q .= "\'".$ori."\'," ;
    $q .= "\'".$type."\'" ;
    $q .= ")";

    push @sql,$q;

    #Add frag to the banned list (so it won't be generated again)...
    my $frag_q = "insert into banned_region_table values('$enc',$start,$end) ";
    _run_sql($dbh,$frag_q) or throw("failed on \n$q\n".$dbh->errstr);
    
  }
  
  push @sql, "drop table if exists $outtable";
  push @sql, "create table $outtable select t.*,o.binary_string from $tmp_table t, $originals_table o where t.regulatory_feature_id = o.regulatory_feature_id"; # select f.seq_region_name, f.feature_type,f.feature_start,f.feature_end ,f.feature_strand from temp f";
  
  push @sql, "alter table $outtable add index(regulatory_feature_id)";
  push @sql, "alter table $outtable add index(seq_region_name)";
  push @sql, "alter table $outtable add index(feature_start)";
  push @sql, "alter table $outtable add index(feature_type)";
  push @sql, "alter table $outtable add index(feature_strand)";

  push @sql, "drop table if exists banned_region_table";
  push @sql, "drop table if exists $tmp_table";

  _run_sql($dbh,@sql) or throw "Cannot run sql";


}

sub _calculate_associations{
  my ($dbc, $dumpdir, $gen_feats, $species) = @_;

  #Pass these as parameters?
  my $reg_feat_table = 'regulatory_features_filtered';
  my $mock_reg_table = 'mockreg_features_filtered';
  my $gen_feat_table = 'genomic_features';
  my $overlaps_table = 'reg_feat_gen_feat_overlaps';
  my $mock_olaps_table = 'mock_feat_gen_feat_overlaps';
  my $flags_table = 'regulatory_feature_association_flags';
  my $types_table = 'regulatory_features_classified';

  my $dbh = $dbc->db_handle;

  system("mkdir -p ".$dumpdir) && throw "Error creating $dumpdir folder";

  #@gen_feats is the list of feature classes relevant for classification...
  _create_gen_feats_table($dbh,$gen_feats,$gen_feat_table);
  

  # open a file for writing the results
  my $reg_results_file = $dumpdir.'/reg_results.tab';
  my $mock_results_file = $dumpdir.'/mock_results.tab';
  
  my $reg_ofh;
  my $mock_ofh;
  open($reg_ofh,"> $reg_results_file") or die "couldn't open $reg_results_file";
  open($mock_ofh,"> $mock_results_file") or die "couldn't open $mock_results_file";
  # filename for temp storage of genomic features
  my $gen_file = $dumpdir.'/genomic_features';

  my $q = "select distinct seq_region_name from $gen_feat_table";
  my $chrs_ref = $dbh->selectcol_arrayref($q);
  foreach my $chr (@$chrs_ref){

    my $command = "mysql -h ".$dbc->host." -u ".$dbc->username." -P ".$dbc->port." -p".$dbc->password." ".$dbc->dbname.' -BN -e"'.
		  "select feature_type_id,feature_start,feature_end from  genomic_features where seq_region_name = '$chr' order by feature_start,feature_end".'"'.
		    " > $gen_file";
    system($command) && throw "Error dumping genomics features for $chr";

    _overlap_analysis($dbh,$gen_file,$chr,$reg_feat_table,$reg_ofh,scalar(@{$gen_feats}));
    _overlap_analysis($dbh,$gen_file,$chr,$mock_reg_table,$mock_ofh,scalar(@{$gen_feats}));
  }
  close($reg_ofh);
  close($mock_ofh);

  _load_results($dbh,$reg_results_file,$overlaps_table,$reg_feat_table,$gen_feats);
  _load_results($dbh,$mock_results_file,$mock_olaps_table,$mock_reg_table,$gen_feats);

  my $pat_bits = $dbh->selectrow_array("select length(binary_string) from regulatory_features_filtered limit 1");
  my @pats = _sub_pats($dbh,$reg_feat_table,$pat_bits);
  
  my $patt_count_thresh = 100;

  my $overlap_results_file = $dumpdir.'/overlap_results.tab';
  my $overlap_ofh;
  open($overlap_ofh,"> $overlap_results_file") or throw "couldn't open $overlap_results_file";
  #for each pattern we want the number of reg feats with that pattern  
  # and then for each genomic feature we want the overlap counts for the
  # real and mock reg feats with that pattern
  my($reg_count,$real_count,$mock_count);
  foreach my $pat (@pats){
    $reg_count = $dbh->selectrow_array("select count(1) from  $overlaps_table where pattern like '$pat'");
    if($reg_count >= $patt_count_thresh){
      print $overlap_ofh "$pat\t$reg_count";
      foreach my $gen (@{$gen_feats}){
	$real_count = $dbh->selectrow_array("select count(1) from  $overlaps_table where pattern like '$pat' and $gen");
	$mock_count = $dbh->selectrow_array("select count(1) from  $mock_olaps_table where pattern like '$pat' and $gen");
	print $overlap_ofh "\t$real_count\t$mock_count";
      }
      print $overlap_ofh "\n" or throw "failed to print to file";
    }
  }
  close($overlap_ofh); 

  _create_pattern_results_tables($dbh, $overlap_results_file,$gen_feats);

  my $assoc_thresh = 51; #70;
  my $second_thresh = 50;
  _association_tables($dbh,$gen_feats,'pattern_overlap_summary','pattern_overlap_summary_chi',$assoc_thresh,$second_thresh);

  _create_flags_table($dbh,$flags_table,$gen_feats);

  _create_types_table($dbh,$flags_table,$types_table,$species);

  system("rm -r -f ".$dumpdir) && throw "Error deleting $dumpdir folder";

  
}

sub _create_gen_feats_table{
    my($dbh,$gen_feats_aref,$table)=@_;

    my @sql;
    # create a temporary table of gen_feat,id
    my $temp1 = "tmp_".int(rand(100))."_1";;
    push @sql,"drop table if exists $temp1";
    push @sql,"create table $temp1 (feature_type varchar(60),feature_type_id int(10) unsigned)";
    for(my $i=0;$i < @$gen_feats_aref;$i++){
	push @sql, "insert into $temp1 values( '".$gen_feats_aref->[$i]."','".$i."')";
    }
    push @sql,"alter table $temp1 add index(feature_type)";

    # concatenate all the gen feats into one table
    my $temp2 = "tmp_".int(rand(100))."_2";;
    push @sql,"drop table if exists $temp2";
    push @sql, "create table $temp2 (feature_type varchar(60), seq_region_name varchar(40), feature_strand tinyint(2), feature_start int(10) unsigned, feature_end int(10) unsigned)";
    foreach my $feat (@$gen_feats_aref){
	push @sql, "insert into $temp2 select distinct feature_type, seq_region_name, feature_strand, feature_start, feature_end from $feat";
    }

    push @sql,"alter table $temp2 add index(feature_type)";

    push @sql,"drop table if exists $table";
    push @sql, "create table $table select t2.*,t1.feature_type_id from $temp1 t1, $temp2 t2 where t1.feature_type = t2.feature_type";
    push @sql,"alter table $table add index(seq_region_name)";

    push @sql,"drop table if exists $temp1";
    push @sql,"drop table if exists $temp2";

    _run_sql($dbh,@sql) or throw "Error running sql";


}


sub _overlap_analysis{
    my($dbh,$gen_file,$chr,$reg_feat_table,$ofh,$columns)=@_;

    my $zeros = '0' x $columns;
    my @zero = split('',$zeros);

    my $q = "select regulatory_feature_id,feature_start,feature_end from $reg_feat_table where seq_region_name = '$chr' order by feature_start,feature_end";
    my $reg_aaref = $dbh->selectall_arrayref($q);
    unless(defined $reg_aaref){ throw "failed on \n $q\n".$dbh->errstr; }

    # regulatory features may not be present on all chrs
    if($reg_aaref eq NO_ROWS){return}
    #if($reg_aaref == 0 ){return}

    # create a hash to store the overlap flags
    my %flags;
    foreach my $reg_ref (@$reg_aaref){
        my @zero = split('',$zeros);
	$flags{$reg_ref->[0]} = \@zero;
    }

    open(IN,$gen_file) or throw "failed to open $gen_file";

    while(<IN>){
	chop;
	my($type,$start,$end) = split("\t",$_);

        foreach  my $reg_ref (@$reg_aaref){

            # if the reg and gen feats overlap set the flag for that gen feat
            if($reg_ref->[1] <= $end && $reg_ref->[2] >= $start){
                $flags{$reg_ref->[0]}->[$type] += 1;
	    }

            # trim the regulatory feature array
            # remove any reg feat whose end < this gen feat start
	    if($reg_ref->[2] < $start){shift(@$reg_aaref)}

            # if the start of the reg feat is > end of the gen feat
            # we can move onto the next gen feat
            if($reg_ref->[1] > $end){last}
	}

    }

    while(my($id,$aref)=each(%flags)){
	print $ofh $id."\t".join("\t",@$aref)."\n";
    }
    
}


sub _load_results{
    my($dbh,$results_file,$overlaps_table,$reg_feat_table,$gen_feat_ref)=@_;

    my @sql;

    #create the summary table
    push @sql,"drop table if exists $overlaps_table";
    my $q = "create table $overlaps_table (regulatory_feature_id int(10) unsigned, ";
    $q .= join(' int(1) unsigned, ',@$gen_feat_ref);
    $q .= " int(1) unsigned)";
    push @sql, $q;

    push @sql, "load data local infile '$results_file' into table $overlaps_table";
    push @sql, "alter table $overlaps_table add index(regulatory_feature_id)";

    my $temp1 = "tmp_".int(rand(100));
    push @sql,"drop table if exists $temp1";

    push @sql,"create table $temp1 select o.*,r.binary_string as pattern from $reg_feat_table r, $overlaps_table o where o.regulatory_feature_id = r.regulatory_feature_id";
    push @sql,"drop table $overlaps_table";
    push @sql,"alter table $temp1 rename as $overlaps_table ";
    
    _run_sql($dbh,@sql) or throw "cannot run sql";
}


sub _sub_pats{
    my($dbh,$reg_feat_table,$pat_bits)=@_;

    my $aref = $dbh->selectcol_arrayref("select binary_string from regulatory_features_filtered group by binary_string having count(*)  > 1 ");
    unless(defined $aref && @$aref >0){ die "no binary strings found with query :- \n select binary_string from regulatory_features_filtered group by binary_string having count(*)  > 1"}

    my %pats;
    foreach my $orig (@$aref){
        my $zero_count = $orig =~ tr/0/_/;
        #if($zero_count >= $pat_bits-2){next}#dont want pats with only one mark
        # following the introduction of projected builds and the use of TFBS 
        # as focus features, patterns with only one mark are not necessarily
        # just a focus feature. we therefore want to analyse them
        if($zero_count == $pat_bits){next}
        if($zero_count == $pat_bits - 1){ # just one bit set
            $pats{$orig} = 1;
            next;
        }
 
        my @ch = split('',$orig);
        # we create the set of all patterns which comprise the original but 
        # with one of the set bits made into '_' 
        for(my $i = 0;$i<@ch;$i++){
            if($ch[$i] eq '1'){
                $ch[$i]='_';
                $pats{(join('',@ch))} = 1;
                $ch[$i] = 1;

            }
            $pats{$orig} = 1;
        }
    }

    return keys(%pats);

}

sub _create_pattern_results_tables{
    my($dbh, $overlap_results_file,$gen_feats_ref)=@_;

    my @sql;
    push @sql,"drop table if exists raw_overlap_results";

    my $bitsize=255;
    my $q = "create table raw_overlap_results (pattern varchar(${bitsize}),total_reg_feats int(10)";
    foreach my $gen (@$gen_feats_ref){
	$q .= ",$gen"."_real int(10), $gen"."_mock int(10)";
    }
    $q .= ")";
    push @sql,$q;    
    #Similar as mysqlimport...
    push @sql, "load data local infile '$overlap_results_file' into table raw_overlap_results";

    push @sql,"drop table if exists pattern_overlap_summary";
    push @sql,"drop table if exists pattern_overlap_summary_chi";
    $q = "create table pattern_overlap_summary (pattern varchar(${bitsize})";
    my $q1 = "create table pattern_overlap_summary_chi (pattern varchar(${bitsize})"; 
    my $q2 = "insert into pattern_overlap_summary select pattern";
    my $q3 = "insert into pattern_overlap_summary_chi select pattern";
    foreach my $gen (@$gen_feats_ref){
        # using int(10) to remove non integer part of %age 
	$q .= ",$gen int(10)";
	$q1 .= ",$gen float";
        $q2 .= ",100* $gen".'_real/total_reg_feats '."as $gen";
        #$q3 .= ",($gen"."_real- $gen"."_mock)*($gen"."_real- $gen"."_mock)/ $gen"."_mock as $gen"
        $q3 .= ",($gen"."_real- $gen"."_mock)*($gen"."_real- $gen"."_mock)/ if($gen"."_mock,$gen"."_mock,1) as $gen"
    }
    $q .= ")";
    $q1 .= ")";
    $q2 .= " from raw_overlap_results";
    $q3 .= " from raw_overlap_results";
    push @sql,$q;   
    push @sql,$q1;   
    push @sql,$q2;
    push @sql,$q3;
    push @sql,"alter table pattern_overlap_summary_chi add index(pattern)";
    push @sql,"alter table pattern_overlap_summary add index(pattern)";

    _run_sql($dbh,@sql) or die;

}

sub _association_tables{
    my($dbh, $gen_feat_ref,$summary_table,$chi_table,$assoc_thresh,$second_thresh)=@_;
    
    
    foreach my $feat (@$gen_feat_ref){
	# scan for initial associations
	my $q = "select perc.pattern, perc.$feat from $summary_table perc, $chi_table chi where perc.$feat >= $assoc_thresh and chi.$feat > 8 and perc.pattern = chi.pattern";

	my $aaref = $dbh->selectall_arrayref($q);
	unless(defined $aaref){ throw "failed on \n$q\n".$dbh->errstr}
	foreach my $aref (@$aaref){
	    push @$aref,$feat;
	}

	my @sql;
	my $assoc_table = $feat."_assoc";
	my $not_assoc_table =  $feat."_not_assoc";
	my $zero_one_table = $feat."_0_1";
	push @sql,"drop table if exists $assoc_table";
	push @sql,"create table $assoc_table select * from $summary_table where 1 = 0";

	push @sql,"drop table if exists $not_assoc_table";
	push @sql,"create table $not_assoc_table select * from $summary_table where 1 = 0";
	push @sql,"drop table if exists $zero_one_table";
	push @sql,"create table $zero_one_table select pattern from $summary_table where 1 = 0";
	_run_sql($dbh,@sql) or throw "error running sql";

	foreach my $aref (@$aaref){
	    my $pattern = $aref->[0];

	    _run_sql($dbh, "insert into $assoc_table select * from $summary_table where pattern like '$pattern'") or throw "error running sql";


	    # if none of the patterns have percent < $second_thresh then the original pattern
	    # is OK and it stays in the table.  we remove all the
	    # other patterns which have percent < $assoc_thresh
	    $q = "select pattern from $assoc_table where  $feat < $second_thresh";
	    my $not_ref = $dbh->selectcol_arrayref($q);
	    die $q."\n".$dbh->errstr unless(defined $not_ref);

	    if(@$not_ref > 0){
		foreach my $pat (@$not_ref){
		    _run_sql($dbh,"insert into $not_assoc_table select * from $assoc_table where  pattern = '$pat'") or throw "error running sql";
		    _run_sql($dbh,"delete from $assoc_table where pattern = '$pat'") or throw "error running sql";
		      
		    my $zero_one = _add_nots($pattern,$pat);
		    _run_sql($dbh,"insert into $zero_one_table values('$zero_one')") or throw "error running sql";
		}
	    }


	    @sql=();
	    push @sql,"drop table if exists temp_$$";
	    push @sql,"create table temp_$$ select distinct * from $assoc_table";
	    push @sql,"drop table $assoc_table";
	    push @sql,"alter table temp_$$ rename as $assoc_table";
            push @sql,"alter table $assoc_table add index(pattern)";
            _run_sql($dbh,@sql);
	}

	@sql = ();
        # the assoc table currently contains patterns which lie between
        # the assoc_thresh and the second_thresh
	push @sql,"delete from $assoc_table where $feat < $assoc_thresh";
	# distinct assoc table
	# **** and add back the two gene bits to the patterns ****
	push @sql,"drop table if exists temp_$$";
	push @sql,"create table  temp_$$ select distinct * from $assoc_table";
	push @sql,"drop table $assoc_table";
	push @sql,"alter table temp_$$ rename as $assoc_table";
	#push @sql,"update $assoc_table set pattern = concat(pattern,'__')";
	push @sql,"alter table $assoc_table add index(pattern)";

	# distinct not_assoc table
	# **** and add the two gene bits to the patterns ****
	# distinct not_assoc table
	# **** and add the two gene bits to the patterns ****
	push @sql,"drop table if exists temp_$$";
	push @sql,"create table  temp_$$ select distinct * from $not_assoc_table";
	push @sql,"drop table $not_assoc_table";
	push @sql,"alter table temp_$$ rename as $not_assoc_table";
	#push @sql,"update $not_assoc_table set pattern = concat(pattern,'__')";
	push @sql,"alter table $assoc_table add index(pattern)";

	_run_sql($dbh,@sql) or die;
    }

}

sub _add_nots{
    my($pat,$not) = @_;

    my @pat = split('',$pat);
    my @not = split('',$not);

    for(my $i = 0;$i<@pat;$i++){
        if($pat[$i] ne '1' && $not[$i] eq '1'){
            $pat[$i] = 0;
        }
    }
    return join('',@pat);
}

sub _create_flags_table{
    my($dbh,$flags_table,$gen_feat_ref)=@_;


    my @sql;
    push @sql,"drop table if exists $flags_table";
    push @sql, "create table $flags_table select regulatory_feature_id,binary_string from regulatory_feature";
    push @sql, "alter table $flags_table add index(binary_string)";

    _run_sql($dbh,@sql) or throw "error running sql";
 
   
    @sql = ();
    foreach my $feat (@$gen_feat_ref){

        my $assoc_table = $feat."_assoc";
        my $not_assoc_table =  $feat."_not_assoc";

	#what do we want here? check if the assoc table is empty or see if it exists?
	# hack covering specific cases where there are no associations!!!
	unless( ($feat eq 'intergenic_2500')
		|| ($feat eq 'protein_coding_gene_body')
		|| ($feat eq 'protein_coding_exon1_plus_enhancer')
		|| ($feat eq 'protein_coding_intron1')
		|| ($feat eq 'tss_centred_5000')
		|| ($feat eq 'PolIII_transcribed_gene_plus_enhancer')
		|| ($dbh->selectrow_array("select count(*) from $assoc_table") > 0)){
	    next;
	}

	push @sql, "alter table $flags_table add column $feat int(1) default 0";

	my $q = "select pattern from $assoc_table";
        my $pat_ref = $dbh->selectcol_arrayref($q);
	unless(defined $pat_ref){die "failed on \n$q\n".$dbh->errstr}
        foreach my $pat (@$pat_ref){
	    push @sql, "update $flags_table set $feat = 1 where binary_string like '$pat'";

	}

	$q = "select pattern from $not_assoc_table";
        $pat_ref = $dbh->selectcol_arrayref($q);
	unless(defined $pat_ref){die "failed on \n$q\n".$dbh->errstr}
        foreach my $pat (@$pat_ref){
	    push @sql, "update $flags_table set $feat = 0 where binary_string like '$pat'";
	}
    }

    # hack covering specific cases where there are no associations!!!
#    unless(_column_exists($dbh,$flags_table,'intergenic_2500')){
#    	push @sql, "alter table $flags_table add column intergenic_2500 int(1) default 0";
#    }
#    unless(_column_exists($dbh,$flags_table,'protein_coding_gene_body')){
#	push @sql, "alter table $flags_table add column protein_coding_gene_body int(1) default 0";
#    }
#    unless(_column_exists($dbh,$flags_table,'protein_coding_exon1_plus_enhancer')){
#	push @sql, "alter table $flags_table add column protein_coding_exon1_plus_enhancer int(1) default 0";
#    }
#    unless(_column_exists($dbh,$flags_table,'protein_coding_intron1')){
#	push @sql, "alter table $flags_table add column protein_coding_intron1 int(1) default 0";
#    }
#    unless(_column_exists($dbh,$flags_table,'tss_centred_5000')){
#	push @sql, "alter table $flags_table add column tss_centred_5000  int(1) default 0";
#    }
#    unless(_column_exists($dbh,$flags_table,'PolIII_transcribed_gene_plus_enhancer')){
#      push @sql, "alter table $flags_table add column PolIII_transcribed_gene_plus_enhancer  int(1) default 0";
#    }

    _run_sql($dbh,@sql) or throw "error running sql";

}


sub _column_exists {

  my $dbh = shift;
  my $table = shift;
  my $col = shift;
  
  my $query = "select $col from $table limit 1";
  my $sth;
  
  unless($sth=$dbh->prepare($query)){
    throw("ERROR: preparation of statement $query failed");
  }
  
  unless($sth->execute){
    $sth->finish;
    return(0);
  }

  $sth->finish;
  return(1);
  
}

sub _create_types_table{
    my($dbh,$flags_table,$types_table,$species)=@_;

    my @sql;

    push @sql, "drop table if exists $types_table";

    # The same for all species...
    #if($species eq 'homo_sapiens'){
      push @sql, "create table $types_table select regulatory_feature_id,binary_string,0 as cell_type_specific,tss_centred_5000 as promoter_associated,protein_coding_gene_body as gene_associated, intergenic_2500 as non_gene_associated,0 as unclassified, PolIII_transcribed_gene_plus_enhancer as poliii_transcription_associated from $flags_table";
    #}else{
    #    push @sql, "create table $types_table select regulatory_feature_id,binary_string,0 as cell_type_specific,if(protein_coding_exon1_plus_enhancer+protein_coding_intron1 > 0, 1,0) as promoter_associated,protein_coding_gene_body as gene_associated, intergenic_2500 as non_gene_associated,0 as unclassified from $flags_table";
    #}

    # apply arbitrary rules to resolve conflicts
    push @sql, "update $types_table set promoter_associated = 0 where promoter_associated and gene_associated"; # as used for v58
    #push @sql, "update $types_table set gene_associated = 0 where promoter_associated and gene_associated"; #

    push @sql, "update $types_table set gene_associated = 0 where poliii_transcription_associated and gene_associated";
    push @sql, "update $types_table set non_gene_associated = 0 where poliii_transcription_associated and non_gene_associated";
    push @sql, "update $types_table set poliii_transcription_associated = 0 where poliii_transcription_associated and promoter_associated";

    # use unclassified col to flag conflicts
    push @sql, "update $types_table set unclassified = 1 where promoter_associated and non_gene_associated";
    push @sql, "update $types_table set unclassified = 1 where gene_associated and non_gene_associated";
    # set both of the conflicting cols to 0 where there is a conflict
    push @sql, "update $types_table set promoter_associated = 0 where unclassified";
    push @sql, "update $types_table set gene_associated = 0 where unclassified";    push @sql, "update $types_table set non_gene_associated = 0 where unclassified";

    # set unclassified for rows with no flags
    push @sql, "update $types_table set unclassified = 1 where gene_associated + non_gene_associated + promoter_associated+poliii_transcription_associated = 0";

    _run_sql($dbh,@sql) or throw "error running sql";

    # add the feature_type_id column

    _run_sql($dbh, "alter table $types_table add column feature_type_id int(10) unsigned") or throw "could not add feature_type_id column in $types_table";
    # set the type_id 
    # now that we have moved to single cell line classification there are
    # no Cell type specific classifications
    @sql = ();
    foreach my $ft ('Gene Associated',
                    'Non-Gene Associated',
                    'Promoter Associated',
                    'Unclassified',
                    'PolIII Transcription Associated',
		    ){

      my $ftid = $dbh->selectrow_array("select feature_type_id from feature_type where name = '$ft' and class = 'Regulatory Feature'");
      if(!defined($ftid) || ($ftid <1)){ throw "feature_type_id for $ft does not exist or value is less than 1"}
      my $cts = 0;
      my $col;
      if($ft =~ 'specific'){
	$cts = 1;
	($col) = $ft =~ /(.+ *.+) -.*/;
	$col = lc($col);
	$col =~ tr/ /_/;
	$col =~ tr/-/_/;
      }else{
	$col = lc($ft);
	$col =~ tr/ /_/;
	$col =~ tr/-/_/;
      }
      print "ft $ft col $col\n";
      push @sql, "update $types_table set feature_type_id = $ftid where cell_type_specific = $cts and $col = 1";
      
    }
    _run_sql($dbh,@sql) or throw "cannot run sql";

    # summary report
    # remove and not cell_type_specific... 
    my $res = $dbh->selectrow_array(" select count(*) from  regulatory_features_classified where promoter_associated and not cell_type_specific ");
    _commentary("promoter_associated     $res\n");
    $res = $dbh->selectrow_array(" select count(*) from  regulatory_features_classified where gene_associated and not cell_type_specific");
    _commentary("gene_associated         $res\n");
    $res = $dbh->selectrow_array(" select count(*) from  regulatory_features_classified where non_gene_associated and not cell_type_specific");
    _commentary("non_gene_associated     $res\n");
    $res = $dbh->selectrow_array(" select count(*) from  regulatory_features_classified where poliii_transcription_associated and not cell_type_specific");
    _commentary("PolIII_transcription_associated     $res\n");

    $res = $dbh->selectrow_array(" select count(*) from  regulatory_features_classified where unclassified and not cell_type_specific");
    _commentary("unclassified            $res\n");

    _qc($dbh,);


}


sub _qc{
    my($dbh) = @_;

    my @sql;
    #goto GENE_ASSOC;
    push @sql, "drop table if exists promoter_associated_temp";
    push @sql, "create table promoter_associated_temp select rfc.regulatory_feature_id, seq_region_name,seq_region_start,seq_region_end,rfc.binary_string from regulatory_feature rf, regulatory_features_classified rfc where rfc.promoter_associated and rfc.regulatory_feature_id =rf.regulatory_feature_id order by seq_region_name;";
    push @sql, "alter table promoter_associated_temp add index(seq_region_name)";
    push @sql, "drop table if exists promoter_features";
    push @sql, "create table promoter_features select * from protein_coding_exon1_plus_enhancer ";
    #push @sql, "insert into promoter_features select * from  protein_coding_intron1";
    push @sql, "insert into promoter_features select * from RNA_gene_exon1_plus_enhancer";
    push @sql, " alter table promoter_features add index(seq_region_name)";

    _run_sql($dbh,@sql) or throw "cannot run sql";

    my $res;
    $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from promoter_associated_temp pa, promoter_features e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start"); 
    _commentary("promoter_associated features\n");
    _commentary("$res overlap an exon1_plus_2.5kb (both RNA and prot_cod)\n");

    $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from promoter_associated_temp pa, protein_coding_intron1 e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start");   
    _commentary("$res overlap a protein coding intron1\n");

    $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from promoter_associated_temp pa, processed_transcript e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= if(e.feature_strand = -1 ,e.feature_end+2500,e.feature_end)  and pa.seq_region_end >= if(e.feature_strand = -1 ,e.feature_start,e.feature_start - 2500)");
    _commentary("$res overlap a 'processed transcript'\n");

    $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from promoter_associated_temp pa, protein_coding_gene_body e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start");
    _commentary("$res overlap a protein coding gene body\n");
    $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from promoter_associated_temp pa, intergenic_2500 e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start > e.feature_start and pa.seq_region_end < e.feature_end");
    _commentary("$res are at least 2500bp from any part of a protein coding gene\n");
    $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from promoter_associated_temp pa, pseudogene_exon1_plus_enhancer e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start > e.feature_start and pa.seq_region_end < e.feature_end");
    _commentary("$res overlap a pseudogene exon1_plus_2.5kb \n");

    @sql=();
    push @sql, "drop table if exists gene_associated_temp";
    push @sql, "create table gene_associated_temp select rfc.regulatory_feature_id, seq_region_name,seq_region_start,seq_region_end from regulatory_feature rf, regulatory_features_classified rfc where rfc.gene_associated and rfc.regulatory_feature_id =rf.regulatory_feature_id order by seq_region_name";
    push @sql, "alter table gene_associated_temp add index(seq_region_name)";
    _run_sql($dbh,@sql) or throw "cannot run sql";

    _commentary("gene_associated features\n");

    $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from gene_associated_temp pa, protein_coding_gene e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start");
    _commentary("$res overlap some part of a protein coding gene\n");

    $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from gene_associated_temp pa, RNA_gene e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start");
    _commentary("$res overlap some part of an RNA gene\n");

    $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from gene_associated_temp pa, intergenic_2500 e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start > e.feature_start and pa.seq_region_end < e.feature_end");
    _commentary("$res are at least 2500bp from any part of a protein coding gene\n");

    $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from gene_associated_temp pa, pseudogene_transcript e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start");
    _commentary("$res overlap some part of a pseudogene\n");

    $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from gene_associated_temp pa, processed_transcript e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start");
    _commentary("$res overlap some part of a 'processed_transcript'\n");
}

sub _commentary{
  my ($comm) = @_;
  print $comm;
  print REPORT $comm;
}


# returns start and end 
sub _get_rand_region{
    my ($dbh,$enc,$len,%enc_len) = @_;

    my $bad = 1;
    my $end;
    my $start;
    my $try = 1;
    while($bad){
      $try++;
      if($try > 1000000){
	throw "can't find a random region for seq_region $enc, length $len\n";
      }
      
      # get a random number which is within the encode region
      $end = int(rand($enc_len{$enc}));
      
      # the number must be higher than the length of the frag
      if($end > $len){
	$bad = 0;
      }
      $start = $end - $len +1;
      
      if($bad == 0){
	#Avoid regions that were already taken...
	my $q = "select * from banned_region_table where seq_region_name = '$enc' and $start <= feature_end and $end >= feature_start limit 1 ";
	my $aaref = $dbh->selectall_arrayref($q);
	unless(defined $aaref){die $dbh->errstr} 
	if(scalar(@$aaref) > 0){  $bad = 1; } 
      }
    }
    
    return ($start,$end);

}


#Help function to copy a table between databases with different names...
#Require DBI db handle objects
sub _copy_with_rename{
    my($s_dbh,$source_table,$t_dbh,$targ_table)=@_;

    my $q = "desc $source_table";
    my $desc_aaref=$s_dbh->selectall_arrayref($q);
    unless(defined $desc_aaref){die "failed on:\n $q\n".$s_dbh->errstr}

    _run_sql($t_dbh,"drop table if exists $targ_table");
    $q = " create table $targ_table (";
    my $select;
    foreach my $aref (@$desc_aaref){
	$q  .= $aref->[0].' '.$aref->[1].',';
        $select .=  $aref->[0].',';
    }
    chop $q; #remove last comma
    chop $select;
    $q .= ")";
    print $q."\n";
    _run_sql($t_dbh,$q) or throw "Error creating table";

    $q = "select $select from $source_table";
    my $aaref = $s_dbh->selectall_arrayref($q);
    unless(defined $aaref){die "failed on:\n $q\n".$s_dbh->errstr}
    my @sql;
    foreach my $aref (@$aaref){
	$q= "insert into $targ_table values('".join("','",@$aref);
	$q .= "')";
	push @sql,$q;

    }

    _run_sql($t_dbh,@sql) or throw "Error copying data";

}


# _run_sql
#
#  Arg [1]   : scalar database handle from DBI module
#  Arg [2]   : array of scalars containing lines of text in the format of SQL statements
#  Function  : submits each element of Arg[2] to the $dbh->prepare and 
#              $dbh->execute methods and checks for successful execution. 
#              Returns 0 on failure, 1 on success. Emits error messages.
#  Returntype: int
#  Exceptions: none
#  Example   : _run_sql($dbh, @array);

sub _run_sql{
  my $dbh = shift;
  my (@array)=@_;
  
  my $sth;
  
  foreach my $query(@array){
    
    eval {
      unless($sth=$dbh->prepare($query)){
	warn("Error: preparation of statement failed on: $query\n");
	warn("database error message: ".$dbh->errstr."\n");
	return(0);
      }
      
      unless($sth->execute){ # returns true on success
	warn("Error: statement execution failed on: $query\n");
	warn("statement handle error message:".$sth->errstr."\n");
	return(0);
      }
    };
    if($@){ warn "Error running SQL"; return(0); }

  }
  
  return(1);
}

#Private getter / setter to the Work DB Connection
sub _workdb {
  return $_[0]->_getter_setter('workdb',$_[1]);
}

#Private getter / setter to the cell type
sub _cell_type {
  return $_[0]->_getter_setter('cell_type',$_[1]);
}


1;