#!/bin/bash -e # ensemblDbImport - create a mysql database using a mysql dump from the ensembl ftp site, skipping uneeded tables. Rename chroms to ucsc standard names. [ $# != 3 ] && { echo "usage: ensemblDbImport species version indir " >&2 exit 1 } organism=$1 version=$2 indir=$3 delay=4 ensDb=ensembl_${organism}_${version} input="ftp://ftp.ensembl.org/pub/current_$organism/data/mysql/${organism}_${version}" sql="${organism}_${version}.sql.gz" exceptions="density|ditag|dna|dna_align_feature|oligo|protein_align_feature|repeat_feature" # create mysql ensembl database from downloaded sql and data (drop old one first) cd $indir tables=`cat tables.tmp |grep -v -E $exceptions` echo "Dropping database $ensDb in $delay seconds" sleep $delay hgsql mysql -e "drop database if exists $ensDb" echo "Creating database $ensDb" hgsql mysql -e "create database $ensDb" echo "Creating empty tables in database $ensDb from $sql" zcat $sql | sed -e 's/DEFAULT CHARSET=latin1//' -e 's/AUTO_INCREMENT=[0-9]*//' -e 's/character set latin1 collate latin1_bin//' > create_table.sql hgsql $ensDb < create_table.sql echo "Loading data" for i in $tables ; do echo "hgLoadSqlTab $ensDb $i /dev/null $i.txt.table -oldTable ">> dbLoad.log; zcat $i.txt.table.gz > temp.txt ; hgLoadSqlTab $ensDb $i /dev/null temp.txt -oldTable 2>> dbLoad.log >> dbLoad.log; done # rename chromosomes to conform to ucsc standards echo "Fix chrom names" set -x hgsql $ensDb -e "update seq_region set name = 'chrM' where name = 'MT';" hgsql $ensDb -e "update seq_region set name = concat('chr',name) where name between 1 and 99;" hgsql $ensDb -e "update seq_region s, transcript t set s.name = concat('chr',s.name) where s.name in ('M','W','X','Y','Z','Un','M_random','X_random','Y_random','W_random','Z_random','Un_random') and s.seq_region_id = t.seq_region_id;" hgsql $ensDb -e "update seq_region set name = 'chr6_cox_hap1' where name = 'c6_COX';" hgsql $ensDb -e "update seq_region set name = 'chr6_qbl_hap2' where name = 'c6_QBL';" hgsql $ensDb -e "update seq_region set name = 'chr5_h2_hap1' where name = 'c5_H2';" hgsql $ensDb -e "update seq_region set name = 'chr22_h2_hap1' where name = 'c22_H2';" #add redundant field to avoid silly outer join limitation in mysql hgsql $ensDb -e "alter table xref add db_name varchar(27);" hgsql $ensDb -e "update xref x, external_db e set x.db_name = e.db_name where x.external_db_id = e.external_db_id;"