########################################################################## # CREATE H1N1 DATABASE (STARTED 4/26/09, Fan) # for emacs: -*- mode: sh; -*- # DOWNLOAD SEQUENCE (DONE, Fan, 4/26/09) ssh hgwdev mkdir /hive/data/genomes/h1n1 cd /hive/data/genomes/h1n1 # Get H1N1 sequences from GISAID/EpiFluDB mkdir download cd download # downloaded the following sequences: epiflu_0421_dna_sequence.fasta epiflu_0425_dna_sequence.fasta epiflu_0421_protein_sequence.fasta epiflu_0425_protein_sequence.fasta # translate to nib cd .. ln -s /hive/data/genomes/h1n1 ~/h1n1 cd ~/h1n1 mkdir nib # use sequence segments in epiflu_0421_dna_sequence.fasta as the base genome fgrep -v ">" download/epiflu_0421_dna_sequence.fasta >j.1 echo ">chr1" >j.0 cat j.0 j.1 >chr1.fa rm j.0 j.1 faToNib chr1.fa nib/chr1.nib # CREATING DATABASE (DONE 4/26/09) # Create the h1n1 database. ssh hgwdev echo 'create database h1n1' | hgsql '' # make a semi-permanent read-only alias: alias h1n1 "mysql -u hguser -phguserstuff -A h1n1" # CREATING GRP TABLE FOR TRACK GROUPING (DONE 4/26/09) ssh hgwdev echo "create table grp (PRIMARY KEY(NAME)) select * from hg18.grp" \ | hgsql h1n1 # remove ENCODE groups echo "delete from grp where name like 'encode%'" | hgsql h1n1 # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE 4/26/09) # Make symbolic links from /gbdb/h1n1/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/h1n1/nib ln -s /hive/data/genomes/h1n1/nib/chr1.nib /gbdb/h1n1/nib/chr1.nib # Load /gbdb/h1n1/nib paths into database and save size info. hgsql h1n1 < ~/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib h1n1 /gbdb/h1n1/nib chr1.fa echo "select chrom,size from chromInfo" | hgsql -N h1n1 > chrom.sizes # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR H1N1 (DONE 08/02/06) echo 'insert into defaultDb values("A/California/04/2009(EPI_ISL_29573)", "h1n1");' \ | hgsql -h genome-testdb hgcentraltest echo 'insert into dbDb values("h1n1", "Apr. 2009", \ "/gbdb/h1n1/nib", "Human case of H1N1 swine influenza", "chr1", 1, 99.5, "A H1N1","Human case of H1N1 swine influenza", "/gbdb/h1n1/html/description.html", 0, 0, "GISAID sequence as of Apr. 21st, 2009", 99999);' \ | hgsql -h genome-testdb hgcentraltest echo 'insert into genomeClade values("A H1N1", "other", 100);'\ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add h1n1 in all the right places vi makefile make update make alpha cvs commit makefile # START BLAT SERVERS (on hgwdev for now, DONE 4/26/09, Fan) cd /hive/data/genomes/h1n1/nib gfServer start hgwdev 18891 -trans -mask -log=gfServer.trans.log chr1.nib & gfServer start hgwdev 18892 -stepSize=5 -log=gfServer.log chr1.nib & # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR H1N1 echo 'insert into blatServers values("h1n1", "hgwdev", "18891", "1", "0"); \ insert into blatServers values("h1n1", "hgwdev", "18892", "0", "0");' \ | hgsql -h genome-testdb hgcentraltest # CREATE h1n1Gene TRACK (DONE. 4/26/09, Fan) mkdir /hive/data/genomes/h1n1/bed/h1n1Gene cd /hive/data/genomes/h1n1/bed/h1n1Gene cp ../../download/epiflu_0421_dna_sequence.fasta h1n1Gene.fa # manually edit h1n1Gene.fa # change line like: # >EPI176470 | HA | A/California/04/2009 | EPI_ISL_29573 | 2009712049_seg4 | H1N1 # into: # >HA EPI176470 gfClient -minScore=200 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892 /gbdb/h1n1/nib \ -out=psl -t=dna -q=dna h1n1Gene.fa h1n1GenePsl.psl hgLoadPsl h1n1 h1n1GenePsl.psl hgsql h1n1 -N -e 'select tName, tStart, tEnd, qName from h1n1GenePsl' >h1n1Gene.bed hgLoadBed h1n1 h1n1Gene h1n1Gene.bed # CREATE h1n1Seq TRACK (DONE. 4/26/09, Fan) mkdir –p /hive/data/genomes/h1n1/bed/h1n1Seq cd /hive/data/genomes/h1n1/bed/h1n1Seq # get h1n1Seq sequences cat ../../download/*dna*.fasta |sed -e 's/ | /_/g' |sed -e 's/_EPI_ISL/ | /' >h1n1Seq.fa # create .fa file mkdir -p /gbdb/h1n1/h1n1Seq cp -p h1n1Seq.fa /gbdb/h1n1/h1n1Seq/h1n1Seq.fa hgLoadSeq –replace h1n1 /gbdb/h1n1/h1n1Seq/h1n1Seq.fa # BLAT gfClient -minScore=200 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892 /gbdb/h1n1/nib \ -out=psl -t=dna -q=dna h1n1Seq.fa h1n1Seq.psl # load the psl result into h1n1Seq table hgLoadPsl h1n1 h1n1Seq.psl # CREATE humanHA TRACK (DONE. 4/26/09, Fan) mkdir -p /hive/data/genomes/h1n1/bed/humanHA cd /hive/data/genomes/h1n1/bed/humanHA # get humanHA sequences cat ../../download/humanHA_dna.fasta |sed -e 's/ | /_/g' |sed -e 's/_EPI_ISL/ | /' >humanHA.fa # create .fa file mkdir -p /gbdb/h1n1/humanHA cp -p humanHA.fa /gbdb/h1n1/humanHA/humanHA.fa hgLoadSeq -replace h1n1 /gbdb/h1n1/humanHA/humanHA.fa # BLAT gfClient -minScore=500 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892 /gbdb/h1n1/nib \ -out=psl -t=dna -q=dna humanHA.fa humanHA.psl # load the psl result into humanHA table hgLoadPsl h1n1 humanHA.psl # CREATE swineHA TRACK (DONE. 4/26/09, Fan) mkdir -p /hive/data/genomes/h1n1/bed/swineHA cd /hive/data/genomes/h1n1/bed/swineHA # get swineHA sequences cat ../../download/swineHA_dna.fasta |sed -e 's/ | /_/g' |sed -e 's/_EPI_ISL/ | /' >swineHA.fa # create .fa file mkdir -p /gbdb/h1n1/swineHA cp -p swineHA.fa /gbdb/h1n1/swineHA/swineHA.fa hgLoadSeq -replace h1n1 /gbdb/h1n1/swineHA/swineHA.fa # BLAT gfClient -minScore=500 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892 /gbdb/h1n1/nib \ -out=psl -t=dna -q=dna swineHA.fa swineHA.psl # load the psl result into humanHA table hgLoadPsl h1n1 swineHA.psl ####################################################################################### # CREATE h1n1_0514Seq TRACK (DONE. 5/15/09, Fan) mkdir -p /hive/data/genomes/h1n1/bed/h1n1_0514Seq cd /hive/data/genomes/h1n1/bed/h1n1_0514Seq # create .fa files from downloaded .fasta files mkdir -p /gbdb/h1n1/h1n1_0514Seq cd ../../download/0514 dos2unix < epiflu_0514_dna_sequence.fasta > epiflu_0514_dna_sequence.fa dos2unix < epiflu_0514_protein_sequence.fasta > epiflu_0514_protein_sequence.fa cp -p ../../download/0514/epiflu_0514_dna_sequence.fa /gbdb/h1n1/h1n1_0514Seq cd /hive/data/genomes/h1n1/bed/h1n1_0514Seq hgLoadSeq -replace h1n1 /gbdb/h1n1/h1n1_0514Seq/epiflu_0514_dna_sequence.fa # BLAT, make sure the port number is correct. gfClient -minScore=200 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892 /gbdb/h1n1/nib \ -out=psl -t=dna -q=dna /gbdb/h1n1/h1n1_0514Seq/epiflu_0514_dna_sequence.fa h1n1_0514Seq.psl # load the psl result into h1n1_0514Seq table hgLoadPsl h1n1 h1n1_0514Seq.psl # update h1n1SeqXref table fgrep ">" /gbdb/h1n1/h1n1_0514Seq/epiflu_0514_dna_sequence.fa|\ sed -e 's/ | /\t/g' |sed -e "s/>//" > h1n1SeqXref.tab hgsql h1n1 -e 'delete from h1n1SeqXref' hgsql h1n1 -e 'load data local infile "h1n1SeqXref.tab" into table h1n1SeqXref' # create dna sequences table cp -p ~/h1n1/download/0514/epiflu_0514_dna_sequence.fa j1 ~/src/utils/faToTab/faToTab.pl /dev/null j1 >j2.tab cut -f 1 j2.tab >jj1 cut -f 2 j2.tab >jseq cat jj1 |sed -e 's/ | /\t/g'|sed -e 's/>//' >jj cut -f 1 jj > jDnaId paste jDnaId jseq >epiflu_0514_dna_sequence.tab hgLoadSqlTab h1n1 dnaSeq ~/src/hg/lib/dnaSeq.sql epiflu_0514_dna_sequence.tab rm j1 jj jj1 jseq jDnaId j2.tab # create protein sequences file with appropriate IDs cp -p ~/h1n1/download/0514/epiflu_0514_protein_sequence.fa j1 ~/src/utils/faToTab/faToTab.pl /dev/null j1 >j2.tab cut -f 1 j2.tab >jj1 cut -f 2 j2.tab >jseq cat jj1 |sed -e 's/ | /\t/g'|sed -e 's/>//' >jj cut -f 1 jj > jDnaId cut -f 2 jj > j2 cut -f 1,2 jj|sed -e 's/\t/_/' |\ sed -e 's/_PB2//'|\ sed -e 's/_HA//'|\ sed -e 's/_NS//'|\ sed -e 's/_PA//'|\ sed -e 's/_NA//'|\ sed -e 's/_NP//'|\ sed -e 's/-/_/'|\ sed -e 's/_PB1//' > jProtId paste jDnaId jProtId >h1n1DnaProt.tab hgsql h1n1 -e 'delete from h1n1DnaProt' hgsql h1n1 -e 'load data local infile "h1n1DnaProt.tab" into table h1n1DnaProt' # remove _F2 sequences, they are too short paste jProtId jseq |sort -u >epiflu_0514_protein_sequence.tab tabToFa epiflu_0514_protein_sequence mkdir -p /gbdb/h1n1/h1n1_0514ProtSeq cp epiflu_0514_protein_sequence.fa /gbdb/h1n1/h1n1_0514ProtSeq # BLAT the protein sequences, make sure port number is correct. gfClient -minScore=50 -minIdentity=60 -nohead hgwdev.cse.ucsc.edu 18891 /gbdb/h1n1/nib \ -q=prot -out=psl -t=dnax /gbdb/h1n1/h1n1_0514ProtSeq/epiflu_0514_protein_sequence.fa testPsl0.psl hgsql h1n1 -e 'delete from aaSeq' hgsql h1n1 -e 'load data local infile "epiflu_0514_protein_sequence.tab" into table aaSeq' rm j* cat testPsl0.psl |grep -v "_NEP"|grep -v "_M2"|grep -v "_F2"|sed -e 's/_M1//' >testPsl.psl hgLoadPsl h1n1 testPsl.psl # construct CDS info based on corresponding DNA and protein alignments. hgsql h1n1 -N -e \ 'select p.qName, p.tStart-d.tStart+d.qStart+1, "xxx", d.qStart+(d.tEnd-d.tStart)-(d.tEnd-p.tEnd) from testPsl p, h1n1_0514Seq d, h1n1DnaProt n where dnaId=d.qName and d.qName=p.qName' |\ sed -e 's/\txxx\t/\.\./' >h1n1_0514SeqCds.tab hgLoadSqlTab h1n1 h1n1_0514SeqCds ~/hg/lib/cdsSpec.sql h1n1_0514SeqCds.tab ####################################################################################