# for emacs: -*- mode: sh; -*- # This file describes how we made the browser database on the # SARS Coronavirus Tor2 isolate, draft genome sequence dated # April 14, 2003. It's April 15 here in NZ but April 14 in # the states -- dates may shows that inconsistency. :) # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +---------------+-----------------------+ # | tableName | type | # +---------------+-----------------------+ # | softberryGene | genePred softberryPep | # +---------------+-----------------------+ DOWNLOAD SEQUENCE (DONE 04/15/03) ssh eieio mkdir /cluster/store5/SARS_Coronavirus_TOR2 cd /cluster/store5/SARS_Coronavirus_TOR2 wget http://ybweb.bcgsc.ca/sars/TOR2_draft_genome_assembly_140403.fasta wget -o TOR2.gbff 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=nucleotide&list_uids=29826276&dopt=GenBank' # translate to nib ln -s /cluster/store5/SARS_Coronavirus_TOR2 ~/sc1 cd ~/sc1 mkdir nib ln -s TOR2_draft_genome_assembly_140403.fasta chr1.fa faToNib chr1.fa nib/chr1.nib CREATING DATABASE (DONE 04/15/03) # Create the database. ssh hgwdev echo 'create database sc1' | hgsql '' # make a semi-permanent read-only alias: alias sc1 "mysql -u hguser -phguserstuff -A sc1" # Use df to ake sure there is at least 5 gig free on # hgwdev:/var/lib/mysql CREATING GRP TABLE FOR TRACK GROUPING (DONE 04/15/03) ssh hgwdev echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \ | hgsql sc1 STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE 04/15/03) # Make symbolic links from /gbdb/sc1/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/sc1/nib foreach f (/cluster/store5/SARS_Coronavirus_TOR2/nib/*.nib) ln -s $f /gbdb/sc1/nib end # Load /gbdb/sc1/nib paths into database and save size info. hgsql sc1 < ~/src/hg/lib/chromInfo.sql cd ~/sc1 hgNibSeq -preMadeNib sc1 /gbdb/sc1/nib chr1.fa echo "select chrom,size from chromInfo" | hgsql -N sc1 > chrom.sizes # Set up relational mrna tables. hgLoadRna new sc1 MAKE GCPERCENT (DONE 04/15/03) ssh hgwdev mkdir -p /cluster/store5/SARS_Coronavirus_TOR2/bed/gcPercent cd /cluster/store5/SARS_Coronavirus_TOR2/bed/gcPercent hgsql sc1 < ~/src/hg/lib/gcPercent.sql hgGcPercent sc1 ../../nib MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR SC1 (DONE 04/15/03) echo 'insert into defaultDb values("SARS", "sc1");' \ | hgsql -h genome-testdb hgcentraltest # Note: for next assembly, set scientificName column to # "SARS coronavirus" echo 'insert into dbDb values("sc1", "SARS Cor. TOR2 Apr. 2003", \ "/gbdb/sc1/nib", "SARS", "chr1", 1, 10, "SARS");' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add sc1 in all the right places and do make update make alpha cvs commit makefile MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR SC1 (TODO) ssh hgwdev echo 'insert into blatServers values("sc1", "blat10", "17778", "1"); \ insert into blatServers values("sc1", "blat10", "17779", "0");' \ | hgsql -h genome-testdb hgcentraltest SIMPLE REPEAT TRACK (DONE 04/15/03) # TRF runs pretty quickly now... it takes a few hours total runtime, # so instead of binrsyncing and para-running, just do this on eieio: ssh eieio mkdir ~/sc1/bed/simpleRepeat cd ~/sc1/bed/simpleRepeat mkdir trf rm -f jobs.csh touch jobs.csh foreach f (/cluster/store5/SARS_Coronavirus_TOR2/*.fa) set fout = $f:t:r.bed echo "/cluster/home/kent/bin/i386/trfBig -trf=/cluster/home/kent/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end tcsh jobs.csh |& tee jobs.log wc -l jobs.csh ls -1 trf | wc -l # STOPPED HERE -- no simple repeats found. # When job is done do: liftUp simpleRepeat.bed ~/sc1/jkStuff/liftAll.lft warn trf/*.bed # Load this into the database as so ssh hgwdev cd ~/sc1/bed/simpleRepeat hgLoadBed sc1 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql MAKING VIRAL mRNA FILES (DONE 04/15/03) cd /cluster/store5/mrna.134 gunzip -c \ /cluster/store5/genbank.134/gbvrl* \ | gbToFaRa anyRna.fil viralRna.{fa,ra,ta} stdin mkdir /gbdb/sc1/mrna.134 ln -s /cluster/store5/mrna.134/viralRna.fa /gbdb/sc1/mrna.134/ hgLoadRna add -type=mrna sc1 /gbdb/sc1/mrna.134/viralRna.fa \ /cluster/store5/mrna.134/viralRna.ra # distribute to /iscratch/i/ ssh kkr1u00 mkdir -p /iscratch/i/mrna.134 cp -p /cluster/store5/mrna.134/viralRna.fa /iscratch/i/mrna.134 ~kent/bin/iSync ALIGNING VIRAL RNA (DONE 04/16/03) ssh kkr1u00 mkdir -p ~/sc1/bed/mrna cd ~/sc1/bed/mrna ls -1S /cluster/store5/SARS_Coronavirus_TOR2/*.fa > genome.lst ls -1 /iscratch/i/mrna.134/viralRna.fa > mrna.lst cp ~/lastRn/bed/mrna/gsub . # edited gsub to remove -ooc and add query options for translated. mkdir psl gensub2 genome.lst mrna.lst gsub spec para create spec para try, para check, para push, para check.... para time > time # Process alignments into near best in genome. ssh eieio cd ~/sc1/bed/mrna pslSort dirs raw.psl /cluster/store2/temp psl # Skip the pslReps -- need the sensitivity! cat raw.psl \ | sed -e 's/TOR2_draft_genome_assembly_140403/chr1/' \ > all_mrna.psl pslSortAcc nohead chrom /cluster/store2/temp all_mrna.psl # Load mRNA alignments into database. ssh hgwdev cd ~/sc1/bed/mrna/chrom foreach i (chr?{,?}{,_random}.psl) mv $i $i:r_mrna.psl end hgLoadPsl -noTNameIx sc1 *.psl cd .. hgLoadPsl sc1 all_mrna.psl -nobin LOAD SWISS PROT CORONAVIRUS PROTEINS cd ~/sc1/bed/swissprot download all ORGANISM=coronavirus proteins from SRS into file viralProt.fa ssh kkr1u00 mkdir psl para create spec para push cd psl mv chr1_viralProt.psl chr1_viralProtLoose.psl pslReps -minAli=0.67 chr1_viralProtLoose.psl chr1_viralProt.psl hgLoadPsl -noTNameIx sc1 chr1_viralProt.psl chr1_viralProtLoose.psl LOAD CPGISSLANDS (DONE 04/15/03) ssh eieio mkdir -p ~/sc1/bed/cpgIsland cd ~/sc1/bed/cpgIsland # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu) # copy the tar file to the current directory cp ~/lastRn/bed/cpgIsland/cpg_dist.tar . tar xvf cpg_dist.tar cd cpg_dist gcc readseq.c cpg_lh.c -o cpglh.exe cd .. foreach f (../../*.fa) set fout=$f:t:r:r.cpg echo running cpglh on $f to $fout ./cpg_dist/cpglh.exe $f > $fout.cpg end # copy filter.awk from a previous release cp ~/lastRn/bed/cpgIsland/filter.awk . awk -f filter.awk chr*.cpg > cpgIsland.bed # STOPPED HERE -- NO CPG ISLANDS FOUND # load into database: ssh hgwdev cd ~/sc1/bed/cpgIsland hgLoadBed sc1 cpgIsland -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed LOAD SOFTBERRY GENES mkdir -p ~/sc1/bed/softberry cd ~/sc1/bed/softberry # Copy in email attachment Soft_genes_SARS_Tor2.tar.gz # from Victor Solovyev gtar -zxvf Soft_gene*.gz # Substitute chr1 for subs in the gff file ldHgGene sc1 softberryGene SARS.gff hgPepPred sc1 softberry *.protein hgSoftberryHom sc1 *.protein LOAD GENBANK ANNOTATED PROTEINS (DONE 04/23/03) mkdir -p ~/sc1/bed/gbProtAnn cd ~/sc1/bed/gbProtAnn # go to http://www.ncbi.nlm.nih.gov/ -- search Protein for NP_828849 # Save the displayed genbank record to NP_828849.gbff # # Used emacs macros (and judgment about what the short names should be) # to edit the mat_peptide elements to a bed 4 + tab file like this: # chrom start end name product note proteinId giId # called NP_828849.pbed -- pbed because it's in protein coordinates. # # Now translate the protein coordinates into genomic coords: # /coded_by="join(NC_004718.1:250..13383, NC_004718.1:13383..21470)" # /note="putative -1 frameshift" # and strip the "gi:" from genbank id numbers: # OK, also use the proteinId as the name instead of my short names so # that others can link in more easily as Fan suggested: perl -wpe '@w = split(/\t/); \ if ($w[1] < 4379) { \ $w[1] = (($w[1] - 1) * 3) + 250 - 1; \ $w[2] = (($w[2] - 1) * 3) + 250 + 2; \ } else { \ $w[1] = (($w[1] - 4379) * 3) + 13383 - 1; \ $w[2] = (($w[2] - 4379) * 3) + 13383 + 2; \ } \ $w[7] =~ s/gi://i; \ $w[3] = $w[6]; \ $_ = join("\t", @w); \ ' NP_828849.pbed \ > NP_828849.bed # Make sure it looks right: sdiff NP_828849.pbed NP_828849.bed hgLoadBed sc1 gbProtAnn NP_828849.bed \ -tab -noBin -sqlTable=$HOME/src/hg/lib/gbProtAnn.sql PRODUCING GENSCAN PREDICTIONS (DONE 4/23/03) ssh kkr1u00 mkdir -p ~/sc1/bed/genscan cd ~/sc1/bed/genscan # No cluster run (or liftUp) required -- just run it: /cluster/home/kent/bin/i386/gsBig ~/sc1/chr1.fa genscan.gtf \ -trans=genscan.pep -subopt=genscanSubopt.bed \ -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan \ -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat \ -tmp=/tmp -window=2400000 # Load into the database as so: ssh hgwdev cd ~/sc1/bed/genscan ldHgGene sc1 genscan genscan.gtf hgPepPred sc1 generic genscanPep genscan.pep hgLoadBed sc1 genscanSubopt genscanSubopt.bed BLATTING OTHER SARS WHOLE GENOMES AGAINST TOR2 (DONE 04/25/03) ssh eieio mkdir -p ~/sc1/bed/otherSARS cd ~/sc1/bed/otherSARS # Went to http://www.ncbi.nlm.nih.gov/ , did a search in Nucleotide # for SARS Coronavirus, viewed results as FASTA, and saved results to # .fa files: # AY278554.2 (CUHK-W1) # AY278741.1 (Urbani) # AY278491.2 (HKU-39849) # AY279354.1 (BJ04 partial) # AY278490.1 (BJ03 partial) # AY278489.1 (GZ01 partial) # AY278488.1 (BJ01 partial) # AY278487.1 (BJ02 partial) # AY268049.1 (Taiwan partial cds) # AY269391.1 (Vietnam partial) cat A*.fa > otherSARS.fa # Now blat (sensitive is OK because these are same organism): /cluster/home/kent/bin/i386/blat -noHead ~/sc1/chr1.fa otherSARS.fa \ otherSARS.psl # Now put the sequence in /gbdb and index it into extFile/seq: ssh hgwdev mkdir -p /gbdb/sc1/otherSARS/ ln -s `pwd`/otherSARS.fa /gbdb/sc1/otherSARS/ hgLoadRna addSeq sc1 /gbdb/sc1/otherSARS/otherSARS.fa -type=otherSARS.fa # And load the alignments: hgLoadPsl -noTNameIx sc1 otherSARS.psl LOADING VIRAL mrna for blastz track (done 4/30/03) cd ~/sars/genbank gunzip -c /cluster/store5/genbank/data/download/genbank.135.0/gb{phg,vrl,vrt}*.seq.gz | gbToFaRa any.fil viralany.fa viralany.ra viralany.ta stdin gunzip -c /cluster/store5/genbank/data/download/refseq.135.0/cumulative/rscu.gbff.Z | gbToFaRa refseq.fil refseqviral.fa refseqviral.ra refseqviral.ta stdin cat viralany.fa refseqviral.fa > viral.fa cat viralany.ra refseqviral.ra > viral.ra mv viral.fa viralany.fa mv viral.ra viralany.ra hgLoadRna new sc1 hgLoadRna add -type=mRNA sc1 /gbdb/sc1/mrnaViral/viralany.fa viralany.ra hgLoadRna addSeq -abbr=prot sc1 /gbdb/sc1/swissprot/viralProt.fa hgLoadRna addSeq sc1 /gbdb/sc1/otherSARS/otherSARS.fa -type=otherSARS.fa tr acgt ACGT viralany.fa > viralAnyUpper.fa MRNA ALIGNMENT USING BLASTZ and CHAINING (done 5/16/03) cd ~/sars/mrna.blastz.all blastz /cluster/store5/SARS_Coronavirus_TOR2/nib/chr1.nib[1,29736] /cluster/store5/SARS_Coronavirus_TOR2/bed/genbank/viralAllUpper.fa m=40000000 H=2000 K=2000 v=1 M=0 > viralany.out /cluster/home/angie/schwartzbin/xout2lav chr1 29736 /cluster/store5/SARS_Coronavirus_TOR2/bed/mrna.blastz.all/ /cluster/store5/SARS_Coronavirus_TOR2/bed/mrna.blastz.all/lav/viralany.lav /cluster/store5/SARS_Coronavirus_TOR2/bed/mrna.blastz.all/DEF 2> viralany.log lavToPsl lav/viralany.lav psl/viralany.psl pslReps viralany.psl -minAli=0.6 all_mrna.psl /dev/null vi all_mrna.psl change tName to chr1 and delete sars self join hgLoadPsl sc1 all_mrna.psl hgLoadPsl sc1 all_mrna.psl -table=xenoMrna lavToAxt /cluster/store5/SARS_Coronavirus_TOR2/bed/mrna.blastz.all/lav/viralany.lav /cluster/store5/SARS_Coronavirus_TOR2/nib /cluster/store5/SARS_Coronavirus_TOR2/bed/genbank/viralAllUpper.fa axt/viralany.axt -fa axtChain axt/xeno.axt /cluster/store5/SARS_Coronavirus_TOR2/nib/ /cluster/store5/SARS_Coronavirus_TOR2/bed/mrna.blastz.all/nib chain/xeno.chain chainFilter chain/xeno.chain -minScore=3000 -maxScore=2000000 > chain/filter.chain chainToPsl chain/filter.chain S1.len S2.len ../../nib/chr1.nib query.lst psl/xenoBlastzMrna.psl cd psl hgLoadPsl sc1 xenoBlastzMrna.psl DOWNLOADS (done 10/8/04) ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/scApr2003/bigZips # done 6/5/04: zip -j otherSARS.fa.zip /cluster/data/sc1/bed/otherSARS/otherSARS.fa zip -j viralProt.fa.zip /cluster/data/sc1/bed/swissprot/viralProt.fa zip -j viralany.fa.zip /cluster/data/sc1/bed/genbank/viralany.fa # done 10/8/04: zip -j SARS.fa.zip /cluster/data/sc1/chr1.fa