# for emacs: -*- mode: sh; -*-
# This describes how to make the sacCer1 browser database.
#
# The genomic sequence was downloaded Nov. 17, 2003, and is dated
# 10/1/2003 on the Stanford FTP site.  The previous version of the
# genomic sequence is dated Nov. 2002.  I don't see version numbers.

# NOTE: this doc may have genePred loads that fail to include
# the bin column.  Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-----------+-----------------+
# | tableName | type            |
# +-----------+-----------------+
# | sgdGene   | genePred sgdPep |
# +-----------+-----------------+

# Create the directory structure and download sequence from the SGD
# site at Stanford.  (Commands are csh/tcsh, pasted interactively.)
    mkdir /cluster/store6/sacCer1
    ln -s /cluster/store6/sacCer1 /cluster/data/sacCer1
    cd /cluster/data/sacCer1
    mkdir 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M bed download
    cd download
    mkdir chromosomes
    cd chromosomes
    # Fetch one FASTA per chromosome (zero-padded 01..16, plus the
    # mitochondrion, "mt", as named on the SGD FTP site).
    foreach i (01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt)
	wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/chromosomes/fasta/chr$i.fsa
    end
    # Rewrite chr01..chr09 with a UCSC-style ">chrN" header and an
    # unpadded file name (chr01.fsa -> chr1.fa).
    foreach i (1 2 3 4 5 6 7 8 9)
	echo ">chr$i" > chr$i.fa
	grep -v '^>' chr0$i.fsa >> chr$i.fa
    end
    # Two-digit chromosomes (chr10..chr16) keep their names; the csh
    # :r modifier strips the .fsa extension.
    foreach i (chr1?.fsa)
	echo ">$i:r" > $i:r.fa
	grep -v '^>' $i >> $i:r.fa
    end
    # Mitochondrion: chrmt -> chrM.  Must APPEND (>>) the sequence so
    # the header line just written by echo is not clobbered.  (The
    # original transcript used ">" here, which overwrote the header
    # and produced a chrM.fa with no FASTA header line.)
    echo ">chrM" > chrM.fa
    grep -v '^>' chrmt.fsa >> chrM.fa
    # Distribute the per-chromosome .fa files into the 1..16,M dirs
    # created above, then compress the raw downloads.
    foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M)
	mv chr$i.fa ../../$i
    end
    gzip -4 *.fsa

# Download other info from the FTP site.
# Mirror the annotation/support directories from the SGD FTP site
# (presumably run from /cluster/data/sacCer1/download, continuing the
# section above -- confirm before re-running).  -nH drops the host
# name from the saved path; --cut-dirs trims the leading directory
# components so files land directly under the current directory.
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/gene_registry
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/oracle_schema
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/protein_info
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence_similarity
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/systematic_results
# orf_protein sits two levels deeper in the FTP tree, hence the
# larger --cut-dirs value and the explicit recursion depth (-l 2).
wget -nH --cut-dirs=5 -r -l 2 ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein

# Check that the genome is in sync with the annotations
cd /cluster/data/sacCer1
checkSgdSync download
# This showed 5 of 5788 with no ATG.  I sent these to Mike Cherry.

# CREATING DATABASE, MAKE AND LOAD NIBS (DONE 2003-11-24 - Jim)
# NOTE FOR YEAST WE DO NOT REPEAT MASK SEQUENCE.
ssh hgwdev
# Create the empty database (hgsql '' connects with no default db
# selected, which is needed before sacCer1 exists).
echo 'create database sacCer1' | hgsql ''
cd /cluster/data/sacCer1
# Convert the per-chromosome .fa files to .nib format and record
# them in the new database; also capture chromosome sizes.
hgNibSeq sacCer1 /cluster/data/sacCer1/nib */*.fa
faSize -detailed */*.fa > chrom.sizes
mkdir -p /gbdb/sacCer1/nib
ln -s /cluster/data/sacCer1/nib/*.nib /gbdb/sacCer1/nib

# Create a read-only alias in your .cshrc or .profile
alias sacCer1 mysql -u hguser -phguserstuff -A sacCer1

# Use df to make sure there is at least 5 gig of free mySql table space
df -h /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE 2003-11-21 - Jim)
ssh hgwdev
# the following command copies all the data from the table
# grp in the database hg16 to our new database sacCer1
echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
      | hgsql sacCer1

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR YEAST (DONE 2003-11-24 Jim)
# Warning: the genome and organism fields must correspond
# with defaultDb values
echo 'INSERT INTO dbDb \
    (name, description, nibPath, organism, \
     defaultPos, active, orderKey, genome, scientificName, \
     htmlPath, hgNearOk) values \
    ("sacCer1", "Oct. 2003", "/gbdb/sacCer1/nib", "Yeast", \
     "chr2:827700-845800", 1, 65, "Yeast", \
     "Saccharomyces cerevisiae", "/gbdb/sacCer1/html/description.html", \
     0);' \
      | hgsql -h genome-testdb hgcentraltest
echo 'INSERT INTO defaultDb (genome, name) values ("Yeast", "sacCer1");' \
      | hgsql -h genome-testdb hgcentraltest

# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~/src/hg/makeDb/trackDb
cvs up -d -P

# Edit that makefile to add sacCer1 in all the right places and do
make update
# go public on genome-test
#make alpha
cvs commit makefile

# Add trackDb directories
mkdir sacCer
mkdir sacCer/sacCer1
cvs add sacCer
cvs add sacCer/sacCer1
cvs commit sacCer

# MAKE GCPERCENT (DONE 2003-11-24 - Jim)
ssh hgwdev
mkdir /cluster/data/sacCer1/bed/gcPercent
cd /cluster/data/sacCer1/bed/gcPercent
# create and load gcPercent table
hgsql sacCer1 < ~/src/hg/lib/gcPercent.sql
hgGcPercent sacCer1 ../../nib

# RUN TANDEM REPEAT MASKER (DONE 2003-11-24 - Jim)
# Do tandem repeat masking - this takes about 2 minutes.
ssh hgwdev
mkdir -p /cluster/data/sacCer1/bed/simpleRepeat
cd /cluster/data/sacCer1
# The "? ??" globs match the one- and two-character chromosome
# directories (1..9, 10..16, M) created during setup.
foreach i (? ??)
    trfBig $i/chr$i.fa /dev/null \
	-bedAt=/cluster/data/sacCer1/bed/simpleRepeat/chr$i.bed
end

# Load into the database
cd /cluster/data/sacCer1/bed/simpleRepeat
hgLoadBed sacCer1 simpleRepeat *.bed \
    -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
# Loaded 1316 elements of size 16
featureBits sacCer1 simpleRepeat
# 82600648 bases of 2627444668 (3.144%) in intersection
# NOTE(review): the featureBits totals above look copied from a
# human-assembly doc (yeast is ~12 Mb, not 2.6 Gb) -- re-run
# featureBits and record the true figures next build.

# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (done)
ssh hgwdev
# Write ~/kent/src/hg/makeDb/trackDb/sacCer/sacCer1/description.html
# with a description of the assembly and some sample position queries.
chmod a+r ~/kent/src/hg/makeDb/trackDb/sacCer/sacCer1/description.html # Check it in mkdir -p /gbdb/sacCer1/html ln -s /cluster/data/sacCer1/html/description.html /gbdb/sacCer1/html/ # Create line in trackDb/makefile that copies description.html into # /cluster/data/sacCer1/html/description.html # Note, you definitely want to make the symbolic link in /gbdb/sacCer/html # before doing this. # MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 2003-11-24 Jim) # AND SET UP BLAT SERVERS ssh blat10 cd /scratch mkdir sacCer1Nib scp 'hgwdev:/cluster/data/sacCer1/nib/*.nib' sacCer1Nib # Ask admins to set up blat servers and ask which ports they assign. # 8/26/04: set canPcr=1 for untranslated blat server. ssh hgwdev echo 'insert into blatServers values("sacCer1", "blat10", 17788, 0, 1); \ insert into blatServers values("sacCer1", "blat10", 17789, 1, 0);' \ | hgsql -h genome-testdb hgcentraltest # CREATING SGD-BASED KNOWN GENES AND OTHER FEATURES (DONE 2003-12-02 Jim) # Note initially the s_cerevisiae.gff3 file ended up being out of # sync with the genome. Mike Cherry (cherry@genome.stanford.edu) # regenerated it. The format may end up changing in the future though. 
ssh hgwdev
cd /cluster/data/sacCer1/bed
mkdir sgdGene
# Split SGD's GFF3 into per-track files (coding genes, other
# features, descriptions, notes) under the sgdGene directory.
hgSgdGff3 ../download/chromosomal_feature/s_cerevisiae.gff3 sgdGene
cd sgdGene
# NOTE(review): per the header of this doc, this genePred load
# lacks the bin column -- add it next build.
ldHgGene sacCer1 sgdGene codingGenes.gff
hgLoadBed sacCer1 sgdOther otherFeatures.bed \
    -tab -sqlTable=$HOME/kent/src/hg/lib/sgdOther.sql
# Extract protein sequences restricted to the genes we loaded, and
# capture the ORF-to-gene-symbol mapping as a side output.
zcat ../../download/orf_protein/*.fasta.gz \
    | hgSgdPep -restrict=genePred.tab stdin sgdPep.fa symbol.txt
hgPepPred sacCer1 generic sgdPep sgdPep.fa
echo 'create table sgdToName ( \
    name varchar(10) not null, \
    value varchar(10) not null, \
    PRIMARY KEY(name), \
    INDEX (value));' | hgsql sacCer1
echo 'load data local infile "symbol.txt" \
    into table sgdToName;' | hgsql sacCer1
hgsql sacCer1 < $HOME/kent/src/hg/lib/sgdDescription.sql
echo 'load data local infile "descriptions.txt" \
    into table sgdDescription;' | hgsql sacCer1
hgsql sacCer1 < $HOME/kent/src/hg/lib/sgdOtherDescription.sql
echo 'load data local infile "notes.txt" \
    into table sgdOtherDescription;' | hgsql sacCer1

# ADDING SWISSPROT ACCESSION TO KNOWN GENES (DONE 2003-11-25 Jim)
ssh hgwdev
cd /cluster/data/sacCer1/bed/sgdGene
# external_id.tab column 2 is the external DB name; pull out the
# SwissProt rows as (SGD id, SwissProt accession) pairs.
awk '$2 == "SwissProt" {printf("%s\t%s\n", $3, $1);}' \
    ../../download/chromosomal_feature/external_id.tab \
    > sgdToSwissProt.txt
echo 'create table sgdToSwissProt ( \
    name varchar(10) not null, \
    value varchar(10) not null, \
    PRIMARY KEY(name), \
    INDEX (value));' | hgsql sacCer1
echo 'load data local infile "sgdToSwissProt.txt" \
    into table sgdToSwissProt;' | hgsql sacCer1
hgProtIdToGenePred sacCer1 sgdGene sgdToSwissProt name value

# CREATE SGD-BASED CLONE TRACK (DONE 2003-11-25 Jim)
ssh hgwdev
cd /cluster/data/sacCer1/bed
mkdir sgdClone
cd sgdClone
# clone.tab: $4 is 1-based start, hence the $4-1 to convert to
# BED's 0-based half-open coordinates.
awk -F '\t' '{printf("chr%s\t%d\t%d\t%s\t%s\n", $3, $4-1, $5, $2, $1);}' \
    ../../download/chromosomal_feature/clone.tab > sgdClone.bed
hgLoadBed sacCer1 sgdClone sgdClone.bed -tab \
    -sqlTable=$HOME/kent/src/hg/lib/sgdClone.sql

# AUTO UPDATE GENBANK MRNA RUN (Done 2003-11-24 Jim)
# Put the nib's on /cluster/bluearc:
ssh eieio
mkdir -p /cluster/bluearc/sacCer/sacCer1/nib
cp -pR /cluster/data/sacCer1/nib/*.nib /cluster/bluearc/sacCer/sacCer1/nib # Instructions for setting up incremental genbank updates are here: # http://www.soe.ucsc.edu/~markd/genbank-update/doc/initial-load.html # Added static char *sacCerNames[] = {"Saccharomyces cerevisiae", NULL}; and {"dm", dmNames, NULL}, to appropriate parts of src/hg/makeDb/genbank/src/lib/gbGenome.c # Then make and make install cd src/hg/make/makeDb/genbank make PREFIX=/cluster/store5/genbank install # Edit /cluster/data/genbank/etc/genbank.conf and add: # sacCer1 sacCer1.genome = /cluster/bluearc/sacCer/sacCer1/nib/chr*.nib sacCer1.lift = no sacCer1.genbank.mrna.xeno.load = no sacCer1.genbank.est.xeno.load = no sacCer1.downloadDir = sacCer1 # Do the alignments ssh eieio cd /cluster/data/genbank nice bin/gbAlignStep -iserver=no \ -clusterRootDir=/cluster/bluearc/genbank \ -verbose=1 -initial sacCer1 # Load the results from the above ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad sacCer1 # DOING MULTIPLE ALIGNMENT WITH OTHER YEAST (Done Jim) # Grab sequence from all six yeast and massage # it so that fasta records all start with sacXXX # in the directory /cluster/data/sacCer1/bed/otherYeast/align ssh kkr1u00 cd /cluster/data/sacCer1/bed/otherYeast/align # Create directory full of size info mkdir sizes foreach i (sac*) faSize $i -detailed > sizes/$i end # Create some working directories mkdir lav axtAll axtBest mafIn mafRawOut mafOut # Create little shell script to do blastz alignments cat > bz << end #!/bin/csh -ef blastz \$1 \$2 > \$3 end chmod a+x bz # Create job spec to do all blastz alignments ls -1 ../../../*/chr*.fa > sacCer.lst ls -1 sac??? 
> other.lst cat > gsub << end #LOOP bz \$(path1) \$(path2) {check out line+ lav/\$(root1).\$(root2)} #ENDLOOP end gensub2 sacCer.lst other.lst gsub spec # Do parasol blastz run para create spec para try # Do some para checks and if all is well para push #Completed: 102 of 102 jobs #CPU time in finished jobs: 43279s 721.31m 12.02h 0.50d 0.001 y #IO & Wait Time: 540s 9.00m 0.15h 0.01d 0.000 y #Average job time: 430s 7.16m 0.12h 0.00d #Longest job: 1458s 24.30m 0.41h 0.02d #Submission to last job: 3994s 66.57m 1.11h 0.05d # Convert from lav to axt cd lav foreach i (sacPar sacMik sacKud sacBay sacCas sacKlu) foreach j (*.$i) lavToAxt $j ../../../../nib ../$i ../axtAll/$j -fa end echo done $i end cd .. # Run axtBest cd axtAll foreach i (sacPar sacMik sacKud sacBay sacCas sacKlu) foreach j (*.$i) axtBest $j $j:r ../axtBest/$j end echo done $i end cd .. # Convert to maf cd axtBest foreach i (sacPar sacMik sacKud sacBay sacCas sacKlu) foreach j (*.$i) axtToMaf $j ../../../../chrom.sizes ../sizes/$i ../mafIn/$j -tPrefix=sacCer1. end echo done $i end cd .. # Run multiz cd mafIn set mz = /cluster/bin/penn/tba.9.13/multiz foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M) set c = chr$i $mz $c.sacPar $c.sacMik - > ../mafRawOut/$c.maf echo done $c.sacMik foreach s (sacKud sacBay sacCas sacKlu) $mz $c.$s ../mafRawOut/$c.maf - > ../mafRawOut/tmp mv ../mafRawOut/tmp ../mafRawOut/$c.maf echo done $c.$s end end cd .. 
#Load into database
ssh hgwdev
cd /cluster/data/sacCer1/bed/otherYeast/align
ln -s /cluster/store6/sacCer1/bed/otherYeast/align/mafOut /gbdb/sacCer1/multizYeast
# NOTE(review): hgLoadMaf appears here BEFORE the mafFilter step
# below populates mafOut -- the section order was likely scrambled
# in transcription; verify before re-running.
hgLoadMaf sacCer1 multizYeast
# Clean up
cd mafRawOut
# Keep only alignments scoring >= 100 when producing the final
# mafOut files.
# NOTE(review): the closing "end" of this foreach (and probably a
# "cd .." before the rm) seems to have been lost in transcription.
foreach i (*.maf)
    mafFilter -minScore=100 $i > ../mafOut/$i
rm -r axtAll axtBest lav err mafRawOut multizYeast.tab

# ADDING LOCALIZATION AND ABUNDANCE INFO FROM SGD AND
# http://yeastgfp.ucsf.edu (DONE 2003-11-24 - Jim)
ssh hgwdev
cd /cluster/data/sacCer1/bed
mkdir sgdLocalization
cd sgdLocalization
hgSgdGfp sacCer1 ../../download/systematic_results/localization/OS*.tab .

# UPDATE GO DATABASE (DONE -Jim)
ssh hgwdev
cd /cluster/store1/geneOntology
mkdir 20031202
cd 20031202
wget ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest/go_200311-termdb-data.gz
wget ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_sptr.gz
# NOTE(review): intermediate steps appear to be missing here --
# "hgsql mysql < sgdToSgd.tab" as written feeds a .tab file to SQL
# against the "mysql" database, and no step creates the sgdToSgd
# table; the original doc likely had table-creation SQL that was
# lost in transcription.
hgsql mysql < sgdToSgd.tab
echo 'load data local infile "sgdToSgd.tab" into table sgdToSgd' \
    | hgsql sacCer1

# Make expression similarity table.
cd ~/src/hg/near/hgExpDistance
hgExpDistance sacCer1 hgFixed.yeastChoCellCycle hgFixed.yeastChoCellCycleExps choExpDistance

#DOING SELF-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
# Extract peptides from genome database into fasta file
# and create a blast database out of them.
mkdir -p /cluster/data/sacCer1/bed/blastp
cd /cluster/data/sacCer1/bed/blastp
pepPredToFa sacCer1 sgdPep sgdPep.faa
formatdb -i sgdPep.faa -t sgdPep -n sgdPep
cd ..
# Copy over database to /scratch ssh kk mkdir -p /scratch/sacCer1/blastp cp /cluster/data/sacCer1/bed/blastp/sgdPep.p* /scratch/sacCer1/blastp sudo /cluster/install/utilities/updateLocal # Split up fasta file into bite sized chunks for cluster cd /cluster/data/sacCer1/bed/blastp mkdir split faSplit sequence sgdPep.faa 6000 split/kg # Make parasol run directory ssh kk mkdir -p /cluster/data/sacCer1/bed/blastp/self cd /cluster/data/sacCer1/bed/blastp/self mkdir run cd run mkdir out # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in 1 minute if the cluster is free. #Completed: 5743 of 5743 jobs #CPU time in finished jobs: 3570s 59.50m 0.99h 0.04d 0.000 y #IO & Wait Time: 14973s 249.55m 4.16h 0.17d 0.000 y #Average job time: 3s 0.05m 0.00h 0.00d #Longest job: 12s 0.20m 0.00h 0.00d #Submission to last job: 55s 0.92m 0.02h 0.00d # Load into database. This takes a minute ssh hgwdev cd /cluster/data/sacCer1/bed/blastp/self/run/out hgLoadBlastTab sacCer1 sgdBlastTab *.tab #Scanning through 5743 files #Loading database with 52725 rows #DOING C.ELEGANS-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim) # Make parasol run directory ssh kk mkdir -p /cluster/data/sacCer1/bed/blastp/ce1 cd /cluster/data/sacCer1/bed/blastp/ce1 mkdir run cd run mkdir out # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push Completed: 5743 of 5743 jobs #CPU time in finished jobs: 9397s 156.62m 2.61h 0.11d 0.000 y #IO & Wait Time: 21849s 364.15m 6.07h 0.25d 0.001 y #Average job time: 5s 0.09m 0.00h 0.00d #Longest job: 26s 0.43m 0.01h 0.00d #Submission to last job: 75s 1.25m 0.02h 0.00d # Load into database. 
ssh hgwdev cd /cluster/data/sacCer1/bed/blastp/ce1/run/out hgLoadBlastTab sacCer1 ceBlastTab -maxPer=1 *.tab #DOING MOUSE-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim) # Make mouse ortholog column using blastp on mouse known genes. # First make mouse protein database and copy it to iscratch/i # if it doesn't exist already cd /cluster/data/mm4/bed mkdir blastp cd blastp pepPredToFa mm4 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/mm4/blastp) then rm -r /iscratch/i/mm4/blastp endif mkdir -p /iscratch/i/mm4/blastp cp /cluster/data/mm4/bed/blastp/known.p?? /iscratch/i/mm4/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/sacCer1/bed/blastp/mm4 cd /cluster/data/sacCer1/bed/blastp/mm4 mkdir run cd run mkdir out # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push #Completed: 5743 of 5743 jobs #CPU time in finished jobs: 13913s 231.88m 3.86h 0.16d 0.000 y #IO & Wait Time: 27267s 454.45m 7.57h 0.32d 0.001 y #Average job time: 7s 0.12m 0.00h 0.00d #Longest job: 40s 0.67m 0.01h 0.00d #Submission to last job: 80s 1.33m 0.02h 0.00d # Load into database. ssh hgwdev cd /cluster/data/sacCer1/bed/blastp/mm4/run/out hgLoadBlastTab sacCer1 mmBlastTab -maxPer=1 *.tab #DOING ZEBRAFISH-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim) # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. 
# First make protein database and copy it to iscratch/i # if it doesn't exist already cd /cluster/data/dr1/bed mkdir blastp cd blastp wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH2.pep.fa.gz zcat Dan*.pep.fa.gz > ensembl.faa echo "Translation:" > subs.in subs -e ensembl.faa > /dev/null formatdb -i ensembl.faa -t ensembl -n ensembl ssh kkr1u00 if (-e /iscratch/i/dr1/blastp) then rm -r /iscratch/i/dr1/blastp endif mkdir -p /iscratch/i/dr1/blastp cp /cluster/data/dr1/bed/blastp/ensembl.p?? /iscratch/i/dr1/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/sacCer1/bed/blastp/dr1 cd /cluster/data/sacCer1/bed/blastp/dr1 mkdir run cd run mkdir out # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push #Completed: 5743 of 5743 jobs #CPU time in finished jobs: 11135s 185.58m 3.09h 0.13d 0.000 y #IO & Wait Time: 23501s 391.69m 6.53h 0.27d 0.001 y #Average job time: 6s 0.10m 0.00h 0.00d #Longest job: 40s 0.67m 0.01h 0.00d #Submission to last job: 71s 1.18m 0.02h 0.00d # Load into database. ssh hgwdev cd /cluster/data/sacCer1/bed/blastp/dr1/run/out hgLoadBlastTab sacCer1 drBlastTab -maxPer=1 *.tab #DOING FRUITFLY-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim) # Make Drosophila melanagaster ortholog column using blastp on FlyBase. # First make SwissProt protein database and copy it to iscratch/i # if it doesn't exist already cd /cluster/data/dm1/bed mkdir blastp cd blastp pepPredToFa dm1 bdgpGenePep bdgp.faa formatdb -i bdgp.faa -t bdgp -n bdgp ssh kkr1u00 if (-e /iscratch/i/dm1/blastp) then rm -r /iscratch/i/dm1/blastp endif mkdir -p /iscratch/i/dm1/blastp cp /cluster/data/dm1/bed/blastp/bdgp.p?? 
/iscratch/i/dm1/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/sacCer1/bed/blastp/dm1 cd /cluster/data/sacCer1/bed/blastp/dm1 mkdir run cd run mkdir out # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push #Completed: 5743 of 5743 jobs #CPU time in finished jobs: 9749s 162.49m 2.71h 0.11d 0.000 y #IO & Wait Time: 22247s 370.78m 6.18h 0.26d 0.001 y #Average job time: 6s 0.09m 0.00h 0.00d #Longest job: 28s 0.47m 0.01h 0.00d #Submission to last job: 64s 1.07m 0.02h 0.00d # Load into database. ssh hgwdev cd /cluster/data/sacCer1/bed/blastp/dm1/run/out hgLoadBlastTab sacCer1 dmBlastTab -maxPer=1 *.tab #DOING HUMAN-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim) # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to iscratch/i # if it doesn't exist already cd /cluster/data/hg16/bed mkdir blastp cd blastp pepPredToFa hg16 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/hg16/blastp) then rm -r /iscratch/i/hg16/blastp endif mkdir -p /iscratch/i/hg16/blastp cp /cluster/data/hg16/bed/blastp/known.p?? /iscratch/i/hg16/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/sacCer1/bed/blastp/hg16 cd /cluster/data/sacCer1/bed/blastp/hg16 mkdir run cd run mkdir out # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push #Completed: 5743 of 5743 jobs #CPU time in finished jobs: 16096s 268.27m 4.47h 0.19d 0.001 y #IO & Wait Time: 29943s 499.05m 8.32h 0.35d 0.001 y #Average job time: 8s 0.13m 0.00h 0.00d #Longest job: 65s 1.08m 0.02h 0.00d #Submission to last job: 100s 1.67m 0.03h 0.00d # Load into database. 
ssh hgwdev cd /cluster/data/sacCer1/bed/blastp/hg16/run/out hgLoadBlastTab sacCer1 hgBlastTab -maxPer=1 *.tab # LOADING ERAN SEGAL'S REGULATORY MODULE STUFF (DONE 2004-9-22 -Jim) cd /cluster/data/sacCer1/bed mkdir eranSegal cd eranSegal # Copy module_assignments.tab, motif_attributes.tab, motif_modules.tab # from email from into this directory. echo "select name,chrom,txStart,txEnd,strand,500,0 from sgdGene;" | \ hgsql -N sacCer1 > gene_position.tab echo "select name,chrom,chromStart,chromEnd,strand,500,0 from sgdOther;" | \ hgsql -N sacCer1 >> gene_position.tab hgLoadEranModules sacCer1 module_assignments.tab motif_modules.tab \ modules_motifs.gxm modules_motif_positions.tab gene_position.tab # LOADING REGULATORY CODE STUFF (In Progress 2004-9-22 -Jim) cd /cluster/data/sacCer1/bed mkdir harbisonGordon cd harbisonGordon # Create growthCondition.tab by editing by hand # http://jura.wi.mit.edu/young_public/regulatory_code/GrowthEnvironments.html # and then load: hgsql sacCer1 < $HOME/kent/src/hg/lib/growthCondition.sql hgsql sacCer1 -e 'load data local infile "growthCondition.tab" into table growthCondition' # Get GFF files and motif file from downloads section of # http://jura.wi.mit.edu/fraenkel/regulatory_map # Also get v24_probes_041004.GFF from Ben Gordon dbgordon@wi.mit.edu via email. # Get Conditions_Summary.txt also from Ben from email. 
sort v24_probes_041004.GFF | uniq > probes.GFF hgYeastRegCode motifGff Final_InTableS2_v24.motifs probes.GFF Conditions_Summary.txt\ transRegCode.bed transRegCodeMotif.tab transRegCodeProbe.bed transRegCodeCondition.tab hgLoadBed sacCer1 transRegCode transRegCode.bed -sqlTable=$HOME/kent/src/hg/lib/transRegCode.sql hgLoadBed sacCer1 transRegCodeProbe transRegCodeProbe.bed -sqlTable=$HOME/kent/src/hg/lib/transRegCodeProbe.sql -tab hgsql sacCer1 < $HOME/kent/src/hg/lib/transRegCodeCondition.sql hgsql sacCer1 -e 'load data local infile "transRegCodeCondition.tab" into table transRegCodeCondition' hgsql sacCer1 < $HOME/kent/src/hg/lib/transRegCodeMotif.sql hgsql sacCer1 -e 'load data local infile "transRegCodeMotif.tab" into table transRegCodeMotif' # Get rid of some crufty motif placements Ben Gordon doesn't like anymore that aren't in the # transRegCodeMotif table. hgsql sacCer1 -e 'delete from transRegCode where name="CRZ1" or name="ECM22" or name="SFL1"' # Trim some motif columns fixHarbisonMotifs sacCer1 #CREATING DOWNLOADS DIRECTORY ssh hgwdev cd /usr/local/apache/htdocs/goldenPath mkdir sacCer1 cd sacCer1 mkdir bigZips database cd bigZips zip -j chromFa.zip /cluster/data/sacCer1/*/chr*.fa zip -j multizYeast.zip /gbdb/sacCer1/multizYeast/*.maf # Create a README.txt file here. # MITOPRED DATA FOR HGGENE (DONE 7/30/04 angie) ssh hgwdev mkdir /cluster/data/sacCer1/bed/mitopred cd /cluster/data/sacCer1/bed/mitopred wget http://mitopred.sdsc.edu/data/yst_30.out perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' yst_30.out > mitopred.tab cat > mitopred.sql << '_EOF_' # Prediction of nuclear-encoded mito. 
proteins from http://mitopred.sdsc.edu/ CREATE TABLE mitopred ( name varchar(10) not null, # SwissProt ID confidence varchar(8) not null, # Confidence level #Indices PRIMARY KEY(name(6)) ); '_EOF_' # << this line makes emacs coloring happy hgsql sacCer1 < mitopred.sql hgsql sacCer1 -e 'load data local infile "mitopred.tab" into table mitopred' # MAKE Human Proteins track (DONE 2004-08-25 braney) ssh kksilo mkdir -p /cluster/data/sacCer1/blastDb cd /cluster/data/sacCer1/blastDb for i in ../*/*.fa; do ln -s $i .; done for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done rm *.fa *.log ssh kkr1u00 mkdir -p /iscratch/i/sacCer1/blastDb cp /cluster/data/sacCer1/blastDb/* /iscratch/i/sacCer1/blastDb (~kent/bin/iSync) 2>&1 > sync.out mkdir -p /cluster/data/sacCer1/bed/tblastn.hg16KG cd /cluster/data/sacCer1/bed/tblastn.hg16KG ls -1S /iscratch/i/sacCer1/blastDb/*.nsq | sed "s/\.nsq//" > yeast.lst exit # back to kksilo cd /cluster/data/sacCer1/bed/tblastn.hg16KG mkdir kgfa ls -1S /iscratch/i/squirt/ci1/kgfas/*.fa > kg.lst mkdir blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg16/bed/blat.hg16KG/protein.lft warn $f.2 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 exit 0 fi fi fi rm -f $f.1 $f.2 $3.tmp $f.8 exit 1 '_EOF_' chmod +x blastSome gensub2 yeast.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/sacCer1/bed/tblastn.hg16KG para create blastSpec 
para try, push cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=7500 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/sacCer1/bed/tblastn.hg16KG para create chainSpec para push exit # back to kksilo cd /cluster/data/sacCer1/bed/tblastn.hg16KG/blastOut for i in kg?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > ../blastHg16KG.psl ssh hgwdev cd /cluster/data/sacCer1/bed/tblastn.hg16KG hgLoadPsl sacCer1 blastHg16KG.psl exit # back to kksilo rm -rf blastOut # End tblastn # BLAT SGD predicted sacCer1 proteins against sacCer1 (DONE 2004-08-31 braney) ssh hgwdev cd /cluster/data/sacCer1/bed mkdir blat.sacCer1SG cd blat.sacCer1SG pepPredToFa sacCer1 sgdPep sacCer1SG.fa hgPepPred sacCer1 generic blastSGPep00 sacCer1SG.fa ssh kk cd /cluster/data/sacCer1/bed/blat.sacCer1SG cat << '_EOF_' > blatSome #!/bin/csh -fe /cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3 '_EOF_' # << keep emacs happy chmod +x blatSome ls -1S /cluster/bluearc/sacCer/sacCer1/nib/*.nib > yeast.lst mkdir sgfas cd sgfas faSplit sequence ../sacCer1SG.fa 1000 sg cd .. ls -1S sgfas/*.fa > sg.lst cat << '_EOF_' > blatGsub #LOOP blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy gensub2 yeast.lst sg.lst blatGsub blatSpec mkdir psl cd psl foreach i (`cat ../yeast.lst`) mkdir `basename $i .nib` end cd .. 
para create blatSpec para push # Completed: 16490 of 16490 jobs # CPU time in finished jobs: 52796s 879.94m 14.67h 0.61d 0.002 y # IO & Wait Time: 44955s 749.25m 12.49h 0.52d 0.001 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest job: 22s 0.37m 0.01h 0.00d # Submission to last job: 714s 11.90m 0.20h 0.01d ssh kksilo cd /cluster/data/sacCer1/bed/blat.sacCer1SG pslSort dirs raw.psl /tmp psl/* pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cov90.psl /dev/null pslMaxMap -maxMap=1 cov90.psl sacCer1SG.psl pslxToFa sacCer1SG.psl sacCer1SG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft sgName sacCer1 sacCer1SG.psl blastSGRef00 ssh hgwdev cd /cluster/data/sacCer1/bed/blat.sacCer1SG hgsql sacCer1 < ~/kent/src/hg/lib/blastRef.sql echo "rename table blastRef to blastSGRef00" | hgsql sacCer1 echo "load data local infile 'blastSGRef00' into table blastSGRef00" | hgsql sacCer1 # PHASTCONS AND PHASTCONS ELEMENTS (acs, 9/15/04) ssh hgwdev cd /cluster/data/sacCer1/bed/otherYeast mkdir phastCons cd phastCons # split up alignments; no need to use cluster here MAF=/cluster/data/sacCer1/bed/otherYeast/align/mafOut FA=/cluster/data/sacCer1 CHROMS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M" mkdir -p SS for i in $CHROMS ; do \ echo chr${i} ; \ msa_view $MAF/chr${i}.maf -M $FA/$i/chr${i}.fa -i MAF -o SS -O sacCer1,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu > SS/chr${i}.ss ; \ done # produce rough starting model msa_view -i SS --aggregate sacCer1,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu -o SS -z SS/*.ss > all.ss phyloFit -i SS all.ss --tree "((((((sacCer1,sacPar),sacMik),sacKud),sacBay),sacCas),sacKlu)" -o starting-tree # (cheat the branch lengths up a bit, since this represents an # average of conserved and nonconserved sites; we want to # initialize for nonconserved) tree_doctor --scale 2 starting-tree.mod > tmp.mod mv tmp.mod starting-tree.mod # also get average GC content of aligned regions msa_view -i SS all.ss --summary-only #descrip. 
A C G T G+C length all_gaps some_gaps #all.ss 0.3052 0.1954 0.1951 0.3042 0.3906 13251474 0 2260153 # save some I/O and space gzip SS/*.ss # now set up cluster job to estimate model parameters. See # makeHg17.doc for add'l details cat << 'EOF' > doEstimate.sh #!/bin/sh zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.3906 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 75 --target-coverage 0.5 --quiet --log $2 --estimate-trees $3 EOF chmod u+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobs.lst for f in SS/*.ss.gz ; do root=`basename $f .ss.gz` ;\ echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;\ done # run cluster job ssh kk9... para create ... # (takes about 15 minutes) # Now combine parameter estimates. ls TREES/*.cons.mod > cons.txt phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt ls TREES/*.noncons.mod > noncons.txt phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt # Now set up cluster job for computing consevation scores and # predicted elements cat << 'EOF' > doPhastCons.sh #!/bin/sh mkdir -p POSTPROBS ELEMENTS pref=`basename $1 .ss.gz` chr=`echo $pref | awk -F\. '{print $1}'` tmpfile=/scratch/phastCons.$$ zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 75 --target-coverage 0.5 --quiet --seqname $chr --idpref $pref --viterbi ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile gzip -c $tmpfile > POSTPROBS/$pref.pp.gz rm $tmpfile EOF chmod u+x doPhastCons.sh rm -fr POSTPROBS ELEMENTS rm -f jobs2.lst for f in SS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs2.lst ; done # run cluster job ssh kk9... para create, ... 
# (takes about 30 sec) # set up phastConsElements track awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' ELEMENTS/*.bed > all.raw.bed /cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed hgLoadBed sacCer1 phastConsElements all.bed featureBits sacCer1 phastConsElements # 7517116 bases of 12156302 (61.837%) in intersection # set up wiggle mkdir -p wib for i in $CHROMS ; do \ wigAsciiToBinary -chrom=chr${i} -wibFile=wib/chr${i}_phastCons POSTPROBS/chr${i}.pp.gz ;\ done # Changed this path 2004-09-27 so a new set of data could go out # under the same table name - Hiram, was simply: # /gbdb/sacCer1/wib/chr*_phastCons.wib hgLoadWiggle sacCer1 phastCons -pathPrefix=/gbdb/sacCer1/wib/phastCons wib/chr*_phastCons.wig mkdir -p /gbdb/sacCer1/wib/phastCons rm -f /gbdb/sacCer1/wib/phastCons/chr*_phastCons.wib ln -s /cluster/data/sacCer1/bed/otherYeast/phastCons/wib/*.wib /gbdb/sacCer1/wib/phastCons chmod 775 . wib /gbdb/sacCer1/wib/phastCons /gbdb/sacCer1/wib/phastCons/*.wib chmod 664 wib/*.wib # set up full alignment/conservation track rm -rf pwMaf /gbdb/sacCer1/pwMaf mkdir -p pwMaf /gbdb/sacCer1/pwMaf cd pwMaf ;\ for org in sacBay sacCas sacKlu sacKud sacMik sacPar ; do \ mkdir -p $org ; \ cp /cluster/data/sacCer1/bed/otherYeast/align/mafIn/chr*.$org $org ; \ for f in $org/* ; do chr=`basename $f .$org` ; mv $f $org/$chr.maf ; done ;\ ln -s /cluster/data/sacCer1/bed/otherYeast/phastCons/pwMaf/$org /gbdb/sacCer1/pwMaf/${org}_pwMaf ; \ hgLoadMaf sacCer1 -warn ${org}_pwMaf -pathPrefix=/gbdb/sacCer1/pwMaf/${org}_pwMaf ;\ done cd .. 
chmod 755 pwMaf pwMaf/* /gbdb/sacCer1/pwMaf /gbdb/sacCer1/pwMaf/* chmod 644 pwMaf/*/*.maf # trackDb entries: #track multizYeast #shortLabel Conservation #longLabel Seven Species of Saccharomyces, Alignments & Conservation #group compGeno #priority 100 #visibility pack #type wigMaf 0.0 1.0 #maxHeightPixels 100:40:11 #wiggle phastCons #yLineOnOff Off #autoScale Off #pairwise pwMaf #speciesOrder sacPar sacMik sacKud sacBay sacCas sacKlu #track phastConsElements #shortLabel Most Conserved #longLabel PhastCons Conserved Elements (Seven Species of Saccharomyces) #group compGeno #priority 105 #visibility hide #spectrum on #exonArrows off #showTopScorers 200 #type bed 5 . # MAF COVERAGE FIGURES FOR ADAM (DONE 10/18/04 angie) # First, get ranges of target coverage: ssh kksilo mkdir /cluster/data/sacCer1/bed/otherYeast/align/coverage cd /cluster/data/sacCer1/bed/otherYeast/align/coverage cat /cluster/data/sacCer1/bed/otherYeast/align/mafOut/*.maf \ | nice mafRanges -notAllOGap stdin sacCer1 sacCer1.mafRanges.bed # Get pairwise coverage as well. 
foreach other (sacBay sacCas sacKlu sacKud sacMik sacPar) cat /cluster/data/sacCer1/bed/otherYeast/align/mafIn/chr*.$other \ | nice mafRanges -notAllOGap stdin sacCer1 sacCer1.$other.mafRanges.bed end ssh hgwdev cd /cluster/data/sacCer1/bed/otherYeast/align/coverage # To make subsequent intersections a bit quicker, output a bed with # duplicate/overlapping ranges collapsed: nice featureBits sacCer1 sacCer1.mafRanges.bed \ -bed=sacCer1.mafRangesCollapsed.bed #11718319 bases of 12156302 (96.397%) in intersection # mafCoverage barfs on chr15 currently, so pass on this for now: #cat /cluster/data/sacCer1/bed/otherYeast/align/mafOut/*.maf \ #| nice mafCoverage -count=2 sacCer1 stdin > sacCer1.mafCoverage # Intersect maf target coverage with gene regions: nice featureBits sacCer1 -enrichment sgdGene:cds \ sacCer1.mafRangesCollapsed.bed \ -bed=sacCer1.mafCds.bed #sgdGene:cds 69.491%, sacCer1.mafRangesCollapsed.bed 96.397%, both 69.114%, cover 99.46%, enrich 1.03x # No UTR info for yeast, so can't intersect. 
# Intersect pairwise target coverages with gene regions: foreach other (sacBay sacCas sacKlu sacKud sacMik sacPar) nice featureBits sacCer1 -enrichment sgdGene:cds \ sacCer1.$other.mafRanges.bed -bed=sacCer1.${other}Cds.bed \ |& grep -v "table gap doesn't exist" end #sgdGene:cds 69.491%, sacCer1.sacBay.mafRanges.bed 89.478%, both 66.581%, cover 95.81%, enrich 1.07x #sgdGene:cds 69.491%, sacCer1.sacCas.mafRanges.bed 63.359%, both 55.560%, cover 79.95%, enrich 1.26x #sgdGene:cds 69.491%, sacCer1.sacKlu.mafRanges.bed 56.086%, both 49.245%, cover 70.87%, enrich 1.26x #sgdGene:cds 69.491%, sacCer1.sacKud.mafRanges.bed 89.684%, both 64.873%, cover 93.35%, enrich 1.04x #sgdGene:cds 69.491%, sacCer1.sacMik.mafRanges.bed 92.989%, both 67.178%, cover 96.67%, enrich 1.04x #sgdGene:cds 69.491%, sacCer1.sacPar.mafRanges.bed 96.550%, both 68.464%, cover 98.52%, enrich 1.02x # CREATE TABLES TO SUPPORT SHOWING SAM-T02 RESULTS (DONE 1/4/05, Fan) # Kevin just did his monthly update, so rebuild ... (REBUILT 1/7/05 Fan) ssh hgwdev cd /cluster/data/sacCer1/bed mkdir sam cd sam # create script to process 1 SAM subdirectory cat << '_EOF_' >do1Subdir ls /projects/compbio/experiments/protein-predict/yeast/$1/*/summary.html >j1.tmp cat j1.tmp |sed -e 's/yeast\// /g' |sed -e 's/\/summary/ /g' >j2.tmp cat j2.tmp | awk '{print $2}' |sed -e 's/\// /g ' | awk '{print "do1 " $1 " " $2}' >> doall '_EOF_' chmod +x do1Subdir # create high level script to parse all SAM results ls -l /projects/compbio/experiments/protein-predict/yeast | grep drw >j1 cp j1 j2 #edit j2 to remove non SAM result subdirectories vi j2 cat j2 |awk '{print "do1Subdir " $9}' >doAllSubdir chmod +x doAllSubdir rm j1 j2 rm doall doAllSubdir rm j1.tmp j2.tmp # create data needed for the samSubdir table and load them cat doall |awk '{print $3"\t"$2}' >samSubdir.lis hgsql sacCer1 -e "drop table samSubdir" hgsql sacCer1 < ~/src/hg/lib/samSubdir.sql hgsql sacCer1 -e 'load data local infile "samSubdir.lis" into table samSubdir' # create 
script to parse SAM output results in one subdirectory cat << '_EOF_' >do1 echo processing $2 samHit $2 /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.best-scores.rdb >$2.tab '_EOF_' chmod +x do1 chmod +x doall # run the top level script to parse all SAM output results rm *.tab doall # collect all results cat *.tab |sort -u >protHomolog.all # load the results into the protHomolog table hgsql sacCer1 -e "drop table protHomolog" hgsql sacCer1 < ~/src/hg/lib/protHomolog.sql hgsql sacCer1 -e 'load data local infile "protHomolog.all" into table protHomolog' # remove all intermediate .tab files rm *.tab # COPY PART OF SAM-T02 RESULTS TO UCSC GB SERVER (DONE 1/10/05, Fan) # Kevin is going to do monthly updates on SAM-T02 results. # To prevent data inconsistency problems, we now will copy # over some key files and host them on our own server. ssh hgwdev cd /cluster/data/sacCer1/bed/sam mkdir yeast050107 cd yeast050107 # create script to process 1 SAM subdirectory cat << '_EOF_' >do1Subdir mkdir -p yeast/$1 '_EOF_' chmod +x do1Subdir cp -p ../doAllSubdir . mkdir yeast doAllSubdir cp -p ../doall . # create script doall to copy over needed SAM output result files cat << '_EOF_' >do1 echo processing $2 mkdir yeast/$1/$2 cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/*.jpg yeast/$1/$2 cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.w0.5-logo.pdf yeast/$1/$2 cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.dssp-ehl2-logo.pdf yeast/$1/$2 cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.undertaker-align.pdb.gz yeast/$1/$2 '_EOF_' chmod +x do1 doall ln -s ./yeast /usr/local/apache/htdocs/goldenPath/sacCer1/sam # REBUILD MOUSE-PROTEIN ALIGNMENTS AND LOADING (DONE 2005-12-16 Fan) # Update mouse ortholog column using blastp on mouse known genes. 
# First make mouse protein database and copy it to iscratch/i # if it doesn't exist already cd /cluster/data/mm7/bed mkdir blastp cd blastp pepPredToFa mm7 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/mm7/blastp) then rm -r /iscratch/i/mm7/blastp endif mkdir -p /iscratch/i/mm7/blastp cp -p /cluster/data/mm7/bed/blastp/known.p?? /iscratch/i/mm7/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/sacCer1/bed/blastp/mm7 cd /cluster/data/sacCer1/bed/blastp/mm7 mkdir run cd run mkdir out # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try,push,check ... # Completed: 5743 of 5743 jobs # CPU time in finished jobs: 11476s 191.27m 3.19h 0.13d 0.000 y # IO & Wait Time: 15126s 252.10m 4.20h 0.18d 0.000 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 29s 0.48m 0.01h 0.00d # Submission to last job: 700s 11.67m 0.19h 0.01d # Load into database. ssh hgwdev cd /cluster/data/sacCer1/bed/blastp/mm7/run/out hgLoadBlastTab sacCer1 mmBlastTab -maxPer=1 *.tab ########################################################################## # HGNEAR PROTEIN BLAST TABLES (DONE 5/22/06 angie) ssh hgwdev mkdir /cluster/data/sacCer1/bed/hgNearBlastp cd /cluster/data/sacCer1/bed/hgNearBlastp # Re-fetch current sgdPep because the one in ../blastz/sgdPep.faa # has been overwritten with more recent data from SGD. That's OK # for other organisms who will be linking straight to SGD instead # of to sacCer1, but sacCer1 *BlastTab should jive with sgdGene. pepPredToFa sacCer1 sgdPep sgdPep.faa cat << _EOF_ > config.ra # Yeast vs. 
other Gene Sorter orgs that have recently been updated: # human, mouse, fly targetGenesetPrefix sgd targetDb sacCer1 queryDbs hg18 mm8 dm2 sacCer1Fa /cluster/data/sacCer1/bed/hgNearBlastp/sgdPep.faa hg18Fa /cluster/data/hg18/bed/blastp/known.faa mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa buildDir /cluster/data/sacCer1/bed/hgNearBlastp scratchDir /san/sanvol1/scratch/sacCer1HgNearBlastp _EOF_ doHgNearBlastp.pl -noSelf -targetOnly config.ra >& do.log & tail -f do.log # *** hgBlastTab mmBlastTab dmBlastTab ########################################################################## # RELOAD GENBANK (DONE 2006-09-06 markd) # reloaded due to xenos track being dropped confusing joinerCheck nice bin/gbDbLoadStep -drop -initialLoad sacCer1 ######################################################################### # ORegAnno - Open Regulatory Annotations # update Oct 26, 2007; update July 7, 2008 # loaded by Belinda Giardine, in same manner as hg18 ORegAnno track ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2009-02-04 - Hiram) # repMatch = 1024 * sizeof(sacCer1)/sizeof(hg18) # 4.32 = 1024 * (12156302/2881515245) # use 10 to be very conservative ssh hgwdev cd /hive/data/genomes/sacCer1 blat sacCer1.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc \ -repMatch=10 # Wrote 3145 overused 11-mers to 11.ooc # copy this to scratch data cp -p 11.ooc /hive/data/staging/data/sacCer1/sacCer1.11.ooc # make push request to kluster nodes /scratch/data/ ############################################################################# # LIFTOVER TO SacCer2 (DONE - 2009-01-04 - Hiram ) mkdir /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04 cd /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04 # -debug run to create run dir, preview scripts... 
doSameSpeciesLiftOver.pl -debug sacCer1 sacCer2 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \ sacCer1 sacCer2 > do.log 2>&1 # real 3m21.840s ############################################################################# # LIFTOVER TO SacCer3 (DONE - 2011-08-31 - Hiram ) mkdir /hive/data/genomes/sacCer1/bed/blat.sacCer3.2011-08-31 cd /hive/data/genomes/sacCer1/bed/blat.sacCer3.2011-08-31 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug sacCer1 sacCer3 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ sacCer1 sacCer3 > do.log 2>&1 # real 3m31.458s ############################################################################# # uwFootprints: (2009-04-04 markd) # obtained from Noble lab # William Stafford Noble # Xiaoyu Chen mkdir /hive/data/genomes/sacCer1/bed/uwFootprints cd /hive/data/genomes/sacCer1/bed/uwFootprints wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.wig wget http://noble.gs.washington.edu/proj/footprinting/yeast.mappability.bed wget http://noble.gs.washington.edu/proj/footprinting/yeast.footprints.bed chmod a-w yeast.* wigEncode yeast.dnaseI.tagCounts.wig uwFootprintsTagCounts.wig uwFootprintsTagCounts.wib # Converted yeast.dnaseI.tagCounts.wig, upper limit 13798.00, lower limit 1.00 ln -sf /hive/data/genomes/sacCer1/bed/uwFootprints/uwFootprintsTagCounts.wib /gbdb/sacCer1/wib hgLoadWiggle -tmpDir=/data/tmp sacCer1 uwFootprintsTagCounts uwFootprintsTagCounts.wig hgLoadBed -tab -tmpDir=/data/tmp sacCer1 uwFootprintsMappability yeast.mappability.bed # drop yeast.footprints.bed and truncate name to three decimal places tawk 'NR>1{print $1,$2,$3,sprintf("%0.3f",$4)}' yeast.footprints.bed | hgLoadBed -tmpDir=/data/tmp sacCer1 uwFootprintsPrints stdin # lift counts to sacCer2 and give back to UW wget 
http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.bed liftOver yeast.dnaseI.tagCounts.bed /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.bed /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.nolift.bed #############################################################################