# for emacs: -*- mode: sh; -*-

# This file describes the browser build for the Fugu
# genome, Takifugu rubripes, October 2004, Fugu v4.0 from JGI
#
#	"$Id: fr2.txt,v 1.25 2009/09/20 17:16:43 markd Exp $"
#
##########################################################################
### Fetch sequence (DONE - 2007-01-22 - Cory McLean and Hiram)
    ssh kkstore02
    mkdir /cluster/store5/fr2
    ln -s /cluster/store5/fr2 /cluster/data/fr2
    cd /cluster/data/fr2
    mkdir jgi
    cd jgi
    cat << '_EOF_' > fetch.sh
#!/bin/sh
wget --timestamping "ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/v4.0/*"
gunzip fugu.041029.scaffolds.fasta.gz
scaffoldFaToAgp fugu.041029.scaffolds.fasta
gzip fugu.041029.scaffolds.fasta
'_EOF_'
    # << happy emacs
    chmod +x fetch.sh
    ./fetch.sh

##########################################################################
# Run the makeGenomeDb.pl script (DONE - 2007-01-22 - Cory and Hiram)
    # prepare for the makeGenomeDb.pl script:
    ssh hgwdev
    cd /cluster/data/fr2
    # the config.ra file pretty much specifies everything
    cat << '_EOF_' > config.ra
db fr2
scientificName Takifugu rubripes
assemblyDate Oct. 2004
assemblyLabel JGI V4.0
# orderKey = fr1.orderKey - 1
orderKey 464
# NC_004299.1
mitoAcc 23397366
fastaFiles /cluster/data/fr2/jgi/fugu.041029.scaffolds.fasta.gz
dbDbSpeciesDir fugu
agpFiles /cluster/data/fr2/jgi/fugu.041029.scaffolds.agp
commonName Fugu
clade Vertebrate
genomeCladePriority 110
'_EOF_'
    # << happy emacs

    makeGenomeDb.pl config.ra > mgdb.out 2>&1

    # This sequence creates and loads the following tables into the
    # new database fr2:
    #	chr*_gap, chr*_gold, chromInfo, gc5Base, grp
    # And, when you follow the instructions it gives at the end to check in
    # the trackDb files it creates, and you do a make in your trackDb
    # hierarchy, you will then create the trackDb and hgFindSpec tables
    # (with a 'make alpha'), or your specific trackDb_logname and
    # hgFindSpec_logname tables with a simple 'make' with no arguments.
    # The sequence also adds an entry to the dbDb table to turn on this
    # organism in the drop-down menus.

############################################
# Checked in trackDb fr2 files to the source tree.
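    # A quick sanity check one could run at this point (hypothetical command,
    # not recorded in the original build log): confirm the tables that
    # makeGenomeDb.pl is expected to load are actually present in fr2:
    hgsql -N -e 'show tables;' fr2 | egrep 'chromInfo|gap|gold|gc5Base|grp'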
# Instructions are located in ~/kent/src/product/README.trackDb
    ssh hgwdev
    cd /gbdb/fr2
    ln -s /cluster/data/fr2/fr2.unmasked.2bit ./fr2.2bit

################################################
## WINDOWMASKER (DONE - 2007-01-22 - Cory and Hiram)
    cd /cluster/data/fr2/bed/
    ~/kent/src/hg/utils/automation/doWindowMasker.pl fr2 \
        -workhorse=kolossus > wmRun.log 2>&1 &
    # Save the log
    mv wmRun.log WindowMasker.2007-01-22
    # Masking statistics
    cd WindowMasker.2007-01-22
    twoBitToFa fr2.wmsk.2bit stdout | faSize stdin
    # 400525790 bases (49313545 N's 351212245 real 284686886 upper
    # 66525359 lower)
    hgLoadBed -strict fr2 windowmaskerSdust windowmasker.sdust.bed.gz
    # Loaded 1747418 elements of size 3

#########################################################################
# SIMPLE REPEATS (TRF) (DONE 2007-01-22 - Cory and Hiram)
    ssh kolossus
    mkdir /cluster/data/fr2/bed/simpleRepeat
    cd /cluster/data/fr2/bed/simpleRepeat
    # This missed chrM sequence
    time nice -n 19 twoBitToFa ../../fr2.unmasked.2bit stdout \
        | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
            -bedAt=simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 &
    # ~31m

    # Make a filtered version for sequence masking:
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
    splitFileByColumn trfMask.bed trfMaskChrom

    # Load unfiltered repeats into the database:
    ssh hgwdev
    hgLoadBed fr2 simpleRepeat \
        /cluster/data/fr2/bed/simpleRepeat/simpleRepeat.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    featureBits fr2 simpleRepeat
    # 9915088 bases of 393312790 (2.521%) in intersection
    featureBits fr1 simpleRepeat
    # 6801339 bases of 315518167 (2.156%) in intersection

    # recovery attempt to get chrM masked
    time nice -n 19 twoBitToFa -seq=chrM ../../fr2.unmasked.2bit stdout \
        | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
            -bedAt=chrM.simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 &
    # It finds nothing!
    # make an empty trfMaskChrom file:
    touch trfMaskChrom/chrM.bed

#########################################################################
## Add TRF mask to WindowMasker masked sequence
    ssh kkstore02
    cd /cluster/data/fr2
    twoBitMask bed/WindowMasker.2007-01-22/fr2.wmsk.sdust.2bit \
        -add bed/simpleRepeat/trfMask.bed fr2.2bit
    # Received ignorable warning:
    # Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it
    # might contain block coordinates, but this program uses only the first three
    # fields (the entire span -- no support for blocks).

    # Make this the actual file that the browser's "get DNA" function sees
    ssh hgwdev
    cd /gbdb/fr2/
    rm fr2.2bit
    ln -s /cluster/data/fr2/fr2.2bit fr2.2bit

#########################################################################
## Lift our .2bit file against the .lft file to create a scaffold fasta file
    cd /cluster/data/fr2/jkStuff
    cp ../jgi/fugu.041029.scaffolds.lft liftAll.lft
    cp /cluster/data/tetNig1/jkStuff/lft2BitToFa.pl .
    cd ..
    mkdir noUn
    cd noUn
    time ../jkStuff/lft2BitToFa.pl ../fr2.2bit ../jkStuff/liftAll.lft \
        > chrUn.scaffolds.fa
    # real    5m4.520s
    twoBitToFa -seq=chrM ../fr2.2bit chrM.fa
    faToTwoBit *.fa fr2.scaffolds.2bit
    twoBitInfo *.2bit stdout | sort -k2nr > fr2.scaffolds.sizes

##########################################################################
## Move the data out to the cluster
    cd /san/sanvol1/scratch/
    mkdir fr2
    cd fr2
    cp -p /cluster/data/fr2/jkStuff/liftAll.lft .
    cp -p /cluster/data/fr2/chrom.sizes .
    cp -p /cluster/data/fr2/fr2.2bit .
    cp -p /cluster/data/fr2/noUn/*2bit .
    cp -p /cluster/data/fr2/noUn/*sizes .

## Edit the kent/src/hg/makeDb/doc/gasAcu1.txt doc to show what we're going to
## do there.
## ## To display the new chains and nets in the gasAcu1 browser, we had to edit the ## trackDb.ra file to include the new chain and net: ## ## track chainFr2 ## shortLabel $o_db Chain ## longLabel $o_Organism ($o_date/$o_db) Chained Alignments ## group compGeno ## priority 140 ## visibility hide ## color 100,50,0 ## altColor 255,240,200 ## matrix 16 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91 ## spectrum on ## type chain fr2 ## otherDb fr2 ## track netFr2 ## shortLabel $o_db Net ## longLabel $o_Organism ($o_date/$o_db) Alignment Net ## group compGeno ## priority 140.1 ## visibility hide ## spectrum on ## type netAlign fr2 chainFr2 ## otherDb fr2 ## Then we verified that the chain covered a higher percentage of gasAcu1 than fr1 did: ## Look into the gasAcu1.txt file to see the commands going on there. ## Then we needed to update the links to the detail pages of chains and nets: ## /cluster/home/cmclean/kent/src/hg/makeDb/trackDb/chainFr2.html and netFr2.html ########################################################### ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-24 - Hiram and Cory) # This will find repeats within the genome that should not be matched # against. Uses 11-mers. # Use -repMatch=128 (based on size -- for human we use 1024, and # fugu size is ~12% of human judging by gapless fr2 vs. hg18 # genome sizes from featureBits. featureBits hg18 gap featureBits -countGaps hg18 gap ssh kolossus blat /cluster/data/fr2/fr2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/data/fr2/11.ooc -repMatch=128 # Wrote 8898 overused 11-mers to /cluster/data/fr2/11.ooc cp -p /cluster/data/fr2/11.ooc /cluster/bluearc/fugu/fr2 cp -p /cluster/data/fr2/jkStuff/liftAll.lft /cluster/bluearc/fugu/fr2 ######################################################################### # GENBANK AUTO UPDATE (DONE - 2007-01-24 - Hiram and Cory) # Make a liftAll.lft that specifies 5M chunks for genbank: # Actually not necessary, since our chunks are small enough. If we had # to, would look like this: # ssh kkstore05 # cd /cluster/data/fr2 # simplePartition.pl fr2.2bit 5000000 /tmp/fr2 # cat /tmp/fr2/*/*.lft > jkStuff/liftAll.lft # rm -r /tmp/fr2 # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add fr2 just after fr1 # fr2 fr2.serverGenome = /cluster/data/fr2/fr2.2bit fr2.clusterGenome = /cluster/bluearc/fugu/fr2/fr2.2bit fr2.ooc = /cluster/bluearc/fugu/fr2/11.ooc fr2.align.unplacedChroms = chrUn fr2.lift = /cluster/bluearc/fugu/fr2/liftAll.lft fr2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} fr2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} fr2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} fr2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} fr2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} fr2.genbank.mrna.xeno.loadDesc = yes fr2.refseq.mrna.native.load = no fr2.refseq.mrna.xeno.load = no cvs ci -m "Added fr2." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. Not necessary here since # fugu already exists. # # cvs ci -m "Added Oryzias latipes (medaka)." src/lib/gbGenome.c # make install-server cd /cluster/data/genbank screen # This is a call to a script that will push our jobs out to the cluster # since it's a big job. 
nice -n +19 bin/gbAlignStep -initial fr2 & # logFile: var/build/logs/2007.01.24-12:09:11.fr2.initalign.log # We had an error because machine kkr4u02 was unable to ssh to. This # happened in the middle of the -run subroutine. para problems > problems.out 2>&1 grep host problems.out | sort | uniq -c | sort -n # 62021 host: kkr4u02.kilokluster.ucsc.edu parasol list machines | grep kkr4u02 # kkr4u02.kilokluster.ucsc.edu parasol remove machine kkr4u02.kilokluster.ucsc.edu "unable to ssh" para push -retries=5 # updated job database on disk # Pushed Jobs: 14820 # Retried jobs: 14820 # We still need to finish the rest of the gbAlignStep since it failed # because the -run subroutine did not finish correctly. We must manually # call the rest of the routine. nice -n 19 bin/gbAlignStep -continue=finish -initial fr2 & # logFile: var/build/logs/2007.01.24-15:48:19.fr2.initalign.log # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad fr2 # enable daily alignment and update of hgwdev (DONE - 2007-02-05 - Hiram) cd ~/kent/src/hg/makeDb/genbank cvsup # add fr2 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added fr2." etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### # ENSEMBL GENES (DONE - 2007-01-24 - Hiram and Cory) # Good luck with the biomart interface. It seems to be different each time # it is used. mkdir /cluster/data/fr2/bed/ensembl cd /cluster/data/fr2/bed/ensembl # Get the ensembl gene data from # http://www.biomart.org/biomart/martview/ # # Follow this sequence through the pages: # The default dataset will be Human. In the right side # frame, if you scroll it up and down, you will come to a pull-down # dataset menu where you can select the organism. There appear to be # two pull-down menus in this frame, one for: # Database: ENSEMBL 42 GENE (SANGER) # and you can select the second: # Dataset: Takifugu rubripes genes (FUGU4) # After selecting the Dataset, in the left frame, click on the # "Attributes" (Structures) label, now the right frame changes to radio # buttons, Features, Sequences, Structures # Click the "Structures" button, the three optional buttons can be # expanded to select elements of these: # REGION: # GENE: # EXON: # REGION: has Chromosome checked # GENE: has Ensembl Gene ID and Biotype selected # EXON: has no selections # In the GENE: menu: # Unselect Biotype # and Select # Ensembl Gene ID # Ensembl Transcript ID # External Gene ID # Click on the "Filters" section on the left-side frame. # Under GENE in the right-side frame, select Gene type checkbox and # highlight "protein_coding". # Check: Click "Count" from top buttons, and 22,008/22,409 genes will # be reported. # Then, in the black menu bar above these frames, click the "Results" # it shows the first ten rows. For this organism, there appear to be no # External Gene ID in the HTML view. Change the "Display maximum" # "rows as" pull-down # to GFF, and use the "Export all results to" pull-down to # Compressed web file (notify by email), press the "Go" button to download. # After retrieving the URL where our data is located # (http://www.biomart.org/biomart/martresults?file=martquery_0124222017_859.txt.gz), # we get it from that place: wget http://www.biomart.org/biomart/martresults?file=martquery_0124222017_859.txt.gz \ -O fr2.ensembl42.gff.gz # Ensemble gives us coordinates on each scaffold relative to the beginning # of *that scaffold.* We want to have them in chrUn coordinates instead. 
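    # For reference, each line of a liftUp .lft file is:
    #   offset  oldName  oldSize  newName  newSize
    # so a feature on a scaffold is shifted by that scaffold's offset on chrUn.
    # Illustrative only (made-up scaffold name and offset, not from liftAll.lft):
    #   a GFF exon at scaffold_123:500-900, with scaffold_123 placed at offset
    #   1,000,000 on chrUn, becomes chrUn:1000500-1000900 after liftUp.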
# We use the liftUp program on the Ensembl data, with our liftAll.lft file # we created a long time ago, to achieve this, and name it # fr2.ensembl42.protein_coding.gff since we only took the protein coding # genes. zcat fr2.ensembl42.gff.gz | liftUp -type=.gff fr2.ensembl42.protein_coding.gff \ ../../jkStuff/liftAll.lft error stdin # On other files, sometimes we need to massage the input names to match # the UCSC naming conventions. Below is an example, though it was not # necessary today. # Add "chr" to front of each line in the gene data gtf file to make # it compatible with our software, and liftUp to get scaffolds onto chrUn # The scaffolds and ultracontigs mentioned in this are not the scaffolds # we have in our lift file for chrUn ... can't use them. # zcat oryLat1.ensembl42.gff.gz | egrep -v "scaffold|ultracontig" \ # | sed -e "s/^\([0-9][0-9]*\)/chr\1/" | gzip -c > ensembl42.gff.gz # Verify names OK: # zcat ensembl42.gff.gz | awk '{print $1}' | sort | uniq -c # 22938 chr1 # 15887 chr10 # 18645 chr11 # 20162 chr12 # 21474 chr13 # 21302 chr14 # 20148 chr15 # 22978 chr16 # 26164 chr17 # 13671 chr18 # 17109 chr19 # 10988 chr2 # 15705 chr20 # 21361 chr21 # 21030 chr22 # 12984 chr23 # 19009 chr24 # 21767 chr3 # 26204 chr4 # 24335 chr5 # 24300 chr6 # 24329 chr7 # 23323 chr8 # 27795 chr9 cd /cluster/data/fr2/bed/ensembl ldHgGene -gtf -genePredExt fr2 ensGene fr2.ensembl42.protein_coding.gff # Read 22102 transcripts in 407112 lines in 1 files # 22102 groups 1 seqs 1 sources 4 feature types # 22102 gene predictions # The genome-test database will already populate this into our browser # since Ensembl genes are a default track. However, the link to the # Ensembl website is broken because it automatically assumes we are # referencing the human genome. We need to edit the # ~/kent/src/hg/makeDb/trackDb/fugu/trackDb.ra file in the ensGene track # and update the URL to point to the correct spot. In this case, we want # the incorrect URL: # url http://www.ensembl.org/perl/transview?transcript=$$track genscan # to the correct URL: # url http://dec2006.archive.ensembl.org/Takifugu_rubripes/transview?transcript=$$ # Also, we add the following lines at the end # urlName gene # archive dec2006 ## Now we want to get the proteins that are derived from these genes as a link at # the top of the detail browser. # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # for the Attributes, choose the "Features" box, and then In "GENE:" # select Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Results, choose txt output and a Compressed web file (notify by email). # Save this as # ensGtp42.txt.gz wget http://www.biomart.org/biomart/martresults?file=martquery_0124225551_471.txt.gz \ -O ensGtp42.txt.gz # Strip the first lines which is merely column heading labels zcat ensGtp42.txt.gz | headRest 1 stdin | sed -e "s/ /\t/g" > ensGtp.txt # We want to load our genes, but unfortunately the gene names are larger # than the standard ensGtp.sql file that the following command handles: # hgLoadSqlTab fr2 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt # Instead, we make our own temporary .sql file to handle the insert. sed -e "s/20/21/; s/18/21/" ~/kent/src/hg/lib/ensGtp.sql > ensGtp.bigcols.sql; # And then perform the insert. 
hgLoadSqlTab fr2 ensGtp ensGtp.bigcols.sql ensGtp.txt rm ensGtp.bigcols.sql hgsql -N -e "select count(*) from ensGtp;" fr2 # +-------+ # | 22102 | # +-------+ wc -l ensGtp.txt # 22102 ensGtp.txt # Load Ensembl peptides: # Get them from ensembl as above in the gene section except for # for the Attributes, choose the "Sequences" box, and then # SEQUENCES: Peptide and # Header Information "Ensembl Transcript ID" # Results output as FASTA # Save file as ensPep.fa.gz # XXX Still waiting for the proteins to be sent to us so that we can add # this part. # hgPepPred fr2 generic ensPep ensPep.fa ############################################################################ # BLATSERVERS ENTRY (DONE - 2007-01-25 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("fr2", "blat3", "17786", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("fr2", "blat3", "17787", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### # BLASTZ/CHAIN/NET Hg18 (DONE - 2007-01-26 - Hiram) ## Swap back to fr2 mkdir /cluster/data/fr2/bed/blastz.hg18.swap cd /cluster/data/fr2/bed/blastz.hg18.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/hg18/bed/blastz.fr2.2007-01-24/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -swap > swap.log 2>&1 & # real 47m14.554s ssh hgwdev cd /cluster/data/fr2/bed/blastz.hg18.swap time nice -n +19 featureBits fr2 chainHg18Link \ > fb.fr2.chainHg18Link.txt 2>&1 & # 42875664 bases of 393312790 (10.901%) in intersection ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-01-26) ssh kkstore02 bash # if not using bash shell already mkdir /cluster/data/fr2/blastDb cd /cluster/data/fr2 zcat jgi/fugu.041029.scaffolds.fasta.gz > temp.fa faSplit sequence temp.fa 500 blastDb/ rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /san/sanvol1/scratch/fr2/blastDb cd /cluster/data/fr2/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/fr2/blastDb done mkdir -p /cluster/data/fr2/bed/tblastn.hg18KG cd /cluster/data/fr2/bed/tblastn.hg18KG echo /san/sanvol1/scratch/fr2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 495 query.lst # we want around 100000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(100000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(100000/495) = 181.798650 mkdir -p /cluster/bluearc/fr2/bed/tblastn.hg18KG/kgfa split -l 180 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/fr2/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/fr2/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/fr2/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit # back to bash ssh pk cd /cluster/data/fr2/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 101475 of 101475 jobs # CPU time in finished jobs: 3924977s 65416.29m 1090.27h 45.43d 0.124 y # IO & Wait Time: 1118884s 18648.06m 310.80h 12.95d 0.035 y # Average job time: 50s 0.83m 0.01h 0.00d # Longest finished job: 3925s 65.42m 1.09h 0.05d # Submission to last job: 46557s 775.95m 12.93h 0.54d ssh kkstore04 cd /cluster/data/fr2/bed/tblastn.hg18KG tcsh mkdir chainRun cd chainRun cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/fr2/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 205 of 205 jobs # CPU time in finished jobs: 2262s 37.70m 0.63h 0.03d 0.000 y # IO & Wait Time: 40989s 683.15m 11.39h 0.47d 0.001 y # Average job time: 211s 3.52m 0.06h 0.00d # Longest finished job: 360s 6.00m 0.10h 0.00d # Submission to last job: 1492s 24.87m 0.41h 0.02d ssh kkstore04 cd /cluster/data/fr2/bed/tblastn.hg18KG/blastOut bash # if using another shell for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/fr2/bed/tblastn.hg18KG/unliftBlastHg18KG.psl cd .. 
    pslCheck unliftBlastHg18KG.psl
    liftUp blastHg18KG.psl ../../jkStuff/liftAll.lft warn unliftBlastHg18KG.psl

    # load table
    ssh hgwdev
    cd /cluster/data/fr2/bed/tblastn.hg18KG
    hgLoadPsl fr2 blastHg18KG.psl

    # check coverage
    featureBits fr2 blastHg18KG
    # 19761405 bases of 393312790 (5.024%) in intersection
    featureBits fr2 ensGene:cds blastHg18KG -enrichment
    # ensGene:cds 8.216%, blastHg18KG 5.024%, both 4.401%, cover 53.57%,
    # enrich 10.66x

    ssh kkstore04
    rm -rf /cluster/data/fr2/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut
    #end tblastn

#########################################################################
# BLASTZ/CHAIN/NET TetNig1 SWAP (DONE - 2007-01-29 - Hiram)
    ## Align to fr2 scaffolds,
    ## results lifted to fr2 chrUn coordinates
    ## Swap to fr2
    mkdir /cluster/data/fr2/bed/blastz.tetNig1.swap
    cd /cluster/data/fr2/bed/blastz.tetNig1.swap
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -bigClusterHub=pk -swap > swap.log 2>&1 &
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -continue=net -bigClusterHub=pk -swap > net_swap.log 2>&1 &
    # real    40m40.471s
    ssh hgwdev
    cd /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25
    time nice -n +19 featureBits tetNig1 chainFr2Link \
        > fb.tetNig1.chainFr2Link.txt 2>&1
    # 246828605 bases of 342403326 (72.087%) in intersection
    cd /cluster/data/fr2/bed/blastz.tetNig1.swap
    time nice -n +19 featureBits fr2 chainTetNig1Link \
        > fb.fr2.chainTetNig1.txt 2>&1
    # 247086553 bases of 393312790 (62.822%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET gasAcu1 swap (DONE - 2007-01-23 - Hiram)
    ## no chrUn in gasAcu1, and align to fr2 scaffolds,
    ## results lifted to fr2 chrUn coordinates
    ssh kkstore05
    mkdir /cluster/data/fr2/bed/blastz.gasAcu1.swap
    cd /cluster/data/fr2/bed/blastz.gasAcu1.swap
    time doBlastzChainNet.pl \
        /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -swap -bigClusterHub=pk > swap.log 2>&1 &
    # real    24m33.761s
    ssh hgwdev
    cd /cluster/data/fr2/bed/blastz.gasAcu1.swap
    time nice -n 19 featureBits fr2 chainGasAcu1Link \
        > fb.fr2.chainGasAcu1Link.txt 2>&1 &
    # 158383996 bases of 393312790 (40.269%) in intersection

#########################################################################
## BLASTZ/CHAIN/NET to gasAcu1 chrUn - the above swap does not include
## gasAcu1 chrUn - thus its browser would be empty for any fr2 alignments.
## This procedure will get fr2 alignments added to the gasAcu1 browser
## for chrUn
    ssh kkstore02
    mkdir /cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31
    cd /cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31
    cat << '_EOF_' > DEF
# Stickleback vs.
# Fugu, Stickleback chrUn in contigs only, to fugu contigs
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Fugu fr2
#	Align to the scaffolds, results lifted up to chrUn.sdTrf
#	coordinates
SEQ1_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ1_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ1_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ1_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ1_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ1_CHUNK=20000000
SEQ1_LIMIT=30
SEQ1_LAP=10000

# QUERY: Stickleback gasAcu1 chrUn only
#	chrUn in contigs for this alignment run
#	The largest is 418,000 bases and there are 5,000 of them.
SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
SEQ2_CHUNK=1000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -blastzOutRoot /cluster/bluearc/fr2GasAcu1 \
        -stop=net -bigClusterHub=pk > do.log 2>&1 &
    # real    73m13.156s

    ## swap back to gasAcu1
    mkdir /cluster/data/gasAcu1/bed/blastz.fr2.swap
    cd /cluster/data/gasAcu1/bed/blastz.fr2.swap
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -swap -stop=chainMerge -bigClusterHub=pk > swap.log 2>&1 &
    ## Now, with that chain result in hand, place it manually back in with
    ## the full chroms chains and re-run the nets and so forth.

#########################################################################
## swap oryLat1 results back to fr2 (DONE - 2007-01-24 - Hiram)
    mkdir /cluster/data/fr2/bed/blastz.oryLat1.swap
    cd /cluster/data/fr2/bed/blastz.oryLat1.swap
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/oryLat1/bed/blastz.fr2.2007-01-24/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -swap -bigClusterHub=pk > swap.log 2>&1 &
    ssh hgwdev
    cd /cluster/data/oryLat1/bed/blastz.fr2.2007-01-24
    time nice -n +19 featureBits oryLat1 chainFr2Link \
        > fb.oryLat1.chainFr2Link.txt 2>&1
    # 177508958 bases of 700386597 (25.344%) in intersection
    cd /cluster/data/fr2/bed/blastz.oryLat1.swap
    time nice -n +19 featureBits fr2 chainOryLat1Link \
        > fb.fr2.chainOryLat1Link.txt 2>&1
    # 143996507 bases of 393312790 (36.611%) in intersection

#########################################################################
## 5-Way Multiz (DONE - 2007-02-03 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/fr2/bed/multiz5way
    cd /cluster/data/fr2/bed/multiz5way
    cp /cluster/data/gasAcu1/bed/multiz8way/8way.nh .
    /cluster/bin/phast/tree_doctor \
        --prune human_hg18,mouse_mm8,chicken_galGal3 8way.nh
    # use the output of that to manually construct this tree.
# Arbitrarily set 0.2 distances for this added branch # All other distances remain as specified in the 17way.nh cat << '_EOF_' > 5way.nh (((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2, (stickleback_gasAcu1:0.2,medaka_oryLat1:0.2):0.2):0.292961, zebrafish_danRer4:0.782561); '_EOF_' # << happy emacs # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/fr2_5way.gif /cluster/bin/phast/all_dists 5way.nh > 5way.distances.txt # Use this output to create the table below grep -y fr2 5way.distances.txt | sort -k3,3n # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainGasAcu1Link chain linearGap # distance on fr2 on other minScore # 1 0.4393 - tetraodon tetNig1 (% 62.822) (% 72.087) 2000 loose # 2 0.8399 - medaka oryLat1 (% 36.611) (% 25.344) 2000 loose # 3 0.8399 - stickleback gasAcu1 (% 40.269) (% 37.574) 2000 loose # 4 1.5156 - zebrafish danRer4 (% 20.585) (% 8.543) 5000 loose cd /cluster/data/fr2/bed/multiz5way # bash shell syntax here ... export H=/cluster/data/fr2/bed mkdir mafLinks for G in oryLat1 tetNig1 gasAcu1 danRer4 do mkdir mafLinks/$G if [ ! -d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" exit 255 fi ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # Copy MAFs to some appropriate NFS server for kluster run ssh kkstore02 mkdir /san/sanvol1/scratch/fr2/multiz5way cd /san/sanvol1/scratch/fr2/multiz5way time rsync -a --copy-links --progress \ /cluster/data/fr2/bed/multiz5way/mafLinks/ . mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn # the autoMultiz cluster run, there are only 2 jobs, kki is perfect ssh kki cd /cluster/data/fr2/bed/multiz5way/ # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 5way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir run maf cd run # NOTE: you need to set the db and multiz dirname properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = fr2 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/$db/multiz5way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz5way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP autoMultiz $(root1) {check out line+ /cluster/data/fr2/bed/multiz5way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs awk '{print $1}' /cluster/data/fr2/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # 2 jobs para try ... check ... push ... etc ... 
# Completed: 2 of 2 jobs # CPU time in finished jobs: 7246s 120.77m 2.01h 0.08d 0.000 y # IO & Wait Time: 357s 5.94m 0.10h 0.00d 0.000 y # Average job time: 3802s 63.36m 1.06h 0.04d # Longest finished job: 7601s 126.68m 2.11h 0.09d # Submission to last job: 7601s 126.68m 2.11h 0.09d # combine results into a single file for loading and gbdb reference ssh kkstore02 cd /cluster/data/fr2/bed/multiz5way nice -n +19 catDir maf > multiz5way.maf # makes a 1.3 Gb file: # -rw-rw-r-- 1 1341786986 Feb 3 19:52 multiz5way.maf ############################################################################ # ANNOTATE MULTIZ5WAY MAF AND LOAD TABLES (DONE - 2007-02-03 - Hiram) ## re-done 2007-03-27 with corrected nBeds and sizes files - Hiram ssh kolossus mkdir /cluster/data/fr2/bed/multiz5way/anno cd /cluster/data/fr2/bed/multiz5way/anno mkdir maf run cd run rm -f sizes nBeds twoBitInfo -nBed /cluster/data/fr2/fr2.{2bit,N.bed} for DB in `cat /cluster/data/fr2/bed/multiz5way/species.lst` do ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds echo ${DB}.len >> sizes echo $DB done echo '#!/bin/csh -ef' > jobs.csh echo date >> jobs.csh # do smaller jobs first so you can see some progress immediately: for F in `ls -1rS ../../maf/*.maf` do echo mafAddIRows -nBeds=nBeds -sizes=sizes $F \ /cluster/data/fr2/fr2.2bit ../maf/`basename $F` >> jobs.csh echo "echo $F" >> jobs.csh done echo date >> jobs.csh chmod +x jobs.csh time nice -n +19 ./jobs.csh > jobs.log 2>&1 & # to watch progress; tail -f jobs.log # real 165m43.716s # Load anno/maf ssh hgwdev cd /cluster/data/fr2/bed/multiz5way/anno/maf mkdir -p /gbdb/fr2/multiz5way/anno/maf ln -s /cluster/data/fr2/bed/multiz5way/anno/maf/*.maf \ /gbdb/fr2/multiz5way/anno/maf time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/fr2/multiz5way/anno/maf fr2 multiz5way # Loaded 1469786 mafs in 2 files from /gbdb/fr2/multiz5way/anno/maf # real 0m41.123s # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /cluster/data/fr2/bed/multiz5way/anno/maf time cat *.maf | \ nice -n +19 hgLoadMafSummary fr2 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz5waySummary stdin # Created 146928 summary blocks from 3159326 components # and 1469786 mafs from stdin # real 0m58.171s ssh hgwdev cd /cluster/data/fr2/bed/multiz5way/anno/maf sed -e 's/mafSummary/multiz5waySummary/' ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz5waySummary.sql time nice -n +19 hgLoadSqlTab fr2 multiz5waySummary \ ~/kent/src/hg/lib/mafSummary.sql multiz5waySummary.tab # real 0m1.941 ####################################################################### # MULTIZ5WAY MAF FRAMES (DONE - 2007-02-03 - Hiram) ssh hgwdev mkdir /cluster/data/fr2/bed/multiz5way/frames cd /cluster/data/fr2/bed/multiz5way/frames mkdir genes # The following is adapted from the gasAcu1 sequence #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. 
single select to get cds+psl, then split that up and # create genePred # using refGene for danRer4 for qDB in danRer4 do geneTbl=refGene echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz rm -f $tmpExt done # using genscan for tetNig1 # using ensGene for gasAcu1, oryLat1 and fr2 # genePreds; (must keep only the first 10 columns for knownGene) for qDB in gasAcu1 oryLat1 fr2 tetNig1 do if [ $qDB = "gasAcu1" -o $qDB = "oryLat1" -o $qDB = "fr2" ]; then geneTbl=ensGene elif [ $qDB = "tetNig1" ]; then geneTbl=genscan else exit 255 fi echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz rm -f $tmpExt done ### ssh kkstore02 cd /cluster/data/fr2/bed/multiz5way/frames time cat ../maf/*.maf | nice -n +19 genePredToMafFrames fr2 stdin stdout fr2 genes/fr2.gp.gz gasAcu1 genes/gasAcu1.gp.gz oryLat1 genes/oryLat1.gp.gz tetNig1 genes/tetNig1.gp.gz danRer4 genes/danRer4.gp.gz | gzip > multiz5way.mafFrames.gz # real 0m52.606 ssh hgwdev cd /cluster/data/fr2/bed/multiz5way/frames time nice -n +19 hgLoadMafFrames fr2 multiz5wayFrames \ multiz5way.mafFrames.gz # real 0m20.580s ######################################################################### # Adding automatic generation of upstream files (DONE - 2009-08-13 - Hiram) # edit src/hg/makeDb/genbank/genbank.conf to add: fr2.upstreamGeneTbl = ensGene fr2.upstreamMaf = multiz5way /hive/data/genomes/fr2/bed/multiz5way/species.lst ######################################################################### # MULTIZ5WAY DOWNLOADABLES (DONE - 2007-02-05 - Hiram) ssh hgwdev mkdir /cluster/data/fr2/bed/multiz5way/mafDownloads cd /cluster/data/fr2/bed/multiz5way # upstream mafs # rebuilt 2007-12-21 to fix difficulty in mafFrags when species.lst # did not have fr2 as the first one # There isn't any refGene table, using ensGene instead for S in 1000 2000 5000 do echo "making upstream${S}.maf" nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 fr2 \ ensGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags fr2 multiz5way \ stdin stdout -orgs=species.lst \ | gzip -c > mafDownloads/ensGene.upstream${S}.maf.gz echo "done ensGene.upstream${S}.maf.gz" done ssh kkstore05 cd /cluster/data/fr2/bed/multiz5way ## re-done 2007-03-27 after correction to annotation step - Hiram time for M in anno/maf/chr*.maf do B=`basename $M` nice -n +19 gzip -c ${M} > mafDownloads/${B}.gz echo ${B}.gz done done # real 4m39.440s cd mafDownloads nice -n +19 md5sum *.maf.gz > md5sum.txt # Make a README.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/fr2/multiz5way cd /usr/local/apache/htdocs/goldenPath/fr2/multiz5way ln -s /cluster/data/fr2/bed/multiz5way/mafDownloads/{*.gz,*.txt} . 
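    # Hypothetical spot check (not part of the original log): after the
    # symlinks are in place, the delivered maf downloads should still match
    # the md5sum.txt built above.
    cd /usr/local/apache/htdocs/goldenPath/fr2/multiz5way
    md5sum -c md5sum.txt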
############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS # (DONE - 2007-02-05 - Hiram) # Estimate phastCons parameters ssh kkstore05 mkdir /cluster/data/fr2/bed/multiz5way/cons cd /cluster/data/fr2/bed/multiz5way/cons # Create a starting-tree.mod based on one 25,000,000 window of chrUn time nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrUn.maf \ --refseq ../../../Un/chrUn.fa --in-format MAF \ --windows 25000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # real 4m27.989s time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS \ s1.174992629-199991578.ss \ --tree "(((tetNig1,fr2),(gasAcu1,oryLat1)),danRer4)" \ --out-root starting-tree # As an experiment, ran all of these ss files through this prediction # and the resulting stats of the add up the C and G: ## min Q1 median Q3 max mean N sum stddev # 0.45 0.457 0.463 0.469 0.479 0.461941 17 7.853 0.00759621 # Using the one closest to the mean: s1.174992629-199991578.ss rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.463 # This 0.463 is used in the --gc argument below ## the fa files are needed for the sequence and they are created during # this loop if they haven't been done before # Create SS files on san filesystem ssh kkstore05 mkdir -p /san/sanvol1/scratch/fr2/cons/ss cd /san/sanvol1/scratch/fr2/cons/ss time for C in \ `awk '{print $1}' /cluster/data/fr2/chrom.sizes | sed -e "s/chr//"` do mkdir -p chr${C} echo msa_split $C nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/fr2/bed/multiz5way/maf/chr${C}.maf \ --refseq /cluster/data/fr2/${C}/chr${C}.fa \ --in-format MAF --windows 2500000,0 --between-blocks 5000 \ --out-format SS --out-root chr${C}/chr${C} done & # real 4m2.736s # Create a random list of 50 1 mb regions cd /san/sanvol1/scratch/fr2/cons/ss ls -1l chr*/chr*.ss \ | awk '$5 > 4000000 {print $9;}' \ | randomLines stdin 50 ../randomSs.list # Set up parasol directory to calculate trees on these 50 regions ssh pk mkdir /san/sanvol1/scratch/fr2/cons/treeRun1 cd /san/sanvol1/scratch/fr2/cons/treeRun1 mkdir tree log # Tuning this loop should come back to here to recalculate # Create little script that calls phastCons with right arguments # --target-coverage of 0.20 is about right for mouse, will be # tuned exactly below cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/fr2/bed/multiz5way/cons/starting-tree.mod \ --gc 0.463 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length 10 --target-coverage 0.20 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP ./makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # Make cluster job and run it gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 50 of 50 jobs # CPU time in finished jobs: 5204s 86.74m 1.45h 0.06d 0.000 y # IO & Wait Time: 204s 3.39m 0.06h 0.00d 0.000 y # Average job time: 108s 1.80m 0.03h 0.00d # Longest finished job: 138s 2.30m 0.04h 0.00d # Submission to last job: 141s 2.35m 0.04h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. 
This must be done separately for the conserved # and nonconserved models ssh pk cd /san/sanvol1/scratch/fr2/cons/treeRun1 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ave.cons.mod > cons_summary.txt 2>&1 & ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ave.noncons.mod > noncons_summary.txt cp -p ave.*.mod .. cd .. cp -p ave.*.mod /cluster/data/fr2/bed/multiz5way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument time /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.7834 \ 0.20 10 ave.{cons,noncons}.mod ## 0.20 10 ( Solving for new omega: 10.000000 10.467305 10.449210 10.449184 ) Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000 Relative entropy: H=0.903779 bits/site Expected min. length: L_min=10.726004 sites Expected max. length: L_max=6.735382 sites Phylogenetic information threshold: PIT=L_min*H=9.693936 bits Recommended expected length: omega=10.449184 sites (for L_min*H=9.783400) ssh pk # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/fr2/cons/consRun1 cd /san/sanvol1/scratch/fr2/cons/consRun1 mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../ave.{cons,noncons}.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 10 --target-coverage 0.20 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.{cons,noncons}.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << happy emacs chmod a+x doPhast # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << '_EOF_' #LOOP ./doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. # These jobs are very fast and very I/O intensive, even on the san # they will hang it up as they work at full tilt. 
# Completed: 162 of 162 jobs # CPU time in finished jobs: 1501s 25.02m 0.42h 0.02d 0.000 y # IO & Wait Time: 924s 15.40m 0.26h 0.01d 0.000 y # Average job time: 15s 0.25m 0.00h 0.00d # Longest finished job: 23s 0.38m 0.01h 0.00d # Submission to last job: 44s 0.73m 0.01h 0.00d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/fr2/cons/consRun1 # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less # if that looks right, then let it run: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved5Way.bed # ~ 16 seconds cp -p mostConserved5Way.bed /cluster/data/fr2/bed/multiz5way # Figure out how much is actually covered by the bed file as so: # Get the non-n genome size from faSize on all chroms: ssh kkstore05 cd /cluster/data/fr2 faSize Un/chrUn.fa M/chrM.fa # 400525790 bases (49313545 N's 351212245 real 284435760 # upper 66776485 lower) in 2 sequences in 2 files cd /san/sanvol1/scratch/fr2/cons/consRun1 # The 351212245 comes from the non-n genome as counted above. awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/351212245\n",100.0*sum/351212245,sum}' \ mostConserved5Way.bed # % 15.82 = 100.0*55573943/351212245 --exp-len 10 --tar-cov 0.20 # Aiming for %70 coverage in # the following featureBits measurement on CDS: # Beware of negative scores when too high. The logToBedScore # will output an error on any negative scores. HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits fr2 \ -enrichment ensGene:cds mostConserved5Way.bed # --expected-length 10 --target-coverage 0.20 fr2 # ensGene:cds 8.216%, mostConserved5Way.bed 14.130%, both 5.188%, cover # 63.14%, enrich 4.47x # Load most conserved track into database ssh hgwdev cd /cluster/data/fr2/bed/multiz5way # ended up using the set: --expected-length 10 --target-coverage 0.20 time nice -n +19 hgLoadBed -strict fr2 phastConsElements5way \ mostConserved5Way.bed # Loaded 1140341 elements of size 5 # real 0m28.545 # should measure the same as above time nice -n +19 \ featureBits fr2 -enrichment ensGene:cds phastConsElements5way # At: --expected-length 10 --target-coverage 0.20 fr2 # ensGene:cds 8.216%, phastConsElements5way 14.130%, both 5.188%, cover # 63.14%, enrich 4.47x # Create merged posterier probability file and wiggle track data files ssh pk cd /san/sanvol1/scratch/fr2/cons/consRun1 # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode # This was verified above to be correct time nice -n +19 find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \ phastCons5.wig phastCons5.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 3m49.865s # -rw-rw-r-- 1 288320445 Feb 5 13:51 phastCons5.wib # -rw-rw-r-- 1 37122305 Feb 5 13:51 phastCons5.wig time nice -n +19 cp -p phastCons5.wi? 
	/cluster/data/fr2/bed/multiz5way/

    # prepare compressed copy of ascii data values for downloads
    ssh pk
    cd /san/sanvol1/scratch/fr2/cons/consRun1
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p phastCons5Scores

for D in ppRaw/chr*
do
    C=${D/ppRaw\/}
    out=phastCons5Scores/${C}.data.gz
    echo "========================== ${C} ${D}"
    find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out}
done
'_EOF_'
    # << happy emacs
    chmod +x gzipAscii.sh
    time nice -n +19 ./gzipAscii.sh
    # real    5m21.400s

    # copy them for downloads
    ssh kkstore05
    # this directory is actually a symlink from store9 to store8 to
    # avoid the data full problem on store9
    mkdir /cluster/data/fr2/bed/multiz5way/phastCons5Scores
    cd /cluster/data/fr2/bed/multiz5way/phastCons5Scores
    cp -p /san/sanvol1/scratch/fr2/cons/consRun1/phastCons5Scores/* .
    # make a README.txt file here, and an md5sum
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/fr2/phastCons5Scores
    cd /usr/local/apache/htdocs/goldenPath/fr2/phastCons5Scores
    ln -s /cluster/data/fr2/bed/multiz5way/phastCons5Scores/* .

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/fr2/bed/multiz5way
    ln -s `pwd`/phastCons5.wib /gbdb/fr2/wib/phastCons5.wib
    # ended up using the set: --expected-length 10 --target-coverage 0.20
    time nice -n +19 hgLoadWiggle fr2 phastCons5 phastCons5.wig
    # real    0m9.256s

    # Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/fr2/bed/multiz5way
    time nice -n +19 hgWiggle -doHistogram \
        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
        -db=fr2 phastCons5 > histogram.data 2>&1
    # real    0m30.744

    # create plot of histogram:
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
	xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Fugu fr2 Histogram phastCons5 track"
set xlabel " phastCons5 score - --expected-length 10 --target-coverage 0.20"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
	"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    # << happy emacs

    display histo.png &

    # The mostConserved track can also be characterized by a histogram
    awk '{print $3-$2}' mostConserved5Way.bed > mostCons.txt
    textHistogram -verbose=2 -autoScale=1000 -pValues mostCons.txt \
        > mostCons.histo.txt
    cat << '_EOF_' | gnuplot > mostCons.png
set terminal png small color \
	xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Fugu fr2 histogram: lengths of mostConserved track elements"
set xlabel " mostConserved element length - --expected-length 10 --target-coverage 0.20"
set ylabel " # of elements at this length"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set xrange [0:200]
set y2tics
set boxwidth 2
set style fill solid

plot "mostCons.histo.txt" using 2:3 title " # of elements" with boxes, \
	"mostCons.histo.txt" using 2:6 axes x1y2 title " CRF" with lines
'_EOF_'
    # << happy emacs

############################################################################
## Set a more interesting default position, location of CABIN1 gene
##	DONE - 2007-02-08 - Hiram
    ssh hgwdev
    hgsql -e \
'update dbDb set defaultPos="chrUn:29,305,920-29,330,760" where
name = "fr2";' \ -h genome-testdb hgcentraltest ############################################################################ ## DOWNLOADS - (DONE - 2007-02-12 - 2007-02-16 - Hiram) ssh hgwdev cd /cluster/data/fr2 ~/kent/src/hg/utils/automation/makeDownloads.pl fr2 \ > makeDownloads.out 2>&1 # Doesn't work due to missing Repeat masker outputs # Create WindowMasker separate files by chrom, for downloads ssh kkstore05 cd /cluster/data/fr2/bed/WindowMasker.2007-01-22 # This name change here is so the names created by splitFileByColumn # will be reasonable zcat windowmasker.sdust.bed.gz > chr.fr2.WMSdust.bed splitFileByColumn chr.fr2.WMSdust.bed chrWM # Creating chrWM/chr1.fr2.WMSdust.bed # Creating chrWM/chr2.fr2.WMSdust.bed # ... etc ... cd chrWM tar cvzf ../chromWMSdust.bed.tar.gz *.bed cd .. # Verify this process didn't destroy anything: cat chrWM/*.bed | awk '{sum += $3-$2}END{printf "total size: %d\n",sum}' # total size: 115825307 zcat windowmasker.sdust.bed.gz \ | awk '{sum += $3-$2}END{printf "total size: %d\n",sum}' # total size: 115825307 # deliver to bigZips cd /cluster/data/fr2/goldenPath/bigZips ln -s \ /cluster/data/fr2/bed/WindowMasker.2007-01-22/chromWMSdust.bed.tar.gz . # remove the chromOut.tar.gz file and re-make the md5sum.txt md5sum *gz > md5sum.txt # go back to simpleRepeat and attempt to make a chrM.bed - turns out to # be an empty result, so leave an empty file there. Then running this # makeDownloads.pl again, create two empty .out files to get through # that. cd /cluster/data/fr2 touch Un/chrUn.fa.out touch M/chrM.fa.out ~/kent/src/hg/utils/automation/makeDownloads.pl fr2 \ > makeDownloads.out 2>&1 cd goldenPath/bigZips ln -s \ /cluster/data/fr2/bed/WindowMasker.2007-01-22/chromWMSdust.bed.tar.gz . # get GenBank native mRNAs ssh hgwdev cd /cluster/data/genbank ./bin/i386/gbGetSeqs -db=fr2 -native \ GenBank mrna /cluster/data/fr2/goldenPath/bigZips/mrna.fa # get GenBank xeno mRNAs ./bin/i386/gbGetSeqs -db=fr2 -xeno \ GenBank mrna /cluster/data/fr2/goldenPath/bigZips/xenoMrna.fa cd /cluster/data/fr2/goldenPath/bigZips ssh kkstore05 gzip mrna.fa xenoMrna.fa md5sum *.gz > md5sum.txt # Edit the README.txt file to be correct ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/fr2/bigZips ln -s /cluster/data/fr2/goldenPath/bigZips/* . 
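    # Hypothetical check (not in the original log): count how many native and
    # xeno GenBank mRNAs were delivered to bigZips, using only standard tools.
    cd /usr/local/apache/htdocs/goldenPath/fr2/bigZips
    zcat mrna.fa.gz | grep -c '^>'
    zcat xenoMrna.fa.gz | grep -c '^>'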
############################################################################ ## Fugu Photograph - obtained from Byrappa Venkatesh in email 2007-02-15 ## (DONE - 2007-02-16 - Hiram) ssh hgwdev mkdir /cluster/data/fr2/photograph cd /cluster/data/fr2/photograph ## original: Byrappa.Venkatesh.Fugu.jpg identify By* # Byrappa.Venkatesh.Fugu.jpg JPEG 700x286 DirectClass 43kb 0.000u 0:01 convert -quality 80 -geometry 300x200 Byrappa.Venkatesh.Fugu.jpg \ Takifugu_rubripes.jpg ## check this file into the browser/images CVS source tree and ## copy to /usr/local/apache/htdocs/images ########################################################################## ## RepeatMasker run to cover all bases (DONE - 2007-03-07 - Hiram) ssh kkstore02 mkdir /cluster/data/fr2/bed/RepeatMasker cd /cluster/data/fr2/bed/RepeatMasker time nice -n +19 doRepeatMasker.pl -verbose=2 -bigClusterHub=kk \ -buildDir=/cluster/data/fr2/bed/RepeatMasker fr2 > do.log 2>&1 ########################################################################### ## Chicken/Fugu chain/net swap - (DONE - 2007-03-12 - Hiram) mkdir /cluster/data/fr2/bed/blastz.galGal3.swap cd /cluster/data/fr2/bed/blastz.galGal3.swap time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \ /cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -swap > swap.log 2>&1 & time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \ /cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net -swap > swap_net.log 2>&1 & # real 3m1.239s cat fb.fr2.chainGalGal3Link.txt # 36175581 bases of 393312790 (9.198%) in intersection ########################################################################### # Create liftover fr1 to fr2 (DONE - 2007-04-09 - Hiram) ssh kkstore02 mkdir /cluster/data/fr2/bed/blat.fr1 cd /cluster/data/fr2/bed/blat.fr1 time nice -n +19 doSameSpeciesLiftOver.pl fr1 fr2 -bigClusterHub pk \ -buildDir=/cluster/data/fr2/bed/blat.fr1 > do.log 2>&1 cp -p fr1ToFr2.over.chain.gz ../liftOver cd ../liftOver md5sum *.gz > ../../goldenPath/liftOver/md5sum.txt ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/fr2/liftOver ln -s /cluster/data/fr2/bed/liftOver/fr1ToFr2.over.chain.gz . ############################################################################## # SWAP DANRER5 BLASTZ RESULT TO CREATE DANRER5 CHAINS AND NETS TRACKS, # AXTNET, MAFNET AND ALIGNMENT DOWNLOADS # (DONE, 2007-09-19 and 2007-09-22, hartera) ssh kkstore02 mkdir /cluster/data/fr2/bed/blastz.swap.danRer5 cd /cluster/data/fr2/bed/blastz.swap.danRer5 # blastz parameters used to align fr2 as query to danRer5 as target: # BLASTZ_H=2500 # BLASTZ_M=50 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q # Results for fr2 blastz on danRer5 are in: # /cluster/data/danRer5/bed/blastz.fr2.2007-09-18 time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -swap \ /cluster/data/danRer5/bed/blastz.fr2.2007-09-18/DEF \ >& swap.log & # 0.139u 0.098s 14:09.59 0.0% 0+0k 0+0io 0pf+0w ssh hgwdev cat \ /cluster/data/fr2/bed/blastz.danRer5.swap/fb.fr2.chainDanRer5Link.txt # 78259559 bases of 393312790 (19.898%) in intersection # look at coverage of ensGene CDS, there is no native RefSeqs # track for fugu, fr2. 
featureBits fr2 ensGene:cds chainDanRer5Link -enrichment # ensGene:cds 8.216%, chainDanRer5Link 19.898%, both 7.130%, cover 86.78%, # enrich 4.36x featureBits fr2 ensGene:cds chainDanRer4Link -enrichment # ensGene:cds 8.216%, chainDanRer4Link 20.585%, both 7.030%, cover 85.56%, # enrich 4.16x featureBits fr2 ensGene:cds netDanRer5 -enrichment # ensGene:cds 8.216%, netDanRer5 66.051%, both 7.845%, cover 95.48%, # enrich 1.45x featureBits fr2 ensGene:cds netDanRer4 -enrichment # ensGene:cds 8.216%, netDanRer4 65.374%, both 7.766%, cover 94.51%, # enrich 1.45x # clean up a little (2007-09-22, hartera) ssh kkstore02 cd /cluster/data/fr2/bed mv ./blastz.swap.danRer5/swap.log ./blastz.danRer5.swap rm -r blastz.swap.danRer5 ln -s blastz.danRer5.swap blastz.danRer5 ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ # SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram) ssh kkstore04 # not too important since everything moved to the hive screen # use a screen to control this job cd /cluster/data/oryLat2/bed/blastz.fr2.2008-08-25 cat fb.oryLat2.chainFr2Link.txt # 180945351 bases of 700386597 (25.835%) in intersection mkdir /cluster/data/fr2/bed/blastz.oryLat2.swap cd /cluster/data/fr2/bed/blastz.oryLat2.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/oryLat2/bed/blastz.fr2.2008-08-25/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 & # real 24m32.826s cat fb.fr2.chainOryLat2Link.txt # 153621820 bases of 393312790 (39.058%) in intersection ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. 
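    # Referring back to the danRer5 featureBits -enrichment numbers above:
    # assuming "cover" is both/ensGene:cds and "enrich" is cover divided by
    # the chain track's genome coverage, the reported 86.78% and 4.36x can be
    # reproduced from the printed percentages (illustrative arithmetic only):
    awk 'BEGIN { cover = 100*7.130/8.216; enrich = cover/19.898;
        printf "cover %.2f%%  enrich %.2fx\n", cover, enrich }'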
############################################################################
# LASTZ/CHAIN/NET tetNig2 swap (DONE - 2009-09-15,18 - Hiram)
    # original alignment to tetNig2:
    cd /hive/data/genomes/tetNig2/bed/lastzFr2.2009-09-15
    cat fb.tetNig2.chainFr2Link.txt
    # 243965150 bases of 302314788 (80.699%) in intersection

    # and the swap to here, fr2:
    mkdir /hive/data/genomes/fr2/bed/blastz.tetNig2.swap
    cd /hive/data/genomes/fr2/bed/blastz.tetNig2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/tetNig2/bed/lastzFr2.2009-09-15/DEF \
        -swap -tRepeats=windowmaskerSdust \
        -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
        > swap.log 2>&1 &
    # real    21m56.576s
    cat fb.fr2.chainTetNig2Link.txt
    # 248984008 bases of 393312790 (63.304%) in intersection

############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and
loaded by a single Makefile.  This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.

############################################################################
# construct liftOver to fr3 (DONE - 2012-02-22 - Hiram)
    screen	# manage this longish running job in a screen
    mkdir /hive/data/genomes/fr2/bed/blat.fr3.2012-02-22
    cd /hive/data/genomes/fr2/bed/blat.fr3.2012-02-22
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
        -ooc=/scratch/data/fr2/11.ooc \
        -debug -dbHost=hgwdev -workhorse=hgwdev fr2 fr3 > do.log 2>&1
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
        -ooc=/scratch/data/fr2/11.ooc \
        -dbHost=hgwdev -workhorse=hgwdev fr2 fr3 > do.log 2>&1
    # real    17m59.258s
    # verify this file exists:
    #	/gbdb/fr2/liftOver/fr2ToFr3.over.chain.gz
    # and try out the conversion on genome-test from fr2 to fr3
############################################################################
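    # Hypothetical spot check of the new liftOver chain (not part of the
    # original log); assumes liftOver can read the gzipped chain directly,
    # otherwise zcat it to a temporary file first.  Lift the old default
    # position and confirm it maps onto fr3:
    echo -e "chrUn\t29305920\t29330760" > /tmp/fr2.defaultPos.bed
    liftOver /tmp/fr2.defaultPos.bed /gbdb/fr2/liftOver/fr2ToFr3.over.chain.gz \
        /tmp/fr3.defaultPos.bed /tmp/fr3.defaultPos.unmapped
    cat /tmp/fr3.defaultPos.bed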