# for emacs: -*- mode: sh; -*- # This file describes how we made the browser database on the # Chicken (Gallus gallus) February 2004 release. # CREATE BUILD DIRECTORY (DONE 2/23/04 angie) ssh kksilo mkdir /cluster/store7/galGal2 ln -s /cluster/store7/galGal2 /cluster/data/galGal2 # DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 2/23/04 angie) mkdir /cluster/data/galGal2/M cd /cluster/data/galGal2/M # go to http://www.ncbi.nih.gov/ and search Nucleotide for # "gallus mitochondrion genome". That shows the gi number: # 5834843 # Use that number in the entrez linking interface to get fasta: wget -O chrM.fa \ 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834843&dopt=FASTA' # Edit chrM.fa: make sure the long fancy header line says it's the # Gallus gallus mitochondrion complete genome, and then replace the # header line with just ">chrM". # DOWNLOAD WHOLE GENOME SHOTGUN CONTIGS & QUAL (DONE 2/23/04 angie) ssh kksilo cd /cluster/data/galGal2 # Contig sequences have been stable for a while: wget ftp://genome.wustl.edu/private/chicken_040105/Fasta/chicken_040105.contigs.tar.gz # Official location (password-protected): # http://genome.wustl.edu/projects/chicken/chicken_analysis/chicken_040105.contigs.tar.gz mkdir contigs cd contigs tar xvfz ../chicken_040105.contigs.tar.gz foreach f (chicken_031214.pcap.contigs*) set n = `echo $f | sed -e 's/chicken_031214.pcap.contigs//'` mv $f contigs$n.fa end faSize *.fa #1054180845 bases (107580 N's 1054073265 real) in 111864 sequences in 100 files #Total size: mean 9423.8 sd 19990.9 min 52 (Contig28908.1) max 441791 (Contig132.21) median 2122 #N count: mean 1.0 sd 5.1 mkdir /cluster/bluearc/galGal2 cat ../contigs/*.fa > /cluster/bluearc/galGal2/allContigs.fa # Get qual scores too cd /cluster/data/galGal2 wget ftp://genome.wustl.edu/private/chicken_040105/Qual/chicken_040105.contigs.qual.tar.gz # Official location (password-protected): # 
http://genome.wustl.edu/projects/chicken/chicken_analysis/chicken_040105.contigs.qual.tar.gz # DOWNLOAD AGP, BUILD AND CHECK CHROM-LEVEL SEQUENCE (DONE 2/24/04 angie) ssh kolossus cd /cluster/data/galGal2 wget ftp://genome.wustl.edu/private/lhillier/old/chicken_agp_040224.tar.gz # anticipated Official location (password-protected): # http://genome.wustl.edu/projects/chicken/chicken_analysis/chicken_agp_040224.tar.gz tar xvzf chicken_agp_040224.tar.gz cd agp_040224 cp /dev/null ../chrom.lst foreach f (*.agp) set chr = `echo $f:r | sed -e 's/^chr//'` set base = `echo $chr | sed -e 's/_random$//'` mkdir -p ../$base cp -p $f ../$base if ("$chr" == "$base") echo $chr >> ../chrom.lst end # OK, tack chrM on there too: echo M >> ../chrom.lst cd /cluster/data/galGal2 foreach c (`cat chrom.lst`) foreach agp ($c/chr$c{,_random}.agp) if (-e $agp) then set fa = $agp:r.fa echo building $fa from $agp agpToFa -simpleMultiMixed $agp $agp:t:r $fa \ /cluster/bluearc/galGal2/allContigs.fa endif end end # checkAgpAndFa prints out way too much info -- keep the end/stderr only: foreach c (`cat chrom.lst`) foreach agp ($c/chr$c{,_random}.agp) if (-e $agp) then set fa = $agp:r.fa echo checking consistency of $agp and $fa ~/bin/$MACHTYPE/checkAgpAndFa $agp $fa | tail -1 endif end end faSize */chr*.fa #1133629576 bases (79539536 N's 1054090040 real) in 54 sequences in 54 files #Total size: mean 20993140.3 sd 41649023.2 min 1525 (chrE64) max 188239860 (chr1) median 4731479 #N count: mean 1472954.4 sd 5969394.3 # excluding chrM.fa: note the same #real as for WGS contigs: #1133612801 bases (79539536 N's 1054073265 real) in 53 sequences in 53 files #Total size: mean 21388920.8 sd 41944943.4 min 1525 (chrE64) max 188239860 (chr1) median 4731479 #N count: mean 1500746.0 sd 6022991.0 # COMPARE BASE COUNTS WITH WUSTL'S (DONE 2/27/04 angie) # Compare LaDeana's size summary file with our chrom faSize output: ssh kksilo cd /cluster/data/galGal2 # Download from password-protected location: wget 
--http-user=xxxxxx --http-passwd=xxxxx \ -O basepairs_per_chromosome \ http://genome.wustl.edu/projects/chicken/chicken_analysis/rsc/basepairs_per_chromosome # Actually, we can only compare total bases with faSize because faSize # doesn't differentiate between an N from a gap and an N base. perl -wpe 'chop; if (! /[0-9]/ || /TOTALS/) { s/^.*$//; next; } \ s/^GGA//; \ if (s/^(\w+)\s+(\d+)\s+(\d+)\s*$//) { \ ($c, $noGapBP, $totalBP) = ($1, $2, $3); \ $c =~ s/Unlocalized/Un/; \ $chr = "chr$c"; \ $c =~ s/_random//; \ $faSize = "faSize $c/$chr.fa"; \ $faSize = `$faSize`; \ if ($faSize =~ /^(\d+) bases \((\d+) N.s (\d+) real/) { \ if ($totalBP != $1) { \ print "$chr totalBP: WUSTL $totalBP, us $1!\n"; \ } \ } else { print "parsed faSize wrong:\n$faSize\n"; } \ } else { print "what is this?\n$_\n"; }' \ basepairs_per_chromosome # oops, basepairs_per_chromosome doesn't include terminal gaps in total BP # counts... told LaDeana: #chr12 totalBP: WUSTL 19811895, us 19821895! #chr23 totalBP: WUSTL 5166127, us 5666127! #chr28 totalBP: WUSTL 4231479, us 4731479! # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 2/24/04 angie) ssh kksilo cd /cluster/data/galGal2 foreach c (`cat chrom.lst`) foreach agp ($c/chr$c{,_random}.agp) if (-e $agp) then set fa = $agp:r.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak splitFaIntoContigs $agp $fa . -nSize=5000000 endif end end # splitFaIntoContigs makes new dirs for _randoms. Move their contents # back into the main chrom dirs and get rid of the _random dirs. 
foreach d (*_random) set base = `echo $d | sed -e 's/_random$//'` mv $d/lift/oOut.lst $base/lift/rOut.lst mv $d/lift/ordered.lft $base/lift/random.lft mv $d/lift/ordered.lst $base/lift/random.lst rmdir $d/lift mv $d/* $base rmdir $d end # Make a "pseudo-contig" for processing chrM too: mkdir M/chrM_1 sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa mkdir M/lift echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst echo "chrM_1" > M/lift/ordered.lst echo "0 M/chrM_1 16775 chrM 16775" > M/lift/ordered.lft # MAKE JKSTUFF AND BED DIRECTORIES (DONE 2/23/04 angie) # This used to hold scripts -- better to keep them inline here so # they're in CVS. Now it should just hold lift file(s) and # temporary scripts made by copy-paste from this file. mkdir /cluster/data/galGal2/jkStuff # This is where most tracks will be built: mkdir /cluster/data/galGal2/bed # CREATING DATABASE (DONE 2/23/04 angie) ssh hgwdev echo 'create database galGal2' | hgsql '' # Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql df -h /var/lib/mysql # /dev/sdc1 1.8T 318G 1.4T 20% /var/lib/mysql # CREATING GRP TABLE FOR TRACK GROUPING (DONE 2/23/04 angie) ssh hgwdev echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \ | hgsql galGal2 # MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS (DONE 2/24/04 angie) # Make nib/, unmasked until RepeatMasker and TRF steps are done. # Do this now so we can load up RepeatMasker and run featureBits; # can also load up other tables that don't depend on masking. ssh kksilo cd /cluster/data/galGal2 mkdir nib foreach c (`cat chrom.lst`) foreach f ($c/chr${c}{,_random}.fa) if (-e $f) then echo "nibbing $f" /cluster/bin/i386/faToNib $f nib/$f:t:r.nib endif end end # Make symbolic links from /gbdb/galGal2/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/galGal2/nib foreach f (/cluster/data/galGal2/nib/chr*.nib) ln -s $f /gbdb/galGal2/nib end # Load /gbdb/galGal2/nib paths into database and save size info. 
cd /cluster/data/galGal2 hgsql galGal2 < $HOME/kent/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib galGal2 /gbdb/galGal2/nib */chr*.fa echo "select chrom,size from chromInfo" | hgsql -N galGal2 > chrom.sizes # take a look at chrom.sizes, should be 54 lines wc chrom.sizes # REPEAT MASKING (DONE 2/25/04 angie) #- Split contigs into 500kb chunks, at gaps if possible: ssh kksilo cd /cluster/data/galGal2 foreach c (`cat chrom.lst`) foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t ~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end #- Make the run directory and job list: cd /cluster/data/galGal2 cat << '_EOF_' > jkStuff/RMChicken #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/galGal2/$2 /bin/cp $2 /tmp/galGal2/$2/ cd /tmp/galGal2/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec chicken $2 popd /bin/cp /tmp/galGal2/$2/$2.out ./ if (-e /tmp/galGal2/$2/$2.align) /bin/cp /tmp/galGal2/$2/$2.align ./ if (-e /tmp/galGal2/$2/$2.tbl) /bin/cp /tmp/galGal2/$2/$2.tbl ./ if (-e /tmp/galGal2/$2/$2.cat) /bin/cp /tmp/galGal2/$2/$2.cat ./ /bin/rm -fr /tmp/galGal2/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/galGal2/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/galGal2 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMChicken mkdir RMRun cp /dev/null RMRun/RMJobs foreach c (`cat chrom.lst`) foreach d ($c/chr${c}{,_random}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/galGal2/jkStuff/RMChicken \ /cluster/data/galGal2/$d $f \ '{'check out line+ /cluster/data/galGal2/$d/$f.out'}' \ >> RMRun/RMJobs end end end #- Do the run ssh kk cd /cluster/data/galGal2/RMRun para create RMJobs para try, para check, para check, para push, para check,... 
#Completed: 2724 of 2724 jobs #Average job time: 2033s 33.89m 0.56h 0.02d #Longest job: 9752s 162.53m 2.71h 0.11d #Submission to last job: 9771s 162.85m 2.71h 0.11d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kksilo cd /cluster/data/galGal2 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (`cat chrom.lst`) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif if (-e lift/random.lft && ! -z lift/random.lft) then liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \ > /dev/null endif cd .. end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/galGal2 hgLoadOut galGal2 */chr*.fa.out # RE-RUN PROCESSREPEATS (DONE 2/25/04 angie) # Sure hope this is the only time we ever need to do this: # Arian sent an update to the ProcessRepeats script -- we can re-run # from .cat files, should go pretty quickly, and Arian says it shouldn't # affect our masking that much. ssh kksilo cd /cluster/data/galGal2 foreach d (*/chr*_?{,?}) echo $d rm -rf /tmp/gawdammit mkdir /tmp/gawdammit cp -p $d/*.cat /tmp/gawdammit pushd /tmp/gawdammit /cluster/home/angie/cb/RepeatMasker/ProcessRepeats -spec gallus *.cat \ > /dev/null popd liftUp $d/$d:t.new.out $d/$d:t.lft warn /tmp/gawdammit/*.out > /dev/null end # lift *.new.out to chrom-level new.out foreach c (`cat chrom.lst`) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.new.out lift/ordered.lft warn \ `cat lift/oOut.lst | sed -e 's/.fa.out/.new.out/'` \ > /dev/null endif if (-e lift/random.lft && ! -z lift/random.lft) then liftUp chr${c}_random.new.out lift/random.lft warn \ `cat lift/rOut.lst | sed -e 's/.fa.out/.new.out/'` \ > /dev/null endif cd .. 
end # Compare old .fa.out to new (reloading chr*_rmsk in the process) ssh hgwdev mkdir /cluster/data/galGal2/bed/ProcessRepeatsCheck cd /cluster/data/galGal2/bed/ProcessRepeatsCheck featureBits galGal2 rmsk -bed=rmsk.orig.bed #104249252 bases of 1054197620 (9.889%) in intersection hgLoadOut galGal2 */chr*.new.out featureBits galGal2 rmsk -bed=rmsk.new.bed #104249260 bases of 1054197620 (9.889%) in intersection featureBits galGal2 rmsk \!rmsk.orig.bed -bed=notIn.orig.bed #8 bases of 1054197620 (0.000%) in intersection featureBits galGal2 \!rmsk rmsk.orig.bed -bed=notIn.new.bed #0 bases of 1054197620 (0.000%) in intersection # SWEET! cat notIn.orig.bed #chr1 141589449 141589454 chr1.1 #chr2 110897485 110897488 chr2.1 # In chr1, looks like these two original lines... # 1014 31.8 6.7 1.7 chr1 141588904 141589449 (46650411) + CR1-Y4 LINE/CR1 281 4041 (472) 176 # 1099 27.8 6.2 1.6 chr1 141589455 141589838 (46650022) + CR1-Y4 LINE/CR1 806 1207 (7) 177 # ... got merged: # 1099 30.1 6.5 1.7 chr1 141588904 141589838 (46650022) + CR1-Y4 LINE/CR1 3454 4508 (7) 176 # In chr2, # 1305 23.6 2.1 1.2 chr2 110897148 110897485 (36693280) C CR1-Y4 LINE/CR1 (14) 1200 860 96 # 335 20.4 0.9 8.8 chr2 110897489 110897601 (36693164) C CR1-Y LINE/CR1 (423) 4090 3987 97 # get merged to # 1305 22.8 1.8 3.3 chr2 110897148 110897601 (36693164) C CR1-Y4 LINE/CR1 (14) 4501 3987 # Replace old .fa.outs with .new.outs foreach f ( */chr*.new.out */*/chr*_?{,?}.new.out ) set g = `echo $f | sed -e 's/.new.out/.fa.out/'` mv $f $g end # Remove lowest-level old .fa.out's -- and .fa's while at it foreach d (*/chr*_?{,?}) rm $d/${d:t}_?{,?}.fa.out rm $d/${d:t}_?{,?}.fa end # VERIFY REPEATMASKER RESULTS (DONE 2/26/04 angie) # Eyeball some repeat annotations in the browser, compare to lib seqs. 
# Run featureBits on galGal2 and on a comparable genome build, and compare: ssh hgwdev featureBits galGal2 rmsk #104249260 bases of 1054197620 (9.889%) in intersection # slight increase from galGal1: #103553237 bases of 1054197620 (9.823%) in intersection # MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE 2/27/04 angie) # Use -repMatch=380 (based on size -- for human we use 1024, and # chicken size is ~37% of human) ssh kkr1u00 mkdir /cluster/data/galGal2/bed/ooc cd /cluster/data/galGal2/bed/ooc ls -1 /cluster/data/galGal2/nib/chr*.nib > nib.lst blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/galGal2/11.ooc -repMatch=380 #Wrote 13552 overused 11-mers to /cluster/bluearc/galGal2/11.ooc blat nib.lst /dev/null /dev/null -tileSize=10 \ -makeOoc=/cluster/bluearc/galGal2/10.ooc -repMatch=380 #Wrote 169709 overused 10-mers to /cluster/bluearc/galGal2/10.ooc cp -p /cluster/bluearc/galGal2/*.ooc /iscratch/i/galGal2/ iSync # MAKE LIFTALL.LFT (DONE 2/24/04 angie) ssh kksilo cd /cluster/data/galGal2 cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft # PUT UNMASKED CHUNKS ON BLUEARC (DONE 2/24/04 angie) # Some of Terry's staging scripts require a flat dir containing # all contigs; also, doesn't hurt to set aside unmasked seq. 
ssh kksilo cd /cluster/data/galGal2 mkdir /cluster/bluearc/galGal2/unmaskedChunks foreach d (*/chr*_?{,?}) cp -p $d/${d:t}.fa /cluster/bluearc/galGal2/unmaskedChunks end # MAKE STS MARKERS TRACK (TODO 3//04 angie -- need more info from Terry) set base = /cluster/data/galGal2/bed/markers mkdir -p $base/primers $base/sts # Need to find out how Terry built these: cp -p /cluster/bluearc/gg1/markers/sts/convert.pl $base cp -p /cluster/bluearc/gg1/markers/stsAlias.bed $base cp -p /cluster/bluearc/gg1/markers/stsInfoGG.bed $base # Need to find out where Terry got these files: cp -p /cluster/bluearc/gg1/markers/primers/all.primers{,.fa} $base/primers cp -p /cluster/bluearc/gg1/markers/sts/all.sequence.fa $base/sts # Run e-PCR on primers mkdir $base/primers/epcr cd $base/primers/epcr mkdir epcr.out /cluster/bin/scripts/createCloneList \ /cluster/bluearc/galGal2/unmaskedChunks # That script creates in.lst/$n.in files, where $n is # the contigs number *plus 1*! in.lst/0.in is empty and makes # para create complain, so get rid of it. rm in.lst/0.in egrep -v '/0.in$' files.lst > tmp.lst; mv tmp.lst files.lst echo $base/primers/all.primers > epcr.lst cat << '_EOF_' > template #LOOP /cluster/bin/scripts/runEpcr {check in line+ $(path1)} {check in line+ $(path2)} {check out exists epcr.out/$(root2).epcr} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy /cluster/bin/i386/gensub2 epcr.lst files.lst template jobList ssh kk set base = /cluster/data/galGal2/bed/markers cd $base/primers/epcr para create jobList para try, check, push, check, ... #Completed: 251 of 251 jobs #Average job time: 30s 0.50m 0.01h 0.00d #Longest job: 45s 0.75m 0.01h 0.00d #Submission to last job: 46s 0.77m 0.01h 0.00d cat `ls -1 epcr.out/* | sort -g` > all.epcr # blat primers -- with old blat.2 ! 
mkdir $base/primers/blat cd $base/primers/blat mkdir primers.out ls -1S /cluster/bluearc/galGal2/unmaskedChunks/*.fa > contigs.lst echo $base/primers/all.primers.fa > primers.lst cat << '_EOF_' > template #LOOP /cluster/home/kent/bin/i386/blat.2 -tileSize=10 -ooc=/cluster/bluearc/galGal2/10.ooc -minMatch=1 -minScore=1 -minIdentity=75 {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ primers.out/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy /cluster/bin/i386/gensub2 contigs.lst primers.lst template jobList ssh kk set base = /cluster/data/galGal2/bed/markers cd $base/primers/blat para create jobList para try, check, push, check, ... #Completed: 261 of 261 jobs #Average job time: 18s 0.30m 0.00h 0.00d #Longest job: 27s 0.45m 0.01h 0.00d #Submission to last job: 28s 0.47m 0.01h 0.00d /cluster/bin/i386/pslSort dirs primers.psl temp primers.out rm -r temp # Make primers.final from e-PCR and blat results: cd $base/primers filterPrimers -chicken ../stsInfoGG.bed blat/primers.psl all.primers \ epcr/all.epcr > primers.psl.filter extractPslInfo primers.psl.filter mv primers.psl.filter.initial primers.initial sort -k 4n primers.initial > primers.final # blat sts sequences mkdir $base/sts/blat cd $base/sts/blat mkdir stsMarkers.out ls -1S /cluster/bluearc/galGal2/unmaskedChunks/*.fa > contigs.lst echo $base/sts/all.sequence.fa > stsMarkers.lst cat << '_EOF_' > template #LOOP /cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ stsMarkers.out/$(root1).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy /cluster/bin/i386/gensub2 contigs.lst stsMarkers.lst template jobList ssh kk set base = /cluster/data/galGal2/bed/markers cd $base/sts/blat para create jobList para try, check, push, check, ... 
#Completed: 261 of 261 jobs #Average job time: 96s 1.59m 0.03h 0.00d #Longest job: 199s 3.32m 0.06h 0.00d #Submission to last job: 199s 3.32m 0.06h 0.00d /cluster/bin/i386/pslSort dirs raw.psl temp stsMarkers.out /cluster/bin/i386/pslReps -nearTop=0.01 -minCover=0.6 -minAli=0.8 \ -noIntrons raw.psl stsMarkers.psl /dev/null rm -rf temp rm -f raw.psl cd $base/sts filterPsl blat/stsMarkers.psl extractUniqPsl stsMarkers.psl.filter stsMarkers.psl.filter pslReps stsMarkers.psl.filter.dup.psl stsMarkers.psl.filter.1 /dev/null \ -noIntrons -minCover=0.80 -minAli=0.90 -nearTop=0.05 mv stsMarkers.psl.filter stsMarkers.psl.filter.orig cat stsMarkers.psl.filter.1 stsMarkers.psl.filter.uniq.psl \ > stsMarkers.psl.filter extractPslInfo -h stsMarkers.psl.filter rm stsMarkers.psl.filter.1* rm stsMarkers.psl.filter.dup.psl stsMarkers.psl.filter.names \ stsMarkers.psl.filter.uniq.psl extractPslInfo -h stsMarkers.psl.filter mv stsMarkers.psl.filter.initial stsMarkers.initial sort -k 4n stsMarkers.initial > stsMarkers.final.orig ../convert.pl ../stsAlias.bed stsMarkers.final.orig > stsMarkers.final cd $base combineSeqPrimerPos $base/sts/stsMarkers.final \ $base/primers/primers.final > stsMarkers_pos.rdb # This step needs work -- output is 0-length createSTSbed stsInfoGG.bed stsMarkers_pos.rdb > stsMap.bed ssh hgwdev cd /cluster/data/galGal2/bed/markers # Make an stsMarkersTmp so we can at least see the locations awk '{printf "%s\t%s\t%s\t%s\t%d\n", $1, $2, $3, $4, $5 * 1000};' \ sts/stsMarkers.final.orig \ | liftUp -type=.bed stsMarkersTmp.bed ../../jkStuff/liftAll.lft warn stdin hgLoadBed galGal2 stsMarkerTmp stsMarkersTmp.bed # lift & load all_sts_primer liftUp primers.filter.lifted.psl ../../jkStuff/liftAll.lft warn \ primers/primers.psl.filter hgLoadPsl -table=all_sts_primer -fastLoad \ galGal2 primers.filter.lifted.psl # lift & load all_sts_seq liftUp stsMarkers.filter.lifted.psl ../../jkStuff/liftAll.lft warn \ sts/stsMarkers.psl.filter hgLoadPsl -table=all_sts_seq -fastLoad \ 
galGal2 stsMarkers.filter.lifted.psl # load sequences mkdir /gbdb/galGal2/sts ln -s /cluster/data/galGal2/bed/markers/sts/all.sequence.fa \ /gbdb/galGal2/sts/ ln -s /cluster/data/galGal2/bed/markers/primers/all.primers.fa \ /gbdb/galGal2/sts/ hgLoadSeq galGal2 /gbdb/galGal2/sts/*.fa # BACENDS (DONE 2/24/04 angie) ssh kksilo mkdir /cluster/data/galGal2/bed/bacends cd /cluster/data/galGal2/bed/bacends # download wget ftp://ftp.ncbi.nih.gov/genomes/BACENDS/gallus_gallus/AllBACends.mfa.gz wget ftp://ftp.ncbi.nih.gov/genomes/BACENDS/gallus_gallus/cl_acc_gi_len.gz gunzip *.gz /cluster/bin/scripts/convertBacEndPairInfo cl_acc_gi_len >& pairInfo.log perl -wpe 's/^>gi\|\d+\|gb\|(\w+)\.\d+\|.*/>$1/' AllBACends.mfa \ > bacends.fa faSize bacends.fa #146921771 bases (2724913 N's 144196858 real) in 135555 sequences in 1 files #Total size: mean 1083.9 sd 184.7 min 64 (CC322877.1) max 2431 (CC263290.1) median 1095 #N count: mean 20.1 sd 73.1 rm AllBACends.mfa mkdir /cluster/bluearc/galGal2/bacends faSplit sequence bacends.fa 20 /cluster/bluearc/galGal2/bacends/bacends ls -1S /cluster/bluearc/galGal2/bacends/*.fa > bacends.lst ls -1S /cluster/bluearc/galGal2/unmaskedChunks/*.fa > contigs.lst cat << '_EOF_' > template #LOOP /cluster/bin/i386/blat $(path1) $(path2) -tileSize=10 -ooc=/cluster/bluearc/galGal2/10.ooc {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy mkdir psl gensub2 contigs.lst bacends.lst template jobList ssh kk cd /cluster/data/galGal2/bed/bacends para create jobList para try, check, push, check, ... 
#Completed: 5180 of 5180 jobs #Average job time: 97s 1.62m 0.03h 0.00d #Longest job: 5256s 87.60m 1.46h 0.06d #Submission to last job: 5333s 88.88m 1.48h 0.06d # back on kksilo, filter and lift results: pslSort dirs raw.psl temp psl pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons raw.psl \ bacEnds.psl /dev/null liftUp bacEnds.lifted.psl ../../jkStuff/liftAll.lft warn bacEnds.psl rm -r temp rm raw.psl extractUniqPsl bacEnds.lifted.psl bacEnds.psl.filter pslReps bacEnds.psl.filter.dup.psl bacEnds.psl.filter.1 /dev/null \ -noIntrons -minCover=0.80 -minAli=0.90 -nearTop=0.05 extractUniqPsl bacEnds.psl.filter.1 bacEnds.psl.filter.1 rm bacEnds.psl.filter.1 pslReps bacEnds.psl.filter.1.dup.psl bacEnds.psl.filter.2 /dev/null \ -noIntrons -minCover=0.90 -minAli=0.95 -nearTop=0.01 extractUniqPsl bacEnds.psl.filter.2 bacEnds.psl.filter.2 rm bacEnds.psl.filter.2 pslReps bacEnds.psl.filter.2.dup.psl bacEnds.psl.filter.3 /dev/null \ -noIntrons -minCover=0.95 -minAli=0.98 -nearTop=0.01 cat bacEnds.psl.filter.3 \ bacEnds.psl.filter.uniq.psl \ bacEnds.psl.filter.1.uniq.psl \ bacEnds.psl.filter.2.uniq.psl \ > bacEnds.psl.filter.lifted rm *.names bacEnds.psl.filter.3 bacEnds.psl.filter.uniq.psl \ bacEnds.psl.filter.1.uniq.psl bacEnds.psl.filter.2.uniq.psl extractPslInfo -h -be bacEnds.psl.filter.lifted mv bacEnds.psl.filter.lifted.initial bacEnds.initial findAccession -agp -chicken bacEnds.initial /cluster/data/galGal2 rm bacEnds.initial sort -k 4 bacEnds.initial.acc > bacEnds.sort compileBacEnds bacEnds.sort bacEndPairs.txt bacEndSingles.txt \ > bacEnds.temp sorttbl chrom chromStart < bacEnds.temp > bacEnds.rdb rm bacEnds.initial.acc bacEnds.sort bacEnds.temp createBACpairs bacEnds.rdb bacEndPairs.txt > temp.rdb sorttbl chrom1 -r ord1 chromStart1 < temp.rdb > bacEnds.pairs.rdb rm temp.rdb createBACpairs -bed bacEnds.rdb bacEndPairs.txt > bacEndPairs.rdb sorttbl chrom chromStart < bacEndPairs.rdb \ | headchg -del > bacEndPairs.bed rm bacEndPairs.rdb createBacPairsBad 
-bed bacEnds.rdb bacEndPairs.txt > bacEndPairsBad.rdb sorttbl chrom chromStart < bacEndPairsBad.rdb \ | headchg -del > bacEndPairsBad.bed rm bacEndPairsBad.rdb sorttbl chrom chromStart < bacEndPairsLong.rdb \ | headchg -del > bacEndPairsLong.bed rm bacEndPairsLong.rdb findMissingEnd -max 500000 singleBac.txt bacEnds.psl.filter.2.dup.psl \ bacEnds.psl.filter.1.dup.psl bacEnds.psl.filter.dup.psl > bacEndsMiss.psl mv bacEnds.psl.filter.lifted bacEnds.psl.filter.lifted.orig cat bacEnds.psl.filter.lifted.orig bacEndsMiss.psl \ > bacEnds.psl.filter.lifted rm bacEnds.psl.filter.2.dup.psl bacEnds.psl.filter.1.dup.psl \ bacEnds.psl.filter.dup.psl extractPslLoad bacEnds.psl.filter.lifted bacEndPairs.bed \ bacEndPairsBad.bed bacEndPairsLong.bed > bacEnds.psl.rdb sorttbl tname tstart < bacEnds.psl.rdb | headchg -del > bacEnds.load.psl rm bacEnds.psl.rdb # Load seq's and alignments ssh hgwdev cd /cluster/data/galGal2/bed/bacends mkdir /gbdb/galGal2/bacends ln -s /cluster/data/galGal2/bed/bacends/bacends.fa /gbdb/galGal2/bacends hgLoadSeq galGal2 /gbdb/galGal2/bacends/bacends.fa hgLoadPsl -table=all_bacends -fastLoad \ galGal2 /cluster/data/galGal2/bed/bacends/bacEnds.load.psl # Don't know what caused the warning... but all seem to have loaded. #load of all_bacends did not go as planned: 76421 record(s), 0 row(s) skipped, 1 warning(s) loading psl.tab hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql \ -hasBin galGal2 bacEndPairs bacEndPairs.bed sed -e 's/bacEndPairs/bacEndPairsBad/g' \ $HOME/kent/src/hg/lib/bacEndPairs.sql > bacEndPairsBad.sql ~/bin/i386/hgLoadBed -sqlTable=bacEndPairsBad.sql \ -hasBin galGal2 bacEndPairsBad bacEndPairsBad.bed # Doh! 6 lines of that have negative starts. Well, trim those # for now and load the rest, then go back and find the neg-start # cause... 
egrep -v ' -[0-9]+' bacEndPairsBad.bed > bacEndPairsBadFudge.bed ~/bin/i386/hgLoadBed -sqlTable=bacEndPairsBad.sql \ -hasBin galGal2 bacEndPairsBad bacEndPairsBadFudge.bed sed -e 's/bacEndPairs/bacEndPairsLong/g' \ $HOME/kent/src/hg/lib/bacEndPairs.sql > bacEndPairsLong.sql ~/bin/i386/hgLoadBed -sqlTable=bacEndPairsLong.sql \ -hasBin galGal2 bacEndPairsLong bacEndPairsLong.bed # SIMPLE REPEATS (TRF) (DONE 2/24/04 angie) # TRF runs pretty quickly now... it takes a few hours total runtime, # so instead of binrsyncing and para-running, just do this on the # local fileserver ssh kksilo mkdir /cluster/data/galGal2/bed/simpleRepeat cd /cluster/data/galGal2/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach d (/cluster/data/galGal2/*/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end end csh -ef jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l endsInLf trf/* # When job is done do: liftUp simpleRepeat.bed /cluster/data/galGal2/jkStuff/liftAll.lft warn \ trf/*.bed # Load into the database: ssh hgwdev hgLoadBed galGal2 simpleRepeat \ /cluster/data/galGal2/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql featureBits galGal2 simpleRepeat # 8434365 bases of 1054197620 (0.800%) in intersection # Wow, identical to galGal1! Which would really freak me out except that # the underlying WGS contig sequences have not changed at all, only their # relative ordering+orientation along chroms. 
# PROCESS SIMPLE REPEATS INTO MASK (DONE 2/24/04/ angie) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kksilo cd /cluster/data/galGal2/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd /cluster/data/galGal2 mkdir bed/simpleRepeat/trfMaskChrom foreach c (`cat chrom.lst`) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # Here's the coverage for the filtered TRF: ssh hgwdev cat /cluster/data/galGal2/bed/simpleRepeat/trfMaskChrom/*.bed \ > /tmp/filtTrf.bed featureBits galGal2 /tmp/filtTrf.bed # 4510381 bases of 1054197620 (0.428%) in intersection # OK, not *identical* to galGal1 (4512560) but very close. Maybe # there's a difference in the period chosen for some range of bases?? # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 2/25/04 angie) # Note: just to keep things consistent, redid chr1 and chr2 2/26 with # the ProcessRepeats-only rerun results (only masking changes were # 5bp in chr1 and 3bp in chr2) ssh kksilo cd /cluster/data/galGal2 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. 
set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (*/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end # This warning is extremely rare -- if it indicates a problem, it's only # with the repeat annotation and doesn't affect our masking. #WARNING: negative rEnd: -25 chr2:94907561-94907817 GGLTR3B1 # sent test case to Arian. (line 101 of chr2_19_19.fa.out) foreach c (`cat chrom.lst`) echo "repeat- and trf-masking contigs of chr$c, chr${c}_random" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end # same deal here: #WARNING: negative rEnd: -25 chr2_19:4344750-4345006 GGLTR3B1 #- Rebuild the nib files, using the soft masking in the fa: mkdir nib foreach f (*/chr*.fa) faToNib -softMask $f nib/$f:t:r.nib end # Make one big 2bit file as well, and make a link to it in # /gbdb/galGal2/nib because hgBlat looks there: faToTwoBit */chr*.fa galGal2.2bit ln -s /cluster/data/galGal2/galGal2.2bit /gbdb/galGal2/nib/ # GOLD AND GAP TRACKS (DONE 2/24/04 angie) ssh hgwdev cd /cluster/data/galGal2 hgGoldGapGl -noGl -chromLst=chrom.lst galGal2 /cluster/data/galGal2 . 
# featureBits fails if there's no chrM_gap, so make one: # echo "create table chrM_gap like chr1_gap" | hgsql galGal2 # oops, that won't work until v4.1, so do this for the time being: echo "create table chrM_gap select * from chr1_gap where 0=1" \ | hgsql galGal2 # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR GALGAL2 (DONE 2/24/04 angie) ssh hgwdev # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd $HOME/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add galGal2 in all the right places and do make update make alpha cvs commit makefile # Add dbDb and defaultDb entries: echo 'insert into dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk) \ values("galGal2", "Feb. 2004", \ "/gbdb/galGal2/nib", "Chicken", "chr13:13120186-13124117", 1, \ 35, "Chicken", "Gallus gallus", \ "/gbdb/galGal2/html/description.html", 0);' \ | hgsql -h genome-testdb hgcentraltest echo 'update defaultDb set name = "galGal2" where genome = "Chicken"' \ | hgsql -h genome-testdb hgcentraltest # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR GALGAL2 (DONE 2/26/04 angie) ssh hgwdev echo 'insert into blatServers values("galGal2", "blat1", "17778", "1"); \ insert into blatServers values("galGal2", "blat1", "17779", "0");' \ | hgsql -h genome-testdb hgcentraltest # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE 2/23/04 angie) ssh hgwdev mkdir /gbdb/galGal2/html # Write ~/kent/src/hg/makeDb/trackDb/chicken/galGal2/description.html # with a description of the assembly and some sample position queries. 
chmod a+r $HOME/kent/src/hg/makeDb/trackDb/chicken/galGal2/description.html # Check it in and copy (ideally using "make alpha" in trackDb) to # /gbdb/galGal2/html # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 2/25/04 angie) ssh kksilo cd /cluster/data/galGal2 #- Build the .zip files cat << '_EOF_' > jkStuff/zipAll.csh rm -rf zip mkdir zip zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp zip -j zip/chromOut.zip */chr*.fa.out zip -j zip/chromFa.zip */chr*.fa zip -j zip/chromFaMasked.zip */chr*.fa.masked cd bed/simpleRepeat zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed cd ../.. cd /cluster/data/genbank ./bin/i386/gbGetSeqs -db=galGal2 -native GenBank mrna \ /cluster/data/galGal2/zip/mrna.fa cd /cluster/data/galGal2/zip zip -j mrna.zip mrna.fa '_EOF_' # << this line makes emacs coloring happy csh ./jkStuff/zipAll.csh |& tee zipAll.log cd zip #- Look at zipAll.log to make sure all file lists look reasonable. #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test #- Copy the .zip files to hgwdev:/usr/local/apache/... ssh hgwdev cd /cluster/data/galGal2/zip set gp = /usr/local/apache/htdocs/goldenPath/galGal2 mkdir -p $gp/bigZips cp -p *.zip $gp/bigZips mkdir -p $gp/chromosomes foreach f ( ../*/chr*.fa ) zip -j $gp/chromosomes/$f:t.zip $f end cd $gp/bigZips md5sum *.zip > md5sum.txt cd $gp/chromosomes md5sum *.zip > md5sum.txt # Take a look at bigZips/* and chromosomes/*, update their README.txt's # Can't make refGene upstream sequence files - no refSeq for chicken. # Maybe ensGene when we get that. 
# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 2/26/04 angie) ssh kkr1u00 # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /iscratch/i/galGal2/nib mkdir /iscratch/i/galGal2/nib cp -p /cluster/data/galGal2/nib/chr*.nib /iscratch/i/galGal2/nib # Pseudo-contig fa that have been repeat- and trf-masked: rm -rf /iscratch/i/galGal2/trfFa mkdir /iscratch/i/galGal2/trfFa foreach d (/cluster/data/galGal2/*/chr*_?{,?}) cp $d/$d:t.fa /iscratch/i/galGal2/trfFa end cp -p /cluster/data/galGal2/galGal2.2bit /iscratch/i/galGal2/ iSync # MAKE CHICKEN LINEAGE-SPECIFIC REPEATS (DONE 2/25/04 angie) # In an email 2/13/04, Arian said we could treat all human repeats as # lineage-specific, but could exclude these from chicken as ancestral: # L3, L3a, L3b, MIR, MIR3, MIRb, MIRm ssh kkr1u00 cd /cluster/data/galGal2 mkdir -p /iscratch/i/galGal2/linSpecRep foreach f (*/chr*.fa.out) awk '$10 !~/^(L3|L3a|L3b|MIR|MIR3|MIRb|MIRm)$/ {print;}' $f \ > /iscratch/i/galGal2/linSpecRep/$f:t:r:r.out.spec end # SWAP BLASTZ HUMAN-CHICKEN TO CHICKEN-HUMAN (HG17) (DONE 8/20/04 angie) ssh kolossus # hg17-galGal2 alignments were already axtRescored, no need to re-rescore. mkdir /cluster/data/galGal2/bed/blastz.hg17.swap.2004-06-14 ln -s blastz.hg17.swap.2004-06-14 /cluster/data/galGal2/bed/blastz.hg17 cd /cluster/data/galGal2/bed/blastz.hg17 set aliDir = /cluster/data/hg17/bed/blastz.galGal2.2004-06-14 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. 
foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom #1.4G /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChrom #1.4G unsorted #1.4G axtChrom rm -r unsorted # CHAIN HUMAN BLASTZ (DONE 8/20/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/galGal2/bed/blastz.hg17 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/galGal2/bed/blastz.hg17/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \ -minScore=5000 $1 \ /iscratch/i/galGal2/nib \ /iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 50 of 50 jobs #Average job time: 69s 1.15m 0.02h 0.00d #Longest job: 223s 3.72m 0.06h 0.00d #Submission to last job: 229s 3.82m 0.06h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/galGal2/bed/blastz.hg17/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/galGal2/bed/blastz.hg17/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain galGal2 ${c}_chainHg17 $i end # NET HUMAN BLASTZ (DONE 8/20/04 angie) ssh kksilo cd /cluster/data/galGal2/bed/blastz.hg17/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/galGal2/bed/blastz.hg17/axtChain netClass noClass.net galGal2 hg17 human.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/galGal2/bed/blastz.hg17/axtChain rm noClass.net # Make a 'syntenic' subset of these with netFilter -syn human.net > humanSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/galGal2/bed/blastz.hg17/axtChain netFilter -minGap=10 human.net | hgLoadNet galGal2 netHg17 stdin netFilter -minGap=10 humanSyn.net | hgLoadNet galGal2 netSyntenyHg17 stdin # Add entries for chainHg17, netHg17, netSyntenyHg17 to chicken/galGal2 trackDb # GENERATE HG17 MAF FOR MULTIZ FROM NET (DONE 8/20/04 angie) ssh kksilo cd /cluster/data/galGal2/bed/blastz.hg17/axtChain netSplit human.net net cd /cluster/data/galGal2/bed/blastz.hg17 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/galGal2/nib \ /cluster/data/hg17/nib stdout \ | axtSort stdin axtNet/$chr.axt 
end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/galGal2/chrom.sizes /cluster/data/hg17/chrom.sizes \ $maf -tPrefix=galGal2. -qPrefix=hg17. end # XENOPUS BLASTZ/CHAIN/NET (DONE 9/24/04 jk) # see makeXenTro1.doc and search for zb.galGal2 # The results of this are also symlinked under galGal2/bed # MAKE VSHG17 DOWNLOADABLES (DONE 5/2/05 angie) ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/galGal2/vsHg17 cd /usr/local/apache/htdocs/goldenPath/galGal2/vsHg17 ln -s /cluster/data/galGal2/bed/blastz.hg17/axtChain/galGal2.hg17.*.gz . mkdir axtNet foreach f (/cluster/data/galGal2/bed/blastz.hg17/axtNet/*.axt.gz) ln -s $f axtNet/$f:t:r:r.galGal2.hg17.net.axt.gz end nice md5sum *.gz */*.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # BLASTZ SELF (DONE 3/3/04 angie) ssh kk mkdir -p /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03 cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03 # OK, the 500k x whole chrom 03-02 run was taking FOREVER (18hrs, ~63%) # and multiple folks are asking me to dig up pileups for Arian. So, # just do a vanilla run as a baseline; return to the experiments later. cat << '_EOF_' > DEF # chicken vs. 
chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET # Chicken SEQ1_DIR=/iscratch/i/galGal2/nib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK= SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Chicken SEQ2_DIR=/iscratch/i/galGal2/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK= SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=10000 BASE=/cluster/data/galGal2/bed/blastz.galGal2.2004-03-03 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.galGal2-galGal2 # source the DEF file bash . ./DEF mkdir -p $BASE/run ~angie/hummus/make-joblist $DEF > $BASE/run/j sh $BASE/xdir.sh cd $BASE/run # now edit j to prefix path to executable name sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 # make sure the j2 edits are OK, then use it: mv j2 j para create j para try, check, push, check, ... 
#Completed: 22801 of 22801 jobs #Average job time: 257s 4.29m 0.07h 0.00d #Longest job: 1238s 20.63m 0.34h 0.01d #Submission to last job: 6939s 115.65m 1.93h 0.08d # --- normalize (PennSpeak for lift) ssh kki cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03 # run bash shell if not running it already source DEF mkdir -p $BASE/run.1 mkdir -p $BASE/lav # create a new job list to convert out files to lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \ > run.1/jobList cd run.1 wc -l jobList head jobList # make sure the job list is OK para create jobList para push #Completed: 151 of 151 jobs #Average job time: 24s 0.41m 0.01h 0.00d #Longest job: 127s 2.12m 0.04h 0.00d #Submission to last job: 246s 4.10m 0.07h 0.00d # convert lav files to axt ssh kki cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 # lavToAxt has a new -dropSelf option that *will* replace axtDropOverlap, # once /cluster/bin/i386/lavToAxt is rebuilt... Meanwhile it's built # for x86_64. cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt -dropSelf stdin /iscratch/i/galGal2/nib /iscratch/i/galGal2/nib \ stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... #Completed: 53 of 54 jobs #Crashed: 1 jobs #Average job time: 30s 0.51m 0.01h 0.00d #Longest job: 541s 9.02m 0.15h 0.01d #Submission to last job: 576s 9.60m 0.16h 0.01d # The crash was not a memory problem -- it was that chr8_random's lav # contained only 1 alignment, to itself (dropped), so its .axt is empty. 
# CHAIN SELF BLASTZ (DONE 3/3/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ=chrUn $1 \ | axtChain stdin /iscratch/i/galGal2/nib /iscratch/i/galGal2/nib $2 \ > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # oops -- empty chr8_random strikes again: #Completed: 53 of 54 jobs #Crashed: 1 jobs #Average job time: 53s 0.89m 0.01h 0.00d #Longest job: 514s 8.57m 0.14h 0.01d #Submission to last job: 514s 8.57m 0.14h 0.01d # now on the cluster server, sort chains ssh kolossus cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # trim to minScore=20000 to cut some of the fluff mkdir chainFilt foreach f (chain/*.chain) chainFilter -minScore=20000 $f > chainFilt/$f:t end # Load chains into database ssh hgwdev cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain/chainFilt foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain galGal2 ${c}_chainSelf $i end # NET SELF BLASTZ (DONE 3/3/04 angie) ssh kolossus cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin 
noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain netClass -noAr noClass.net galGal2 galGal2 self.net # Make a 'syntenic' subset: ssh kolossus cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain rm noClass.net netFilter -syn self.net > selfSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain netFilter -minGap=10 self.net | hgLoadNet galGal2 netSelf stdin netFilter -minGap=10 selfSyn.net | hgLoadNet galGal2 netSyntenySelf stdin # Add entries for chainSelf, netSelf, netSyntenySelf to # chicken/galGal2 trackDb # MAKE VSSELF DOWNLOADABLES (DONE 3/3/04 angie) ssh kolossus cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03 # Webb asked for axtChrom/chr22.axt... since axtChrom is rel. small # this time, just put it all out there. zip /cluster/data/galGal2/zip/GGaxtChrom.zip axtChrom/chr*.axt cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain cp all.chain self.chain zip /cluster/data/galGal2/zip/self.chain.zip self.chain rm self.chain zip /cluster/data/galGal2/zip/self.net.zip self.net zip /cluster/data/galGal2/zip/selfSyn.net.zip selfSyn.net ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/galGal2/vsSelf cd /usr/local/apache/htdocs/goldenPath/galGal2/vsSelf mv /cluster/data/galGal2/zip/GGaxtChrom.zip axtChrom.zip mv /cluster/data/galGal2/zip/self*.zip . md5sum *.zip > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
# EXTRACT PILEUPS FROM SELF ALIGNMENTS (DONE 3/9/04 angie) ssh kolossus cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03 mkdir pslChrom foreach f (axtChrom/chr*.axt) echo $f:t:r axtToPsl $f S1.len S2.len pslChrom/$f:t:r.psl end mkdir pileups foreach f (pslChrom/chr*.psl) echo extracting pileups from $f:t:r pslToPileup $f S1.len pileups/$f:t:r.bed end ssh hgwdev cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03 hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/pileups.sql galGal2 \ pileups pileups/chr*.bed # Used table browser to intersect pileups w/>=50% overlap with rmsk, # save BED to pileups50PctRmsk.bed . # AUTO UPDATE GENBANK MRNA RUN (DONE 2/26/04 angie) # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in # edits, check them in if necessary: diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf # Edit etc/genbank.conf and add these lines: # galGal2 (chicken) galGal2.genome = /iscratch/i/galGal2/nib/chr*.nib galGal2.lift = /cluster/data/galGal2/jkStuff/liftAll.lft galGal2.refseq.mrna.native.load = no galGal2.genbank.mrna.xeno.load = yes galGal2.genbank.est.xeno.load = no galGal2.downloadDir = galGal2 cvs ci etc/genbank.conf # Edit src/align/gbBlat to add /iscratch/i/galGal2/11.ooc cvs diff src/align/gbBlat make cvs ci src/align/gbBlat # Install to /cluster/data/genbank: make install-server ssh eieio cd /cluster/data/genbank # This is an -initial run, mRNA only: nice bin/gbAlignStep -clusterRootDir=/cluster/bluearc/genbank \ -iserver=kkr1u00 -iserver=kkr2u00 -iserver=kkr3u00 -iserver=kkr4u00 \ -iserver=kkr5u00 -iserver=kkr6u00 -iserver=kkr7u00 -iserver=kkr8u00 \ -srcDb=genbank -type=mrna -verbose=1 -initial galGal2 # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad galGal2 # Clean up: rm -r /cluster/bluearc/genbank/work/initial.galGal2 ssh eieio # -initial for ESTs (now with 
/cluster/store7 and iservers): nice bin/gbAlignStep -clusterRootDir=/cluster/store7/genbank \ -iserver=kkr1u00 -iserver=kkr2u00 -iserver=kkr3u00 -iserver=kkr4u00 \ -iserver=kkr5u00 -iserver=kkr6u00 -iserver=kkr7u00 -iserver=kkr8u00 \ -srcDb=genbank -type=est -verbose=1 -initial galGal2 # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 galGal2 # Clean up: rm -r /cluster/store7/genbank/work/initial.galGal2 # PRODUCING GENSCAN PREDICTIONS (DONE 2/25/04 angie) ssh hgwdev mkdir /cluster/data/galGal2/bed/genscan cd /cluster/data/galGal2/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/galGal2/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f genome.list touch genome.list foreach f ( `ls -1S /cluster/data/galGal2/*/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try, check, push, check, ... #Completed: 254 of 259 jobs #Crashed: 5 jobs #Average job time: 438s 7.29m 0.12h 0.01d #Longest job: 21229s 353.82m 5.90h 0.25d #Submission to last job: 21229s 353.82m 5.90h 0.25d # If there are crashes, diagnose with "para problems". 
# If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". /cluster/bin/i386/gsBig /cluster/data/galGal2/1/chr1_4/chr1_4.fa.masked gtf/chr1_4.fa.gtf -trans=pep/chr1_4.fa.pep -subopt=subopt/chr1_4.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 /cluster/bin/i386/gsBig /cluster/data/galGal2/7/chr7_5/chr7_5.fa.masked gtf/chr7_5.fa.gtf -trans=pep/chr7_5.fa.pep -subopt=subopt/chr7_5.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 /cluster/bin/i386/gsBig /cluster/data/galGal2/11/chr11_2/chr11_2.fa.masked gtf/chr11_2.fa.gtf -trans=pep/chr11_2.fa.pep -subopt=subopt/chr11_2.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 /cluster/bin/i386/gsBig /cluster/data/galGal2/1/chr1_36/chr1_36.fa.masked gtf/chr1_36.fa.gtf -trans=pep/chr1_36.fa.pep -subopt=subopt/chr1_36.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 /cluster/bin/i386/gsBig /cluster/data/galGal2/Z/chrZ_6/chrZ_6.fa.masked gtf/chrZ_6.fa.gtf -trans=pep/chrZ_6.fa.pep -subopt=subopt/chrZ_6.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 # Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/galGal2/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/galGal2/bed/genscan ldHgGene galGal2 genscan genscan.gtf hgPepPred galGal2 generic genscanPep genscan.pep hgLoadBed galGal2 genscanSubopt genscanSubopt.bed # PRODUCING FUGU BLAT ALIGNMENTS (DONE 2/27/04) ssh kk mkdir /cluster/data/galGal2/bed/blatFr1 cd /cluster/data/galGal2/bed/blatFr1 ls 
-1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst ls -1S /iscratch/i/galGal2/nib/*.nib > chicken.lst cat << '_EOF_' > gsub #LOOP blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy mkdir psl gensub2 chicken.lst fugu.lst gsub spec para create spec para try, check, push, check, ... # for future reference, the trfFa on bluearc seem better partitioned... #Completed: 31212 of 31212 jobs #Average job time: 187s 3.11m 0.05h 0.00d #Longest job: 31706s 528.43m 8.81h 0.37d #Submission to last job: 31706s 528.43m 8.81h 0.37d # Sort alignments: ssh kksilo cd /cluster/data/galGal2/bed/blatFr1 pslCat -dir psl | pslSortAcc nohead chrom /cluster/store2/temp stdin # load seq & psl into database: ssh hgwdev mkdir /gbdb/galGal2/fuguSeq ln -s /cluster/data/fr1/fugu_v3.masked.fa /gbdb/galGal2/fuguSeq/ cd /cluster/data/galGal2/bed/blatFr1 hgLoadSeq galGal2 /gbdb/galGal2/fuguSeq/fugu_v3.masked.fa cd /cluster/data/galGal2/bed/blatFr1/chrom cat *.psl | hgLoadPsl -fastLoad -table=blatFr1 galGal2 stdin # LOAD CPGISLANDS (DONE 3/23/04 angie) ssh hgwdev mkdir -p /cluster/data/galGal2/bed/cpgIsland cd /cluster/data/galGal2/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make mv cpglh.exe /cluster/data/galGal2/bed/cpgIsland/ ssh kksilo cd /cluster/data/galGal2/bed/cpgIsland foreach f (../../*/chr*.fa.masked) set fout=$f:t:r:r.cpg echo running cpglh on $f to $fout ./cpglh.exe $f > $fout end # Transform cpglh output to bed + cat << '_EOF_' > filter.awk /* Input columns: */ /* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */ /* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */ /* Output columns: */ /* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */ /* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94 */ { $2 = $2 - 
1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk chr*.cpg > cpgIsland.bed # load into database: ssh hgwdev cd /cluster/data/galGal2/bed/cpgIsland hgLoadBed galGal2 cpgIslandExt -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # LOAD GENEID GENES (DONE 3/4/04 angie) # reloaded 3/16/04 with -gtf instead of -exon=CDS (nec. now! for stop_codon) mkdir -p /cluster/data/galGal2/bed/geneid/download cd /cluster/data/galGal2/bed/geneid/download foreach f (/cluster/data/galGal2/*/chr*.fa) set chr = $f:t:r wget \ http://genome.imim.es/genepredictions/G.gallus/golden_path_200402/geneid_v1.1/$chr.gtf wget \ http://genome.imim.es/genepredictions/G.gallus/golden_path_200402/geneid_v1.1/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. 
ldHgGene -gtf galGal2 geneid download/*.gtf hgPepPred galGal2 generic geneidPep download/*-fixed.prot # QA NOTE [ASZ: 9-11-2006]: mytouch for geneidPep # sudo mytouch galGal2 geneidPep 200403162100.00 # BUILD BLAST DATABASES cd /cluster/data/galGal2 mkdir blastDb for i in `cat chrom.lst`; do for j in `echo $i/chr${i}_*/chr*_*_*.fa`; do ln -s `pwd`/$j blastDb/`basename $j .fa`; done; done cd blastDb for i in *; do formatdb -p F -i $i; done # END BLAST # TIGR GENE INDEX (TODO) mkdir -p /cluster/data/galGal2/bed/tigr cd /cluster/data/galGal2/bed/tigr wget ftp://ftp.tigr.org//pub/data/tgi/TGI_track_ChickenGenome_Jun2003.tgz tar xvfz TGI_track_ChickenGenome_Jun2003.tgz foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (rat cow human pig chicken) setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ldHgGene -exon=TC galGal2 tigrGeneIndex *.gff # 139456 gene predictions gzip *TCs *.gff # GC 5 BASE WIGGLE TRACK (DONE 2/24/04 angie) ssh kksilo mkdir /cluster/data/galGal2/bed/gc5Base cd /cluster/data/galGal2/bed/gc5Base mkdir wigData5 dataLimits5 foreach n (../../nib/*.nib) set c=`basename ${n} | sed -e "s/.nib//"` set C=`echo $c | sed -e "s/chr//"` echo "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps -file=stdout -win=5 galGal2 ../../nib \ | grep -w GC \ | awk '{ \ bases = $3 - $2 \ perCent = $5/10.0 \ printf "%d\t%.1f\n", $2+1, perCent \ }' \ | wigAsciiToBinary \ -dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \ -name=${C} stdin > dataLimits5/${c} end # data is complete, load track on hgwdev ssh hgwdev cd /cluster/data/galGal2/bed/gc5Base # create symlinks for .wib files, then load track mkdir /gbdb/galGal2/wib ln -s `pwd`/wigData5/*.wib /gbdb/galGal2/wib hgLoadWiggle galGal2 gc5Base wigData5/*.wig # SUPERCONTIG LOCATIONS TRACK (DONE 2/26/04 angie) # Supercontig N is a collection of contained "ContigN.*". 
# Extract Supercontig starts and ends from AGP into bed. mkdir /cluster/data/galGal2/bed/supercontig cd /cluster/data/galGal2/bed/supercontig cat ../../agp_040224/chr*.agp \ | perl -we 'while (<>) { \ chop; @words = split("\t"); \ next if ($words[4] eq "N"); \ if ($words[5] =~ /^(Contig\d+)\.(\d+)$/) { \ $sup = $1; $newNum = $2; \ $firstNum = $newNum if (! defined $firstNum); \ if (defined $lastSup && $lastSup ne $sup) { \ $start--; \ print "$chr\t$start\t$end\t$lastSup ($firstNum-$lastNum)\t1000\t$strand\n"; \ undef $start; undef $end; $firstNum = $newNum; \ } \ $chr = $words[0]; $cStart = $words[1]; $cEnd = $words[2]; \ $strand = $words[8]; \ $start = $cStart if (\!defined $start || $start > $cStart); \ $end = $cEnd if (\!defined $end || $end < $cEnd); \ $lastSup = $sup; $lastNum = $newNum; \ } else { die "bad 6th word $words[5]\n"; } \ } \ print "$chr\t$start\t$end\t$lastSup ($firstNum-$lastNum)\t1000\t$strand\n";' \ > supercontig.bed ssh hgwdev cd /cluster/data/galGal2/bed/supercontig hgLoadBed -tab galGal2 supercontig supercontig.bed # CREATING QUALITY SCORE TRACK (DONE 2/24/04 angie) ssh kksilo mkdir /cluster/data/galGal2/bed/qual cd /cluster/data/galGal2/bed/qual cat ../../agp_040224/chr*.agp > assembly.agp tar xvzf ../../chicken_040105.contigs.qual.tar.gz cat chicken*.qual | qaToQac stdin stdout \ | chimpChromQuals assembly.agp stdin chrom.qac rm chicken*.qual mkdir wigData dataLimits foreach c (`cat ../../chrom.lst`) foreach agp (../../$c/chr$c.agp ../../$c/chr${c}_random.agp) if (-e $agp) then set chr = $agp:t:r set abbrev = `echo $chr | sed -e 's/^chr//; s/_random/r/;'` echo $chr to $abbrev wiggle qacToWig chrom.qac -name=$chr stdout \ | wigAsciiToBinary -dataSpan=1 -chrom=$chr \ -wibFile=wigData/qual_$abbrev -name=$abbrev stdin \ > dataLimits/$chr endif end end # Verify size of .wib file = chrom length foreach f (wigData/*.wib) set abbrev = $f:t:r set chr = `echo $abbrev | sed -e 's/^qual_/chr/; s/r$/_random/;'` set wibLen = `ls -l $f | awk '{print 
$5;}'` set chromLen = `grep -w $chr ../../chrom.sizes | awk '{print $2;}'` if ($wibLen != $chromLen) then echo "ERROR: $chr size is $chromLen but wib size is $wibLen" else echo $chr OK. endif end #ERROR: chr12 size is 19821895 but wib size is 19811895 #ERROR: chr23 size is 5666127 but wib size is 5166127 #ERROR: chr28 size is 4731479 but wib size is 4231479 # That's OK -- they end in gaps of the right sizes (not incl'd in wib) # /gbdb & load: ssh hgwdev cd /cluster/data/galGal2/bed/qual mkdir -p /gbdb/galGal2/wib ln -s `pwd`/wigData/*.wib /gbdb/galGal2/wib hgLoadWiggle galGal2 quality wigData/*.wig # To speed up display for whole chrom views, need to make zoomed # data views. Zoom to 1K points per pixel. ssh kksilo cd /cluster/data/galGal2/bed/qual mkdir -p wigData1K mkdir -p dataLimits1K foreach c (`cat ../../chrom.lst`) if (-f ../../${c}/chr${c}.agp) then echo "chr${c} quality 1K zoom" qacToWig chrom.qac -name=chr${c} stdout \ | wigZoom stdin \ | wigAsciiToBinary -dataSpan=1024 \ -chrom=chr${c} -wibFile=wigData1K/qc1K_${c} \ -name=${c} stdin > dataLimits1K/chr${c} endif if (-f ../../${c}/chr${c}_random.agp) then echo "chr${c}_random quality 1K zoom" qacToWig chrom.qac -name=chr${c}_random stdout \ | wigZoom stdin \ | wigAsciiToBinary -dataSpan=1024 \ -chrom=chr${c}_random -wibFile=wigData1K/qc1K_${c}r \ -name=${c}_random stdin > dataLimits1K/chr${c}r endif end ssh hgwdev cd /cluster/data/galGal2/bed/qual/wigData1K # create symlinks for .wib files ln -s `pwd`/*.wib /gbdb/galGal2/wib # load in addition to existing data hgLoadWiggle -oldTable galGal2 quality *.wig # MAKE UNIGENE ALIGNMENTS (DONE 2/29/04 angie) ssh kksilo mkdir /cluster/data/galGal2/bed/unigene cd /cluster/data/galGal2/bed/unigene # Download chicken Unigene and run Chuck's preproc scripts: wget ftp://ftp.ncbi.nih.gov/repository/UniGene/Gga.info wget ftp://ftp.ncbi.nih.gov/repository/UniGene/Gga.seq.uniq.gz wget ftp://ftp.ncbi.nih.gov/repository/UniGene/Gga.data.gz gunzip *.gz # Chuck's 
script expects human (Hs) -- use Gga for chicken: sed -e 's/Hs/Gga/g' /projects/cc/hg/sugnet/uniGene/countSeqsInCluster.pl \ > ./countSeqsInCluster.pl chmod a+x ./countSeqsInCluster.pl ./countSeqsInCluster.pl Gga.data counts.tab /projects/cc/hg/sugnet/uniGene/parseUnigene.pl Gga.seq.uniq \ Gga.seq.uniq.simpleHeader.fa leftoverData.tab # Put on iscratch for cluster run ssh kkr1u00 mkdir /iscratch/i/galGal2/unigene cp -p /cluster/data/galGal2/bed/unigene/Gga.seq.uniq.simpleHeader.fa \ /iscratch/i/galGal2/unigene/ iSync # align on big cluster ssh kk cd /cluster/data/galGal2/bed/unigene ls -1S /iscratch/i/galGal2/trfFa/*.fa > allctg.lst ls -1S /iscratch/i/galGal2/unigene/*.fa > uniGene.lst # seems to be a bug in latest blat: -mask=lower => 0 alignments! # for now, use Jim's next-to-latest blat.27: cat << '_EOF_' > template.sub #LOOP /cluster/home/kent/bin/i386/blat.27 -mask=lower -minIdentity=95 -ooc=/iscratch/i/galGal2/11.ooc {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 allctg.lst uniGene.lst template.sub jobList para create jobList mkdir psl para try, check, push, check #Completed: 259 of 259 jobs #Average job time: 13s 0.22m 0.00h 0.00d #Longest job: 21s 0.35m 0.01h 0.00d #Submission to last job: 22s 0.37m 0.01h 0.00d # postprocess ssh kksilo pslSort dirs raw.psl tmp psl >& pslSort.log liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \ | pslReps -minCover=0.2 -minAli=0.965 -nearTop=0.002 \ stdin uniGene.lifted.pslReps.psl /dev/null rm raw.psl gzip Gga.seq.uniq Gga.data # load into db ssh hgwdev cd /cluster/data/galGal2/bed/unigene mkdir /gbdb/galGal2/unigene ln -s /cluster/data/galGal2/bed/unigene/Gga.seq.uniq.simpleHeader.fa \ /gbdb/galGal2/unigene/ hgLoadSeq galGal2 /gbdb/galGal2/unigene/Gga.seq.uniq.simpleHeader.fa hgLoadPsl -fastLoad -table=uniGene_gg galGal2 uniGene.lifted.pslReps.psl # LIFTOVER ENCODE REGIONS FOR JOHN WALLIS 
(DONE 3/2/04 angie) # It's too early to show these in the browser, but try to liftOver: ssh hgwdev mkdir /cluster/data/galGal2/bed/encodeLiftOver cd /cluster/data/galGal2/bed/encodeLiftOver echo 'select * from hg16.encodeRegions' | hgsql hg16 -N > hg16.eR.bed # Of 44 regions: # -minMatch=.95 (default) -> nothing mapped # -minMatch=.1 -> 2 mapped # -minMatch=.01 -> 33 mapped # -minMatch=.001 -> 14 mapped -- huh??? -> "Duplicated in new." # -minMatch=.011 -> 34 mapped -- best I could get. liftOver -minMatch=.011 hg16.eR.bed \ /cluster/data/hg16/bed/bedOver/hg16TogalGal2.chain \ galGal2.encodeRegions.bed galGal2.encodeRegions.unmapped # emailed the files to John Wallis w/warning they're prelim, automated. # This is experimental, so no track is put up yet, (2004-07-9 kate) cd /cluster/data/galGal2/bed ln -s encodeLiftOver encodeRegions cd encodeRegions ln -s galGal2.encodeRegions.bed encodeRegions.bed hgLoadBed galGal2 encodeRegions encodeRegions.bed -noBin # SNP & GENES FROM BEIJING GENOME INST. (DONE 7/20/04 angie) ssh kksilo mkdir /cluster/data/galGal2/bed/bgi cd /cluster/data/galGal2/bed/bgi # Their files are very fragmented into a deep dir structure. # Pull the whole thing (except chromatograms) over intact. wget -r --exclude-directories=\*/chromatograms \ --timestamping ftp://genome.wustl.edu/private/BGI/ # Takes hours... # Zhang Yong said to ignore the target regions, so move the dir # out of the way for now... mv genome.wustl.edu/private/BGI/SNPs_by_locus/ . # Coverage track: bed 4 showing where SNP reads were performed. # Looks like we can get strain info from filename... 
find genome.wustl.edu/private/BGI/SNPs_by_strain/2004-07-07 \ -name \*ContigCov-\*.txt\* -print \ | grep -v /BAC-endpairs/ \ | xargs $HOME/kent/src/hg/snp/bgi/bgiCovToBed.pl \ > bgiCov.bed # Gene annotations: find genome.wustl.edu/private/BGI/external_data/2004-05-11 \ -name \*exon_intron.txt -print \ | sed -e 's/ /\\ /g' \ | xargs $HOME/kent/src/hg/snp/bgi/bgiExonIntronToGenePred.pl \ > bgiGene.tab # Gene annotation peptide seqs: find genome.wustl.edu/private/BGI/external_data/2004-05-11 \ -name \*.pep\* -print \ | sed -e 's/ /\\ /g' \ | xargs $HOME/kent/src/hg/snp/bgi/bgiPep.pl \ > bgiGenePep.fa # Gene info and SNPs: slurp in all of the GeneSNPcon* files and # SNPlist-* files, cross-reference, and dump out bgiGeneInfo.sql # and bgiSnp.bed. (bgiGeneInfo.sql with INSERT statements because of # the longblobs which can't be loaded in from .tab unfortunately) find genome.wustl.edu/private/BGI/SNPs_by_strain/2004-07-07 \ -name SNPlist-\*.txt\* -print \ | grep -v /BAC-endpairs/ \ > SNPlists.list find genome.wustl.edu/private/BGI/SNPs_by_cDNA/2004-07-07 \ -name \*GeneSNPcon\*.txt\* -print \ > GeneSNPcons.list # Takes about 15 minutes: time $HOME/kent/src/hg/snp/bgi/bgiXref.pl SNPlists.list GeneSNPcons.list # Tons of duplicated gene-snp association warnings like this: #dup assoc ENS.ENSGALT00000000020 snp.117.58.6795.S.2: # ENS.ENSGALT00000000020 snp.117.58.6795.S.2 Intron # ENS.ENSGALT00000000020 snp.117.58.6795.S.2 Splice Site AG->AA # which I think is OK. Also duplicated gene sources (asked Zhang Yong): #dup: HDG.ENSG00000005381 HumanDiseaseGene HumanDiseaseGene # Load 'em up... 
ssh hgwdev cd /cluster/data/galGal2/bed/bgi hgLoadBed galGal2 bgiCov bgiCov.bed hgsql galGal2 < $HOME/kent/src/hg/lib/bgiGeneInfo.sql hgsql galGal2 < ./bgiGeneInfo.sql hgsql galGal2 < $HOME/kent/src/hg/lib/bgiGeneSnp.sql echo "load data local infile './bgiGeneSnp.tab' into table bgiGeneSnp" \ | hgsql galGal2 hgPepPred galGal2 generic bgiGenePep bgiGenePep.fa ldHgGene galGal2 bgiGene -predTab bgiGene.tab hgLoadBed galGal2 bgiSnp -tab -sqlTable=$HOME/kent/src/hg/lib/bgiSnp.sql \ bgiSnp.bed # ENSEMBL GENE BUILD (DONE 4/27/04 angie) ssh kksilo mkdir /cluster/data/galGal2/bed/ensembl cd /cluster/data/galGal2/bed/ensembl wget ftp://ftp.sanger.ac.uk/pub/searle/chicken/chicken_build.pep.gz wget ftp://ftp.sanger.ac.uk/pub/searle/chicken/chicken_build.gtf.gz gunzip *.gz sed -e 's/^/chr/' chicken_build.gtf \ > chicken_build.fixed.gtf perl -wpe 's/^>.*trans_id=(\w+).*/>$1/' \ chicken_build.pep > chicken_build.fixed.pep ssh hgwdev cd /cluster/data/galGal2/bed/ensembl hgPepPred galGal2 generic ensPep chicken_build.fixed.pep ldHgGene -gtf -frame -id -geneName galGal2 ensGene chicken_build.fixed.gtf # MAKE KNOWN REGULATORY REGIONS (DONE 3/29/04 angie) # Adam got several text files from Webb (and/or Laura Elnitski?), # translated them to bed 3, and loaded. I tweaked -> bed 4. 
# (this does both hg16 and galGal2) ssh hgwdev cd /cluster/home/acs/webb-launcher make -f Makefile.angie munge make -f Makefile.angie track # RUN GENE-CHECK ON MRNA W/CDS, ENSEMBL (DONE 3/29/04 markd) # Note: /cluster/bin/scripts/runGeneCheck should make this easier # in the future, but this also shows how MarkD ran gene-check on # mRNAs with annotated CDS: # get mrna data from hgwbeta, which is most current: ssh hgwbeta mkdir /cluster/store7/markd/galGal2 cd /cluster/store7/markd/galGal2 mkdir data hgsql -N -e 'select * from all_mrna' galGal2 |cut -f 2-30 \ > data/all_mrna.psl hgsql -N -e 'select mrna.acc,cds.name from mrna,cds,all_mrna where all_mrna.qName=mrna.acc and mrna.cds = cds.id' galGal2 \ > data/cds.tab nice mrnaToGene -cdsFile=data/cds.tab data/all_mrna.psl data/all_mrna.gp \ >& data/all_mrna.log # get ensGene data from hgwdev: ssh hgwdev cd /cluster/store7/markd/galGal2 hgsql -N -e 'select * from ensGene ' galGal2 >data/ensGene.gp # run gene-check and gene-extract ssh kksilo cd /cluster/store7/markd/galGal2 mkdir results ~markd/bin/gene-check -incl-ok -nib-dir /cluster/data/galGal2/nib \ -details-out results/all_mrna.details data/all_mrna.gp \ results/all_mrna.chk >& all_mrna.log ~markd/bin/gene-check -incl-ok -nib-dir /cluster/data/galGal2/nib \ -details-out results/ensGene.details data/ensGene.gp \ results/ensGene.chk >& ensGene.log ~markd/bin/gene-extract -nib-dir /cluster/data/galGal2/nib \ -all-cds-fa results/all_mrna.cds.fa data/all_mrna.gp >& all_mrna.log ~markd/bin/gene-extract -nib-dir /cluster/data/galGal2/nib \ -all-cds-fa results/ensGene.cds.fa data/ensGene.gp >&ensGene.log # Files in results/ # *.chk - output of the gene-check program, rdb format # *.details - details of problems detected # *.cds.fa - fasta of CDS taken from genome # Columns in *.chk file are # acc - name field from genePred # chr - genome location from genePred # chrStart # chrEnd # strand # stat - overall status based on other data: ok or err # frame - is the frame 
annotation sane (mult of 3): ok, bad, or noCDS # if noCDS, other CDS checks are not done. # start - Is there a valid start codon: ok or no # stop - Is there a valid stop codon: ok or no # orfStop - Number of in-frame stop codons # smallGap - Number of small gaps, too small to be introns # unknownSplice - Number of introns with unknown splicing patterns # (considers the three most common splicing patterns as valid) # causes - list of symbolic names summarizing the failures # WEBB'S PUTATIVE NON-EXONIC CONSERVED REGIONS (REDONE 6/26/04 angie) ssh hgwdev mkdir /cluster/data/galGal2/bed/webbNonExonic cd /cluster/data/galGal2/bed/webbNonExonic # saved email attachment from Webb Miller (5/26/04) awk 'BEGIN {i=1;} {print $1 "\t" $2-1 "\t" $3 "\t" $1 "." i; i++;}' \ chickenspans2.txt > webbNonExonic.bed hgLoadBed galGal2 webbNonExonic webbNonExonic.bed # UN-ANNOTATED (EXCEPT FOR CROSS-SPECIES) REGIONS (DONE 4/27/04 angie) # Anton Nekrutenko asked the chickenanalysis list for this... sounds # like a job for featureBits. :) ssh hgwdev mkdir /cluster/data/galGal2/bed/unAnnotated cd /cluster/data/galGal2/bed/unAnnotated nice featureBits galGal2 -minSize=12 \ \!gap \!bgiGene \!ensGene \!geneid \!genscan \!twinscan \ \!mrna \!intronEst \!est \!xenoMrna \ \!uniGene_gg \!cpgIslandExt \!rmsk \!simpleRepeat \ -bed=unAnnotated.bed #826274476 bases of 1054197620 (78.379%) in intersection hgLoadBed galGal2 unAnnotated unAnnotated.bed # not much of a drop in coverage with the -minSize: nice featureBits galGal2 unAnnotated #825770129 bases of 1054197620 (78.332%) in intersection # SYNTENY BY HOMOLOGY FROM EVGENY ZDOBNOV (DONE 4/15/04 angie) # Note 6/3/04: This has been replaced by # http://azra.embl-heidelberg.de/~zdobnov/Chicken/chicken-human-mm3_mouse-rat_synteny.txt # -- but unfortunately that only has start coords, no ends (or lengths). # could get lengths from ensmart... for all 4 genomes... see if there's # any demand/interest. 
ssh kksilo mkdir /cluster/data/galGal2/bed/zdobnov cd /cluster/data/galGal2/bed/zdobnov wget --timestamping \ http://azra.embl-heidelberg.de/~zdobnov/Chicken/homo_chicken_synteny.txt wget --timestamping \ http://azra.embl-heidelberg.de/~zdobnov/Chicken/mouse_chicken_synteny.txt wget --timestamping \ http://azra.embl-heidelberg.de/~zdobnov/Chicken/rat_chicken_synteny.txt # This is a one-shot, so I'm writing a perl script... transZdobnov.pl hg16 homo_chicken_synteny.txt > zdobnovHg16.bed transZdobnov.pl mm3 mouse_chicken_synteny.txt > zdobnovMm3.bed transZdobnov.pl rn3 rat_chicken_synteny.txt > zdobnovRn3.bed ssh hgwdev cd /cluster/data/galGal2/bed/zdobnov sed -e 's/zdobnovSynt/zdobnovHg16/' $HOME/kent/src/hg/lib/zdobnovSynt.sql \ > zd.sql hgLoadBed -sqlTable=zd.sql galGal2 zdobnovHg16 zdobnovHg16.bed sed -e 's/zdobnovSynt/zdobnovMm3/' $HOME/kent/src/hg/lib/zdobnovSynt.sql \ > zd.sql hgLoadBed -sqlTable=zd.sql galGal2 zdobnovMm3 zdobnovMm3.bed sed -e 's/zdobnovSynt/zdobnovRn3/' $HOME/kent/src/hg/lib/zdobnovSynt.sql \ > zd.sql hgLoadBed -sqlTable=zd.sql galGal2 zdobnovRn3 zdobnovRn3.bed # FILTER MRNAS FOR CHECKING ENSEMBL GENES (DONE 4/12/04 angie) # Use the TB to generate BED for mRNA alignments with no overlap w/ensGene. # saved as galGal2.mrnaNotEnsGene.bed (Sent to Ewan 4/6/04 -- needed filt) ssh hgwdev mkdir /cluster/data/galGal2/bed/tightMrna cd /cluster/data/galGal2/bed/tightMrna # Get list of mRNA accs from the chEST project, which may be a little # more shaky than others (?). 
echo 'select mrna.acc from mrna,description \ where mrna.description = description.id and \ description.name like "Gallus gallus finished cDNA, clone ChEST%.";' \ | hgsql galGal2 -N > chESTaccs.txt ssh kksilo cd /cluster/data/galGal2/bed/tightMrna # Make a uniq'd list of accs that didn't overlap ensGene: awk '{print $4;}' galGal2.mrnaNotEnsGene.bed | sort | uniq > mNotE.txt # Separate those into chEST and non-chEST: cp /dev/null mNotEChEST.txt cp /dev/null mNotEOther.txt foreach a (`cat mNotE.txt`) if (`grep $a chESTaccs.txt` != "") then echo $a >> mNotEChEST.txt else echo $a >> mNotEOther.txt endif end wc -l mNot*.txt # 2987 mNotE.txt # 2484 mNotEChEST.txt # 503 mNotEOther.txt # Get psl for those alignments using the TB (paste in those files as # keyword lists). (All fields but bin.) Save as mNotEChEST.psl, # mNotEOther.psl. Edit the #chrom ... comments out of them. # Dang, extra tab at end causes trouble for pslReps, edit out: perl -wpe 's/\t$//' mNotEChEST.psl > tmp; mv tmp mNotEChEST.psl perl -wpe 's/\t$//' mNotEOther.psl > tmp; mv tmp mNotEOther.psl # Run pslReps with more stringent parameters than used for genbank: pslReps -minCover=0.50 -minAli=0.97 -nearTop=0.001 \ mNotEChEST.psl mNotEChEST.filt.psl /dev/null pslReps -minCover=0.50 -minAli=0.97 -nearTop=0.001 \ mNotEOther.psl mNotEOther.filt.psl /dev/null # Get overlap with human net netToBed -maxGap=1000 -minFill=12 \ /cluster/data/galGal2/bed/blastz.hg16.swap.2004-02-25/axtChain/human.net \ netHg16Cov.bed ssh hgwdev cd /cluster/data/galGal2/bed/tightMrna # Load these tables temporarily for TB usage... hgLoadBed galGal2 netHg16Cov netHg16Cov.bed hgLoadPsl galGal2 -table=mNotEChEST mNotEChEST.filt.psl hgLoadPsl galGal2 -table=mNotEOther mNotEOther.filt.psl featureBits galGal2 -or genscan geneid -bed=lotsaExons.bed hgLoadBed galGal2 lotsaExons lotsaExons.bed # Use TB to keep mrna's with overlap w/netHg16Cov but no overlap w/ensGene. 
# Saved as mNotE{ChEST,Other}_pslReps_netHg16Cov_noEnsGene.bed # Use TB to overlap above w/lotsaExons (genscan | geneid). # Saved as mNotE{ChEST,Other}_pslReps_netHg16Cov_noEnsGene_abinit.bed # Made /usr/local/apache/htdocs/angie/ensPrelim/ w/index.html linking # to custom track files and to browser w/custom tracks. # clean up. hgsql galGal2 -e 'drop table netHg16Cov; drop table lotsaExons' hgsql galGal2 -e 'drop table mNotEChEST; drop table mNotEOther' # 5-SPECIES ORTHOLOGY FROM DEWEY/PACHTER (DONE 4/20/04 angie) ssh kksilo mkdir /cluster/data/galGal2/bed/deweySynt cd /cluster/data/galGal2/bed/deweySynt # Saved email attachment from Colin Dewey # as gg-hs-mm-pt-rn.map.gz gunzip gg-hs-mm-pt-rn.map.gz # format of the map file is: # segment# chrom start end strand chrom start end strand ... # where the columns chrom, start, end, strand are repeated for each # genome in the map (in this case we have 5*4 + 1 = 21 columns). If a # segment is not found in one of the genomes, the columns for that genome # are set to "NA" for that segment. The order of the genomes in the file is: # galGal2 hg16 mm3 panTro1 rn3 # and all intervals are 0-based, half-open. # Make a bed6 track for each xenoDb. 
./deweyMap2Bed6.pl 1 5 gg-hs-mm-pt-rn.map > deweySyntHg16.bed ./deweyMap2Bed6.pl 1 9 gg-hs-mm-pt-rn.map > deweySyntMm3.bed ./deweyMap2Bed6.pl 1 13 gg-hs-mm-pt-rn.map > deweySyntPanTro1.bed ./deweyMap2Bed6.pl 1 17 gg-hs-mm-pt-rn.map > deweySyntRn3.bed ssh hgwdev cd /cluster/data/galGal2/bed/deweySynt hgLoadBed galGal2 deweySyntHg16 deweySyntHg16.bed hgLoadBed galGal2 deweySyntMm3 deweySyntMm3.bed hgLoadBed galGal2 deweySyntPanTro1 deweySyntPanTro1.bed hgLoadBed galGal2 deweySyntRn3 deweySyntRn3.bed # CHICKEN NCRNA FROM SAM GRIFFITHS-JONES (DONE 4/20/04 angie) ssh hgwdev mkdir /cluster/data/galGal2/bed/ncRNA cd /cluster/data/galGal2/bed/ncRNA # saved email attachment from Sam Griffiths-Jones # as gallus_gallus_ncrna_sgj_v1.gff # That's not GTF with exons/CDS/codons the way that ldHgGene expects, # so translate it into the rnaGene.as format: perl -wpe 'if (/^#/) { $_ = ""; next } chomp; \ ($chr,$source,$type,$start,$end,undef,$strand,undef,$other) = split(/\t/); \ $start--; \ $fullScore = 0; \ if ($other =~ /SCORE=([^;]+);/) { $fullScore = $1; } \ $score = int($fullScore); \ $source = "miRNA_Registry" if ($source eq "SGJ"); \ if ($other =~ /ACC=([^;]+); ID=([^;]+);/) { $name = "$1:$2";} \ elsif ($other =~ /ID=([^;]+);/) { $name = $1; } \ elsif ($other =~ /TYPE=([^;]+);/) { $name = $1; } \ else { $name = $type; } \ $_ = "$chr\t$start\t$end\t$name\t$score\t$strand\t$source\t$type\t$fullScore\t0\n";' \ gallus_gallus_ncrna_sgj_v1.gff > rnaGene.bed hgLoadBed galGal2 rnaGene -sqlTable=$HOME/kent/src/hg/lib/rnaGene.sql \ -noBin rnaGene.bed # Downloaded new data 4/27/05... it's only the miRNA subset of the # original combined {miRNA, Rfam, tRNAscan-SE} set. 
wget ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/gga.gff # TWINSCAN FROM BRENT LAB (DONE 5/1/04 angie - protein added 6/12/04) # contact: Paul Flicek ssh hgwdev mkdir /cluster/data/galGal2/bed/twinscan cd /cluster/data/galGal2/bed/twinscan wget http://www.cs.wustl.edu/~pflicek/chicken/chicken.043004.ts.tar.gz tar xvzf chicken.043004.ts.tar.gz ldHgGene -gtf -frame -id -geneName galGal2 twinscan chr*.gtf runGeneCheck . # 1173 frameMismatch # 775 badFrame # 477 noStop # 237 gap # 223 noStart # 22 frameDiscontig # 6 inFrameStop wget http://www.cs.wustl.edu/~pflicek/chicken/chicken.043004.ts.ptx.tar.gz tar xvzf chicken.043004.ts.ptx.tar.gz perl -wpe 's/^>(\S+)/>$1.a/' chr*.prot.fa > twinscanPep.fa hgPepPred galGal2 generic twinscanPep twinscanPep.fa # ISOCHORES FROM NEKRUTENKO LAB (DONE 5/3/04 angie) # contact: Jianbin HE ssh hgwdev mkdir /cluster/data/galGal2/bed/isochore cd /cluster/data/galGal2/bed/isochore wget http://www.bx.psu.edu/~jbhe/isochore/chi-iso.tar.gz tar xvzf chi-iso.tar.gz # Remove the custom track "browser" and "track" lines: cat chr*.bed | egrep -v '^(browser|track)' > isochore.bed hgLoadBed galGal2 isochore isochore.bed # SGP GENE PREDICTIONS (REDONE 9/27/04 angie - NEW DATA 1/4/06 angie) # contact: "Cámara, Francisco" (Roderic Guigo's group) ssh hgwdev mkdir /cluster/data/galGal2/bed/sgp cd /cluster/data/galGal2/bed/sgp foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genome.imim.es/genepredictions/G.gallus/golden_path_200402/SGP/humangp200405+Btau2/$chr.gtf end # Add .1 suffix to proteins to match gtf transcript_id's: cat chr*.prot | perl -wpe 's/^>(\w+)/>$1.1/' > sgpPep.fa # 1/4/06: get rid of redundant protein table: hgsql galGal2 -e 'drop table sgpPep' ldHgGene -gtf -genePredExt galGal2 sgpGene chr*.gtf # CONTAMINATION (DONE 5/12/04) # emailed from LaDeana Hillier ssh hgwdev mkdir /cluster/data/galGal2/bed/contamination cd /cluster/data/galGal2/bed/contamination # saved email attachments to that dir: # 
weber_wgs_maize_mito.hits.txt, weber_wgs_vector_or_E.coli.txt, # chicken_accession_to_contig_mapping # One-shot perl script: chmod a+x mkContam.pl mkContam.pl weber_wgs_maize_mito.hits.txt weber_wgs_vector_or_E.coli.txt \ chicken_accession_to_contig_mapping \ ../../agp_040224/chr*.agp \ > contamination.bed hgLoadBed galGal2 contamination \ -noBin -sqlTable=$HOME/kent/src/hg/lib/contamination.sql \ contamination.bed # FORMAT MARKER DATA FROM WUSTL (IN PROGRESS 4/27/05) ssh kksilo mkdir /cluster/data/galGal2/bed/markers cd /cluster/data/galGal2/bed/markers # Many files emailed from John Wallis # saved to: # accessionsDump040513.sql -- marker -> acc # geneticPositionsDump040513.sql -- marker, chrom, genPos, error, # primers(maybe), type(maybe), source # jacobssonPositionsDump040513.sql -- some overlap w/geneticPostions; # marker, chrom, genPos, type, source # sequencesDump040513.sql # Uppsala_genetic.txt -- corrected positions for jacobssonPositions # GeneticMarkerAccID.txt -- marker -> acc, some overlap w/accessionsDump ssh hgwdev cd /cluster/data/galGal2/bed/markers hgsql -e "create database wallis" cat *.sql | hgsql wallis hgsql wallis <<_EOF_ create table uppsalaPositions ( marker varchar(40) NOT NULL default '', chromosome varchar(40) NOT NULL default '', position int(11) default NULL, primary key (marker) ) TYPE=MyISAM; load data local infile "Uppsala_genetic.txt" into table uppsalaPositions; create table markerAcc ( marker varchar(40) NOT NULL default '', acc varchar(40) NOT NULL default '', primary key (marker) ) TYPE=MyISAM; load data local infile "GeneticMarkerAccID.txt" into table markerAcc; _EOF_ # extract sts marker sequences as fasta: echo 'select marker,sequence from sequences' \ | hgsql -N wallis \ | awk '{printf ">%s\n%s\n", $1, $2;}' > stsSequences.fa # extract primer pairs in ePCR format (works for isPCR too): echo 'select marker,primer1,primer2,length(sequence),source from sequences \ where primer1 is not null and primer2 is not null' \ | hgsql -N 
wallis \ | awk '{printf "%s\t%s\t%s\t%d\t%s\n", $1, $2, $3, $4, $5;}' \ > stsPrimers.epcr # Still need to extract aux. info... mainly need positions in various # maps. # Take a look at NCBI UniSTS too... ssh kolossus mkdir -p /santest/scratch/galGal2/sts cd /santest/scratch/galGal2/sts # This is the Wageningen map, with some UniSTS IDs associated with # markers and distances: wget ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Gallus_gallus/9031.WURC.txt # All UniSTS sequences and aliases... filter out from here: wget ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts wget ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases grep 'Gallus gallus' UniSTS.sts > gg.sts # ANDY LAW CPGISLANDS (DONE 6/16/04 angie) # Andy Law found 60k CpG islands while we found only 22k. Andy sent # his script (which needs another script/program to preprocess seq # into CpG offsets, GC-since-last-CpG and length-since-last-CpG) to # Laura Elnitski who sent it to us. # --> /cluster/home/angie/andy-cpg-island.pl # His obs/exp calculation used a very rough approximation for number # of C and number of G: length / 4 for each!! # So I modified his script to take counts of C and G separately, # and use the obs/exp from this '87 paper that everybody claims to follow # (but they usually don't): # Gardiner-Garden and Frommer, CpG Islands in Vertebrate Genomes # J. Mol. Biol. (20 Jul. 1987) 196 (2), 261-282. # --> /cluster/home/angie/ggf-andy-cpg-island.pl # Had to write a oneshot program, preProcGgfAndy, to preprocess the # fasta for the script. 
ssh kksilo mkdir /cluster/data/galGal2/bed/cpgIslandGgfAndy cd /cluster/data/galGal2/bed/cpgIslandGgfAndy cp /dev/null cpgIslandAndy.bed cp /dev/null cpgIslandGgfAndy.bed foreach f (../../*/chr*.fa) set chr = $f:t:r echo preproc $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.preproc echo running original on $chr awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.preproc \ | /cluster/home/angie/andy-cpg-island.pl \ | perl -wpe '$i=0 if (not defined $i); \ chomp; ($s,$e) = split("\t"); $s--; \ $_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \ >> cpgIslandAndy.bed echo running modified on $chr /cluster/home/angie/ggf-andy-cpg-island.pl $chr.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndy.bed end # Now try Takai & Jones' script too, what the heck! :) # (it took 4 days though, and unless I modify it, no CpG count...:b). cp /dev/null cpgIslandTJUnlifted.bed foreach d (../../*/chr*_?{,?}) set ctg = $d:t /cluster/home/angie/cpgi130.pl $d/$ctg.fa \ | tail +6 \ | perl -wpe 'if (/CpG island (\d+), start=(\d+), end=(\d+), %GC=([\d\.]+), ObsCpG\/ExpCpG=([\d\.]+), Length=(\d+)/) { \ ($id, $s, $e, $pGc, $oE, $n) = ($1, $2-1, $3,$4,$5,$6); \ $cpg = 0; $gc = int($pGc * $n / 100.0); \ $pCpG = 0; \ $_ = "'$ctg'\t$s\t$e\tcpg$id\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n"; \ } else { die "Cant parse this:\n$_\n"; }' \ >> cpgIslandTJUnlifted.bed end liftUp cpgIslandTJ.bed ../../jkStuff/liftAll.lft warn \ cpgIslandTJUnlifted.bed # load into database: ssh hgwdev cd /cluster/data/galGal2/bed/cpgIslandGgfAndy # this one is a bed 4: hgLoadBed galGal2 cpgIAndy -tab -noBin cpgIslandAndy.bed # this one is a cpgIslandExt but with a different table name: sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql hgLoadBed galGal2 cpgIslandGgfAndy -tab -noBin \ -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed # likewise for perl'd output of Takai & Jones: sed -e 's/cpgIslandExt/cpgIslandTJ/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandTJ.sql hgLoadBed galGal2 cpgIslandTJ -tab -noBin \ -sqlTable=cpgIslandTJ.sql cpgIslandTJ.bed featureBits galGal2 cpgIslandExt #15097911 bases of 1054197620 (1.432%) in intersection featureBits galGal2 cpgIAndy #84546678 bases of 1054197620 (8.020%) in intersection featureBits galGal2 cpgIslandGgfAndy #60782698 bases of 1054197620 (5.766%) in intersection featureBits galGal2 cpgIslandTJ #27489160 bases of 1054197620 (2.608%) in intersection wc -l ../cpgIsland/cpgIsland.bed *bed # 21802 ../cpgIsland/cpgIsland.bed # 86247 cpgIslandAndy.bed # 73381 cpgIslandGgfAndy.bed # 25788 cpgIslandTJ.bed # OK, just for fun let's run ggf-andy script with TJ parameters! ssh kksilo cd /cluster/data/galGal2/bed/cpgIslandGgfAndy foreach f (../../*/chr*.fa) set chr = $f:t:r echo running modified w/TJ on $chr /cluster/home/angie/ggf-andy-cpg-island.pl \ -min-length 500 -percent-gc 55 -obs-over-exp 0.65 \ $chr.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyTJ.bed end # load into database: ssh hgwdev cd /cluster/data/galGal2/bed/cpgIslandGgfAndy # this one is a cpgIslandExt but with a different table name: sed -e 's/cpgIslandExt/cpgIslandGgfAndyTJ/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyTJ.sql hgLoadBed galGal2 cpgIslandGgfAndyTJ -tab -noBin \ -sqlTable=cpgIslandGgfAndyTJ.sql cpgIslandGgfAndyTJ.bed featureBits galGal2 cpgIslandGgfAndyTJ #32146870 bases of 1054197620 (3.049%) in intersection wc -l cpgIslandGgfAndyTJ.bed # 23685 cpgIslandGgfAndyTJ.bed # LaDeana asked for a run on masked seq for comparison. cp /dev/null cpgIslandAndyMasked.bed cp /dev/null cpgIslandGgfAndyMasked.bed foreach f (../../*/chr*.fa.masked) set chr = $f:t:r:r echo preproc masked $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.masked.preproc echo running original on $chr masked awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.masked.preproc \ | /cluster/home/angie/andy-cpg-island.pl \ | perl -wpe '$i=0 if (not defined $i); \ chomp; ($s,$e) = split("\t"); $s--; \ $_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \ >> cpgIslandAndyMasked.bed echo running modified on $chr masked /cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyMasked.bed end # load into database: ssh hgwdev cd /cluster/data/galGal2/bed/cpgIslandGgfAndy hgLoadBed galGal2 cpgIAndyMasked -tab -noBin cpgIslandAndyMasked.bed sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql hgLoadBed galGal2 cpgIslandGgfAndyMasked -tab -noBin \ -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed featureBits galGal2 cpgIAndyMasked #67353879 bases of 1054197620 (6.389%) in intersection featureBits galGal2 cpgIslandGgfAndyMasked #47911662 bases of 1054197620 (4.545%) in intersection wc -l cpgIslandAndyMasked.bed cpgIslandGgfAndyMasked.bed # 84570 cpgIslandAndyMasked.bed # 70655 cpgIslandGgfAndyMasked.bed # 6/28/04 -- masking simpleRepeats, and even repeats other than Alu's, # might not be the right thing to do (?). Give it a try with less-masked # sequence. ssh kksilo cd /cluster/data/galGal2/bed/cpgIslandGgfAndy cp /dev/null cpgIslandGgfAndyOnlyRM.bed foreach f (../../*/chr*.fa) set chr = $f:t:r echo preproc, ggf-andy $chr onlyRM maskOutFa $f $f.out stdout \ | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyOnlyRM.bed end # 70694 cpgIslandGgfAndyOnlyRM.bed ssh hgwdev cd /cluster/data/galGal2/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRM/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql hgLoadBed galGal2 cpgIslandGgfAndyOnlyRM -tab -noBin -sqlTable=/tmp/c.sql \ cpgIslandGgfAndyOnlyRM.bed featureBits galGal2 cpgIslandGgfAndyOnlyRM.bed #48007046 bases of 1054197620 (4.554%) in intersection # 1/26/05: Make better island names in cpgIslandGgfAndyMasked, # for Dave Burt's cross-species island comparisons. ssh kksilo cd /cluster/data/galGal2/bed/cpgIslandGgfAndy mv cpgIslandGgfAndyMasked.bed cpgIslandGgfAndyMasked.bed.orig perl -wpe '@w=split("\t"); $w[3] = "galGal2.$w[0]." . ($w[1]+1) . ".$w[2]"; \ $_ = join("\t", @w);' \ cpgIslandGgfAndyMasked.bed.orig \ > cpgIslandGgfAndyMasked.bed ssh hgwdev cd /cluster/data/galGal2/bed/cpgIslandGgfAndy hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \ galGal2 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed # RELOAD ENSEMBL GENES (DONE 6/16/04 angie - REDONE 1/10/05 angie) # previous set was sent out to chicken group -- pull in the released # version from Ensembl's main site. Looks like 75 transcripts have # been yanked, will ask Steve Searle and Ewan. mkdir /cluster/data/galGal2/bed/ensembl cd /cluster/data/galGal2/bed/ensembl # Get the ensembl gene data from # http://www.ensembl.org/Gallus_gallus/martview # Follow this sequence through the pages: # Page 1) Make sure that the Gallus_gallus choice is selected. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Structures" box. # Page 4) Choose GTF as the ouput. choose gzip compression. hit export. # Save as ensembl.gff.gz # Add "chr" to front of each line in the gene data gtf file to make # it compatible with our software. gunzip -c ensembl.gff.gz \ | perl -wpe 's/^([0-9]+|E\w+|[ZW]|Un(_random)?)/chr$1/ \ || die "Line $. 
doesnt start with chicken chrom:\n$_"' \ > ensGene.gtf ssh hgwdev ldHgGene -gtf -genePredExt galGal2 ensGene \ /cluster/data/galGal2/bed/ensembl/ensGene.gtf # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" box. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format. Result name ensGtp. # Save file as ensGtp.txt.gz gunzip ensGtp.txt.gz hgsql galGal2 < ~/kent/src/hg/lib/ensGtp.sql hgsql galGal2 -e 'load data local infile "ensGtp.txt" into table ensGtp' # Load Ensembl peptides: # Get them from ensembl as above in the gene section except for # Page 3) Choose the "Sequences" box. # Page 4) Transcripts/Proteins. Peptide. Format = FASTA. # Save file as ensemblPep.fa.gz gunzip -c ensemblPep.fa.gz \ | perl -wpe 's/^>ENSGALG\w+\|(ENSGALT\d+\.\d+).*/>$1/' \ > ensPep.fa hgPepPred galGal2 generic ensPep ensPep.fa # ECORES FROM GENOSCOPE (DONE, hartera, 2004-08-09) # download data from http://www.genoscope.cns.fr/externe/4ucsc/ # ecotigCT - ecores on chicken genome which are conserved with Tetraodon ssh hgwdev mkdir -p /cluster/data/galGal2/bed/ecores # these are in a different format to the previous Ecores # Create a script here to parse the Ecores into bed format # add parse_ecotig.pl to this directory cp /cluster/data/rn3/bed/ecores/parse_ecotig.pl \ /cluster/data/galGal2/bed/ecores/ # ecores conserved with Tetraodon mkdir -p /cluster/data/galGal2/bed/ecores/tetraodon cd /cluster/data/galGal2/bed/ecores/tetraodon # download data for ecotigCT to this directory wget -o fetch.log -r -l1 --no-directories --timestamp \ http://www.genoscope.cns.fr/externe/4ucsc/ecotigCT # The format is like those for ecores on Drosophila conserved with Anopheles # but with a species code missing # [chr] [start ecore] [end ecore] [chr] [start ecore] [end ecore] # Reformat chromosome names to match those 
in our database sed -e 's/RANDOM/_random/g' ecotigCT | sed -e 's/UN/Un/g;' > ecotigCT.format # get list of chromosomes represented awk '{print $1}' ecotigCT.format | sort -n | uniq > chrom.lst # compare chroms to those in galGal2: E64 and chr6_random are missing # use perl script to process this to the same format as those # for rn3, hg16 and mm3 cd .. cat << '_EOF_' > /cluster/data/galGal2/bed/ecores/processEcotigs.pl #!/usr/bin/perl -w use strict; # convert ecores file to bed format # Format of Ecores file: # [chr] [start ecore] [end ecore] [chr] [start ecore] [end ecore] # if the second set of ecores are the same as those at the beginning of the # next line then they all belong to the same ecotig # Process so all ecores from the same ecotig are on one line as for the other # format. my $beg_start = 0; my $beg_end = 0; my $last_start = 0; my $last_end = 0; my $chr_old = ""; my $count = 0; while (<>) { chomp; # remove trailing space s/\s+$//; my @fields = split(/\s+/); my $chr = $fields[0]; $beg_start = $fields[1]; $beg_end = $fields[2]; # if first two co-ordinates are the same as those on the previous line if (($beg_start == $last_start) && ($beg_end == $last_end) && ($chr eq $chr_old)) { # get new last start and end from this line print "\t$last_start\t$last_end"; $count++; } else { if ($count > 0) { print "\t$last_start\t$last_end"; $count = 0; } print "\nchr$chr\t$beg_start\t$beg_end"; } $chr_old = $chr; if ($#fields == 5) { $last_start = $fields[$#fields - 1]; $last_end = $fields[$#fields]; } } '_EOF_' # << this line makes emacs coloring happy # convert data to bed file format # copy parse_ecotig.pl from rn3 cp /cluster/data/rn3/bed/ecores/parse_ecotig.pl . 
cd tetraodon perl ../processEcotigs.pl < ecotigCT.format > ecotigCT.format2 perl ../parse_ecotig.pl < ecotigCT.format2 > ecotigCT.bed hgLoadBed -tab galGal2 ecoresTetraodon ecotigCT.bed # SWAP BLASTZ MOUSE-CHICKEN TO CHICKEN-MOUSE (MM5) (DONE 8/18/04 angie) # mm5-galGal2 alignments were already axtRescored, no need to re-rescore. ssh kolossus mkdir /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15 cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15 set aliDir = /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom #1.3G /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChrom #1.3G unsorted #1.3G axtChrom rm -r unsorted # CHAIN MOUSE BLASTZ (DONE 8/20/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \ -minScore=5000 $1 \ /iscratch/i/galGal2/nib \ /iscratch/i/mus/mm5/softNib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 52 of 52 jobs
#Average job time: 78s 1.31m 0.02h 0.00d
#Longest job: 391s 6.52m 0.11h 0.00d
#Submission to last job: 391s 6.52m 0.11h 0.00d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# take a look at score distr's
foreach f (chain/*.chain)
  grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
  echo $f:t:r
  textHistogram -binSize=10000 /tmp/score.$f:t:r
  echo ""
end
# Load chains into database
ssh hgwdev
cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain/chain
foreach i (*.chain)
  set c = $i:r
  echo loading $c
  hgLoadChain galGal2 ${c}_chainMm5 $i
end

# NET MOUSE BLASTZ (DONE 8/20/04 angie)
ssh kksilo
cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain
netClass noClass.net galGal2 mm5 mouse.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn mouse.net > mouseSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet galGal2 netMm5 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet galGal2 netSyntenyMm5 stdin
# Add entries for chainMm5, netMm5, syntenyMm5 to chicken/galGal2 trackDb

# GENERATE MM5 MAF FOR MULTIZ FROM NET (DONE 8/20/04 angie)
ssh kksilo
cd /cluster/data/galGal2/bed/blastz.mm5/axtChain
netSplit mouse.net net
cd /cluster/data/galGal2/bed/blastz.mm5
mkdir axtNet
foreach f (axtChain/net/*)
  set chr = $f:t:r
  netToAxt $f axtChain/chain/$chr.chain \
/cluster/data/galGal2/nib \ /cluster/data/mm5/nib stdout \ | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/galGal2/chrom.sizes /cluster/data/mm5/chrom.sizes \ $maf -tPrefix=galGal2. -qPrefix=mm5. end # MAKE VSMM5 DOWNLOADABLES (TODO 8/20/04 angie) ssh kksilo cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain gzip -c all.chain > /cluster/data/galGal2/zip/mouse.chain.gz gzip -c mouse.net > /cluster/data/galGal2/zip/mouse.net.gz ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/galGal2/vsMm5 cd /usr/local/apache/htdocs/goldenPath/galGal2/vsMm5 mv /cluster/data/galGal2/zip/mouse*.gz . mkdir axtNet cp -p \ /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtNet/chr*.axt.gz \ axtNet/ md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # VAR_MULTIZ 3-WAY (DONE 8/23/04 angie) # Copy inputs to bluearc ssh kksilo mkdir /cluster/bluearc/var_multiz.galGal2{,/hg17,/mm5} cp -p /cluster/data/galGal2/bed/blastz.hg17/mafNet/* \ /cluster/bluearc/var_multiz.galGal2/hg17 cp -p /cluster/data/galGal2/bed/blastz.mm5/mafNet/* \ /cluster/bluearc/var_multiz.galGal2/mm5 # Small cluster job: ssh kki mkdir /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23 cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23 cat > doMultiz.csh << '_EOF_' #!/bin/csh -ef set path = (/cluster/bin/penn/var_multiz.2004.08.12 $path) if (-e $1 && -e $2) then set chr = $1:t:r set tmp = /scratch/$chr.tmp.maf var_multiz $1 $2 0 0 > $tmp maf_project $tmp galGal2.$chr > $3 rm $tmp else if (-e $1) then cp $1 $3 else if (-e $2) then cp $2 $3 endif '_EOF_' # << this line makes emacs coloring happy chmod a+x doMultiz.csh cp /dev/null jobList foreach chr (`awk '{print $1;}' /cluster/data/galGal2/chrom.sizes`) set f1 = /cluster/bluearc/var_multiz.galGal2/hg17/$chr.maf set f2 = /cluster/bluearc/var_multiz.galGal2/mm5/$chr.maf echo "doMultiz.csh $f1 $f2 ghm/$chr.maf" >> jobList end 
mkdir ghm para create jobList para try, check, push, check, ... #Completed: 54 of 54 jobs #Average job time: 30s 0.49m 0.01h 0.00d #Longest job: 190s 3.17m 0.05h 0.00d #Submission to last job: 208s 3.47m 0.06h 0.00d # Clean up. rm -r /cluster/bluearc/var_multiz.galGal2 # setup external files for database reference ssh hgwdev cd /tmp mkdir /gbdb/galGal2/multizHg17Mm5{,/maf} ln -s /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/ghm/chr*.maf \ /gbdb/galGal2/multizHg17Mm5/maf/ # load multiz maf hgLoadMaf -pathPrefix=/gbdb/galGal2/multizHg17Mm5/maf -warn \ galGal2 multizHg17Mm5 # load pairwise mafs mkdir /gbdb/galGal2/{human,mouse}_ghm ln -s /cluster/data/galGal2/bed/blastz.hg17/mafNet/*.maf \ /gbdb/galGal2/human_ghm/ ln -s /cluster/data/galGal2/bed/blastz.mm5/mafNet/*.maf \ /gbdb/galGal2/mouse_ghm/ hgLoadMaf -WARN galGal2 human_ghm hgLoadMaf -WARN galGal2 mouse_ghm # PHASTCONS CHICKEN/HUMAN/MOUSE (DONE 8/24/04 angie) # NOTE: REPLACED, SEE "PHASTCONS 7WAY" BELOW # Make .ss input for phastCons ssh kksilo mkdir /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons set WINSIZE=1000000 set WINOVERLAP=0 mkdir SS foreach f (../ghm/*.maf) set chr=$f:t:r set c=`echo $chr | sed 's/chr//; s/_random//;'` echo $f $chr $c mkdir SS/$chr /cluster/bin/phast/msa_split $f -i MAF -o SS \ -O galGal2,hg17,mm5 -M /cluster/data/galGal2/$c/$chr.fa \ -w ${WINSIZE},${WINOVERLAP} -r SS/$chr/$chr -I 1000 -d 1 -B 5000 end # save some space and I/O time: gzip SS/*/*.ss # put on bluearc for cluster runs: mv SS /cluster/bluearc/galGal2.SS # Run phastCons twice: # - more stringent parameters to get postProbs for wiggle # - less stringent params (more coverage) to get conserved elements. 
ssh kk9 cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons # Pare down hg17 8-way tree: /cluster/bin/phast/tree_doctor --rename '1->hg17; 3->mm5; 6->galGal2' \ /cluster/data/hg17/bed/multiz8way/cons/run.cons/hg17panTro1mm5rn3canFam1galGal2fr1danRer1.mod \ | /cluster/bin/phast/tree_doctor --prune-all-but galGal2,hg17,mm5 - \ | perl -wpe 's/TREE: \((\(hg17:\S+),(galGal2:\S+)\)/TREE: ($2,$1)/' \ > rev-dg.mod # More stringent: same --transitions and --nrates as human 8-way. cat << '_EOF_' > doPostProbs #!/bin/csh -fe set PHAST=/cluster/bin/phast set TMP=/scratch/phastCons/ghm set file=$1 set root = $file:t:r:r set chrom = $root:r mkdir -p $TMP cp $file $TMP/$root.ss.gz zcat $TMP/$root.ss.gz \ | $PHAST/phastCons - rev-dg.mod --nrates 40 --transitions 0.08,0.008 --quiet \ > $TMP/$root.pp mkdir -p POSTPROBS/$chrom gzip -c $TMP/$root.pp > POSTPROBS/$chrom/$root.pp.gz rm $TMP/$root.pp rm $TMP/$root.ss.gz '_EOF_' # << this line makes emacs coloring happy chmod 775 doPostProbs rm -f jobListPostProbs foreach f (`ls -1S /cluster/bluearc/galGal2.SS/*/*.ss.gz`) echo './doPostProbs {check in exists+ '$f'}' \ >> jobListPostProbs end para create jobListPostProbs para try ; para push ; etc.... #Completed: 970 of 970 jobs #Average job time: 10s 0.17m 0.00h 0.00d #Longest job: 16s 0.27m 0.00h 0.00d #Submission to last job: 122s 2.03m 0.03h 0.00d # now create wiggle track ssh kksilo cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons mkdir wib foreach dir (POSTPROBS/*) echo $dir set chr=$dir:t zcat `ls -1 $dir/*.pp.gz | sort -t\. -k2,2n` | \ wigAsciiToBinary -chrom=$chr \ -wibFile=wib/${chr}_phastCons stdin end ssh hgwdev mkdir -p /gbdb/galGal2/multizHg17Mm5/wib cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons chmod 775 . 
wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/galGal2/multizHg17Mm5/wib/ hgLoadWiggle galGal2 mzHg17Mm5_phast_wig \ -pathPrefix=/gbdb/galGal2/multizHg17Mm5/wib wib/*.wig # Less stringent run: same --transitions as human 8-way, but smaller # nrates (fewer categories ==> more frequently in top category) ssh kk9 cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons cat << '_EOF_' > doEls #!/bin/csh -fe set PHAST=/cluster/bin/phast set TMP=/scratch/phastCons/ghm set file=$1 set root = $file:t:r:r set chrom = $root:r mkdir -p $TMP mkdir -p PREDICTIONS/$chrom cp $file $TMP/$root.ss.gz zcat $TMP/$root.ss.gz \ | $PHAST/phastCons - rev-dg.mod --nrates 10 --transitions 0.08,0.008 --quiet \ --viterbi PREDICTIONS/$chrom/$root.bed --score --seqname $chrom \ --no-post-probs mkdir -p POSTPROBS/$chrom rm $TMP/$root.ss.gz '_EOF_' # << this line makes emacs coloring happy chmod 775 doEls rm -f jobListEls foreach f (`ls -1S /cluster/bluearc/galGal2.SS/*/*.ss.gz`) echo './doEls {check in exists+ '$f'}' \ >> jobListEls end para create jobListEls para try ; para push ; etc.... #Completed: 970 of 970 jobs #Average job time: 4s 0.06m 0.00h 0.00d #Longest job: 7s 0.12m 0.00h 0.00d #Submission to last job: 56s 0.93m 0.02h 0.00d # Create bed track: tweak scores and names, trim the unused strand column. 
cat PREDICTIONS/*/*.bed \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore \ > all.bed ssh hgwdev cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons hgLoadBed galGal2 phastConsElements all.bed featureBits galGal2 phastConsElements #34279869 bases of 1054197620 (3.252%) in intersection # make top-5000 list and launcher on Adam's home page: sort -k5,5nr raw.bed | head -5000 > top5000.bed /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \ /cse/grads/acs/public_html/gg-top5000 \ "top 5000 conserved elements" galGal2 # Clean up rm -r /cluster/bluearc/galGal2.SS # MAKE Human Proteins track mkdir -p /cluster/data/galGal2/bed/tblastn.hg16KG cd /cluster/data/galGal2/bed/tblastn.hg16KG ls -1S /iscratch/i/galGal2/blastDb/*.nsq | sed "s/\.nsq//" > chick.lst mkdir kgfa # calculate a reasonable number of jobs calc `wc /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl | awk "{print \\\$1}"`/\(350000/`wc chick.lst | awk "{print \\\$1}"`\) # 41117/(350000/259) = 30.426580 split -l 300 /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl kgfa/kg cd kgfa for i in *; do split -l 30 $i $i; rm $i; done for i in *; do pslxToFa $i $i.fa; rm $i; done mkdir -p /cluster/bluearc/galGal2/bed/tblastn.hg16KG/kgfa cp * /cluster/bluearc/galGal2/bed/tblastn.hg16KG/kgfa cd .. 
ls -1S /cluster/bluearc/galGal2/bed/tblastn.hg16KG/kgfa/*.fa > kg.lst mkdir blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT f=/tmp/`basename $3` for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /iscratch/i/galGal2/lifts/liftAll.lft warn $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/hg16/lifts/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 exit 0 fi fi fi rm -f $f.1 $f.2 $3.tmp $f.8 exit 1 '_EOF_' chmod +x blastSome gensub2 chick.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/galGal2/bed/tblastn.hg16KG para create blastSpec para push # Completed: 355088 of 355089 jobs # CPU time in finished jobs: 43208509s 720141.82m 12002.36h 500.10d 1.370 y # IO & Wait Time: 2326912s 38781.87m 646.36h 26.93d 0.074 y # Average job time: 128s 2.14m 0.04h 0.00d # Longest job: 33767s 562.78m 9.38h 0.39d # Submission to last job: 90026s 1500.43m 25.01h 1.04d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/kg???? 
> chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/galGal2/bed/tblastn.hg16KG para create chainSpec para push # Completed: 1371 of 1371 jobs # CPU time in finished jobs: 941133s 15685.54m 261.43h 10.89d 0.030 y # IO & Wait Time: 11226s 187.11m 3.12h 0.13d 0.000 y # Average job time: 695s 11.58m 0.19h 0.01d # Longest job: 14841s 247.35m 4.12h 0.17d # Submission to last job: 23130s 385.50m 6.42h 0.27d exit # back to kksilo cd /cluster/data/galGal2/bed/tblastn.hg16KG/blastOut for i in kg???? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > ../preblastHg16KG.psl cd .. blatDir=/cluster/data/hg16/bed/blat.hg16KG protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl ssh hgwdev cd /cluster/data/galGal2/bed/tblastn.hg16KG hgLoadPsl galGal2 blastHg16KG.psl exit # back to kksilo rm -rf blastOut # End tblastn # ACCESSIONS FOR CONTIGS (DONE 2004-08-20 kate) # Used for Assembly track details, and also ENCODE AGP's cd /cluster/data/galGal2/bed mkdir -p contigAcc cd contigAcc # extract WGS contig to accession mapping from Entrez Nucleotide summaries # To get the summary file, access the Genbank page for the project # by searching: # genus[ORGN] AND WGS[KYWD] # At the end of the page, click on the list of accessions for the contigs. # Select summary format, and send to file. 
# output to file .acc contigAccession chicken.acc > contigAcc.tab wc -l contigAcc.tab # 111864 contigAcc.tab cd /cluster/data/galGal2/agp_040224 grep Contig *.agp | wc -l # 111864 hgsql galGal2 < $HOME/kent/src/hg/lib/contigAcc.sql echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \ hgsql galGal2 hgsql galGal2 -e "SELECT COUNT(*) FROM contigAcc" # 111864 # Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram) ssh hgwdev cd /tmp wget \ 'http://www.genome.gov/Pages/News/Photos/Science/redfowl_image.jpg' -O redfowl_image.jpg # view that image in 'display' to determine crop edges, then: convert -crop 600x488+80+32 -quality 80 -sharpen 0 \ -normalize redfowl_image.jpg ggg.jpg convert -geometry 300x200 -quality 80 ggg.jpg Gallus_gallus.jpg rm -f ggg.jpg redfowl_image.jpg cp -p Gallus_gallus.jpg /usr/local/apache/htdocs/images # add links to this image in the description.html page, request push # MAKE Human Proteins track (DONE 2005-1-15 braney) ssh kksilo mkdir -p /cluster/data/galGal2/blastDb cd /cluster/data/galGal2/blastDb for i in `cat ../chrom.lst`; do ls -1 ../$i/*/*.fa ; done > list for i in `cat list` do ln -s $i . formatdb -i `basename $i` -p F done rm *.log *.fa list # cd .. 
# for i in `cat chrom.lst`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft ssh kkr1u00 mkdir -p /iscratch/i/galGal2/blastDb cd /cluster/data/galGal2/blastDb for i in nhr nin nsq; do cp *.$i /iscratch/i/galGal2/blastDb ; echo $i; done cd iSync > sync.out mkdir -p /cluster/data/galGal2/bed/tblastn.hg17KG cd /cluster/data/galGal2/bed/tblastn.hg17KG echo /iscratch/i/galGal2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst # back to kksilo exit cd /cluster/data/galGal2/bed/tblastn.hg17KG mkdir -p /cluster/bluearc/galGal2/bed/tblastn.hg17KG/kgfa split -l 65 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/galGal2/bed/tblastn.hg17KG/kgfa/kg ln -s /cluster/bluearc/galGal2/bed/tblastn.hg17KG/kgfa kgfa cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/galGal2/bed/tblastn.hg17KG/blastOut ln -s /cluster/bluearc/galGal2/bed/tblastn.hg17KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/liftAll.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 exit 1 '_EOF_' chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/galGal2/bed/tblastn.hg17KG para create blastSpec para push # 
Completed: 168091 of 168091 jobs
# CPU time in finished jobs: 42705613s 711760.22m 11862.67h 494.28d 1.354 y
# IO & Wait Time: 2300980s 38349.66m 639.16h 26.63d 0.073 y
# Average job time: 268s 4.46m 0.07h 0.00d
# Longest job: 56699s 944.98m 15.75h 0.66d
# Submission to last job: 171648s 2860.80m 47.68h 1.99d

cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=200000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome

ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec

ssh kki
cd /cluster/data/galGal2/bed/tblastn.hg17KG
para create chainSpec
para push
# Completed: 649 of 649 jobs
# CPU time in finished jobs: 243653s 4060.88m 67.68h 2.82d 0.008 y
# IO & Wait Time: 50013s 833.55m 13.89h 0.58d 0.002 y
# Average job time: 452s 7.54m 0.13h 0.01d
# Longest job: 3831s 63.85m 1.06h 0.04d
# Submission to last job: 24525s 408.75m 6.81h 0.28d

exit # back to kksilo
cd /cluster/data/galGal2/bed/tblastn.hg17KG/blastOut
for i in kg??
do
  cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > cs60.$i.psl
  sort -rn cs60.$i.psl | pslUniq stdin us.$i.psl
  awk "((\$1 / \$11) ) > 0.60 { print }" cs60.$i.psl > ms60.$i.psl
  echo $i
done
cat us.*.psl ms60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > /cluster/data/galGal2/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..
ssh hgwdev cd /cluster/data/galGal2/bed/tblastn.hg17KG hgLoadPsl galGal2 blastHg17KG.psl # back to kksilo rm -rf blastOut # End tblastn # MAKE Human Known Gene mRNAs Mapped by BLATZ track (hg17 DONE 1/18/05 braney) # Create working directory and extract known gene mrna from database ssh hgwdev mkdir -p /cluster/data/galGal2/bed/blatz.hg17KG cd /cluster/data/galGal2/bed/blatz.hg17KG pepPredToFa hg17 knownGeneMrna known.fa # Split known genes into pieces and load on /iscratch/i temp device ssh kkr1u00 cd /iscratch/i/hg17 mkdir knownRna2 cd knownRna2 faSplit sequence /cluster/data/galGal2/bed/blatz.hg17KG/known.fa 580 known iSync rm /cluster/data/galGal2/bed/blatz.hg17KG/known.fa # Do parasol job to align via blatz ssh kk cd /cluster/data/galGal2/bed/blatz.hg17KG mkdir run0 cd run0 mkdir psl # awk '{print $1;}' /cluster/data/galGal2/chrom.sizes > genome.lst ls -1S /iscratch/i/galGal2/trfFa/*.fa > genome.lst for i in `cat genome.lst` ; do f=`basename $i .fa`; mkdir psl/$f ; done for i in /iscratch/i/hg17/knownRna2/*.fa ; do echo $i; done > mrna.lst cat << '_EOF_' > doBlatz #!/bin/sh f=/tmp/`basename $1`.`basename $2` if blatz -rna -minScore=6000 -out=psl $1 $2 $f.1 then if liftUp -nosort -type=".psl" -nohead $f.2 ../../../jkStuff/liftAll.lft carry $f.1 then mv $f.2 $3 rm $f.1 exit 0 fi fi rm $f.1 $f.2 exit 1 '_EOF_' chmod +x doBlatz cat << '_EOF_' > gsub #LOOP doBlatz $(path1) $(path2) {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 genome.lst mrna.lst gsub spec para create spec # Then do usual para try/para push/para time # Completed: 148407 of 148407 jobs # CPU time in finished jobs: 1884903s 31415.06m 523.58h 21.82d 0.060 y # IO & Wait Time: 490225s 8170.41m 136.17h 5.67d 0.016 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest job: 116s 1.93m 0.03h 0.00d # Submission to last job: 28248s 470.80m 7.85h 0.33d # Do sort and near-besting on file server ssh kksilo cd 
/cluster/data/galGal2/bed/blatz.hg17KG/run0 catDir -r psl | pslFilter -minScore=100 -minAli=255 -noHead stdin stdout | sort -k 10 > ../filtered.psl cd .. pslReps filtered.psl nearBest.psl /dev/null -nohead -minAli=0.5 -nearTop=0.01 -minCover=0.255 sort -k 14 nearBest.psl > blatzHg17KG.psl # Clean up rm -r run0/psl # Load into database ssh hgwdev cd /cluster/data/galGal2/bed/blatz.hg17KG hgLoadPsl galGal2 blatzHg17KG.psl # MAKE GALGAL2-HG17 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie) ssh kolossus set chainDir = /cluster/data/galGal2/bed/blastz.hg17/axtChain mkdir -p /cluster/data/galGal2/bed/bedOver netChainSubset $chainDir/human.net $chainDir/all.chain \ /cluster/data/galGal2/bed/bedOver/galGal2ToHg17.over.chain # MAKE GALGAL2-MM5 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie) ssh kolossus set chainDir = /cluster/data/galGal2/bed/blastz.mm5/axtChain mkdir -p /cluster/data/galGal2/bed/bedOver netChainSubset $chainDir/mouse.net $chainDir/all.chain \ /cluster/data/galGal2/bed/bedOver/galGal2ToMm5.over.chain # BLASTZ RAT (RN3) (IN PROGRESS 1/26/05 angie) ssh kk mkdir /cluster/data/galGal2/bed/blastz.rn3.2005-01-26 ln -s blastz.rn3.2005-01-26 /cluster/data/galGal2/bed/blastz.rn3 cd /cluster/data/galGal2/bed/blastz.rn3.2005-01-26 # Use human-chicken params: set L=10000 (higher threshold on blastz's # outer loop) and abridge repeats. cat << '_EOF_' > DEF # chicken vs. 
rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Chicken
SEQ1_DIR=/iscratch/i/galGal2/nib
SEQ1_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
SEQ2_SMSK=/iscratch/i/rn3/linSpecRep.notInChicken
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/galGal2/bed/blastz.rn3.2005-01-26

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# Partition chroms into 10MB (+10k overlap) chunks and generate parasol
# job list.
cp /cluster/data/galGal2/chrom.sizes S1.len
cp /cluster/data/rn3/chrom.sizes S2.len
mkdir run.blastz
cd run.blastz
partitionSequence.pl 10000000 10000 /iscratch/i/galGal2/nib ../S1.len \
  -xdir xdir.sh -rawDir ../lav > galGal2.lst
partitionSequence.pl 10000000 0 /iscratch/i/rn3/bothMaskedNibs ../S2.len \
  > rn3.lst
csh -ef xdir.sh
cat << '_EOF_' > gsub
#LOOP
/cluster/home/angie/kent/src/utils/blastz-run-ucsc $(path1) $(path2) ../DEF {check out line ../lav/$(file1)/$(file1)_$(file2).out}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 galGal2.lst rn3.lst gsub jobList
para create jobList
para try, check, push, check, ...
# HUMAN RECIPROCAL-BEST NET FOR STRINGENT LIFTOVER (DONE 2/3/05 angie) ssh kolossus cd /cluster/data/galGal2/bed/blastz.hg17/axtChain # Run chainNet again, this time keeping both of its outputs: chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin ../S1.len ../S2.len g_h.net h_g.net # Get the human chains from the human-referenced (but chicken-centric) net: chainSwap all.chain h_g.chain netChainSubset h_g.net h_g.chain stdout \ | chainSort stdin h_g.subset.chain # Net those (sorted) human chains, and keep both outputs, to get # reciprocal best nets referenced to both species: chainPreNet h_g.subset.chain ../S2.len ../S1.len stdout \ | chainNet stdin ../S2.len ../S1.len h_g.rbest.net g_h.rbest.net # Get the chains from the recip-best nets for stringent liftOver: netChainSubset h_g.rbest.net h_g.chain hg17ToGalGal2.rbest.over.chain netChainSubset g_h.rbest.net all.chain galGal2ToHg17.rbest.over.chain # BLASTZ/CHAIN/NET TETRAODON (DONE 4/25/05 angie) mkdir /cluster/data/galGal2/bed/blastz.tetNig1.2005-04-25 cd /cluster/data/galGal2/bed/blastz.tetNig1.2005-04-25 cat << '_EOF_' > DEF # Chicken vs. Tetraodon BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - Chicken SEQ1_DIR=/iscratch/i/galGal2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/galGal2/chrom.sizes # QUERY - Tetraodon SEQ2_DIR=/iscratch/i/tetNig1/tetNig1.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/tetNig1/chrom.sizes BASE=/cluster/data/galGal2/bed/blastz.tetNig1.2005-04-25 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /santest/scratch/galGal2tetNig1 >& do.log & tail -f do.log rmdir /santest/scratch/galGal2tetNig1 ln -s blastz.tetNig1.2005-04-25 /cluster/data/galGal2/bed/blastz.tetNig1 # Add {chain,net}TetNig1 to trackDb.ra if necessary. 
featureBits galGal2 chainTetNig1Link #45071616 bases of 1054197620 (4.275%) in intersection # BLASTZ/CHAIN/NET ZEBRAFISH (DONE 4/27/05 angie) ssh hgwdev # fileserver woes -- put the build dir on bluearc, then copy back # to /cluster/store-land when things are better. mkdir -p /cluster/bluearc/angie/galGal2/blastz.danRer2.2005-04-26 ln -s /cluster/bluearc/angie/galGal2/blastz.danRer2.2005-04-26 \ /cluster/data/galGal2/bed/blastz.danRer2.2005-04-26 cp /cluster/data/galGal2/chrom.sizes /cluster/bluearc/galGal2/ cd /cluster/data/galGal2/bed/blastz.danRer2.2005-04-26 cat << '_EOF_' > DEF # Chicken vs. Zebrafish BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - Chicken SEQ1_DIR=/iscratch/i/galGal2/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/bluearc/galGal2/chrom.sizes # QUERY - Zebrafish SEQ2_DIR=/iscratch/i/danRer2/danRer2.2bit SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/danRer2/chrom.sizes BASE=/cluster/bluearc/angie/galGal2/blastz.danRer2.2005-04-26 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /santest/scratch/galGal2danRer2 >& do.log & tail -f do.log ln -s blastz.danRer2.2005-04-26 /cluster/data/galGal2/bed/blastz.danRer2 # Add {chain,net}DanRer2 to trackDb.ra if necessary. featureBits galGal2 chainDanRer2Link #60589750 bases of 1054197620 (5.747%) in intersection # Move the run dir from bluearc to /cluster/store. ssh kolossus # login not allowed on kkusr01, current home of galGal2 rm /cluster/data/galGal2/bed/blastz.danRer2.2005-04-26 rsync -av /cluster/bluearc/angie/galGal2/blastz.danRer2.2005-04-26 \ /cluster/data/galGal2/bed/ rm -rf /cluster/bluearc/angie/galGal2/blastz.danRer2.2005-04-26 # SET UP OPOSSUM AND FROG ALIGNMENTS FOR MULTIZ (DONE 4/27/05 angie) # possum and frog alignments were put in non-std dir locations and # need maf... 
ssh kolossus ln -s /cluster/data/xenTro1/bed/zb.galGal2 \ /cluster/data/galGal2/bed/blastz.xenTro1 cd /cluster/data/galGal2/bed/blastz.xenTro1 mkdir mafNet foreach f (axtNet/*.axt) echo $f:t:r axtSort $f stdout \ | axtToMaf -tPrefix=galGal2. -qPrefix=xenTro1. \ stdin /cluster/data/{galGal2,xenTro1}/chrom.sizes stdout \ | gzip -c > mafNet/$f:t:r.maf.gz end ln -s /cluster/data/monDom1/bed/zb.galGal2 \ /cluster/data/galGal2/bed/blastz.monDom1 cd /cluster/data/galGal2/bed/blastz.monDom1 mkdir mafNet foreach f (axtNet/*.axt.gz) echo $f:t:r:r gunzip -c $f | axtSort stdin stdout \ | axtToMaf -tPrefix=galGal2. -qPrefix=monDom1. \ stdin /cluster/data/{galGal2,monDom1}/chrom.sizes stdout \ | gzip -c > mafNet/$f:t:r:r.maf.gz end # MAKE VSXENTRO1 DOWNLOADABLES (DONE 5/2/05 angie) ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/galGal2/vsXenTro1 cd /usr/local/apache/htdocs/goldenPath/galGal2/vsXenTro1 ln -s /cluster/data/galGal2/bed/blastz.xenTro1/axtChain/galGal2.xenTro1.*.gz . mkdir axtNet foreach f (/cluster/data/galGal2/bed/blastz.xenTro1/axtNet/*.axt.gz) ln -s $f axtNet/$f:t:r:r.galGal2.xenTro1.net.axt.gz end nice md5sum *.gz */*.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
# MULTIZ CHICKEN/HUMAN/MOUSE/OPOSSUM/FROG/FISHES (DONE 4/27/05 angie) # tree: (((galGal2 ((hg17 mm5) monDom1)) xenTro1) (danRer2 tetNig1)) ssh kolossus # Put run dir on /santest/scratch for now, to ease load on fileservers: mkdir -p /santest/scratch/galGal2/multiz7way.2005-04-27 cd /santest/scratch/galGal2/multiz7way.2005-04-27 # Setup: Copy pairwise MAF to /santest/scratch: mkdir /santest/scratch/chickenMultiz7way foreach db (hg17 mm5 monDom1 xenTro1 danRer2 tetNig1) cp -pR /cluster/data/galGal2/bed/blastz.$db/mafNet \ /santest/scratch/chickenMultiz7way/$db end # Most but not all of those dirs contain gzip'd maf -- gzip where # necessary to make them all uniform (& less cluster I/O): gzip /santest/scratch/chickenMultiz7way/*/*.maf # Make output dir: mkdir maf # Create script to run multiz.v10 in the right order: cat << '_EOF_' > doMultiz.csh #!/bin/csh -fe set chr = $1 set tmp = /scratch/tmp/chickenMultiz7way.$chr mkdir $tmp set REF = galGal2.$chr set HUM = /santest/scratch/chickenMultiz7way/hg17/$chr.maf.gz set MUS = /santest/scratch/chickenMultiz7way/mm5/$chr.maf.gz set MON = /santest/scratch/chickenMultiz7way/monDom1/$chr.maf.gz set XEN = /santest/scratch/chickenMultiz7way/xenTro1/$chr.maf.gz set DAN = /santest/scratch/chickenMultiz7way/danRer2/$chr.maf.gz set TET = /santest/scratch/chickenMultiz7way/tetNig1/$chr.maf.gz set DEST = /santest/scratch/galGal2/multiz7way.2005-04-27/maf/$chr.maf set MZ10 = /cluster/bin/penn/multiz.v10 set PROJECT = /cluster/bin/penn/maf_project # final tree: (((galGal2 ((hg17 mm5) monDom1)) xenTro1) (danRer2 tetNig1)) # subtree: ((galGal2 (hg17 mm5)) if ( -s $HUM && -s $MUS ) then echo "Aligning $HUM $MUS..." zcat $HUM > $tmp/$chr.Hum.maf zcat $MUS > $tmp/$chr.Mus.maf $MZ10 $tmp/$chr.Hum.maf $tmp/$chr.Mus.maf 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." 
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.HumMus.maf else if ( -s $HUM ) then zcat $HUM > $tmp/$chr.HumMus.maf else if ( -s $MUS ) then zcat $MUS > $tmp/$chr.HumMus.maf endif # subtree: (galGal2 ((hg17 mm5) monDom1)) if ( -s $tmp/$chr.HumMus.maf && -s $MON ) then echo "Aligning $tmp/$chr.HumMus.maf $MON..." zcat $MON > $tmp/$chr.Mon.maf $MZ10 $tmp/$chr.HumMus.maf $tmp/$chr.Mon.maf 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.HumMusMon.maf else if ( -s $tmp/$chr.HumMus.maf ) then cp $tmp/$chr.HumMus.maf $tmp/$chr.HumMusMon.maf else if ( -s $MON ) then zcat $MON > $tmp/$chr.HumMusMon.maf endif # subtree: ((galGal2 ((hg17 mm5) monDom1)) xenTro1) if ( -s $tmp/$chr.HumMusMon.maf && -s $XEN ) then echo "Aligning $tmp/$chr.HumMusMon.maf $XEN..." zcat $XEN > $tmp/$chr.Xen.maf $MZ10 $tmp/$chr.HumMusMon.maf $tmp/$chr.Xen.maf 1 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.HumMusMonXen.maf else if ( -s $tmp/$chr.HumMusMon.maf ) then cp $tmp/$chr.HumMusMon.maf $tmp/$chr.HumMusMonXen.maf else if ( -s $XEN ) then zcat $XEN > $tmp/$chr.HumMusMonXen.maf endif # subtree: (galGal2 (danRer2 tetNig1)) if ( -s $DAN && -s $TET ) then echo "Aligning $DAN $TET..." zcat $DAN > $tmp/$chr.Dan.maf zcat $TET > $tmp/$chr.Tet.maf $MZ10 $tmp/$chr.Dan.maf $tmp/$chr.Tet.maf 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.DanTet.maf else if ( -s $DAN ) then zcat $DAN > $tmp/$chr.DanTet.maf else if ( -s $TET ) then zcat $TET > $tmp/$chr.DanTet.maf endif # final tree: (((galGal2 ((hg17 mm5) monDom1)) xenTro1) (danRer2 tetNig1)) if ( -s $tmp/$chr.HumMusMonXen.maf && -s $tmp/$chr.DanTet.maf ) then echo "Aligning $tmp/$chr.HumMusMonXen.maf $tmp/$chr.DanTet.maf..." $MZ10 $tmp/$chr.HumMusMonXen.maf $tmp/$chr.DanTet.maf 1 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." 
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.HumMusMonXenDanTet.maf mv $tmp/$chr.HumMusMonXenDanTet.maf $DEST else if ( -s $tmp/$chr.DanTet.maf ) then cp $tmp/$chr.DanTet.maf $DEST else if ( -s $tmp/$chr.HumMusMonXen.maf ) then cp $tmp/$chr.HumMusMonXen.maf $DEST endif rm $tmp/$chr.*.maf rmdir $tmp '_EOF_' # << keep emacs coloring happy chmod 755 doMultiz.csh awk '{print "./doMultiz.csh " $1;}' /cluster/data/galGal2/chrom.sizes \ > jobs.lst # Run on small cluster ssh kki cd /santest/scratch/galGal2/multiz7way.2005-04-27 para create jobs.lst para try, check, push, check, ... #Completed: 53 of 54 jobs #Crashed: 1 jobs #Average job time: 164s 2.73m 0.05h 0.00d #Longest finished job: 1268s 21.13m 0.35h 0.01d #Submission to last job: 1456s 24.27m 0.40h 0.02d # The failure was for chr8_random which had no alignments in any # of the species. # Copy back over to /cluster/store-land rsync -av /santest/scratch/galGal2/multiz7way.2005-04-27 \ /cluster/data/galGal2/bed ln -s /cluster/data/galGal2/bed/multiz7way.2005-04-27 \ /cluster/data/galGal2/bed/multiz7way # make /gbdb/ links to 7way maf files: ssh hgwdev mkdir -p /gbdb/galGal2/multiz7way/maf/multiz7way ln -s /cluster/data/galGal2/bed/multiz7way/maf/chr*.maf \ /gbdb/galGal2/multiz7way/maf/multiz7way/ # load into database cd /tmp hgLoadMaf -warn galGal2 multiz7way \ -pathPrefix=/gbdb/galGal2/multiz7way/maf/multiz7way # put 7way MAF out for download ssh kolossus cd /cluster/data/galGal2/bed/multiz7way mkdir mafDownload foreach f (maf/*.maf) nice gzip -c $f > mafDownload/$f:t.gz end cd mafDownload nice md5sum *.maf.gz > md5sum.txt # make a README.txt there too. 
ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/galGal2/multiz7way ln -s /cluster/data/galGal2/bed/multiz7way/mafDownload/{*.maf.gz,md5sum.txt,README.txt} \ /usr/local/apache/htdocs/goldenPath/galGal2/multiz7way # Cleanup rm -rf /santest/scratch/galGal2/multiz7way.2005-04-27 rm -rf /santest/scratch/chickenMultiz7way/ # PHASTCONS 7WAY WITH METHODS FROM PAPER (DONE 4/28/05 angie) # NOTE FROM QA, 5/12/08: the phastConsElements7way track has an overlap-by-one # error at chr1:92,640,085-92,640,105 . This was found when investigating a # similar bug reported by user Michael Hiller in dm3 phastCons data. As # galGal2 has been superseded by galGal3, this track is going to be left as-is. # Using the param estimation methods described in Adam's # Genome Research paper... but what to use for CDS coverage metric??? # tree: (((galGal2,((hg17,mm5),monDom1)),xenTro1),(danRer2,tetNig1)) ssh kolossus mkdir -p /santest/scratch/galGal2/chrom cp -p /cluster/data/galGal2/*/chr*.fa /santest/scratch/galGal2/chrom/ mkdir /santest/scratch/galGal2/maf7way cp -p /cluster/data/galGal2/bed/multiz7way/maf/*.maf \ /santest/scratch/galGal2/maf7way # Split chrom fa & maf into smaller windows for phastCons: ssh kki mkdir /cluster/data/galGal2/bed/multiz7way/phastCons mkdir /cluster/data/galGal2/bed/multiz7way/phastCons/run.split cd /cluster/data/galGal2/bed/multiz7way/phastCons/run.split set WINDOWS = /santest/scratch/galGal2/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/santest/scratch/galGal2/chrom set WINDOWS=/santest/scratch/galGal2/phastCons/WINDOWS set maf=$1 set c = $maf:t:r:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \ -O galGal2,hg17,mm5,monDom1,xenTro1,danRer2,tetNig1 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 
'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/santest/scratch/galGal2/maf7way/*.maf) if (-s $file) then echo "doSplit.sh {check in exists+ $file}" >> jobList endif end para create jobList para try, check, push, check... #Completed: 52 of 53 jobs #Crashed: 1 jobs #Average job time: 32s 0.53m 0.01h 0.00d #Longest finished job: 204s 3.40m 0.06h 0.00d #Submission to last job: 506s 8.43m 0.14h 0.01d # 1 failure because there wasn't enough info in chr6_random (just one # little alignment to monDom1 only), OK. #WARNING: skipping partition 1; insufficient informative sites. # Get genome-wide all-species average GC content # by running msa_view --aggregate --summary-only on the WINDOWS .SS files. # msa_view needs seekable single-ss file, so unpack each to /tmp and # compute overall average: ssh kolossus cd /cluster/data/galGal2/bed/multiz7way/phastCons/ cp /dev/null gcs.txt # Do this on just a random sample of windows: foreach f (`ls -1 /santest/scratch/galGal2/phastCons/WINDOWS/* \ | randomLines stdin 200 stdout`) echo $f:t:r:r zcat $f > /tmp/galGal2.sample.ss /cluster/bin/phast/msa_view \ --aggregate galGal2,hg17,mm5,monDom1,xenTro1,danRer2,tetNig1 \ -i SS --summary-only /tmp/galGal2.sample.ss \ | awk '$1 == "[aggregate]" {printf "%0.4f\n", $3 + $4}' \ >> gcs.txt end ave gcs.txt #average 0.418876 # OK, so use 0.419 as the --gc parameter below. 
rm /tmp/galGal2.sample.ss ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY ############# # Use phyloFit to make an initial model: ssh kolossus cd /cluster/data/galGal2/bed/multiz7way/phastCons /cluster/bin/phast/msa_view -i MAF \ /santest/scratch/galGal2/maf7way/chr{1,Z_random,32}.maf \ --aggregate galGal2,hg17,mm5,monDom1,xenTro1,danRer2,tetNig1 \ > /santest/scratch/galGal2/phastCons/all.ss /cluster/bin/phast/phyloFit /santest/scratch/galGal2/phastCons/all.ss \ --tree "(((galGal2,((hg17,mm5),monDom1)),xenTro1),(danRer2,tetNig1))" \ -i SS --out-root starting-tree cat starting-tree.mod #TREE: (((galGal2:0.219806,((hg17:0.090064,mm5:0.137589):0.090370,monDom1:0.171369):0.112129):0.062872,xenTro1:0.267337):0.086182,(danRer2:0.219771,tetNig1:0.184348):0.086182); # Use consEntropy --NH to make it suggest a --expected-lengths param # that we should try next. Adam ran this on hg17 to find out the # total entropy of the hg17 model: # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045 #Relative entropy: H=0.608216 bits/site #Required length: N=16.085437 sites #Total entropy: NH=9.783421 bits # Our target is that NH result: 9.7834 bits. # phyloFit makes one .mod but consEntropy wants two... # Multiply each subst rate on the TREE line by 3.2 which is roughly the # ratio of noncons to cons in # /cluster/data/hg17/bed/multiz8way/cons/run.elements/ave.*.mod /cluster/bin/phast/tree_doctor -s 3.2 starting-tree.mod \ > starting-tree.noncons.mod # OK, so Adam's initial consEntropy run used 0.265 as the target-cov, # but the latest 8way ended up with 0.17. Hmmm. I'm going to find # a target-cov that keeps expected-length 12, just for the hell of it, # as a starting point. 
/cluster/bin/phast/consEntropy --NH 9.7834 0.189 12 \ starting-tree*.mod #( Solving for new omega: 12.000000 12.038661 12.038583 ) #Transition parameters: gamma=0.189000, omega=12.000000, mu=0.083333, nu=0.019420 #Relative entropy: H=1.444808 bits/site #Required length: N=6.765816 sites #Total entropy: NH=9.775307 bits #Recommended expected length: omega=12.038583 sites (for NH=9.783400) ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ########### # We're here because the actual target coverage was not satisfactory, # so we're changing the --target-coverage param. Given that we're # changing that, take a guess at how we should change --expected-lengths # in order to also hit the total entropy target. cd /cluster/data/galGal2/bed/multiz7way/phastCons/run.estimate # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.22 12 ave.{cons,noncons}.mod #Recommended expected length: omega=12.334751 sites (for NH=9.783400) # OK, --expected-lengths 12.3 # Now set up cluster job to estimate model parameters given free params # --target-coverage and --expected-lengths and the data. ssh kk9 mkdir /cluster/data/galGal2/bed/multiz7way/phastCons/run.estimate cd /cluster/data/galGal2/bed/multiz7way/phastCons/run.estimate # FIRST ITERATION: Use ../starting-tree.noncons.mod: cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.noncons.mod --gc 0.419 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 12 --target-coverage 0.189 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model. 
cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.419 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 12.3 --target-coverage 0.22 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/santest/scratch/galGal2/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end # run cluster job para create jobList para try, check, push, check, ... #Completed: 1161 of 1161 jobs #Average job time: 357s 5.95m 0.10h 0.00d #Longest finished job: 1435s 23.92m 0.40h 0.02d #Submission to last job: 1440s 24.00m 0.40h 0.02d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kolossus cd /cluster/data/galGal2/bed/multiz7way/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod # FIRST ITERATION: #ave.cons.mod:TREE: (((galGal2:0.075819,((hg17:0.038800,mm5:0.063133):0.039437,monDom1:0.074646):0.048105):0.022112,xenTro1:0.134733):0.038043,(danRer2:0.102215,tetNig1:0.105871):0.038043); #ave.noncons.mod:TREE: (((galGal2:0.249812,((hg17:0.127082,mm5:0.208431):0.130021,monDom1:0.244194):0.158600):0.072560,xenTro1:0.447059):0.127473,(danRer2:0.339178,tetNig1:0.351891):0.127473); # SECOND ITERATION: #ave.cons.mod:TREE: (((galGal2:0.076928,((hg17:0.038920,mm5:0.063282):0.039737,monDom1:0.074819):0.048432):0.022500,xenTro1:0.135616):0.038629,(danRer2:0.103217,tetNig1:0.106871):0.038629); #ave.noncons.mod:TREE: 
(((galGal2:0.254336,((hg17:0.128177,mm5:0.210032):0.131589,monDom1:0.246276):0.160281):0.074147,xenTro1:0.451898):0.129916,(danRer2:0.344050,tetNig1:0.356388):0.129916); cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. # The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # Check the total entropy figure to see if we're way off. # We probably don't need to do this, since it has always been very close # if not the same as what we used above, but it only takes a second. # FIRST ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.189 12 ave.{cons,noncons}.mod #Recommended expected length: omega=10.404849 sites (for NH=9.783400) # OK, tweak --expected-lengths to 10.4 below (and for next iteration). # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.22 12.3 ave.{cons,noncons}.mod #Recommended expected length: omega=12.405521 sites (for NH=9.783400) # OK, tweak --expected-lengths to 12.4 below (and for next iteration). # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. 
Then we measure the # conserved elements coverage, and if that's not satisfactory then we # adjust parameters and repeat. ssh kk9 mkdir /cluster/data/galGal2/bed/multiz7way/phastCons/run.phast cd /cluster/data/galGal2/bed/multiz7way/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 12.4 --target-coverage 0.22 \ --quiet --seqname $chr --idpref $pref \ --viterbi /santest/scratch/galGal2/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /santest/scratch/galGal2/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /santest/scratch/galGal2/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /santest/scratch/galGal2/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/santest/scratch/galGal2/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para create jobList para try, check, push, check, ... #Completed: 1161 of 1161 jobs #Average job time: 15s 0.24m 0.00h 0.00d #Longest finished job: 20s 0.33m 0.01h 0.00d #Submission to last job: 36s 0.60m 0.01h 0.00d # back on kolossus: # combine predictions and transform scores to be in 0-1000 interval cd /cluster/data/galGal2/bed/multiz7way/phastCons awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ /santest/scratch/galGal2/phastCons/ELEMENTS/*.bed \ | /cluster/bin/scripts/lodToBedScore > all.bed ssh hgwdev # Now measure coverage of CDS by conserved elements. # We want the "cover" figure to be close to 68.9%. # But what to use as CDS? There's a refGene but it's terribly incomplete. # There's a human proteins track, but that's only what human and chicken # have in common. There's a xenoRefGene too... again comparative- # influenced, but a lot more complete than refGene presumably. 
# So just go with first cut results for now? cd /cluster/data/galGal2/bed/multiz7way/phastCons featureBits -enrichment galGal2 refGene:cds all.bed featureBits -enrichment galGal2 xenoRefGene:cds all.bed featureBits -enrichment galGal2 blastHg17KG all.bed # FIRST ITERATION: pretty close! Let's try to bump the coverage up a bit. #refGene:cds 0.340%, all.bed 2.831%, both 0.207%, cover 60.76%, enrich 21.46x #xenoRefGene:cds 1.644%, all.bed 2.831%, both 1.114%, cover 67.76%, enrich 23.93x #blastHg17KG 2.001%, all.bed 2.831%, both 1.109%, cover 55.44%, enrich 19.58x # SECOND ITERATION: was hoping for a bit more, but this will do. #refGene:cds 0.340%, all.bed 3.008%, both 0.212%, cover 62.27%, enrich 20.70x #xenoRefGene:cds 1.644%, all.bed 3.008%, both 1.144%, cover 69.55%, enrich 23.12x #blastHg17KG 2.001%, all.bed 3.008%, both 1.143%, cover 57.14%, enrich 19.00x # Having met the CDS coverage target, load up the results. hgLoadBed galGal2 phastConsElements7way all.bed # Create wiggle on the small cluster ssh kki mkdir /cluster/data/galGal2/bed/multiz7way/phastCons/run.wib cd /cluster/data/galGal2/bed/multiz7way/phastCons/run.wib rm -rf /santest/scratch/galGal2/phastCons/wib mkdir -p /santest/scratch/galGal2/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /santest/scratch/galGal2/phastCons/wib zcat `ls -1 /santest/scratch/galGal2/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastCons.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /santest/scratch/galGal2/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigEncode $chr >> jobList end para create jobList para try, check, push, check, ... 
#Completed: 52 of 52 jobs #Average job time: 5s 0.09m 0.00h 0.00d #Longest finished job: 27s 0.45m 0.01h 0.00d #Submission to last job: 30s 0.50m 0.01h 0.00d # back on kolossus, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from bluearc cd /cluster/data/galGal2/bed/multiz7way/phastCons rm -rf wib POSTPROBS rsync -av /santest/scratch/galGal2/phastCons/wib . rsync -av /santest/scratch/galGal2/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir /gbdb/galGal2/multiz7way/wib cd /cluster/data/galGal2/bed/multiz7way/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/galGal2/multiz7way/wib/ hgLoadWiggle galGal2 phastCons7way \ -pathPrefix=/gbdb/galGal2/multiz7way/wib wib/*.wig rm wiggle.tab # make top-5000 list and launcher on Adam's home page: sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \ > top5000.bed # Don't have group write perms to /cse/grads/acs/ so can't do this anymore: /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \ /cse/grads/acs/public_html/gg-top5000-7way \ "top 5000 conserved elements (7way)" galGal2 # and clean up bluearc. 
rm -r /santest/scratch/galGal2/phastCons/{ELEMENTS,POSTPROBS,wib} rm -r /santest/scratch/galGal2/chrom # MULTIZ CHICKEN/HUMAN/OPOSSUM/ZEBRAFISH (DONE 4/28/05 angie) # Backup plan in case QA doesn't have time to release the 7way and # all the pairwise it uses just these nicely spaced critters: # tree: ((galGal2 (hg17 monDom1)) danRer2) ssh kolossus # Put run dir on /santest/scratch for now, to ease load on fileservers: mkdir -p /santest/scratch/galGal2/multiz4way.2005-04-28 cd /santest/scratch/galGal2/multiz4way.2005-04-28 # Setup: Copy pairwise MAF to /santest/scratch: mkdir /santest/scratch/chickenMultiz4way foreach db (hg17 monDom1 danRer2) cp -pR /cluster/data/galGal2/bed/blastz.$db/mafNet \ /santest/scratch/chickenMultiz4way/$db end # Most but not all of those dirs contain gzip'd maf -- gzip where # necessary to make them all uniform (& less cluster I/O): gzip /santest/scratch/chickenMultiz4way/*/*.maf # Make output dir: mkdir maf # Create script to run multiz.v10 in the right order: cat << '_EOF_' > doMultiz.csh #!/bin/csh -fe set chr = $1 set tmp = /scratch/tmp/chickenMultiz4way.$chr mkdir $tmp set REF = galGal2.$chr set HUM = /santest/scratch/chickenMultiz4way/hg17/$chr.maf.gz set MON = /santest/scratch/chickenMultiz4way/monDom1/$chr.maf.gz set DAN = /santest/scratch/chickenMultiz4way/danRer2/$chr.maf.gz set DEST = /santest/scratch/galGal2/multiz4way.2005-04-28/maf/$chr.maf set MZ10 = /cluster/bin/penn/multiz.v10 set PROJECT = /cluster/bin/penn/maf_project # final tree: ((galGal2 (hg17 monDom1)) danRer2) # subtree: (galGal2 (hg17 monDom1)) if ( -s $HUM && -s $MON ) then echo "Aligning $HUM $MON..." zcat $HUM > $tmp/$chr.Hum.maf zcat $MON > $tmp/$chr.Mon.maf $MZ10 $tmp/$chr.Hum.maf $tmp/$chr.Mon.maf 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." 
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.HumMon.maf else if ( -s $HUM ) then zcat $HUM > $tmp/$chr.HumMon.maf else if ( -s $MON ) then zcat $MON > $tmp/$chr.HumMon.maf endif # final tree: ((galGal2 (hg17 monDom1)) danRer2) if ( -s $tmp/$chr.HumMon.maf && -s $DAN ) then echo "Aligning $tmp/$chr.HumMon.maf $DAN..." zcat $DAN > $tmp/$chr.Dan.maf $MZ10 $tmp/$chr.HumMon.maf $tmp/$chr.Dan.maf 1 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.HumMonDan.maf mv $tmp/$chr.HumMonDan.maf $DEST else if ( -s $DAN ) then zcat $DAN > $DEST touch $tmp/$chr.stub.maf else if ( -s $tmp/$chr.HumMon.maf ) then cp $tmp/$chr.HumMon.maf $DEST endif rm $tmp/$chr.*.maf rmdir $tmp '_EOF_' # << keep emacs coloring happy chmod 755 doMultiz.csh awk '{print "./doMultiz.csh " $1;}' /cluster/data/galGal2/chrom.sizes \ > jobs.lst # chr6_random and chr8_random always cause problems due to lack of # alignment coverage -- edit them out of the list now. # Run on small cluster ssh kki cd /santest/scratch/galGal2/multiz4way.2005-04-28 para create jobs.lst para try, check, push, check, ... 
#Completed: 52 of 52 jobs #Average job time: 55s 0.92m 0.02h 0.00d #Longest finished job: 384s 6.40m 0.11h 0.00d #Submission to last job: 417s 6.95m 0.12h 0.00d # Copy back over to /cluster/store-land rsync -av /santest/scratch/galGal2/multiz4way.2005-04-28 \ /cluster/data/galGal2/bed ln -s /cluster/data/galGal2/bed/multiz4way.2005-04-28 \ /cluster/data/galGal2/bed/multiz4way # make /gbdb/ links to 4way maf files: ssh hgwdev mkdir -p /gbdb/galGal2/multiz4way/maf/multiz4way ln -s /cluster/data/galGal2/bed/multiz4way/maf/chr*.maf \ /gbdb/galGal2/multiz4way/maf/multiz4way/ # load into database cd /tmp hgLoadMaf -warn galGal2 multiz4way \ -pathPrefix=/gbdb/galGal2/multiz4way/maf/multiz4way # put 4way MAF out for download ssh kolossus cd /cluster/data/galGal2/bed/multiz4way mkdir mafDownload foreach f (maf/*.maf) nice gzip -c $f > mafDownload/$f:t.gz end cd mafDownload nice md5sum *.maf.gz > md5sum.txt # make a README.txt there too. ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/galGal2/multiz4way ln -s /cluster/data/galGal2/bed/multiz4way/mafDownload/{*.maf.gz,md5sum.txt,README.txt} \ /usr/local/apache/htdocs/goldenPath/galGal2/multiz4way # Cleanup rm -rf /santest/scratch/galGal2/multiz4way.2005-04-28 rm -rf /santest/scratch/chickenMultiz4way/ # PHASTCONS 4WAY WITH METHODS FROM PAPER (DONE 4/28/05 angie) # Backup plan in case QA doesn't have time to release the 7way and # all the pairwise it uses just these nicely spaced critters: # ((galGal2 (hg17 monDom1)) danRer2) ssh kolossus mkdir -p /santest/scratch/galGal2/chrom cp -p /cluster/data/galGal2/*/chr*.fa /santest/scratch/galGal2/chrom/ mkdir /santest/scratch/galGal2/maf4way cp -p /cluster/data/galGal2/bed/multiz4way/maf/*.maf \ /santest/scratch/galGal2/maf4way # Split chrom fa & maf into smaller windows for phastCons: ssh kki mkdir /cluster/data/galGal2/bed/multiz4way/phastCons mkdir /cluster/data/galGal2/bed/multiz4way/phastCons/run.split cd /cluster/data/galGal2/bed/multiz4way/phastCons/run.split set WINDOWS 
= /santest/scratch/galGal2/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/santest/scratch/galGal2/chrom set WINDOWS=/santest/scratch/galGal2/phastCons/WINDOWS set maf=$1 set c = $maf:t:r:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \ -O galGal2,hg17,monDom1,danRer2 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/santest/scratch/galGal2/maf4way/*.maf) if (-s $file) then echo "doSplit.sh {check in exists+ $file}" >> jobList endif end para create jobList para try, check, push, check... #Completed: 51 of 52 jobs #Crashed: 1 jobs #Average job time: 28s 0.47m 0.01h 0.00d #Longest finished job: 266s 4.43m 0.07h 0.00d #Submission to last job: 266s 4.43m 0.07h 0.00d # 1 failure because there wasn't enough info in chrE64, oh well: #WARNING: skipping partition 1; insufficient informative sites. # Get genome-wide all-species average GC content # by running msa_view --aggregate --summary-only on the WINDOWS .SS files. # msa_view needs seekable single-ss file, so unpack each to /tmp and # compute overall average: ssh kolossus cd /cluster/data/galGal2/bed/multiz4way/phastCons/ cp /dev/null gcs.txt # Do this on just a random sample of windows: foreach f (`ls -1 /santest/scratch/galGal2/phastCons/WINDOWS/* \ | randomLines stdin 200 stdout`) echo $f:t:r:r zcat $f > /tmp/galGal2.sample.ss /cluster/bin/phast/msa_view \ --aggregate galGal2,hg17,monDom1,danRer2 \ -i SS --summary-only /tmp/galGal2.sample.ss \ | awk '$1 == "[aggregate]" {printf "%0.4f\n", $3 + $4}' \ >> gcs.txt end ave gcs.txt #average 0.418849 # OK, so use 0.419 as the --gc parameter below. 
rm /tmp/galGal2.sample.ss ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY ############# # Use phyloFit to make an initial model: ssh kolossus cd /cluster/data/galGal2/bed/multiz4way/phastCons /cluster/bin/phast/msa_view -i MAF \ /santest/scratch/galGal2/maf4way/chr{1,5,10,Z_random,32}.maf \ --aggregate galGal2,hg17,monDom1,danRer2 \ > /santest/scratch/galGal2/phastCons/all.ss /cluster/bin/phast/phyloFit /santest/scratch/galGal2/phastCons/all.ss \ --tree "((galGal2,(hg17,monDom1)),danRer2)" \ -i SS --out-root starting-tree cat starting-tree.mod #TREE: ((galGal2:0.217116,(hg17:0.159841,monDom1:0.154894):0.109434):0.197513,danRer2:0.197513); # Use consEntropy --NH to make it suggest a --expected-lengths param # that we should try next. Adam ran this on hg17 to find out the # total entropy of the hg17 model: # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045 #Relative entropy: H=0.608216 bits/site #Required length: N=16.085437 sites #Total entropy: NH=9.783421 bits # Our target is that NH result: 9.7834 bits. # phyloFit makes one .mod but consEntropy wants two... # Multiply each subst rate on the TREE line by 3.3 which is roughly the # ratio of noncons to cons in # /cluster/data/galGal2/bed/multiz7way/cons/run.estimate/ave.*.mod /cluster/bin/phast/tree_doctor -s 3.3 starting-tree.mod \ > starting-tree.noncons.mod # Use params from last run of 7way: /cluster/bin/phast/consEntropy --NH 9.7834 0.22 12.4 \ starting-tree*.mod #( Solving for new omega: 12.400000 10.781171 10.586253 10.582989 ) #Transition parameters: gamma=0.220000, omega=12.400000, mu=0.080645, nu=0.022746 #Relative entropy: H=0.765400 bits/site #Required length: N=13.193791 sites #Total entropy: NH=10.098533 bits #Recommended expected length: omega=10.582989 sites (for NH=9.783400) # OK, start with 10.6 below. 
############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ########### # We're here because the actual target coverage was not satisfactory, # so we're changing the --target-coverage param. Given that we're # changing that, take a guess at how we should change --expected-lengths # in order to also hit the total entropy target. cd /cluster/data/galGal2/bed/multiz4way/phastCons/run.estimate # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.35 12 ave.{cons,noncons}.mod #Recommended expected length: omega=18.452486 sites (for NH=9.783400) # OK, --expected-lengths 18.5 # Now set up cluster job to estimate model parameters given free params # --target-coverage and --expected-lengths and the data. ssh kk9 mkdir /cluster/data/galGal2/bed/multiz4way/phastCons/run.estimate cd /cluster/data/galGal2/bed/multiz4way/phastCons/run.estimate # FIRST ITERATION: Use ../starting-tree.noncons.mod: cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.noncons.mod --gc 0.419 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 10.6 --target-coverage 0.22 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model. cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.419 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 18.5 --target-coverage 0.35 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/santest/scratch/galGal2/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end # run cluster job para create jobList para try, check, push, check, ... 
#Completed: 1160 of 1160 jobs #Average job time: 16s 0.26m 0.00h 0.00d #Longest finished job: 45s 0.75m 0.01h 0.00d #Submission to last job: 285s 4.75m 0.08h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kolossus cd /cluster/data/galGal2/bed/multiz4way/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod # FIRST ITERATION: #ave.cons.mod:TREE: ((galGal2:0.069779,(hg17:0.067234,monDom1:0.063760):0.048611):0.083881,danRer2:0.083881); #ave.noncons.mod:TREE: ((galGal2:0.250372,(hg17:0.241752,monDom1:0.227546):0.176240):0.307772,danRer2:0.307772); # SECOND ITERATION: #ave.cons.mod:TREE: ((galGal2:0.084421,(hg17:0.080762,monDom1:0.076024):0.057691):0.102428,danRer2:0.102428); #ave.noncons.mod:TREE: ((galGal2:0.266716,(hg17:0.255716,monDom1:0.239179):0.183533):0.330915,danRer2:0.330915); cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. # The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. 
# NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # Check the total entropy figure to see if we're way off. # We probably don't need to do this, since it has always been very close # if not the same as what we used above, but it only takes a second. # FIRST ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.22 10.6 ave.{cons,noncons}.mod #Recommended expected length: omega=7.336732 sites (for NH=9.783400) # That is so low! But OK, try it below. # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.35 18.5 ave.{cons,noncons}.mod #Recommended expected length: omega=17.857022 sites (for NH=9.783400) # OK, tweak --expected-lengths to 17.9 below (and for next iteration). # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. Then we measure the # conserved elements coverage, and if that's not satisfactory then we # adjust parameters and repeat. ssh kk9 mkdir /cluster/data/galGal2/bed/multiz4way/phastCons/run.phast cd /cluster/data/galGal2/bed/multiz4way/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. 
'{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 17.9 --target-coverage 0.35 \ --quiet --seqname $chr --idpref $pref \ --viterbi /santest/scratch/galGal2/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /santest/scratch/galGal2/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /santest/scratch/galGal2/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /santest/scratch/galGal2/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/santest/scratch/galGal2/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para create jobList para try, check, push, check, ... #Completed: 1160 of 1160 jobs #Average job time: 10s 0.17m 0.00h 0.00d #Longest finished job: 16s 0.27m 0.00h 0.00d #Submission to last job: 138s 2.30m 0.04h 0.00d # back on kolossus: # combine predictions and transform scores to be in 0-1000 interval cd /cluster/data/galGal2/bed/multiz4way/phastCons awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ /santest/scratch/galGal2/phastCons/ELEMENTS/*.bed \ | /cluster/bin/scripts/lodToBedScore > all.bed ssh hgwdev # Now measure coverage of CDS by conserved elements. # We want the "cover" figure to be close to 68.9%. # But what to use as CDS? There's a refGene but it's terribly incomplete. # There's a human proteins track, but that's only what human and chicken # have in common. There's a xenoRefGene too... again comparative- # influenced, but a lot more complete than refGene presumably. # So just go with first cut results for now? cd /cluster/data/galGal2/bed/multiz4way/phastCons featureBits -enrichment galGal2 refGene:cds all.bed featureBits -enrichment galGal2 xenoRefGene:cds all.bed featureBits -enrichment galGal2 blastHg17KG all.bed # FIRST ITERATION: too low, increase --target-coverage and try again. 
#refGene:cds 0.340%, all.bed 2.488%, both 0.187%, cover 55.04%, enrich 22.12x #xenoRefGene:cds 1.644%, all.bed 2.488%, both 1.011%, cover 61.48%, enrich 24.71x #blastHg17KG 2.001%, all.bed 2.488%, both 0.993%, cover 49.63%, enrich 19.95x # SECOND ITERATION: this'll do: #refGene:cds 0.340%, all.bed 3.497%, both 0.231%, cover 67.79%, enrich 19.39x #xenoRefGene:cds 1.644%, all.bed 3.497%, both 1.260%, cover 76.62%, enrich 21.91x #blastHg17KG 2.001%, all.bed 3.497%, both 1.263%, cover 63.12%, enrich 18.05x # Having met the CDS coverage target, load up the results. hgLoadBed galGal2 phastConsElements4way all.bed # Create wiggle on the small cluster ssh kki mkdir /cluster/data/galGal2/bed/multiz4way/phastCons/run.wib cd /cluster/data/galGal2/bed/multiz4way/phastCons/run.wib rm -rf /santest/scratch/galGal2/phastCons/wib mkdir -p /santest/scratch/galGal2/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /santest/scratch/galGal2/phastCons/wib zcat `ls -1 /santest/scratch/galGal2/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastCons.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /santest/scratch/galGal2/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigEncode $chr >> jobList end para create jobList para try, check, push, check, ... #Completed: 51 of 51 jobs #Average job time: 5s 0.09m 0.00h 0.00d #Longest finished job: 21s 0.35m 0.01h 0.00d #Submission to last job: 32s 0.53m 0.01h 0.00d # back on kolossus, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from bluearc cd /cluster/data/galGal2/bed/multiz4way/phastCons rm -rf wib POSTPROBS rsync -av /santest/scratch/galGal2/phastCons/wib . rsync -av /santest/scratch/galGal2/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir /gbdb/galGal2/multiz4way/wib cd /cluster/data/galGal2/bed/multiz4way/phastCons chmod 775 . 
wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/galGal2/multiz4way/wib/ hgLoadWiggle galGal2 phastCons4way \ -pathPrefix=/gbdb/galGal2/multiz4way/wib wib/*.wig rm wiggle.tab # make top-5000 list and launcher on Adam's home page: sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \ > top5000.bed # Don't have group write perms to /cse/grads/acs/ so can't do this anymore: /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \ /cse/grads/acs/public_html/gg-top5000-4way \ "top 5000 conserved elements (4way)" galGal2 # and clean up bluearc. rm -r /santest/scratch/galGal2/phastCons/{ELEMENTS,POSTPROBS,wib} rm -r /santest/scratch/galGal2/chrom # MAKE AN NCBI LIFT FILE FOR CHICKEN (DONE 4/29/05 angie) # In order to import dbSNP data, must lift NCBI-named contigs to # chromosomes. I don't see an agp file in the chicken ftp dir, # so seq_contig.md will have to do: cd /cluster/data/galGal2/jkStuff wget ftp://ftp.ncbi.nih.gov/genomes/Gallus_gallus/seq_contig.md.gz # Wow, *lots* of differences between NCBI and our AGP on chrUn, # and some on chr{W,Z}_random as well. Resolved them by making # small edits to jkStuff/chr{Un,W_random,Z_random}.ncbiAlike.agp # and writing a galGal2-tailored script to attempt to line up seq_contig # and AGP by size/order of sequence.. icch, but that's the best # I can do without an AGP from NCBI! Also, seq_contig shows # everything as + strand... I hope that means that NCBI reverse- # complemented the sequences that appear - in the AGP, but have not # verified. zcat seq_contig.md.gz \ | ./seq_contig_to_lift.pl \ > liftNcbi.lft awk '{print $4 " " $5;}' liftNcbi.lft | uniq | sort > /tmp/1 awk '{print $1 " " $2;}' ../chrom.sizes | sort > /tmp/2 diff /tmp/1 /tmp/2 #43c43 #< chr8_random 5783 #--- #> chr8_random 15783 # chr8_random diff is NBD: the agp has an extra gap at the end. # LOAD DBSNP (DONE 4/29/05 angie) # Using Daryl's method from makeHg17.doc... 
ssh kolossus set db = galGal2 set org = chicken set build = 124 set dir = /santest/scratch/snp/$db/build$build mkdir -p $dir $dir/ds_ch.xml $dir/det $dir/str $dir/loc $dir/seq cd $dir # started 8:10pm - finished in ~10min, not nearly as much for chicken # as human! # Check to make sure the chrMT file is included screen ftp ftp.ncbi.nih.gov cd snp/$org/XML prompt mget ds_ch*.xml.gz exit # screen exit # machine # Don't know what Daryl means by this: check chromStart for each locType cp -f {$HOME}/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts chmod 775 /cluster/bin/scripts/parseDbSnpXML ssh kk9 set db = galGal2 set org = chicken set build = 124 set dir = /santest/scratch/snp/$db/build$build cd $dir touch jobList foreach file ( $dir/ds_ch*.xml.gz ) set out = $file:t:r echo /cluster/bin/scripts/parseDbSnpXML $file $dir $out.contig >> jobList end # I removed these from the job list: # ds_chMulti.xml.gz # ds_chNotOn.xml.gz # ds_chLG.xml.gz # ds_chMasked.xml.gz para create jobList para push; para check ... #Completed: 35 of 35 jobs #Average job time: 117s 1.95m 0.03h 0.00d #Longest finished job: 777s 12.95m 0.22h 0.01d #Submission to last job: 777s 12.95m 0.22h 0.01d # exit kk # back on kolossus: cd $dir # some of the NT contigs are not in the liftSpec - this is # expected as snps that map to alternate assemblies (Celera) are # in the original files, but we disregard their mappings. time liftUp $db.build$build.bed /cluster/data/$db/jkStuff/liftNcbi.lft \ warn det/ds_ch*.xml.contig.det.gz |& grep liftSpec \ | awk '{print $1;}' | sort -u > notLifted.log #19.722u 3.497s 0:27.95 83.0% 0+0k 0+0io 0pf+0w time gzip $db.build$build.bed #13.034u 0.318s 0:14.07 94.8% 0+0k 0+0io 0pf+0w du -sh . #558M . 
mkdir -p /cluster/data/$db/bed/snp rsync -av $dir /cluster/data/$db/bed/snp set dir = /cluster/data/$db/bed/snp/build$build ssh hgwdev set db = galGal2 set org = chicken set build = 124 set dir = /cluster/data/$db/bed/snp/build$build cd $dir # hgLoadBed is the important step - check to make sure there are no warnings time nice hgLoadBed $db snp $db.build$build.bed.gz \ -sqlTable=${HOME}/kent/src/hg/lib/snp.sql #Loaded 2863139 elements of size 16 #82.270u 14.920s 15:07.24 10.7% 0+0k 0+0io 17803pf+0w # If there are warnings, do this to analyze what didn't get loaded right: #zcat $db.build$build.bed.gz | sort > /tmp/1 #echo select \* from snp | hgsql -N galGal2 \ #| perl -wpe 's/^[0-9]+\t//' | sort > /tmp/2 #diff /tmp/1 /tmp/2 | less # Copy over snpExceptions table from hg17: hgsql galGal2 -e \ 'create table snpExceptions select * from hg17.snpExceptions' # basic snp table is now loaded, but exception column needs to be updated # run queries from snpException.query against snp table umask 002 mkdir -p /usr/local/apache/htdocs/qa/test-results/snpException/$db/build$build cd /usr/local/apache/htdocs/qa/test-results/snpException/$db/build$build time nice snpException $db 0 ${db}snpException > ${db}snpException.log #0.690u 1.300s 13:37.00 0.2% 0+0k 0+0io 206pf+0w # check alignment of flanking sequences time nice snpValid $db /cluster/data/$db/bed/snp/build$build/seq \ > ${db}snpValid.log #1110.730u 40.640s 21:22.09 89.8% 0+0k 0+0io 2942pf+0w ######################################################################### # MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram) # CHICKEN 13K ARRAY PROBES (DONE 5/19/2005 Andy) mkdir /cluster/data/galGal2/bed/chicken13k cd /cluster/data/galGal2/bed/chicken13k wget http://www.fhcrc.org/shared_resources/genomics/dna_array/spotted_arrays/chicken_array/chicken13k_anno_v2.0.txt cut -f1,8,9,10 chicken13k_anno_v2.0.txt | grep chr \ | awk '{printf("%s\t%s\t%s\t%s\n", $2, $3, $4, $1)}' > chicken13k.bed # Not all the probes are 
mapped. tail +2 chicken13k_anno_v2.0.txt | wc -l # 15769 wc -l chicken13k.bed # 13174 # So that's a loss of about 17% of the probes. # Double-checked the half-open bed start in the browser with the corresponding EST. # It appears to be bed-ready. hgLoadBed galGal2 chicken13k chicken13k.bed # trackDb entry: track chicken13k shortLabel 13k Probes longLabel Probes for 13k Microarray group regulation priority 91.0 visibility hide type bed 4 . # trackDb findSpec: searchName chicken13k searchTable chicken13k searchType bed searchMethod exact searchPriority 55 # Add information to the chicken13kInfo table: cut -f1-7,11-15 chicken13k_anno_v2.0.txt | tail +2 > chicken13kInfo.txt hgsql galGal2 < ~/kent/src/hg/lib/chicken13kInfo.sql echo "load data local infile 'chicken13kInfo.txt' into table chicken13kinfo" | hgsql galGal2 ############################################################################ ## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram) ssh pk cd /cluster/data/mm8/bed/blastzGalGal2.2006-02-18 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > swap.out 2>&1 & time nice -n +19 featureBits galGal2 chainMm8Link # 57074100 bases of 1054197620 (5.414%) in intersection # SWAP CHAINS/NET RN4 (DONE 3/30/06 angie) ssh kkr3u00 mkdir /cluster/data/galGal2/bed/blastz.rn4.swap cd /cluster/data/galGal2/bed/blastz.rn4.swap doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.galGal2/DEF \ -workhorse kkr3u00 >& do.log & tail -f do.log ln -s blastz.rn4.swap /cluster/data/galGal2/bed/blastz.rn4 # BLASTZ/CHAIN/NET XENTRO2 (DONE 4/20/06 angie) ssh kkstore03 mkdir /cluster/data/galGal2/bed/blastz.xenTro2.2006-04-20 cd /cluster/data/galGal2/bed/blastz.xenTro2.2006-04-20 cat << '_EOF_' > DEF # chicken vs. 
frog BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 # Use same params as used for galGal2-xenTro1 (see makeXenTro1.doc) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Chicken galGal2 SEQ1_DIR=/scratch/hg/galGal2/nib SEQ1_LEN=/cluster/data/galGal2/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Frog xenTro2 - single chunk big enough to run two of the # largest scaffolds in one job SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/galGal2/bed/blastz.xenTro2.2006-04-20 '_EOF_' # << emacs doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/galGal2XenTro2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \ >& do.log & tail -f do.log ln -s blastz.xenTro2.2006-04-20 /cluster/data/galGal2/bed/blastz.xenTro2 ##################################################################### # SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie) # File emailed from Xinwei She mkdir /cluster/data/galGal2/bed/genomicSuperDups cd /cluster/data/galGal2/bed/genomicSuperDups sed -e 's/\t_\t/\t-\t/' galGal2_genomicSuperDup.tab \ | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \ | hgLoadBed galGal2 genomicSuperDups stdin \ -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql