# for emacs: -*- mode: sh; -*- # Drosophila yakuba -- # # WUSTL's # http://www.genome.wustl.edu/??? # # DOWNLOAD SEQUENCE (DONE 5/21/04 angie) ssh kksilo mkdir /cluster/store6/droYak1 cd /cluster/data ln -s /cluster/store6/droYak1 droYak1 cd /cluster/data/droYak1 mkdir downloads cd downloads wget ftp://genome.wustl.edu/private/lhillier/old/dy040407/agp_040407.gz wget ftp://genome.wustl.edu/private/lhillier/old/dy040407/fa_040407.gz tar xvzf agp_040407.gz tar xvzf fa_040407.gz foreach c (2L 2R 2h 3L 3R 3h 4 U X Xh Yh) mkdir ../$c if (-e chr$c.fa) then mv chr$c.fa ../$c mv chr$c.agp ../$c endif if (-e chr${c}_random.fa) then mv chr${c}_random.fa ../$c mv chr${c}_random.agp ../$c endif end # download mitochondrion sequence mkdir M cd M # go to http://www.ncbi.nih.gov/ and search Genome for # "yakuba mitochondrion". That shows the gi number: # 5834829 # Use that number in the entrez linking interface to get fasta: wget -O chrM.fa \ 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834829&dopt=FASTA' # Edit chrM.fa: make sure the long fancy header line says it's the # Drosophila yakuba mitochondrion complete genome, and then replace the # header line with just ">chrM". cd .. # checkAgpAndFa prints out way too much info -- keep the end/stderr only: foreach c (?{,?}) foreach agp ($c/chr$c{,_random}.agp) if (-e $agp) then set fa = $agp:r.fa echo checking consistency of $agp and $fa checkAgpAndFa $agp $fa | tail -1 endif end end faSize */chr*.fa #179076649 bases (9662478 N's 169414171 real) in 18 sequences in 18 files #Total size: mean 9948702.7 sd 13848622.8 min 4082 (chr4_random) max 45280584 (chrU) median 1395135 #N count: mean 536804.3 sd 1230439.2 # see what kind of gap types we have: awk '$5 == "N" {print $7;}' */chr*.agp | uniq | sort | uniq #contig #fragment # wow, biggest gap size is 1800! awk '$5 == "N" {print $6;}' */chr*.agp | sort -nr | head -1 # rough hist of gap sizes from AGP: awk '$5 == "N" {print $6;}' */chr*.agp | textHistogram stdin -binSize=100 # fancy hist of gap size from FASTA: cat */chr*.fa | faGapSizes -niceSizes=10,50,100,200,300,700,1000,1800 stdin # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 5/21/04 angie) ssh kksilo cd /cluster/data/droYak1 foreach c (?{,?}) foreach agp ($c/chr$c{,_random}.agp) if (-e $agp) then set fa = $agp:r.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak splitFaIntoContigs $agp $fa . -nSize=5000000 endif end end # splitFaIntoContigs makes new dirs for _randoms. Move their contents # back into the main chrom dirs and get rid of the _random dirs. foreach d (*_random) set base = `echo $d | sed -e 's/_random$//'` mkdir -p $base/lift mv $d/lift/oOut.lst $base/lift/rOut.lst mv $d/lift/ordered.lft $base/lift/random.lft mv $d/lift/ordered.lst $base/lift/random.lst rmdir $d/lift mv $d/* $base rmdir $d end # Make a "pseudo-contig" for processing chrM too: mkdir M/chrM_1 sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa mkdir M/lift echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst echo "chrM_1" > M/lift/ordered.lst # 7/13/04: Oops, first time around, used the wrong hardcoded value! # So liftAll.lft had too big of an upper limit for chrM until now: set msize = `faSize M/chrM.fa | awk '{print $1;}'` echo "0\tM/chrM_1\t$msize\tchrM\t$msize" > M/lift/ordered.lft # MAKE JKSTUFF AND BED DIRECTORIES (DONE 5/21/04 angie) # This used to hold scripts -- better to keep them inline here so # they're in CVS. Now it should just hold lift file(s) and # temporary scripts made by copy-paste from this file. mkdir /cluster/data/droYak1/jkStuff # This is where most tracks will be built: mkdir /cluster/data/droYak1/bed # CREATING DATABASE (DONE 5/21/04 angie) # Create the database. ssh hgwdev # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql hgsql -e 'create database droYak1' # CREATING GRP TABLE FOR TRACK GROUPING (DONE 2/23/04 angie) ssh hgwdev echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \ | hgsql droYak1 # MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS (DONE 5/21/04 angie) # Make nib/, unmasked until RepeatMasker and TRF steps are done. # Do this now so we can load up RepeatMasker and run featureBits; # can also load up other tables that don't depend on masking. ssh kksilo cd /cluster/data/droYak1 mkdir nib foreach c (?{,?}) foreach f ($c/chr${c}{,_random}.fa) if (-e $f) then echo "nibbing $f" /cluster/bin/i386/faToNib $f nib/$f:t:r.nib endif end end # Make symbolic links from /gbdb/droYak1/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/droYak1/nib foreach f (/cluster/data/droYak1/nib/chr*.nib) ln -s $f /gbdb/droYak1/nib end # Load /gbdb/droYak1/nib paths into database and save size info. cd /cluster/data/droYak1 hgsql droYak1 < $HOME/kent/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib droYak1 /gbdb/droYak1/nib */chr*.fa echo "select chrom,size from chromInfo" | hgsql -N droYak1 > chrom.sizes # take a look at chrom.sizes, should be 18 lines wc chrom.sizes # REPEATMASKER (DONE 5/21/04 angie) #- Split contigs into 500kb chunks, at gaps if possible: ssh kksilo cd /cluster/data/droYak1 foreach c (?{,?}) foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end #- Make the run directory and job list: cd /cluster/data/droYak1 cat << '_EOF_' > jkStuff/RMDrosophila #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/droYak1/$2 /bin/cp $2 /tmp/droYak1/$2/ cd /tmp/droYak1/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec drosophila $2 popd /bin/cp /tmp/droYak1/$2/$2.out ./ if (-e /tmp/droYak1/$2/$2.align) /bin/cp /tmp/droYak1/$2/$2.align ./ if (-e /tmp/droYak1/$2/$2.tbl) /bin/cp /tmp/droYak1/$2/$2.tbl ./ if (-e /tmp/droYak1/$2/$2.cat) /bin/cp /tmp/droYak1/$2/$2.cat ./ /bin/rm -fr /tmp/droYak1/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/droYak1/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/droYak1 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMDrosophila mkdir RMRun cp /dev/null RMRun/RMJobs foreach c (?{,?}) foreach d ($c/chr${c}{,_random}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/droYak1/jkStuff/RMDrosophila \ /cluster/data/droYak1/$d $f \ '{'check out line+ /cluster/data/droYak1/$d/$f.out'}' \ >> RMRun/RMJobs end end end #- Do the run ssh kk9 cd /cluster/data/droYak1/RMRun para create RMJobs para try, para check, para check, para push, para check,... #Completed: 452 of 452 jobs #Average job time: 2607s 43.44m 0.72h 0.03d #Longest job: 4138s 68.97m 1.15h 0.05d #Submission to last job: 14079s 234.65m 3.91h 0.16d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kksilo cd /cluster/data/droYak1 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (?{,?}) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif if (-e lift/random.lft && ! -z lift/random.lft) then liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \ > /dev/null endif cd .. end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/droYak1 hgLoadOut droYak1 */chr*.fa.out # VERIFY REPEATMASKER RESULTS (DONE 5/21/04 angie) # Eyeball some repeat annotations in the browser, compare to lib seqs. # Run featureBits on droYak1 and on a comparable genome build, and compare: ssh hgwdev featureBits droYak1 rmsk #29021326 bases of 169423277 (17.129%) in intersection # compare to dm1: featureBits dm1 rmsk #15452875 bases of 126527731 (12.213%) in intersection # wow, new coverage is a lot higher though lib doesn't seem to have # grown so much. running by LaDeana & Jim. # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 5/21/04 angie) # Warning: genome and organism fields must correspond # with defaultDb values hgsql -h genome-testdb hgcentraltest \ -e 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("droYak1", "Apr. 2004", "/gbdb/droYak1/nib", "D.yakuba", \ "chr2L:827700-845800", 1, 57, "D.yakuba", \ "Drosophila yakuba", "/gbdb/droYak1/html/description.html", \ 0, 0, "WUSTL version 1.0");' hgsql -h genome-testdb hgcentraltest \ -e 'INSERT INTO defaultDb (genome, name) values ("D.yakuba", "droYak1");' # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add droYak1 in all the right places and do make update mkdir /gbdb/droYak1/html # go public on genome-test #make alpha cvs commit makefile # Add trackDb directories mkdir drosophila/droYak1 cvs add drosophila/droYak1 cvs commit drosophila/droYak1 # MAKE LIFTALL.LFT (DONE 5/21/04 angie) # Redone 7/13/04: chrM/lift/ordered.lft was regenerated w/correct size. ssh kksilo cd /cluster/data/droYak1 cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft # SIMPLE REPEATS (TRF) (DONE 5/21/04 angie) ssh kksilo mkdir /cluster/data/droYak1/bed/simpleRepeat cd /cluster/data/droYak1/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach d (/cluster/data/droYak1/?{,?}/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end end tcsh jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn \ trf/*.bed > /dev/null # Load this into the database as so ssh hgwdev hgLoadBed droYak1 simpleRepeat \ /cluster/data/droYak1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 5/21/04 angie) # make a filtered version of the trf output: # keep trf's with period <= 12: ssh kksilo cd /cluster/data/droYak1/bed/simpleRepeat mkdir trfMask foreach f (trf/*.bed) echo -n "filtering $f... " awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords: mkdir trfMaskChrom foreach f (../../?{,?}/chr*.fa) set c = $f:t:r liftUp trfMaskChrom/$c.bed ../../jkStuff/liftAll.lft warn \ trfMask/${c}_[0-9]*.bed > /dev/null end # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 5/21/04 angie) ssh kksilo cd /cluster/data/droYak1 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (*/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end # This warning is rare (but less so this year, hmm) # -- if it indicates a problem, it's only # with the repeat annotation and doesn't affect our masking. #WARNING: negative rEnd: -1937 chrU:5728141-5728199 SAR_DM #WARNING: negative rEnd: -1555 chrU:5728312-5728374 SAR_DM #WARNING: negative rEnd: -2302 chrU:38943439-38943488 SAR_DM #WARNING: negative rEnd: -2121 chrU:38943621-38943652 SAR_DM foreach c (?{,?}) echo "repeat- and trf-masking contigs of chr$c, chr${c}_random" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end # same deal here: #WARNING: negative rEnd: -1937 chrU_2:725794-725852 SAR_DM #WARNING: negative rEnd: -1555 chrU_2:725965-726027 SAR_DM #WARNING: negative rEnd: -2302 chrU_8:3896563-3896612 SAR_DM #WARNING: negative rEnd: -2121 chrU_8:3896745-3896776 SAR_DM #- Rebuild the nib files, using the soft masking in the fa: foreach f (*/chr*.fa) faToNib -softMask $f nib/$f:t:r.nib end # Make one big 2bit file as well, and make a link to it in # /gbdb/droYak1/nib because hgBlat looks there: faToTwoBit */chr*.fa droYak1.2bit ssh hgwdev ln -s /cluster/data/droYak1/droYak1.2bit /gbdb/droYak1/nib/ # GOLD AND GAP TRACKS (DONE 5/21/04 angie) ssh hgwdev cd /cluster/data/droYak1 cp /dev/null chrom.lst foreach f (?{,?}/chr*.agp chrM) echo $f:t:r >> chrom.lst end hgGoldGapGl -noGl -chromLst=chrom.lst droYak1 /cluster/data/droYak1 . # featureBits fails if there's no chrM_gap, so make one: # echo "create table chrM_gap like chr1_gap" | hgsql droYak1 # oops, that won't work until v4.1, so do this for the time being: hgsql droYak1 -e 'create table chrM_gap select * from chr2L_gap where 0=1' # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 5/22/04 angie) ssh kksilo cd /cluster/data/droYak1 #- Build the .zip files -- no genbank for now. cat << '_EOF_' > jkStuff/zipAll.csh rm -rf zip mkdir zip zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp zip -j zip/chromOut.zip */chr*.fa.out zip -j zip/chromFa.zip */chr*.fa zip -j zip/chromFaMasked.zip */chr*.fa.masked cd bed/simpleRepeat zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed cd ../.. '_EOF_' # << this line makes emacs coloring happy csh ./jkStuff/zipAll.csh |& tee zipAll.log cd zip #- Look at zipAll.log to make sure all file lists look reasonable. #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test #- Copy the .zip files to hgwdev:/usr/local/apache/... ssh hgwdev cd /cluster/data/droYak1/zip set gp = /usr/local/apache/htdocs/goldenPath/droYak1 mkdir -p $gp/bigZips cp -p *.zip $gp/bigZips mkdir -p $gp/chromosomes foreach f ( ../*/chr*.fa ) zip -j $gp/chromosomes/$f:t.zip $f end cd $gp/bigZips md5sum *.zip > md5sum.txt cd $gp/chromosomes md5sum *.zip > md5sum.txt # Take a look at bigZips/* and chromosomes/*, update their README.txt's # Can't make refGene upstream sequence files - no refSeq for yakuba. # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 5/22/04 angie) ssh kkr1u00 # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /iscratch/i/droYak1/nib mkdir -p /iscratch/i/droYak1/nib cp -p /cluster/data/droYak1/nib/chr*.nib /iscratch/i/droYak1/nib # Pseudo-contig fa that have been repeat- and trf-masked: rm -rf /iscratch/i/droYak1/trfFa mkdir /iscratch/i/droYak1/trfFa foreach d (/cluster/data/droYak1/*/chr*_?{,?}) cp $d/$d:t.fa /iscratch/i/droYak1/trfFa end iSync # PRODUCING GENSCAN PREDICTIONS (DONE 5/22/04 angie) # Run on small cluster -- genscan needs big mem. ssh hgwdev mkdir /cluster/data/droYak1/bed/genscan cd /cluster/data/droYak1/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/droYak1/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) cp /dev/null genome.list foreach f ( `ls -1S /cluster/data/droYak1/*/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try, check, push, check, ... #Completed: 49 of 49 jobs #Average job time: 110s 1.84m 0.03h 0.00d #Longest job: 171s 2.85m 0.05h 0.00d #Submission to last job: 425s 7.08m 0.12h 0.00d # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". # Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/droYak1/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/droYak1/bed/genscan ldHgGene droYak1 genscan genscan.gtf hgPepPred droYak1 generic genscanPep genscan.pep hgLoadBed droYak1 genscanSubopt genscanSubopt.bed # SWAP BLASTZ MELANOGASTER-YAKUBA TO YAKUBA-MEL (dm1) (DONE 5/22/04 angie) ssh kolossus mkdir /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22 cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22 set aliDir = /cluster/data/dm1/bed/blastz.droYak1.2004-05-22 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom #623M /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChrom #623M unsorted #623M axtChrom rm -r unsorted # CHAIN MELANOGASTER BLASTZ (DONE 5/27/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ=chrUn_random $1 \ | axtChain stdin -verbose=0 \ /iscratch/i/droYak1/nib \ /cluster/bluearc/drosophila/dm1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... #Completed: 18 of 18 jobs #Average job time: 16s 0.26m 0.00h 0.00d #Longest job: 59s 0.98m 0.02h 0.00d #Submission to last job: 59s 0.98m 0.02h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain droYak1 ${c}_chainDm1 $i end # NET MELANOGASTER BLASTZ (DONE 5/27/04 angie) ssh kksilo cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain netClass -noAr noClass.net droYak1 dm1 melanogaster.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain rm noClass.net netFilter -syn melanogaster.net > melanogasterSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain netFilter -minGap=10 melanogaster.net | hgLoadNet droYak1 netDm1 stdin netFilter -minGap=10 melanogasterSyn.net | hgLoadNet droYak1 netSyntenyDm1 stdin # MAKE VSDM1 DOWNLOADABLES (DONE 5/27/04 angie) ssh kksilo cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain cp all.chain melanogaster.chain zip /cluster/data/droYak1/zip/melanogaster.chain.zip melanogaster.chain rm melanogaster.chain zip /cluster/data/droYak1/zip/melanogaster.net.zip melanogaster.net cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22 zip /cluster/data/droYak1/zip/axtChrom.zip axtChrom/chr*.axt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/droYak1/vsDm1 cd /usr/local/apache/htdocs/goldenPath/droYak1/vsDm1 mv /cluster/data/droYak1/zip/melanogaster*.zip . mv /cluster/data/droYak1/zip/axtChrom.zip . md5sum *.zip > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # AUTO UPDATE GENBANK MRNA RUN (DONE 7/15/04 angie) ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in # edits, check them in if necessary: diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf # Edit etc/genbank.conf and add these lines: # droYak1 (D. yakuba) droYak1.genome = /iscratch/i/droYak1/nib/chr*.nib droYak1.lift = /cluster/data/droYak1/jkStuff/liftAll.lft droYak1.refseq.mrna.native.load = no droYak1.refseq.mrna.xeno.load = yes droYak1.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005 droYak1.genbank.mrna.xeno.load = yes droYak1.genbank.est.xeno.load = no droYak1.downloadDir = droYak1 cvs ci etc/genbank.conf # Since D. yakuba is a new species for us, edit src/lib/gbGenome.c. # Pick some other browser species, & monkey-see monkey-do. cvs diff src/lib/gbGenome.c make cvs ci src/lib/gbGenome.c # Install to /cluster/data/genbank: make install-server ssh eieio cd /cluster/data/genbank # This is an -initial run, (xeno) RefSeq only: nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial droYak1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droYak1 featureBits droYak1 xenoRefGene #26062889 bases of 169423277 (15.383%) in intersection # Clean up: rm -rf work/initial.droYak1 # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial droYak1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droYak1 featureBits droYak1 mrna #350085 bases of 169423277 (0.207%) in intersection featureBits droYak1 xenoMrna #22644430 bases of 169423277 (13.366%) in intersection # Clean up: rm -rf work/initial.droYak1 ssh eieio # -initial for ESTs nice bin/gbAlignStep -srcDb=genbank -type=est -initial droYak1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 droYak1 & # Wow, very very little for yakuba, even for ESTs! featureBits droYak1 intronEst #277291 bases of 169423277 (0.164%) in intersection featureBits droYak1 est #420049 bases of 169423277 (0.248%) in intersection # Clean up: rm -rf work/initial.droYak1 # MAKE GCPERCENT (DONE 7/21/04 angie) ssh hgwdev mkdir /cluster/data/droYak1/bed/gcPercent cd /cluster/data/droYak1/bed/gcPercent # create and load gcPercent table hgsql droYak1 < ~/src/hg/lib/gcPercent.sql hgGcPercent droYak1 ../../nib # MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 7/21/04 angie) ssh hgwdev echo 'insert into blatServers values("droYak1", "blat7", "17780", 1, 0); \ insert into blatServers values("droYak1", "blat7", "17781", 0, 1);' \ | hgsql -h genome-testdb hgcentraltest # MAKE Drosophila Proteins track ssh kksilo mkdir -p /cluster/data/droYak1/blastDb cd /cluster/data/droYak1/blastDb for i in ../*/*/*_[0-9]*_[0-9]*.fa; do ln -s $i .; done for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done rm *.fa *.log ssh kkr1u00 mkdir -p /iscratch/i/droYak1/blastDb cp /cluster/data/droYak1/blastDb/* /iscratch/i/droYak1/blastDb (~kent/bin/iSync) 2>&1 > sync.out mkdir -p /cluster/data/droYak1/bed/tblastn.dm1FB cd /cluster/data/droYak1/bed/tblastn.dm1FB ls -1S /iscratch/i/droYak1/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst exit # back to kksilo cd /cluster/data/droYak1/bed/tblastn.dm1FB mkdir fbfa # calculate a reasonable number of jobs calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\) # 18735/(150000/463) = 57.828700 split -l 58 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT f=/tmp/`basename $3` for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/droYak1/jkStuff/subLift.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/droYak1/jkStuff/liftAll.lft warn $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.3 exit 1 '_EOF_' chmod +x blastSome gensub2 bug.lst fb.lst blastGsub blastSpec ssh kk cd /cluster/data/droYak1/bed/tblastn.dm1FB para create blastSpec para shove # jobs will need to be restarted, but they should all finish # Completed: 150012 of 150012 jobs # CPU time in finished jobs: 2892894s 48214.90m 803.58h 33.48d 0.092 y # IO & Wait Time: 748046s 12467.43m 207.79h 8.66d 0.024 y # Average job time: 24s 0.40m 0.01h 0.00d # Longest job: 562s 9.37m 0.16h 0.01d # Submission to last job: 14761s 246.02m 4.10h 0.17d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/droYak1/bed/tblastn.dm1FB para create chainSpec para push # Completed: 324 of 324 jobs # CPU time in finished jobs: 409s 6.81m 0.11h 0.00d 0.000 y # IO & Wait Time: 8539s 142.32m 2.37h 0.10d 0.000 y # Average job time: 28s 0.46m 0.01h 0.00d # Longest job: 89s 1.48m 0.02h 0.00d # Submission to last job: 585s 9.75m 0.16h 0.01d exit # back to kksilo cd /cluster/data/droYak1/bed/tblastn.dm1FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > ../preblastDm1FB.psl cd .. blatDir=/cluster/data/dm1/bed/blat.dm1FB protDat -fb preblastDm1FB.psl $blatDir/dm1FB.psl $blatDir/dm1FB.mapNames blastDm1FB.psl ssh hgwdev cd /cluster/data/droYak1/bed/tblastn.dm1FB hgLoadPsl droYak1 blastDm1FB.psl exit # back to kksilo rm -rf blastOut # End tblastn # MAKE 11.OOC FILE FOR BLAT (DONE 11/4/04 angie) # Use -repMatch=100 (based on size -- for human we use 1024, and # fly size is ~4.4% of human judging by gapless dm1 genome size from # featureBits -- we would use 45, but bump that up a bit to be more # conservative). ssh kkr1u00 mkdir /cluster/bluearc/droYak1 blat /cluster/data/droYak1/droYak1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/droYak1/11.ooc -repMatch=100 #Wrote 7023 overused 11-mers to /cluster/bluearc/droYak1/11.ooc cp -p /cluster/bluearc/droYak1/*.ooc /iscratch/i/droYak1/ iSync # SWAP CHAINS FROM DM2, BUILD NETS ETC. (REDONE 5/23/05 angie) # Originally done 3/2/05 -- redone (better params) 5/23/05. mkdir /cluster/data/droYak1/bed/blastz.dm2.swap cd /cluster/data/droYak1/bed/blastz.dm2.swap doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.droYak1/DEF \ >& do.log & tail -f do.log # Add {chain,net}Dm2 to trackDb.ra if necessary. # MAKE Drosophila Proteins track (DONE braney 06-30-05) ssh kk mkdir -p /cluster/data/droYak1/bed/tblastn.dm2FB cd /cluster/data/droYak1/bed/tblastn.dm2FB ls -1S /iscratch/i/droYak1/blastDb/*.nsq | sed "s/\.nsq//" > target.lst mkdir fbfa # calculate a reasonable number of jobs calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(164630/`wc target.lst | awk "{print \\\$1}"`\) # 18929/(164630/452) = 51.970528 split -l 52 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir -p /cluster/bluearc/droYak1/bed/tblastn.dm2FB/blastOut ln -s /cluster/bluearc/droYak1/bed/tblastn.dm2FB/blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/droYak1/jkStuff/subLift.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/droYak1/jkStuff/liftAll.lft warn $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 $f.4 exit 1 '_EOF_' chmod +x blastSome gensub2 target.lst fb.lst blastGsub blastSpec para create blastSpec para push # Completed: 164980 of 164980 jobs # CPU time in finished jobs: 1200354s 20005.90m 333.43h 13.89d 0.038 y # IO & Wait Time: 648791s 10813.19m 180.22h 7.51d 0.021 y # Average job time: 11s 0.19m 0.00h 0.00d # Longest finished job: 194s 3.23m 0.05h 0.00d # Submission to last job: 23476s 391.27m 6.52h 0.27d ssh kki cd /cluster/data/droYak1/bed/tblastn.dm2FB tcsh cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec para create chainSpec para push # Completed: 153 of 153 jobs # CPU time in finished jobs: 8279s 137.98m 2.30h 0.10d 0.000 y # IO & Wait Time: 4933s 82.22m 1.37h 0.06d 0.000 y # Average job time: 86s 1.44m 0.02h 0.00d # Longest finished job: 3445s 57.42m 0.96h 0.04d # Submission to last job: 3823s 63.72m 1.06h 0.04d cd /cluster/data/droYak1/bed/tblastn.dm2FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/droYak1/bed/tblastn.dm2FB/blastDm2FB.psl cd .. ssh hgwdev cd /cluster/data/droYak1/bed/tblastn.dm2FB hgLoadPsl droYak1 blastDm2FB.psl exit # back to kksilo rm -rf blastOut # End tblastn # GENEID PREDICTIONS FROM IMIM (DONE 7/26/05 angie) ssh hgwdev mkdir /cluster/data/droYak1/bed/geneid cd /cluster/data/droYak1/bed/geneid foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genome.imim.es/genepredictions/D.yakuba/golden_path_200404/geneidv1.2/$chr.gtf end ldHgGene -gtf -genePredExt droYak1 geneid *.gtf # GENEMAPPER PREDICTIONS FROM UCB (DONE 1/24/06 angie) ssh hgwdev mkdir /cluster/data/droYak1/bed/geneMapper cd /cluster/data/droYak1/bed/geneMapper wget http://bio.math.berkeley.edu/genemapper/GFF/rel0.2/DroYak_1.gff # Get rid of custom track header lines: egrep -v '^(track|browser) ' DroYak_1.gff > geneMapper.gtf # Don't use -genePredExt... there are no start/stop_codon items, so # all get marked "incmpl", and name2 always gets the same value as name. ldHgGene -gtf droYak1 geneMapper geneMapper.gtf # SWAP CHAINS FROM DROSIM1, BUILD NETS ETC. (DONE 6/26/06 angie) mkdir /cluster/data/droYak1/bed/blastz.droSim1.swap cd /cluster/data/droYak1/bed/blastz.droSim1.swap doBlastzChainNet.pl -swap /cluster/data/droSim1/bed/blastz.droYak1/DEF \ >& do.log & tail -f do.log ln -s blastz.droSim1.swap /cluster/data/droYak1/bed/blastz.droSim1