# for emacs: -*- mode: sh; -*- # Drosophila yakuba -- # # WUSTL's # http://www.genome.wustl.edu/??? # # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+---------------------------------+ # | tableName | type | # +-------------+---------------------------------+ # | xenoRefGene | genePred xenoRefPep xenoRefMrna | # | genscan | genePred genscanPep | # +-------------+---------------------------------+ # DOWNLOAD SEQUENCE AND AGP (DONE 11/2/05 angie) ssh kkstore03 mkdir /cluster/store6/droYak2 cd /cluster/data ln -s /cluster/store6/droYak2 droYak2 cd /cluster/data/droYak2 mkdir downloads cd downloads wget ftp://genome.wustl.edu/private/lhillier/old/dyak2.tar.gz tar xvzf dyak2.tar.gz cd dyak2 faSize *.fa #168647858 bases (5982851 N's 162665007 real 162665007 upper 0 lower) in 20 sequences in 20 files #Total size: mean 8432392.9 sd 10946150.8 min 31700 (chr4_random) max 28832112 (chr3R) median 3277457 #N count: mean 299142.5 sd 709171.6 #U count: mean 8133250.3 sd 10635266.7 #L count: mean 0.0 sd 0.0 foreach c (2L 2R 2h 3L 3R 3h 4 U Uh X Xh Yh) mkdir ../../$c if (-e chr$c.fa) then mv chr$c.fa ../../$c mv chr$c.agp ../../$c endif if (-e chr${c}_random.fa) then mv chr${c}_random.fa ../../$c mv chr${c}_random.agp ../../$c endif end cd ../.. # download mitochondrion sequence mkdir M cd M # go to http://www.ncbi.nih.gov/ and search Genome for # "yakuba mitochondrion". That shows the gi number: # 5834829 # Use that number in the entrez linking interface to get fasta: wget -O chrM.fa \ 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834829&dopt=FASTA' # Edit chrM.fa: make sure the long fancy header line says it's the # Drosophila yakuba mitochondrion complete genome, and then replace the # header line with just ">chrM". cd .. # checkAgpAndFa prints out way too much info -- keep the end/stderr only: foreach c (?{,?}) foreach agp ($c/chr$c{,_random}.agp) if (-e $agp) then set fa = $agp:r.fa echo checking consistency of $agp and $fa checkAgpAndFa $agp $fa | tail -1 endif end end # see what kind of gap types we have: awk '$5 == "N" {print $7;}' */chr*.agp | uniq | sort | uniq #contig #fragment # biggest gap size is 3000: awk '$5 == "N" {print $6;}' */chr*.agp | sort -nr | head -1 # rough hist of gap sizes from AGP: awk '$5 == "N" {print $6;}' */chr*.agp | textHistogram stdin -binSize=100 # fancy hist of gap size from FASTA (10, 250, 1000, 3000 overrepresented): cat */chr*.fa | faGapSizes -niceSizes=10,50,100,250,300,700,1000,3000 stdin # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 11/2/05 angie) ssh kkstore03 cd /cluster/data/droYak2 foreach c (?{,?}) foreach agp ($c/chr$c{,_random}.agp) if (-e $agp) then set fa = $agp:r.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak splitFaIntoContigs $agp $fa . -nSize=5000000 endif end end # splitFaIntoContigs makes new dirs for _randoms. Move their contents # back into the main chrom dirs and get rid of the _random dirs. foreach d (*_random) set base = `echo $d | sed -e 's/_random$//'` mkdir -p $base/lift mv $d/lift/oOut.lst $base/lift/rOut.lst mv $d/lift/ordered.lft $base/lift/random.lft mv $d/lift/ordered.lst $base/lift/random.lst rmdir $d/lift mv $d/* $base rmdir $d end # Make a "pseudo-contig" for processing chrM too: mkdir M/chrM_1 sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa mkdir M/lift echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst echo "chrM_1" > M/lift/ordered.lst set msize = `faSize M/chrM.fa | awk '{print $1;}'` echo "0\tM/chrM_1\t$msize\tchrM\t$msize" > M/lift/ordered.lft # MAKE JKSTUFF AND BED DIRECTORIES (DONE 11/2/05 angie) # This used to hold scripts -- better to keep them inline here so # they're in CVS. Now it should just hold lift file(s) and # temporary scripts made by copy-paste from this file. mkdir /cluster/data/droYak2/jkStuff # This is where most tracks will be built: mkdir /cluster/data/droYak2/bed # CREATING DATABASE AND GRP TABLE (DONE 11/2/05 angie) # Create the database. ssh hgwdev # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql hgsql -e 'create database droYak2' hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from droYak1.grp" \ droYak2 # MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) 2BIT (DONE 11/2/05 angie) # Make nib/, unmasked until RepeatMasker and TRF steps are done. # Do this now so we can load up RepeatMasker and run featureBits; # can also load up other tables that don't depend on masking. ssh kkstore03 cd /cluster/data/droYak2 faToTwoBit ?{,?}/chr*.fa droYak2.2bit mkdir bed/chromInfo twoBitInfo droYak2.2bit stdout \ | awk '{print $1 "\t" $2 "\t/gbdb/droYak2/droYak2.2bit";}' \ > bed/chromInfo/chromInfo.tab # Make symbolic links from /gbdb/droYak2/nib to the real nibs. ssh hgwdev mkdir /gbdb/droYak2 ln -s /cluster/data/droYak2/droYak2.2bit /gbdb/droYak2/ # Load /gbdb/droYak2/droYak2.2bit paths into database and save size info. cd /cluster/data/droYak2 hgsql droYak2 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql droYak2 -e 'load data local infile \ "/cluster/data/droYak2/bed/chromInfo/chromInfo.tab" \ into table chromInfo;' echo "select chrom,size from chromInfo" | hgsql -N droYak2 > chrom.sizes # take a look at chrom.sizes size wc chrom.sizes # 21 42 338 chrom.sizes # REPEATMASKER (DONE 11/4/05 angie) #- Split contigs into 500kb chunks, at gaps if possible: ssh kkstore03 cd /cluster/data/droYak2 foreach c (?{,?}) foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end #- Make the run directory and job list: cd /cluster/data/droYak2 cat << '_EOF_' > jkStuff/RMDrosophila #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/droYak2/$2 /bin/cp $2 /tmp/droYak2/$2/ cd /tmp/droYak2/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2 popd /bin/cp /tmp/droYak2/$2/$2.out ./ if (-e /tmp/droYak2/$2/$2.tbl) /bin/cp /tmp/droYak2/$2/$2.tbl ./ if (-e /tmp/droYak2/$2/$2.cat) /bin/cp /tmp/droYak2/$2/$2.cat ./ /bin/rm -fr /tmp/droYak2/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/droYak2/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/droYak2 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMDrosophila mkdir RMRun cp /dev/null RMRun/RMJobs foreach c (?{,?}) foreach d ($c/chr${c}{,_random}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/droYak2/jkStuff/RMDrosophila \ /cluster/data/droYak2/$d $f \ '{'check out line+ /cluster/data/droYak2/$d/$f.out'}' \ >> RMRun/RMJobs end end end # Do a dummy run to make sure that the libraries are unpacked # before kicking off a cluster run: /cluster/bluearc/RepeatMasker/RepeatMasker -spec drosophila /dev/null #- Do the run ssh kk cd /cluster/data/droYak2/RMRun para make RMJobs para time #Completed: 417 of 417 jobs #Average job time: 6212s 103.53m 1.73h 0.07d #Longest finished job: 8274s 137.90m 2.30h 0.10d #Submission to last job: 15830s 263.83m 4.40h 0.18d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kkstore03 cd /cluster/data/droYak2 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (?{,?}) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif if (-e lift/random.lft && ! -z lift/random.lft) then liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \ > /dev/null endif cd .. end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/droYak2 hgLoadOut droYak2 */chr*.fa.out # VERIFY REPEATMASKER RESULTS (DONE 11/15/05 angie) # Eyeball some repeat annotations in the browser, compare to lib seqs. # Run featureBits on droYak2 and on a comparable genome build, and compare: ssh hgwdev featureBits droYak2 rmsk #27434880 bases of 162681153 (16.864%) in intersection # compare to droYak1: featureBits droYak1 rmsk #29021326 bases of 169423277 (17.129%) in intersection # Interesting that both total #bases and %repetitive dropped in # the new assembly... same holds for just chr2L, so it's not just a # chr*_random difference: featureBits -chrom=chr2L droYak2 rmsk #1515320 bases of 22210025 (6.823%) in intersection featureBits -chrom=chr2L droYak1 rmsk #1554767 bases of 22382034 (6.946%) in intersection # LaDeana says this is as expected. # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 11/15/05 angie) # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add droYak2 in all the right places and do make update # Add trackDb directories mkdir drosophila/droYak2 cvs add drosophila/droYak2 cvs commit drosophila/droYak2 mkdir /gbdb/droYak2/html cvs commit makefile # go public on genome-test make alpha # Warning: genome and organism fields must correspond # with defaultDb values hgsql -h genome-testdb hgcentraltest \ -e 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("droYak2", "Nov. 2005", "/gbdb/droYak2/nib", "D. yakuba", \ "chr2L:809001-825000", 1, 53, "D. yakuba", \ "Drosophila yakuba", "/gbdb/droYak2/html/description.html", \ 0, 0, "WUSTL version 2.0");' # MAKE LIFTALL.LFT (DONE 11/2/05 angie) ssh kkstore03 cd /cluster/data/droYak2 cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft # SIMPLE REPEATS (TRF) (DONE 11/3/05 angie) ssh kkstore03 mkdir /cluster/data/droYak2/bed/simpleRepeat cd /cluster/data/droYak2/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach d (/cluster/data/droYak2/?{,?}/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) set fout = $f:t:r.bed echo $fout echo "trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end end tcsh jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn \ trf/*.bed > /dev/null # Load this into the database as so ssh hgwdev hgLoadBed droYak2 simpleRepeat \ /cluster/data/droYak2/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 11/3/05 angie) # make a filtered version of the trf output: # keep trf's with period <= 12: ssh kkstore03 cd /cluster/data/droYak2/bed/simpleRepeat mkdir trfMask foreach f (trf/*.bed) echo -n "filtering $f... " awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords: mkdir trfMaskChrom foreach f (../../?{,?}/chr*.fa) set c = $f:t:r liftUp trfMaskChrom/$c.bed ../../jkStuff/liftAll.lft warn \ trfMask/${c}_[0-9]*.bed > /dev/null end # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 11/15/05 angie) ssh kkstore03 cd /cluster/data/droYak2 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (*/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end foreach c (?{,?}) echo "repeat- and trf-masking contigs of chr$c, chr${c}_random" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end #- Rebuild the 2bit, using the soft masking in the fa: faToTwoBit ?{,?}/chr*.fa droYak2.2bit # GOLD AND GAP TRACKS (DONE 11/3/05 angie) ssh hgwdev cd /cluster/data/droYak2 cp /dev/null chrom.lst foreach f (?{,?}/chr*.agp chrM) echo $f:t:r >> chrom.lst end hgGoldGapGl -noGl -chromLst=chrom.lst droYak2 /cluster/data/droYak2 . # featureBits fails if there's no chrM_gap, so make one: # echo "create table chrM_gap like chr1_gap" | hgsql droYak2 # oops, that won't work until v4.1, so do this for the time being: hgsql droYak2 -e 'create table chrM_gap select * from chr2L_gap where 0=1' # MAKE GCPERCENT (DONE 11/3/05 angie) ssh hgwdev mkdir /cluster/data/droYak2/bed/gc5Base cd /cluster/data/droYak2/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 droYak2 \ /cluster/data/droYak2 | wigEncode stdin gc5Base.wig gc5Base.wib mkdir /gbdb/droYak2/wib ln -s `pwd`/gc5Base.wib /gbdb/droYak2/wib hgLoadWiggle -pathPrefix=/gbdb/droYak2/wib droYak2 gc5Base gc5Base.wig # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 11/15/05 angie) ssh kolossus cd /cluster/data/droYak2 #- Build the .tar.gz files -- no genbank for now. cat << '_EOF_' > jkStuff/zipAll.csh rm -rf bigZips mkdir bigZips tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked cd bed/simpleRepeat tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed cd ../.. '_EOF_' # << this line makes emacs coloring happy csh -ef ./jkStuff/zipAll.csh |& tee zipAll.log #- Look at zipAll.log to make sure all file lists look reasonable. cd bigZips md5sum *.gz > md5sum.txt # Make a README.txt cd .. mkdir chromGz foreach f ( ?{,?}/chr*.fa ) echo $f:t:r gzip -c $f > chromGz/$f:t.gz end cd chromGz md5sum *.gz > md5sum.txt # Make a README.txt #- Link the .gz and .txt files to hgwdev:/usr/local/apache/... ssh hgwdev set gp = /usr/local/apache/htdocs/goldenPath/droYak2 mkdir -p $gp/bigZips ln -s /cluster/data/droYak2/bigZips/{chrom*.tar.gz,*.txt} $gp/bigZips mkdir -p $gp/chromosomes ln -s /cluster/data/droYak2/chromGz/{chr*.gz,*.txt} $gp/chromosomes # Take a look at bigZips/* and chromosomes/* # Can't make refGene upstream sequence files - no refSeq for yakuba. mkdir $gp/database # Create README.txt files in database/ to explain the files. # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 11/15/05 angie) ssh kkr1u00 # Chrom-level 2bit that has been repeat- and trf-masked: mkdir /iscratch/i/droYak2 cp -p /cluster/data/droYak2/droYak2.2bit /iscratch/i/droYak2/ iSync # PRODUCING GENSCAN PREDICTIONS (DONE 11/15/05 angie) # Run on small cluster -- genscan needs big mem. ssh hgwdev mkdir /cluster/data/droYak2/bed/genscan cd /cluster/data/droYak2/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/droYak2/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) cp /dev/null genome.list foreach f ( `ls -1S /cluster/data/droYak2/*/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list #47 # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para make jobList para time #Completed: 47 of 47 jobs #Average job time: 109s 1.81m 0.03h 0.00d #Longest finished job: 225s 3.75m 0.06h 0.00d #Submission to last job: 396s 6.60m 0.11h 0.00d # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". # Convert these to chromosome level files as so: ssh kkstore03 cd /cluster/data/droYak2/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/droYak2/bed/genscan ldHgGene droYak2 genscan genscan.gtf hgPepPred droYak2 generic genscanPep genscan.pep hgLoadBed droYak2 genscanSubopt genscanSubopt.bed # MAKE 11.OOC FILE FOR BLAT (DONE 11/15/05 angie) # Use -repMatch=100 (based on size -- for human we use 1024, and # fly size is ~4.4% of human judging by gapless dm1 genome size from # featureBits -- we would use 45, but bump that up a bit to be more # conservative). ssh kkr1u00 mkdir /cluster/bluearc/droYak2 blat /cluster/data/droYak2/droYak2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/droYak2/11.ooc -repMatch=100 #Wrote 7237 overused 11-mers to /cluster/bluearc/droYak2/11.ooc cp -p /cluster/bluearc/droYak2/*.ooc /iscratch/i/droYak2/ iSync # AUTO UPDATE GENBANK MRNA RUN (DONE 11/23/05 angie) ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . make # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in # edits, check them in if necessary: diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf # Edit etc/genbank.conf and add these lines: # droYak2 (D. yakuba) droYak2.serverGenome = /cluster/data/droYak2/droYak2.2bit droYak2.clusterGenome = /iscratch/i/droYak2/droYak2.2bit droYak2.ooc = /iscratch/i/droYak2/11.ooc droYak2.lift = /cluster/data/droYak2/jkStuff/liftAll.lft droYak2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} droYak2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} droYak2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} droYak2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} droYak2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} droYak2.refseq.mrna.native.load = no droYak2.refseq.mrna.xeno.load = yes droYak2.genbank.mrna.xeno.load = yes droYak2.downloadDir = droYak2 cvs ci etc/genbank.conf # update /cluster/data/genbank/ make etc-update ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial droYak2 & # when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad droYak2 & featureBits droYak2 xenoRefGene #27309563 bases of 162681153 (16.787%) in intersection featureBits droYak2 mrna #376949 bases of 162681153 (0.232%) in intersection featureBits droYak2 xenoMrna #23932293 bases of 162681153 (14.711%) in intersection featureBits droYak2 est #1768531 bases of 162681153 (1.087%) in intersection # SWAP CHAINS FROM DM2, BUILD NETS ETC. (DONE 11/23/05 angie) mkdir /cluster/data/droYak2/bed/blastz.dm2.swap cd /cluster/data/droYak2/bed/blastz.dm2.swap doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.droYak2/DEF \ >& do.log & tail -f do.log # Add {chain,net}Dm2 to trackDb.ra if necessary. # MAKE THIS THE DEFAULT ASSEMBLY (DONE 11/23/05 angie) # -- when there are enough tracks! hgsql -h genome-testdb hgcentraltest \ -e 'UPDATE defaultDb set name = "droYak2" where genome = "D. yakuba"' # MAKE Drosophila Proteins track (DONE braney 2005-12-07) ssh kkstore03 mkdir -p /cluster/data/droYak2/blastDb cd /cluster/data/droYak2/blastDb awk "{print \$2}" ../*/chr*/*.lft > subChr.lst for i in `cat subChr.lst`; do ln -s ../*/*/$i.fa; done for i in *.fa; do /cluster/bluearc/blast2211x86_64/bin/formatdb -i $i -p F 2> /dev/null; done rm *.fa *.log cd .. cat */chr*/*.lft > jkStuff/subChr.lft ssh kk destDir=/cluster/panasas/home/store/droYak2/blastDb mkdir -p $destDir cp /cluster/data/droYak2/blastDb/* $destDir mkdir -p /cluster/data/droYak2/bed/tblastn.dm2FB cd /cluster/data/droYak2/bed/tblastn.dm2FB ls -1S $destDir/*.nsq | sed "s/\.nsq//" > target.lst mkdir fbfa # calculate a reasonable number of jobs calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl|awk "{print \\\$1}"`/\(80000/`wc target.lst | awk "{print \\\$1}"`\) # 18929/(80000/417) = 98.667412 split -l 99 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir -p /cluster/bluearc/droYak2/bed/tblastn.dm2FB/blastOut ln -s /cluster/bluearc/droYak2/bed/tblastn.dm2FB/blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/droYak2/jkStuff/subChr.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/droYak2/jkStuff/liftAll.lft warn $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 $f.4 exit 1 '_EOF_' chmod +x blastSome gensub2 target.lst fb.lst blastGsub blastSpec para create blastSpec para push # Completed: 80064 of 80064 jobs # CPU time in finished jobs: 2324537s 38742.28m 645.70h 26.90d 0.074 y # IO & Wait Time: 586197s 9769.96m 162.83h 6.78d 0.019 y # Average job time: 36s 0.61m 0.01h 0.00d # Longest finished job: 348s 5.80m 0.10h 0.00d # Submission to last job: 10775s 179.58m 2.99h 0.12d ssh kkstore03 cd /cluster/data/droYak2/bed/tblastn.dm2FB tcsh cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | /cluster/bin/i386/simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec para create chainSpec para push # Completed: 192 of 192 jobs # CPU time in finished jobs: 11624s 193.73m 3.23h 0.13d 0.000 y # IO & Wait Time: 3404s 56.74m 0.95h 0.04d 0.000 y # Average job time: 78s 1.30m 0.02h 0.00d # Longest finished job: 1681s 28.02m 0.47h 0.02d # Submission to last job: 1681s 28.02m 0.47h 0.02d cd /cluster/data/droYak2/bed/tblastn.dm2FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/droYak2/bed/tblastn.dm2FB/blastDm2FB.psl cd .. wc blastDm2FB.psl # 21136 443856 3623407 blastDm2FB.psl pslUniq blastDm2FB.psl stdout | wc # 18837 395577 3306944 cat fbfa/*fa | grep ">" | wc # 82338 82338 1300520 ssh hgwdev cd /cluster/data/droYak2/bed/tblastn.dm2FB hgLoadPsl droYak2 blastDm2FB.psl featureBits droYak2 blastDm2FB # 21478785 bases of 162681153 (13.203%) in intersection exit # back to kkstore03 rm -rf blastOut # End tblastn # CONTAMINATION (DONE 12/13/05 angie) # LaDeana emailed a list of 57 contaminated contigs on chrU -- # make a track out of her coords. She cleared them all to NNN's # in a 2.1 release for FlyBase, but all coords are the same and # we can just make a contamination track showing the affected # chrU contigs, instead of redoing the whole db. mkdir /cluster/data/droYak2/bed/contamination cd /cluster/data/droYak2/bed/contamination # saved off LaDeana's emailed list to contamContigs.txt wc -l contamContigs.txt #57 contamContigs.txt cp /dev/null contamination.bed foreach ctg (`cat contamContigs.txt`) fgrep -w $ctg ../../U/chrU.agp \ | awk '{print $1 "\t" $2-1 "\t" $3 "\t" $6;}' >> contamination.bed end wc -l contamination.bed #57 contamination.bed hgLoadBed droYak2 contamination contamination.bed # SWAP CHAINS FROM DROSIM1, BUILD NETS ETC. (DONE 8/1/06 angie) mkdir /cluster/data/droYak2/bed/blastz.droSim1.swap cd /cluster/data/droYak2/bed/blastz.droSim1.swap doBlastzChainNet.pl -swap /cluster/data/droSim1/bed/blastz.droYak2/DEF \ >& do.log & tail -f do.log ln -s blastz.droSim1.swap /cluster/data/droYak2/bed/blastz.droSim1 ########################################################################### # SWAP/CHAIN/NET DM3 (DONE 6/7/07 angie) ssh kkstore03 mkdir /cluster/data/droYak2/bed/blastz.dm3.swap cd /cluster/data/droYak2/bed/blastz.dm3.swap doBlastzChainNet.pl -swap /cluster/data/dm3/bed/blastz.droYak2/DEF >& do.log & tail -f do.log ln -s blastz.dm3.swap /cluster/data/droYak2/bed/blastz.dm3