# sequence from Webb Miller ssh kkstore02 mkdir -p /cluster/store10/ancestor/canHg9/fromWebb ln -s /cluster/store10/ancestor/canHg9 /cluster/data/canHg4 cd /cluster/data/canHg9/fromWebb zcat borEut.gz | sed "s/ borEut/chr15/" > canHg9.fa cd /cluster/data/canHg9/fromRico zcat borEut.gz | cat ../fromWebb/canHg9.fa - | sed "s/ borEut/cftr/" > canHg4.fa scaffoldFaToAgp canHg9.fa sed "s/chrUn/chr15/g" *.gap > tmp.gap mv tmp.gap canHg9.gap sed "s/chrUn/chr15/g" *.agp > tmp.agp mv tmp.agp canHg9.agp cd .. mkdir 15 mv fromWebb/canHg9.agp 15/chr15.agp mv fromWebb/canHg9.lft 15/chr15.lft mkdir jkStuff # Create chromosome FA file from AGP and file of masked scaffolds cd 15 agpToFa -simpleMultiMixed chr15.agp chr15 chr15.fa ../fromWebb/canHg9.fa cd .. # STORE SEQUENCE AND ASSEMBLY INFORMATION # Translate to nib cd /cluster/data/canHg9 mkdir nib for i in */split do f=`dirname $i` faToNib $f/$f.fa nib/$f.nib done faToTwoBit */*.fa canHg9.2bit cd 15 splitFaIntoContigs chr15.agp chr15.fa . -nSize=5000000 cd .. echo 15 > chrom.lst mv 15 old.15 mv old.15/15 15 cat 15/lift/*.lft > jkStuff/liftAll.lft cd 15 for i in chr*_*; do cd $i; faSplit gap $i.fa 500000 "$i"_ -lift=$i.lft -minGapSize=100; cd ..; done cd .. # CREATING DATABASE # Create the database. ssh hgwdev cd /cluster/data/canHg9 echo 'create database canHg9' | hgsql '' mkdir -p /gbdb/canHg9/nib ln -s /cluster/data/canHg9/canHg4.2bit /gbdb/canHg4/ # Make symbolic links from /gbdb/canHg9/nib to the real nibs. mkdir -p /gbdb/canHg9/nib ln -s /cluster/data/canHg9/nib/*.nib /gbdb/canHg4/nib hgsql canHg9 < $HOME/kent/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib canHg9 /gbdb/canHg4/nib */*.fa echo "select chrom,size from chromInfo" | hgsql -N canHg9 > chrom.sizes # create assembly and gap tracks hgGoldGapGl -noGl -chromLst=chrom.lst canHg9 /cluster/data/canHg4 . echo "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" | hgsql canHg9 echo 'insert into blatServers values("canHg9", "blat6", "17778", "1","0"); \ insert into blatServers values("canHg9", "blat6", "17779", "0","1");' \ | hgsql -h genome-testdb hgcentraltest # MAKE GCPERCENT mkdir -p /cluster/data/canHg9/bed/gcPercent cd /cluster/data/canHg9/bed/gcPercent hgsql canHg9 < ~/kent/src/hg/lib/gcPercent.sql # load gcPercent table hgGcPercent canHg9 ../../nib cd ../.. # Add dbDb and defaultDb entries: echo 'insert into dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk) values("canHg9", "July 14. 2005", "/gbdb/canHg9/nib", "Dog/Human", "", 1, \ 44, "Dog/Human", "BoreoEutheria", "/gbdb/canHg9/html/description.html", 0);' \ | hgsql -h genome-testdb hgcentraltest # #echo 'INSERT INTO defaultDb VALUES ("Dog/Human", "canHg9");' | hgsql -h genome-testdb hgcentraltest echo 'update defaultDb set name = "canHg9" where genome = "Dog/Human"' | hgsql -h genome-testdb hgcentraltest # #echo 'insert into genomeClade (genome, clade, priority) values ("BoreoEutheria", "vertebrate",65);' \ # | hgsql -h genome-testdb hgcentraltest # echo 'INSERT INTO defaultDb VALUES ("Dog/Human", "canHg9");' | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: cd $HOME/kent/src/hg/makeDb/trackDb mkdir -p borEut/canHg9 cvs add borEut cvs add borEut/canHg9 cvs ci borEut # Edit that makefile to add canHg9 in all the right places and do make update # SIMPLE REPEATS (TRF) ssh kkstore02 mkdir -p /cluster/data/canHg9/bed/simpleRepeat cd /cluster/data/canHg9/bed/simpleRepeat mkdir trf tcsh cp /dev/null jobs.csh foreach d (/cluster/data/canHg9/*/split) foreach f ($d/*.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" >> jobs.csh end end screen csh -ef jobs.csh # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l endsInLf trf/* # When job is done do: liftUp simpleRepeat.bed /cluster/data/canHg9/jkStuff/liftAll.lft warn trf/*.bed # Load into the database: ssh hgwdev hgLoadBed canHg9 simpleRepeat /cluster/data/canHg9/bed/simpleRepeat/simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits canHg9 simpleRepeat # 302498 bases of 65811035 (0.460%) in intersection # 302498 bases of 67269975 (0.450%) in intersection # 308592 bases of 67269975 (0.459%) in intersection # 321169 bases of 67460583 (0.476%) in intersection # 321414 bases of 67472088 (0.476%) in intersection # in canHg3 290048 bases of 64944656 (0.447%) in int # in borEut2 213442 bases of 62796757 (0.340%) in intersection # in borEut1 591095 bases of 70435997 (0.839%) in intersection # REPEAT MASKING #- Make the run directory and job list: cd /cluster/data/canHg9 tcsh cat << '_EOF_' > jkStuff/RMCanHg #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/canHg9/$2 /bin/cp $2 /tmp/canHg9/$2/ cd /tmp/canHg9/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2 popd /bin/cp /tmp/canHg9/$2/$2.out ./ if (-e /tmp/canHg9/$2/$2.align) /bin/cp /tmp/canHg4/$2/$2.align ./ if (-e /tmp/canHg9/$2/$2.tbl) /bin/cp /tmp/canHg4/$2/$2.tbl ./ if (-e /tmp/canHg9/$2/$2.cat) /bin/cp /tmp/canHg4/$2/$2.cat ./ /bin/rm -fr /tmp/canHg9/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/canHg9/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/canHg9 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMCanHg mkdir RMRun for i in `cat chrom.lst` do f=$i/*/*_*_*.fa for j in $f do d=`dirname $j` f=`basename $j` echo "/cluster/data/canHg9/jkStuff/RMCanHg /cluster/data/canHg4/$d $f {check out line+ /cluster/data/canHg4/$d/$f.out} " done done > RMRun/RMJobs #- Do the run ssh kk cd /cluster/data/canHg9/RMRun para create RMJobs para try, para check, para check, para push, para check,... # Completed: 132 of 132 jobs # CPU time in finished jobs: 1429795s 23829.92m 397.17h 16.55d 0.045 y # IO & Wait Time: 51412s 856.86m 14.28h 0.60d 0.002 y # Average job time: 11221s 187.02m 3.12h 0.13d # Longest finished job: 12593s 209.88m 3.50h 0.15d # Submission to last job: 12629s 210.48m 3.51h 0.15d # Completed: 135 of 135 jobs # CPU time in finished jobs: 1446975s 24116.24m 401.94h 16.75d 0.046 y # IO & Wait Time: 54030s 900.51m 15.01h 0.63d 0.002 y # Average job time: 11119s 185.31m 3.09h 0.13d # Longest finished job: 12443s 207.38m 3.46h 0.14d # Submission to last job: 12551s 209.18m 3.49h 0.15d # Completed: 135 of 135 jobs # CPU time in finished jobs: 1448115s 24135.25m 402.25h 16.76d 0.046 y # IO & Wait Time: 25745s 429.09m 7.15h 0.30d 0.001 y # Average job time: 10917s 181.96m 3.03h 0.13d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 12323s 205.38m 3.42h 0.14d # Submission to last job: 12446s 207.43m 3.46h 0.14d # Completed: 135 of 135 jobs # CPU time in finished jobs: 1447724s 24128.73m 402.15h 16.76d 0.046 y # IO & Wait Time: 10980s 183.00m 3.05h 0.13d 0.000 y # Average job time: 10805s 180.09m 3.00h 0.13d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 12471s 207.85m 3.46h 0.14d # Submission to last job: 12481s 208.02m 3.47h 0.14d # Completed: 130 of 130 jobs # CPU time in finished jobs: 1390313s 23171.88m 386.20h 16.09d 0.044 y # IO & Wait Time: 14211s 236.86m 3.95h 0.16d 0.000 y # Average job time: 10804s 180.07m 3.00h 0.13d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 12585s 209.75m 3.50h 0.15d # Submission to last job: 12609s 210.15m 3.50h 0.15d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kkstore02 cd /cluster/data/canHg9 tcsh foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (`cat chrom.lst`) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif if (-e lift/random.lft && ! -z lift/random.lft) then liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \ > /dev/null endif cd .. end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/canHg9 hgLoadOut canHg9 */chr*.fa.out featureBits canHg9 rmsk # 17150700 bases of 65811035 (26.061%) in intersection # 18597816 bases of 67460583 (27.568%) in intersection # 18036581 bases of 64944656 (27.772%) in intersection # PROCESS SIMPLE REPEATS INTO MASK # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kkstore02 cd /cluster/data/canHg9/bed/simpleRepeat mkdir -p trfMask for i in trf/*.bed do awk '{if ($5 <= 12) print;}' $i > trfMask/`basename $i` done # Lift up filtered trf output to chrom coords as well: cd /cluster/data/canHg9 mkdir bed/simpleRepeat/trfMaskChrom foreach c (`cat chrom.lst`) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # Here's the coverage for the filtered TRF: ssh hgwdev cat /cluster/data/canHg9/bed/simpleRepeat/trfMaskChrom/*.bed > /tmp/filtTrf.bed featureBits canHg9 /tmp/filtTrf.bed # 126308 bases of 65811035 (0.192%) in intersection # 121086 bases of 64944656 (0.186%) in intersection # 111742 bases of 62796757 (0.178%) in intersection # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF ssh kkstore02 cd /cluster/data/canHg9 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (*/*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end # WARNING: negative rEnd: -3 chr15:11003133-11003355 L1MB7 # WARNING: negative rEnd: -426 chr15:32589194-32589360 L1MCc foreach c (`cat car.names`) echo "repeat- and trf-masking contigs of chr$c, chr${c}_random" foreach d ($c/*) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end #- Rebuild the nib files, using the soft masking in the fa: foreach f (*/chr*.fa) faToNib -softMask $f nib/$f:t:r.nib end # Make one big 2bit file as well, and make a link to it in # /gbdb/canHg9/nib because hgBlat looks there: faToTwoBit */chr*.fa canHg9.2bit ln -s /cluster/data/canHg9/canHg4.2bit /gbdb/canHg4/nib/ # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS ssh kkr1u00 # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /iscratch/i/canHg9/nib mkdir /iscratch/i/canHg9/nib cp -p /cluster/data/canHg9/nib/chr*.nib /iscratch/i/canHg4/nib # Pseudo-contig fa that have been repeat- and trf-masked: rm -rf /iscratch/i/canHg9/trfFa mkdir /iscratch/i/canHg9/trfFa foreach d (/cluster/data/canHg9/*/chr*_?{,?}) cp $d/$d:t.fa /iscratch/i/canHg9/trfFa end cp -p /cluster/data/canHg9/canHg4.2bit /iscratch/i/canHg4/ iSync # MAKE 10.OOC, 11.OOC FILES FOR BLAT ssh kolossus mkdir /cluster/data/canHg9/bed/ooc mkdir -p /cluster/bluearc/canHg9/ cd /cluster/data/canHg9/bed/ooc ls -1 /cluster/data/canHg9/nib/chr*.nib > nib.lst blat nib.lst /dev/null /dev/null -tileSize=11 -makeOoc=/cluster/bluearc/canHg9/11.ooc -repMatch=1024 # Wrote 8 overused 11-mers to /cluster/bluearc/canHg9/11.ooc blat nib.lst /dev/null /dev/null -tileSize=10 -makeOoc=/cluster/bluearc/canHg9/10.ooc -repMatch=843 # Wrote 67 overused 10-mers to /cluster/bluearc/canHg9/10.ooc ssh kkr1u00 cp -p /cluster/bluearc/canHg9/*.ooc /iscratch/i/canHg4/ iSync > sync.out # MAKE Human Proteins track ssh kkstore02 mkdir -p /cluster/data/canHg9/blastDb cd /cluster/data/canHg9/blastDb awk "{print \$2}" ../*/chr*/*.lft > subChr.lst for i in `cat subChr.lst` do ln -s ../*/chr*/$i.fa echo formatdb -i $i.fa -p F formatdb -i $i.fa -p F done rm *.log *.fa list cd .. for i in `cat chrom.lst`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft ssh kkr1u00 rm -rf /iscratch/i/canHg9/blastDb mkdir -p /iscratch/i/canHg9/blastDb cd /cluster/data/canHg9/blastDb for i in nhr nin nsq; do cp *.$i /iscratch/i/canHg9/blastDb ; echo $i; done cd iSync > sync.out mkdir -p /cluster/data/canHg9/bed/tblastn.hg17KG cd /cluster/data/canHg9/bed/tblastn.hg17KG echo /panasas/store/braney/carV9/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst # back to kkstore02 exit cd /cluster/data/canHg9/bed/tblastn.hg17KG rm -rf /cluster/bluearc/canHg9/bed/tblastn.hg17KG/kgfa mkdir -p /cluster/bluearc/canHg9/bed/tblastn.hg17KG/kgfa split -l 70 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.chr13.psl /cluster/bluearc/canHg9/bed/tblastn.hg17KG/kgfa/kg ln -s /cluster/bluearc/canHg9/bed/tblastn.hg17KG/kgfa kgfa cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst rm -rf /cluster/bluearc/canHg9/bed/tblastn.hg17KG/blastOut mkdir -p /cluster/bluearc/canHg9/bed/tblastn.hg17KG/blastOut ln -s /cluster/bluearc/canHg9/bed/tblastn.hg17KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2 liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/canHg9/bed/tblastn.hg17KG para create blastSpec para push # Completed: 1320 of 1320 jobs # CPU time in finished jobs: 48667s 811.12m 13.52h 0.56d 0.002 y # IO & Wait Time: 11837s 197.28m 3.29h 0.14d 0.000 y # Average job time: 46s 0.76m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 117s 1.95m 0.03h 0.00d # Submission to last job: 302s 5.03m 0.08h 0.00d # Completed: 1280 of 1280 jobs # CPU time in finished jobs: 46610s 776.84m 12.95h 0.54d 0.001 y # IO & Wait Time: 40126s 668.76m 11.15h 0.46d 0.001 y # Average job time: 68s 1.13m 0.02h 0.00d # Longest finished job: 127s 2.12m 0.04h 0.00d # Submission to last job: 203s 3.38m 0.06h 0.00d # Completed: 1350 of 1350 jobs # CPU time in finished jobs: 49304s 821.73m 13.70h 0.57d 0.002 y # IO & Wait Time: 11577s 192.96m 3.22h 0.13d 0.000 y # Average job time: 45s 0.75m 0.01h 0.00d # Longest finished job: 99s 1.65m 0.03h 0.00d # Submission to last job: 419s 6.98m 0.12h 0.00d # Completed: 1350 of 1350 jobs # CPU time in finished jobs: 48174s 802.90m 13.38h 0.56d 0.002 y # IO & Wait Time: 11493s 191.55m 3.19h 0.13d 0.000 y # Average job time: 44s 0.74m 0.01h 0.00d # Longest finished job: 101s 1.68m 0.03h 0.00d # Submission to last job: 352s 5.87m 0.10h 0.00d # Completed: 81405 of 81405 jobs # CPU time in finished jobs: 2806780s 46779.66m 779.66h 32.49d 0.089 y # IO & Wait Time: 634146s 10569.11m 176.15h 7.34d 0.020 y # Average job time: 42s 0.70m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 141s 2.35m 0.04h 0.00d # Submission to last job: 5998s 99.97m 1.67h 0.07d # Completed: 78390 of 78390 jobs # CPU time in finished jobs: 2724404s 45406.73m 756.78h 31.53d 0.086 y # IO & Wait Time: 1013006s 16883.43m 281.39h 11.72d 0.032 y # Average job time: 48s 0.79m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 390s 6.50m 0.11h 0.00d # Submission to last job: 5652s 94.20m 1.57h 0.07d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' ssh kki cd /cluster/data/canHg9/bed/tblastn.hg17KG tcsh cat << '_EOF_' > chainOne simpleChain -prot -outPsl -maxGap=200000 $1 $2 '_EOF_' chmod +x chainOne for j in blastOut/kg??.sort; do for i in $j/*; do echo chainOne $i $j/c.`basename $i`; done; done > chainSpec para create chainSpec para push # Completed: 10 of 10 jobs # CPU time in finished jobs: 28s 0.47m 0.01h 0.00d 0.000 y # IO & Wait Time: 44s 0.73m 0.01h 0.00d 0.000 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 10s 0.17m 0.00h 0.00d # Submission to last job: 10s 0.17m 0.00h 0.00d # Completed: 10 of 10 jobs # CPU time in finished jobs: 11s 0.19m 0.00h 0.00d 0.000 y # IO & Wait Time: 25s 0.41m 0.01h 0.00d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 7s 0.12m 0.00h 0.00d # Submission to last job: 7s 0.12m 0.00h 0.00d # Completed: 603 of 603 jobs # CPU time in finished jobs: 549s 9.16m 0.15h 0.01d 0.000 y # IO & Wait Time: 1589s 26.48m 0.44h 0.02d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 19s 0.32m 0.01h 0.00d # Submission to last job: 167s 2.78m 0.05h 0.00d exit # back to kkstore02 cd /cluster/data/canHg9/bed/tblastn.hg17KG/blastOut for i in kg??.sort do cat $i/c.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > $i/c60.psl sort -rn $i/c60.psl | pslUniq stdin $i/u.psl awk "((\$1 / \$11) ) > 0.90 { print }" $i/c60.psl > $i/m60.psl echo $i done sort -u -T /tmp -k 14,14 -k 17,17n -k 17,17n */u.psl */m60* > /cluster/data/canHg9/bed/tblastn.hg17KG/blastHg17KG.psl cd .. ssh hgwdev cd /cluster/data/canHg9/bed/tblastn.hg17KG hgLoadPsl canHg9 blastHg17KG.psl # 1425966 bases of 64944656 (2.196%) # 1368835 bases of 62796757 (2.180%) in borEut2 # back to kkstore02 rm -rf blastOut # End tblastn # PRODUCING GENSCAN PREDICTIONS ssh hgwdev mkdir /cluster/data/canHg9/bed/genscan cd /cluster/data/canHg9/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/canHg9/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f genome.list touch genome.list foreach f ( `ls -1S /cluster/data/canHg9/*/chr*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList # did one job on kolossus # Convert these to chromosome level files as so: ssh kkstore02 cd /cluster/data/canHg9/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/canHg9/bed/genscan ldHgGene canHg9 genscan genscan.gtf hgPepPred canHg9 generic genscanPep genscan.pep hgLoadBed canHg9 genscanSubopt genscanSubopt.bed #END GENSCAN # MAKE Human Known Gene mRNAs Mapped by BLATZ track (hg17 # Create working directory and extract known gene mrna from database mkdir -p /cluster/data/canHg9/bed/blatz.hg17KG cd /cluster/data/canHg9/bed/blatz.hg17KG pepPredToFa hg17 knownGeneMrna known.fa # Split known genes into pieces and load on /iscratch/i temp device ssh kkr1u00 cd /iscratch/i/canHg9 mkdir knownRna cd knownRna faSplit sequence /cluster/data/canHg9/bed/blatz.hg17KG/known.fa 580 known cd iSync > sync.out ssh kk cd /cluster/data/canHg9/bed/blatz.hg17KG # Do parasol job to align via blatz mkdir run0 cd run0 mkdir psl # awk '{print $1;}' /cluster/data/canHg9/chrom.sizes > genome.lst ls -1S /iscratch/i/canHg9/trfFa/*.fa > genome.lst for i in `cat genome.lst` ; do f=`basename $i .fa`; mkdir psl/$f ; done for i in /iscratch/i/canHg3/knownRna/*.fa ; do echo $i; done > mrna.lst cat << '_EOF_' > doBlatz #!/bin/sh f=/tmp/`basename $1`.`basename $2` if blatz -rna -minScore=6000 -out=psl $1 $2 $f.1 then if liftUp -nosort -type=".psl" -nohead $f.2 ../../../jkStuff/liftAll.lft carry $f.1 then mv $f.2 $3 rm $f.1 exit 0 fi fi rm $f.1 $f.2 exit 1 '_EOF_' chmod +x doBlatz cat << '_EOF_' > gsub #LOOP doBlatz $(path1) $(path2) {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 genome.lst mrna.lst gsub spec para create spec # Then do usual para try/para push/para time # Completed: 1146 of 1146 jobs # CPU time in finished jobs: 108857s 1814.28m 30.24h 1.26d 0.003 y # IO & Wait Time: 819365s 13656.09m 227.60h 9.48d 0.026 y # Average job time: 810s 13.50m 0.22h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2985s 49.75m 0.83h 0.03d # Submission to last job: 2997s 49.95m 0.83h 0.03d # Completed: 6303 of 6303 jobs # CPU time in finished jobs: 920417s 15340.29m 255.67h 10.65d 0.029 y # IO & Wait Time: 23131s 385.51m 6.43h 0.27d 0.001 y # Average job time: 150s 2.49m 0.04h 0.00d # Longest job: 1064s 17.73m 0.30h 0.01d # Submission to last job: 1632s 27.20m 0.45h 0.02d # Do sort and near-besting on file server ssh kkstore02 cd /cluster/data/canHg9/bed/blatz.hg17KG/run0 catDir -r psl | pslFilter -minScore=100 -minAli=255 -noHead stdin stdout | sort -k 10 > ../filtered.psl cd .. pslReps filtered.psl nearBest.psl /dev/null -nohead -minAli=0.8 -nearTop=0.01 -minCover=0.8 sort -k 14 nearBest.psl > blatzHg17KG.psl # Clean up rm -r run0/psl # Load into database ssh hgwdev cd /cluster/data/canHg9/bed/blatz.hg17KG hgLoadPsl canHg9 blatzHg17KG.psl #END BLATZ # BLASTZ SWAP FOR hg17 vs canHg9 BLASTZ TO CREATE canHg4 vs hg17 BLASTZ ssh kolossus mkdir -p /cluster/data/canHg9/bed/blastz.hg17 cd /cluster/data/canHg9/bed/blastz.hg17 set aliDir = /cluster/data/hg17/bed/blastz.canHg9 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom # 409M /cluster/data/hg17/bed/blastz.canHg9/axtChrom # 409M unsorted # 409M axtChrom rm -r unsorted # CHAIN HUMAN BLASTZ # Run axtChain on little cluster ssh kki cd /cluster/data/canHg9/bed/blastz.hg17 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/canHg9/bed/blastz.hg17/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/canHg9/nib \ /iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # Completed: 1 of 1 jobs # CPU time in finished jobs: 109s 1.81m 0.03h 0.00d 0.000 y # IO & Wait Time: 104s 1.73m 0.03h 0.00d 0.000 y # Average job time: 212s 3.53m 0.06h 0.00d # Longest finished job: 212s 3.53m 0.06h 0.00d # Submission to last job: 212s 3.53m 0.06h 0.00d # now on the cluster server, sort chains ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.hg17/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=4000 /tmp/score.$f:t:r echo "" end mv all.chain all.chain.unfiltered chainFilter -minScore=50000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered # Load chains into database ssh hgwdev cd /cluster/data/canHg9/bed/blastz.hg17/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain canHg9 ${c}_chainHg17 $i end featureBits canHg9 chainHg17Link # 52252036 bases of 62796757 (83.208%) in intersection # NET HUMAN BLASTZ ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.hg17/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/canHg9/bed/blastz.hg17/axtChain netClass noClass.net canHg9 hg17 human.net # Make a 'syntenic' subset: ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.hg17/axtChain rm noClass.net # Make a 'syntenic' subset of these with netFilter -syn human.net > humanSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/canHg9/bed/blastz.hg17/axtChain netFilter -minGap=10 human.net | hgLoadNet canHg9 netHg17 stdin netFilter -minGap=10 humanSyn.net | hgLoadNet canHg9 netSyntenyHg17 stdin # Add entries for chainHg17, netHg17, syntenyHg17 to dog/canHg9 trackDb # GENERATE HG17 MAF FOR MULTIZ FROM NET ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.hg17/axtChain netSplit human.net net cd /cluster/data/canHg9/bed/blastz.hg17 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/canHg9/nib \ /cluster/data/hg17/nib stdout | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f /cluster/data/canHg9/chrom.sizes /cluster/data/hg17/chrom.sizes $maf -tPrefix=canHg4. -qPrefix=hg17. end # BLASTZ SWAP FOR canFam1 vs canHg9 BLASTZ TO CREATE canHg4 vs canFam1 BLASTZ ssh kolossus mkdir -p /cluster/data/canHg9/bed/blastz.canFam1 cd /cluster/data/canHg9/bed/blastz.canFam1 set aliDir = /cluster/data/canFam1/bed/blastz.canHg9 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom # 207M /cluster/data/canFam1/bed/blastz.canHg9/axtChrom # 207M unsorted # 207M axtChrom rm -rf unsorted # CHAIN DOG BLASTZ # Run axtChain on little cluster ssh kki cd /cluster/data/canHg9/bed/blastz.canFam1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/canHg9/bed/blastz.canFam1/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/canHg9/nib \ /iscratch/i/canFam1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # Completed: 1 of 1 jobs # CPU time in finished jobs: 86s 1.44m 0.02h 0.00d 0.000 y # IO & Wait Time: 7s 0.11m 0.00h 0.00d 0.000 y # Average job time: 93s 1.55m 0.03h 0.00d # Longest finished job: 93s 1.55m 0.03h 0.00d # Submission to last job: 93s 1.55m 0.03h 0.00d # now on the cluster server, sort chains ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.canFam1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=4000 /tmp/score.$f:t:r echo "" end mv all.chain all.chain.unfiltered chainFilter -minScore=50000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered & # Load chains into database ssh hgwdev cd /cluster/data/canHg9/bed/blastz.canFam1/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain canHg9 ${c}_chainCanFam1 $i end featureBits canHg9 chainCanFam1Link # 51373603 bases of 62796757 (81.809%) in intersection # GENBANK/REFSEQ alignments: - added borEut to: /cluster/data/genbank/etc/genbank.conf src/hg/makeDb/genbank/src/lib/gbGenome.c src/hg/makeDb/genbank/src/align/gbBlat cvs commit the changes cd src/hg/makeDb/genbank jkmake install-server ssh kkstore02 cd /cluster/data/genbank ./bin/gbAlignStep -initial canHg9 & # load into database ssh hgwdev cd /cluster/data/genbank/ ./bin/gbDbLoadStep -initialLoad canHg9 & # BLASTZ EutGli1 ssh kk mkdir /cluster/data/canHg9/bed/blastz.eutGli1 cd /cluster/data/canHg9/bed/blastz.eutGli1 cat << '_EOF_' > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Default BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 SEQ1_DIR=/iscratch/i/canHg9/nib SEQ1_RMSK= SEQ1_FLAG= # SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog SEQ1_SMSK= SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=1000000 SEQ1_LAP=10000 SEQ2_DIR=/iscratch/i/eutGli1/nib SEQ2_RMSK= SEQ2_FLAG= #SEQ2_SMSK=/scratch/hg/canHg9/linSpecRep.notInHuman SEQ2_SMSK= SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=1000000 SEQ2_LAP=0 BASE=/cluster/data/canHg9/bed/blastz.eutGli1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy /cluster/data/hg17/jkStuff/BlastZ_run0.sh cd run.0 para push # Completed: 4899 of 4899 jobs # CPU time in finished jobs: 918787s 15313.12m 255.22h 10.63d 0.029 y # IO & Wait Time: 21355s 355.91m 5.93h 0.25d 0.001 y # Average job time: 192s 3.20m 0.05h 0.00d # Longest job: 2206s 36.77m 0.61h 0.03d # Submission to last job: 9206s 153.43m 2.56h 0.11d ssh kki cd /cluster/data/canHg9/bed/blastz.eutGli1 /cluster/data/hg17/jkStuff/BlastZ_run1.sh cd run.1 para push # Completed: 71 of 71 jobs # CPU time in finished jobs: 1139s 18.99m 0.32h 0.01d 0.000 y # IO & Wait Time: 797s 13.28m 0.22h 0.01d 0.000 y # Average job time: 27s 0.45m 0.01h 0.00d # Longest job: 75s 1.25m 0.02h 0.00d # Submission to last job: 282s 4.70m 0.08h 0.00d ssh kk cd /cluster/data/canHg9/bed/blastz.eutGli1 /cluster/data/hg17/jkStuff/BlastZ_run2.sh cd run.2 para push # Completed: 1 of 1 jobs # CPU time in finished jobs: 362s 6.04m 0.10h 0.00d 0.000 y # IO & Wait Time: 1464s 24.39m 0.41h 0.02d 0.000 y # Average job time: 1826s 30.43m 0.51h 0.02d # Longest job: 1826s 30.43m 0.51h 0.02d # Submission to last job: 1826s 30.43m 0.51h 0.02d # END BLASTZ eutGli1 # CHAIN EutGli1 BLASTZ # Run axtChain on little cluster ssh kki cd /cluster/data/canHg9/bed/blastz.eutGli1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/canHg9/bed/blastz.eutGli1/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh /cluster/bin/x86_64/axtChain $1 \ /iscratch/i/canHg9/nib \ /iscratch/i/eutGli1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # Completed: 1 of 1 jobs # CPU time in finished jobs: 197s 3.28m 0.05h 0.00d 0.000 y # IO & Wait Time: 88s 1.47m 0.02h 0.00d 0.000 y # Average job time: 285s 4.75m 0.08h 0.00d # Longest job: 285s 4.75m 0.08h 0.00d # Submission to last job: 285s 4.75m 0.08h 0.00d # now on the cluster server, sort chains ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.eutGli1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=4000 /tmp/score.$f:t:r echo "" end mv all.chain all.chain.unfiltered chainFilter -minScore=50000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered & # Load chains into database ssh hgwdev cd /cluster/data/canHg9/bed/blastz.eutGli1/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain canHg9 ${c}_chainEutGli1 $i end featureBits canHg9 chainEutGli1Link # 66936604 bases of 70435997 (95.032%) in intersection # NET eutGli1 BLASTZ ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.eutGli1/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/canHg9/bed/blastz.eutGli1/axtChain netClass -noAr noClass.net canHg9 eutGli1 eutGli1.net # Make a 'syntenic' subset: ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.eutGli1/axtChain rm noClass.net # Make a 'syntenic' subset of these with netFilter -syn eutGli1.net > eutGli1Syn.net # Load the nets into database ssh hgwdev cd /cluster/data/canHg9/bed/blastz.eutGli1/axtChain netFilter -minGap=10 eutGli1.net | hgLoadNet canHg9 netEutGli1 stdin netFilter -minGap=10 eutGli1Syn.net | hgLoadNet canHg9 netSyntenyEutGli1 stdin # Add entries for chainHg17, netHg17, syntenyHg17 to dog/canHg9 trackDb # GENERATE EutGli1 MAF FOR MULTIZ FROM NET ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.eutGli1/axtChain netSplit eutGli1.net net cd /cluster/data/canHg9/bed/blastz.eutGli1 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/canHg9/nib \ /cluster/data/eutGli1/nib stdout | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f /cluster/data/canHg9/chrom.sizes /cluster/data/eutGli1/chrom.sizes $maf -tPrefix=canHg4. -qPrefix=eutGli1. end # BLASTZ EutPri1 ssh kk mkdir /cluster/data/canHg9/bed/blastz.eutPri1 cd /cluster/data/canHg9/bed/blastz.eutPri1 cat << '_EOF_' > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Default BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 SEQ1_DIR=/iscratch/i/canHg9/nib SEQ1_RMSK= SEQ1_FLAG= # SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog SEQ1_SMSK= SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=1000000 SEQ1_LAP=10000 SEQ2_DIR=/iscratch/i/eutPri1/nib SEQ2_RMSK= SEQ2_FLAG= #SEQ2_SMSK=/scratch/hg/canHg9/linSpecRep.notInHuman SEQ2_SMSK= SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=1000000 SEQ2_LAP=0 BASE=/cluster/data/canHg9/bed/blastz.eutPri1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy /cluster/data/hg17/jkStuff/BlastZ_run0.sh cd run.0 para push # Completed: 6319 of 6319 jobs # CPU time in finished jobs: 144839s 2413.98m 40.23h 1.68d 0.005 y # IO & Wait Time: 21882s 364.71m 6.08h 0.25d 0.001 y # Average job time: 26s 0.44m 0.01h 0.00d # Longest job: 2517s 41.95m 0.70h 0.03d # Submission to last job: 3900s 65.00m 1.08h 0.05d ssh kki cd /cluster/data/canHg9/bed/blastz.eutPri1 /cluster/data/hg17/jkStuff/BlastZ_run1.sh cd run.1 para push # Completed: 71 of 71 jobs # CPU time in finished jobs: 72s 1.21m 0.02h 0.00d 0.000 y # IO & Wait Time: 211s 3.51m 0.06h 0.00d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest job: 13s 0.22m 0.00h 0.00d # Submission to last job: 52s 0.87m 0.01h 0.00d ssh kk cd /cluster/data/canHg9/bed/blastz.eutPri1 /cluster/data/hg17/jkStuff/BlastZ_run2.sh cd run.2 para push # Completed: 1 of 1 jobs # CPU time in finished jobs: 52s 0.86m 0.01h 0.00d 0.000 y # IO & Wait Time: 26s 0.44m 0.01h 0.00d 0.000 y # Average job time: 78s 1.30m 0.02h 0.00d # Longest job: 78s 1.30m 0.02h 0.00d # Submission to last job: 80s 1.33m 0.02h 0.00d # END BLASTZ eutPri1 # CHAIN EutPri1 BLASTZ # Run axtChain on little cluster ssh kki cd /cluster/data/canHg9/bed/blastz.eutPri1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/canHg9/bed/blastz.eutPri1/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh /cluster/bin/x86_64/axtChain $1 \ /iscratch/i/canHg9/nib \ /iscratch/i/eutPri1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # Completed: 1 of 1 jobs # CPU time in finished jobs: 72s 1.20m 0.02h 0.00d 0.000 y # IO & Wait Time: 7s 0.12m 0.00h 0.00d 0.000 y # Average job time: 79s 1.32m 0.02h 0.00d # Longest job: 79s 1.32m 0.02h 0.00d # Submission to last job: 79s 1.32m 0.02h 0.00d # now on the cluster server, sort chains ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.eutPri1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=4000 /tmp/score.$f:t:r echo "" end mv all.chain all.chain.unfiltered chainFilter -minScore=50000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered & # Load chains into database ssh hgwdev cd /cluster/data/canHg9/bed/blastz.eutPri1/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain canHg9 ${c}_chainEutPri1 $i end featureBits canHg9 chainEutPri1Link # 65181722 bases of 70435997 (92.540%) in intersection # NET eutPri1 BLASTZ ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.eutPri1/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/canHg9/bed/blastz.eutPri1/axtChain netClass -noAr noClass.net canHg9 eutPri1 eutPri1.net # Make a 'syntenic' subset: ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.eutPri1/axtChain rm noClass.net # Make a 'syntenic' subset of these with netFilter -syn eutPri1.net > eutPri1Syn.net # Load the nets into database ssh hgwdev cd /cluster/data/canHg9/bed/blastz.eutPri1/axtChain netFilter -minGap=10 eutPri1.net | hgLoadNet canHg9 netEutPri1 stdin netFilter -minGap=10 eutPri1Syn.net | hgLoadNet canHg9 netSyntenyEutPri1 stdin # Add entries for chainHg17, netHg17, syntenyHg17 to dog/canHg9 trackDb # GENERATE EutPri1 MAF FOR MULTIZ FROM NET ssh kkstore02 cd /cluster/data/canHg9/bed/blastz.eutPri1/axtChain netSplit eutPri1.net net cd /cluster/data/canHg9/bed/blastz.eutPri1 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/canHg9/nib \ /cluster/data/eutPri1/nib stdout | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f /cluster/data/canHg9/chrom.sizes /cluster/data/eutPri1/chrom.sizes $maf -tPrefix=canHg4. -qPrefix=eutPri1. end # Load into database ssh hgwdev cd /cluster/data/canHg9/bed/multizDescend /cluster/bin/pen mkdir -p /gbdb/canHg9/multiz4way ln -s /cluster/data/canHg9/bed/multizDescend/multiz4way.maf /gbdb/canHg4/multiz4way hgLoadMaf canHg9 multiz4way for i in eutGli1 eutPri1 hg17 do f="$i"_netBlastz mkdir -p /gbdb/canHg9/multiz4way/$f ln -s /cluster/data/canHg9/bed/blastz.$i/chr15.maf /gbdb/canHg4/multiz4way/$f/$i.maf hgLoadMaf canHg9 $f -pathPrefix=/gbdb/canHg4/multiz4way/$f done # Load the 9-way with ancestors ssh kkilo mkdir -p /cluster/data/canHg9/bed/webbMaf cd /cluster/data/canHg9/bed/webbMaf /cluster/bin/penn/maf_project canHg9.maf canHg4 > canHg4.local.maf mkdir -p /gbdb/canHg9/multiz4way ln -s /cluster/data/canHg9/bed/multizDescend/multiz4way.maf /gbdb/canHg4/multiz4way hgLoadMaf canHg9 multiz4way #load Webb's maf mkdir -p /cluster/data/canHg9/bed/tba14way zcat ../../fromWebb/borEut.maf.gz | sed "s/chimp/panTro1/g" |sed "s/cow/bosTau1/g" |sed "s/primate/eutPri2.chr15/g" | sed "s/euarch/eutGli2.chr15/g" | sed "s/boreo/canHg9.chr15/g" | sed "s/macaca/rheMac1/g" > global.maf time /cluster/bin/penn/maf_project global.maf canHg9 > canHg4.local.maf mkdir -p /gbdb/canHg9/tba14way ln -s /cluster/data/canHg9/bed/webbMaf/canHg4.local.maf /gbdb/canHg4/webbMaf/webbMaf.maf awk "! /^s/ { print} /^s/ {if (\$3 + \$4 > \$6) print \$1, \$2, \$3, \$4,\$5,\$3+\$4, \$7; else print \$0}" canHg9.local.maf > fixed.local.maf /cluster/bin/penn/maf_order canHg9.local.maf canHg4 mmHg3 panHg3 hg17 panTro1 rheMac1 rabbit mm6 rn3 canFam1 bosTau1 armadillo elephant > canHg4.order.maf hgLoadMaf canHg9 webbMaf -pathPrefix=/gbdb/canHg4/webbMaf mkdir /cluster/data/canHg9/bed/tba12way cd /cluster/data/canHg9/bed/tba12way wget "http://www.bx.psu.edu/~webb/borEut.maf.gz" nice bash grep "^s" *local* | sed "s/\..*$//" | sort -u > species.lst time /cluster/bin/penn/maf_order canHg9.local.maf canHg4 eutGli2 eutPri2 hg17 panTro1 rabbit mouse rat canFam1 bosTau1 armadillo elephant > canHg4.order.maf awk "! /^s/ { print} /^s/ {if (\$3 + \$4 > \$6) print \$1, \$2, \$3, \$4,\$5,\$3+\$4, \$7; else print \$0}" canHg9.local.maf > fixed.local.maf mkdir -p /gbdb/canHg9/tba12way ln -s /cluster/data/canHg9/bed/tba12way/canHg4.order.maf /gbdb/canHg4/tba12way/tba12way.maf hgLoadMaf canHg9 tba12way -pathPrefix=/gbdb/canHg4/tba12way cat canHg9.order.maf | hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 canHg4 tba12waySummary stdin # MAKE Mouse Proteins track ssh kkstore02 mkdir -p /cluster/data/canHg9/blastDb cd /cluster/data/canHg9/blastDb awk "{print \$2}" ../*/chr*/*.lft > subChr.lst for i in `cat subChr.lst` do ln -s ../*/chr*/$i.fa echo formatdb -i $i.fa -p F formatdb -i $i.fa -p F done rm *.log *.fa list cd .. for i in `cat chrom.lst`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft ssh kkr1u00 rm -rf /iscratch/i/canHg9/blastDb mkdir -p /iscratch/i/canHg9/blastDb cd /cluster/data/canHg9/blastDb for i in nhr nin nsq; do cp *.$i /iscratch/i/canHg9/blastDb ; echo $i; done cd iSync > sync.out mkdir -p /cluster/data/canHg9/bed/tblastn.mm6KG cd /cluster/data/canHg9/bed/tblastn.mm6KG echo /iscratch/i/canHg9/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst # back to kkstore02 exit cd /cluster/data/canHg9/bed/tblastn.mm6KG rm -rf /cluster/bluearc/canHg9/bed/tblastn.mm6KG/kgfa mkdir -p /cluster/bluearc/canHg9/bed/tblastn.mm6KG/kgfa split -l 70 /cluster/data/mm6/bed/blat.mm6KG/mm6KG.psl /cluster/bluearc/canHg9/bed/tblastn.mm6KG/kgfa/kg ln -s /cluster/bluearc/canHg9/bed/tblastn.mm6KG/kgfa kgfa cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst rm -rf /cluster/bluearc/canHg9/bed/tblastn.mm6KG/blastOut mkdir -p /cluster/bluearc/canHg9/bed/tblastn.mm6KG/blastOut ln -s /cluster/bluearc/canHg9/bed/tblastn.mm6KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2 liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/mm6/bed/blat.mm6KG/protein.lft warn $f.4 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/canHg9/bed/tblastn.mm6KG para create blastSpec para push # Completed: 37620 of 37620 jobs # CPU time in finished jobs: 1022938s 17048.96m 284.15h 11.84d 0.032 y # IO & Wait Time: 275126s 4585.44m 76.42h 3.18d 0.009 y # Average job time: 35s 0.58m 0.01h 0.00d # Longest finished job: 111s 1.85m 0.03h 0.00d # Submission to last job: 1684s 28.07m 0.47h 0.02d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' ssh kki cd /cluster/data/canHg9/bed/tblastn.mm6KG tcsh cat << '_EOF_' > chainOne (cd $1; cat q."$2"* | simpleChain -prot -outPsl -maxGap=200000 stdin ../c.`basename $1`.$2.psl) '_EOF_' chmod +x chainOne for j in blastOut/kg??; do for i in `cat ../../chrom.lst`; do echo chainOne $j chr"$i"; done ; done > chainSpec para create chainSpec para push # Completed: 285 of 285 jobs # CPU time in finished jobs: 239s 3.98m 0.07h 0.00d 0.000 y # IO & Wait Time: 731s 12.19m 0.20h 0.01d 0.000 y # Average job time: 3s 0.06m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 14s 0.23m 0.00h 0.00d # Submission to last job: 85s 1.42m 0.02h 0.00d exit # back to kkstore02 cd /cluster/data/canHg9/bed/tblastn.mm6KG/blastOut for i in kg?? do cat c.$i.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.90 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -u -T /tmp -k 14,14 -k 17,17n -k 17,17n u.*.psl m60* > /cluster/data/canHg9/bed/tblastn.mm6KG/blastMm6KG.psl cd .. ssh hgwdev cd /cluster/data/canHg9/bed/tblastn.mm6KG hgLoadPsl canHg9 blastHg17KG.psl # 1425966 bases of 64944656 (2.196%) # 1368835 bases of 62796757 (2.180%) in borEut2 # back to kkstore02 rm -rf blastOut # End tblastn ########################################################################## # EVOFOLD - RNA secondary structure predictions lifted from hg17 # Jakob Skou Pedersen, Aug 5, 2005 ssh -C hgwdev mkdir -p /cluster/data/canHg9/bed/evofold cd /cluster/data/canHg9/bed/evofold # concatenating chain files cat /cluster/data/canHg9/bed/chains.hg17/hg17/*chain > hg17ToCanHg9.over.chain # lifting evofolds from hg17 to canHg9 echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql hg17 | sed -e 1d > foldsHg17.bed liftOver -minMatch=1.0 foldsHg17.bed hg17ToCanHg9.over.chain tmp.bed unmapped.bed grep "#" unmapped.bed | wc -l # remove elements which are wrong size after lifting awk '$3-$2 == $7' tmp.bed > foldsCanHg9.bed # clean up rm foldsHg17.bed unmapped.bed tmp.bed hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql canHg9 evofold foldsCanHg9.bed ########################################################################## # RFAMSEEDFOLDS - Rfam seed secondary structure annotation lifted from hg17 # Jakob Skou Pedersen, Aug 5, 2005 # lifting rfamSeed from hg17 to canHg9 echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from RfamSeedFolds;" | hgsql hg17 | sed -e 1d > foldsHg17.bed liftOver -minMatch=1.0 foldsHg17.bed hg17ToCanHg9.over.chain tmp.bed unmapped.bed grep "#" unmapped.bed | wc -l # remove elements which are of wrong size after lifting awk '$3-$2 == $7' tmp.bed > foldsCanHg9.bed # clean up rm foldsHg17.bed unmapped.bed tmp.bed hgLoadBed -noBin -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/RfamSeedFolds.sql canHg9 RfamSeedFolds foldsCanHg9.bed