# sequence from Mattheiu Blanchette 1/28/05 in /cluster/store8/blanchem/hsa13/ancestors1 ssh kksilo mkdir -p /cluster/store10/ancestor/eutPri1/sequence ln -s /cluster/store10/ancestor/eutPri1 /cluster/data/eutPri1 cd /cluster/data/eutPri1/sequence g=PREDICTIONANCESTRAL7 echo $g > list for i in /cluster/store8/blanchem/hsa13/ancestors1/* do f=`basename $i .ancestors` faSomeRecords $i list stdout | sed "s/ $g/$f/" | tr -d '-' > $f.fa echo $i done cat *.fa > eutPri1.fa scaffoldFaToAgp eutPri1.fa cd .. mkdir 15 sed -e "s/chrUn/chr15/g" sequence/eutPri1.agp > 15/chr15.agp sed -e "s/chrUn/chr15/g" sequence/eutPri1.lft > 15/chr15.lft cat */*/*.lft > jkStuff/liftAll.lft # Create chromosome FA file from AGP and file of masked scaffolds cd 15 agpToFa -simpleMultiMixed chr15.agp chr15 chr15.fa ../sequence/eutPri1.fa # CREATING DATABASE # Create the database. ssh hgwdev echo 'create database eutPri1' | hgsql '' # STORE SEQUENCE AND ASSEMBLY INFORMATION # Translate to nib cd /cluster/data/eutPri1 mkdir nib faToNib -softMask 15/chr15.fa nib/chr15.nib faToTwoBit */chr*.fa eutPri1.2bit mkdir -p /gbdb/eutPri1/nib ln -s /cluster/data/eutPri1/eutPri1.2bit /gbdb/eutPri1/nib/ ln -s /cluster/data/eutPri1/nib/chr15.nib /gbdb/eutPri1/nib hgsql eutPri1 < $HOME/kent/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib eutPri1 /gbdb/eutPri1/nib 15/chr15.fa echo "select chrom,size from chromInfo" | hgsql -N eutPri1 > chrom.sizes echo 15 > chrom.lst # create assembly and gap tracks hgGoldGapGl -noGl -chromLst=chrom.lst eutPri1 /cluster/data/eutPri1 . echo "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" | hgsql eutPri1 cd 15 splitFaIntoContigs chr15.agp chr15.fa . -nSize=5000000 cd .. # stupid move to restore sanity mv 15 old.15 mv old.15/15 15 # MAKE GCPERCENT mkdir -p /cluster/data/eutPri1/bed/gcPercent cd /cluster/data/eutPri1/bed/gcPercent hgsql eutPri1 < ~/src/hg/lib/gcPercent.sql # load gcPercent table hgGcPercent eutPri1 ../../nib cd ../.. echo 'INSERT INTO defaultDb VALUES ("Primate Ancestor", "eutPri1");' | hgsql -h genome-testdb hgcentraltest # Add dbDb and defaultDb entries: echo 'insert into dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk) \ values("eutPri1", "Jan. 2005", \ "/gbdb/eutPri1/nib", "Primate Ancestor", "chr15:13120186-13124117", 1, \ 46, "Primate Ancestor", "Primate Ancestor", \ "/gbdb/eutPri1/html/description.html", 0);' \ | hgsql -h genome-testdb hgcentraltest echo 'update defaultDb set name = "eutPri1" where genome = "Primate Ancestor"' | hgsql -h genome-testdb hgcentraltest echo 'insert into genomeClade (genome, clade, priority) values ("Primate Ancestor", "vertebrate",67);' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: cd $HOME/kent/src/hg/makeDb/trackDb mkdir -p eutPri/eutPri1 cvs add eutPri cvs add eutPri/eutPri1 cvs ci eutPri # Edit that makefile to add eutPri1 in all the right places and do make update # SIMPLE REPEATS (TRF) ssh kksilo mkdir -p /cluster/data/eutPri1/bed/simpleRepeat cd /cluster/data/eutPri1/bed/simpleRepeat mkdir trf tcsh cp /dev/null jobs.csh foreach d (/cluster/data/eutPri1/*/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end end csh -ef jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l endsInLf trf/* # When job is done do: liftUp simpleRepeat.bed /cluster/data/eutPri1/jkStuff/liftAll.lft warn trf/*.bed # Load into the database: ssh hgwdev hgLoadBed eutPri1 simpleRepeat \ /cluster/data/eutPri1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits eutPri1 simpleRepeat # 1251045 bases of 88781993 (1.409%) in intersection # REPEAT MASKING #- Make the run directory and job list: cd /cluster/data/eutPri1 cd 15 for i in chr*_*; do cd $i; faSplit gap $i.fa 500000 "$i"_ -lift=$i.lft -minGapSize=100; cd ..; done cd .. cat << '_EOF_' > jkStuff/RMHuman #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/eutPri1/$2 /bin/cp $2 /tmp/eutPri1/$2/ cd /tmp/eutPri1/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2 popd /bin/cp /tmp/eutPri1/$2/$2.out ./ if (-e /tmp/eutPri1/$2/$2.align) /bin/cp /tmp/eutPri1/$2/$2.align ./ if (-e /tmp/eutPri1/$2/$2.tbl) /bin/cp /tmp/eutPri1/$2/$2.tbl ./ if (-e /tmp/eutPri1/$2/$2.cat) /bin/cp /tmp/eutPri1/$2/$2.cat ./ /bin/rm -fr /tmp/eutPri1/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/eutPri1/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/eutPri1 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMHuman mkdir RMRun for i in `cat chrom.lst` do f=$i/*/*_*_*.fa for j in $f do d=`dirname $j` f=`basename $j` echo /cluster/data/eutPri1/jkStuff/RMHuman /cluster/data/eutPri1/$d $f \ '{'check out line+ /cluster/data/eutPri1/$d/$f.out'}' done done > RMRun/RMJobs #- Do the run ssh kk cd /cluster/data/eutPri1/RMRun para create RMJobs para try, para check, para check, para push, para check,... # Completed: 195 of 195 jobs # CPU time in finished jobs: 1591364s 26522.74m 442.05h 18.42d 0.050 y # IO & Wait Time: 4786s 79.76m 1.33h 0.06d 0.000 y # Average job time: 8185s 136.42m 2.27h 0.09d # Longest job: 10708s 178.47m 2.97h 0.12d # Submission to last job: 10759s 179.32m 2.99h 0.12d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kksilo cd /cluster/data/eutPri1 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (`cat chrom.lst`) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif if (-e lift/random.lft && ! -z lift/random.lft) then liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \ > /dev/null endif cd .. end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/eutPri1 hgLoadOut eutPri1 */chr*.fa.out # Strange perc. field -14.9 line 53256 of 15/chr15.fa.out # Strange perc. field -54.5 line 53256 of 15/chr15.fa.out # Loading up table chr15_rmsk # note: 24 records dropped due to repStart > repEnd # PROCESS SIMPLE REPEATS INTO MASK # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kksilo cd /cluster/data/eutPri1/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd /cluster/data/eutPri1 mkdir bed/simpleRepeat/trfMaskChrom foreach c (`cat chrom.lst`) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # Here's the coverage for the filtered TRF: ssh hgwdev cat /cluster/data/eutPri1/bed/simpleRepeat/trfMaskChrom/*.bed > /tmp/filtTrf.bed featureBits eutPri1 /tmp/filtTrf.bed # 535895 bases of 88781993 (0.604%) in intersection # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF ssh kksilo cd /cluster/data/eutPri1 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (*/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end # WARNING: negative rEnd: -10 chr15:5787338-5787377 FRAM # WARNING: negative rEnd: -203 chr15:22088023-22089143 L1MEc # WARNING: negative rEnd: -140 chr15:41818311-41818422 L1MA8 foreach c (`cat chrom.lst`) echo "repeat- and trf-masking contigs of chr$c, chr${c}_random" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end #- Rebuild the nib files, using the soft masking in the fa: mkdir nib foreach f (*/chr*.fa) faToNib -softMask $f nib/$f:t:r.nib end # Writing 88841993 bases in 44421005 bytes # Make one big 2bit file as well, and make a link to it in # /gbdb/eutPri1/nib because hgBlat looks there: faToTwoBit */chr*.fa eutPri1.2bit ln -s /cluster/data/eutPri1/eutPri1.2bit /gbdb/eutPri1/nib/ # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS ssh kkr1u00 # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /iscratch/i/eutPri1/nib mkdir -p /iscratch/i/eutPri1/nib cp -p /cluster/data/eutPri1/nib/chr*.nib /iscratch/i/eutPri1/nib # Pseudo-contig fa that have been repeat- and trf-masked: rm -rf /iscratch/i/eutPri1/trfFa mkdir /iscratch/i/eutPri1/trfFa foreach d (/cluster/data/eutPri1/*/chr*_?{,?}) cp $d/$d:t.fa /iscratch/i/eutPri1/trfFa end cp -p /cluster/data/eutPri1/eutPri1.2bit /iscratch/i/eutPri1/ iSync # MAKE 10.OOC, 11.OOC FILES FOR BLAT ssh kolossus mkdir /cluster/data/eutPri1/bed/ooc mkdir -p /cluster/bluearc/eutPri1/ cd /cluster/data/eutPri1/bed/ooc ls -1 /cluster/data/eutPri1/nib/chr*.nib > nib.lst blat nib.lst /dev/null /dev/null -tileSize=11 -makeOoc=/cluster/bluearc/eutPri1/11.ooc -repMatch=1024 # Wrote 8 overused 11-mers to /cluster/bluearc/eutPri1/11.ooc blat nib.lst /dev/null /dev/null -tileSize=10 -makeOoc=/cluster/bluearc/eutPri1/10.ooc -repMatch=843 # Wrote 67 overused 10-mers to /cluster/bluearc/eutPri1/10.ooc ssh kkr1u00 cp -p /cluster/bluearc/eutPri1/*.ooc /iscratch/i/eutPri1/ iSync # MAKE Human Proteins track ssh kksilo mkdir -p /cluster/data/eutPri1/blastDb cd /cluster/data/eutPri1/blastDb for i in `cat ../chrom.lst`; do ls -1 ../$i/*/*.fa ; done > list for i in `cat list` do f=`basename $i` sed "s/ .*$//" $i > $f formatdb -i $f -p F done rm *.log *.fa list cd .. # for i in `cat chrom.lst`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft ssh kkr1u00 mkdir -p /iscratch/i/eutPri1/blastDb cd /cluster/data/eutPri1/blastDb for i in nhr nin nsq; do cp *.$i /iscratch/i/eutPri1/blastDb ; echo $i; done cd iSync > sync.out mkdir -p /cluster/data/eutPri1/bed/tblastn.hg17KG cd /cluster/data/eutPri1/bed/tblastn.hg17KG echo /iscratch/i/eutPri1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst # back to kksilo exit cd /cluster/data/eutPri1/bed/tblastn.hg17KG mkdir -p /cluster/bluearc/eutPri1/bed/tblastn.hg17KG/kgfa split -l 70 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/eutPri1/bed/tblastn.hg17KG/kgfa/kg ln -s /cluster/bluearc/eutPri1/bed/tblastn.hg17KG/kgfa kgfa cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/eutPri1/bed/tblastn.hg17KG/blastOut ln -s /cluster/bluearc/eutPri1/bed/tblastn.hg17KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/liftAll.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 exit 1 '_EOF_' chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/eutPri1/bed/tblastn.hg17KG para create blastSpec para push # Completed: 6633 of 6633 jobs # CPU time in finished jobs: 2509978s 41832.96m 697.22h 29.05d 0.080 y # IO & Wait Time: 57960s 966.00m 16.10h 0.67d 0.002 y # Average job time: 387s 6.45m 0.11h 0.00d # Longest job: 2437s 40.62m 0.68h 0.03d # Submission to last job: 7992s 133.20m 2.22h 0.09d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q."$2"* | simpleChain -prot -outPsl -maxGap=200000 stdin ../c.`basename $1`.$2.psl) '_EOF_' chmod +x chainOne for j in blastOut/kg??; do for i in `cat ../../chrom.lst`; do echo chainOne $j chr"$i"; done ; done > chainSpec ssh kki cd /cluster/data/eutPri1/bed/tblastn.hg17KG para create chainSpec para push # Completed: 603 of 603 jobs # CPU time in finished jobs: 34193s 569.88m 9.50h 0.40d 0.001 y # IO & Wait Time: 2951s 49.18m 0.82h 0.03d 0.000 y # Average job time: 62s 1.03m 0.02h 0.00d # Longest job: 814s 13.57m 0.23h 0.01d # Submission to last job: 3282s 54.70m 0.91h 0.04d exit # back to kksilo cd /cluster/data/eutPri1/bed/tblastn.hg17KG/blastOut for i in kg?? do cat c.$i.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.90 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n | uniq > /cluster/data/eutPri1/bed/tblastn.hg17KG/blastHg17KG.psl cd .. ssh hgwdev cd /cluster/data/eutPri1/bed/tblastn.hg17KG hgLoadPsl eutPri1 blastHg17KG.psl # back to kksilo rm -rf blastOut # End tblastn # PRODUCING GENSCAN PREDICTIONS ssh hgwdev mkdir /cluster/data/eutPri1/bed/genscan cd /cluster/data/eutPri1/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/eutPri1/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f genome.list touch genome.list foreach f ( `ls -1S /cluster/data/eutPri1/*/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try, check, push, check, ... # Completed: 11 of 11 jobs # CPU time in finished jobs: 1865s 31.09m 0.52h 0.02d 0.000 y # IO & Wait Time: 39s 0.64m 0.01h 0.00d 0.000 y # Average job time: 173s 2.88m 0.05h 0.00d # Longest job: 253s 4.22m 0.07h 0.00d # Submission to last job: 253s 4.22m 0.07h 0.00d # Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/eutPri1/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/eutPri1/bed/genscan ldHgGene eutPri1 genscan genscan.gtf hgPepPred eutPri1 generic genscanPep genscan.pep hgLoadBed eutPri1 genscanSubopt genscanSubopt.bed #END GENSCAN # MAKE Human Known Gene mRNAs Mapped by BLATZ track (hg17 # Create working directory and extract known gene mrna from database ssh kk mkdir -p /cluster/data/eutPri1/bed/blatz.hg17KG cd /cluster/data/eutPri1/bed/blatz.hg17KG # Do parasol job to align via blatz mkdir run0 cd run0 mkdir psl # awk '{print $1;}' /cluster/data/eutPri1/chrom.sizes > genome.lst ls -1S /iscratch/i/eutPri1/trfFa/*.fa > genome.lst for i in `cat genome.lst` ; do f=`basename $i .fa`; mkdir psl/$f ; done for i in /iscratch/i/hg17/knownRna2/*.fa ; do echo $i; done > mrna.lst cat << '_EOF_' > doBlatz #!/bin/sh f=/tmp/`basename $1`.`basename $2` if blatz -rna -minScore=6000 -out=psl $1 $2 $f.1 then if liftUp -nosort -type=".psl" -nohead $f.2 ../../../jkStuff/liftAll.lft carry $f.1 then mv $f.2 $3 rm $f.1 exit 0 fi fi rm $f.1 $f.2 exit 1 '_EOF_' chmod +x doBlatz cat << '_EOF_' > gsub #LOOP doBlatz $(path1) $(path2) {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 genome.lst mrna.lst gsub spec para create spec # Then do usual para try/para push/para time # Completed: 6303 of 6303 jobs # CPU time in finished jobs: 920417s 15340.29m 255.67h 10.65d 0.029 y # IO & Wait Time: 23131s 385.51m 6.43h 0.27d 0.001 y # Average job time: 150s 2.49m 0.04h 0.00d # Longest job: 1064s 17.73m 0.30h 0.01d # Submission to last job: 1632s 27.20m 0.45h 0.02d # Do sort and near-besting on file server ssh kksilo cd /cluster/data/eutPri1/bed/blatz.hg17KG/run0 catDir -r psl | pslFilter -minScore=100 -minAli=255 -noHead stdin stdout | sort -k 10 > ../filtered.psl cd .. pslReps filtered.psl nearBest.psl /dev/null -nohead -minAli=0.8 -nearTop=0.01 -minCover=0.8 sort -k 14 nearBest.psl > blatzHg17KG.psl # Clean up rm -r run0/psl # Load into database ssh hgwdev cd /cluster/data/eutPri1/bed/blatz.hg17KG hgLoadPsl eutPri1 blatzHg17KG.psl #END BLATZ # BLASTZ SWAP FOR hg17 vs eutPri1 BLASTZ TO CREATE eutPri1 vs hg17 BLASTZ ssh kolossus mkdir -p /cluster/data/eutPri1/bed/blastz.hg17 cd /cluster/data/eutPri1/bed/blastz.hg17 set aliDir = /cluster/data/hg17/bed/blastz.eutPri1 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom # 409M /cluster/data/hg17/bed/blastz.eutPri1/axtChrom # 409M unsorted # 409M axtChrom rm -r unsorted # CHAIN HUMAN BLASTZ # Run axtChain on little cluster ssh kki cd /cluster/data/eutPri1/bed/blastz.hg17 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/eutPri1/bed/blastz.hg17/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/eutPri1/nib \ /iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # Completed: 1 of 1 jobs # CPU time in finished jobs: 189s 3.16m 0.05h 0.00d 0.000 y # IO & Wait Time: 11s 0.18m 0.00h 0.00d 0.000 y # Average job time: 200s 3.33m 0.06h 0.00d # Longest job: 200s 3.33m 0.06h 0.00d # Submission to last job: 200s 3.33m 0.06h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/eutPri1/bed/blastz.hg17/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=4000 /tmp/score.$f:t:r echo "" end mv all.chain all.chain.unfiltered chainFilter -minScore=50000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered # Load chains into database ssh hgwdev cd /cluster/data/eutPri1/bed/blastz.hg17/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain eutPri1 ${c}_chainHg17 $i end featureBits eutPri1 chainHg17Link # 54236215 bases of 70435997 (77.001%) in intersection # BLASTZ SWAP FOR borEut1 vs eutPri1 BLASTZ TO CREATE eutPri1 vs borEut1 BLASTZ ssh kolossus mkdir -p /cluster/data/eutPri1/bed/blastz.borEut1 cd /cluster/data/eutPri1/bed/blastz.borEut1 set aliDir = /cluster/data/borEut1/bed/blastz.eutPri1 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom # 409M /cluster/data/borEut1/bed/blastz.eutPri1/axtChrom # 409M unsorted # 409M axtChrom rm -rf unsorted # CHAIN DOG BLASTZ # Run axtChain on little cluster ssh kki cd /cluster/data/eutPri1/bed/blastz.borEut1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/eutPri1/bed/blastz.borEut1/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/eutPri1/nib \ /iscratch/i/borEut1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # Completed: 1 of 1 jobs # CPU time in finished jobs: 197s 3.28m 0.05h 0.00d 0.000 y # IO & Wait Time: 88s 1.47m 0.02h 0.00d 0.000 y # Average job time: 285s 4.75m 0.08h 0.00d # Longest job: 285s 4.75m 0.08h 0.00d # Submission to last job: 285s 4.75m 0.08h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/eutPri1/bed/blastz.borEut1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=4000 /tmp/score.$f:t:r echo "" end mv all.chain all.chain.unfiltered chainFilter -minScore=50000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered & # Load chains into database ssh hgwdev cd /cluster/data/eutPri1/bed/blastz.borEut1/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain eutPri1 ${c}_chainCanFam1 $i end featureBits eutPri1 chainHg17Link # 54236215 bases of 70435997 (77.001%) in intersection # BLASTZ Primate Ancestor (BORPRI1) ssh kk mkdir /cluster/data/borEut1/bed/blastz.eutPri1 cd /cluster/data/borEut1/bed/blastz.eutPri1 # Use default (Human-Mouse) settings for starters. cat << '_EOF_' > DEF # dog vs. borEut export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Default BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Dog SEQ1_DIR=/iscratch/i/borEut1/nib SEQ1_RMSK= SEQ1_FLAG= # SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog SEQ1_SMSK= SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: BorEut SEQ2_DIR=/iscratch/i/eutPri1/nib SEQ2_RMSK= SEQ2_FLAG= #SEQ2_SMSK=/scratch/hg/borEut1/linSpecRep.notInHuman SEQ2_SMSK= SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/borEut1/bed/blastz.eutPri1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy /cluster/data/hg17/jkStuff/BlastZ_run0.sh cd run.0 para push # Completed: 2200 of 2200 jobs # CPU time in finished jobs: 720384s 12006.40m 200.11h 8.34d 0.023 y # IO & Wait Time: 21690s 361.50m 6.03h 0.25d 0.001 y # Average job time: 337s 5.62m 0.09h 0.00d # Longest job: 2099s 34.98m 0.58h 0.02d # Submission to last job: 2267s 37.78m 0.63h 0.03d ssh kki cd /cluster/data/borEut1/bed/blastz.eutPri1 /cluster/data/hg17/jkStuff/BlastZ_run1.sh cd run.1 para push # Completed: 275 of 275 jobs # CPU time in finished jobs: 243s 4.06m 0.07h 0.00d 0.000 y # IO & Wait Time: 774s 12.89m 0.21h 0.01d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest job: 8s 0.13m 0.00h 0.00d # Submission to last job: 78s 1.30m 0.02h 0.00d ssh kk cd /cluster/data/borEut1/bed/blastz.eutPri1 /cluster/data/hg17/jkStuff/BlastZ_run2.sh cd run.2 para push # Completed: 41 of 41 jobs # CPU time in finished jobs: 266s 4.43m 0.07h 0.00d 0.000 y # IO & Wait Time: 822s 13.70m 0.23h 0.01d 0.000 y # Average job time: 27s 0.44m 0.01h 0.00d # Longest job: 54s 0.90m 0.01h 0.00d # Submission to last job: 58s 0.97m 0.02h 0.00d # END BLASTZ