# Chimp pre-assembly # Downloaded files from Whitehead: #ftp-genome.wi.mit.edu # Dir: Assembly.Nov1.2003 #unzip #The scaffold count is 37927. #% faSize contigs.bases #2733948177 bases (0 N's 2733948177 real) in 361864 sequences in 1 files #Total size: mean 7555.2 sd 9652.7 min 19 (contig_358805) max 160254 (contig_301167) median 3470 #N count: mean 0.0 sd 0.0 # Split into 500KB chunks for RepeatMasking ssh kksilo cd /cluster/data/pt0 mkdir -p split500K faSplit sequence contigs.bases 10 split500K/ cd split500K foreach f (*.fa) set d = $f:t:r mkdir -p $d faSplit about $f 500000 $d/ rm $f end # Create RM script and cluster job # NOTE for browser, remask with -s option, and chimp-specific repeats cd /cluster/data/pt0 mkdir -p jkStuff mkdir -p RMRun rm -f RMRun/RMJobs cat << '_EOF_' > jkStuff/RMChimp #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/pt0/$2 /bin/cp $2 /tmp/pt0/$2 cd /tmp/pt0/$2 /scratch/hg/RepeatMasker/RepeatMasker $2 popd /bin/cp /tmp/pt0/$2/$2.out ./ /bin/rm -fr /tmp/pt0/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/pt0/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/pt0 '_EOF_' chmod +x jkStuff/RMChimp mkdir -p RMRun rm -f RMRun/RMJobs foreach d ( split500K/?? ) foreach f ( $d/*.fa ) set f = $f:t echo /cluster/data/pt0/jkStuff/RMChimp \ /cluster/data/pt0/$d $f \ '{'check out line+ /cluster/data/pt0/$d/$f.out'}' \ >> RMRun/RMJobs end end #5367 RMJobs # Run cluster job sh kk cd /cluster/data/pt0/RMRun para create RMJobs para try, para check, para check, para push, para check,... #Completed: 5367 of 5367 jobs #CPU time in finished jobs: 14066571s 234442.85m 3907.38h 162.81d 0.446 y #IO & Wait Time: 72512s 1208.53m 20.14h 0.84d 0.002 y #Average job time: 2634s 43.91m 0.73h 0.03d #Longest job: 5696s 94.93m 1.58h 0.07d #Submission to last job: 41294s 688.23m 11.47h 0.48d # TRF: Tandem Repeat Finder # create job list of 5MB chunks ssh kksilo cd /cluster/data/pt0 mkdir -p split5M faSplit about contigs.bases 5000000 split5M/ cd /cluster/data/pt0 mkdir -p bed/simpleRepeat cd bed/simpleRepeat mkdir trf rm -f jobs.csh echo '#\!/bin/csh -fe' > jobs.csh foreach f (/cluster/data/pt0/split5M/*.fa) set fout = $f:t:r.bed echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end chmod +x jobs.csh wc -l jobs.csh # 546 jobs.csh ./jobs.csh >&! jobs.log & # in bash: ./jobs.csh > jobs.log 2>&1 & tail -f jobs.log # FILTER SIMPLE REPEATS INTO MASK # make a filtered version # of the trf output: # keep trf's with period <= 12: ssh kksilo cd /cluster/data/pt0/bed/simpleRepeat mkdir -p trfMask foreach f (trf/*.bed) echo "filtering $f" awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # MASK CONTIGS WITH REPEATMASKER ssh kksilo cd /cluster/data/pt0 cd split500K foreach d (??) foreach f ($d/*.fa) echo "Masking $f" maskOutFa $f $f.out $f -soft end end #WARNING: negative rEnd: -92 contig_143568:1128-1159 MER46B #WARNING: negative rEnd: -155 contig_143869:5374-5508 AluJb # etc. # Comment in rn3 doc (Hiram) indicates these are OK... # Merge 500K masked chunks into single file, then split into 5Mb chunks # to prepare for TRF masking foreach d (??) echo "Contig dir $d" foreach f ($d/?.fa $d/??.fa $d/???.fa) set g = $f:h cat $f >> $g.fa end end # check the split500K/??.fa masked files, then cd /cluster/data/pt0 cat split500K/??.fa > contigs.bases.rmsk # check the rmsk file, then rm split500K/??.fa mkdir -p split5M.rmsk faSplit about contigs.bases.rmsk 5000000 split5M.rmsk/ foreach f (split5M.rmsk/*.fa) echo "TRF Masking $f" set b = $f:t:r maskOutFa $f bed/simpleRepeat/trfMask/$b.bed $f -softAdd end cat split5M.rmsk/*.fa > contigs.bases.msk # RUN BLASTZ VS. HUMAN # NOTE: This would normally be doc'ed in the hg16 make doc, # however this is just preliminary until the real assembly arrives # distribute contigs to bluearc and /iscratch/i for cluster run ssh kksilo mkdir -p /cluster/bluearc/pt0/split100 faSplit sequence /cluster/data/pt0/contigs.bases.msk 100 /cluster/bluearc/pt0/split100/ ssh kkr1u00 mkdir -p /iscratch/i/pt0/trfFa df /iscratch/i faSplit sequence /cluster/data/pt0/contigs.bases.msk 100 /iscratch/i/pt0/trfFa/ /cluster/bin/scripts/iSync # make DEF file for blastz ssh kksilo cd /cluster/data/pt0 mkdir -p bed/blastz.hg16 cd bed/blastz.hg16 # NOTE: need schwartzbin below for utils still not in penn bin cat << '_EOF_' > DEF # human vs. chimp export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # Specific settings for chimp BLASTZ_Y=3400 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/cluster/data/penn/human_chimp.q # TARGET # Human SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK= SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Chimp SEQ2_DIR=/iscratch/i/pt0/trfFa # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK= SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/store6/pt0/bed/blastz.hg16 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place cp DEF ~angie/hummus/DEF.hg16-pt0.`date -I` ssh kk cd /cluster/data/pt0/bed/blastz.hg16 # source the DEF file to establish environment for following commands bash source ./DEF cp /cluster/data/mm4/jkStuff/BlastZ_run0.sh ../../jkStuff ../../jkStuff/BlastZ_run0.sh cd run.0 para try para check para push # Completed: 33561 of 33561 jobs # CPU time in finished jobs: 2748604s 45810.06m 763.50h 31.81d 0.087 y # IO & Wait Time: 468757s 7812.62m 130.21h 5.43d 0.015 y # Average job time: 96s 1.60m 0.03h 0.00d # Longest job: 3289s 54.82m 0.91h 0.04d # Submission to last job: 6757s 112.62m 1.88h 0.08d # Second cluster run to convert the .out's to .lav's cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh /cluster/data/pt0/jkStuff ssh kk cd /cluster/data/pt0/bed/blastz.hg16 bash source DEF ../../jkStuff/BlastZ_run1.sh cd run.1 para try para check para push # Prepare third cluster run script to convert lav's to axt's cd /cluster/data/pt0/bed/blastz.hg16 cat << '_EOF_' > ../../jkStuff/BlastZ_run2.sh #!/bin/sh # prepare third cluster run for blastz processing # NOTE: should run this on iservers (4G), # with chr19 and chr1 on kolossus (8G) M=`uname -n` if [ "$M" != "kk" ]; then echo "ERROR: you are on machine: '$M'" echo -e "\tthis script expects machine kk" exit 255 fi source DEF mkdir axtChrom mkdir run.2 cd run.2 # usage: blastz-contiglav2axt lav-dir axt-file seq1-dir seq2-file echo '#LOOP' > gsub echo '/cluster/bin/scripts/blastz-contiglav2axt '${BASE}'/lav/$(root1) {check out line+ '${BASE}'/axtChrom/$(root1).axt} '${SEQ1_DIR} /cluster/bluearc/pt0/contigs.bases.msk >> gsub echo '#ENDLOOP' >> gsub ls -1S ${BASE}/lav > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList echo "running 'para create'" para create jobList echo "Ready for cluster run. para try, check, push, etc ..." '_EOF_' chmod +x ../../jkStuff/BlastZ_run2.sh # Third cluster run to convert lav's to axt's source DEF ../../jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... # NOTE: ran this on kolossus and mini-cluster # 30 min. to 2 hrs. per chrom # Lift Contig (query side) AXT files to Scaffold coordinates ssh kksilo cd /cluster/data/pt0 agpToLift < assembly.agp > jkStuff/assembly.lft cd bed/blastz.hg16 cp -r axtChrom axtChrom.contig cd axtChrom cat << 'EOF' > doLift.csh #!/bin/csh foreach f (*.axt) echo "Lifting $f" liftUp -axtQ $f.lft /cluster/data/pt0/jkStuff/assembly.lft warn $f mv $f.lft $f end 'EOF' # << this line makes emacs coloring happy chmod +x doLift.csh csh doLift.csh >&! doLift.log & tail -100f doLift.log # create scaffold fasta file from contig file and agp agpAllToFaFile assembly.agp contigs.bases.msk scaffolds.msk.fa -sizes=scaffold.sizes # takes 20 minutes # CHAIN HG16 BLASTZ # The axtChain is usually run on the small cluster, # as follows, however for this run, with large fasta file # instead of nibs, required kolossus. #ssh kkr1u00 #cd /cluster/data/pt0/bed/blastz.hg16 #mkdir -p axtChain/run1 #cd axtChain/run1 #mkdir out chain #ls -1S /cluster/data/pt0/bed/blastz.hg16/axtChrom/*.axt > input.lst #cat << '_EOF_' > gsub #LOOP #doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP #'_EOF_' # << this line makes emacs coloring happy #cat << '_EOF_' > doChain #!/bin/csh #axtFilter -notQ_random $1 | axtChain stdin \ #/iscratch/i/gs.17/build34/bothMaskedNibs \ #-faQ /cluster/bluearc/pt0/scaffolds.msk.fa $2 > $3 #'_EOF_' # << this line makes emacs coloring happy #chmod a+x doChain # 42 jobs #gensub2 input.lst single gsub jobList #grep random jobList > jobList2 #mv jobList2 jobList #para create jobList #para try, para check, para push... ssh kolossus cd /cluster/data/pt0/bed/blastz.hg16/axtChain/run1 cat << '_EOF_' > doChain.kol #!/bin/csh ~/bin/x86_64/axtFilter -notQ_random $1 | ~/bin/x86_64/axtChain stdin \ /cluster/data/hg16/nib \ -faQ /cluster/bluearc/pt0/scaffolds.msk.fa $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain.kol cat << 'EOF' > runChain.csh set path = (~/bin/x86_64 $path); rehash foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y) echo "chaining chr$c" which axtChain doChain.kol /cluster/data/pt0/bed/blastz.hg16/axtChrom/chr${c}.axt chain/chr${c}.chain out/chr${c}.out end 'EOF' csh runChain.csh >&! runChain.log & tail -100f runChain.log cat << 'EOF' > runChainRandom.csh set path = (~/bin/x86_64 $path); rehash foreach c (1 2 3 4 5 6 7 8 9 Un X 10 13 15 17 18 19) echo "chaining chr${c}_random" doChain.kol /cluster/data/pt0/bed/blastz.hg16/axtChrom/chr${c}_random.axt chain/chr${c}_random.chain out/chr${c}_random.out end 'EOF' csh runChainRandom.csh >&! runChainRandom.log & tail -100f runChainRandom.log # 7 minutes per chrom -- ~4.5 hrs # now on the server, sort chains ssh kksilo cd /cluster/data/pt0/bed/blastz.hg16/axtChain time chainMergeSort run1/chain/*.chain > all.chain #94.050u 10.830s 1:54.09 91.9% time chainSplit chain all.chain #85.530u 10.690s 1:46.58 ssh hgwdev cd /cluster/data/pt0/bed/blastz.hg16/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain hg16 ${c}_chainPt0 $i eco done $c end # optionally: rm run1/chain/*.chain # NET HUMAN BLASTZ ssh kksilo cd /cluster/data/pt0/bed/blastz.hg16/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \ /cluster/data/pt0/scaffold.sizes ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \ /cluster/data/pt0/scaffold.sizes ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 611950592, utime 1586 s/100, stime 298 # skipping netClass for this preliminary work -- needs database #ssh hgwdev #cd /cluster/data/pt0/bed/blastz.hg16/axtChain #time netClass hNoClass.net pt0 hg16 human.net \ #-tNewR=/cluster/bluearc/scratch/mus/pt0/linSpecRep.notInHuman \ #-qNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse mv hNoClass.net chimp.net # Make a 'syntenic' subset of these with cd /cluster/data/pt0/bed/blastz.hg16/axtChain netFilter -syn chimp.net > chimpSyn.net # takes a few minutes # make net ssh kksilo cd /cluster/data/pt0/bed/blastz.hg16/axtChain mkdir chimpNet time netSplit chimp.net chimpNet # Load the nets into database ssh hgwdev cd /cluster/data/pt0/bed/blastz.hg16/axtChain netFilter -minGap=10 chimp.net | hgLoadNet hg16 netPt0 stdin netFilter -minGap=10 chimpSyn.net | hgLoadNet hg16 syntenyNetPt0 stdin # package files for LaDeana cd chimpNet mkdir -p /usr/local/apache/htdocs/kate/chimp tar cvfz humanChimpNet.gz *.chain mv humanChimpNet.gz /usr/local/apache/htdocs/kate/chimp cd .. # make chain subset from net (for Tarjei Mikkelson at MIT) cd chimpNet mkdir ../chainSubset foreach f (*.net) set c = $f:r netChainSubset $f ../chain/$c.chain ../chainSubset/$c.chain end cd ../chainSubset tar cvfz humanChimpChain.gz *.chain mv humanChimpChain.gz /usr/local/apache/htdocs/kate/chimp cd .. # AXT BEST FOR WEBB cd /cluster/data/pt0/bed/blastz.hg16/axtChrom mkdir -p ../axtBest foreach f (*.axt) set c = $f:r echo axtBesting $c axtBest $c.axt $c ../axtBest/$c.axt -minScore=300 end cd ../axtBest tar cvfz humanChimpAxtBest.gz *.ax mv humanChimpAxtBest.gz /usr/local/apache/htdocs/kate/chimp # REVERSE NET -- additional file for LaDeana ssh kolossus cd /cluster/data/pt0/bed/blastz.hg16/axtChain /cluster/home/kate/bin/x86_64/chainNet all.chain -minSpace=10 /cluster/data/hg16/chrom.sizes /cluster/data/pt0/scaffold.sizes chimpGap10.net humanGap10.net # takes a few minutes grep -v gap humanGap10.net > humanFill10.net # takes a few minutes cp humanFill10.net /usr/local/apache/htdocs/kate # REVERSE CHAIN -- additional file for Tarjei ssh kksilo cd /cluster/data/pt0/bed/blastz.hg16/axtChain mkdir humanNet time netSplit humanGap10.net humanNet cd humanNet mkdir humanChainSubset /cluster/home/kate/bin/x86_64/netChainSubset humanGap10.net all.chain humanChainSubset/all.chain foreach f (*.net) set c = $f:r #/cluster/home/kate/bin/x86_64/netChainSubset $f ../chain/$c.chain ../humanChainSubset/$c.chain netChainSubset $f ../chain/$c.chain ../humanChainSubset/$c.chain end cd ../humanChainSubset tar cvfz chimpHumanChain.gz *.chain mv chimpHumanChain.gz /usr/local/apache/htdocs/kate/chimp # UPDATE WITH NOV13 ASSEMBLY cd /cluster/data/pt0 wget -r -nH ftp://user:passwd@ftp-genome.wi.mit.edu/Assembly.Nov13.2003 cd Assembly.Nov13.2003 gunzip contigs.bases.gz # create scaffold fasta file from contig file and agp agpAllToFaFile assembly.agp contigs.bases scaffolds.fa -sizes=scaffold.sizes # takes 20 minutes # extract unplaced scaffolds from latest assembly # list of unplaced scaffolds from prior assembly # add 1 replaced, 3 new scaffolds cd /cluster/data/pt0/bed # NOTE: blastz.hg16.nomask not used in final chain/net mkdir -p blastz.hg16.2003-11-20 ln -s blastz.hg16.2003-11-20 blastz.hg16.nomask cd blastz.hg16.nomask echo > README << 'EOF' This is an alignment of unmasked chimp supercontigs to unmasked human hg16 chromosomes, limited to the supercontigs that were not placed by the alignment using masked sequence. 'EOF' cp ../blastz.hg16/axtChain/notOnNet . cat >> notOnNet << 'EOF' scaffold_28690 scaffold_37928 scaffold_37929 scaffold_37930 'EOF' # extract unplaced scaffolds faSomeRecords /cluster/data/pt0/Assembly.Nov13.2003/scaffolds.fa notOnNet \ unplaced.fa cd /cluster/data/hg16 cat > jkStuff/makeUnmaskedNib.sh << 'EOF' #!/bin/csh -ef mkdir -p unmaskedNib foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y Un M) cd $i foreach j (*.fa) echo chr$i /cluster/bin/i386/faToNib $j ../unmaskedNib/$j:r.nib echo done $j end cd .. end 'EOF' jkStuff/makeUnmaskedNib.sh >&! unmaskedNib.log & tail -100f unmaskedNib.log # RUN BLASTZ OF UNPLACED SCAFFOLDS VS. UNMASKED HUMAN # NOTE: this run was aborted due to excessive running time # A blat was run instead (see below). # distribute scaffolds and unmaksed nibs to bluearc and /iscratch/i # for cluster run ssh kksilo mkdir -p /cluster/bluearc/pt0/unplacedScaffolds faSplit sequence /cluster/data/pt0/bed/blastz.hg16.nomask/unplaced.fa \ 30 /cluster/bluearc/pt0/unplacedScaffolds/ cat > /cluster/bluearc/pt0/unplacedScaffolds/README << 'EOF' This directory contains scaffold fasta files for the Nov. 13 2003 assembly, that were not placed in the initial repeatmasked alignment. Used for unmasked alignment with human for placement on human chroms. 'EOF' ssh kkr1u00 mkdir -p /iscratch/i/gs.17/build34/unmaskedNib cp /cluster/data/hg16/unmaskedNib/* /iscratch/i/gs.17/build34/unmaskedNib df /iscratch/i /cluster/bin/scripts/iSync cp -r /cluster/data/hg16/unmaskedNib /cluster/bluearc/hg16 # make DEF file for blastz ssh kksilo cd /cluster/data/pt0 cd bed/blastz.hg16.nomask # NOTE: need schwartzbin below for utils still not in penn bin cat << '_EOF_' > DEF # human vs. chimp export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # Specific settings for chimp BLASTZ_Y=3400 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/cluster/data/penn/human_chimp.q # TARGET # Human SEQ1_DIR=/iscratch/i/gs.17/build34/unmaskedNib # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK= SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Chimp SEQ2_DIR=/cluster/bluearc/pt0/unplacedScaffolds # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK= SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/store6/pt0/bed/blastz.hg16.nomask DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place cp DEF ~angie/hummus/DEF.hg16-pt0.nomask.`date -I` # needs lots of memory ssh kk cd /cluster/data/pt0/bed/blastz.hg16.nomask # source the DEF file to establish environment for following commands bash source ./DEF ../../jkStuff/BlastZ_run0.sh # 10170 jobs written to batch cd run.0 para try para check para push # NOTE -- aborted this run, it would have taken too long # LOAD CHIMP SEQUENCE INTO HUMAN BROWSER mkdir -p /gbdb/hg16/pt0 ln -s /cluster/data/pt0/scaffolds.msk.fa /gbdb/hg16/pt0 cd /cluster/data/pt0 hgLoadSeq -prefix=pt0 hg16 /gbdb/hg16/pt0/scaffolds.msk.fa # BLAT UNPLACED UNMASKED CHIMP VS. UNMASKED HUMAN ssh kksilo mkdir -p /cluster/bluearc/pt0/unplacedScaffolds300 faSplit sequence /cluster/data/pt0/bed/blastz.hg16.nomask/unplaced.fa \ 300 /cluster/bluearc/pt0/unplacedScaffolds300/ mkdir /cluster/data/pt0/bed/blatHg16.scaffolds cd /cluster/data/pt0/bed/blatHg16.scaffolds mkdir psl foreach f (/cluster/data/hg16/?{,?}/N?_??????/N?_??????.fa) set c=$f:t:r echo $c mkdir -p psl/$c end # create cluster job mkdir run cd run ls -1S /cluster/bluearc/pt0/unplacedScaffolds300/*.fa > pt0.lst ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > human.lst cat << 'EOF' > gsub #LOOP /cluster/bin/i386/blat -ooc=/scratch/hg/h/11.ooc -q=dna -t=dna {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/pt0/bed/blatHg16.scaffolds/psl/$(root1)/$(root1)_$(root2).psl} #ENDLOOP 'EOF' # << this line makes emacs coloring happy gensub2 human.lst pt0.lst gsub spec ssh kk cd /cluster/data/pt0 cd bed/blatHg16.scaffolds cd run para create spec # 145336 jobs para try para check para push para check # NOTE: blatHg16 not used in final chain/net; # it consists of blastz.hg16 + blatHg16.scaffolds chains ssh kksilo mkdir /cluster/data/pt0/bed/blatHg16 cd /cluster/data/pt0/bed/blatHg16 mkdir psl foreach f (/cluster/data/hg16/?{,?}/N?_??????/N?_??????.fa) set c=$f:t:r echo $c mkdir -p psl/$c end # extract contigs for unplaced scaffolds cd /cluster/data/pt0 jkStuff/getContigList.pl Assembly.Nov13.2003/assembly.agp \ /cluster/data/pt0/bed/blastz.hg16.nomask/notOnNet > \ /cluster/data/pt0/bed/blatHg16/unplacedContigs.lst cd /cluster/data/pt0/bed/blatHg16 faSomeRecords /cluster/data/pt0/Assembly.Nov13.2003/contigs.bases \ unplacedContigs.lst unplacedContigs.fa mkdir -p /cluster/bluearc/pt0/unplacedContigs200 faSplit sequence unplacedContigs.fa 200 \ /cluster/bluearc/pt0/unplacedContigs200/ # create cluster job mkdir run cd run ls -1S /cluster/bluearc/pt0/unplacedContigs200/*.fa > pt0.lst ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > human.lst cat << 'EOF' > gsub #LOOP /cluster/bin/i386/blat -ooc=/scratch/hg/h/11.ooc -q=dna -t=dna {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/pt0/bed/blatHg16/psl/$(root1)/$(root1)_$(root2).psl} #ENDLOOP 'EOF' # << this line makes emacs coloring happy gensub2 human.lst pt0.lst gsub spec ssh kk cd /cluster/data/pt0 cd bed/blatHg16 cd run para create spec # 97218 jobs para try para check para push para check # lift psl files to chromosome coordinates ssh kksilo cd /cluster/data/pt0/bed/blatHg16.scaffolds pslCat -dir psl/N?_?????? | \ liftUp -type=.psl stdout \ /cluster/data/hg16/jkStuff/liftAll.lft warn stdin | \ pslSortAcc nohead chrom temp stdin # sort by target # CHAIN UNPLACED SCAFFOLD PSL'S ssh kkr1u00 mkdir -p /cluster/data/pt0/bed/blatHg16.scaffolds/pslChain/run1 cd /cluster/data/pt0/bed/blatHg16.scaffolds/pslChain/run1 mkdir out chain ls -1S /cluster/data/pt0/bed/blatHg16.scaffolds/chrom/*.psl > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh # NOTE: axtChain in /cluster/bin has a memory problem ~kate/bin/i386/axtChain -psl $1 \ /iscratch/i/gs.17/build34/unmaskedNib \ -faQ /cluster/data/pt0/bed/blastz.hg16.nomask/unplaced.fa $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList # 40 jobs para try para push # merge with blastz-generated chains ssh kksilo cd /cluster/data/pt0/bed # filter out scaffold 28690, which is replaced in the newer assembly chainFilter -notQ=scaffold_28690 blastz.hg16/axtChain/all.chain \ > all.blastz.chain chainMergeSort all.blastz.chain \ blatHg16.scaffolds/pslChain/run1/chain/*.chain > all.chain chainSplit chain all.chain ssh hgwdev cd /cluster/data/pt0/bed/blastz.hg16/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain hg16 ${c}_chainPt0 $i eco done $c end # optionally: rm run1/chain/*.chain # NOTE: didn't preNet # Net from merged chains ssh kolossus cd /cluster/data/pt0/bed /cluster/home/kate/bin/x86_64/chainNet all.chain -minSpace=10 /cluster/data/hg16/chrom.sizes /cluster/data/pt0/Assembly.Nov13.2003/scaffold.sizes chimpGap10.net humanGap10.net # takes a few minutes # writing chimpGap10.net #subchainInfo T 81232621,81233154 81218758,81244581 ali 19126, subSize 529, score 1473313.000000, subScore 40749.899456 #subchainInfo T 81238921,81239448 81218758,81244581 ali 19126, subSize 510, score 1473313.000000, subScore 39286.292481 #writing humanGap10.net #memory usage 4688400384, utime 13822 s/100, stime 945 grep -v gap humanGap10.net > humanFill10.net cp humanFill10.net /usr/local/apache/htdocs/kate/chimp # takes a few minutes # filter out tiny alignments and post to web site ssh kksilo cd /cluster/data/pt0/bed netFilter humanGap10.net -minAli=150 > humanGap10.filtered.net grep -v gap humanGap10.filtered.net > humanFill10.filtered.net cp humanFill10.filtered.net /usr/local/apache/htdocs/kate/chimp # CHAIN FILES FROM NET ssh kksilo cd /cluster/data/pt0/bed mkdir blastz-blatHg16.2003-11-24 ln -s blastz-blatHg16.2003-11-24 blastz-blatHg16 mv all*.chain chain *.net blastz-blatHg16.2003-11-24 cd blastz-blatHg16 netFilter chimpGap10.net -minAli=150 > chimpGap10.filtered.net netSplit chimpGap10.filtered.net chimpNet mkdir chimpChainSubset cd chimpNet foreach f (*.net) set c = $f:r echo "chain subset $c" netChainSubset $f ../chain/$c.chain ../chimpChainSubset/$c.chain end cd ../chimpChainSubset tar cvfz humanChimpChain.gz *.chain mv /usr/local/apache/htdocs/kate/chimp/humanChimpChain.gz \ /usr/local/apache/htdocs/kate/chimp/old mv humanChimpChain.gz /usr/local/apache/htdocs/kate/chimp cd ../ netChainSubset chimpGap10.filtered.net all.chain \ chimpGap10.filtered.chain # REVERSED CHAIN FILES FROM NET chainSwap all.chain all.swap.chain netChainSubset humanGap10.filtered.net all.swap.chain \ humanGap10.filtered.chain gzip -c humanGap10.filtered.chain > \ /usr/local/apache/htdocs/kate/chimp/chimpHumanChainAll.gz # statistics netStats chimp.net.stats chimpGap10.filtered.net netStats human.net.stats humanGap10.filtered.net netStats chimp.net.unfiltered.stats chimpGap10.net netStats human.net.unfiltered.stats humanGap10.net # REFINE BLAT ALIGNMENTS -- RUN BLASTZ ON UNPLACED SCAFFOLDS VS. # ONLY HUMAN CONTIGS IDENTIFIED BY BLAT ALIGNMENTS # This was proposed, but not completed ssh kksilo cd /cluster/data/pt0/bed/blatHg16.scaffolds pslCat -dir psl/N?_?????? | \ pslSortAcc nohead contig temp stdin # Processed 4644940 lines into 19 temp files # sorted by target contig # RECIPROCAL BEST -- FINAL ALIGNMENTS (2003-12-03 kate) # Use unfiltered chimp-reference chains, net them, and pull human side ssh kksilo cd /cluster/data/pt0 cd bed/blastz-blatHg16 netChainSubset humanGap10.net all.swap.chain humanGap10.jk2.chain chainSort humanGap10.jk2.chain humanGap10Sorted.jk2.chain # note: jim ran these, and switched conventions chainNet humanGap10Sorted.jk2.chain /cluster/data/pt0/scaffold.sizes \ /cluster/data/hg16/chrom.sizes chimp.jk2.net human.jk2.net \ -minScore=0 -minSpace=1 # detour to redo reciprocal best chimp chain netChainSubset chimpGap10.net all.chain chimpGap10.jk2.chain chainSort chimpGap10.jk2.chain chimpGap10Sorted.jk2.chain chainNet chimpGap10Sorted.jk2.chain /cluster/data/hg16/chrom.sizes \ /cluster/data/pt0/scaffold.sizes human.jk3.net chimp.jk3.net \ -minScore=0 -minSpace=1 /cluster/bin/i386/netSyntenic chimp.jk3.net chimp.best.net ~/bin/i386/netFilter -chimpSyn chimp.best.net > chimp.best.filtered.net netChainSubset chimp.best.filtered.net all.swap.chain \ chimp.best.filtered.chain ln -s chimp.best.filtered.chain chimp.chain cp chimp.chain /usr/local/apache/htdocs/kate/chimp cd /usr/local/apache/htdocs/kate/chimp gzip chimp.chain ##netFilter -minGap=10 chimp.net | hgLoadNet hg16 netPt0 stdin #/cluster/bin/i386/netSyntenic human.jk.net human.best.net #~/bin/i386/hgLoadNet -warn hg16 netRxBestPt0 human.best.net # back to best human chain mv human.best.net human.best.net.old /cluster/bin/i386/netSyntenic human.jk2.net human.best.net ~/bin/i386/netFilter -chimpSyn human.best.net > human.best.filtered.net #mv chimp.best.net chimp.best.net.old #/cluster/bin/i386/netSyntenic chimp.jk2.net chimp.best.net #~/bin/i386/netFilter -chimpSyn chimp.best.net > chimp.best.filtered.net #~/bin/i386/hgLoadNet -warn hg16 netRxBestPt0 human.best.net ln -s human.best.filtered.net human.net ssh hgwdev cd /cluster/data/pt0/bed/blastz-blatHg16 /bin/i386/hgLoadNet -qPrefix=pt0- -warn hg16 netRxBestPt0 human.net #netStats chimp.jk.stats chimp.jk.net #netStats human.jk.stats human.jk.net #netChainSubset chimp.jk.net all.swap.chain chimp.jk.chain #netChainSubset human.jk.net all.chain human.jk.chain # read 9254050 of 9254050 chains in all.chain # NOTE: post both chains for download by chimp-analysis group #netChainSubset chimp.jk2.net all.swap.chain chimp.jk2.chain #netChainSubset human.jk2.net all.chain human.jk2.chain netChainSubset human.best.filtered.net all.chain human.best.filtered.chain ln -s human.best.filtered.chain human.chain #netChainSubset chimp.best.filtered.net all.swap.chain \ #chimp.best.filtered.chain #ln -s chimp.best.filtered.chain chimp.chain cp human.chain /usr/local/apache/htdocs/kate/chimp cd /usr/local/apache/htdocs/kate/chimp gzip human.chain #hgLoadChain -qPrefix=pt0- hg16 chainAllPt0 all.chain ssh hgwdev cd /cluster/data/pt0 cd bed/blastz-blatHg16 chainSort -target all.chain | chainSplit chain stdin cd chain foreach i (*.chain) set c = $i:r hgLoadChain -qPrefix=pt0 hg16 ${c}_chainPt0 $i echo done $c end # reloaded these tables on genome-test on 4/29/04 # as they somehow were wrong (kate) # NOTE: need to remake scaffolds.msk.fa for Nov13 assembly # and reload seq table cd /cluster/data/pt0 hgLoadSeq -prefix=pt0 hg16 /gbdb/hg16/pt0/scaffolds.msk.fa #37928 sequences # DOWNLOADS # copy chain files to download area cd /usr/local/apache/htdocs/goldenPath/hg16 mkdir -p vsPt0 cp /cluster/data/pt0/bed/blastz-blatHg16/{human,chimp}.chain . gzip human.chain chimp.chain md5sum *.gz > md5sum.txt # add README.txt file to dir, if needed # CREATE RECIPROCAL BEST NET AXT'S # split scaffolds into files and create nibs for netToAxt ssh kksilo cd /cluster/data/pt0 mkdir scaffolds faSplit byname scaffolds.msk.fa scaffolds/ # add 3 scaffolds, change 1 from Nov13 assembly cd scaffolds mv scaffold_28690.fa scaffold_28690.fa.nov1 cd ../Assembly.Nov13.2003 echo "scaffold_28690\nscaffold_37928\nscaffold_37929\nscaffold_37930" \ > newscaffolds.lst faSomeRecords scaffolds.fa newscaffolds.lst newscaffolds.fa mkdir newscaffolds faSplit byname newscaffolds.fa newscaffolds/ cd newscaffolds foreach i (scaffold*.fa) echo $i cp $i ../../scaffolds/$i.nov13 ln -s /cluster/data/pt0/scaffolds/$i.nov13 ../../scaffolds/$i end cd /cluster/data/pt0 mkdir nib cd scaffolds foreach f (scaffold*.fa) set i = $f:r echo $i faToNib -softMask $i.fa ../nib/$i.nib end cd /cluster/data/pt0/bed/blastz-blatHg16 mkdir humanBestNet netSplit human.net humanBestNet mkdir humanBestAxt cd humanBestNet cat > makeAxt.csh << 'EOF' foreach f (*.net) set i = $f:r echo $i.net netToAxt $i.net ../all.chain \ /cluster/data/hg16/nib /cluster/data/pt0/nib \ ../humanBestAxt/$i.axt end 'EOF' csh -f makeAxt.csh >&! makeAxt.log & tail -100f makeAxt.log # CHIMP SEQUENCE FOR DOWNLOAD (2003-12-10 kate) # Re-extract scaffolds using Nov13 AGP ssh kksilo cd /cluster/data/pt0 mkdir Assembly.Nov1.2003 mv supercontigs* assembly.agp assembly.format scaffold.sizes.Nov1 scaffolds.msk.fa Assembly.Nov1.2003 ln -s Assembly.Nov13.2003/assembly.agp . agpAllToFaFile assembly.agp contigs.bases.msk scaffolds.msk.fa cp scaffolds.msk.fa /usr/local/apache/htdocs/goldenPath/hg16/vsPt0 cd /usr/local/apache/htdocs/goldenPath/hg16/vsPt0 gzip scaffolds.msk.fa md5sum scaffolds.msk.fa >> md5sum.txt # Update README # REDO NETTING WITH FIX (2003-12-11) ssh kolossus cd /cluster/data/pt0/bed/blastz-blatHg16 /cluster/home/kate/bin/x86_64/chainNet all.chain -minSpace=10 \ /cluster/data/hg16/chrom.sizes /cluster/data/pt0/scaffold.sizes \ chimpGap10.2.net humanGap10.2.net ssh kksilo cd /cluster/data/pt0/bed/blastz-blatHg16 ~/bin/i386/netChainSubset humanGap10.2.net all.swap.chain stdout | \ chainSort stdin humanGap10.2.chain # NOTE: this chain file has duplicated chain ID's -- chains # must have been split up from net # note: switched conventions here (like jk, above) ssh kolossus cd /cluster/data/pt0/bed/blastz-blatHg16 /cluster/home/kate/bin/x86_64/chainNet humanGap10.2.chain \ /cluster/data/pt0/scaffold.sizes /cluster/data/hg16/chrom.sizes \ chimp.2.net human.2.net -minScore=0 -minSpace=1 ssh kksilo cd /cluster/data/pt0/bed/blastz-blatHg16 netSyntenic human.2.net human.2.syn.net ~/bin/i386/netFilter -chimpSyn human.2.syn.net > human.best.2.net ssh hgwdev cd /cluster/data/pt0/bed/blastz-blatHg16 ~/bin/i386/hgLoadNet -qPrefix=pt0 -warn hg16 netRxBestPt0 human.best.2.net # get axt's from net ssh kksilo cd /cluster/data/pt0/bed/blastz-blatHg16 netSplit human.best.2.net humanBestNet.2 mkdir humanBestAxt.2 cd humanBestNet.2 cat > makeAxt.csh << 'EOF' foreach f (*.net) set i = $f:r echo $i.net netToAxt -nocache $i.net ../all.chain \ /cluster/data/hg16/nib /cluster/data/pt0/nib \ ../humanBestAxt.2/$i.axt end 'EOF' csh -f makeAxt.csh >&! makeAxt.log & tail -100f makeAxt.log cd .. ln -s humanBestAxt.2 axtBest # convert to PSL and load into browser ssh kksilo cd /cluster/data/pt0/bed/blastz-blatHg16 mkdir pslBest cat > makePsl.csh << 'EOF' foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt" axtToPsl axtBest/${c}.axt \ /cluster/data/hg16/chrom.sizes \ /cluster/data/pt0/scaffold.sizes \ pslBest/${c}_blastzBestPt0.psl end 'EOF' csh makePsl.csh >&! makePsl.log & tail -100f makePsl.log # Load tables ssh hgwdev #cd /cluster/data/pt0/bed/blastz-blatHg16/pslBest #~/bin/i386/hgLoadPsl -qPrefix=p0- hg16 chr*_blastzBestPt0.psl cd /gbdb/hg16 set table = axtBestPt0 mkdir -p $table cd $table foreach p (/cluster/data/pt0/bed/blastz-blatHg16/axtBest/*.axt) echo $p set f = $p:t ln -s $p $f end cd /cluster/data/pt0/bed/blastz-blatHg16/axtBest hgLoadAxt hg16 axtBestPt0 # extract chains and copy to download area ssh kksilo cd /cluster/data/pt0/bed/blastz-blatHg16 ~/bin/i386/netChainSubset human.best.2.net all.chain human.best.2.chain #cp chimp.chain /usr/local/apache/htdocs/kate/chimp #cd /usr/local/apache/htdocs/kate/chimp #gzip chimp.chain # chimp reference chains /cluster/bin/i386/netSyntenic chimp.2.net chimp.2.syn.net ~/bin/i386/netFilter -chimpSyn chimp.2.syn.net > chimp.best.2.net ~/bin/i386/netChainSubset chimp.best.2.net all.swap.chain chimp.best.2.chain #cp chimp.chain /usr/local/apache/htdocs/kate/chimp #cd /usr/local/apache/htdocs/kate/chimp #gzip chimp.chain #gzip --fast *.chain # CREATING SCAFFOLD LEVEL COMPRESSED QUALITY SCORES AND HI QUALITY DIFFERENCE FILES # Create compressed scaffold quality file ssh kksilo cd /cluster/data/pt0 zcat contigs.quals.gz | qaToQac stdin stdout | \ chimpSuperQuals assembly.agp stdin scaffolds.qac # Create bed file of high quality differences cd bed/blastz-blatHg16.2003-11-24 chimpHiQualDiffs humanBestAxt.krr /cluster/data/pt0/scaffolds.qac chimpDiffs.bed # Load into database ssh hgwdev cd /cluster/data/pt0/bed/blastz-blatHg16.2003-11-24 sed '{s/simpleNucDiff/chimpSimpleDiff/}' $HOME/kent/src/hg/lib/simpleNucDiff.sql | hgsql hg16 hgLoadBed -oldTable hg16 chimpSimpleDiff chimpDiffs.bed