# for emacs: -*- mode: sh; -*- # Creating the assembly for Monodelphis domestica # South American, Short-tailed Opossum # http://www.genome.gov/11510687 # http://www.genome.gov/12512285 # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+---------------------------------+ # | tableName | type | # +-------------+---------------------------------+ # | refGene | genePred refPep refMrna | # | xenoRefGene | genePred xenoRefPep xenoRefMrna | # | ensGene | genePred ensPep | # | genscan | genePred genscanPep | # +-------------+---------------------------------+ ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2005-08-18 - Hiram) ssh kkstore01 mkdir -p /cluster/store9/monDom4/broad.mit.edu ln -s /cluster/store9/monDom4 /cluster/data/monDom4 cd /cluster/data/monDom4/broad.mit.edu time wget --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=2 --no-parent \ --no-host-directories --cut-dirs=5 \ ftp://ftp.broad.mit.edu/distribution/assemblies/mammals/monodelphis/monDom4/* # That takes a number of hours, ending up with: # -rw-rw-r-- 1 7492963 Jan 5 18:10 Monodelphis4.0.agp # -rw-rw-r-- 1 1160 Jan 5 18:10 MappedAssembly.Table # -rw-rw-r-- 1 716 Jan 5 18:10 ForDistribution.command # -rw-rw-r-- 1 1215 Jan 5 18:10 BasicStats.out # -rw-rw-r-- 1 1512 Jan 5 18:11 ReadUsage.Table # -rw-rw-r-- 1 474573344 Jan 5 18:11 Monodelphis4.0.agp.chromosome.qual.gz # -rw-rw-r-- 1 1201437533 Jan 5 18:11 Monodelphis4.0.agp.chromosome.fasta.gz # -rw-rw-r-- 1 7647098 Jan 5 18:12 assembly.agp # -rw-rw-r-- 1 14841256 Jan 5 18:13 assembly.markup # -rw-rw-r-- 1 2838523 Jan 5 18:13 assembly.links # -rw-rw-r-- 1 474281414 Jan 5 18:13 assembly.agp.qual.gz # -rw-rw-r-- 1 1201095284 Jan 5 18:13 assembly.agp.fasta.gz # -rw-rw-r-- 1 176159300 Jan 5 18:14 assembly.unplaced # -rw-rw-r-- 1 1151905015 Jan 5 18:14 assembly.reads.gz # -rw-rw-r-- 1 43 Jan 5 18:15 source # -rw-rw-r-- 1 471180166 Jan 5 18:15 contigs.quals.gz # -rw-rw-r-- 1 1198813602 Jan 5 18:15 contigs.bases.gz # -rw-rw-r-- 1 757582328 Jan 5 18:16 unplaced.fasta.gz # -rw-rw-r-- 1 2083924258 Jan 5 18:18 unplaced.qual.gz # -rw-rw-r-- 1 1159288222 Jan 5 19:17 softmasked_monDom3.tgz zcat broad.mit.edu/Monodelphis4.0.agp.chromosome.fasta.gz \ | faSplit -verbose=2 byname stdin broadSplit/ # combine the Broad split files into single chroms for C in 1 2 3 4 5 6 7 8 X Un do mkdir -p ${C} rm -f ${C}/chr${C}.fa echo ">chr${C}" > ${C}/chr${C}.fa echo -n "${C}/chr${C}.fa working ... " ls broadSplit/${C}.*-*.fa | sort -t"." -k2,2n | while read F do grep -v "^>" ${F} >> ${C}/chr${C}.fa done echo "done" done # create a 2bit file to get a blastz hg18 started # this will be replaced later after RepeatMasker with masked # sequence faToTwoBit ?/chr?.fa Un/chrUn.fa monDom4.2bit # generate chrom.sizes list in order by size, biggest first # This listing is going to be used in a variety of procedures # below twoBitInfo monDom4.2bit stdout | sort -rn +1 > chrom.sizes mkdir /san/sanvol1/scratch/monDom4 cp -p monDom4.2bit /san/sanvol1/scratch/monDom4 cp -p chrom.sizes /san/sanvol1/scratch/monDom4 # split into 500K chunks for RepeatMasker mkdir split500K for C in 1 2 3 4 5 6 7 8 X Un do mkdir split500K/${C} faSplit -verbose=2 size ${C}/chr${C}.fa 500000 split500K/${C}/chr${C}_ \ -lift=split500K/${C}.lft done mkdir lifts cat split500K/*.lft > lifts/lift500K.lft ########################################################################### # RepeatMasker run (DONE 2006-02-06 - Hiram) ssh kk mkdir /cluster/data/monDom4/RMRun mkdir /cluster/data/monDom4/RMOut mkdir /cluster/data/monDom4/scripts cd /cluster/data/monDom4/split500K for C in 1 2 3 4 5 6 7 8 X Un do ls -1S ${C}/*.fa done > ../RMRun/split.list cd /cluster/data/monDom4/RMRun cat << '_EOF_' > template #LOOP ../scripts/RMOpossum $(dir1) $(root1) {check out line ../RMOut/$(dir1)/$(root1).out} #ENDLOOP '_EOF_' # happy emacs cat << '_EOF_' > ../scripts/RMOpossum #!/bin/csh -fe /bin/mkdir -p /scratch/tmp/monDom4/$1/$2 /bin/mkdir -p ../RMOut/$1 /bin/cp ../split500K/$1/$2.fa /scratch/tmp/monDom4/$1/$2/$2.fa pushd /scratch/tmp/monDom4/$1/$2 /cluster/bluearc/RepeatMasker060120/RepeatMasker -s \ -species "Monodelphis domestica" $2.fa popd /bin/cp -p /scratch/tmp/monDom4/$1/$2/$2.fa.out $3 /bin/rm -fr /scratch/tmp/monDom4/$1/$2/* /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1/$2 /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1 /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4 '_EOF_' # happy emacs chmod +x ../scripts/RMOpossum gensub2 split.list single template jobList para create jobList para try ... check ... push ... etc ... # Completed: 7170 of 7170 jobs # CPU time in finished jobs: 26375191s 439586.52m 7326.44h 305.27d 0.836 y # IO & Wait Time: 1584329s 26405.48m 440.09h 18.34d 0.050 y # Average job time: 3900s 64.99m 1.08h 0.05d # Longest finished job: 12608s 210.13m 3.50h 0.15d # Submission to last job: 162734s 2712.23m 45.20h 1.88d # lift results up to chrom coordinates ssh kkstore01 cd /cluster/data/monDom4/RMOut for C in 1 2 3 4 5 6 7 8 X Un do echo "chr${C} working ... " (cat ${C}/chr${C}_000.out ${C}/chr${C}_0000.out 2> /dev/null; \ ls ${C}/chr${C}_*.out | egrep -v "_000.out|_0000.out" \ | xargs tail --quiet --lines=+4) | \ liftUp ../${C}/chr${C}.fa.out ../lifts/lift500K.lft error stdin echo "chr${C} done" done # the seemingly duplicate mention of _000.out and _0000.out takes # the seemingly duplicate mention of _000.out and _0000.out takes # care of the case where one or the other exists, but both never # exist. One will not exist and the cat will complain to stderr, hence # the redirection of cat complaints to dev null. ########################################################################### # BLASTZ HUMAN hg18 - to quickly investigate monDom4 to see if it has the same # pile up problem that was in monDom2. Got this started right after the # monDom4.2bit file was made from the original Broad chrom fasta ssh pk mkdir /cluster/store9/monDom4/bed/blastzHg18.2006-02-08 cd /cluster/store9/monDom4/bed/blastzHg18.2006-02-08 cat << '_EOF_' > DEF # human vs. opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom4 SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzHg18.2006-02-08 TMPDIR=/scratch/tmp '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & XXXX - running 2006-02-08 10:06 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 & ########################################################################### # Initial database setup (DONE - 2006-02-06 - Hiram) ssh hgwdev cd /cluster/data/monDom4 mkdir /gbdb/monDom4 ln -s /cluster/data/monDom4/monDom4.2bit /gbdb/monDom4 twoBitInfo monDom4.2bit stdout \ | awk '{printf "%s\t%s\t/gbdb/monDom4/monDom4.2bit\n", $1, $2}' \ > chromInfo.tab hgsql -e "create database monDom4;" mysql hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;" \ monDom4 hgsql monDom4 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql -e 'load data local infile "chromInfo.tab" into table chromInfo;' \ monDom4 hgsql monDom4 -N -e 'select chrom from chromInfo' > chrom.lst # Enter monDom4 into dbDb and defaultDb so test browser knows about # it: hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ VALUES("monDom4", "Jan 2006", "/gbdb/monDom4", "Opossum", \ "chr4:344283621-363415496", 1, 34, "Opossum", \ "Monodelphis domestica", \ "/gbdb/monDom4/html/description.html", 0, 0, \ "Broad Inst. monDom4 Jan06");' \ -h localhost hgcentraltest # do this update later when it is time to make this be the default hgsql -e 'update defaultDb set name="monDom4" where genome="Opossum";' \ hgcentraltest # update default position to a HOX region: hgsql -e \ 'update dbDb set defaultPos="chr8:293325654-293362847" where name="monDom4";' \ hgcentraltest chr8:293,325,654-293,362,847 # Make trackDb table so browser knows what tracks to expect mkdir /cluster/data/monDom4/html ln -s /cluster/data/monDom4/html /gbdb/monDom4/html mkdir $HOME/kent/src/hg/makeDb/trackDb/opossum/monDom4 cd $HOME/kent/src/hg/makeDb/trackDb/opossum cvs add monDom4 cd monDom4 cp -p /cluster/data/monDom2/html/description.html ./description.html # edit that description to update cvs add description.html cvs commit cd $HOME/kent/src/hg/makeDb/trackDb # add monDom4 to the makefile, then make DBS=monDom4 # or make DBS=monDom4 alpha # to place it on the genome-test browser ####################################################################### # SIMPLE REPEAT [TRF] TRACK (DONE - 2006-02-08 - Hiram) # This can be done while the above repeat masker run is working ssh kkstore01 mkdir /cluster/data/monDom4/bed/simpleRepeat cd /cluster/data/monDom4 # split into 50M chunks for trfBig mkdir split50M for C in 1 2 3 4 5 6 7 8 X Un do mkdir split50M/${C} faSplit -verbose=2 size ${C}/chr${C}.fa 50000000 split50M/${C}/chr${C}_ \ -lift=split50M/${C}.lft done cat split50M/*.lft > lifts/lift50M.lft cd split50M ls -1S */*.fa > ../bed/simpleRepeat/split.list # small kluster for these larger jobs ssh kki cd /cluster/data/monDom4/bed/simpleRepeat mkdir trf cat << '_EOF_' > template #LOOP ./runTrf.csh $(dir1) $(root1) {check out line trf/$(root1).bed} #ENDLOOP '_EOF_' # happy emacs cat << '_EOF_' > runTrf.csh #!/bin/csh -fe set workDir = /scratch/tmp/monDom4TRF/$1/$2 set inputFN = ../../split50M/$1/$2.fa set outputFN = $3 mkdir -p $workDir cp -p $inputFN $workDir pushd $workDir /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $2.fa /dev/null -bedAt=$2.bed -tempDir=/scratch/tmp popd rm -f $outputFN cp -p $workDir/$2.bed $outputFN rm -fr $workDir/* rmdir --ignore-fail-on-non-empty $workDir rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4TRF/$1 rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4TRF '_EOF_' # happy emacs chmod +x runTrf.csh gensub2 split.list single template jobList para create jobList para try ... check ... push ... etc ... # Completed: 77 of 77 jobs # CPU time in finished jobs: 16109s 268.48m 4.47h 0.19d 0.001 y # IO & Wait Time: 1199s 19.99m 0.33h 0.01d 0.000 y # Average job time: 225s 3.75m 0.06h 0.00d # Longest finished job: 1322s 22.03m 0.37h 0.02d # Submission to last job: 1870s 31.17m 0.52h 0.02d # lift to chrom coordinates ssh kkstore01 cd /cluster/data/monDom4/bed/simpleRepeat find ./trf -type f | xargs cat | \ liftUp simpleRepeat.bed \ /cluster/data/monDom4/lifts/lift50M.lft \ error stdin # Load into the database: ssh hgwdev cd /cluster/data/monDom4/bed/simpleRepeat hgLoadBed -strict monDom4 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Loaded 710087 elements of size 16 # Compare with previous assembly time featureBits monDom4 simpleRepeat # 68337838 bases of 3501643220 (1.952%) in intersection # real 0m42.137s time featureBits monDom2 simpleRepeat # 69239555 bases of 3496445653 (1.980%) in intersection # real 1m5.716s time featureBits monDom1 simpleRepeat # 55000238 bases of 3492108230 (1.575%) in intersection # real 1m49.240s ####################################################################### # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2006-02-10 - Hiram) # After the simpleRepeats track has been built, make a filtered # version # of the trf output: keep trf's with period <= 12: # need these mask files on a per-chrom basis ssh kkstore01 cd /cluster/data/monDom4/bed/simpleRepeat mkdir chromMask for C in 1 2 3 4 5 6 7 8 X Un do echo "chr${C} working ..." cat trf/chr${C}_*.bed | liftUp chromMask/chr${C}.bed \ /cluster/data/monDom4/lifts/lift50M.lft \ error stdin awk '{if ($5 <= 12) print;}' chromMask/chr${C}.bed \ > chromMask/chr${C}_Mask.bed echo "chr${C} done" done # should be same result as if we had done the whole thing as a # single piece: cat chromMask/*_Mask.bed | wc -l # 485364 awk '{if ($5 <= 12) print;}' simpleRepeat.bed | wc -l # 485364 ####################################################################### # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE - 2006-02-10 - Hiram) # Additional masking was added later to use Damian Keefe's repeat # libraries. # the new style binRange was in development at this time, so using # binaries built today with that new functionality. Next time # these will be the standard commands ssh kkstore01 cd /cluster/data/monDom4 # verify sequence before and after masking to ensure it doesn't # get broken time faCount ?/chr?.fa Un/chrUn.fa #seq len A C G T N cpg # chr1 748055161 227831941 138649610 138661137 228270299 14642174 3225777 # chr2 541556283 163736285 100133171 100127159 163985111 13574557 2505214 # chr3 526135210 161221343 95632761 95680478 160815407 12785221 2208210 # chr4 430141050 130190442 78748372 78684824 130210194 12307218 1877031 # chr5 308900514 94866760 56126597 56061671 94769331 7076155 1311015 # chr6 244784211 73686567 45271636 45267221 73515529 7043258 1216115 # chr7 262624689 81219469 47052213 47076531 81079019 6197457 1127925 # chr8 308469712 93087343 56578139 56466687 93245129 9092414 1427329 # chrX 60718501 16286269 11251258 11264518 16304629 5611827 371051 # chrUn 174229318 46556476 32785341 32891615 46354738 15641148 1620662 # total 3605614649 1088682895 662229098 662181841 1088549386 103971429 16890329 # real 1m51.555s cat << '_EOF_' > scripts/maskChroms.sh #!/bin/sh for C in 1 2 3 4 5 6 7 8 X Un do echo "chr${C} working ..." $HOME/bin/x86_64/maskOutFa -soft ${C}/chr${C}.fa \ bed/simpleRepeat/chromMask/chr${C}_Mask.bed ${C}/chr${C}_masked.fa $HOME/bin/x86_64/maskOutFa -softAdd ${C}/chr${C}_masked.fa \ ${C}/chr${C}.fa.out ${C}/chr${C}_masked.fa echo "done" done '_EOF_' # happy emacs chmod +x scripts/maskChroms.sh time ./scripts/maskChroms.sh # should have the same sequence still time faCount ?/chr?_masked.fa Un/chrUn_masked.fa #seq len A C G T N cpg chr1 748055161 227831941 138649610 138661137 228270299 14642174 3225777 chr2 541556283 163736285 100133171 100127159 163985111 13574557 2505214 chr3 526135210 161221343 95632761 95680478 160815407 12785221 2208210 chr4 430141050 130190442 78748372 78684824 130210194 12307218 1877031 chr5 308900514 94866760 56126597 56061671 94769331 7076155 1311015 chr6 244784211 73686567 45271636 45267221 73515529 7043258 1216115 chr7 262624689 81219469 47052213 47076531 81079019 6197457 1127925 chr8 308469712 93087343 56578139 56466687 93245129 9092414 1427329 chrX 60718501 16286269 11251258 11264518 16304629 5611827 371051 chrUn 174229318 46556476 32785341 32891615 46354738 15641148 1620662 total 3605614649 1088682895 662229098 662181841 1088549386 103971429 16890329 # after verifying they are the same, remove the old unmasked sequence for C in 1 2 3 4 5 6 7 8 X Un do rm ${C}/chr${C}.fa mv ${C}/chr${C}_masked.fa ${C}/chr${C}.fa done # Create new 2bit file with this masked sequence ls -og *.2bit # -rw-rw-r-- 1 901986358 Feb 8 09:29 monDom4.2bit # browser will not function while this 2bit file does not exist rm monDom4.2bit faToTwoBit ?/chr?.fa Un/chrUn.fa monDom4.2bit # the file becomes larger: ls -og *.2bit # -rw-rw-r-- 1 950060406 Feb 10 13:53 monDom4.2bit # and load the rmsk tables ssh hgwdev cd /cluster/data/monDom4 time $HOME/bin/i386/hgLoadOut -verbose=2 monDom4 \ ?/chr?.fa.out Un/chrUn.fa.out # hgLoadOut: connected to database: monDom4 # Processing 1/chr1.fa.out # Strange perc. field -460.2 line 561938 of 1/chr1.fa.out # Strange perc. field -395.6 line 561938 of 1/chr1.fa.out # Loading up table chr1_rmsk # Processing 2/chr2.fa.out # Strange perc. field -160.2 line 234960 of 2/chr2.fa.out # Strange perc. field -238.7 line 234960 of 2/chr2.fa.out # bad rep range [1723, 73] line 451681 of 2/chr2.fa.out # Strange perc. field -6.0 line 1104797 of 2/chr2.fa.out # Loading up table chr2_rmsk # Processing 3/chr3.fa.out # bad rep range [2344, 191] line 648571 of 3/chr3.fa.out # Loading up table chr3_rmsk # Processing 4/chr4.fa.out # Strange perc. field -7.0 line 7733 of 4/chr4.fa.out # Loading up table chr4_rmsk # Processing 5/chr5.fa.out # Strange perc. field -6.9 line 34415 of 5/chr5.fa.out # Strange perc. field -1.1 line 44716 of 5/chr5.fa.out # Strange perc. field -1.1 line 44720 of 5/chr5.fa.out # Strange perc. field -1.1 line 44722 of 5/chr5.fa.out # Strange perc. field -1.1 line 44724 of 5/chr5.fa.out # Strange perc. field -38.3 line 146729 of 5/chr5.fa.out # Strange perc. field -38.3 line 146732 of 5/chr5.fa.out # Strange perc. field -38.3 line 146734 of 5/chr5.fa.out # Strange perc. field -38.3 line 146737 of 5/chr5.fa.out # Strange perc. field -42.5 line 239189 of 5/chr5.fa.out # Loading up table chr5_rmsk # Processing 6/chr6.fa.out # Strange perc. field -0.2 line 87360 of 6/chr6.fa.out # Strange perc. field -0.1 line 87360 of 6/chr6.fa.out # Strange perc. field -3960.1 line 507698 of 6/chr6.fa.out # Strange perc. field -1856.3 line 507698 of 6/chr6.fa.out # Loading up table chr6_rmsk # Processing 7/chr7.fa.out # Loading up table chr7_rmsk # Processing 8/chr8.fa.out # Loading up table chr8_rmsk # Processing X/chrX.fa.out # Loading up table chrX_rmsk # Processing Un/chrUn.fa.out # Strange perc. field -27.6 line 129404 of Un/chrUn.fa.out # Strange perc. field -1.8 line 129404 of Un/chrUn.fa.out # Strange perc. field -27.6 line 129407 of Un/chrUn.fa.out # Strange perc. field -1.8 line 129407 of Un/chrUn.fa.out # Strange perc. field -27.6 line 129409 of Un/chrUn.fa.out # Strange perc. field -1.8 line 129409 of Un/chrUn.fa.out # Strange perc. field -27.6 line 129411 of Un/chrUn.fa.out # Strange perc. field -1.8 line 129411 of Un/chrUn.fa.out # Strange perc. field -27.6 line 129413 of Un/chrUn.fa.out # Strange perc. field -1.8 line 129413 of Un/chrUn.fa.out # Loading up table chrUn_rmsk # note: 2 records dropped due to repStart > repEnd # Note the interesting coincidence of the exact same % for monDom4 # and monDom2 time featureBits monDom4 rmsk # 1814099592 bases of 3501643220 (51.807%) in intersection # real 3m57.193s # time featureBits -countGaps monDom4 rmsk # 1814099592 bases of 3605614649 (50.313%) in intersection time featureBits monDom2 rmsk # 1811406819 bases of 3496445653 (51.807%) in intersection # real 5m31.388s time featureBits monDom1 rmsk # 1775646140 bases of 3492108230 (50.847%) in intersection ####################################################################### # GC5BASE (DONE - 2006-02-08 - Hiram) ssh kkstore01 mkdir /cluster/data/monDom4/bed/gc5Base cd /cluster/data/monDom4/bed/gc5Base time hgGcPercent -verbose=2 -wigOut -doGaps -file=stdout -win=5 monDom4 \ /cluster/data/monDom4 | wigEncode stdin gc5Base.wig gc5Base.wib ssh hgwdev cd /cluster/data/monDom4/bed/gc5Base mkdir /gbdb/monDom4/wib ln -s `pwd`/gc5Base.wib /gbdb/monDom4/wib hgLoadWiggle -verbose=2 monDom4 gc5Base gc5Base.wig ####################################################################### # LOAD GAP & GOLD TABLES FROM AGP (DONE - 2006-02-10 - Hiram) ssh hgwdev cd /cluster/data/monDom4 hgGoldGapGl -noGl monDom4 broad.mit.edu/Monodelphis4.0.agp # Verify sanity of indexes hgsql -e "show index from gold;" monDom4 hgsql -e "show index from gap;" monDom4 # Look at the Cardinality column, it should have some healthy # numbers for all indices # For some reason, the indices do not get built correctly -- # "show index from gap/gold" shows NULL cardinalities for chrom. # Rebuild indices with "analyze table". hgsql -e "analyze table gold; analyze table gap;" monDom4 ####################################################################### # LINEAGE SPECIFIC REPEATS (DONE - 2006-02-10 - Hiram) # Let's see if DateRepeats can do anything with this Opossum # business. This was tried, but it doesn't make anything special # for any of the organisms and a -comp of opossum isn't allowed at # this time so we couldn't do this on the other organisms either to # obtain any specific repeats there. ssh kki mkdir /cluster/data/monDom4/linSpecRep cd /cluster/data/monDom4/linSpecRep for C in 1 2 3 4 5 6 7 8 X Un do ln -s ../${C}/chr${C}.fa.out ./chr${C}.rmsk.out done # This takes a long time to run as a script, to speed it up it has # been converted to a mini cluster run cat << '_EOF_' > template #LOOP /cluster/bluearc/RepeatMasker060120/DateRepeats $(path1) -query opossum -comp human -comp mouse -comp rat -comp dog -comp dog #ENDLOOP '_EOF_' ls *.out > out.list gensub2 out.list single template jobList ssh kki cd /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec ./runArian.sh > jobList para create jobList para try, ... check ... push ... etc ... # To check if any difference is noted in the various comparison # organisms: awk '{if(NF>10){a=NF-4;b=NF-3;c=NF-2;d=NF-1; print $a,$b,$c,$d,$NF}}' \ chr2.rmsk.out_homo-sapiens_mus-musculus_rattus_canis-familiaris_canis-familiaris \ | sort | uniq -c # 187651 - - - - - # 113100 0 0 0 0 0 # 210909 ? ? ? ? ? # 602832 X X X X X # always the same mark for each comp organism ####################################################################### # Distribute files to filesystems for kluster runs ssh kkr1u00 mkdir /iscratch/i/monDom4 cd /iscratch/i/monDom4 cp -p /cluster/data/monDom4/monDom4.2bit . cp -p /cluster/data/monDom4/chrom.sizes . for R in 2 3 4 5 6 7 8 do rsync -a --progress /iscratch/i/monDom4/ kkr${R}u00:/iscratch/i/monDom4/ done ssh pk mkdir /san/sanvol1/scratch/monDom4 cd /san/sanvol1/scratch/monDom4 cp -p /cluster/data/monDom4/monDom4.2bit . cp -p /cluster/data/monDom4/chrom.sizes . ####################################################################### # HUMAN BLASTZ - this time with masked sequence # (DONE - 2006-02-10 - 2006-02-15 - Hiram) ssh pk mkdir /cluster/data/monDom4/bed/blastzHg18.2006-02-10 cd /cluster/data/monDom4/bed/blastzHg18.2006-02-10 cat << '_EOF_' > DEF # opossum vs. human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Opossum monDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Human (hg18) SEQ2_DIR=/scratch/hg/hg18/nib SEQ2_LEN=/scratch/hg/hg18/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzHg18.2006-02-10 TMPDIR=/scratch/tmp '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=net `pwd`/DEF > blastz.out 2>&1 & # running 2006-02-10 ####################################################################### # Additional RepeatMasking - this sequence needs to be # additionally masked with Damian Keefe's libraries # (DONE - 2006-02-11 - Hiram) ssh kkr1u00 mkdir /cluster/data/monDom4/RMExtra cd /cluster/data/monDom4/RMExtra wget --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=2 --no-parent \ --no-host-directories --cut-dirs=6 \ ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/repeat_libraries/monodelphis/* # picks up: # -rw-r--r-- 1 679203 May 9 2005 supplemental.lib # -rw-r--r-- 1 737274 May 9 2005 ab_initio.lib # -rw-r--r-- 1 452 May 9 2005 README cp -p supplemental.lib /iscratch/i/monDom4 cd /iscratch/i/monDom4 for R in 2 3 4 5 6 7 8 do rsync -a --progress /iscratch/i/monDom4/ kkr${R}u00:/iscratch/i/monDom4/ done ssh kk cd /cluster/data/monDom4/RMExtra cat << '_EOF_' > RMExtra.csh #!/bin/csh -fe /bin/mkdir -p /scratch/tmp/monDom4/$1/$2 /bin/mkdir -p ../RMOutExtra/$1 /bin/cp ../split500K/$1/$2.fa /scratch/tmp/monDom4/$1/$2/$2.fa pushd /scratch/tmp/monDom4/$1/$2 /cluster/bluearc/RepeatMasker060120/RepeatMasker -s \ -lib /iscratch/i/monDom4/supplemental.lib $2.fa popd /bin/cp -p /scratch/tmp/monDom4/$1/$2/$2.fa.out $3 /bin/rm -fr /scratch/tmp/monDom4/$1/$2/* /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1/$2 /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1 '_EOF_' # happy emacs chmod +x RMExtra.csh cat << '_EOF_' > template #LOOP ./RMExtra.csh $(dir1) $(root1) {check out line ../RMOutExtra/$(dir1)/$(root1).out} #ENDLOOP '_EOF_' # happy emacs mkdir ../RMOutExtra gensub2 ../RMRun/split.list single template jobList para create jobList para try ... check ... push ... etc... para time # Completed: 7170 of 7170 jobs # CPU time in finished jobs: 8540127s 142335.45m 2372.26h 98.84d 0.271 y # IO & Wait Time: 103372s 1722.87m 28.71h 1.20d 0.003 y # Average job time: 1206s 20.09m 0.33h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2326s 38.77m 0.65h 0.03d # Submission to last job: 22083s 368.05m 6.13h 0.26d cd /cluster/data/monDom4/RMOutExtra for C in 1 2 3 4 5 6 7 8 X Un do echo "chr${C} working ... " (cat ${C}/chr${C}_000.out ${C}/chr${C}_0000.out 2> /dev/null; \ ls ${C}/chr${C}_*.out | egrep -v "_000.out|_0000.out" \ | xargs tail --quiet --lines=+4) | \ liftUp ./chr${C}.fa.out ../lifts/lift500K.lft error stdin echo "chr${C} done" done # Add the two results together cd /cluster/data/monDom4/RMOut cp -p /cluster/data/monDom2/scripts/uniqueRMOut.csh ../scripts for C in 1 2 3 4 5 6 7 8 X Un do echo "${C} working ..." cp -p ../${C}/chr${C}.fa.out ./chr${C}.tmp.out tail +4 ../RMOutExtra/chr${C}.fa.out >> ./chr${C}.tmp.out ../scripts/uniqueRMOut.csh ./chr${C}.tmp.out > chr${C}.fa.out rm -f ./chr${C}.tmp.out & echo "done" done # and then add this mask to the existing masked sequence for C in 1 2 3 4 5 6 7 8 X Un do echo "chr${C} working ..." rm -f ./chr${C}.added.fa maskOutFa -softAdd ../${C}/chr${C}.fa ./chr${C}.fa.out ./chr${C}.added.fa echo "done" done # verify they are not broken compared to what was previous # note the additional masked sequence in the lower case counts ssh kkstore01 cd /cluster/data/monDom4/RMOut faSize chr?.added.fa chrUn.added.fa # 3605614649 bases (103971429 N's 3501643220 real 1551686450 upper # 1949956770 lower) in 10 sequences in 10 files faSize ../?/chr?.fa ../Un/chrUn.fa # 3605614649 bases (103971429 N's 3501643220 real 1687813668 upper # 1813829552 lower) in 10 sequences in 10 files # If that is OK, replace the current sequence for C in 1 2 3 4 5 6 7 8 X Un do rm ../${C}/chr${C}.fa mv chr${C}.added.fa ../${C}/chr${C}.fa done # And rebuild the 2bit file: faToTwoBit ?/chr?.fa Un/chrUn.fa monDom4RMExtra.2bit # and switch it out from under the browser rm monDom4.2bit; mv monDom4RMExtra.2bit monDom4.2bit # distribute to /iscratch/i/monDom4 ... ############################################################################ # BLATSERVERS ENTRY (DONE - 2006-02-16 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("monDom4", "blat17", "17786", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("monDom4", "blat17", "17787", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### # CPGISLANDS (DONE - 2006-02-16 - Hiram) ssh hgwdev mkdir /cluster/data/monDom4/bed/cpgIsland cd /cluster/data/monDom4/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make # gcc readseq.c cpg_lh.c -o cpglh.exe cd ../.. ln -s hg3rdParty/cpgIslands/cpglh.exe . ssh kkstore01 cd /cluster/data/monDom4 mkdir tmp # cpglh.exe requires hard-masked (N) .fa's. time for CHR in ?/chr?.fa ??/chr??.fa do echo "maskOutFa ${CHR} hard ${CHR}.masked" maskOutFa ${CHR} hard ${CHR}.masked done > tmp/hardMask.out 2>&1 # about 3 minutes 20 seconds # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. cd /cluster/data/monDom4/bed/cpgIsland for F in ../../*/chr*.fa.masked do FA=${F/*\/} C=${FA/.fa.masked/} echo "./cpglh.exe ${FA} > ${C}.cpg" ./cpglh.exe ${F} > ${C}.cpg done > cpglh.out 2>&1 & # about 4 minutes, there were no warnings # Transform cpglh output to bed + cat << '_EOF_' > filter.awk { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # happy emacs awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed ssh hgwdev cd /cluster/data/monDom4/bed/cpgIsland hgLoadBed -strict monDom4 cpgIslandExt -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Loaded 22409 elements of size 10 featureBits monDom4 cpgIslandExt # 14526908 bases of 3501643220 (0.415%) in intersection ######################################################################### # ANDY LAW CPGISSLANDS (DONE - 2006-02-16 - Hiram) # See notes in makeGalGal2.doc and makeCanFam2.doc ssh kkstore01 mkdir /cluster/data/monDom4/bed/cpgIslandGgfAndy cd /cluster/data/monDom4/bed/cpgIslandGgfAndy # Build the preProcGgfAndy program in # kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE # Use masked sequence since this is a mammal... for F in ../../*/chr*.fa.masked do FA=${F/*\/} C=${FA/.fa.masked/} echo preproc and run on masked "${C} ${F}" 1>/dev/stderr ~/bin/$MACHTYPE/preProcGgfAndy ${F} \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g1,$oE) = split("\t"); $s--; $gc=$c+$g1; $pCpG=(100.0 * 2 * $cpg / $n); $pGc=(100.0 * $gc / $n); $_="'${C}'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . "$pCpG\t$pGc\t$oE\n";' done | sort -k1,1 -k2,2n > cpgIslandGgfAndyMasked.bed # about 3 minutes # load into database: ssh hgwdev cd /cluster/data/monDom4/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql hgLoadBed -strict monDom4 cpgIslandGgfAndyMasked -tab -noBin \ -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed # Loaded 67504 elements of size 10 featureBits monDom4 cpgIslandExt # 14526908 bases of 3501643220 (0.415%) in intersection featureBits monDom4 cpgIslandGgfAndyMasked # 41042229 bases of 3501643220 (1.172%) in intersection wc -l ../cpgIsland/cpgIsland.bed *bed # 22409 ../cpgIsland/cpgIsland.bed # 67504 cpgIslandGgfAndyMasked.bed ####################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2006-02-24 - Hiram) ssh kkstore01 cd /cluster/data/monDom4 # use repMatch of 1228 as this genome is ~ %20 larger than human # 1024 + (1024 * 0.2) = 1228 time blat monDom4.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1228 # Wrote 43992 overused 11-mers to 11.ooc # real 3m24.793s # copy to bluearc filesystem for kluster use cp -p 11.ooc /cluster/bluearc/scratch/hg/monDom4 ######################################################################### # GENBANK auto update (DONE - 2006-02-24 - Hiram) # align with revised genbank process. drop xeno ESTs. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvs update -d -P etc #edit etc/genbank.conf to add monDom4, it is a copy of monDom2 with changes: # monDom4 (M. domestica) 8 chroms plus X and Un monDom4.serverGenome = /cluster/data/monDom4/monDom4.2bit monDom4.clusterGenome = /cluster/bluearc/scratch/hg/monDom4/monDom4.2bit monDom4.ooc = /cluster/data/monDom4/11.ooc monDom4.lift = no monDom4.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} monDom4.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} monDom4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} monDom4.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} monDom4.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} monDom4.downloadDir = monDom4 monDom4.refseq.mrna.xeno.load = yes monDom4.refseq.mrna.xeno.loadDesc = yes monDom4.mgcTables.default = full monDom4.mgcTables.mgc = all # check that into CVS, then # update /cluster/data/genbank/ make etc-update ssh kkstore02 cd /cluster/data/genbank nice -n 19 bin/gbAlignStep -initial monDom4 & # var/build/logs/2006.02.24-22:38:50.monDom4.initalign.log # load database when finished ssh hgwdev cd /cluster/data/genbank nice -n 19 ./bin/gbDbLoadStep -drop -initialLoad monDom4 & # var/dbload/hgwdev/logs/2006.02.26-09:36:15.dbload.log XXXX - running 2006-02-26 09:36 ########################################################################## # BLASTZ SELF (DONE - 2006-03-02 - 2006-04-19 - Hiram) ssh kk mkdir /cluster/data/monDom4/bed/blastzSelf.2006-03-02 cd /cluster/data/monDom4/bed ln -s blastzSelf.2006-03-02 blastz.monDom4 cd blastzSelf.2006-03-02 cat << '_EOF_' > DEF # monDom4 vs monDom4 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin BLASTZ=blastz.v7 BLASTZ_M=100 # TARGET: Opossum monDom4 SEQ1_DIR=/iscratch/i/monDom4/monDom4.2bit SEQ1_LEN=/iscratch/i/monDom4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: MonDom2 SEQ2_DIR=/iscratch/i/monDom4/monDom4.2bit SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzSelf.2006-03-02 TMPDIR=/scratch/tmp '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=10000 -chainLinearGap=medium \ `pwd`/DEF > blastz.out 2>&1 & # This was a difficult run, needed to do the last job on Kolossus: # Completed: 133955 of 133956 jobs # CPU time in finished jobs: 78602533s 1310042.22m 21834.04h 909.75d 2.492 y # IO & Wait Time: 47419875s 790331.25m 13172.19h 548.84d 1.504 y # Average job time: 941s 15.68m 0.26h 0.01d # Longest finished job: 111202s 1853.37m 30.89h 1.29d # Submission to last job: 521430s 8690.50m 144.84h 6.04d time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=10000 -chainLinearGap=medium \ -continue=cat `pwd`/DEF > cat.out 2>&1 & # broke during chaining, the processes are too large. Running # them individually on hgwdev64 and kolossus, they take many days # each, perhaps a week, still trying to get them done after two # weeks of messing with them. # last one took: # real 7343m55.064s # user 2270m9.968s # sys 5072m16.807s # that's 5 days and 2 hours, about 5 weeks for all: # -rw-rw-r-- 1 138817724 Mar 10 17:48 monDom4.2bit:chrX:.chain # -rw-rw-r-- 1 500083934 Mar 11 15:06 monDom4.2bit:chr7:.chain # -rw-rw-r-- 1 681900033 Mar 11 20:27 monDom4.2bit:chr5:.chain # -rw-rw-r-- 1 495478058 Mar 11 20:53 monDom4.2bit:chr6:.chain # -rw-rw-r-- 1 948724278 Mar 12 12:55 monDom4.2bit:chrUn:.chain # -rw-rw-r-- 1 668565678 Mar 12 18:03 monDom4.2bit:chr8:.chain # -rw-rw-r-- 1 1214431534 Mar 23 21:39 monDom4.2bit:chr2:.chain # -rw-rw-r-- 1 1755465687 Apr 9 19:31 monDom4.2bit:chr4:.chain # -rw-rw-r-- 1 1739564535 Apr 10 15:55 monDom4.2bit:chr1:.chain # -rw-rw-r-- 1 2041409562 Apr 16 13:58 monDom4.2bit:chr3:.chain # loading the tables on kolossus ssh kolossus cd /cluster/data/monDom4/bed/blastzSelf.2006-03-02/axtChai cat << '_EOF_' > loadUp.csh #!/bin/csh -fe cd /cluster/data/monDom4/bed/blastzSelf.2006-03-02/axtChain/chain foreach c (`awk '{print $1;}' /cluster/bluearc/scratch/hg/monDom4/chrom.sizes`) set f = $c.chain if (! -e $f) then echo no chains for $c set f = /dev/null endif hgLoadChain monDom4 ${c}_chainSelf $f end '_EOF_' # << emacs chmod +x loadUp.csh time ./loadUp.csh # real 211m16.852s # request push of *chainSelf* tables from kolossus to hgwdev # almost 32 Gb of data. ssh kolossus cd /cluster/data/monDom4/bed/blastzSelf.2006-03-02 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=10000 -chainLinearGap=medium \ -continue=net `pwd`/DEF > net.out 2>&1 & ############################################################################ ## BLASTZ swap from mm8 alignments (DONE - 2006-02-23 - Hiram) ssh pk cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > swap.out 2>&1 & time nice -n +19 featureBits monDom4 chainMm8Link # 210933035 bases of 3501643220 (6.024%) in intersection ############################################################################ ## BLASTZ CHICKEN galGal2 (DONE - 2006-03-06 - 2006-03-08 - Hiram) ssh pk mkdir /cluster/data/monDom4/bed/blastzGalGal2.2006-03-06 cd /cluster/data/monDom4/bed ln -s blastzGalGal2.2006-03-06 blastz.galGal2 cd blastzGalGal2.2006-03-06 cat << '_EOF_' > DEF # Opossum vs Chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Opossum monDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Chicken galGal2 - single chunk big enough for whole chroms at once SEQ2_DIR=/scratch/hg/galGal2/nib SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzGalGal2.2006-03-06 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # real 1399m47.238s mkdir /cluster/data/galGal2/bed/blastz.monDom4.swap cd /cluster/data/galGal2/bed ln -s blastz.monDom4.swap blastz.monDom4 cd blastz.monDom4.swap time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap /cluster/data/monDom4/bed/blastzGalGal2.2006-03-06/DEF \ > swap.out 2>&1 & featureBits monDom4 chainGalGal2Link > fb.monDom4.chainGalGal2Link 2>&1 cat fb.monDom4.chainGalGal2Link # 106807090 bases of 3501643220 (3.050%) in intersection featureBits galGal2 chainMonDom4Link > fb.galGal2.chainMonDom4Link 2>&1 cat fb.galGal2.chainMonDom4Link # 95852676 bases of 1054197620 (9.092%) in intersection ############################################################################ ## BLASTZ ZEBRAFISH danRer3 (DONE - 2006-03-06 - 2003-03-08 - Hiram) ssh pk mkdir /cluster/data/monDom4/bed/blastzDanRer3.2006-03-06 cd /cluster/data/monDom4/bed ln -s blastzDanRer3.2006-03-06 blastz.danRer3 cd blastzDanRer3.2006-03-06 # doing only the chroms on zebrafish, leaving out the chrUn and chrNA cat << '_EOF_' > DEF # Opossum vs Zebrafish export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Opossum monDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer3) # large enough chunk to do complete chroms at once SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes SEQ2_CHUNK=100000000 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzDanRer3.2006-03-06 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # common failure for unknown reason: # startStep: 0, at step 5 net to stopStep 9 # netChains: looks like previous stage was not successful # (can't find [monDom4.danRer3.]all.chain[.gz]). time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net `pwd`/DEF > net.out 2>&1 & mkdir /cluster/data/danRer3/bed/blastz.monDom4.swap cd /cluster/data/danRer3/bed ln -s blastz.monDom4.swap blastz.monDom4 cd blastz.monDom4.swap time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap /cluster/data/monDom4/bed/blastzDanRer3.2006-03-06/DEF \ > swap.out 2>&1 & featureBits monDom4 chainDanRer3Link > fb.monDom4.chainDanRer3Link 2>&1 cat fb.monDom4.chainDanRer3Link # 73187496 bases of 3501643220 (2.090%) in intersection featureBits danRer3 chainMonDom4Link > fb.danRer3.chainMonDom4Link 2>&1 cat fb.danRer3.chainMonDom4Link # 65225617 bases of 1630323462 (4.001%) in intersection ########################################################################## # MAKE Human Proteins track (WORKING - 2006-03-24 - Hiram) # we need the split500K sequences to do this work, set them up on # the Iservers ssh kkstore01 cd /cluster/data/monDom4/split500K rsync -a --progress `pwd`/ kkr1u00:/iscratch/i/monDom4/split500K/ ssh kkr1u00 cd /iscratch/i/monDom4/split500K # create individual blast db's for each sequence for C in 1 2 3 4 5 6 7 8 Un X do echo ${C} working ... cd ${C} for i in *.fa do /cluster/bluearc/blast229/formatdb -p F -i $i done cd .. echo ${C} done done find /iscratch/i/monDom4/split500K -name "*.nsq" \ | sed "s/\.nsq//" > target.lst # now sync everything to the other Iserver nodes for R in 2 3 4 5 6 7 8 do rsync -a --progress `pwd`/ kkr${R}u00:/iscratch/i/monDom4/split500K/ done ssh kkstore01 mkdir -p /cluster/data/monDom4/bed/tblastn.hg18KG cd /cluster/data/monDom4/bed/tblastn.hg18KG mkdir kgfa # calculate number of protein sequence to give a job count of about # 500,000 echo `cat /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | wc -l` \ `cat /iscratch/i/monDom4/split500K/target.lst | wc -l` \ | awk '{printf "%d/(500000/%d) = %d\n", $1, $2, $1/(500000/$2)}' # 36727/(500000/7170) = 526 # split the proteins by that number split -l 526 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in kg??; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst # Want output to go to bluearc to preserve sanity of file server rm -rf /cluster/bluearc/monDom4/bed/tblastn.hg18KG/blastOut mkdir -p /cluster/bluearc/monDom4/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/monDom4/bed/tblastn.hg18KG/blastOut . for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done ssh kk cd /cluster/data/monDom4/bed/tblastn.hg18KG cat /iscratch/i/monDom4/split500K/*.lft > subChr.lft cat << '_EOF_' > template #LOOP blastSome $(path1) $(path2) {check out exists blastOut/$(root2)/q.$(root1).psl} #ENDLOOP '_EOF_' # << happy emacs #### !NOTE: Next time run these jobs with a better separation # of I/O spaces, they seem to hangup bluearc while they work. # Plus, they are using /tmp instead of /scratch/tmp/ # And they are leaving lots of garbage, the $f.3 file. # Have added $f.3 to the listing below to remove it cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/monDom4/bed/tblastn.hg18KG/subChr.lft carry $f.3 mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 exit 0 fi fi rm -f $f.1 $f.2 $f.3 $3.tmp $f.8 exit 1 '_EOF_' # << happy emacs chmod +x blastSome cp -p /iscratch/i/monDom4/split500K/target.lst . gensub2 target.lst kg.lst template jobList # You want to try 10 jobs and see if the following chain # pipeline will work in a reasonable time. If it doesn't the # jobs are too big. There were some hippos in the simpleChain # step. para create jobList para try para push # Completed: 504218 of 504218 jobs # CPU time in finished jobs: 28863259s 481054.31m 8018.57h 334.07d 0.915 y # IO & Wait Time: 4255045s 70918.42m 1181.96h 49.25d 0.135 y # Average job time: 66s 1.09m 0.02h 0.00d # Longest finished job: 443s 7.38m 0.12h 0.01d # Submission to last job: 166648s 2777.47m 46.29h 1.93d # This simpleChain step has jobs that are too large. They could # be broken up even more as was done for the hippos that remained, # see below. ssh kk cd /cluster/data/monDom4/bed/tblastn.hg18KG mkdir chainEm cd /cluster/data/monDom4/bed/tblastn.hg18KG/blastOut find . -name "*.psl" \ > /cluster/data/monDom4/bed/tblastn.hg18KG/chainEm/chain.lst cd chainEm mkdir psl cat << '_EOF_' > template #LOOP ./chainSome.csh $(root1) $(file2) {check out line psl/c.$(file2).$(root1).psl} #ENDLOOP '_EOF_' # << happy emacs # you will have to build this simpleChain binary in # hg/mouseStuff/simpleChain/ # it will not run compiled as an x86_64 binary, only i386 cat << '_EOF_' > chainSome.csh #!/bin/csh -fe cat ../blastOut/${1}/q.${2}_*.psl | \ $HOME/bin/i386/simpleChain \ -prot -outPsl -maxGap=250000 stdin psl/c.${2}.${1}.psl '_EOF_' # << happy emacs chmod +x chainSome.csh ls -1dS ../blastOut/kg?? > chain.dirs gensub2 chain.dirs ../../../chrom.lst template jobList para create jobList para push # Completed: 574 of 699 jobs # CPU time in finished jobs: 17708635s 295143.92m 4919.07h 204.96d 0.562 y # IO & Wait Time: 1589482s 26491.37m 441.52h 18.40d 0.050 y # Average job time: 33620s 560.34m 9.34h 0.39d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 222754s 3712.57m 61.88h 2.58d # Submission to last job: 673020s 11217.00m 186.95h 7.79d # Some of these ended up as hippos after a few days. # Rebuild a new batch for just the hippos ssh kk cd /cluster/store9/monDom4/bed/tblastn.hg18KG/chainEm mkdir runHippos mkdir runHippos/psl # from the running hippos, extract the commands: para running | grep command: > splitHippos.sh # edit splitHippos.sh to make those lines read: # ./mkHippoList.csh kgaa chr1 # where mkHippoList.csh is: # #!/bin/csh -fe # ls ../blastOut/${1}/q.${2}_*.psl # run that: ./splitHippos.sh > hippo.list # split by chrom grep chr1_ hippo.list > chr1.list grep chr2_ hippo.list > chr2.list grep chr3_ hippo.list > chr3.list grep chr4_ hippo.list > chr4.list # these were the only chroms in the list, then # The sequence below does not produce the correct result. # Instead, break up all protein results for each chrom to # individual files. ssh hgwdev cd /cluster/store9/monDom4/bed/tblastn.hg18KG/chainEm # Do these one at a time since they create huge amounts of data in # /scratch/tmp - after each one has been sent to kkr1u00, the # data here on hgwdev can be deleted cat chr1.list | xargs cat > /scratch/tmp/chr1.psl cd /scratch/tmp cat << '_EOF_' > splitProteins.pl #!/usr/bin/env perl use strict; use warnings; sub usage() { print "usage: sort -k10 chrN.psl | ./splitProteins.pl \n"; print " where chrN is : chr1, chr2, chr3, etc...\n"; print " one at a time please.\n"; exit 255; } my $argc = scalar(@ARGV); if ($argc != 1) { usage; } my $chr = shift; print "mkdir -p $chr", "\n"; print `mkdir -p $chr`, "\n"; my $curProtein=""; my $linesRead = 0; while (my $line=) { ++$linesRead; my ($col1, $col2, $col3, $col4, $col5, $col6, $col7, $col8, $col9, $col10, $rest) = split('\t',$line); if ($curProtein ne $col10) { if (length($curProtein) > 0) { close(FH); } open(FH,">$chr/$col10.psl") or die "can not open $chr/$col10.psl"; $curProtein = $col10; } print FH $line; if (0 == ($linesRead % 1000)) {print ".";} if (0 == ($linesRead % 70000)) {print " $linesRead \n";} } close(FH); '_EOF_' # << happy emacs chmod +x ./splitProteins.pl sort -k10 chr1.psl | ./splitProteins.pl chr1 rsync -a --progress ./chr1/ kkr1u00:/iscratch/i/monDom4/simpleChain/chr1/ # After the four chroms are there, sync to all I servers ssh kkr1u00 cd /iscratch/i/monDom4/simpleChain for R in 2 3 4 5 6 7 8 do rsync -a --progress --stats `pwd`/ \ kkr${R}u00:/iscratch/i/monDom4/simpleChain/ done # # while we are here, make up the jobList cat << '_EOF_' > mkJobList #!/bin/sh for C in 1 2 3 4 do M=0 N="00" find ./chr${C} -type f | while read F do BN=`basename ${F}` echo "./oneChained.csh chr${C} ${BN} ${N} (check line+ psl/chr${C}/${N}/${BN})" M=`echo $M | awk '{printf "%d", $1+1}'` if [ "${M}" -gt 999 ]; then N=`echo ${N} | awk '{printf "%02d", $1+1}'` M=0 fi done done '_EOF_' # << happy emacs chmod +x mkJobList ./mkJobList > jobList mkdir /cluster/data/monDom4/bed/tblastn.hg18KG/hippoRun cp jobList /cluster/data/monDom4/bed/tblastn.hg18KG/hippoRun ssh kk cd /cluster/data/monDom4/bed/tblastn.hg18KG/hippoRun mkdir psl cat << '_EOF_' > oneChained.csh #!/bin/csh -fe mkdir -p psl/${1}/${3} cat /iscratch/i/monDom4/simpleChain/${1}/${2} | \ $HOME/bin/i386/simpleChain \ -prot -outPsl -maxGap=250000 stdin psl/${1}/${3}/${2} '_EOF_' # << happy emacs # XXXX this sequence did not work right, it breaks up individual # XXXX protein results # these were the only chroms in the list, then mkdir chr1 chr2 chr3 chr4 cd chr1 split -a 3 -l 100 ../chr1.list chr1 cd ../chr2 split -a 3 -l 100 ../chr2.list chr2 cd ../chr3 split -a 3 -l 100 ../chr3.list chr3 cd ../chr4 split -a 3 -l 100 ../chr4.list chr4 cd ../runHippos cat << '_EOF_' > mkJobList.sh #!/bin/sh for C in 1 2 3 4 do for F in `(cd ../chr${C}; ls chr${C}*)` do echo \ "./chainHippos.csh ${F} chr${C} {check out line psl/c.chr${C}.${F}.psl}" done done '_EOF_' # << happy emacs chmod +x ./mkJobList.sh ./mkJobList.sh > jobList cat << '_EOF_' > chainHippos.csh #!/bin/csh -fe sed -e "s/^/..\//" ../${2}/${1} | xargs cat | \ $HOME/bin/i386/simpleChain \ -prot -outPsl -maxGap=250000 stdin psl/c.${2}.${1}.psl '_EOF_' # << happy emacs chmod +x chainHippos.sh para create jobList para try ... check ... etc .... ###### Continuing with the concatenation of all the results ssh kkstore04 # running out of disk space on store9, create a parallel hierarchy # on a different disk still on this server: mkdir -p /cluster/store8/monDom4/bed/tblastn.hg18KG/sortingResults cd /cluster/data/monDom4/bed/tblastn.hg18KG ln -s /cluster/store8/monDom4/bed/tblastn.hg18KG/sortingResults \ sortingResults # first set of results was in blastOut cd blastOut time for i in kg?? do find ./$i -name "q.*.psl" | xargs cat \ | awk '($13 - $12)/$11 > 0.6 {print}' \ > /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.$i.psl sort -rn \ /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.$i.psl \ | pslUniq stdin \ /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/u.$i.psl awk '(($1 / $11) ) > 0.60 {print}' \ /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.$i.psl \ > /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/m60.$i.psl echo $i done # real 199m44.441s # And then, adding the hippo results to those: cd ../hippoRun/psl time for C in 1 2 3 4 do cd chr${C} for D in * do find ./${D} -name "*.psl" | xargs cat \ | awk '($13 - $12)/$11 > 0.6 {print}' \ > /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.kg0${C}_${D}.psl sort -rn \ /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.kg0${C}_${D}.psl \ | pslUniq stdin \ /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/u.kg0${C}_${D}.psl awk '(($1 / $11) ) > 0.60 {print}' \ /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.kg0${C}_${D}.psl \ > /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/m60.kg0${C}_${D}.psl echo chr${C}/${D} done cd .. done # real 14m49.721s cd ../../sortingResults sort -u -k 14,14 -k 16,16n -k 18,18n u.*.psl m60* \ > /cluster/data/monDom4/bed/tblastn.hg18KG/blastHg18KG.psl cd .. wc -l blastHg18KG.psl # 63264 blastHg18KG.psl # what is this for ? cat kgfa/*fa | grep ">" | wc # 309494 309494 4810327 ssh hgwdev cd /cluster/data/monDom4/bed/tblastn.hg18KG hgLoadPsl monDom4 blastHg18KG.psl time nice -n +19 featureBits monDom4 blastHg18KG # 19424941 bases of 3501643220 (0.555%) in intersection # back to kksilo rm -rf blastOut # End tblastn ############################################################################ # SWAP CHAINS/NET RN4 (DONE 3/28/06 angie) ssh kkstore01 mkdir /cluster/data/monDom4/bed/blastz.rn4.swap cd /cluster/data/monDom4/bed/blastz.rn4.swap doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.monDom4/DEF \ -workhorse kkr8u00 >& do.log & tail -f do.log ln -s blastz.rn4.swap /cluster/data/monDom4/bed/blastz.rn4 nice featureBits monDom4 chainRn4Link # 198094619 bases of 3501643220 (5.657%) in intersection ####################################################################### # CHIMP BLASTZ - (DONE - 2006-03-29 - 2006-04-10 - Hiram) ssh pk mkdir /cluster/data/monDom4/bed/blastzPanTro2.2006-03-29 cd /cluster/data/monDom4/bed ln -s blastzPanTro2.2006-03-29 blastz.panTro2 cd blastzPanTro2.2006-03-29 cat << '_EOF_' > DEF # opossum vs. chimp export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Opossum monDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chimp PanTro2 SEQ2_DIR=/scratch/hg/panTro2/nib SEQ2_LEN=/scratch/hg/panTro2/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzPanTro2.2006-03-29 TMPDIR=/scratch/tmp '_EOF_' # << emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # XXXX running 2006-03-29 # 2 chroms crashed out-of-mem, so run manually on kolossus ssh kolossus cd /cluster/data/monDom4/bed/blastz.panTro2/axtChain/run (chain.csh monDom4.2bit:chr4: chain/monDom4.2bit:chr4:.chain; \ chain.csh monDom4.2bit:chr3: chain/monDom4.2bit:chr3:.chain) > chain.log & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=chainMerge \ `pwd`/DEF > blastz.chainMerge.out 2>&1 & # failed during load due to mysql restart time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=load \ `pwd`/DEF > blastz.load.out 2>&1 & # SWAP ALIGNMENTS /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -swap \ -workhorse=kolossus \ `pwd`/DEF > swap.log 2>&1 & # failed on file perm change on liftOver file owned by Hiram rm /cluster/data/monDom4/bed/liftOver/monDom4ToPanTro2.over.chain.gz rm /cluster/data/panTro2/bed/blastz.monDom4.swap/axtChain/noClass.net /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -swap \ -workhorse=kolossus \ -continue=net \ `pwd`/DEF > swap.net.log 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF > swap.out 2>&1 & # XXXX running 2006-04-04 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net -swap `pwd`/DEF > swap.net.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=load -swap `pwd`/DEF > swap.load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=cleanup -swap `pwd`/DEF > swap.cleanup.out 2>&1 & time nice -n +19 featureBits panTro2 chainMonDom4Link \ > fb.panTro2.chainMonDom4Link 2>&1 & # BLASTZ/CHAIN/NET XENTRO2 (DONE 4/22/06 angie) ssh kkstore04 mkdir /cluster/data/monDom4/bed/blastz.xenTro2.2006-04-20 cd /cluster/data/monDom4/bed/blastz.xenTro2.2006-04-20 cat << '_EOF_' > DEF # opossum vs. frog BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 # Use same params as used for mammal-xenTro1 (see makeXenTro1.doc) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Opossum monDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Frog xenTro2 - single chunk big enough to run two of the # largest scaffolds in one job SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/monDom4/bed/blastz.xenTro2.2006-04-20 '_EOF_' # << emacs doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/monDom4XenTro2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \ >& do.log & tail -f do.log ln -s blastz.xenTro2.2006-04-20 /cluster/data/monDom4/bed/blastz.xenTro2 ssh kolossus time nice -n +19 featureBits monDom4 chainXenTro2Link \ > fb.monDom4.chainXenTro2Link 2>&1 & cat fb.monDom4.chainXenTro2Link # 8160051 bases of 3501643220 (2.232%) in intersection ########################################################################### # CREATE OPOSSUM AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS DIRECTORY # AND NIB DIRECTORY ON THE SAN FOR BLASTZ CLUSTER RUNS # (DONE, 2006-04-26, hartera) # If there are no lineage-specific repeats for opossum and other species # then use all repeats. ssh pk mkdir -p /san/sanvol1/scratch/monDom4/linSpecRep.notInOthers foreach f (/cluster/data/monDom4/*/chr*.fa.out) cp -p $f \ /san/sanvol1/scratch/monDom4/linSpecRep.notInOthers/$f:t:r:r.out.spec end # need nib files when running Blastz with lineage-specific repeats # so create these and move to the san. ssh kkstore04 mkdir -p /cluster/data/monDom4/nib cd /cluster/data/monDom4 foreach f (*/chr*.fa) echo "Processing $f ..." nice faToNib -softMask $f nib/$f:t:r.nib end # move nibs to the san as taking up a lot of space on /cluster/data/ ssh pk mv /cluster/data/monDom4/nib /san/sanvol1/scratch/monDom4/ ########################################################################## # QUALITY (DONE - 2006-04-27 - 2006-04-28 - Hiram) ssh kkstore04 # this is actually a symlink to store8 to distribute the data on # the full filesystems on kkstore04 mkdir /cluster/data/monDom4/bed/quality cd /cluster/data/monDom4/bed/quality cp -p /cluster/data/monDom2/bed/quality/qaToWigEncode.pl . # using perl script from previous build # Update the file name in this file: my $AGP_FILE="/cluster/data/monDom4/broad.mit.edu/Monodelphis4.0.agp"; # convert the qual.fa to wiggle single column input and then wigEncode: time nice -n +19 \ zcat /cluster/data/monDom4/broad.mit.edu/Monodelphis4.0.agp.chromosome.qual.gz \ | ./qaToWigEncode.pl /dev/stdin \ | $HOME/bin/$MACHTYPE/wigEncode -noOverlap \ stdin quality.wig quality.wib # Converted stdin, upper limit 63.00, lower limit 0.00 # real 96m57.628s # Resulting files: ls -og quality.wi? # -rw-rw-r-- 1 3501643220 Apr 27 18:16 quality.wib # -rw-rw-r-- 1 322734978 Apr 27 18:16 quality.wig # Note how the byte size of the .wib file is exactly equal # to the number of bases in a featureBits run without -countGaps # from the rmsk featureBits above: # 1814099592 bases of 3501643220 (51.807%) in intersection # This means we have a valid quality score for every base that is # not in a gap, which is what we want. If this doesn't work out, # you can turn this wiggle track into a bed file for each # contiguous run of data, and then intersect those bed files with # the gap track to see where they overlap. This happened this # time when a new type of gap "clone" was found to be defined in # the agp file. Previously there was only "fragment". The perl # script had to be altered to recognize this new type of gap. # For example, this could be done after the table was loaded: ssh kolossus cd /tmp awk '{print $1}' /cluster/data/monDom4/chrom.sizes | while read C do hgWiggle -doBed -db=monDom4 -chr=${C} quality > ${C}.qual.bed done # then to intersect: for C in 1 2 3 4 5 6 7 8 X Un do featureBits -chrom=chr${C} monDom4 gap chr${C}.qual.bed done # should all be zero ssh hgwdev cd /cluster/data/monDom4/bed/quality ln -s `pwd`/quality.wib /gbdb/monDom4/wib time nice -n +19 hgLoadWiggle monDom4 quality quality.wig # real 3m29.459s # verify index: hgsql -e "show index from quality;" monDom4 # check that Cardinality is a number and is not NULL #################################################################### ### swap Hg18 blastz (DONE - 2006-04-28 - Hiram) ### ssh pk mkdir /cluster/data/monDom4/bed/blastz.hg18.swap cd /cluster/data/monDom4/bed ln -s blastz.hg18.swap blastz.hg18 cd blastz.hg18.swap # The original DEF file had iscratch directories for the monDom4 # sequence, for this one we need san directories. This is a copy # of the original one with the SEQ2_DIR and _LEN changed: cat << '_EOF_' > DEF # human vs. opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin BLASTZ=blastz.v7 # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom4 SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzMonDom4.2006-02-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs cd /cluster/data/monDom4/bed/blastz.hg18.swap time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap DEF > swap.out 2>&1 & # real 118m44.093s ssh kolossus cd /cluster/data/monDom4/bed/blastz.hg18.swap time nice -n +19 featureBits monDom4 chainHg18Link \ > fb.monDom4.chainHg18Link 2>&1 # 351100848 bases of 3501643220 (10.027%) in intersection ####################################################################### ### 7-Way conservation (DONE - 2006-04-28 - 2006-05-02 - Hiram) ssh hgwdev mkdir /cluster/data/monDom4/bed/multiz7way cd /cluster/data/monDom4/bed/multiz7way cp /cluster/data/mm8/bed/multiz17way/17way.nh . /cluster/bin/phast/tree_doctor --prune \ chimp_panTro2,macaque_rheMac2,rabbit_oryCun1,cow_bosTau2,dog_canFam2,armadillo_dasNov1,elephant_loxAfr1,tenrec_echTel1,tetraodon_tetNig1,fugu_fr1 \ 17way.nh > 7way.nh # this leaves the tree we are working with here: (((((human_hg18:0.054922, (rat_rn4:0.081728,mouse_mm8:0.077017):0.335773):0.285925, monodelphis_monDom4:0.371073):0.189124, chicken_galGal2:0.454691):0.123297, xenopus_xenTro2:0.782453):0.156067, zebrafish_danRer3:0.938628); /cluster/bin/phast/draw_tree 7way.nh > 7way.ps /cluster/bin/phast/all_dists 7way.nh > 7way.distances.txt grep -y monDom4 7way.distances.txt | sort -k3,3n # Print out that file for reference, and use the calculated # distances in the table below to order the organisms and check # the button order on the browser. Zebrafish ends up before # tetraodon and fugu on the browser despite its distance. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. # # featureBits chainLink measures # chainMonDom4Link chain linearGap # distance on MonDom4 on other minScore # 1 0.711920 - human hg18 (% 10.027) (% 12.385) 5000 loose # 2 1.014888 - chicken galGal2 (% 03.050) (% 09.092) 5000 loose # 3 1.069788 - mouse mm8 (% 06.024) (% 08.245) 5000 loose # 4 1.074499 - rat rn4 (% 05.657) (% 07.784) 5000 loose # 5 1.465947 - frog xenTro2 (% 02.232) (no swap ) 5000 loose # 6 1.778189 - zebrafish danRer3 (% 02.090) (% 04.001) 5000 loose cd /cluster/data/monDom4/bed/multiz7way # bash shell syntax here ... export H=/cluster/data/monDom4/bed mkdir mafLinks for G in hg18 galGal2 mm8 rn4 xenTro2 danRer3 do mkdir mafLinks/$G if [ ! -d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" fi ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # Usually these MAFs are copied to the san for a pk kluster run. # in this case they are small enough that there is no need to copy # them anywhere. The alignment can be done right here on the # kkstore04 filesystem. # Copy MAFs to some appropriate NFS server for kluster run # ssh kkstore04 # mkdir /san/sanvol1/scratch/monDom4/multiz7way # cd /san/sanvol1/scratch/monDom4/multiz7way # time rsync -a --copy-links --progress \ # /cluster/data/monDom4/bed/multiz7way/mafLinks/ . # We have about 734 Mb of data here, takes ~ 30 seconds to copy # the autoMultiz cluster run ssh kk mkdir /cluster/data/monDom4/bed/multiz7way/autoMultiz cd /cluster/data/monDom4/bed/multiz7way/autoMultiz mkdir penn # cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn # cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn # cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cp -p /cluster/bin/penn/multiz.v11.i386/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.i386/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.i386/multiz-tba/autoMZ penn # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ ../7way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir -p maf run cd run # NOTE: you need to set the db properly in this script # And the binDir and pairs dir cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = monDom4 set c = $1 set maf = $2 set binDir = /cluster/data/monDom4/bed/multiz7way/autoMultiz/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /cluster/data/monDom4/bed/multiz7way/mafLinks rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP autoMultiz $(root1) {check out line+ ../maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs awk '{print $1}' /cluster/data/monDom4/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # 10 jobs para try ... check ... push ... etc ... # Completed: 10 of 10 jobs # CPU time in finished jobs: 38783s 646.38m 10.77h 0.45d 0.001 y # IO & Wait Time: 1184s 19.73m 0.33h 0.01d 0.000 y # Average job time: 3997s 66.61m 1.11h 0.05d # Longest finished job: 8706s 145.10m 2.42h 0.10d # Submission to last job: 8706s 145.10m 2.42h 0.10d # combine results into a single file for loading and gbdb reference ssh kkstore04 cd /cluster/data/monDom4/bed/multiz7way # There used to be a mafFilter here with a minScore of 500, but it # turns out that the scores in these maf files are pretty much # useless. They range from very large negatives to very large # positives. time catDir maf > multiz7way.maf # real 1m18.849s # makes a 2 Gb file: # -rw-rw-r-- 1 2119554905 May 1 14:23 multiz7way.maf # Create per-chrom individual maf files for downloads ssh kkstore04 cd /cluster/data/monDom4/bed/multiz7way mkdir mafDownloads time for M in maf/chr*.maf do B=`basename $M` cp -p ${M} mafDownloads/${B} gzip mafDownloads/${B} echo ${B} done done # real 7m21.713s # deliver to downloads ssh hgwdev ln -s /cluster/data/monDom4/bed/multiz7way/mafDownloads \ /usr/local/apache/htdocs/goldenPath/monDom4/multiz7way # Load into database ssh hgwdev cd /cluster/data/monDom4/bed/multiz7way mkdir /gbdb/monDom4/multiz7way ln -s /cluster/data/monDom4/bed/multiz7way/multiz7way.maf \ /gbdb/monDom4/multiz7way time nice -n +19 hgLoadMaf monDom4 multiz7way # Loaded 2180653 mafs in 1 files from /gbdb/monDom4/multiz7way # real 3m35.280s time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 monDom4 multiz7waySummary multiz7way.maf # Created 1423996 summary blocks from 5463968 components and # 2180653 mafs from multiz7way.maf # real 5m40.531s # create tree image: cat << '_EOF_' > species.nh (((((human,(rat,mouse)),opossum),chicken),frog),zebrafish) '_EOF_' /cluster/bin/phast/draw_tree -b -s species.nh > species7.ps # photoshop to enhance, reduce the amount of whitespace to make it # smaller, then save as jpg ln -s /cluster/data/monDom4/bed/multiz7way/species7.jpg \ /usr/local/apache/htdocs/images/phylo/monDom4_7way.jpg # the chopping of whitespace can be done with Imagemagick # 'display' - use the "transform -> chop" menu, select horizontal # or vertical chop, highlight an area to cut ############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS # (DONE - 2006-05-01 - 2006-05-02 - Hiram) # Estimate phastCons parameters ssh kkstore04 mkdir /cluster/data/monDom4/bed/multiz7way/cons cd /cluster/data/monDom4/bed/multiz7way/cons # Create a starting-tree.mod based on chr1 (the largest one) time /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr1.maf \ --refseq ../../../1/chr1.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # real 5m50.239s /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "(((((hg18,(rn4,mm8)),monDom4),galGal2),xenTro2),danRer3)" \ --out-root starting-tree # real 1m11.568s rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.398 # This 0.398 is used in the --gc argument below # Create big bad bloated SS files on san filesystem (takes ~ 2h 20m) # Increasing their size this time from 1,000,000 to 10,000,000 to # slow down the phastCons pk jobs ssh kkstore04 mkdir -p /san/sanvol1/scratch/monDom4/cons/ss cd /san/sanvol1/scratch/monDom4/cons/ss time for C in `awk '{print $1}' /cluster/data/monDom4/chrom.sizes` do if [ -s /cluster/data/monDom4/bed/multiz7way/maf/${C}.maf ]; then mkdir ${C} echo msa_split $C chrN=${C/chr/} chrN=${chrN/_random/} /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/monDom4/bed/multiz7way/maf/${C}.maf \ --refseq /cluster/data/monDom4/${chrN}/${C}.fa \ --in-format MAF --windows 10000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done & # real 28m44.827s # Create a random list of 50 1 mb regions (do not use the _randoms) cd /san/sanvol1/scratch/monDom4/cons/ss ls -1l chr*/chr*.ss | grep -v random | \ awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list # Set up parasol directory to calculate trees on these 50 regions ssh kk mkdir /san/sanvol1/scratch/monDom4/cons/treeRun1 cd /san/sanvol1/scratch/monDom4/cons/treeRun1 mkdir tree log # Tuning this loop should come back to here to recalculate # Create little script that calls phastCons with right arguments # --target-coverage of 0.20 is about right for mouse, will be # tuned exactly below cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/monDom4/bed/multiz7way/cons/starting-tree.mod \ --gc 0.398 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-lengths 12 --target-coverage 0.17 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # Make cluster job and run it gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 50 of 50 jobs # CPU time in finished jobs: 38161s 636.02m 10.60h 0.44d 0.001 y # IO & Wait Time: 385s 6.41m 0.11h 0.00d 0.000 y # Average job time: 771s 12.85m 0.21h 0.01d # Longest finished job: 1672s 27.87m 0.46h 0.02d # Submission to last job: 1693s 28.22m 0.47h 0.02d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kkstore04 cd /san/sanvol1/scratch/monDom4/cons/treeRun1 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ../ave.cons.mod > cons_summary.txt ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ../ave.noncons.mod > noncons_summary.txt cd .. cp -p ave.*.mod /cluster/data/monDom4/bed/multiz7way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 --NH 9.78 \ ave.cons.mod ave.noncons.mod # ( Solving for new omega: 12.000000 9.952747 9.669713 9.663148 ) # Trans parameters: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068 # Relative entropy: H=1.090778 bits/site # Expected min. length: L_min=9.400348 sites # Expected max. length: L_max=6.597224 sites # Phylogenetic information threshold: PIT=L_min*H=10.253692 bits # Recommended expected length: omega=9.663148 sites (for L_min*H=9.780000) XXXX - working 2006-05-01 17:00 ### !!! *** This one with .17 and 12 is the one that was finally used # consEntropy .17 12 ave.cons.mod.4 ave.noncons.mod.4 # Transition params: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068 # Relative entropy: H=1.478838 bits/site # Required length: N=6.753382 sites # Total entropy: NH=9.987159 bits # SKIP to here passing by the tuning numbers ssh kk # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/monDom4/cons/consRun1 cd /san/sanvol1/scratch/monDom4/cons cp /san/sanvol1/scratch/mm8/cons/elliotsEncode.mod . # This file looks like: ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV TRAINING_LNL: -988246.132962 BACKGROUND: 0.295 0.205 0.205 0.295 RATE_MAT: -1.165221 0.315494 0.589884 0.259843 0.189778 -0.878194 0.208718 0.479698 0.444622 0.261535 -0.885604 0.179447 0.234867 0.720815 0.215191 -1.170872 TREE: (((((((((((((hg18:0.006690,panTro2:0.007571):0.024272,(colobus_monkey:0.015404,(baboon:0.008258,rheMac2:0.028617):0.008519):0.022120):0.023960,(dusky_titi:0.025662,(owl_monkey:0.012151,marmoset:0.029549):0.008236):0.027158):0.066101,(mouse_lemur:0.059024,galago:0.121375):0.032386):0.017073,((rn4:0.081728,monDom4:0.077017):0.229273,oryCun1:0.206767):0.023340):0.023026,(((bosTau2:0.159182,canFam2:0.147731):0.004946,rfbat:0.138877):0.010150,(hedgehog:0.193396,shrew:0.261724):0.054246):0.024354):0.028505,dasNov1:0.149862):0.015994,(loxAfr1:0.104891,echTel1:0.259797):0.040371):0.218400,monDom4:0.371073):0.065268,platypus:0.468116):0.123856,galGal2:0.454691):0.123297,xenTro2:0.782453):0.156067,((tetNig1:0.199381,fr1:0.239894):0.492961,danRer3:0.782561):0.156067); cd /san/sanvol1/scratch/monDom4/cons/consRun1 mkdir ppRaw bed # Create script to run phastCons with right parameters # These parameters: # --rho 0.28 --expected-length 14 --target-coverage 0.008 --quiet \ # were taken from Kate's 17-way in Hg17, including the # -not-informative panTro2 # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../elliotsEncode.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss elliotsEncode.mod \ --expected-lengths 12 --target-coverage 0.17 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score \ --require-informative 0 > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/elliotsEncode.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << happy emacs chmod a+x doPhast # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << '_EOF_' #LOOP doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. # These jobs are very fast and very I/O intensive, even on the san # they will hang it up as they work at full tilt. # Completed: 366 of 366 jobs # CPU time in finished jobs: 29300s 488.33m 8.14h 0.34d 0.001 y # IO & Wait Time: 19643s 327.39m 5.46h 0.23d 0.001 y # Average job time: 134s 2.23m 0.04h 0.00d # Longest finished job: 247s 4.12m 0.07h 0.00d # Submission to last job: 1880s 31.33m 0.52h 0.02d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/monDom4/cons/consRun1 # The sed's and the sort get the file names in chrom,start order find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/monDom4/bed/multiz7way # Figure out how much is actually covered by the bed files as so: # The 2583393846 comes from the non-n genome size, # from faSize on all chroms: ssh kkstore04 cd /cluster/data/monDom4 faSize ?{,?}/chr*.fa # 3605614649 bases (103971429 N's 3501643220 real 1551686450 upper # 1949956770 lower) in 10 sequences in 10 files cd /san/sanvol1/scratch/monDom4/cons/consRun1 awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/3501643220\n",100.0*sum/3501643220,sum}' \ mostConserved.bed # --expected-lengths 12 --target-coverage 0.17 --quiet \ # % 4.80 = 100.0*168149380/3501643220 ssh kolossus cd /san/sanvol1/scratch/monDom4/cons/consRun1 # Aiming for %70 coverage in # the following featureBits measurement on CDS: # Beware of negative scores when too high. The logToBedScore # will output an error on any negative scores. # I don't think this will work for opossum because we don't have # enough refGene:cds coverage: time nice -n +19 featureBits monDom4 refGene:cds # 34857 bases of 3501643220 (0.001%) in intersection HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits monDom4 \ -enrichment refGene:cds mostConserved.bed # --expected-lengths 12 --target-coverage 0.17 --quiet \ # refGene:cds 0.001%, mostConserved.bed 4.802%, both 0.001%, cover # 75.13%, enrich 15.65x # Load most conserved track into database ssh hgwdev cd /cluster/data/monDom4/bed/multiz7way time nice -n +19 hgLoadBed -strict monDom4 \ phastConsElements7way mostConserved.bed # real 2m31.673s # Loaded 1882640 elements of size 5 # should measure the same as above time nice -n +19 featureBits monDom4 -enrichment refGene:cds \ phastConsElements7way # refGene:cds 0.001%, phastConsElements7way 4.802%, both 0.001%, # cover 75.13%, enrich 15.65x # Create merged posterier probability file and wiggle track data files ssh kkstore04 cd /san/sanvol1/scratch/monDom4/cons/consRun1 # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode # You will want to test this to make sure it is sorting correctly, # run this command and see if everything is in order by chrom and # position: find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less time nice -n +19 find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \ phastCons7.wig phastCons7.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 4m4.694s # -rw-rw-r-- 1 465425363 May 2 12:42 phastCons7.wib # -rw-rw-r-- 1 79633105 May 2 12:42 phastCons7.wig cp -p phastCons7.wi? /cluster/data/monDom4/bed/multiz7way/ # prepare compressed copy of ascii data values for downloads ssh kkstore04 cd /san/sanvol1/scratch/monDom4/cons/consRun1 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons7Scores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons7Scores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 8m10.959s # copy them for downloads ssh kkstore04 mkdir /cluster/data/monDom4/bed/multiz7way/phastCons7Scores cd /cluster/data/monDom4/bed/multiz7way/phastCons7Scores cp -p /san/sanvol1/scratch/monDom4/cons/consRun1/phastCons7Scores/* . ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/monDom4 ln -s /cluster/data/monDom4/bed/multiz7way/phastCons7Scores . # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/monDom4/bed/multiz7way ln -s `pwd`/phastCons7.wib /gbdb/monDom4/wib/phastCons7.wib time nice -n +19 hgLoadWiggle monDom4 phastCons7 phastCons7.wig # real 0m52.109s # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/monDom4/bed/multiz7way time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=monDom4 phastCons7 > histogram.data 2>&1 # real 9m8.075s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Opossum MonDom4 Histogram phastCons7 track" set xlabel " phastCons7 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### # Build maf annotation for multiz7way (DONE - 2006-07-31 - Hiram) ssh kolossus cd /cluster/data/monDom4/bed/multiz7way mkdir anno cd anno mkdir maf run cd run rm sizes nBeds for i in `cat /cluster/data/monDom4/bed/multiz7way/species.lst` do ln -s /cluster/data/$i/chrom.sizes $i.len ln -s /cluster/data/$i/$i.N.bed $i.bed echo $i.bed >> nBeds echo $i.len >> sizes done echo "#!/bin/csh -fe" > jobs.csh echo date >> jobs.csh # do smaller jobs first, the 'ls -rS' lists in reverse order by size for i in `ls -rS ../../maf/*.maf` do echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/monDom4/monDom4.2bit ../maf/`basename $i` >> jobs.csh echo "echo $i" >> jobs.csh done echo date >> jobs.csh chmod +x jobs.csh # establish a screen for this long running job screen cd /cluster/data/monDom4/bed/multiz7way time ./jobs.csh > jobs.log 2>&1 # real 195m54.071s # user 57m51.373s # sys 128m25.386s # ctrl-a ctrl-d to release # to return to the screen: screen -r -d cd /cluster/data/monDom4/bed/multiz7way/anno/maf mkdir -p /gbdb/monDom4/multiz7way/anno/maf ln -s /cluster/data/monDom4/bed/multiz7way/anno/maf/*.maf \ /gbdb/monDom4/multiz7way/anno/maf time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/monDom4/multiz7way/anno/maf \ monDom4 multiz7way # Loaded 2781002 mafs in 10 files from /gbdb/monDom4/multiz7way/anno/maf # real 1m49.236s time cat *.maf \ | nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 monDom4 multiz7waySummary stdin # Created 1423996 summary blocks from 5463968 components and 2781002 mafs # from stdin # real 2m30.875s ####################################################################### # MULTIZ7WAY MAF FRAMES (DONE - 2006-08-01 - Hiram) ssh hgwdev mkdir /cluster/data/monDom4/bed/multiz7way/frames cd /cluster/data/monDom4/bed/multiz7way/frames # The following is adapted from the sequence in mm8.txt #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using mrna table as genes: mkdir genes for qDB in danRer3 do tmpExt=`mktemp temp.XXXXXX` tmpMrnaCds=${qDB}.mrna-cds.${tmpExt} tmpMrna=${qDB}.mrna.${tmpExt} tmpCds=${qDB}.cds.${tmpExt} echo $qDB hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ ${qDB} > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \ stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz rm -f $tmpExt done # using knownGene for rn4 mm8 hg18 # using refGene for galGal2 # using mgcGenes for xenTro2 # using ensGene for monDom4 # genePreds; (must keep only the first 10 columns for knownGene) for qDB in rn4 mm8 hg18 galGal2 xenTro2 monDom4 do if [ $qDB = "xenTro2" ]; then geneTbl=mgcGenes elif [ $qDB = "galGal2" ]; then geneTbl=refGene elif [ $qDB = "monDom4" ]; then geneTbl=ensGene else geneTbl=knownGene fi echo hgsql -N -e 'select * from '"$geneTbl ${qDB}" hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz rm -f $tmpExt done #------------------------------------------------------------------------ # create frames ssh kkstore04 cd /cluster/data/monDom4/bed/multiz7way/frames rm mkMafFrames.sh for C in `awk '{print $1;}' /cluster/data/monDom4/chrom.sizes` do echo "echo mafFrames/${C}.maf" echo genePredToMafFrames monDom4 ../mafDownloads/${C}.maf.gz \ mafFrames/${C}.maf \ monDom4 genes/monDom4.gp.gz \ hg18 genes/hg18.gp.gz \ mm8 genes/mm8.gp.gz \ rn4 genes/rn4.gp.gz \ galGal2 genes/galGal2.gp.gz \ danRer3 genes/danRer3.gp.gz \ xenTro2 genes/xenTro2.gp.gz done > mkMafFrames.sh chmod +x mkMafFrames.sh time nice -n +19 ./mkMafFrames.sh # real 1m10.711s gzip mafFrames/*.maf #------------------------------------------------------------------------ # load the database ssh hgwdev cd /cluster/data/monDom4/bed/multiz7way/frames time nice -n +19 hgLoadMafFrames monDom4 multiz7wayFrames \ mafFrames/*.maf.gz # real 0m17.864s # The use of this table requires an entry in the trackDb.ra for this: # frames multiz7wayFrames # irows on ########################################################################### # BLASTZ SWAP OF ZEBRAFISH (danRer4) CHAIN AND NET ALIGNMENTS OVER TO # monDom4 AND CREATE AXNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS # (DONE, 2006-04-29, hartera) # see also makeDanRer4.doc # Remove test tables and directories and start again: ssh hgwdev cd /cluster/data/monDom4 foreach c (`cat chrom.lst`) hgsql -e "drop table chr${c}_chainDanRer4;" monDom4 hgsql -e "drop table chr${c}_chainDanRer4Link;" monDom4 hgsql -e "drop table chr${c}_chainDanRer4NoDyMsk;" monDom4 hgsql -e "drop table chr${c}_chainDanRer4NoDyMskLink;" monDom4 end hgsql -e "drop table netDanRer4;" monDom4 hgsql -e "drop table netDanRer4NoDyMsk;" monDom4 # remove downloads rm -r /usr/local/apache/htdocs/goldenPath/monDom4/vsDanRer4 rm \ /usr/local/apache/htdocs/goldenPath/monDom4/liftOver/monDom4ToDanRer4.over.chain.gz rm /cluster/data/monDom4/bed/liftOver/monDom4ToDanRer4.over.chain.gz # remove old Blastz swap rm -r /cluster/data/monDom4/bed/blastz.danRer4.swap # remove link to old blastz directory rm -r /cluster/data/monDom4/bed/blastz.danRer4 # Used the following BLASTZ parameters: # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=6000 # BLASTZ_K=2200 # BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q # Used all repeats as lineage-specific repeats. # Swap directory is: /cluster/data/monDom4/bed/blastz.danRer4.swap ssh pk cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & featureBits monDom4 chainDanRer4Link # 62249730 bases of 3501643220 (1.778%) in intersection featureBits monDom4 chainDanRer3Link # 73187496 bases of 3501643220 (2.090%) in intersection featureBits monDom4 netDanRer4 # 745122163 bases of 3501643220 (21.279%) in intersection featureBits monDom4 netDanRer3 # 672446285 bases of 3501643220 (19.204%) in intersection featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer4Link -enrichment # refGene:cds 0.001%, chainDanRer4Link 1.807%, both 0.000%, # cover 43.85%, enrich 24.27x featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer3Link -enrichment # refGene:cds 0.001%, chainDanRer3Link 2.044%, both 0.000%, # cover 49.24%, enrich 24.09x # Only 36 RefSeqs in monDom4 refGene so try xenoRefGene (78707 sequences) # and mrna (174 sequences): featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer4Link -enrichment # xenoRefGene:cds 0.820%, chainDanRer4Link 1.807%, both 0.661%, # cover 80.63%, enrich 44.63x featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer3Link -enrichment # xenoRefGene:cds 0.820%, chainDanRer3Link 2.044%, both 0.577%, # cover 70.45%, enrich 34.47x featureBits -chrom=chr1 monDom4 mrna chainDanRer4Link -enrichment # mrna 0.004%, chainDanRer4Link 1.807%, both 0.002%, # cover 52.67%, enrich 29.15x featureBits -chrom=chr1 monDom4 mrna chainDanRer3Link -enrichment # mrna 0.004%, chainDanRer3Link 2.044%, both 0.002%, # cover 54.50%, enrich 26.66x featureBits -chrom=chr1 monDom4 xenoRefGene:cds netDanRer4 -enrichment # xenoRefGene:cds 0.820%, netDanRer4 24.539%, both 0.720%, # cover 87.79%, enrich 3.58x featureBits -chrom=chr1 monDom4 xenoRefGene:cds netDanRer3 -enrichment # xenoRefGene:cds 0.820%, netDanRer3 22.272%, both 0.640%, # cover 78.12%, enrich 3.51x featureBits -chrom=chr1 monDom4 mrna netDanRer4 -enrichment # mrna 0.004%, netDanRer4 24.539%, both 0.002%, cover 56.90%, enrich 2.32x featureBits -chrom=chr1 monDom4 mrna netDanRer3 -enrichment # mrna 0.004%, netDanRer3 22.272%, both 0.002%, cover 62.62%, enrich 2.81x ########################################################################## # PRODUCING GENSCAN PREDICTIONS (WORKING - 2006-05-02 - Hiram) # Want to make up a chrom and contig 2bit file to get proper # names on the genscan predictions. ssh kkstore04 cd /cluster/data/monDom4/Un grep chrUn ../broad.mit.edu/Monodelphis4.0.agp > chrUn.broad.agp cat chrUn.broad.agp | /cluster/bin/scripts/agpToLift > chrUn_random.lft $HOME/kent/src/hg/utils/lft2BitToFa.pl ../monDom4.2bit chrUn_random.lft \ > chrUn_random.ctg.fa # Create a 2bit file with the full chrom sequences and the # random contigs, all hard masked cat ?/chr?.fa Un/chrUn_random.ctg.fa \ | maskOutFa stdin hard stdout \ | faToTwoBit stdin monDom4Chroms_RandomContigs.hard.2bit # make sure it still has all the unmasked sequence in it: twoBitToFa monDom4Chroms_RandomContigs.hard.2bit stdout \ | faSize stdin # 3589973501 bases (2038287051 N's 1551686450 real 1551686450 # upper 0 lower) in 9949 sequences in 1 files twoBitToFa monDom4.2bit stdout | faSize stdin # 3605614649 bases (103971429 N's 3501643220 real 1551686450 upper # 1949956770 lower) in 10 sequences in 1 files # note the 'real' bases are the same, the lowers have become N's # 1089350685 + 97171400 = 1186522085 # 1949956770 + 103971429 = 2053928199 # 2053928199 - 2038287051 = 15641148 == N's in gaps between contigs # There are a lot of contigs that became completely N's with no # sequence left. genscan doesn't like that. We need to get them # out of the analysis. Pick out the ones with over 18 real bases # left: twoBitToFa monDom4Chroms_RandomContigs.hard.2bit stdout \ | faCount stdin > chroms_randoms.faCount egrep -v "^#|^total" chroms_randoms.faCount \ | awk '{if ($2-$7 > 17) {print $1}}' > over18.lst # creating 4,000,000 sized chunks mkdir hardChunks twoBitToFa monDom4Chroms_RandomContigs.hard.2bit stdout \ | faSomeRecords /dev/stdin over18.lst stdout \ | faSplit about stdin 4000000 hardChunks/c_ rsync -a --progress hardChunks/ /cluster/bluearc/monDom4/hardChunks/ ssh hgwdev mkdir /cluster/data/monDom4/bed/genscan cd /cluster/data/monDom4/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on pk cluster instead of kki since these jobs are going to # take quite a while and we don't want to tie up the kki kluster # entirely. ssh pk cd /cluster/data/monDom4/bed/genscan # Make 3 subdirectories for genscan to place output files mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) # Since we split on gaps, we have no chunks like that. You can # verify with faCount on the chunks. ls -1S /cluster/bluearc/monDom4/hardChunks/c_*.fa > genome.list # Create template file, for gensub2. For example (3-line file): cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << emacs gensub2 genome.list single template jobList para create jobList para try, check, push, check, ... XXXX - running 2006-05-04 14:13 # Completed: 906 of 906 jobs # CPU time in finished jobs: 215764s 3596.07m 59.93h 2.50d 0.007 y # IO & Wait Time: 2798s 46.63m 0.78h 0.03d 0.000 y # Average job time: 241s 4.02m 0.07h 0.00d # Longest finished job: 57335s 955.58m 15.93h 0.66d # Submission to last job: 63345s 1055.75m 17.60h 0.73d cat gtf/c_*.gtf | liftUp -type=.gtf t.gtf \ /cluster/data/monDom4/Un/chrUn_random.lft carry stdin # Concatenate and lift to chrom results: ssh kkstore04 cd /cluster/data/monDom4/bed/genscan for C in `awk '{print $1}' ../../chrom.sizes | sed -e "s/chr//"` do mkdir -p ${C} catDir gtf | liftUp ${C}/chr${C}.gtf \ /cluster/data/monDom4/hardChunks.lft error stdin catDir subopt | liftUp ${C}/chr${C}.bed \ /cluster/data/monDom4/hardChunks.lft error stdin catDir pep > ${C}/chr${C}.pep echo "done: chr${C}" done # Load into the database as so: ssh hgwdev cd /cluster/data/monDom4/bed/genscan ldHgGene -bin -gtf monDom4 genscan 1/chr1.gtf # 35103 gene predictions hgPepPred monDom4 generic genscanPep 1/chr1.pep hgLoadBed monDom4 genscanSubopt 1/chr1.bed # Loaded 446953 elements of size 6 # Compare with other assemblies: featureBits monDom4 genscan # 48628209 bases of 3496445653 (1.391%) in intersection featureBits monDom1 genscan # 47170795 bases of 3492108230 (1.351%) in intersection featureBits monDom4 genscanSubopt # 52714123 bases of 3496445653 (1.508%) in intersection featureBits monDom1 genscanSubopt # 51805835 bases of 3492108230 (1.484%) in intersection featureBits hg17 genscan # 55323340 bases of 2866216770 (1.930%) in intersection featureBits hg17 genscanSubopt # 55986178 bases of 2866216770 (1.953%) in intersection # Clean up cd /san/sanvol1/scratch/monDom4 rm -fr hardMasked200K # This kolossus clean up was actually left until much later. # I found these copies here to be useful later on. ssh kolossus cd /scratch/monDom4 rm -f * cd .. rmdir monDom4 ######################################################################### # creating bigZips downloads (DONE - 2006-05-15 - Hiram) ssh hgwdev mkdir /cluster/data/monDom4/downloads mkdir /cluster/data/monDom4/downloads/bigZips mkdir /cluster/data/monDom4/downloads/chromosomes cd /cluster/data/monDom4/downloads/chromosomes # There wasn't a README for a previous assembly, these things are # pretty generic, the mm8 one can be used with only slight edits. # Next time use the previous Opossum assembly README. cp /usr/local/apache/htdocs/goldenPath/mm8/chromosomes/README.txt . # edit that readme to provide correct references and details ln -s `pwd` /usr/local/apache/htdocs/goldenPath/monDom4/chromosomes ssh kkstore04 cd /cluster/data/monDom4/downloads/chromosomes for C in 1 2 3 4 5 6 7 8 X Un do echo "gzip -c ../../${C}/chr${C}.fa > chr${C}.fa.gz" gzip -c ../../${C}/chr${C}.fa > chr${C}.fa.gz done md5sum *.fa.gz R*.txt > md5sum.txt cd ../bigZips gzip -c ../../broad.mit.edu/Monodelphis4.0.agp > Monodelphis4.0.agp.gz cd ../.. tar cvzf downloads/bigZips/chromFa.tar.gz ?/chr*.fa Un/chrUn.fa # 16 minutes tar cvzf downloads/bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked \ Un/chrUn.fa.masked # 8 minutes tar cvzf downloads/bigZips/chromOut.tar.gz ?/chr*.fa.out Un/chrUn.fa.out cd bed/simpleRepeat mkdir trfMask cd trfMask for C in 1 2 3 4 5 6 7 8 X Un do ln -s ../chromMask/chr${C}_Mask.bed chr${C}.bed done tar --dereference cvzf ../../downloads/bigZips/chromTrf.tar.gz ./trfMask ssh hgwdev cd /cluster/data/monDom4/downloads/bigZips cp /usr/local/apache/htdocs/goldenPath/mm8/bigZips/README.txt . ln -s /cluster/data/monDom4/downloads/bigZips \ /usr/local/apache/htdocs/goldenPath/monDom4/bigZips cd /usr/local/apache/htdocs/goldenPath/monDom4/bigZips ln -s /cluster/data/monDom4/monDom4.2bit . ssh kkstore04 cd /cluster/data/monDom4/downloads/bigZips md5sum *.gz *.2bit R*.txt > md5sum.txt ######################################################################### # BLASTZ CHICKEN galGal3 (DONE 5/26/06 angie) ssh pk mkdir /cluster/data/monDom4/bed/blastzGalGal3.2006-05-25 cd /cluster/data/monDom4/bed/blastzGalGal3.2006-05-25 cat << '_EOF_' > DEF # opossum vs chicken BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Opossum monDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib SEQ1_SMSK=/san/sanvol1/scratch/monDom4/linSpecRep.notInOthers SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom SEQ2_DIR=/san/sanvol1/galGal3/nib SEQ2_LEN=/cluster/data/galGal3/chrom.sizes SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzGalGal3.2006-05-25 '_EOF_' # << emacs doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/gg3vsmonDom4 \ -bigClusterHub=pk -smallClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose \ >& do.log & tail -f do.log ln -s blastzGalGal3.2006-05-25 /cluster/data/monDom4/bed/blastz.galGal3 ##################################################################### #### LOAD ENSEMBL GENES (DONE - 2006-06-21 - Hiram) mkdir /cluster/data/monDom4/bed/ensGene cd /cluster/data/monDom4/bed/ensGene Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview Choose Ensembl 39 and Monodelphis domestica, click next It displays status in a window on the right, indicating how many entries are here, currently: 20,907 The next page is the "filter" step, we do not want any filters, nothing is changed on this page, click next Now we are on the "output" tab, the filter in the window on the right indicates that 20,907 passed the filter. (there is no filter) Now, on this output page, change the pull-down menu item from its default of "features" to read "structures" All the check-boxes now change. Mark the check box GTF under output format Under Gene Ensemble Attributes, Unselect Biotype Select Ensembl Gene ID Ensembl Transcript ID External Gene ID gzip compression and give it a filename: ensGeneMonDom4 it will add the .gff.gz suffix press "export" # They have a chrMT - we don't have the chrM sequence, # Add "chr" to front of each line # in the gene data gtf file to make # it compatible with ldHgGene and convert remove chrMT name zcat ensGeneMonDom4.gff.gz \ | sed -e "s/^\([0-9XYU][0-9n]*\)/chr\1/; /^MT/d" > ensGene.gtf # Verify names converted OK: awk '{print $1}' ensGene.gtf | sort | uniq -c # 153247 chr1 # 126767 chr2 # 82637 chr3 # 90199 chr4 # 47661 chr5 # 50469 chr6 # 34230 chr7 # 55841 chr8 # 41868 chrUn # 9965 chrX ldHgGene -gtf monDom4 ensGene ensGene.gtf # Read 33878 transcripts in 692884 lines in 1 files # 33878 groups 10 seqs 1 sources 4 feature types # 33878 gene predictions featureBits monDom4 ensGene # 32770559 bases of 3501643220 (0.936%) in intersection # Let's see if the peptides created from the genome sequence are # different than the peptides that Ensemble says they are making: # fetch chrX peptides from the ensGene table, sort them by their # name into single line instances: getRnaPred -peptides monDom4 ensGene chrX stdout \ | faToTab -type=protein stdin stdout | sort > ensGenePep.chrX.fa # And chrX peptides downloaded from ensmart: faToTab -type=protein ensPepMonDom4chrX.fasta.gz stdout \ | sort > ensPepchrX.fa # A diff of those two show they are different. But this is not # correct yet because they don't even have the same names in them # need to try some other options here to get the same sequence # It turns out it is necessary to have the ensGtp table to enable a link # to Ensemble from a gene's details page # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" box. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format. Result name ensGtp. # Save file as monDom4.ensGtp.txt.gz gunzip ensGtp.txt.gz hgsql monDom4 < ~/kent/src/hg/lib/ensGtp.sql # remove header line from ensGtp.txt zcat monDom4.ensGtp.txt.gz \ | hgsql monDom4 \ -e "load data local infile '/dev/stdin' into table ensGtp ignore 1 lines" # Obtaining protein sequence from EnsMart # Select "sequences" from the pull-down on the output page # check Peptide in the "Sequences" selection area # and "Ensembl Transcript ID (versioned) in the Transcript # Attributes area # Uncheck all items in the Gene Attributes section. # Text,Fasta output, gzip, file name: monDom4.ensPep # becomes monDom4.ensPep.fasta.gz # The result should be just an ID and a peptide, e.g.: # >ENSMODT00000024417.2 # MSEFWLCFNCCIAEQPQPKRRRRIDRSMIGEPTNFVHTAHVGSGDLFSGMNSVSSIQSQMQSKGGYGGGMPSNGQMQLVETKAG # # monDom4.ensPep.fa.gz zcat monDom4.ensPep.fa.gz | \ $HOME/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \ > ensGene.fa.tab # There appear to be more peptides than there were genes before. # I wonder if Ensembl has updated this list since the gene data was # fetched before: awk '{print $1}' genePred.tab | sort -u > geneNameList awk '{print $1}' ensGene.fa.tab | sort -u > pepNameList wc geneNameList pepNameList # 33878 33878 711438 geneNameList # 33915 33914 712193 pepNameList # 67793 67792 1423631 total comm -12 geneNameList pepNameList | wc # 33878 33878 711438 # So create an exclude list to get rid of the extras: comm -13 geneNameList pepNameList > excludeList # and leave them out zcat monDom4.ensPep.fa.gz | \ $HOME/kent/src/utils/faToTab/faToTab.pl excludeList /dev/stdin \ > ensGene.fa.tab hgPepPred monDom4 tab ensPep ensGene.fa.tab ########################################################################## # N-SCAN gene predictions (nscanGene, nscanPep) - (2006-09-13 markd) cd /cluster/data/monDom4/bed/nscan/ # obtained NSCAN predictions from michael brent's group # at WUSTL wget -nv -r -l 1 -np --accept=gtf http://ardor.wustl.edu/predictions/possum/monDom4/chr_gtf wget -nv -r -l 1 -np --accept=fa http://ardor.wustl.edu/predictions/possum/monDom4/chr_ptx # clean up downloaded directorys and compress files. mv ardor.wustl.edu/predictions/opossum/monDom4/* . rm -rf ardor.wustl.edu gzip chr_*/* chmod a-w chr_*/*.gz # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. ldHgGene -bin -gtf -genePredExt monDom4 nscanGene chr_gtf/chr*.gtf.gz # load protein, add .a suffix to match transcript id hgPepPred -suffix=.a monDom4 generic nscanPep chr_ptx/chr*.fa.gz rm *.tab # update trackDb; need a monDom4-specific page to describe informants opossum/monDom4/nscanGene.html (copy from rn4 and edit) opossum/monDom4/trackDb.ra # verify that the search spec matches the naming convention, sometimes # transcripts end with .a othertimes .0 OK! ######################################################################### # BLASTZ PLATYPUS ornAna1 (DONE 4/3/07 angie) ssh pk mkdir /cluster/data/monDom4/bed/blastz.ornAna1.2007-04-02 cd /cluster/data/monDom4/bed/blastz.ornAna1.2007-04-02 cat << '_EOF_' > DEF # opossum vs platypus BLASTZ=blastz.v7.x86_64 # Use "mammal-fish" params even though this is mammal-mammal... # pretty distant, hopefully not too many shared undermasked repeats, # and if we get overwhelmed we can back off: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Opossum monDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Platypus ornAna1 SEQ2_DIR=/san/sanvol1/scratch/ornAna1/ornAna1.2bit SEQ2_LEN=/san/sanvol1/scratch/ornAna1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastz.ornAna1.2007-04-02 '_EOF_' # << emacs doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/monDom4ornAna1 \ -bigClusterHub=pk -smallClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose \ >& do.log & tail -f do.log ln -s blastz.ornAna1.2007-04-02 /cluster/data/monDom4/bed/blastz.ornAna1 ######################################################################### # BLASTZ Rhesus rheMac2 (DONE 2007-05-27 braney) ssh kolossus mkdir /cluster/data/monDom4/bed/blastz.rheMac2 cd /cluster/data/monDom4/bed/blastz.rheMac2 cat << '_EOF_' > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Opossum monDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rhesus (rheMac2) SEQ2_DIR=/san/sanvol1/scratch/rheMac2/nib SEQ2_LEN=/san/sanvol1/scratch/rheMac2/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastz.rheMac2 TMPDIR=/scratch/tmp # << emacs doBlastzChainNet.pl DEF \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ >& blastz.out 2>&1 & ############################################################################ ## BLASTZ swap from canFam2 alignments (2007-11-10 - markd) # /cluster/data/canFam2/bed/blastz.monDom4/DEF1 has incorrect BASE, so # create DEF.fixed ssh hgwdev mkdir /cluster/data/monDom4/bed/blastz.canFam2.swap cd /cluster/data/monDom4/bed/blastz.canFam2.swap ln -s blastz.canFam2.swap ../blastz.canFam2 (time /cluster/bin/scripts/doBlastzChainNet.pl \ -swap /cluster/data/canFam2/bed/blastz.monDom4/DEF.fixed) >& swap.out& nice -n +19 featureBits monDom4 chainCanFam2Link # 325604035 bases of 3501643220 (9.299%) in intersection ############################################################################## # Blastz Orangutan ponAbe2 (DONE - 2007-11-26 - Hiram) ssh kkstore04 # managing disk space on full filesystem mkdir /cluster/store8/monDom4/bed/blastzPonAbe2.2007-11-22 ln -s /cluster/store8/monDom4/bed/blastzPonAbe2.2007-11-22 \ /cluster/data/monDom4/bed cd /cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22 screen # use screen to control this job mkdir /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13 cd /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13 cat << '_EOF_' > DEF # Opossum vs. Orangutan BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_M=50 # TARGET: Opossum MonDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit SEQ1_LEN=/cluster/data/monDom4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Orangutan PonAbe2 SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes SEQ2_CHUNK=10000000 SEQ1_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # XXX - can run Opossum only on pk since the chroms are so large time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -stop=cat -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \ > cat.log 2>&1 & # real 1600m18.602s # Completed: 134688 of 134688 jobs # CPU time in finished jobs: 12106044s 201767.41m 3362.79h 140.12d 0.384 y # IO & Wait Time: 851946s 14199.09m 236.65h 9.86d 0.027 y # Average job time: 96s 1.60m 0.03h 0.00d # Longest finished job: 11205s 186.75m 3.11h 0.13d # Submission to last job: 95155s 1585.92m 26.43h 1.10d time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -continue=chainRun -stop=download -chainLinearGap=loose \ -bigClusterHub=pk -verbose=2 \ > chainRun.log 2>&1 & # real 2724m19.530s cat fb.monDom4.chainPonAbe2Link.txt # 395887393 bases of 3501643220 (11.306%) in intersection # and for the swap mkdir /cluster/data/ponAbe2/bed/blastz.monDom4.swap cd /cluster/data/ponAbe2/bed/blastz.monDom4.swap time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \ /cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22/DEF \ -swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \ > swap.log 2>&1 & # real 223m38.325s cat fb.ponAbe2.chainMonDom4Link.txt # 406409435 bases of 3093572278 (13.137%) in intersection ############################################################################## # Blastz Marmoset calJac1 (WORKING - 2007-11-24 - Hiram) ssh kkstore04 screen # use screen to control this job # managing disk space on full filesystem mkdir /cluster/store8/monDom4/bed/blastzCalJac1.2007-11-24 ln -s /cluster/store8/monDom4/bed/blastzCalJac1.2007-11-24 \ /cluster/data/monDom4/bed cd /cluster/data/monDom4/bed/blastzCalJac1.2007-11-24 # this chunking makes 651,480 jobs cat << '_EOF_' > DEF # Opossum vs. Marmoset BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_M=50 # TARGET: Opossum MonDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit SEQ1_LEN=/cluster/data/monDom4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Marmoset calJac1, largest contig 2,551,648, 49,724 contigs SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit SEQ2_LEN=/cluster/data/calJac1/chrom.sizes SEQ2_CHUNK=2000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzCalJac1.2007-11-24 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # XXX - can run Opossum only on pk since the chroms are so large time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 \ -stop=cat -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \ > cat.log 2>&1 & # real 2581m49.853s # Completed: 651480 of 651480 jobs # CPU time in finished jobs: 25871528s 431192.14m 7186.54h 299.44d 0.820 y # IO & Wait Time: 2695828s 44930.46m 748.84h 31.20d 0.085 y # Average job time: 44s 0.73m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1804s 30.07m 0.50h 0.02d # Submission to last job: 146641s 2444.02m 40.73h 1.70d time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 \ -continue=chainRun -stop=download \ -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \ > chainRun.log 2>&1 & # real 1810m38.601s # the 10 chaining jobs on kki take about 35 hours for the longest job # Completed: 10 of 10 jobs # CPU time in finished jobs: 369079s 6151.32m 102.52h 4.27d 0.012 y # IO & Wait Time: 385s 6.41m 0.11h 0.00d 0.000 y # Average job time: 36946s 615.77m 10.26h 0.43d # Longest finished job: 103069s 1717.82m 28.63h 1.19d # Submission to last job: 103069s 1717.82m 28.63h 1.19d cat fb.monDom4.chainCalJac1Link.txt # 386915334 bases of 3501643220 (11.050%) in intersection time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 \ -continue=cleanup \ -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \ > cleanup.log 2>&1 & mkdir /cluster/data/calJac1/bed/blastz.monDom4.swap cd /cluster/data/calJac1/bed/blastz.monDom4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/monDom4/bed/blastzCalJac1.2007-11-24/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -swap -bigClusterHub=pk > swap.log 2>&1 & # real 498m24.344s # this failed during the load because the chainLink table became # too large. These were loaded manually with an sql statement to # start the table definition, then a load data local infile using # the .tab files left over from the failed load. Note the extra # definitions on the chainMonDom4Link table CREATE TABLE chainMonDom4 ( bin smallint(5) unsigned NOT NULL default '0', score double not null, # score of chain tName varchar(255) not null, # Target sequence name tSize int unsigned not null, # Target sequence size tStart int unsigned not null, # Alignment start position in target tEnd int unsigned not null, # Alignment end position in target qName varchar(255) not null, # Query sequence name qSize int unsigned not null, # Query sequence size qStrand char(1) not null, # Query strand qStart int unsigned not null, # Alignment start position in query qEnd int unsigned not null, # Alignment end position in query id int unsigned not null, # chain id #Indices KEY tName (tName(16),bin), KEY id (id) ) TYPE=MyISAM; time nice -n +19 hgsql -e \ "load data local infile \"chain.tab\" into table chainMonDom4;" calJac1 # real 8m10.273s CREATE TABLE chainMonDom4Link ( bin smallint(5) unsigned NOT NULL default 0, tName varchar(255) NOT NULL default '', tStart int(10) unsigned NOT NULL default 0, tEnd int(10) unsigned NOT NULL default 0, qStart int(10) unsigned NOT NULL default 0, chainId int(10) unsigned NOT NULL default 0, KEY tName (tName(16),bin), KEY chainId (chainId) ) ENGINE=MyISAM max_rows=160000000 avg_row_length=55 pack_keys=1 CHARSET=latin1; time nice -n +19 hgsql -e \ "load data local infile \"link.tab\" into table chainMonDom4Link;" calJac1 # this one took a number of hours # finish the nets and load time nice -n +19 netClass -verbose=0 -noAr noClass.net \ calJac1 monDom4 calJac1.monDom4.net # real 4m18.898s time nice -n +19 netFilter -minGap=10 calJac1.monDom4.net \ | hgLoadNet -verbose=0 calJac1 netMonDom4 stdin # real 0m54.559s cd /cluster/data/calJac1/bed/blastz.monDom4.swap time nice -n +19 featureBits calJac1 chainMonDom4Link \ > fb.calJac1.chainMonDom4Link.txt 2>&1 # real 25m7.373s cat fb.calJac1.chainMonDom4Link.txt # 391248654 bases of 2929139385 (13.357%) in intersection # continuing cd /cluster/data/calJac1/bed/blastz.monDom4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/monDom4/bed/blastzCalJac1.2007-11-24/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -continue=download -swap -bigClusterHub=pk > download.log 2>&1 & ########################################################################### ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################