# for emacs: -*- mode: sh; -*- # This file describes browser build for the mouse # genome, April 2007, ncbi mouse_37 - Mm9 # # "$Id: mm9.txt,v 1.138 2010/06/10 16:32:49 chinhli Exp $" # ####################################################################### # DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2007-04-05 - Hiram) # # Examine disk space issues, find some goodly amount of space ssh kkstore02 mkdir /cluster/store5/mm9 ln -s /cluster/store5/mm9 /cluster/data/mm9 cd /cluster/data/mm9 ## After testing with the pre-release below, the real thing begins here mkdir mouse_37 cd mouse_37 ## Ouch, the files are no longer delivered conveniently in a single ## directory. They are in several locations now ... NCBI=ftp://ftp.ncbi.nih.gov/genomes MAPVIEW=MapView/Mus_musculus/sequence/BUILD.37.1/initial_release for F in README README_CURRENT_BUILD do wget --dont-remove-listing --timestamping \ "${NCBI}/M_musculus/${F}" -O ${F} done for F in allcontig.agp.gz seq_contig.md.gz ideogram.gz do wget --dont-remove-listing --timestamping \ "${NCBI}/${MAPVIEW}/${F}" -O ${F} done # survey the strains contained in seq_contig.md.gz zcat seq_contig.md.gz | awk '{print $9}' | sort | uniq -c | sort -rn 13075 Celera 360 C57BL/6J 101 129/SvJ 93 129/Sv 79 unknown 75 129/SvEvTac 40 NOD 26 129S7/SvEv 14 129/Ola 7 129 6 Cast/Ei 6 BALB/c 3 SJL/J 3 C3H 3 B6/CBAF1J 3 AKR/J 3 A/J 2 Spret/Ei 1 group_label 1 129/J # we will work on the C57BL/6J strain mkdir -p chrAgp cd chrAgp for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y do wget --dont-remove-listing --timestamping \ "${NCBI}/M_musculus/Assembled_chromosomes/mm_ref_chr${C}.agp.gz" \ -O chr${C}.agp.gz done cd .. 
## Build chrOnly.agp: from each per-chromosome AGP fetched into chrAgp/,
## keep only the lines that begin with "c" and concatenate them in
## chromosome order.
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
do
    zcat "chrAgp/chr${C}.agp.gz" | grep "^c"
done > chrOnly.agp

## Fetch the assembled chromosome fasta files.
## NCBI is the ftp base URL set at the top of this download section.
mkdir -p chrfasta
cd chrfasta
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
do
    wget --dont-remove-listing --timestamping \
        "${NCBI}/M_musculus/Assembled_chromosomes/mm_ref_chr${C}.fa.gz" \
        -O "chr${C}.fa.gz"
done
cd ..

## chrUn and chrM are not in Assembled_chromosomes, fetch them from
## their own CHR_* directories.
## The chrM file is saved as chrM/mm_ref_chrMT.fa.gz, which is the name
## the chrNamesFixed step further below reads.
mkdir chrUn
mkdir chrM
wget --dont-remove-listing --timestamping \
    "${NCBI}/M_musculus/CHR_Un/mm_ref_chrUn.fa.gz" \
    -O chrUn/chrUn.fa.gz
wget --dont-remove-listing --timestamping \
    "${NCBI}/M_musculus/CHR_MT/mm_ref_chrMT.fa.gz" \
    -O chrM/mm_ref_chrMT.fa.gz

## Fetch the per-chromosome contig fasta files.  The directory names are
## zero padded for 1-9 (CHR_01 .. CHR_09), hence the three loops.
mkdir contigFasta
for C in 1 2 3 4 5 6 7 8 9
do
    wget --dont-remove-listing --timestamping \
        "${NCBI}/M_musculus/CHR_0${C}/mm_ref_chr${C}.fa.gz" \
        -O "contigFasta/chr${C}.fa.gz"
done
for C in 0 1 2 3 4 5 6 7 8 9
do
    wget --dont-remove-listing --timestamping \
        "${NCBI}/M_musculus/CHR_1${C}/mm_ref_chr1${C}.fa.gz" \
        -O "contigFasta/chr1${C}.fa.gz"
done
for C in X Y Un MT
do
    wget --dont-remove-listing --timestamping \
        "${NCBI}/M_musculus/CHR_${C}/mm_ref_chr${C}.fa.gz" \
        -O "contigFasta/chr${C}.fa.gz"
done
## the browser name for the mitochondrial sequence is chrM, not chrMT
mv contigFasta/chrMT.fa.gz contigFasta/chrM.fa.gz

## split up the contigFasta files into their individual contigs
## the sed fixes the fasta header name to just be the contig name
mkdir splitContigs
for F in contigFasta/chr*.fa.gz
do
    BN=$(basename "${F}")
    # strip the .fa.gz suffix to get the chrom name, e.g. chr1
    C=${BN%.fa.gz}
    echo "$F" "$BN" "$C"
    echo -n "${C} working ... "
    mkdir -p "splitContigs/${C}"
    zcat "${F}" | sed -e "s/.*ref|/>/; s/|.*//" \
        | faSplit byname stdin "splitContigs/${C}/"
    echo "done"
done

## create agp files for the randoms from seq_contig.md and allcontig.agp
## both fragment and contig agp files
"$HOME"/kent/src/hg/mouseStuff/buildTools/seqContigToAgp.pl \
    randomFragments.agp randomContigs.agp 2> randomContigs.err
## create contig agp file for non-randoms
"$HOME"/kent/src/hg/mouseStuff/buildTools/mkContigAgp.pl allContigs.agp
## combine the two contig agp files
cat allContigs.agp randomContigs.agp > mm9.contigs.agp
## separate the random contigs from the non-random contigs
"$HOME"/kent/src/hg/mouseStuff/buildTools/sortRandoms.pl \
    randomContigs.agp > mvRandoms.sh
## inspect mvRandoms.sh and then run it if it is OK
chmod +x mvRandoms.sh
./mvRandoms.sh
## verify all contigs exist properly
"$HOME"/kent/src/hg/mouseStuff/buildTools/checkContigs.pl mm9.contigs.agp
## create all contigs fasta file
cd splitContigs
find . -type f -exec cat {} + > ../mm9.contigs.fa
## create assembled sequence from these contigs and agp file
cd ..
agpToFa -simpleMulti mm9.contigs.agp all mm9.assembled.fa mm9.contigs.fa ## create fragments agp file cat chrOnly.agp randomFragments.agp > mm9.fragments.agp ## verify this agp too will work with the assembled fasta ## need 2bit file to avoid fasta file ordering difficulty faToTwoBit mm9.assembled.fa mm9.assembled.2bit checkAgpAndFa mm9.fragments.agp mm9.assembled.2bit ## it has a problem with chrY because it is supposed to end with: chrY 2902556 5902555 29 N 3000000 centromere no chrY 5902556 15902555 30 N 10000000 contig no ## edit mm9.contigs.agp to add these two lines, and repeat the agpToFa ## after that, this check fails on chrX_random ## this is supposed to be a gap, with N's ## chrX_random 300319 303472 46 N 3154 fragment yes # Loop: chrX_random, dnaOffset=300318, seqSize=1785075 # agpFrag->chromStart: 300318, agpFrag->chromEnd: 303472, dnaOffset: 300318 # FASTA gap entry # Bad char a found at index 300349 # Invalid Agp or Fasta file entry for sequence chrX_random # agpMatchesFaEntry failed; exiting ## this comes from the use of a single fragment in two parts, ## from allcontig.agp NT_165789.2 296206 300318 45 W CAAA01187194.1 1 4113 + NT_165789.2 300319 300349 46 N 31 fragment no NT_165789.2 300350 303372 47 W CAAA01187194.1 4145 7167 ## which I processed into: chrX_random 296206 300318 45 W CAAA01187194.1 1 4113 + chrX_random 300319 303472 46 N 3154 fragment yes ## should have been chrX_random 296206 300318 45 W CAAA01187194.1 1 4113 + chrX_random 300319 300349 46 N 31 fragment yes chrX_random 300350 303372 47 W CAAA01187194.1 4145 7167 + ### NCBI had this as a non-bridged fragment, a 'no' - I'm making it a yes ## so, edit the randomFragments.agp to fixup that line as indicated ## the chrOnly.agp file also needs an entry for chrM, add this ## line to chrOnly.agp: chrM 1 16299 1 F NC_005089.1 1 16299 + ## now have successful business: checkAgpAndFa mm9.fragments.agp mm9.assembled.2bit # All AGP and FASTA entries agree - both files are valid ## let's get 
the sequence in order in the fasta file faSplit byname mm9.assembled.fa splitChr/ cut -f1 mm9.fragments.agp | uniq -c ## using the order of this fragments.agp file for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y M \ 13_random 16_random 17_random 1_random 3_random 4_random 5_random \ 7_random 8_random 9_random Un_random X_random Y_random do cat splitChr/chr${C}.fa done > mm9.fragorder.assembled.fa ## now that fasta file should also be OK checkAgpAndFa mm9.fragments.agp mm9.fragorder.assembled.fa # All AGP and FASTA entries agree - both files are valid ## now ready to give this agp and fasta file off to makeGenomeDb.pl ## pre-release testing download sequence ############################### mkdir ncbi cd ncbi cp -p /cluster/data/mm8/ncbi/.wgetrc . WGETRC=`pwd`/.wgetrc export WGETRC time nice -n +19 wget --timestamping --force-directories \ --directory-prefix=. --dont-remove-listing --recursive \ --level=4 --no-parent --no-host-directories --cut-dirs=1 \ ftp://ftp-private.ncbi.nih.gov/mouse_37 # Downloaded: 2,599,733,765 bytes in 196 files # The pre-release sequence, April 5th: mkdir /cluster/data/mm9/pre_release cd /cluster/data/mm9/pre_release # The .wgetrc is the anonymous user cat << '_EOF_' > .wgetrc login = anonymous passwd = '_EOF_' # << happy emacs chmod 600 .wgetrc WGETRC=`pwd`/.wgetrc export WGETRC wget --timestamping --force-directories --directory-prefix=. 
\ --dont-remove-listing --recursive --level=4 --no-parent \ --no-host-directories --cut-dirs=3 \ ftp://ftp.ncbi.nih.gov/genomes/M_musculus/pre_release ## Ran a quick test build with that to see if it would work ### this procedure run for the pre_release and the mouse_37 sequence ### for pre_release the sed was: # zcat chrfasta/chr${C}.fa.gz | sed -e "s/^>lcl|/>/; s/.fa.*//" mkdir chrNamesFixed for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y do zcat chrfasta/chr${C}.fa.gz | sed -e "s/^>gi.*/>chr${C}/" \ | gzip -c > chrNamesFixed/chr${C}.fa.gz echo chr${C} done done zcat chrM/mm_ref_chrMT.fa.gz | sed -e "s/^>gi.*/>chrM/" \ | gzip -c > chrNamesFixed/chrM.fa.gz ## later on, an error was discovered in the processing of chrY_random # a lot of gaps of size zero were inserted. They didn't cause any # disruption to the assembly track, they only caused extra gap entries # that were useless. So, to fixup, remove anything in the chrY_random_gap # table that has a size of zero: hgsql -e 'delete from chrY_random_gap where size<"1";' mm9 ## And, fixing the one fragment on chrX_random hgsql -e 'INSERT chrX_random_gap VALUES("587", "chrX_random", "300318", "300349", "46", "N", "31", "fragment", "yes")' mm9 hgsql -e 'DELETE from chrX_random_gold where chromStart="296205";' mm9 hgsql -e 'INSERT chrX_random_gold VALUES("587", "chrX_random", "296205", "300318", "45", "W", "CAAA01187194.1", "0", "4113", "+")' mm9 hgsql -e 'INSERT chrX_random_gold VALUES("587", "chrX_random", "300349", "303372", "45", "W", "CAAA01187194.1", "4144", "7167", "+")' mm9 ########################################################################## ## final makeGenomeDb.pl (DONE - 2007-07-19 - Hiram) ## to make this go again, some things need to be removed or set-aside ssh hgwdev hgsql -e 'delete from dbDb where name="mm9";' hgcentraltest rm -fr /gbdb/mm9 ssh kkstore06 cd /cluster/data/mm9 mv mm9.config.ra mm9.config.pre_release.ra mv bed bed.pre_release mv mm9.unmasked.2bit mm9.unmasked.2bit.pre_release mv 
mm9.agp mm9.agp.pre_release mv mm9.randoms.2bit mm9.randoms.2bit.pre_release mv mm9.rmsk.2bit mm9.rmsk.2bit.pre_release mv mm9.rmskTrf.2bit mm9.rmskTrf.2bit.pre_release rm mm9.2bit rm -fr ? ?? mv dbDbInsert.sql dbDbInsert.sql.pre_release mv makeGenomeDb.out makeGenomeDb.out.pre_release mv chrom.lst chrom.lst.pre_release mv jkStuff jkStuff.pre_release ## ask cluster-admin to rename the existing mm9 db to be mm9prerelease cat << '_EOF_' > mm9.config.ra # Config parameters for makeGenomeDb.pl: db mm9 scientificName Mus musculus commonName Mouse assemblyDate Jul. 2007 assemblyLabel NCBI Build 37 orderKey 121 mitoAcc none fastaFiles /cluster/data/mm9/mouse_37/mm9.fragorder.assembled.fa agpFiles /cluster/data/mm9/mouse_37/mm9.fragments.agp # qualFiles /dev/null dbDbSpeciesDir mouse '_EOF_' # << happy emacs time nice -n +19 makeGenomeDb.pl mm9.config.ra > makeGenomeDb.out 2>&1 & # real 24m24.468s ssh hgwdev featureBits mm9 gold # 2620346158 bases of 2620346158 (100.000%) in intersection featureBits mm8 gold # 2567283971 bases of 2567283971 (100.000%) in intersection featureBits mm9 gap # 105419323 bases of 2620346158 (4.023%) in intersection featureBits mm8 gap # 97171117 bases of 2567283971 (3.785%) in intersection # verify index is correct: hgsql mm9 -e "show index from gc5Base;" # should see good numbers in Cardinality column # Reset default position to be like Mm8 hgsql -e \ 'update dbDb set defaultPos="chr12:50258170-50263946" where name="mm9";' \ hgcentraltest # create initial symlink for 2bit sequence mkdir /gbdb/mm9 mkdir /gbdb/mm9/html ln -s /cluster/data/mm9/mm9.unmasked.2bit /gbdb/mm9/mm9.2bit ## enter the trackDb business (was done in the pre-release test) ########################################################################## ## Initial pre-release makeGenomeDb.pl (DONE - 2007-04-05 - Hiram) ssh kkstore02 cd /cluster/data/mm9 cat << '_EOF_' > mm9.config.ra # Config parameters for makeGenomeDb.pl: db mm9 scientificName Mus musculus commonName Mouse 
assemblyDate Apr. 2007 assemblyLabel NCBI Build 37 orderKey 121 mitoAcc 33115104 fastaFiles /cluster/data/mm9/pre_release/chrNamesFixed/chr*.fa.gz agpFiles /cluster/data/mm9/pre_release/chrOnly.agp # qualFiles /dev/null dbDbSpeciesDir mouse '_EOF_' # << happy emacs time nice -n +19 makeGenomeDb.pl mm9.config.ra > makeGenomeDb.out 2>&1 & # real 24m24.468s ########################################################################## ## Repeat masker (DONE - 2007-04-05 - Hiram) ## RE-DONE with final sequence 2007-07-19 - Hiram ssh kkstore06 ## use screen for this mkdir /cluster/data/mm9/bed/RepeatMasker cd /cluster/data/mm9/bed/RepeatMasker time nice -n +19 doRepeatMasker.pl -bigClusterHub=kk \ -buildDir=/cluster/data/mm9/bed/RepeatMasker mm9 > do.out 2>&1 & # real 1726m32.849s # Completed: 5467 of 5467 jobs # CPU time in finished jobs: 54774630s 912910.50m 15215.17h 633.97d 1.737 y # IO & Wait Time: 432302s 7205.04m 120.08h 5.00d 0.014 y # Average job time: 10098s 168.30m 2.81h 0.12d # Longest finished job: 20982s 349.70m 5.83h 0.24d # Submission to last job: 100294s 1671.57m 27.86h 1.16d ssh kkstore06 cd /cluster/data/mm9 twoBitToFa mm9.rmsk.2bit stdout | faSize stdin # 2725765481 bases (105419509 N's 2620345972 real 1466644650 upper # 1153701322 lower) in 35 sequences in 1 files # %42.33 masked total, %44.03 masked real ############################################################################## ## simpleRepeat masking (DONE - 2007-04-07 - Hiram) ## RE-DONE with final sequence 2007-07-19 - Hiram ssh kolossus ## use screen for this mkdir /cluster/data/mm9/bed/simpleRepeat cd /cluster/data/mm9/bed/simpleRepeat time nice -n +19 twoBitToFa ../../mm9.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp # real 253m44.602s # Appears to have an error on something: # sh: line 1: 18346 File size limit exceeded/cluster/bin/i386/trf /scratch/tmp/stdin_kolossus_3af1_fe9700.tf 2 7 7 80 10 50 2000 -m -d # 
Expecting 14 words line 4593 of /scratch/tmp/stdin_kolossus_3af1_fe9700.tf.2.7.7.80.10.50.2000.dat got 1 # Let's try running this on the kki kluster, by chrom ssh kkr1u00 mkdir /iscratch/i/mus/mm9 cd /iscratch/i/mus/mm9 cp -p /cluster/data/mm9/mm9.unmasked.2bit . cp -p /cluster/data/mm9/chrom.sizes . cut -f1 chrom.sizes | while read C do twoBitToFa -noMask -seq=${C} mm9.unmasked.2bit stdout | gzip -c > ${C}.fa.gz echo ${C} done for R in 2 3 4 5 6 7 8 do rsync -a --progress /iscratch/i/mus/mm9/ kkr${R}u00:/iscratch/i/mus/mm9/ done ssh kki mkdir /cluster/data/mm9/bed/simpleRepeat/trf cd /cluster/data/mm9/bed/simpleRepeat/trf cat << '_EOF_' > runTrf #!/bin/csh -fe # set C = $1 set GZ = /iscratch/i/mus/mm9/$C.fa.gz mkdir -p /scratch/tmp/$C zcat $GZ > /scratch/tmp/$C/$C.fa pushd /scratch/tmp/$C /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $C.fa \ /dev/null -bedAt=$C.bed -tempDir=/scratch/tmp/$C popd rm -f $C.bed cp -p /scratch/tmp/$C/$C.bed . rm -fr /scratch/tmp/$C '_EOF_' # << happy emacs chmod +x runTrf cat << '_EOF_' > template #LOOP ./runTrf $(path1) {check out line $(root1).bed} #ENDLOOP '_EOF_' # << happy emacs cut -f1 /iscratch/i/mus/mm9/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList para try ... check ... push ... etc ... ## none of these jobs had any trouble, running line counts of these result ## bed files with the previous failed run indicates they are identical # Completed: 35 of 35 jobs # CPU time in finished jobs: 14620s 243.66m 4.06h 0.17d 0.000 y # IO & Wait Time: 272s 4.54m 0.08h 0.00d 0.000 y # Average job time: 425s 7.09m 0.12h 0.00d # Longest finished job: 1386s 23.10m 0.39h 0.02d # Submission to last job: 1790s 29.83m 0.50h 0.02d cat *.bed > ../simpleRepeat.bed cd .. 
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed ssh hgwdev cd /cluster/data/mm9/bed/simpleRepeat time nice -n +19 hgLoadBed mm9 simpleRepeat \ simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Loaded 1167619 elements of size 16 # real 0m33.312s nice -n +19 featureBits mm9 simpleRepeat # 80054947 bases of 2620346158 (3.055%) in intersection ## clean up the /iscratch/i/mus/mm9/ directory ## for downloads: mkdir trfMaskChrom cd trfMaskChrom ln -s ../trf/chr*.bed . ########################################################################### # CREATE MICROSAT TRACK (DONE - 2007-07-20 - Hiram) ssh hgwdev mkdir /cluster/data/mm9/bed/microsat cd /cluster/data/mm9/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed mm9 microsat microsat.bed # Loaded 195688 elements of size 4 featureBits mm9 microsat # 8713212 bases of 2620346158 (0.333%) in intersection featureBits mm8 microsat # 8570611 bases of 2567283971 (0.334%) in intersection ############################################################################# # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2007-07-21 - Hiram) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kkstore06 cd /cluster/data/mm9/bed/simpleRepeat mkdir trfMask for F in trf/chr*.bed do echo "${F} -> ${F/trf\//}" awk '{if ($5 <= 12) print;}' ${F} > trfMask/${F/trf\//} done ## Add trfMask to repeat masked sequence ssh kkstore06 cd /cluster/data/mm9 cat << '_EOF_' > addTrf.csh #!/bin/csh -efx # This script will fail if any of its commands fail. 
set DB = mm9 set WORK_DIR = /cluster/data/${DB} cd ${WORK_DIR} set inputTwoBit = ${WORK_DIR}/${DB}.rmsk.2bit set outputTwoBit = ${WORK_DIR}/${DB}.rmskTrf.2bit cat /cluster/data/${DB}/bed/simpleRepeat/trfMask.bed \ | twoBitMask -add -type=.bed ${inputTwoBit} stdin ${outputTwoBit} twoBitToFa ${outputTwoBit} stdout | faSize stdin > faSize.${DB}.rmskTrf.txt '_EOF_' # << happy emacs chmod +x ./addTrf.csh time ./addTrf.csh cat faSize.mm9.rmskTrf.txt # 2725765481 bases (105419509 N's 2620345972 real 1465037892 upper # 1155308080 lower) in 35 sequences in 1 files # %42.38 masked total, %44.09 masked real ln -s mm9.rmskTrf.2bit mm9.2bit # fixup /gbdb/mm9/mm9.2bit symlink to this newly masked sequence ## copy to san for genbank kluster run cd /cluster/data/mm9 cp -p mm9.rmskTrf.2bit /san/sanvol1/scratch/mm9/mm9.2bit ############################################################################ # BLATSERVERS ENTRY (DONE - 2007-04-09 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("mm9", "blat14", "17790", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("mm9", "blat14", "17791", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################## ## CYTOBAND - ideogram track (DONE - 2007-08-15 - Hiram) ssh hgwdev mkdir /cluster/data/mm9/bed/cytoBand cd /cluster/data/mm9/bed/cytoBand # Create bed file # (this script fixed up to eliminate one of the lines from ideogram file) $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl ../../mouse_37/ideogram ### doesn't work, the ideogram file is corrupted, use the one fetched below ## as so: $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl ideogram ## can now verify before load: $HOME/kent/src/utils/ncbi/cytoBandVerify.pl # everything checks out OK on 21 chroms # Load the bed file hgLoadBed -noBin 
-sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \ mm9 cytoBand cytoBand.bed # Make cytoBandIdeo track for ideogram gif on hgTracks page. # For mouse cytoBandIdeo is just a replicate of the cytoBand track. hgsql -e "drop table cytoBandIdeo;" mm9 hgsql mm9 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" ## fetch updated ideogram.gz file that has been fixed by NCBI NCBI=ftp://ftp.ncbi.nih.gov/genomes MAPVIEW=MapView/Mus_musculus/sequence/BUILD.37.1/updates wget --dont-remove-listing --timestamping \ "${NCBI}/${MAPVIEW}/ideogram.gz" -O ideogram.gz ## run through the createNcbiCytoBand.pl process above, and then load ## can now verify before load: $HOME/kent/src/utils/ncbi/cytoBandVerify.pl # everything checks out OK on 21 chroms ########################################################################## ## GENBANK alignments (DONE - 2007-08-03 - Hiram) ## next time: don't forget to make the 11.ooc file, see below ## generate a lift file that specifies segments separated by non-bridged ## gaps ## make the ooc file ssh kolossus cd /cluster/data/mm9 time blat mm9.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=912 # real 2m29.455s cp -p 11.ooc /san/sanvol1/scratch/mm9 cp -p 11.ooc jkStuff ## also setup /iscratch/i/mus/mm9/ with these files for ## other kluster runs: # -rw-rw-r-- 1 712923274 Jul 21 13:31 mm9.2bit # -rw-rw-r-- 1 17179 Jul 23 16:18 nonBridgedGap.lft # -rw-rw-r-- 1 122352 Jul 24 11:32 11.ooc ssh hgwdev cd /cluster/data/mm9/jkStuff gapToLift mm9 nonBridgedGap.lft # WARNING: gap at end of chromosome at chrY:5902555-15902555 # WARNING: overlapping gap at chrY:2902555-5902555 and chrY:5902555-15902555 ## These warnings are true, chrY has two gaps next to each other, and ## the second one is actually the end of the chrom. This is the way the ## NCBI supplied AGP file is. (this seems to be normal in hg18 too ...) cp -p nonBridgedGap.lft /san/sanvol1/scratch/mm9 cd .. 
cp -p mm9.rmskTrf.2bit /san/sanvol1/scratch/mm9/mm9.2bit ## The genbank.conf entry looks like: # mm9 mm9.serverGenome = /cluster/data/mm9/mm9.2bit mm9.clusterGenome = /san/sanvol1/scratch/mm9/mm9.2bit mm9.ooc = /cluster/data/mm9/11.ooc mm9.align.unplacedChroms = * mm9.lift = /cluster/data/mm9/jkStuff/nonBridgedGap.lft mm9.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} mm9.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} mm9.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} mm9.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} mm9.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} mm9.downloadDir = mm9 mm9.refseq.mrna.xeno.load = yes mm9.refseq.mrna.xeno.loadDesc = yes mm9.mgcTables.default = full mm9.mgcTables.mgc = all ssh kkstore02 cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial mm9 & ## var/build/logs/2007.07.26-21:57:22.mm9.initalign.log ## logFile: var/build/logs/2007.07.23-16:44:31.mm9.initalign.log # real 771m12.978s # a couple of failed jobs, finish off the align step manually ssh kk cd /cluster/bluearc/genbank/work/initial.mm9/align para time # Completed: 50580 of 50580 jobs # CPU time in finished jobs: 14556484s 242608.06m 4043.47h 168.48d 0.462 y # IO & Wait Time: 988518s 16475.30m 274.59h 11.44d 0.031 y # Average job time: 307s 5.12m 0.09h 0.00d # Longest finished job: 1815s 30.25m 0.50h 0.02d # Submission to last job: 40513s 675.22m 11.25h 0.47d ## after recovery of the alignments jobs ssh kkstore02 cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -continue=finish -initial mm9 & # var/build/logs/2007.07.27-11:02:00.mm9.initalign.log # real 169m53.124s ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad mm9 # var/dbload/hgwdev/logs/2007.07.27-14:10:22.dbload.log # real 54m55.707s ## the two measurements are for two different runs of genbank, ## 
once configured as "ordered" secondly configured as "finished" featureBits mm9 refGene:cds # 30105171 bases of 2620346127 (1.149%) in intersection # 30113840 bases of 2620346127 (1.149%) in intersection featureBits mm9 refGene # 51164928 bases of 2620346127 (1.953%) in intersection # 51175624 bases of 2620346127 (1.953%) in intersection featureBits mm9 mrna # 135379415 bases of 2620346127 (5.166%) in intersection # 137195240 bases of 2620346127 (5.236%) in intersection featureBits mm9 mgcGenes # 33676155 bases of 2620346127 (1.285%) in intersection # 34012201 bases of 2620346127 (1.298%) in intersection featureBits mm9 est # 184121510 bases of 2620346127 (7.027%) in intersection # 188799620 bases of 2620346127 (7.205%) in intersection featureBits mm9 intronEst # 52305179 bases of 2620346127 (1.996%) in intersection # 52812173 bases of 2620346127 (2.015%) in intersection featureBits mm9 xenoMrna # 46119254 bases of 2620346127 (1.760%) in intersection # 51438566 bases of 2620346127 (1.963%) in intersection featureBits mm9 xenoRefGene # 40378885 bases of 2620346127 (1.541%) in intersection # 44298281 bases of 2620346127 (1.691%) in intersection # enable daily alignment and update of hgwdev (DONE - 2007-08-03 - Hiram) cd ~/kent/src/hg/makeDb/genbank cvsup # add mm9 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added mm9 - Mus musculus" etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### # MAP CONTIGS TRACK (DONE - 2007-07-23 - Hiram) ## can take contig information directly from previously created ## mm9.contigs.agp ssh hgwdev mkdir /cluster/data/mm9/bed/ctgPos cd /cluster/data/mm9/bed/ctgPos grep CONTIG ../../mouse_37/mm9.contigs.agp \ | awk '{printf "%s\t%d\t%s\t%d\t%d\n", $6, $8, $1, $2-1, $3}' \ > mm9.ctgPos.tab hgsql mm9 < ~/kent/src/hg/lib/ctgPos.sql hgsql mm9 -e 'load data local infile "mm9.ctgPos.tab" into table ctgPos;' featureBits -countGaps mm9 ctgPos # 2623952781 bases of 2725765481 (96.265%) 
in intersection featureBits -countGaps mm8 ctgPos # 2573322222 bases of 2664455088 (96.580%) in intersection ######################################################################### ## Create downloads directory (DONE - 2007-07-25 - Hiram) ssh hgwdev mkdir /cluster/data/mm9/bed/simpleRepeat/trfMaskChrom cd /cluster/data/mm9/bed/simpleRepeat/trfMaskChrom ln -s ../trf/chr*.bed . cd /cluster/data/mm9 time nice -n +19 /cluster/bin/scripts/makeDownloads.pl mm9 \ > do.downloads.out 2>&1 # real 41m18.282s ## failed during jkStuff/doInstall.csh: # foreach size ( 1000 2000 5000 ) # echo 1000 # featureBits mm9 refGene:upstream:1000 -fa=stdout # setpriority: Permission denied. # Error writing 50 bytes: Operation not permitted ## remove the "nice" statements from the csh, and finish it off ## edit the README files to indicate correct information ########################################################################## # MGI LIFTOVER FROM MM8 (DONE 2007-07-26 angie) ssh kolossus mkdir /cluster/data/mm9/bed/jaxLiftOver cd /cluster/data/mm9/bed/jaxLiftOver ldHgGene -out=stdout -nobin placeholder placeholder \ /cluster/data/mm8/bed/jax/2007_07/jaxRepTranscript.gff \ | liftOver stdin -minBlocks=0.5 \ /cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \ -genePred jaxRepTranscriptLift.{gp,unmapped} #Read 31587 transcripts in 232925 lines in 1 files wc -l jaxRepTranscriptLift.{gp,unmapped} # 31470 jaxRepTranscriptLift.gp # 234 jaxRepTranscriptLift.unmapped liftOver -minBlocks=0.5 /cluster/data/mm8/bed/jax/2007_07/jaxAllele.bed \ /cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \ -bedPlus=12 jaxAlleleLift.{bed,unmapped} wc -l jaxAlleleLift.{bed,unmapped} # 12372 jaxAlleleLift.bed # 2 jaxAlleleLift.unmapped liftOver -minBlocks=0.5 /cluster/data/mm8/bed/jax/2007_07/jaxPhenotype.bed \ /cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \ -bedPlus=12 -tab jaxPhenotypeLift.{bed,unmapped} wc -l jaxPhenotypeLift.{bed,unmapped} # 23806 jaxPhenotypeLift.bed # 0 
jaxPhenotypeLift.unmapped liftOver -minBlocks=0.5 /cluster/data/mm8/bed/jax/2007_07/jaxQtl.bed \ /cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \ -bedPlus=6 -tab jaxQtlLift.{bed,unmapped} wc -l jaxQtlLift.{bed,unmapped} # 1539 jaxQtlLift.bed # 12 jaxQtlLift.unmapped # Load lifted track tables and original auxiliary tables: ssh hgwdev cd /cluster/data/mm9/bed/jaxLiftOver # jaxRepTranscriptLift ldHgGene -predTab mm9 jaxRepTranscriptLift jaxRepTranscriptLift.gp #31470 gene predictions sed -e 's/jaxRepTranscript/jaxRepTranscriptLift/g' \ /cluster/data/mm8/bed/jax/2007_07/fixJaxRepTranscript.sql \ > fixJaxRepTranscriptLift.sql hgsql mm9 < fixJaxRepTranscriptLift.sql hgLoadSqlTab mm9 jaxRepTranscriptAlias \ /cluster/data/mm8/bed/jax/2007_07/jaxRepTranscriptAlias.sql \ /cluster/data/mm8/bed/jax/2007_07/jaxRepTranscriptAlias.tab hgsql mm9 -e 'rename table jaxRepTranscriptAlias to jaxRepTranscriptLiftAlias;' # jaxAlleleLift sed -e 's/bed12Source/jaxAlleleLift/g' \ $HOME/kent/src/hg/lib/bed12Source.sql > jaxAlleleLift.sql hgLoadBed -sqlTable=jaxAlleleLift.sql mm9 jaxAlleleLift jaxAlleleLift.bed #Loaded 12372 elements of size 13 sed -e 's/jaxAllele/jaxAlleleLift/g' \ /cluster/data/mm8/bed/jax/2007_07/fixJaxAllele.sql > fixJaxAlleleLift.sql # empty file, but just in case it has something in the future... hgsql mm9 < fixJaxAlleleLift.sql hgLoadSqlTab mm9 jaxAlleleInfo \ ~/kent/src/hg/lib/jaxAlleleInfo.sql \ /cluster/data/mm8/bed/jax/2007_07/jaxAlleleInfo.tab # jaxPhenotypeLift sed -e 's/bed12Source/jaxPhenotypeLift/g' \ ~/kent/src/hg/lib/bed12Source.sql > jaxPhenotypeLift.sql hgLoadBed -tab -sqlTable=jaxPhenotypeLift.sql mm9 jaxPhenotypeLift \ jaxPhenotypeLift.bed #Loaded 23806 elements of size 13 sed -e 's/jaxPhenotype/jaxPhenotypeLift/g' \ /cluster/data/mm8/bed/jax/2007_07/fixJaxPhenotype.sql \ > fixJaxPhenotypeLift.sql # empty file, but just in case it has something in the future... 
hgsql mm9 < fixJaxPhenotypeLift.sql hgLoadSqlTab mm9 jaxPhenotypeAlias \ /cluster/data/mm8/bed/jax/2007_07/jaxPhenotypeAlias.sql \ /cluster/data/mm8/bed/jax/2007_07/jaxPhenotypeAlias.tab hgsql mm9 -e 'rename table jaxPhenotypeAlias to jaxPhenotypeLiftAlias;' # phenotype-allele relationships hgLoadSqlTab mm9 jaxAllelePheno \ ~/kent/src/hg/lib/jaxAllelePheno.sql \ /cluster/data/mm8/bed/jax/2007_07/jaxAllelePheno.tab # jaxQTLLift sed -e 's/jaxQTL/jaxQTLLift/g'\ ~/kent/src/hg/lib/jaxQTL.sql > jaxQTLLift.sql hgLoadBed -tab -notItemRgb -noBin \ -sqlTable=jaxQTLLift.sql \ mm9 jaxQTLLift jaxQtlLift.bed #Loaded 1539 elements of size 10 # Add row to mm9.grp for Phenotype and Allele track group: hgsql mm9 -e 'insert into grp values("phenoAllele", "Phenotype and Allele", 4.5);' ########################################################################## ## Creating pushQ (DONE - 2007-07-26 - Hiram) ssh hgwdev mkdir /cluster/data/mm9/pushQ cd /cluster/data/mm9/pushQ /cluster/bin/scripts/makePushQSql.pl mm9 > mm9.sql 2> stderr.out ## check the stderr.out for anything that needs to be fixed ## copy mm9.sql to hgwbeta:/tmp scp mm9.sql hgwbeta:/tmp ## then on hgwbeta ssh hgwbeta cd /tmp hgsql qapushq < mm9.sql ############################################################################# # STS MARKERS DATA DOWNLOAD (DONE - 2007-07-26 - Hiram) ssh kkstore06 mkdir -p /cluster/data/mm9/bed/STSmarkers/downloads cd /cluster/data/mm9/bed/STSmarkers/downloads # these files appear to be new almost every day time nice -n +19 wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts time nice -n +19 wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases # The new feature in the .aliases file this time are names with # spaces in them ! This changes our parsing business below, # hopefully the spaces in the names won't cause trouble elsewhere. 
time nice -n +19 wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/* # these reports from jax.org appear to be changing daily time nice -n +19 wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt time nice -n +19 wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt time nice -n +19 wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt ls -ogrt # -rw-rw-r-- 1 676 Mar 11 2004 README # -rw-rw-r-- 1 396858 Jan 28 2005 10090.MGI.txt # -rw-rw-r-- 1 390139 Mar 16 2005 10090.WI_MRC_RH.txt # -rw-rw-r-- 1 240688 Mar 16 2005 10090.WI-YAC.txt # -rw-rw-r-- 1 173344 Mar 16 2005 10090.WI-Genetic.txt # -rw-rw-r-- 1 25691253 Jan 13 2006 UniSTS.aliases # -rw-rw-r-- 1 4582158 Jul 5 11:40 UniSTS_mouse.sts # -rw-rw-r-- 1 2841773 Jul 26 03:13 PRB_PrimerSeq.rpt # -rw-rw-r-- 1 5149790 Jul 26 03:13 MRK_Sequence.rpt # -rw-rw-r-- 1 5697140 Jul 26 03:13 MRK_Dump2.rpt # I note the UniSTS.aliases file is over twice as big as was in # Mm7 build. I wonder what got into it ... # What got into it was that it was completely broken. It appeared # to have a vast section of itself duplicated again in the file. # It was cleaned up via: echo -e "#Unique ID\tAliases" > uniqueSTS.aliases grep -v "^#" UniSTS.aliases | sort -n | uniq >> uniqueSTS.aliases mv UniSTS.aliases UniSTS.aliases.broken mv uniqueSTS.aliases UniSTS.aliases # back to our work area, update the bed file # to do this we need a new UniSTS_mouse.alias file # it is created by a combination of information from several # of the above files ! AND ! 
the previous stsInfoMouse.bed file # the db reference here is to the previous build time nice -n +19 ~/kent/src/hg/stsMarkers/fetchAllAliases.csh mm8 # Here is a normal set of errors: # processing UniSTS_mouse.sts to find aliases # # ERROR: KNOWN(==OK) duplicate ID: '108991' encountered at line # # 2384 # processing MGI.aliases # fetching existing aliases from previous stsInfoMouse.bed file # found 27648 potential errors in # /cluster/data/mm8/bed/STSmarkers/stsInfoMouse.bed # to see the errors: grep ERROR stsInfoAliases.txt # verify those stsInfoMouse.bed aliases with UniSTS.aliases # those errors in the previous stsInfoMouse.bed file are an # accumulation of errors from a long long time ago in this chain # of processing. Some day it might be nice to fix them, but they # don't seem to bother anything, so they continue to be carried # forward, and a couple of new ones are added with each assembly. #################################################################### ## STS markers data processing track (DONE - 2007-07-26 - Hiram) ssh hgwdev cd /cluster/data/mm9/bed/STSmarkers # create a new stsInfoMouse.bed file: # Update the m m 8 directory name here to m m 9 # for the next build of m m 10, ...etc... and so forth time ~/kent/src/hg/stsMarkers/updateBed.pl \ /cluster/data/mm8/bed/STSmarkers/stsInfoMouse.bed \ downloads/MRK_Dump2.rpt \ downloads/PRB_PrimerSeq.rpt \ downloads/MRK_Sequence.rpt \ downloads/UniSTS_mouse.alias \ downloads/UniSTS_mouse.sts \ -g downloads/10090.WI-Genetic.txt \ -r downloads/10090.WI_MRC_RH.txt \ -verbose 2> dbg.updateBed | sed -e "s/\t*$//" > newbedfile ~/kent/src/hg/stsMarkers/cleanInfo.pl -mouse newbedfile \ | sed -e "s/\t*$//" > mm9.stsInfoMouse.bed # copy the stsInfoMouse.bed file from working dir to the marker # info storage folder. Added 2 new steps by Yontao # be wary of the archive name here, check the directory and get # the name right here.
mv /cluster/store5/mouseMarker/stsInfoMouse.bed \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.secondTime cp -p mm9.stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed # comparing to previous, numbers increase slightly each time wc -l /cluster/store5/mouseMarker/stsInfoMouse.bed \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.secondTime \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5 # 66782 /cluster/store5/mouseMarker/stsInfoMouse.bed # 60631 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.secondTime # 59843 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 # 58980 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 # 58493 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5 # and from that, create new primer fa, epcr, etc: time ~/kent/src/hg/stsMarkers/luConvertPrimerToFa \ mm9.stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info # the mouseC.fa file will be empty, should be more than last time wc -l mouse?.* # 0 mouseC.fa # 359647 mouseP.fa # 41247 mouseP.info # the equivalent Mm8 files: # 0 mouseC.fa # 308384 mouseP.fa # 34666 mouseP.info # copy the primers over to some filesystem close to the klusters # and split them up to have a small number of sequences in one file mkdir /cluster/bluearc/mm9/stsMarkers cp -p mouseP.fa /cluster/bluearc/mm9/stsMarkers cd /cluster/bluearc/mm9/stsMarkers cp -p /cluster/data/mm9/11.ooc . mkdir split # 356 files for 41,247 sequences, == about 116 sequences per file faSplit sequence mouseP.fa 400 split/mm_ # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. # This process could convert to a modern version of blat with the # filters as described, for example, in the STS markers build in Hg18 # CLUSTER RUN FOR THE STS PRIMERS ssh kk cd /cluster/data/mm9/bed/STSmarkers mkdir primer mkdir ePCR cd primer mkdir out # interestingly, this blat2.2 binary did not function correctly # when given nib files. 
It has only about 1/4th of the number of # alignments as it gets when it used fa files for the target # sequence. ls -1S /cluster/bluearc/mm9/stsMarkers/split > primers.list # will fetch chrom sequences from the 2bit file cut -f1 /cluster/data/mm9/chrom.sizes > chr.list ## next time, make this script produce its results in /scratch/tmp ## then move result file to output instead of writing result ## to output cat << '_EOF_' > runBlat2 #!/bin/csh -fe set primer = /cluster/bluearc/mm9/stsMarkers/split/$1 set root1 = $1:r set fa = $root1.$2.fa set ooc = /cluster/bluearc/mm9/stsMarkers/11.ooc set root2 = $2:r set tmpDir = /scratch/tmp/$root1.$root2 mkdir $tmpDir mkdir -p out/${root2} set out = $3 pushd $tmpDir twoBitToFa -seq=$2 /iscratch/i/mus/mm9/mm9.2bit ${fa} cp -p ${primer} primer.fa cp -p ${ooc} 11.ooc /cluster/bin/i386/blat.2 ${fa} primer.fa -ooc=11.ooc \ -minMatch=1 -minScore=0 -minIdentity=80 -oneOff result.psl popd cp -p ${tmpDir}/result.psl ${out} rm -fr ${tmpDir} '_EOF_' # << happy emacs chmod +x runBlat2 cat << '_EOF_' > template #LOOP ./runBlat2 $(path1) $(path2) {check out line+ out/$(root2)/$(root1).psl} #ENDLOOP '_EOF_' # << happy emacs gensub2 primers.list chr.list template jobList para create jobList para try ... check ... push ... etc ... # Completed: 12425 of 12425 jobs # CPU time in finished jobs: 1438098s 23968.31m 399.47h 16.64d 0.046 y # IO & Wait Time: 237582s 3959.69m 65.99h 2.75d 0.008 y # Average job time: 135s 2.25m 0.04h 0.00d # Longest finished job: 2150s 35.83m 0.60h 0.02d # Submission to last job: 4736s 78.93m 1.32h 0.05d # on the file server ssh kkstore06 cd /cluster/data/mm9/bed/STSmarkers/primer time nice -n +19 pslSort dirs primers.raw.psl temp out/chr* # real 1m34.193s # -rw-rw-r-- 1 700293557 Aug 6 10:22 primers.raw.psl # filter alignments for (qEnd-qStart) vs. (tEnd-tStart) # should not be more than 100 bases different. 
# This filters out about 948,260 alignments, or # %17.4 = 100.0 * 948260 / 5462936 time nice -n +19 pslSort dirs stdout temp out/chr* | awk -F"\t" ' { if (((($13 - $12) - ($17 - $16)) > -100) && ((($13 - $12) - ($17 - $16)) < 100)) {print} } ' > primers.100.psl rmdir temp wc -l *.psl # 5340677 primers.100.psl # 6498150 primers.raw.psl echo "6498150-5340677" | bc -q # 1157473 difference # a rough comparison with previous results: wc -l primers.100.psl \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.100 # 5340677 primers.100.psl # 4514676 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.100 # another kluster run for the ePCR ssh pk cd /cluster/data/mm9/bed/STSmarkers/ePCR cut -f1 /cluster/data/mm9/chrom.sizes > chr.list # Using previously fetched e-PCR source from # ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/ # version 2.3.1 11 Feb 2005 # Had to add the following to both re-PCR_main.cpp and # e-PCR_main.cpp to get them to compile on kolossus: // max and min Copied from /usr/include/mysql/my_global.h #define max(a, b) ((a) >? (b)) #define min(a, b) ((a) <? (b)) cat << '_EOF_' > runPCR #!/bin/csh -fe set chr = $1 set out = $2 set wrkdir = /scratch/tmp/epcr.mm9.$chr set fa = $chr.fa set tmpResult = $chr.result.epcr mkdir $wrkdir twoBitToFa -seq=$chr /san/sanvol1/scratch/mm9/mm9.2bit $wrkdir/$fa pushd $wrkdir /cluster/bin/x86_64/e-PCR \ /cluster/data/mm9/bed/STSmarkers/mouseP.info $fa N=1 M=50 W=5 > $tmpResult popd cp -p $wrkdir/$tmpResult $out rm $wrkdir/$tmpResult rm $wrkdir/$fa rmdir $wrkdir '_EOF_' # << happy emacs chmod +x runPCR cat << '_EOF_' > template #LOOP ./runPCR $(path1) {check out line+ out/$(root1).epcr} #ENDLOOP '_EOF_' # << the mouseP.info was created above gensub2 chr.list single template jobList para create jobList para try para check para push ... etc ...
## two of those produce zero results: # -rw-rw-r-- 1 0 Aug 6 12:53 chr3_random.epcr # -rw-rw-r-- 1 0 Aug 6 12:53 chr16_random.epcr ## hence, the two crashed jobs in the check display: # Completed: 33 of 35 jobs # Crashed: 2 jobs # CPU time in finished jobs: 80940s 1349.01m 22.48h 0.94d 0.003 y # IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y # Average job time: 2327s 38.78m 0.65h 0.03d # Longest finished job: 6980s 116.33m 1.94h 0.08d # Submission to last job: 15589s 259.82m 4.33h 0.18d ssh kkstore06 cd /cluster/data/mm9/bed/STSmarkers/ePCR # all those results become all.epcr cat out/*.epcr > all.epcr # comparing to previous results, should have more with new results: wc -l all.epcr /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR/all.epcr # 87623 all.epcr # 58162 /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR/all.epcr cd /cluster/data/mm9/bed/STSmarkers/primer ~/kent/src/hg/stsMarkers/filterSTSPrimers \ -mouse ../mm9.stsInfoMouse.bed primers.100.psl \ ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat # The output should show an increasing count: # Reading name info from: ../mm9.stsInfoMouse.bed # Reading primer info from: ../mouseP.info # Reading ePCR info from: ../ePCR/all.epcr # Reading alignment results from: primers.100.psl # 100000 # 200000 # ... # 5200000 # 5300000 # Determining ePCR not found from ePCR results # Out of 26332 ePCR alignments examined, not found: 527 ## compare with previous build results wc -l primers.psl.filter.blat \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.blat # 35537 primers.psl.filter.blat # 34043 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.blat ## ouch, mm9 does not have lift files for contigs to chroms ## let's make a contig lift file cd /cluster/data/mm9/jkStuff cp -p /cluster/data/cb3/jkStuff/agpToLift.pl . grep CONTIG ../mouse_37/mm9.contigs.agp \ | ./agpToLift.pl /dev/stdin > mm9.contigs.lift awk '{if (! 
match($5,"N")) print}' ../mouse_37/mm9.fragments.agp \ | /cluster/data/rn3/jkStuff/agpToLift.pl ../chrom.sizes /dev/stdin \ > mm9.fragments.lift cd .. mkdir ctgLifts splitFileByColumn -col=4 jkStuff/mm9.contigs.lift ctgLifts mkdir fragmentLifts splitFileByColumn -col=4 jkStuff/mm9.fragments.lift fragmentLifts ## distribute those in the old-style lift directory hierarchy for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 Un X Y M do rm -fr ${C}/lift done for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 Un X Y M do mkdir -p ${C}/lift if [ -f ctgLifts/chr${C}.contigs.lift ]; then cp -p ctgLifts/chr${C}.contigs.lift ${C}/lift/ordered.lft fi if [ -f ctgLifts/chr${C}_random.contigs.lift ]; then cp -p ctgLifts/chr${C}_random.contigs.lift ${C}/lift/random.lft fi done ## not the fragments # for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 Un X Y M #do # mkdir -p ${C}/lift # if [ -f fragmentLifts/chr${C}.fragments.lift ]; then # cp -p fragmentLifts/chr${C}.fragments.lift ${C}/lift/ordered.lft # fi # if [ -f fragmentLifts/chr${C}_random.fragments.lift ]; then # cp -p fragmentLifts/chr${C}_random.fragments.lift ${C}/lift/random.lft # fi #done ## now, after that side trip, back to the primer business # create file accession_info.rdb touch empty_sequence.inf ~/kent/src/hg/stsMarkers/compileAccInfo -mouse \ /cluster/data/mm9 empty_sequence.inf # 20363 processed mv accession_info.rdb accession_info.rdb.tmp ~/kent/src/hg/stsMarkers/sorttbl -x Chr Ord Start \ < accession_info.rdb.tmp > accession_info.rdb # The -x prints the debug statement: # sort arg: -t" " +0 -1 +1 -2g +2 -3g rm accession_info.rdb.tmp # comparing results to previous # Continuing the trend that began with Mm7, the numbers in # accession_info.rdb continue to decrease. 
Even Mm8 has much less # fragments than did mm7: # e.g.: [hiram@kkstore06 /cluster/data] wc -l mm9/?/chr*.agp mm9/??/chr*.agp | tail -1 # 21699 total [hiram@kkstore06 /cluster/data] wc -l mm8/*/chr*.agp | tail -1 # 21910 total [hiram@kkstore06 /cluster/data] wc -l mm7/*/chr*.agp | tail -1 # 70125 total [hiram@kkstore06 /cluster/data] wc -l mm6/*/chr*.agp | tail -1 # 170812 total wc -l accession_info.rdb \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/accession_info.rdb /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.blat # 20333 accession_info.rdb # 20385 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/accession_info.rdb # creates epcr.not.found.nomatch and epcr.not.found.psl ~/kent/src/hg/stsMarkers/epcrToPsl -mouse \ epcr.not.found ../mouseP.info \ accession_info.rdb /cluster/data/mm9/mm9.2bit 2> dbg.epcrToPsl # the dbg.epcrToPsl has a number of lines complaining about bad # primers in ../mouseP.info - and indeed they are bad primers, # they do not have a second primer. 
# Comparing results to previous: wc -l epcr* \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr* # 527 epcr.not.found # 0 epcr.not.found.nomatch # 527 epcr.not.found.psl # 520 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr.not.found # 0 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr.not.found.nomatch # 520 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr.not.found.psl # Mm7 wc epcr* wc -l /cluster/data/mm7/bed/STSmarkers/primer/epcr* # 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found # 0 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.nomatch # 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.psl # 158 /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl # 1106 total cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter wc -l primers.psl.filter \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter # 36064 primers.psl.filter # 34563 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter # create primers.psl.filter.lifted.initial # The PATH setting allows extractPslInfo to find other programs that it # is going to use. 
PATH=~/kent/src/hg/stsMarkers:$PATH \ ~/kent/src/hg/stsMarkers/extractPslInfo primers.psl.filter wc -l *.initial \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.initial # 36040 primers.psl.filter.initial # 34545 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.initial # create primers.psl.filter.lifted.initial.acc PATH=~/kent/src/hg/stsMarkers:$PATH \ ~/kent/src/hg/stsMarkers/findAccession -agp \ -mouse primers.psl.filter.initial /cluster/data/mm9 wc -l *.initial.acc /cluster/data/mm8/bed\ /STSmarkers.2006-08-29/primer/primers.psl.filter.initial.acc # 36040 primers.psl.filter.initial.acc # 34545 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.initial.acc # this needs to be -rat as that specifies how to scan the # stsInfoMouse.bed file and it does not work if you use -mouse # it is not clear what -mouse would mean to this script, some other file # format perhaps from the stsInfoMouse.bed format. ~/kent/src/hg/stsMarkers/getStsId -rat \ ../mm9.stsInfoMouse.bed primers.psl.filter.initial.acc \ | sort -k4,4n > primers.final wc -l primers.final \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.final # 36040 primers.final # 34545 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.final cd /cluster/data/mm9/bed/STSmarkers # stsMarkers.final is empty for mouse touch stsMarkers.final dummy PATH=~/kent/src/hg/stsMarkers:$PATH \ ~/kent/src/hg/stsMarkers/combineSeqPrimerPos \ stsMarkers.final primer/primers.final > stsMarkers_pos.rdb wc -l stsMarkers_pos.rdb \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMarkers_pos.rdb # 34232 stsMarkers_pos.rdb # 33048 /cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMarkers_pos.rdb PATH=~/kent/src/hg/stsMarkers:$PATH \ ~/kent/src/hg/stsMarkers/createStsBed \ mm9.stsInfoMouse.bed stsMarkers_pos.rdb 500 \ | sort -k1,1 -k2,2n | sed -e "s/ //g" > stsMapMouse.bed # The sed removes unneeded blanks # verify score profile remains similar awk -F'\t' '{print $5}' 
stsMapMouse.bed | sort -n | uniq -c # 591 500 # 1774 750 # 28529 1000 awk -F'\t' '{print $5}' \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMapMouse.bed \ | sort -n | uniq -c # 546 500 # 1650 750 # 27705 1000 wc -l stsMapMouse.bed \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMapMouse.bed # 30894 stsMapMouse.bed # 29901 /cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMapMouse.bed ## check the names, look for odd ones ## the bogus names "-" were fixed for mm9 awk -F'\t' '{print $4}' stsMapMouse.bed | sort | head awk -F'\t' '{print $4}' stsMapMouse.bed | sort | tail # loading STS markers tables ssh hgwdev cd /cluster/data/mm9/bed/STSmarkers ~/kent/src/hg/stsMarkers/ucscAlias.pl \ mm9.stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings # this does leave messages in ucscStsAlias.warnings but they seem # to be very similar to Mm6 with just a few new ones wc -l ucscStsAlias.tab \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/ucscStsAlias.tab # 146359 ucscStsAlias.tab # 146767 /cluster/data/mm8/bed/STSmarkers.2006-08-29/ucscStsAlias.tab ssh hgwdev cd /cluster/data/mm9/bed/STSmarkers ## when reloading: hgsql -e "drop table stsAlias;" mm9 hgsql -e "drop table stsMapMouseNew;" mm9 hgsql -e "drop table stsInfoMouseNew;" mm9 hgsql mm9 < ~/kent/src/hg/lib/stsAlias.sql hgsql -e \ 'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm9 hgsql mm9 < ~/kent/src/hg/lib/stsMapMouseNew.sql hgsql -e \ 'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm9 hgsql mm9 < ~/kent/src/hg/lib/stsInfoMouseNew.sql hgsql -e \ 'load data local infile "mm9.stsInfoMouse.bed" into table stsInfoMouseNew;' mm9 hgsql -e "drop table all_sts_primer;" mm9 hgLoadPsl -nobin -table=all_sts_primer mm9 primer/primers.psl.filter # load of all_sts_primer did not go as planned: 36064 record(s), # 0 row(s) skipped, 1 warning(s) loading primer/primers.psl.filter # After warnings, checkTableCoords to find problems: checkTableCoords -verboseBlocks mm9 all_sts_primer 
mm9.all_sts_primer item 61999 chr10:62485403-62485439: blocks 0 and 1 overlap. mm9.all_sts_primer has 1 records with overlapping blocks. # Strip the offending item from the load hgsql -e 'delete from all_sts_primer where tName="chr10" AND tStart=62485403 AND tEnd=62485439;' mm9 # load primer sequences mkdir /gbdb/mm9/stsMarker ln -s /cluster/data/mm9/bed/STSmarkers/mouseP.fa \ /gbdb/mm9/stsMarker/mouseP.fa # PLEASE NOTE: If you are going to reload this business, use the # -replace option on this hgLoadSeq # hgLoadSeq -replace mm9 /gbdb/mm9/stsMarker/mouseP.fa # otherwise there will be a problem that the seq and extFile tables # will be out of sync. hgLoadSeq -replace mm9 /gbdb/mm9/stsMarker/mouseP.fa # Adding /gbdb/mm9/stsMarker/mouseP.fa # 41247 sequences # Warning: load of seq did not go as planned: 41330 record(s), 0 row(s) # skipped, 1 warning(s) loading ./seq.tab ## joinerCheck should be clean: joinerCheck -keys -identifier=mouseStsTrueName -database=mm9 all.joiner # Checking keys on database mm9 # mm9.stsAlias.trueName - hits 146350 of 146359 ok # mm9.all_sts_primer.qName - hits 35537 of 36063 ok # mm9.stsMapMouseNew.name - hits 30894 of 30894 ok featureBits mm9 all_sts_primer # 3795229 bases of 2620346127 (0.145%) in intersection featureBits mm8 all_sts_primer # 3700897 bases of 2567283971 (0.144%) in intersection featureBits mm9 stsMapMouseNew # 4884563 bases of 2620346127 (0.186%) in intersection featureBits mm8 stsMapMouseNew # 4812616 bases of 2567283971 (0.187%) in intersection hgsql -N mm9 -e "select count(*) from stsAlias;" # 146359 hgsql -N mm8 -e "select count(*) from stsAlias;" # 146767 hgsql -N mm9 -e "select count(*) from stsInfoMouseNew;" # 66782 hgsql -N mm8 -e "select count(*) from stsInfoMouseNew;" # 60631 # compare old and new name lists, not much difference: awk '{print $4}' stsMapMouse.bed | sort -u > mm9.nameList # in common with previous version: comm -12 \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/mm8.nameList \ mm9.nameList |
wc -l # 28596 # unique to previous version: comm -23 \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/mm8.nameList \ mm9.nameList | wc -l # 111 # unique to this new set: comm -13 \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/mm8.nameList \ mm9.nameList | wc -l # 1017 ########################################################################### # Reset default position to be same area as Mm8, 2007-08-02 - Hiram hgsql -e \ 'update dbDb set defaultPos="chr12:57795963-57815592" where name="mm9";' \ hgcentraltest ############################################################################## # CLONE ENDS - BACEND TRACK (DONE - 2007-08-02 - 2007-08-03 - Hiram) ssh kkstore06 cd /cluster/data/mm9 # check disk space: 1.2T free df -h . # Filesystem Size Used Avail Use% Mounted on # /export/cluster/store4 # 2.3T 997G 1.2T 46% /cluster/store4 mkdir -p bed/cloneend/ncbi cd bed/cloneend/ncbi wget --timestamping \ ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/mus_musculus/* cd /cluster/data/mm9/bed/cloneend # seems like the *.mfa files were split just for convenience # concatenate, and convert the title line of the fasta sequences cat << '_EOF_' > convert.pl #!/usr/bin/env perl use strict; use warnings; while (my $line = <>) { if ($line !~ m/^>/) { print $line } else { my @fields = split('\|', $line); my $fieldCount = scalar(@fields); my $printed = 0; for (my $i = 0; $i < $fieldCount; $i++) { if ($fields[$i] eq "gb" || $fields[$i] eq "dbj") { (my $name, my $vers) = split(/\./,$fields[$i+1]); print ">$name\n"; $i= $fieldCount; $printed = 1; } } die("Failed for $line\n") if (!$printed); } } '_EOF_' # << happy emacs chmod +x convert.pl for F in ncbi/*.mfa.gz do zcat ${F} done | ./convert.pl | gzip > cloneEnds.fa.gz # make sure nothing got broken: faSize ncbi/*.mfa.gz # 498162791 bases (16779168 N's 481383623 real 304962409 upper # 176421214 lower) in 789466 sequences in 44 files faSize cloneEnds.fa.gz # 498162791 bases (16779168 N's 481383623 real 304962409 upper # 176421214 lower) in 789466 
sequences in 1 files # identical numbers, curiously, these are exactly the same numbers # as were seen during the build of Mm7. Do these things not # change with time ? # concatenate the text files, too for F in ncbi/*.txt.gz do zcat ${F} done | gzip > all.txt.gz # generate cloneEndPairs.txt and cloneEndSingles.txt zcat all.txt.gz | ~/kent/src/hg/utils/cloneEndParse.pl /dev/stdin # Reading in end info # Writing out pair info # Writing out singleton info # 354485 pairs and 78423 singles # faSplit does not function correctly if given a .gz source file # AND, we need the unzipped file for sequence loading below gunzip cloneEnds.fa.gz # split mkdir split cd split ## adjust split size based on previous kluster performance, see below faSplit sequence ../cloneEnds.fa 500 cloneEnds # Check to ensure no breakage: faSize c*.fa # 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214 # lower) in 789466 sequences in 98 files # %35.41 masked total, %36.65 masked real # same numbers as before # Copy to san for cluster runs mkdir /san/sanvol1/scratch/mm9/cloneEnds rsync -a --progress --stats ./ /san/sanvol1/scratch/mm9/cloneEnds/ rm * cd .. rmdir split # may as well remove the previous assembly copy: rm -fr /san/sanvol1/scratch/mm8/cloneEnds # load sequences ssh hgwdev mkdir /gbdb/mm9/cloneend cd /gbdb/mm9/cloneend ln -s /cluster/data/mm9/bed/cloneend/cloneEnds.fa . 
cd /tmp hgLoadSeq mm9 /gbdb/mm9/cloneend/cloneEnds.fa # Advisory lock created # Creating .tab file # Adding /gbdb/mm9/cloneend/cloneEnds.fa # 789466 sequences # Updating seq table # Advisory lock has been released # All done ## clean up garbage rm seq.tab ############################################################################ # BACEND SEQUENCE ALIGNMENTS (DONE - 2007-08-06 - Hiram) ssh kkstore06 mkdir /cluster/data/mm9/noMask cd /cluster/data/mm9/ # Need an unmasked sequence for this work for C in `cut -f1 chrom.sizes` do echo twoBitToFa -noMask -seq=${C} mm9.2bit noMask/${C}.fa twoBitToFa -noMask -seq=${C} mm9.2bit noMask/${C}.fa done # verify nothing broken faSize noMask/c*.fa # 2725765481 bases (105419509 N's 2620345972 real 2620345972 upper 0 lower) in # 35 sequences in 35 files # note, this was the same as long ago when the mm9.2bit was measured: # 2725765481 bases (105419509 N's 2620345972 real 1466644650 upper # 1153701322 lower) in 35 sequences in 1 files # copy to san for kluster run mkdir /san/sanvol1/scratch/mm9/noMask rsync -a --progress --stats noMask/ /san/sanvol1/scratch/mm9/noMask/ # 11.ooc file is already there from the genbank build # and now for the kluster run ssh pk mkdir /cluster/data/mm9/bed/bacends cd /cluster/data/mm9/bed/bacends mkdir out # allow blat to run politely in /tmp while it writes output, then # copy results to results file: cat << '_EOF_' > runBlat #!/bin/csh -fe set root1 = $1 set root2 = $2 set result = $3 rm -fr /scratch/tmp/${root1}_${root2} mkdir /scratch/tmp/${root1}_${root2} cp -p /san/sanvol1/scratch/mm9/11.ooc /scratch/tmp/${root1}_${root2} cp -p /san/sanvol1/scratch/mm9/noMask/${root1}.fa \ /scratch/tmp/${root1}_${root2} cp -p /san/sanvol1/scratch/mm9/cloneEnds/${root2}.fa \ /scratch/tmp/${root1}_${root2} pushd /scratch/tmp/${root1}_${root2} /cluster/bin/x86_64/blat ${root1}.fa ${root2}.fa \ -ooc=11.ooc ${root1}.${root2}.psl popd mkdir -p out/${root2} rm -f ${result} cp -p 
/scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl ${result} rm -fr /scratch/tmp/${root1}_${root2} '_EOF_' # << happy emacs chmod +x runBlat cat << '_EOF_' > template #LOOP ./runBlat $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # << emacs happy ls -1S /san/sanvol1/scratch/mm9/cloneEnds/cloneEnds*.fa > bacEnds.lst ls -1S /san/sanvol1/scratch/mm9/noMask/chr*.fa > chrom.lst gensub2 chrom.lst bacEnds.lst template jobList para create jobList # 17150 jobs written to batch para try, check, push, etc ... # Completed: 17150 of 17150 jobs # CPU time in finished jobs: 698826s 11647.09m 194.12h 8.09d 0.022 y # IO & Wait Time: 262556s 4375.94m 72.93h 3.04d 0.008 y # Average job time: 56s 0.93m 0.02h 0.00d # Longest finished job: 332s 5.53m 0.09h 0.00d # Submission to last job: 250536s 4175.60m 69.59h 2.90d ssh kkstore06 cd /cluster/data/mm9/bed/bacends screen mkdir temp time nice -n +19 pslSort dirs raw.psl temp out/* > pslSort.out 2>&1 & # real 22m4.019s # -rw-rw-r-- 1 8423154460 Aug 6 13:40 raw.psl time nice -n +19 pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 \ -noIntrons raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 & # real 6m1.174s # -rw-rw-r-- 1 1236810588 Aug 6 13:51 bacEnds.psl # split this large psl file into pieces with 100,000 lines each # to prepare for a sort time nice -n +19 ~/kent/src/hg/pslSplitOnTarget/pslSplitLineCount.pl \ 100000 bacEnds.psl split/bacends # real 0m15.389s # save original file, then sort mv bacEnds.psl bacEnds.psl.save time pslSort dirs bacEnds.psl temp split # real 2m19.131s # -rw-rw-r-- 1 1236810588 Aug 6 14:38 bacEnds.psl ## compare to previous results wc -l bacEnds.psl /cluster/data/mm8/bed/bacends/bacEnds.psl # 10294737 bacEnds.psl # 10229750 /cluster/data/mm8/bed/bacends/bacEnds.psl ## work at top-level directory after this mkdir /cluster/data/mm9/bacends cp -p bacEnds.psl /cluster/data/mm9/bacends ############################################################################ # BACEND 
PAIRS TRACK (DONE - 2007-08-06 - Hiram) ssh kolossus cd /cluster/data/mm9/bacends time nice -n +19 pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.psl \ ../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds # real 0m49.120s ## produces files: # -rw-rw-r-- 1 199185 Aug 6 14:46 bacEnds.slop # -rw-rw-r-- 1 144486 Aug 6 14:46 bacEnds.short # -rw-rw-r-- 1 24399410 Aug 6 14:46 bacEnds.pairs # -rw-rw-r-- 1 25421100 Aug 6 14:46 bacEnds.orphan # -rw-rw-r-- 1 201794 Aug 6 14:46 bacEnds.mismatch # -rw-rw-r-- 1 15928 Aug 6 14:46 bacEnds.long # create header required by "rdb" tools echo -e \ "chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header cat header bacEnds.pairs | \ /cluster/bin/scripts/row score ge 300 | \ /cluster/bin/scripts/sorttbl chr start | \ /cluster/bin/scripts/headchg -del > bacEndPairs.bed # -rw-rw-r-- 1 24201067 Aug 6 14:49 bacEndPairs.bed cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \ /cluster/bin/scripts/sorttbl chr start | \ /cluster/bin/scripts/headchg -del > bacEndPairsBad.bed # -rw-rw-r-- 1 6888559 Aug 6 14:49 bacEndPairsBad.bed /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \ bacEndPairsBad.bed >j1.out # -rw-rw-r-- 1 989173324 Aug 6 14:52 j1.out cat j1.out | /cluster/bin/scripts/sorttbl tname tstart >j2.out # -rw-rw-r-- 1 989173324 Aug 6 15:07 j2.out cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl # -rw-rw-r-- 1 989173165 Aug 6 15:08 bacEnds.load.psl rm j1.out j2.out # CHECK bacEndPairs.bed ID's to make sure they have no blanks in them awk '{print $5}' bacEndPairs.bed | sort -u # result should be the scores, no extraneous strings: # 1000 # 300 # 375 # 500 # 750 # edit the file and fix it if it has a bad name. 
wc -l bacEnds.load.psl /cluster/data/mm8/bacends/bacEnds.load.psl # 8167555 bacEnds.load.psl # 8132116 /cluster/data/mm8/bacends/bacEnds.load.psl # load into database ssh hgwdev cd /cluster/data/mm9/bacends hgLoadBed -notItemRgb mm9 bacEndPairs bacEndPairs.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 239101 elements of size 11 # note - this track isn't pushed to RR, just used for assembly QA hgLoadBed -notItemRgb mm9 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 84679 elements of size 11 # NOTE: truncates file to 0 if -nobin is used time hgLoadPsl mm9 -table=all_bacends bacEnds.load.psl # load of all_bacends did not go as planned: 8167555 record(s), 0 row(s) # skipped, 2 warning(s) loading psl.tab # real 4m1.142s ## to find out what the warnings are about: ## first, on hgwdev, dump the loaded table hgsql -N -e "select qName from all_bacends;" mm9 \ | sort -u > all_bacends.qName.txt ## then on kkstore06 compare the resulting load with the requested load file diff psl.tab mm9.all_bacends.txt ## this diff shows two markers had their qBaseInsert count changed from ## a negative number to a zero since that field is an unsigned ## AG326808 and AG609381 ## joinerCheck should be clean: joinerCheck -keys -identifier=bacEndNames -database=mm9 all.joiner # Checking keys on database mm9 # mm9.bacEndPairs.lfNames - hits 478202 of 478202 ok featureBits mm9 all_bacends # 349085662 bases of 2620346127 (13.322%) in intersection featureBits mm8 all_bacends # 327086559 bases of 2567283971 (12.741%) in intersection featureBits mm7 all_bacends # 334161740 bases of 2583394090 (12.935%) in intersection featureBits mm6 all_bacends # 336981828 bases of 2597150411 (12.975%) in intersection featureBits mm5 all_bacends # 268502414 bases of 2615483787 (10.266%) in intersection featureBits mm4 all_bacends # 243096171 bases of 2627444668 (9.252%) in intersection featureBits mm9 bacEndPairs # 209909804 bases of 2620346127 
(8.011%) in intersection featureBits mm8 bacEndPairs # 2572527283 bases of 2567283971 (100.204%) in intersection featureBits mm7 bacEndPairs # 2578837424 bases of 2583394090 (99.824%) in intersection featureBits mm6 bacEndPairs # 2570768812 bases of 2597150411 (98.984%) in intersection featureBits mm5 bacEndPairs # 2567958504 bases of 2615483787 (98.183%) in intersection featureBits mm4 bacEndPairs # 2549945356 bases of 2627444668 (97.050%) in intersection featureBits mm9 bacEndPairsBad # 48850302 bases of 2620346127 (1.864%) in intersection ####################################################################### # Special one-off bacEnds added (DONE - 2008-01-09 - Hiram) ssh hgwdev # BAC RP23-473N24 was reported missing # its two ends are AZ095043 and AZ095046 # end AZ095046 maps just fine to the correct location on chr7 # the end AZ095043 does not map correctly when using the -ooc # option to blat. Run the blat without ooc and it does the # correct thing. From the genbank record: cd /cluster/data/mm9/bed/bacends cat << '_EOF_' > AZ095043.fa >AZ095043 TTTATCATGAATGGGTGTTGTATCTTGTCGAAGCTTTTTCCGCATCTAACGAGATGATCATGTGGTTTTT GTCTTTGAGTTTGTTTATATAATGGATTACATTGATGGATTTTCATATATTAAACCATCCCTGCATCCCT GGAATAAAACCTACTTGGTCAGGATGGATGACTGCCAAGGCGGACCGGG '_EOF_' blat /san/sanvol1/scratch/mm9/noMask/chr7.fa AZ095043.fa AZ095043.raw.psl pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 \ -noIntrons AZ095043.raw.psl AZ095043.psl /dev/null # before adding this one item: hgsql -e "select count(*) from all_bacends;" mm9 # 8167555 hgLoadPsl -table=all_bacends -append mm9 AZ095043.psl # verify one row added hgsql -e "select count(*) from all_bacends;" mm9 # 8167556 # Using the Mm6 records from all_bacends and bacEndPairs as a guide # The bed record for this BAC is therefore: cat << '_EOF_' > RP23-473N24.bed chr7 150015932 150193247 RP23-473N24 1000 - all_bacends 2 150015932,150192880 172,367 AZ095043,AZ095046 '_EOF_' # verify rows before adding this one new row hgsql -e "select count(*) 
from bacEndPairs;" mm9 # 239101 # YOW ! The -oldTable option didn't work ! I'm guessing that with # the -sqlTable argument it became confused hgLoadBed -oldTable -notItemRgb mm9 bacEndPairs RP23-473N24.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # reload everything: cat ../../bacends/bacEndPairs.bed RP23-473N24.bed \ | hgLoadBed -notItemRgb mm9 bacEndPairs stdin \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql ####################################################################### ## create random contigs for genscan and other alignment tasks ## DONE - 2007-08-07 - Hiram ssh kkstore06 mkdir randomContigs for L in ?/lift/random.lft ??/lift/random.lft do D=${L/\/lift*} echo $L $D ~/kent/src/hg/utils/lft2BitToFa.pl mm9.2bit ${L} \ > randomContigs/chr${D}_random.ctg.fa done # # Verify these *.ctg.fa files have the same bases as the ordinary # chr*_random.fa files: ## don't have these fasta files yet, extract them from the 2bit grep random chrom.sizes | cut -f1 | sed -e "s/^chr//; s/_random//" \ | while read C do echo "twoBitToFa -seq=chr${C}_random mm9.2bit ${C}/chr${C}_random.fa" twoBitToFa -seq=chr${C}_random mm9.2bit ${C}/chr${C}_random.fa done ## now we can measure them faSize ?/chr?_random.fa ??/chr??_random.fa # 70853964 bases (9033771 N's 61820193 real 26427973 upper # 35392220 lower) in 13 sequences in 13 files ## and our contig versions faSize randomContigs/*.ctg.fa # 62053964 bases (233771 N's 61820193 real 26427973 upper # 35392220 lower) in 189 sequences in 13 files ## note, same number of real, upper and lower, only different N's ## it would be nice to have the actual chroms too grep -v random chrom.sizes | cut -f1 | sed -e "s/^chr//" \ | while read C do echo "twoBitToFa -seq=chr${C} mm9.2bit ${C}/chr${C}.fa" twoBitToFa -seq=chr${C} mm9.2bit ${C}/chr${C}.fa done # measure that result faSize ?/chr?.fa ??/chr??.fa # 2654911517 bases (96385738 N's 2558525779 real 1438609919 # upper 1119915860 lower) in 22 sequences in 22 files ## is this the amount 
of sequence specified in chrom.sizes ? grep -v random chrom.sizes | ave -col=2 stdin | grep total # total 2654911517.000000 ## same number, nothing lost ######################################################################### # GENSCAN PREDICTIONS (DONE - 2007-08-07 - 2007-08-10 - Hiram) ssh kkstore06 # Create a 2bit file with the full chrom sequences and the # random contigs, all hard masked ## later it was found that chr16_random.ctg.fa should not be in ## this genscan run. So, it was temporarily taken out of this directory ## and this sequence was rerun to avoid it. cat ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \ | maskOutFa stdin hard stdout \ | faToTwoBit stdin mm9Chroms_RandomContigs.hard.2bit # with chr16_random removed: # 2716961487 bases (1251923595 N's 1465037892 real 1465037892 upper 0 # lower) in 210 sequences in 1 files # make sure it still has all the unmasked sequence in it: (incl 16) twoBitToFa mm9Chroms_RandomContigs.hard.2bit stdout \ | faSize stdin # 2716965481 bases (1251927589 N's 1465037892 real 1465037892 upper # 0 lower) in 211 sequences in 1 files twoBitToFa mm9.2bit stdout | faSize stdin # 2725765481 bases (105419509 N's 2620345972 real 1465037892 upper # 1155308080 lower) in 35 sequences in 1 files # note the upper bases are the same, the lowers have become N's # lower 1155308080 + upper 1465037892 = 2620345972 real # N's 1251927589 - N's 105419509 = 1146508080 == # N's in gaps between contigs # And, make sure there aren't any sequences in this lot that have # become all N's with no sequence left in them. 
This drives genscan nuts twoBitToFa mm9Chroms_RandomContigs.hard.2bit stdout \ | faCount stdin > chroms_randoms.faCount # the lowest three are: egrep -v "^#|^total" chroms_randoms.faCount \ | awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3 # NT_166474.1 75 # NT_166461.1 66 # NT_166481.1 39 # NT_166325.1 0 ## This last one is the entire chr16_random and it is only 3,994 bases ## long and is marked entirely by RepeatMasker as a line. It needs ## to be skipped during the run of genscan. Go back to the 2bit creation ## and do not include chr16_random # creating 4,000,000 sized chunks, the chroms stay together as # single pieces. The contigs get grouped together into 4,000,000 # sized fasta files. You don't want to break these things up # because genscan will be doing its own internal 2.4 million # window on these pieces, and the gene names are going to be # constructed from the sequence name in these fasta files. The # gene names are much better when they are this simple chrN.M # numbering scheme, or in the case of a contig: contig_name.M # where the M is a sequence number that genscan will assign to # each gene it discovers. mkdir hardChunks twoBitToFa mm9Chroms_RandomContigs.hard.2bit stdout \ | faSplit about stdin 4000000 hardChunks/c_ ssh kkr1u00 mkdir /iscratch/i/mus/mm9/hardChunks cd /iscratch/i/mus/mm9/hardChunks rsync -a --progress /cluster/data/mm9/hardChunks/ . for R in 2 3 4 5 6 7 8 do rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/hardChunks/ done ssh hgwdev mkdir /cluster/data/mm9/bed/genscan cd /cluster/data/mm9/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). 
ssh kki cd /cluster/data/mm9/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) # Since we split on gaps, we have no chunks like that. You can # verify with faCount on the chunks. ls -1Sr /iscratch/i/mus/mm9/hardChunks/c_*.fa > genome.list ## for next time, this isn't a parasol safe method of operation. ## if genscan is writing answers to gtf/ pep/ and subopt/ during ## its operation and it fails, parasol wouldn't be able to verify that ## it was complete merely by file existence check. This should work ## in scratch/tmp entirely, then copy results back after it is done. # Create template file, for gensub2. For example (3-line file): cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << happy emacs gensub2 genome.list single template jobList para create jobList para try, check, push, check, ... 
# Completed: 35 of 36 jobs # CPU time in finished jobs: 279581s 4659.68m 77.66h 3.24d 0.009 y # IO & Wait Time: 3390s 56.50m 0.94h 0.04d 0.000 y # Average job time: 8085s 134.75m 2.25h 0.09d # Longest finished job: 32422s 540.37m 9.01h 0.38d # Submission to last job: 122301s 2038.35m 33.97h 1.42d # There was a failed job, going to kolossus and running it again, # it takes a very long time, and fails with this cryptic error: # No overlap between a and b in mergeTwo ssh kolossus cd /cluster/data/mm9/bed/genscan time /cluster/bin/x86_64/gsBig /iscratch/i/mus/mm9/hardChunks/c_06.fa \ gtf/c_06.gtf -trans=pep/c_06.pep -subopt=subopt/c_06.bed \ -exe=hg3rdParty/genscanlinux/genscan \ -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp \ -window=2400000 # real 922m2.382s # run it with a reduced window size to see if it will complete time nice -n +19 /cluster/bin/x86_64/gsBig \ /iscratch/i/mus/mm9/hardChunks/c_06.fa \ gtf/c_06.gtf -trans=pep/c_06.pep -subopt=subopt/c_06.bed \ -exe=hg3rdParty/genscanlinux/genscan \ -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp \ -window=2000000 # real 648m24.682s ## that one failed too, with an error: # /scratch/tmp/temp_gsBig_10943_chr7_38.genscan is not a GENSCAN output file ## and the contents of that file said: # Insufficient memory error: results may be unreliable. # Try running program an a portion of sequence. # Let's try splitting up this chr7 on the gaps, which there are plenty # of in this hard masked sequence. Ended up breaking the chr7 sequence # with the non bridged lift file. See the lft2BitToFa.pl file in # the chr7_split directory. 
# on kkstore06 ssh kkstore06 mkdir /cluster/data/mm9/bed/genscan/chr7_split cd /cluster/data/mm9/bed/genscan/chr7_split ./lft2BitToFa.pl ../../../mm9.2bit *.lft > chr7.contigs.hard.fa mkdir /cluster/data/mm9/bed/genscan/chr7_run cd /cluster/data/mm9/bed/genscan/chr7_run mkdir split faSplit sequence ../chr7_split/chr7.contigs.hard.fa 100 split/chr7_ ## Now, on the small kluster ssh kki cd /cluster/data/mm9/bed/genscan/chr7_run mkdir gtf pep subopt # Create template file, for gensub2. For example (3-line file): cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=../hg3rdParty/genscanlinux/genscan -par=../hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000 #ENDLOOP '_EOF_' # << happy emacs ls -1S split/chr7_*.fa > chr7.list gensub2 chr7.list single template jobList para create jobList para try ... check ... push ... etc... 
# Completed: 15 of 15 jobs # CPU time in finished jobs: 4226s 70.43m 1.17h 0.05d 0.000 y # IO & Wait Time: 215s 3.59m 0.06h 0.00d 0.000 y # Average job time: 296s 4.93m 0.08h 0.00d # Longest finished job: 861s 14.35m 0.24h 0.01d # Submission to last job: 861s 14.35m 0.24h 0.01d # lift these chr7 results into a single file, # fixup the gene names with the sed to remove the lift name effect ssh kkstore06 cd /cluster/data/mm9/bed/genscan/chr7_run cat gtf/chr7_*.gtf | liftUp -type=.gtf stdout \ ../chr7_split/nonBridgedChr7.lft error stdin \ | sed -e "s/chr7\.\([0-9][0-9]*\)\./chr7.\1/g" > chr7.gtf cat subopt/chr7_*.bed | liftUp -type=.bed stdout \ ../chr7_split/nonBridgedChr7.lft error stdin \ | sed -e "s/chr7\.\([0-9][0-9]*\)\./chr7.\1/g" > chr7.subopt.bed cat pep/chr7_*.pep | sed -e "s/chr7\.\([0-9][0-9]*\)\./chr7.\1/g" > chr7.pep ## these results become the c_06 results in the main run cp -p chr7.pep ../pep/c_06.pep cp -p chr7.subopt.bed ../subopt/c_06.bed cp -p chr7.gtf ../gtf/c_06.gtf ## after the chr7 business above, back to the mainline processing # cat and lift the results into single files ssh kkstore06 cd /cluster/data/mm9/bed/genscan cat gtf/c_*.gtf | liftUp -type=.gtf genscan.gtf \ ../../jkStuff/mm9.contigs.lift carry stdin cat subopt/c_*.bed | liftUp -type=.bed genscanSubopt.bed \ ../../jkStuff/mm9.contigs.lift carry stdin cat pep/c_*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/mm9/bed/genscan ldHgGene mm9 -gtf genscan genscan.gtf # Read 45189 transcripts in 324075 lines in 1 files # 45189 groups 34 seqs 1 sources 1 feature types # 45189 gene predictions hgPepPred mm9 generic genscanPep genscan.pep hgLoadBed mm9 genscanSubopt genscanSubopt.bed # Loaded 525904 elements of size 6 # check the numbers time nice -n +19 featureBits mm9 genscan # 55293837 bases of 2620346127 (2.110%) in intersection time nice -n +19 featureBits mm8 genscan # 54455852 bases of 2567283971 (2.121%) in intersection time nice -n +19 featureBits mm8 
knownGene:cds # 28459053 bases of 2567283971 (1.109%) in intersection featureBits mm7 genscan # 54864694 bases of 2583394090 (2.124%) in intersection time nice -n +19 featureBits mm7 knownGene:cds # 27531524 bases of 2583394090 (1.066%) in intersection featureBits mm9 genscanSubopt # 57044145 bases of 2620346127 (2.177%) in intersection featureBits mm8 genscanSubopt # 57048581 bases of 2567283971 (2.222%) in intersection featureBits mm7 genscanSubopt # 57512333 bases of 2583394090 (2.226%) in intersection featureBits mm6 genscanSubopt # 57856316 bases of 2597150411 (2.228%) in intersection featureBits mm5 genscanSubopt # 58474899 bases of 2615483787 (2.236%) in intersection featureBits mm4 genscanSubopt # 59601009 bases of 2627444668 (2.268%) in intersection featureBits mm3 genscanSubopt # 56085184 bases of 2505900260 (2.238%) in intersection ############################################################################# # BLASTZ SELF (DONE - 2007-08-07 - 2007-08-31 - Hiram) # using chain min score of 10,000 to cut down on the volume of data # trying a two pass sequence, chroms with chroms, then randoms to chroms # swap the randoms, then combine the three results into a final set ssh kkstore06 cd /cluster/data/mm9 time nice -n +19 faToTwoBit ?/chr?.fa ??/chr??.fa mm9.chroms.2bit time nice -n +19 faToTwoBit randomContigs/chr*.ctg.fa mm9.randomContigs.2bit ssh kkr1u00 cd /iscratch/i/mus/mm9 cp -p /cluster/data/mm9/mm9.chroms.2bit . cp -p /cluster/data/mm9/mm9.randomContigs.2bit . 
twoBitInfo mm9.chroms.2bit mm9.chroms.sizes twoBitInfo mm9.randomContigs.2bit mm9.randomContigs.sizes for R in 2 3 4 5 6 7 8 do rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/ done ssh kkstore06 mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07 cd /cluster/data/mm9/bed/blastzSelf.2007-08-07 cat << '_EOF_' > DEF # mouse vs mouse BLASTZ_H=2000 BLASTZ_M=200 # TARGET: Mouse Mm9 SEQ1_DIR=/iscratch/i/mus/mm9/mm9.chroms.2bit SEQ1_LEN=/cluster/data/mm9/mm9.chroms.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Mouse Mm9 SEQ2_DIR=/iscratch/i/mus/mm9/mm9.chroms.2bit SEQ2_LEN=/cluster/data/mm9/mm9.chroms.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzSelf.2007-08-07 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs ## run this in a screen on kkstore06 cd /cluster/data/mm9/bed/blastzSelf.2007-08-07 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \ -stop=load `pwd`/DEF > blastz.out 2>&1 & # This was a tricky one to complete. A situation was fixed in the # blastz-run-ucsc script which may have helped, but then there were # 32 jobs that would only complete on the kki kluster. The kk nodes # complained about running out of memory. After a completed run was # finished, and verified: ssh kkstore06 cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/psl find . -type f | wc -l # 77284 wc -l ../run.blastz/jobList # wc -l ../run.blastz.jobList # finished the rest by continuing at the 'cat' step: time doBlastzChainNet.pl -verbose=2 \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \ -continue=cat -stop=load `pwd`/DEF > cat.out 2>&1 & # real 285m33.094s # failed during the load because of the SEQ?_LEN specification pointing # to /iscratch/i which is not available on hgwdev. So, only use # the primary /cluster/data/mm9/chrom.sizes for the DEF file in the future # ran the load step manually to complete with the loadUp.csh fixed. 
ssh kolossus cd /cluster/data/mm9/bed/blastzSelf.2007-08-07 time nice -n +19 featureBits mm9 chainSelfLink \ > fb.mm9.chainSelfLink.noRandoms.txt 2>&1 # real 24m54.883s cat fb.mm9.chainSelfLink.noRandoms.txt # 323062218 bases of 2620346127 (12.329%) in intersection cd /cluster/data/mm9/bed ln -s blastzSelf.2007-08-07 blastz.mm9 ## prepare 2bit file of only the randoms ssh kkstore06 cd /cluster/data/mm9 faToTwoBit ?/chr?_random.fa ??/chr??_random.fa mm9.randoms.2bit # and the sizes files twoBitInfo mm9.randomContigs.2bit mm9.randomContigs.sizes twoBitInfo mm9.randoms.2bit mm9.randoms.sizes # a cluster run for just these bits of sequence mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsOnly cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsOnly cat << '_EOF_' > DEF # mouse vs mouse randoms PATH=/cluster/bin/penn/x86_64:/cluster/bin/penn:/cluster/bin/scripts:/cluster/bin/x86_64:/bin:/usr/bin BLASTZ_H=2000 BLASTZ_M=200 # TARGET: Mouse Mm9 SEQ1_DIR=/iscratch/i/mus/mm9/mm9.chroms.2bit SEQ1_LEN=/cluster/data/mm9/mm9.chroms.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Mouse Mm9 randoms only SEQ2_DIR=/cluster/data/mm9/mm9.randoms.2bit SEQ2_LEN=/cluster/data/mm9/mm9.randoms.sizes SEQ2_CTGDIR=/cluster/data/mm9/mm9.randomContigs.2bit SEQ2_CTGLEN=/cluster/data/mm9/mm9.randomContigs.sizes SEQ2_LIFT=/cluster/data/mm9/jkStuff/mm9.contigs.lift SEQ2_CHUNK=10000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsOnly TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 -ignoreSelf \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \ -stop=net `pwd`/DEF > blastz.out 2>&1 & # now swap the primary chroms back to the randoms mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsSwap cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsSwap chainSwap ../randomsOnly/axtChain/mm9.mm9.all.chain.gz stdout \ | nice chainSort stdin stdout | nice gzip -c \ > 
mm9.mm9.all.chain.gz # And then combine all three sets together mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07/allTogetherNow cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/allTogetherNow chainSplit chain ../axtChain/mm9.mm9.all.chain.gz \ ../randomsOnly/axtChain/mm9.mm9.all.chain.gz \ ../randomsSwap/mm9.mm9.all.chain.gz # get them sorted by score correctly mkdir chainSort for F in `(cd chain; ls *.chain)` do echo $F chainSort chain/$F chainSort/$F done # re-number the chains consistently chainMergeSort chainSort/*.chain | nice gzip -c > mm9.mm9.all.chain.gz rm -fr chain mv chainSort chain # and for loading, split this consistently numbered set rm -fr chain time nice -n +19 chainSplit chain mm9.mm9.all.chain.gz # real 5m0.666s ## using a manually fixed up netChains.csh script: time nice -n +19 ./netChains.csh > netChains.out 2>&1 # real 147m53.147s ssh hgwdev ## using a manually fixed up loadUp.csh script: # (from ../axtChain/loadUp.csh) time nice -n +19 ./loadUp.csh > loadUp.out 2>&1 & # real 99m17.895s time nice -n +19 featureBits mm9 chainSelfLink > fb.mm9.chainSelfLink 2>&1 # real 30m3.402s # 378849408 bases of 2620346127 (14.458%) in intersection cat /cluster/data/mm8/bed/blastzSelf.2006-03-20/fb.mm8.chainSelfLink # 362483673 bases of 2567283971 (14.119%) in intersection # finish off the nets time nice -n +19 netClass -verbose=0 -noAr noClass.net mm9 mm9 mm9.mm9.net # real 1m9.538s # load nets (not needed for the RR, but useful on genome-test) time nice -n +19 netFilter -minGap=10 mm9.mm9.net \ | hgLoadNet -verbose=0 mm9 netSelf stdin # real 0m40.709s ## We don't deliver this track to the RR, so downloads are not necessary ############################################################################# # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2007-08-07 - Hiram) ssh kkr1u00 mkdir /iscratch/i/mus/mm9/rmsk cd /cluster/data/mm9 cp -p */chr*.fa.out /iscratch/i/mus/mm9/rmsk cd /iscratch/i/mus/mm9 for R in 2 3 4 5 6 7 8 do rsync -a --progress 
/iscratch/i/mm9/ kkr${R}u00:/iscratch/i/mm9/ done cd rmsk ssh kki mkdir /cluster/data/mm9/linSpecRep cd /cluster/data/mm9/linSpecRep ls -1S /iscratch/i/mus/mm9/rmsk > fa.list cat << '_EOF_' > mkLSR #!/bin/csh -fe pushd /iscratch/i/mus/mm9/rmsk rm -f $1_homo-sapiens_rattus_canis-familiaris_bos-taurus /cluster/bluearc/RepeatMasker070517/DateRepeats \ $1 -query mouse -comp human -comp rat -comp dog -comp cow popd /bin/cp -p \ /iscratch/i/mus/mm9/rmsk/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus . rm -f /iscratch/i/mus/mm9/rmsk/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus '_EOF_' # << happy emacs chmod +x mkLSR cat << '_EOF_' > template #LOOP ./mkLSR $(path1) {check out line+ $(path1)_homo-sapiens_rattus_canis-familiaris_bos-taurus} #ENDLOOP '_EOF_' # << happy emacs gensub2 fa.list single template jobList para try ... check ... push ... etc... para time # Completed: 35 of 35 jobs # CPU time in finished jobs: 1498s 24.96m 0.42h 0.02d 0.000 y # IO & Wait Time: 193s 3.22m 0.05h 0.00d 0.000 y # Average job time: 48s 0.81m 0.01h 0.00d # Longest finished job: 102s 1.70m 0.03h 0.00d # Submission to last job: 3399s 56.65m 0.94h 0.04d ssh kkstore06 cd /cluster/data/mm9/linSpecRep mkdir notInHuman notInRat notInDog notInCow notInRabbit for F in chr*.out_homo-sapiens* do B=${F/.fa.out*/} echo $B /cluster/bin/scripts/extractRepeats 1 ${F} > \ notInHuman/${B}.out.spec /cluster/bin/scripts/extractRepeats 2 ${F} > \ notInRat/${B}.out.spec /cluster/bin/scripts/extractRepeats 3 ${F} > \ notInDog/${B}.out.spec /cluster/bin/scripts/extractRepeats 4 ${F} > \ notInCow/${B}.out.spec done # the notInHuman, notInDog, and notInCow ended up being # identical. Only the notInRat was different than them # To check identical find . 
-name "*.out.spec" | \ while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \ | sort -k1,1n | sort -t"/" -k3,3 # Copy to iscratch for use in kluster runs ssh kkr1u00 mkdir -p /iscratch/i/mus/mm9/linSpecRep/notInRat mkdir -p /iscratch/i/mus/mm9/linSpecRep/notInOthers cd /iscratch/i/mus/mm9/linSpecRep/notInRat cp -p /cluster/data/mm9/linSpecRep/notInRat/* . cd /iscratch/i/mus/mm9/linSpecRep/notInOthers cp -p /cluster/data/mm9/linSpecRep/notInHuman/* . # copy this directory to the other Iservers cd /iscratch/i/mus/mm9 for R in 2 3 4 5 6 7 8 do rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/ done # and we can do the Iservers simply: ssh kkr1u00 cd /iscratch/i/mm9 # no longer need these two directories rm -fr fa rmsk rsync -a --progress /cluster/bluearc/scratch/hg/mm9/ . for R in 2 3 4 5 6 7 8 do rsync -a --progress /iscratch/i/mm9/ kkr${R}u00:/iscratch/i/mm9/ done # We also need the nibs for blastz runs with lineage specific repeats ssh kkstore06 mkdir /cluster/data/mm9/nib cd /cluster/data/mm9 for FA in ?/chr*.fa ??/chr*.fa do F=${FA/*\//} F=${F/.fa/} echo faToNib -softMask ${FA} nib/${F}.nib faToNib -softMask ${FA} nib/${F}.nib done # copied to /cluster/bluearc/scratch/data/mm9/nib/ # and everything else we will need for kluster runs into # /cluster/bluearc/scratch/data/mm9/ # Ask cluster-admin to sync /scratch/ filesystem to kluster nodes ######################################################################### # BLASTZ RAT Rn4 (DONE - 2007-08-09 - 2007-08-15 - Hiram) # re-run a second time with tighter parameters, see below for second run ssh kkstore06 mkdir /cluster/data/mm9/bed/blastzRn4.2007-08-09 cd /cluster/data/mm9/bed/blastzRn4.2007-08-09 # Started this before the rsync to /scratch/data/mm9/ had completed, # hence the /cluster/bluearc/scratch/data/mm9/ location is used # here. 
cat << '_EOF_' > DEF # mouse vs rat BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm9 SEQ1_DIR=/cluster/bluearc/scratch/data/mm9/nib SEQ1_SMSK=/cluster/bluearc/scratch/data/mm9/notInRat SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself SEQ2_DIR=/iscratch/i/rn4/nib SEQ2_SMSK=/iscratch/i/rn4/linSpecRep.notInMouse SEQ2_LEN=/cluster/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzRn4.2007-08-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > blastz.out 2>&1 & ## had to fix the blastz-run-ucsc script to get these to complete. # the chr16_random sequence was causing problems because it has no usable # sequence in it for blastz to work with. And finally, two jobs needed to # be run manually on kolossus, don't know what happened with them, # although their output was immense: # -rw-rw-r-- 1 15054644 Aug 14 10:22 chr2.nib:chr2:80000000-90010000_chr7.nib:chr7:0-10000000.psl # -rw-rw-r-- 1 18992595 Aug 14 11:02 chr2.nib:chr2:80000000-90010000_chr3.nib:chr3:70000000-80000000.psl # I suspect there is something going on with large results and running on # the kk nodes. I'm getting the same trouble with the self blastz. 
# then, continuing with the cat time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \ -continue=cat `pwd`/DEF > cat.out 2>&1 & # real 239m51.356s cat fb.mm9.chainRn4Link.txt # 1791195056 bases of 2620346127 (68.357%) in intersection cat /cluster/data/mm8/bed/blastz.rn4/fb.mm8.chainRn4Link # 1770319811 bases of 2567283971 (68.957%) in intersection cd /cluster/data/mm9/bed ln -s blastzRn4.2007-08-09 blastz.rn4 mkdir /cluster/data/rn4/bed/blastz.mm9.swap cd /cluster/data/rn4/bed/blastz.mm9.swap time ~/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \ -swap /cluster/data/mm9/bed/blastzRn4.2007-08-09/DEF > swap.out 2>&1 & # real 209m11.032s cd /cluster/data/rn4/bed ln -s blastz.mm9.swap blastz.mm9 cat /cluster/data/rn4/bed/blastz.mm9/fb.rn4.chainMm9Link.txt # 1788261968 bases of 2571531505 (69.541%) in intersection cat /cluster/data/rn4/bed/blastz.mm8/fb.rn4.chainMm8Link.txt # 1791093685 bases of 2571531505 (69.651%) in intersection ######################################################################### ## multiple alignment preparation stats # The following table will keep track of the pairwise alignments # completed. 
(The % NN.Nxx mean not done yet) # featureBits chainLink measures # chainMm9Link chain linearGap # distance on Mm9 on other minScore # 1 0.1587 - rat rn4 (% 68.357) (% 69.541) 3000 medium # 2 0.4677 - human hg18 (% 38.499) (% 35.201) 3000 medium # 3 0.4686 - chimp panTro2 (% 37.5xx) (% 33.6xx) 3000 medium # 4 0.4960 - macaque rheMac2 (% 34.7xx) (% 33.1xx) 3000 medium # 5 0.5131 - rabbit oryCun1 (% 19.3xx) (no swap ) 3000 medium # 6 0.6142 - armadillo dasNov1 (% 16.8xx) (no swap ) 3000 medium # 7 0.6230 - dog canFam2 (% 32.2xx) (% 34.2xx) 3000 medium # 8 0.6256 - elephant loxAfr1 (% 18.3xx) (no swap ) 3000 medium # 9 0.6344 - cow bosTau2 (% 26.8xx) (% 24.2xx) 3000 medium # 10 0.7805 - tenrec echTel1 (% 11.4xx) (no swap ) 5000 loose # 11 1.0698 - opossum monDom4 (% 8.2xx) (% 6.0xx) 5000 loose # 12 1.3425 - chicken galGal2 (% 2.5xx) (% 5.4xx) 5000 loose # 13 1.7936 - frog xenTro2 (% 2.6xx) (% 5.3xx) 5000 loose # 14 2.0157 - tetraodon tetNig1 (% 1.9xx) (% 13.7xx) 5000 loose # 15 2.0562 - fugu fr1 (% 1.9xx) (% 13.5xx) 5000 loose # 16 2.1059 - zebrafish danRer5 (% 2.1xx) (% 3.5xx) 5000 loose ########################################################################## ## BLASTZ SWAP from Hg18 to Mm9 (DONE - 2007-08-15 - Hiram) # also in hg18.txt cat /cluster/data/hg18/bed/blastzMm9.2007-08-09/fb.hg18.chainMm9Link.txt # 1014323175 bases of 2881515245 (35.201%) in intersection # Then to swap over to Mm9 ssh kkstore06 mkdir /cluster/data/mm9/bed/blastz.hg18.swap cd /cluster/data/mm9/bed/blastz.hg18.swap time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 -swap -bigClusterHub=pk -chainMinScore=3000 \ -chainLinearGap=medium \ /cluster/data/hg18/bed/blastz.mm9/DEF > swap.out 2>&1 & # real 67m21.146s cat /cluster/data/mm9/bed/blastz.hg18.swap/fb.mm9.chainHg18Link.txt # 1008812599 bases of 2620346127 (38.499%) in intersection cat /cluster/data/mm8/bed/blastz.hg18/fb.mm8.chainHg18Link # 984380268 bases of 2567283971 (38.343%) in intersection cd 
/cluster/data/mm9/bed ln -s blastz.hg18.swap blastz.hg18 ## make swapped syntenic net cd /cluster/data/mm9/bed/blastz.hg18.swap time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 -bigClusterHub=pk -chainMinScore=3000 \ -swap -syntenicNet -chainLinearGap=medium -continue=syntenicNet \ /cluster/data/hg18/bed/blastz.mm9/DEF > syntenic.out 2>&1 & ## real 20m49.712s ######################################################################### # BLASTZ RAT Rn4 (DONE - 2007-08-30 - Hiram) # re-run this second time with tighter parameters ssh kkstore06 mkdir /cluster/data/mm9/bed/blastzRn4.2007-08-30 cd /cluster/data/mm9/bed/blastzRn4.2007-08-30 # Started this before the rsync to /scratch/data/mm9/ had completed, # hence the /cluster/bluearc/scratch/data/mm9/ location is used # here. cat << '_EOF_' > DEF # mouse vs rat # Specially tuned blastz parameters from Webb Miller BLASTZ_ABRIDGE_REPEATS=0 BLASTZ_O=600 BLASTZ_E=150 BLASTZ_Y=15000 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/cluster/data/blastz/human_chimp.v2.q # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself SEQ2_DIR=/scratch/hg/rn4/rn4.2bit SEQ2_LEN=/cluster/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzRn4.2007-08-30 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \ -stop=net \ `pwd`/DEF > blastz.out 2>&1 & # this runs much faster than the usual blastz run # failed when it got to the kki run since /scratch/hg/rn4/ was not # complete on the Iservers. 
Fixup that, then, continue: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \ -continue=chainMerge -stop=net \ `pwd`/DEF > chainMerge.out 2>&1 & # And then, kolossus had no /scratch/data/ directory, go there and # make this a symlink to /iscratch/data/ # and run the axtChain/netChains.csh script manually on kolossus ######################################################################### # BLASTZ/CHAIN/NET oryLat1 (DONE - 2007-08-31 - Hiram) ssh kkstore04 mkdir /cluster/data/mm9/bed/blastzOryLat1.2007-08-30 cd /cluster/data/mm9/bed/blastzOryLat1.2007-08-30 cat << '_EOF_' > DEF # mouse vs medaka BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ2_CHUNK=40000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzOryLat1.2007-08-30 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -bigClusterHub=kk -verbose=2 > do.log 2>&1 & # real 512m56.909s # had a single failed kk job, finished manually, then: time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 & # real 11m5.508s ## typical failure: # HgStepManager: executing step 'net' Fri Aug 31 10:02:51 2007. 
# netChains: looks like previous stage was not successful (can't find [mm9.oryLat1.]all.chain[.gz]). # continuing time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -continue=net -bigClusterHub=kk -verbose=2 > net.log 2>&1 & # real 21m33.501s cat fb.mm9.chainOryLat1Link.txt # 50650171 bases of 2620346127 (1.933%) in intersection # and the swap mkdir /cluster/data/oryLat1/bed/blastz.mm9.swap cd /cluster/data/oryLat1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/mm9/bed/blastzOryLat1.2007-08-30/DEF \ -chainMinScore=5000 -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 & cat fb.oryLat1.chainMm9Link.txt # 45488232 bases of 700386597 (6.495%) in intersection ######################################################################### # LOAD ACEMBLY (DONE 9/17/07 angie) ssh kkstore06 mkdir /cluster/data/mm9/bed/acembly cd /cluster/data/mm9/bed/acembly wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_37_Sep07.mouse.genes/AceView.mm_37.genes_gff.tar.gz wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_37_Sep07.mouse.genes/AceView.mm_37.good_proteins_fasta.tar.gz tar xvzf AceView.mm_37.genes_gff.tar.gz tar xvzf AceView.mm_37.good_proteins_fasta.tar.gz cd AceView.mm_37.genes_gff # If the result of this command is > 0, then some lines have end < start # and need to be fixed: awk '$5 < $4 {print;}' *.gff | wc -l #0 # Add "chr" prefix: sed -e 's/^/chr/;' x1*.gff > acembly.gff # Extract annotation types from original gff: perl -wpe 's/^.*Gene_type (\w+); transcript_id (\S+);.*/$2\t$1/; \ s/Main$/main/ || s/Putative$/putative/ || \ die "Unrecognized class:\n$_\n";' *.gff \ | sort -u \ > acemblyClass.tab # Keep tabs on the transcript names that end in -unspliced -- # the first time around, had to add that suffix to some protein names # in order to get all of them to match. runJoiner is the real test. 
grep unspliced acemblyClass.tab | wc -l #54774 # Pare down proteins to just the ones that we have transcripts for: cd /cluster/data/mm9/bed/acembly/AceView.mm_37.good_proteins_fasta awk '{print $1;}' ../AceView.mm_37.genes_gff/acemblyClass.tab \ > transcriptNames.txt cat *.fasta \ | faSomeRecords stdin transcriptNames.txt acemblyPep.fa grep unspliced acemblyPep.fa | wc -l #45033 # Danielle Thierry-Mieg explained that noncoding genes are included so # the number of proteins can be smaller than the number of transcripts. # Load tables ssh hgwdev cd /cluster/data/mm9/bed/acembly/AceView.mm_37.genes_gff ldHgGene -gtf mm9 acembly acembly.gff #Read 173008 transcripts in 2366104 lines in 1 files # 173008 groups 21 seqs 1 sources 5 feature types hgLoadSqlTab mm9 acemblyClass ~/kent/src/hg/lib/acemblyClass.sql \ acemblyClass.tab cd /cluster/data/mm9/bed/acembly/AceView.mm_37.good_proteins_fasta hgPepPred mm9 generic acemblyPep acemblyPep.fa rm acemblyPep.tab runJoiner.csh mm9 acembly # mm9.acemblyPep.name - hits 149560 of 149560 ok # mm9.acemblyClass.name - hits 173008 of 173008 ok ######################################################################### # BLASTZ RAT Rn4 (DONE - 2007-08-30 - 2007-09-11 - Hiram) # re-run this third time with a special matrix from Bob Harris/Webb Miller cat /cluster/data/blastz/mouse_rat.q A C G T 56 -109 -45 -137 -109 100 -103 -45 -45 -103 100 -109 -137 -45 -109 56 O=600 E=55 ssh kkstore06 mkdir /cluster/data/mm9/bed/blastzRn4.2007-08-31 cd /cluster/data/mm9/bed/blastzRn4.2007-08-31 # Started this before the rsync to /scratch/data/mm9/ had completed, # hence the /cluster/bluearc/scratch/data/mm9/ location is used # here. 
cat << '_EOF_' > DEF # mouse vs rat # Specially tuned blastz parameters from Webb Miller BLASTZ_ABRIDGE_REPEATS=0 BLASTZ_O=600 BLASTZ_E=55 BLASTZ_Y=15000 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/cluster/data/blastz/mouse_rat.q # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself SEQ2_DIR=/scratch/hg/rn4/rn4.2bit SEQ2_LEN=/cluster/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzRn4.2007-08-31 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen cd /cluster/data/mm9/bed/blastzRn4.2007-08-31 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \ -stop=net `pwd`/DEF > blastz.out 2>&1 & # real 243m51.078s time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \ -continue=download -stop=download `pwd`/DEF > download.out 2>&1 & time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \ -continue=cleanup -syntenicNet `pwd`/DEF > syntenicNet.out 2>&1 & cat fb.mm9.chainRn4Link.txt # 1713186474 bases of 2620346127 (65.380%) in intersection # and the swap mkdir /cluster/data/rn4/bed/blastz.mm9.swap cd /cluster/data/rn4/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzRn4.2007-08-31/DEF \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \ -swap -syntenicNet > swap.out 2>&1 & # real 314m59.840s cat fb.rn4.chainMm9Link.txt # 1711034941 bases of 2571531505 (66.538%) in intersection ######################################################################### # EXONIPHY MM9, lifted from hg18 (DONE - 2007-09-05 - Hiram) # needed for uscsGenes10 building # create a syntenic liftOver chain file ssh kolossus cd 
/cluster/data/hg18/bed/blastz.mm9/axtChain time nice -n +19 netFilter -syn hg18.mm9.net.gz \ | netChainSubset -verbose=0 stdin hg18.mm9.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > hg18.mm9.syn.chain.gz # real 5m55.575s # slightly smaller than the ordinary liftOver chain file: # -rw-rw-r-- 1 77849682 Aug 14 16:49 hg18.mm9.over.chain.gz # -rw-rw-r-- 1 73972671 Sep 5 15:27 hg18.mm9.syn.chain.gz # exoniphyMm9.gp is prepared as follows ssh hgwdev mkdir /cluster/data/mm9/bed/exoniphy cd /cluster/data/mm9/bed/exoniphy hgsql hg18 -e "select * from exoniphy" -N > exoniphyHg18.gp time nice -n +19 liftOver -genePred exoniphyHg18.gp \ /cluster/data/hg18/bed/blastz.mm9/axtChain/hg18.mm9.syn.chain.gz \ exoniphyMm9.gp unmapped # real 52m0.335s wc -l * # 178162 exoniphyHg18.gp # 172859 exoniphyMm9.gp # 10606 unmapped ssh hgwdev cd /cluster/data/mm9/bed/exoniphy nice -n +19 hgLoadGenePred -genePredExt mm9 exoniphy exoniphyMm9.gp nice -n +19 featureBits mm9 exoniphy # 25931742 bases of 2620346127 (0.990%) in intersection nice -n +19 featureBits mm8 exoniphy # 25952211 bases of 2567283971 (1.011%) in intersection ######################################################################### # BLASTZ canFam2 (DONE - 2006-02-18 - Hiram) ssh kkstore06 # establish a screen to control this job screen mkdir /cluster/data/mm9/bed/blastzCanFam2.2007-09-04 cd /cluster/data/mm9/bed/blastzCanFam2.2007-09-04 cat << '_EOF_' > DEF # mouse vs dog BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_SMSK=/scratch/data/mm9/notInOthers SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog CanFam2 SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzCanFam2.2007-09-04 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ 
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > blastz.out 2>&1 & # real 871m24.249s cat fb.mm9.chainCanFam2Link.txt # 848004408 bases of 2620346127 (32.362%) in intersection mkdir /cluster/data/canFam2/bed/blastz.mm9.swap cd /cluster/data/canFam2/bed/blastz.mm9.swap time /cluster/bin/scripts/doBlastzChainNet.pl \ /cluster/data/mm9/bed/blastzCanFam2.2007-09-04/DEF \ -verbose=2 -bigClusterHub=pk -chainMinScore=3000 \ -chainLinearGap=medium -swap > swap.out 2>&1 & # real 57m59.126s cat fb.canFam2.chainMm9Link.txt # 832145360 bases of 2384996543 (34.891%) in intersection # need syntenic net for the multiz cd /cluster/data/mm9/bed/blastzCanFam2.2007-09-04 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet -continue=syntenicNet `pwd`/DEF > synNet.out 2>&1 & # real 19m1.302s ######################################################################### # BLASTZ/CHAIN/NET RHEMAC2 (DONE - 2007-09-05 - Hiram) # Won't put this in Conservation -- special request for ancestor recon. ssh kkstore06 # use a screen to control this job # XXX note for next time, naming convention is different here than all the # others, and there is a missing TMPDIR in the DEF file screen mkdir /cluster/data/mm9/bed/blastz.rheMac2.2007-09-05 cd /cluster/data/mm9/bed/blastz.rheMac2.2007-09-05 cat << '_EOF_' > DEF # Mouse vs. 
macaque BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_SMSK=/scratch/data/mm9/notInOthers SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Macaque (rheMac2) SEQ2_DIR=/san/sanvol1/scratch/rheMac2/nib SEQ2_SMSK=/cluster/bluearc/rheMac2/linSpecRep/notInRodent SEQ2_LEN=/cluster/data/rheMac2/chrom.sizes SEQ1_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastz.rheMac2.2007-09-05 '_EOF_' # << happy emacs # NOTE(review): the QUERY section of this DEF sets SEQ1_CHUNK a second time; SEQ2_CHUNK was presumably intended - confirm before reusing this DEF. time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet `pwd`/DEF > do.log 2>&1 & # real 1017m13.247s # some kk kluster difficulties, fixup and complete manually # Completed: 87616 of 87616 jobs # CPU time in finished jobs: 26547195s 442453.25m 7374.22h 307.26d 0.842 y # IO & Wait Time: 3384143s 56402.38m 940.04h 39.17d 0.107 y # Average job time: 342s 5.69m 0.09h 0.00d # Longest finished job: 3159s 52.65m 0.88h 0.04d # Submission to last job: 65814s 1096.90m 18.28h 0.76d # then, continuing time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \ -continue=cat -syntenicNet `pwd`/DEF > cat.log 2>&1 & # real 255m52.382s cat fb.mm9.chainRheMac2Link.txt # 998017006 bases of 2620346127 (38.087%) in intersection mkdir /cluster/data/rheMac2/bed/blastz.mm9.swap cd /cluster/data/rheMac2/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastz.rheMac2.2007-09-05/DEF \ -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet > swap.log 2>&1 & # real 178m31.911s cat fb.rheMac2.chainMm9Link.txt # 1094006509 bases of 2646704109 (41.335%) in intersection ######################################################################### # BLASTZ/CHAIN/NET Orangutan ponAbe1 (DONE - 2007-09-05 - Hiram) ssh kkstore01 # use a screen to control this job screen mkdir /cluster/data/mm9/bed/blastzPonAbe1.2007-09-05 cd
/cluster/data/mm9/bed/blastzPonAbe1.2007-09-05 # next time, have SEQ2_CHUNK at 30000000 and SEQ2_LIMIT at 100 # this caused over 500,000 pk jobs, that is too many cat << '_EOF_' > DEF # mouse vs orangutan BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Orangutan ponAbe1 SEQ2_DIR=/scratch/data/ponAbe1/ponAbe1.2bit SEQ2_LEN=/cluster/data/ponAbe1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzPonAbe1.2007-09-05 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -stop=load -chainMinScore=3000 \ -chainLinearGap=medium -bigClusterHub=pk > do.log 2>&1 & # real 897m58.156s # some pk kluster difficulties, fixup and complete manually # Completed: 511290 of 511290 jobs # CPU time in finished jobs: 11448015s 190800.24m 3180.00h 132.50d 0.363 y # IO & Wait Time: 1852197s 30869.96m 514.50h 21.44d 0.059 y # Average job time: 26s 0.43m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 349s 5.82m 0.10h 0.00d # Submission to last job: 54771s 912.85m 15.21h 0.63d # then, continuing time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=cat -stop=load -chainMinScore=3000 \ -chainLinearGap=medium -bigClusterHub=pk > cat.log 2>&1 & # ran into trouble on the kki chain run with stuff missing # from the Iservers /scratch/data/ - rsync them up and get # the run done manually # Completed: 24 of 24 jobs # CPU time in finished jobs: 17718s 295.30m 4.92h 0.21d 0.001 y # IO & Wait Time: 203s 3.38m 0.06h 0.00d 0.000 y # Average job time: 747s 12.45m 0.21h 0.01d # Longest finished job: 3673s 61.22m 1.02h 0.04d # Submission to last job: 3886s 64.77m 1.08h 0.04d time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=chainMerge -stop=load -chainMinScore=3000 \ -chainLinearGap=medium -bigClusterHub=pk > chainMerge.log 2>&1 & # real 55m27.522s cat
fb.mm9.chainPonAbe1Link.txt # 913843325 bases of 2620346127 (34.875%) in intersection mkdir /cluster/data/ponAbe1/bed/blastz.mm9.swap cd /cluster/data/ponAbe1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzPonAbe1.2007-09-05/DEF \ -stop=load -chainMinScore=3000 \ -swap -chainLinearGap=medium -bigClusterHub=pk > swap.log 2>&1 & # create the syntenic maf nets: time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=download -syntenicNet -chainMinScore=3000 \ -chainLinearGap=medium -bigClusterHub=pk > syntenicNet.log 2>&1 & # real 20m55.024s # create reciprocal best chains/nets ssh hgwdev cd /cluster/data/mm9/bed/blastzPonAbe1.2007-09-05 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 ponAbe1 \ > rbest.log 2>&1 & # real 53m43.377s ######################################################################### # BLASTZ/CHAIN/NET Marmoset calJac1 (DONE - 2007-09-06 - 2007-09-07 - Hiram) ssh kkstore06 # use a screen to control this job screen mkdir /cluster/data/mm9/bed/blastzCalJac1.2007-09-06 cd /cluster/data/mm9/bed/blastzCalJac1.2007-09-06 # next time, try SEQ2_CHUNK at 40000000, SEQ2_LIMIT at 75 # this created 285,570 kluster jobs, that is too many cat << '_EOF_' > DEF # mouse vs marmoset BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Marmoset calJac1 SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit SEQ2_LEN=/cluster/data/calJac1/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzCalJac1.2007-09-06 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -stop=load -chainMinScore=3000 \ -chainLinearGap=medium -bigClusterHub=pk > do.log 2>&1 & # real 897m58.156s # some pk kluster difficulties, fixup and complete manually # Completed: 511290 of 511290 jobs # CPU time in finished jobs: 11448015s 190800.24m 3180.00h 
132.50d 0.363 y # IO & Wait Time: 1852197s 30869.96m 514.50h 21.44d 0.059 y # Average job time: 26s 0.43m 0.01h 0.00d # Longest finished job: 349s 5.82m 0.10h 0.00d # Submission to last job: 54771s 912.85m 15.21h 0.63d # then, continuing time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=cat -stop=load -chainMinScore=3000 \ -chainLinearGap=medium -bigClusterHub=pk > cat.log 2>&1 & # real 669m34.473s cat fb.mm9.chainCalJac1Link.txt # 863961573 bases of 2620346127 (32.971%) in intersection mkdir /cluster/data/calJac1/bed/blastz.mm9.swap cd /cluster/data/calJac1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \ -stop=load -chainMinScore=3000 \ -swap -chainLinearGap=medium -bigClusterHub=pk > swap.log 2>&1 & # real 217m10.835s cat fb.calJac1.chainMm9Link.txt # 887586922 bases of 2929139385 (30.302%) in intersection time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 /cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \ -continue=download -chainMinScore=3000 \ -swap -chainLinearGap=medium -bigClusterHub=pk > download.log 2>&1 & # real 1m9.876s # run the syntenic nets time nice -n +19 doBlastzChainNet.pl -verbose=2 DEF \ -continue=download -chainMinScore=3000 \ -syntenicNet -chainLinearGap=medium -bigClusterHub=pk \ > syntenicNet.log 2>&1 & # real 22m51.080s # create reciprocal best chains/nets ssh hgwdev cd /cluster/data/mm9/bed/blastzCalJac1.2007-09-06 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 calJac1 \ > rbest.log 2>&1 & # real 47m18.467s ######################################################################### # BLASTZ/CHAIN/NET Fugu fr2 (DONE - 2007-09-06 - 2007-09-07 - Hiram) ssh kkstore02 # use a screen to control this job screen mkdir /cluster/data/mm9/bed/blastzFr2.2007-09-06 cd /cluster/data/mm9/bed/blastzFr2.2007-09-06 cat << '_EOF_' > DEF # mouse vs fugu BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Fugu fr2 # Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzFr2.2007-09-06 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -bigClusterHub=kk -verbose=2 > do.log 2>&1 & # real 156m55.151s # fixup broken kluster jobs, complete manually # Completed: 70395 of 70395 jobs # CPU time in finished jobs: 4339015s 72316.91m 1205.28h 50.22d 0.138 y # IO & Wait Time: 486414s 8106.90m 135.12h 5.63d 0.015 y # Average job time: 69s 1.14m 0.02h 0.00d # Longest finished job: 1098s 18.30m 0.30h 0.01d # Submission to last job: 18352s 305.87m 5.10h 0.21d # and then continuing time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 & # real 5m43.977s # Still, the typical failure # HgStepManager: executing step 'net' Thu Sep 6 16:04:56 2007. # netChains: looks like previous stage was not successful (can't find [mm9.fr2.]all.chain[.gz]). 
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -continue=net -bigClusterHub=kk -verbose=2 > net.log 2>&1 & # real 178m15.798s cat fb.mm9.chainFr2Link.txt # 47018710 bases of 2620346127 (1.794%) in intersection mkdir /cluster/data/fr2/bed/blastz.mm9.swap cd /cluster/data/fr2/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \ /cluster/data/mm9/bed/blastzFr2.2007-09-06/DEF \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 & # real 15m32.368s cat fb.fr2.chainMm9Link.txt # 42413565 bases of 393312790 (10.784%) in intersection ######################################################################### # BLASTZ/CHAIN/NET Tetraodon tetNig1 (DONE - 2007-09-06 - 2007-09-07 - Hiram) ssh kkstore01 # use a screen to control this job screen mkdir /cluster/data/mm9/bed/blastzTetNig1.2007-09-06 cd /cluster/data/mm9/bed/blastzTetNig1.2007-09-06 cat << '_EOF_' > DEF # mouse vs tetraodon BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Tetraodon tetNig1 # Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzTetNig1.2007-09-06 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -bigClusterHub=kk -verbose=2 > do.log 2>&1 & # real 535m2.474s # 
Typical failure # HgStepManager: executing step 'net' Fri Sep 7 01:13:06 2007. # netChains: looks like previous stage was not successful (can't find [mm9.tetNig1.]all.chain[.gz]). # continuing time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -continue=net -bigClusterHub=kk -verbose=2 > net.log 2>&1 & cat fb.mm9.chainTetNig1Link.txt # 46206292 bases of 2620346127 (1.763%) in intersection mkdir /cluster/data/tetNig1/bed/blastz.mm9.swap cd /cluster/data/tetNig1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzTetNig1.2007-09-06/DEF \ -chainMinScore=5000 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -swap -bigClusterHub=kk > swap.log 2>&1 & # real 19m58.885s cat fb.tetNig1.chainMm9Link.txt # 42256263 bases of 342403326 (12.341%) in intersection ######################################################################### # BLASTZ/CHAIN/NET Stickleback gasAcu1 (DONE - 2007-09-06 - 2007-09-07 - Hiram) ssh kkstore01 # use a screen to control this job screen mkdir /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06 cd /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06 cat << '_EOF_' > DEF # mouse vs stickleback BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: stickleback gasAcu1 SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift SEQ2_CHUNK=35000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzGasAcu1.2007-09-06 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF 
-chainMinScore=5000 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -bigClusterHub=kk -verbose=2 > do.log 2>&1 & # Completed: 52725 of 52725 jobs # CPU time in finished jobs: 4110432s 68507.19m 1141.79h 47.57d 0.130 y # IO & Wait Time: 413069s 6884.49m 114.74h 4.78d 0.013 y # Average job time: 86s 1.43m 0.02h 0.00d # Longest finished job: 1140s 19.00m 0.32h 0.01d # Submission to last job: 71194s 1186.57m 19.78h 0.82d # had some jobs fail on the kk run, finish manually, then continuing: time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 & # real 120m36.209s # failed kki chain job due to san outage on kkr7u00, finished manually: # Completed: 24 of 24 jobs # CPU time in finished jobs: 1807s 30.12m 0.50h 0.02d 0.000 y # IO & Wait Time: 258s 4.29m 0.07h 0.00d 0.000 y # Average job time: 86s 1.43m 0.02h 0.00d # Longest finished job: 257s 4.28m 0.07h 0.00d # Submission to last job: 9851s 164.18m 2.74h 0.11d # continuing time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 -verbose=2 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -continue=chainMerge -bigClusterHub=kk > chainMerge.log 2>&1 & # real 21m7.089s cat fb.mm9.chainGasAcu1Link.txt # 48448585 bases of 2620346127 (1.849%) in intersection mkdir /cluster/data/gasAcu1/bed/blastz.mm9.swap cd /cluster/data/gasAcu1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \ /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06/DEF \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 & cat fb.gasAcu1.chainMm9Link.txt # 43730193 bases of 446627861 (9.791%) in intersection ######################################################################### # BLASTZ Zebrafish danRer5 (DONE - 2007-09-11 - 2007-09-12 - Hiram) # re-run a second time with BLASTZ_Q, see below ssh kkstore06 screen # use screen to manage this job mkdir 
/cluster/data/mm9/bed/blastzDanRer5.2007-09-11 cd /cluster/data/mm9/bed/blastzDanRer5.2007-09-11 cat << '_EOF_' > DEF # Mouse (mm9) vs zebrafish (danRer5) BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - zebrafish (danRer5) SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit SEQ2_LEN=/cluster/data/danRer5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzDanRer5.2007-09-11 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 & # real 222m47.787s cat fb.mm9.chainDanRer5Link.txt # 48497464 bases of 2620346127 (1.851%) in intersection mkdir /cluster/data/danRer5/bed/blastz.mm9.swap cd /cluster/data/danRer5/bed/blastz.mm9.swap time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -chainMinScore=5000 \ /cluster/data/mm9/bed/blastzDanRer5.2007-09-11/DEF \ -swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \ > swap.log 2>&1 & # real 9m47.163s cat fb.danRer5.chainMm9Link.txt # 34017483 bases of 1435609608 (2.370%) in intersection ######################################################################### # BLASTZ Zebrafish danRer5 (DONE - 2007-09-13 - Hiram) # second time, forgot to include BLASTZ_Q the first time ssh kkstore06 screen # use screen to manage this job mkdir /cluster/data/mm9/bed/blastzDanRer5.2007-09-13 cd /cluster/data/mm9/bed/blastzDanRer5.2007-09-13 # This is the wrong way overlap, but it seems to work cat << '_EOF_' > DEF # Mouse (mm9) vs zebrafish (danRer5) BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - zebrafish (danRer5) 
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit SEQ2_LEN=/cluster/data/danRer5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzDanRer5.2007-09-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 & # real 369m16.947s cat fb.mm9.chainDanRer5Link.txt # 84513268 bases of 2620346127 (3.225%) in intersection mkdir /cluster/data/danRer5/bed/blastz.mm9.swap cd /cluster/data/danRer5/bed/blastz.mm9.swap time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -chainMinScore=5000 \ /cluster/data/mm9/bed/blastzDanRer5.2007-09-13/DEF \ -swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \ > swap.log 2>&1 & # real 21m44.784s cat fb.danRer5.chainMm9Link.txt # 66400782 bases of 1435609608 (4.625%) in intersection ######################################################################### # BLASTZ/CHAIN/NET Guinea Pig cavPor2 (DONE - 2007-09-19 - kate) ssh kkstore06 mkdir /cluster/data/mm9/bed/blastzCavPor2.2007-09-19 cd /cluster/data/mm9/bed/blastzCavPor2.2007-09-19 cat << '_EOF_' > DEF # mouse vs guinea pig BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Guinea pig cavPor2 SEQ2_DIR=/san/sanvol1/scratch/cavPor2/cavPor2.2bit SEQ2_LEN=/san/sanvol1/scratch/cavPor2/chrom.sizes # chunking similar to cat (similar number of scaffolds) SEQ2_CHUNK=30000000 SEQ2_LIMIT=500 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzCavPor2.2007-09-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=pk >& do.log & # load nets manually -- automated loading fails as classification info # not available (no database) ssh hgwdev cd /cluster/data/mm9/bed/blastz.cavPor2/axtChain netFilter -minGap=10 noClass.net | hgLoadNet -warn mm9 netCavPor2 stdin 
netFilter -minGap=10 mm9.cavPor2.rbest.net.gz | \ hgLoadNet -warn mm9 netRBestCavPor2 stdin doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -continue=download >& do2.log & # reciprocal best net mafs for multiz ~/kent/src/hg/utils/automation/doRecipBest.pl mm9 cavPor2 >&! rbest.log & time nice -n +19 featureBits mm9 chainCavPor2Link \ > fb.mm9.chainCavPor2Link.txt 2>&1 cat fb.mm9.chainCavPor2Link.txt # 480194223 bases of 2620346127 (18.326%) in intersection # create the syntenic maf nets (these are unneeded): time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \ -chainLinearGap=medium -continue=syntenicNet -syntenicNet \ -bigClusterHub=pk > syntenicNet.log 2>&1 ######################################################################### ## 4-Way Multiz (DONE - 2007-09-07 - Hiram) ssh hgwdev mkdir /cluster/data/mm9/bed/multiz4way cd /cluster/data/mm9/bed/multiz4way ln -s ../multiz30way/mm9.guess.30way.nh ./30way.nh # leave mm9 rn4, canFam2 and hg18 /cluster/bin/phast/tree_doctor \ --prune panTro2,ponAbe1,rheMac2,calJac1,otoGar1,tupBel1,cavPor2,oryCun1,sorAra1,eriEur1,felCat3,equCab1,bosTau3,dasNov1,loxAfr1,echTel1,monDom4,ornAna1,galGal3,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat1,danRer4 30way.nh # this leaves us with: cat << '_EOF_' > 4way.nh ((hg18:0.126901, (rn4:0.084383,mm9:0.076274):0.249544):0.019763,canFam2:0.187963); '_EOF_' # << happy emacs # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/mm9_4way.gif /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt # Use this output to create the table below grep -y mm9 4way.distances.txt | sort -k3,3n # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainOryLat1Link chain linearGap # distance on mm9 on other minScore # 1 0.160657 - rat rn4 (% 65.380) (% 66.538) 5000 medium # 2 0.452719 - human 
hg18 (% 38.499) (% 35.201) 3000 medium # 3 0.533544 - dog canFam2 (% 32.362) (% 34.891) 3000 medium # using the syntenic nets cd /cluster/data/mm9/bed/multiz4way mkdir mafLinks mkdir mafLinks/rn4 cd mafLinks/rn4 ln -s ../../../blastzRn4.2007-08-31/mafSynNet/*.maf.gz . mkdir ../hg18 cd ../hg18 ln -s ../../../blastz.hg18/mafSynNet/*.maf.gz . mkdir ../canFam2 cd ../canFam2 ln -s ../../../blastz.canFam2/mafSynNet/*.maf.gz . # Copy MAFs to some appropriate NFS server for kluster run mkdir /san/sanvol1/scratch/mm9/multiz4way cd /san/sanvol1/scratch/mm9/multiz4way time nice -n +19 rsync -a --copy-links --progress \ /cluster/data/mm9/bed/multiz4way/mafLinks/ . # 1 minute to copy 2.4 Gb # determine what is the newest version of multiz and use that mkdir penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn # the autoMultiz cluster run ssh pk cd /cluster/data/mm9/bed/multiz4way # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 4way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir run maf cd run # NOTE: you need to set the db and multiz dirname properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = mm9 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/$db/multiz4way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz4way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash 
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP ./autoMultiz $(root1) {check out line+ /cluster/data/mm9/bed/multiz4way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs awk '{print $1}' /cluster/data/mm9/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # 35 jobs para try ... check ... push ... etc ... # Completed: 35 of 35 jobs # CPU time in finished jobs: 27901s 465.02m 7.75h 0.32d 0.001 y # IO & Wait Time: 562s 9.37m 0.16h 0.01d 0.000 y # Average job time: 813s 13.55m 0.23h 0.01d # Longest finished job: 2222s 37.03m 0.62h 0.03d # Submission to last job: 2222s 37.03m 0.62h 0.03d # combine results into a single file for loading and gbdb reference ssh kkstore06 cd /cluster/data/mm9/bed/multiz4way time nice -n +19 catDir maf > multiz4way.maf # real 2m43.409s # makes a 6.5 Gb file: # -rw-rw-r-- 1 6883356263 Sep 7 11:00 multiz4way.maf # Create per-chrom individual maf files for downloads # NOT NECESSARY HERE - DONE LATER WITH THE ANNOTATED MAFS ssh kkstore04 cd /cluster/data/mm9/bed/multiz4way mkdir mafDownloads time for M in maf/chr*.maf do B=`basename $M` cp -p ${M} mafDownloads/${B} gzip mafDownloads/${B} echo ${B} done done # real 5m9.273 # deliver to downloads *!* NOT NECESSARY HERE - DONE LATER WITH # THE ANNOTATED MAFS ssh hgwdev ln -s /cluster/data/mm9/bed/multiz4way/mafDownloads \ /usr/local/apache/htdocs/goldenPath/mm9/multiz4way # Load into database ssh hgwdev cd /cluster/data/mm9/bed/multiz4way mkdir /gbdb/mm9/multiz4way ln -s /cluster/data/mm9/bed/multiz4way/multiz4way.maf \ /gbdb/mm9/multiz4way time nice -n +19 hgLoadMaf mm9 multiz4way # Loaded 5072051 mafs in 1 files from /gbdb/mm9/multiz4way # real 2m33.680s time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 mm9 multiz4waySummary multiz4way.maf # Created 1330454 summary blocks from 9893113 components # 
and 5068764 mafs from multiz4way.maf # real 3m27.620s # Create tree image for details page # You can get a better image from the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # with mm9 on top: (((mouse_mm9:0.076274,rat_rn4:0.084383):0.249544,human_hg18:0.126901):0.019763, dog_canFam2:0.187963); ######################################################################### ### GNF ATLAS 2 - required for UCSC Gene/Gene Sorter build # (DONE - 2007-09-10 - Hiram) # Align probes from GNF1M chip. ssh pk mkdir -p /cluster/data/mm9/bed/geneAtlas2/run/psl cd /cluster/data/mm9/bed/geneAtlas2/run cut -f1 /cluster/data/mm9/chrom.sizes > genome.list ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > probe.list cat << '_EOF_' > template #LOOP blat -fine -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << happy emacs gensub2 genome.list probe.list template jobList para create jobList para try ... check ... push ... etc. para time # Completed: 35 of 35 jobs # CPU time in finished jobs: 14865s 247.75m 4.13h 0.17d 0.000 y # IO & Wait Time: 160s 2.66m 0.04h 0.00d 0.000 y # Average job time: 429s 7.15m 0.12h 0.00d # Longest finished job: 1151s 19.18m 0.32h 0.01d # Submission to last job: 1166s 19.43m 0.32h 0.01d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyGnf1m.psl. pslSort dirs raw.psl tmp psl pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl \ ../affyGnf1m.psl /dev/null # Load probes and alignments from GNF1M into database.
ssh hgwdev cd /cluster/data/mm9/bed/geneAtlas2 # ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes hgLoadPsl mm9 affyGnf1m.psl hgLoadSeq mm9 /gbdb/hgFixed/affyProbes/gnf1m.fa # 31309 sequences # Load up track hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \ affyGnf1m.psl #Loaded 34863 rows of expression data from hgFixed.gnfMouseAtlas2MedianRatio # Mapped 30117, multiply-mapped 1723, missed 882, unmapped 4746 # Note that the unmapped 5000 records are from all-N sequences. hgLoadBed mm9 gnfAtlas2 gnfAtlas2.bed # Loaded 31840 elements of size 15 featureBits mm9 gnfAtlas2 # 12921627 bases of 2620346127 (0.493%) in intersection featureBits mm8 gnfAtlas2 # 12858280 bases of 2567283971 (0.501%) in intersection # during the build of UCSC genes, this sequence takes place: hgMapToGene mm9 affyGnf1m knownGene knownToGnf1m hgExpDistance mm9 hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m # this hgExpDistance command takes some time, maybe an hour or so ? # Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio # Got 31145 unique elements in hgFixed.gnfMouseAtlas2MedianRatio hgMapToGene mm9 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' ############################################################################ ### affyU74 TRACK - needed for the Gene Sorter (DONE - 2007-09-10 - Hiram) # # MAKE THE affyU74 TRACK using Affy consensus sequences instead of # target sequences. 
Recalculate alignments and load data # # The affy data has previously been loaded to iscratch in: # /iscratch/i/affy # It originates from: # /projects/compbio/data/microarray/affyGnfMouse/sequences/ # Run cluster job to do alignments ssh kk mkdir -p /cluster/data/mm9/bed/affyU74/run/psl cd /cluster/data/mm9/bed/affyU74/run cut -f1 /cluster/data/mm9/chrom.sizes > genome.list ls -1 /iscratch/i/affy/U74*consensus.fa > affy.list cat << '_EOF_' > template #LOOP blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << happy emacs gensub2 genome.list affy.list template jobList para create jobList para try ... check ... push ... etc. para time # Completed: 105 of 105 jobs # CPU time in finished jobs: 5891s 98.18m 1.64h 0.07d 0.000 y # IO & Wait Time: 738s 12.31m 0.21h 0.01d 0.000 y # Average job time: 63s 1.05m 0.02h 0.00d # Longest finished job: 199s 3.32m 0.06h 0.00d # Submission to last job: 215s 3.58m 0.06h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU74.psl. pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least minAli = 0.95. # minAli = 0.97 too high. low minCover as a lot of n's in these sequences #pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl \ ../all_affyU74.psl /dev/null # Processed 40512 alignments # Sort by chromosome and load into database. 
ssh hgwdev cd /cluster/data/mm9/bed/affyU74 pslSortAcc nohead chrom temp all_affyU74.psl # Processed 30609 lines into 1 temp files cat chrom/*.psl > affyU74.psl # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table mv affyU74.psl affyU74.psl.orig cut -f 1-9 affyU74.psl.orig >j1.tmp cut -f 10 affyU74.psl.orig | sed -e 's/:/\t/' | cut -f 2 > j2.tmp cut -f 11-21 affyU74.psl.orig >j3.tmp paste j1.tmp j2.tmp j3.tmp >affyU74.psl hgLoadPsl mm9 affyU74.psl rm -rf chrom temp run j?.tmp # creating the gene sorter tables runs the following: hgMapToGene mm9 affyU74 knownGene knownToU74 ############################################################################ ## MAKE THE affyGnfU74 TRACKs (DONE 3/8/06, Fan) # Make bed files and load consensus sequences for Affy U74 chip set. #This needs to be done after affyU74 is already made. ssh hgwdev mkdir -p /cluster/data/mm9/bed/affyGnf cd /cluster/data/mm9/bed/affyGnf # may need to build this command in src/hg/affyGnf ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \ affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2 # 89 experiments # 10043 rows of expression data # 30609 records in ../affyU74/affyU74.psl # 10309 records written to affyGnfU74A.bed ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \ affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2 # 20 experiments # 12477 rows of expression data # 30609 records in ../affyU74/affyU74.psl # 11324 records written to affyGnfU74B.bed ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \ affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2 # 20 experiments # 11934 rows of expression data # 30609 records in ../affyU74/affyU74.psl # 7773 records written to affyGnfU74C.bed # edit 3 .bed files to shorten qName to "xxxx_at" 
instead of "U74Xv2:xxxx_at;" # (these files do not appear to have these long names in them to begin with) mkdir sav mv *.bed sav sed -e "s/U74Av2://" sav/affyGnfU74A.bed > affyGnfU74A.bed sed -e "s/U74Bv2://" sav/affyGnfU74B.bed > affyGnfU74B.bed sed -e "s/U74Cv2://" sav/affyGnfU74C.bed > affyGnfU74C.bed # and reload data into table hgLoadBed mm9 affyGnfU74A affyGnfU74A.bed # Loaded 10309 elements of size 15 hgLoadBed mm9 affyGnfU74B affyGnfU74B.bed # Loaded 11324 elements of size 15 hgLoadBed mm9 affyGnfU74C affyGnfU74C.bed # Loaded 7773 elements of size 15 # Add in sequence data for U74 tracks. # This business is already in gbdb - 2007-09-10 - Hiram # You do not need to repeat this symlink sequence # Copy consensus sequence to /gbdb if it isn't already # mkdir -p /gbdb/hgFixed/affyProbes cd /gbdb/hgFixed/affyProbes # fix broken symlinks after directory structure changed # /projects/compbiodata ----> /projects/compbio/data rm U74* # make correct symlinks (hartera, 2005-05-03) ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa . # used perl -pi.bak -e 's/;/ /' to remove ";" after probe name # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4.
# reload sequences with prefix removed so acc matches name used in # other dependent tables hgLoadSeq -abbr=U74Av2: mm9 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa # 12422 sequences hgLoadSeq -abbr=U74Bv2: mm9 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa # 12411 sequences hgLoadSeq -abbr=U74Cv2: mm9 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa # 11868 sequences # building the gene sorter runs the following commands hgExpDistance mm9 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \ -lookup=knownToU74 # real 7m6.223s # Have 9636 elements in affyGnfU74A # Got 15902 unique elements in affyGnfU74A hgExpDistance mm9 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \ -lookup=knownToU74 # real 2m12.727s # Have 11025 elements in affyGnfU74B # Got 10442 unique elements in affyGnfU74B hgExpDistance mm9 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \ -lookup=knownToU74 # real 0m29.270s # Have 7487 elements in affyGnfU74C # Got 3259 unique elements in affyGnfU74C ########################################################################## # BUILD NIBB IMAGE PROBES (DONE - 2007-09-10 - Hiram) ssh pk mkdir -p /cluster/data/mm9/bed/nibbPics/run cd /cluster/data/mm9/bed/nibbPics cp -p /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa . cd run mkdir psl ls -1 /scratch/data/mm9/nib/*.nib > genome.list echo ../nibbImageProbes.fa > probe.list # Create parasol gensub file cat << '_EOF_' > template #LOOP blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch gensub2 genome.list probe.list template jobList para create jobList para try ... check ... push ... etc...
time # Completed: 35 of 35 jobs # CPU time in finished jobs: 9983s 166.39m 2.77h 0.12d 0.000 y # IO & Wait Time: 146s 2.43m 0.04h 0.00d 0.000 y # Average job time: 289s 4.82m 0.08h 0.00d # Longest finished job: 729s 12.15m 0.20h 0.01d # Submission to last job: 729s 12.15m 0.20h 0.01d # Make sort and filter catDir psl | sort -k 10 \ | pslReps stdin stdout /dev/null -nohead -minAli=0.60 \ -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \ | sort -k 14,14 -k 16,16n \ | sed 's#/scratch/data/mm9/nib/chr#chr#' \ | sed 's/.nib//' > ../nibbImageProbes.psl # Make bed file and copy in stuff ssh hgwdev cd /cluster/data/mm9/bed/nibbPics # Load into database ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa \ /gbdb/mm9/nibbImageProbes.fa hgLoadSeq mm9 /gbdb/mm9/nibbImageProbes.fa hgLoadPsl mm9 nibbImageProbes.psl ######################################################################### # Creating visiGene tables for gene sorter business # (DONE - 2007-09-10 - Hiram) # This businesss has cumulative effects on the visiGene database # for safety purposes, backup the visiGene database ssh hgwdev mkdir -p /cluster/data/mm9/bed/vgProbes/visiGene.bak cd /cluster/data/mm9/bed/vgProbes/visiGene.bak hgsqldump --all -c --tab=. visiGene cd /cluster/data/mm9/bed/vgProbes mkdir working cd /cluster/data/mm9/bed/vgProbes cp -p ~/kent/src/hg/visiGene/vgProbeTrack/*.sql . # this SEQ appears to find nothing new ? vgProbeTrack SEQ working mm9 rc = 0 = count of primers for mrna search for taxon 10090 rc = 0 = count of primers for genome search for taxon 10090 bac list read done. 
found seq for 0 bacEndPairs rc = 0 = count of refSeq mrna for mm9 rc = 0 = count of genRef mrna for mm9 rc = 0 = count of genbank mrna for mm9 rc = 0 = count of flatRef mrna for mm9 rc = 0 = count of flatAll mrna for mm9 rc = 0 = count of linkRef mrna for mm9 rc = 0 = count of linkAll mrna for mm9 rc = 0 = count of kgAlRef mrna for mm9 rc = 0 = count of kgAlAll mrna for mm9 # and then, this creates the vgProbes table in mm9 vgProbeTrack ALI working mm9 -sqlPath=.. hgsql -e "select count(*) from vgProbes;" mm9 # 24924 hgsql -e "select count(*) from vgProbes;" mm8 # 24615 # this appears to build working/vgPrbExt.fa and it loaded some sequences vgProbeTrack EXT working mm9 # this copies over all the items from vgProbes to start vgAllProbes vgProbeTrack SELFMAP working mm9 -sqlPath=.. # this adds frog alignments to vgAllProbes vgProbeTrack -sqlPath=.. REMAP working mm9 nibb nibbImageProbes \ /gbdb/mm9/nibbImageProbes.fa hgsql -e "select count(*) from vgAllProbes;" mm9 # 26289 hgsql -e "select count(*) from vgAllProbes;" mm8 # 25994 # finally, gathering together all alignments used and updates seq table vgProbeTrack EXTALL working mm9 # Then, during the gene sorter build, it does: knownToVisiGene mm9 vgGetText visiGene.text mm7 mm8 mm9 hg17 hg18 # probe has 26611 rows # gene has 20413 rows # imageProbe has 125765 rows wc -l visiGene.text # 124186 visiGene.text # compare to existing: wc -l /usr/local/apache/cgi-bin/visiGeneData/visiGene.text # 124186 /usr/local/apache/cgi-bin/visiGeneData/visiGene.text ######################################################################### # Create Allen Brain Atlas mapping. (DONE - 2007-09-24 - Hiram) # Set up directory ssh kkstore06 mkdir /cluster/data/mm9/bed/allenBrain cd /cluster/data/mm9/bed/allenBrain # find most recent update of allProbes.fa to use for these alignments cp -p /cluster/data/mm6/bed/allenBrain/allProbes.fa ./allenBrainProbes.fa cp -p /cluster/data/mm6/bed/allenBrain/allenBrainUrl.tab . 
# Set up a blat run to align the probes. mkdir split faSplit sequence allenBrainProbes.fa 200 split/rp mkdir run ssh pk cd /cluster/data/mm9/bed/allenBrain/run ls -1 ../split/*.fa > probe.list ls -1 /scratch/data/mm9/nib/*.nib > genome.list mkdir psl cat << '_EOF_' > template #LOOP runBlat $(path1) $(path2) $(root1) $(root2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > runBlat #!/bin/csh -ef set ooc = /scratch/data/mm9/11.ooc set tmpDir = /scratch/tmp/mm9 set workDir = $tmpDir/$3_$4 set pslOut = $3_$4.psl mkdir -p $tmpDir mkdir $workDir blat -ooc=$ooc $1 $2 $workDir/$pslOut mv $workDir/$pslOut psl/$pslOut rmdir $workDir rmdir --ignore-fail-on-non-empty $tmpDir '_EOF_' # << happy emacs chmod +x runBlat gensub2 genome.list probe.list template jobList para create jobList para try ... check ... push ... etc. # Completed: 6790 of 6790 jobs # CPU time in finished jobs: 28129s 468.81m 7.81h 0.33d 0.001 y # IO & Wait Time: 23014s 383.57m 6.39h 0.27d 0.001 y # Average job time: 8s 0.13m 0.00h 0.00d # Longest finished job: 29s 0.48m 0.01h 0.00d # Submission to last job: 363s 6.05m 0.10h 0.00d # Then do sorting and near-best-in-genome step on file server ssh kkstore06 cd /cluster/data/mm9/bed/allenBrain/run pslSort dirs raw.psl tmp psl pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 \ -nearTop=0.001 /dev/null # Processed 63183 alignments sort -k14,14 -k16,16n ../best.psl > ../allenBrainAli.psl # Clean up big files no longer needed rm raw.psl batch.bak rm -r psl rm -r ../split # Load up database ssh hgwdev cd /cluster/data/mm9/bed/allenBrain # Make a new table that contains the URLs for the allen brain genes # Make this one first since all.joiner considers it the master table. hgsql mm9 < ~/kent/src/hg/lib/allenBrainUrl.sql hgsql mm9 -e \ 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl;' # Make probe alignment table, and load sequence. 
hgLoadPsl mm9 allenBrainAli.psl mkdir /gbdb/mm9/allenBrain ln -s /cluster/data/mm9/bed/allenBrain/allenBrainProbes.fa \ /gbdb/mm9/allenBrain/allenBrainProbes.fa hgLoadSeq -replace mm9 /gbdb/mm9/allenBrain/allenBrainProbes.fa # Make mapping between known genes and allenBrain hgMapToGene mm9 allenBrainAli -type=psl knownGene knownToAllenBrain ######################################################################### # MOUSE AFFYMETRIX MOE430 TRACK (DONE - 2007-09-10 - Hiram) # mkdir -p /projects/compbio/data/microarray/affyMouse # Download MOE430A and MOE430B consensus sequences from Affymetrix web site # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430 # unzip MOE430*_consensus.zip # check for duplicate probes: there are none, all have unique names # check for duplicate probes: 100 from 136745_at to 1367551_a_at # remove "consensus:" and ";" from FASTA headers to shorten probeset # names for database # sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa # sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa # cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ # /cluster/bluearc/affy/ # THE ABOVE WAS ALREADY TBD) # Set up cluster job to align MOE430 consensus sequences to mm9 ssh kk mkdir /cluster/data/mm9/bed/affyMOE430 cd /cluster/data/mm9/bed/affyMOE430 ls -1 /iscratch/i/affy/MOE430_all.fa > probe.list cut -f1 /cluster/data/mm9/chrom.sizes > genome.list cat << '_EOF_' > template #LOOP blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << happy emacs gensub2 genome.list probe.list template jobList mkdir psl para create jobList # Do the job with usual para try/check/push/time etc. 
# Completed: 35 of 35 jobs # CPU time in finished jobs: 9093s 151.55m 2.53h 0.11d 0.000 y # IO & Wait Time: 217s 3.62m 0.06h 0.00d 0.000 y # Average job time: 266s 4.43m 0.07h 0.00d # Longest finished job: 602s 10.03m 0.17h 0.01d # Submission to last job: 602s 10.03m 0.17h 0.01d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyMOE430.psl pslSort dirs raw.psl tmp psl # only use alignments that cover 30% of sequence and have at least # 95% identity in aligned region. # low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 \ raw.psl affyMOE430.psl /dev/null # Load alignments and sequences into database ssh hgwdev cd /cluster/data/mm9/bed/affyMOE430 # shorten names in psl file sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak mv affyMOE430.psl.bak affyMOE430.psl # load track into database hgLoadPsl mm9 affyMOE430.psl # Add consensus sequences for MOE430 # Copy sequences to gbdb if they are not there already # mkdir -p /gbdb/hgFixed/affyProbes # ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ # /gbdb/hgFixed/affyProbes hgLoadSeq -abbr=MOE430 mm9 /gbdb/hgFixed/affyProbes/MOE430_all.fa # Clean up rm batch.bak raw.psl # and then, during the gene sorter build, it does: hgMapToGene mm9 affyMOE430 knownGene knownToMOE430 hgMapToGene mm9 affyMOE430 -prefix=A: knownGene knownToMOE430A ######################################################################### # creating UCSC genes track (DONE - 2007-08-31 - 2007-09-25 - Hiram) # working on the script mm9.ucscGenes10.csh in src/hg/makeDb/doc # The tracks created above were done as they were encountered # in working through that script. Worked through that script # approximately one kluster run at a time, using a large if (1 == 0) # statement to skip over business that had been successfully completed.
# After it reached the point where it had begun to load the tables # into the tempDb and started to fail at the missing tables affyGnf1m # the successfully loaded tables in tempDb were moved to mm9 and # the track began to function. Then, working through the affy # alignments above, and completing the loading of the knownTo tables # for the gene sorter as they were completed. Now continuing below # with the rest of the steps manually since it is not necessary to # use the tempDb and its /gbdb/ directory. Everything is now taking # place in the mm9 database. # example script to transfer appropriate tables from one DB to another # while saving the first set too hgsql -N -e "show tables;" mm9UCGenes | \ egrep -v "allenBrainAli|allenBrainUrl|extFile|knownToEnsembl|vgProbes|vgAllProbes|^seq$|trackDb|history|chromInfo" | while read T do echo -n "=== table ${T}: " C1=`hgsql -N -e "select count(*) from ${T}" mm9` C2=`hgsql -N -e "select count(*) from ${T}" mm9UCGenes 2> /dev/null` D=`echo "${C1}" "${C2}" | awk '{printf "%d", $2-$1}'` echo "${C1} - ${C2} - ${D}" echo "rename table mm9.${T} to mm9UCGenes.${T}_try0" echo "rename table mm9UCGenes.${T} to mm9.${T}" hgsql -e "rename table mm9.${T} to mm9UCGenes.${T}_try0" mysql hgsql -e "rename table mm9UCGenes.${T} to mm9.${T}" mysql done # The egrep -v knocks out tables that are redundant, should be the same # in both DBs # NOTE: the egrep pattern must be one unbroken string; whitespace or a # line break inside it changes which tables are filtered out ######################################################################### # running the blastP operation to the other genomes for the gene sorter # (DONE - 2007-09-10 - Hiram) mkdir /cluster/data/mm9/bed/ucsc.10/hgNearBlastp cd /cluster/data/mm9/bed/ucsc.10/hgNearBlastp cat << '_EOF_' > config.ra # Latest human vs.
other Gene Sorter orgs: # mouse, rat, zebrafish, worm, yeast, fly targetGenesetPrefix known targetDb mm9 queryDbs hg18 rn4 danRer4 dm2 ce4 sacCer1 mm9Fa /cluster/data/mm9/bed/ucsc.10/ucscGenes.faa hg18Fa /cluster/data/hg18/bed/blastp/known.faa rn4Fa /cluster/data/rn4/bed/blastp/known.faa danRer4Fa /cluster/data/danRer4/bed/blastp/ensembl.faa dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa ce4Fa /cluster/data/ce4/bed/hgNearBlastp/070731/ce4.sangerPep.faa sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa buildDir /cluster/data/mm9/bed/ucsc.10/hgNearBlastp scratchDir /san/sanvol1/scratch/mm9/jkgHgNearBlastp '_EOF_' # << happy emacs # takes about an hour time nice -n +19 doHgNearBlastp.pl config.ra > do.log 2>&1 & ######################################################################### # fixup the blastP tables to remove non-syntenic hits # (DONE - 2007-09-11 - Hiram) # This was all re-done 2007-09-25, see below: ###### Update blast tabs after UCSC genes rebuild (DONE - 2007-09-25 - Hiram) # Remove non-syntenic hits for human and rat # Takes a few minutes cd /cluster/data/mm9/bed/ucsc.10 synBlastp.csh mm9 rn4 # old number of unique query values: 31610 # old number of unique target values 7072 # new number of unique query values: 13973 # new number of unique target values 6888 synBlastp.csh mm9 hg18 # old number of unique query values: 38136 # old number of unique target values 17214 # new number of unique query values: 0 # new number of unique target values 0 # Make reciprocal best subset for the blastp pairs that are too # Far for synteny to help cd /cluster/data/mm9/bed/ucsc.10/hgNearBlastp # Us vs. 
others foreach otherDb (danRer4 dm2 ce4 sacCer1) set aToB = run.mm9.$otherDb set bToA = run.$otherDb.mm9 cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab \ $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab mm9 drBlastTab $aToB/recipBest.tab hgLoadBlastTab $otherDb tfBlastTab $bToA/recipBest.tab end # NOTE(review): every pass of the loop above targets the same two table # names (drBlastTab, tfBlastTab) regardless of $otherDb - presumably # per-organism table names were intended; confirm before reusing this loop # Clean up cat run.mm9.mm9/out/*.tab | gzip -c > run.mm9.mm9/all.tab.gz cat run.mm9.hg18/out/*.tab | gzip -c > run.mm9.hg18/all.tab.gz cat run.hg18.mm9/out/*.tab | gzip -c > run.hg18.mm9/all.tab.gz cat run.mm9.rn4/out/*.tab | gzip -c > run.mm9.rn4/all.tab.gz cat run.rn4.mm9/out/*.tab | gzip -c > run.rn4.mm9/all.tab.gz gzip run.*/all.tab rm -r run.*/out ######################################################################### # Update BLASTTAB blast tabs after UCSC genes rebuild ## (DONE - 2007-09-25 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm9/bed/hgNearBlastp/070924 cd /cluster/data/mm9/bed/hgNearBlastp/070924 # Get the proteins used by all hgNear organisms: pepPredToFa hg18 knownGenePep hg18.known.faa pepPredToFa mm9 knownGenePep mm9.known.faa pepPredToFa rn4 knownGenePep rn4.known.faa pepPredToFa danRer4 ensPep danRer4.ensPep.faa pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa pepPredToFa ce4 sangerPep ce4.sangerPep.faa pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa cat << '_EOF_' > config.ra # Latest human vs.
other Gene Sorter orgs: # mouse, rat, zebrafish, worm, yeast, fly targetGenesetPrefix known targetDb mm9 queryDbs hg18 rn4 danRer4 dm3 ce4 sacCer1 recipBest danRer4 dm3 ce4 sacCer1 mm9Fa /cluster/data/mm9/bed/hgNearBlastp/070924/mm9.known.faa hg18Fa /cluster/data/mm9/bed/hgNearBlastp/070924/hg18.known.faa rn4Fa /cluster/data/mm9/bed/hgNearBlastp/070924/rn4.known.faa danRer4Fa /cluster/data/mm9/bed/hgNearBlastp/070924/danRer4.ensPep.faa dm3Fa /cluster/data/mm9/bed/hgNearBlastp/070924/dm3.flyBasePep.faa ce4Fa /cluster/data/mm9/bed/hgNearBlastp/070924/ce4.sangerPep.faa sacCer1Fa /cluster/data/mm9/bed/hgNearBlastp/070924/sacCer1.sgdPep.faa buildDir /cluster/data/mm9/bed/hgNearBlastp/070924 scratchDir /san/sanvol1/scratch/mm9HgNearBlastp '_EOF_' # << happy emacs # Run with -noLoad so we can eyeball files, manually load mm9 tables now, # and after release of mm9 Gene Sorter on the RR, overload other # databases' mmBlastTab tables. time nice -n +19 doHgNearBlastp.pl -noLoad config.ra > do.log 2>&1 & tail -f do.log Follow instructions at end of do.log, piecewise: - first execute all of the run.mm9.* load scripts - then execute the run.hg18.mm9 and run.rn4.mm9 scripts - then run Galt's script (this is why we load hg18 and rn4 early): synBlastp.csh mm9 hg18 synBlastp.csh mm9 rn4 -- The following was performed 2007-10-11 - After mm9 hgNear/Gene Sorter is enabled on the RR: - run the remaining run.*.mm9 load scripts - then modify each $queryDb's hgGeneData/$org/$queryDb/otherOrg.ra to specify mm9 for mouse - then do a push request for $queryDbs.mmBlastTab and hgGeneData ######################################################################### # MAKE FOLDUTR TABLES (DONE - 2007-09-11 - Hiram) # First set up directory structure and extract UTR sequence on hgwdev # Beware running this on pk since the program RNAfold which is used # during this process is only found on /cluster/bin/i386/ # And there is no way for this cluster setup to verify success # of that program since it is 
hidden away in rnaFoldBig # Need to fix rnaFoldBig to recognize RNAfold missing ... ssh hgwdev mkdir /cluster/data/mm9/bed/ucsc.10/rnaStruct cd /cluster/data/mm9/bed/ucsc.10/rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa mm9 knownGene utr3 utr3/utr.fa utrFa mm9 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. faSplit sequence utr3/utr.fa 10000 utr3/split/s faSplit sequence utr5/utr.fa 10000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > template << '_EOF_' #LOOP rnaFoldBig split/$(path1) fold #ENDLOOP '_EOF_' # << happy emacs gensub2 in.lst single template jobList cp -p template ../utr5 cd ../utr5 gensub2 in.lst single template jobList ssh kk cd /cluster/data/mm9/bed/ucsc.10/rnaStruct/utr3 para make jobList # Completed: 9750 of 9750 jobs # CPU time in finished jobs: 377924s 6298.73m 104.98h 4.37d 0.012 y # IO & Wait Time: 38985s 649.75m 10.83h 0.45d 0.001 y # Average job time: 43s 0.71m 0.01h 0.00d # Longest finished job: 3432s 57.20m 0.95h 0.04d # Submission to last job: 11280s 188.00m 3.13h 0.13d cd ../utr5 para make jobList # Completed: 9253 of 9253 jobs # CPU time in finished jobs: 44949s 749.16m 12.49h 0.52d 0.001 y # IO & Wait Time: 51547s 859.11m 14.32h 0.60d 0.002 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest finished job: 1100s 18.33m 0.31h 0.01d # Submission to last job: 1398s 23.30m 0.39h 0.02d # Load database ssh hgwdev cd /cluster/data/mm9/bed/ucsc.10/rnaStruct/utr5 hgLoadRnaFold mm9 foldUtr5 fold # Parsed 35796 files cd ../utr3 hgLoadRnaFold -warnEmpty mm9 foldUtr3 fold # only one is empty: uc009gyo.1 # Seems to be a problem in # RNAfold, so not easy for us to fix. Consequence is not too bad, just a # few 3' UTRs will be missing annotation. 
(in this case, only one) # Clean up tar cvzf ./fold.tgz ./fold rm -r split fold err batch.bak cd ../utr5 tar cvzf ./fold.tgz ./fold rm -r split fold err batch.bak ######################################################################### # Make pfam run. Actual cluster run is about 6 hours. # (DONE - 2007-09-12 - Hiram) # First get pfam global HMMs into /san/sanvol1/pfam somehow. ssh pk mkdir /san/sanvol1/scratch/mm9/ucscGenes cd /san/sanvol1/scratch/mm9/ucscGenes mkdir splitProt faSplit sequence /cluster/data/mm9/bed/ucsc.10/ucscGenes.faa \ 10000 splitProt/ mkdir pfam cd pfam mkdir out ls -1 ../splitProt > gene.list cat << '_EOF_' > doPfam #!/bin/csh -ef /san/sanvol1/pfam/hmmpfam -E 0.1 /san/sanvol1/pfam/Pfam_fs $1 \ > /scratch/tmp/mm9.$2 mv /scratch/tmp/mm9.$2 $3 '_EOF_' # << happy emacs chmod a+x doPfam cat << '_EOF_' > template #LOOP doPfam ../splitProt/$(path1) $(root1).pf {check out line out/$(root1).pf} #ENDLOOP '_EOF_' # << happy emacs gensub2 gene.list single template jobList para create jobList para try ... check ... push ... etc... time # after some kluster difficulties Completed: 9666 of 9666 jobs CPU time in finished jobs: 3535078s 58917.96m 981.97h 40.92d 0.112 y IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y Average job time: 287s 4.78m 0.08h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 3430s 57.17m 0.95h 0.04d Submission to last job: 79051s 1317.52m 21.96h 0.91d # Make up pfamDesc.tab by converting pfam to a ra file first cat << '_EOF_' > makePfamRa.awk /^NAME/ {print} /^ACC/ {print} /^DESC/ {print; printf("\n");} '_EOF_' # << happy emacs awk -f makePfamRa.awk /cluster/store12/pfam/Pfam_fs > pfamDesc.ra raToTab -cols=ACC,NAME,DESC pfamDesc.ra stdout | \ awk -F '\t' '{ printf("%s\t%s\t%s\n", gensub(/\.[0-9]+/, "", "g", $1), $2, $3); }' > pfamDesc.tab # Convert output to tab-separated file. 
cd /cluster/data/mm9/bed/ucsc.10 catDir /san/sanvol1/scratch/mm9/ucscGenes/pfam/out \ | hmmPfamToTab -eValCol stdin ucscPfam.tab # Convert output to knownToPfam table awk '{printf("%s\t%s\n", $2, gensub(/\.[0-9]+/, "", "g", $1));}' \ /san/sanvol1/scratch/mm9/ucscGenes/pfam/pfamDesc.tab > sub.foo cut -f 1,4 ucscPfam.tab | subColumn 2 stdin sub.foo knownToPfam.tab hgLoadSqlTab mm9 knownToPfam ~/kent/src/hg/lib/knownTo.sql \ knownToPfam.tab cut -f 1-4 ucscPfam.tab > load.ucscPfam.tab hgLoadSqlTab mm9 ucscPfam ~/kent/src/hg/lib/ucscPfam.sql load.ucscPfam.tab cp -p /san/sanvol1/scratch/mm9/ucscGenes/pfam/pfamDesc.tab . hgLoadSqlTab mm9 pfamDesc ~/kent/src/hg/lib/pfamDesc.sql pfamDesc.tab ######################################################################### # Do scop run. Takes about 3.5 hours (DONE - 2007-09-12 - Hiram) # First get pfam global HMMs into /san/sanvol1/scop somehow. ssh pk mkdir /san/sanvol1/scratch/mm9/ucscGenes/scop cd /san/sanvol1/scratch/mm9/ucscGenes/scop mkdir out ls -1 ../splitProt > gene.list cat << '_EOF_' > doScop #!/bin/tcsh -ef /san/sanvol1/pfam/hmmpfam -E 0.1 /san/sanvol1/scop/scop.hmm $1 \ > /scratch/tmp/mm9.$2 mv /scratch/tmp/mm9.$2 $3 '_EOF_' chmod a+x doScop cat << '_EOF_' > template #LOOP doScop ../splitProt/$(path1) $(root1).pf {check out line out/$(root1).pf} #ENDLOOP '_EOF_' gensub2 gene.list single template jobList para create jobList para try ... check ... push ... etc... 
time # Completed: 9666 of 9666 jobs # CPU time in finished jobs: 3532425s 58873.76m 981.23h 40.88d 0.112 y # IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y # Average job time: 347s 5.78m 0.10h 0.00d # Longest finished job: 6512s 108.53m 1.81h 0.08d # Submission to last job: 12348s 205.80m 3.43h 0.14d # Convert scop output to tab-separated files ssh hgwdev cd /cluster/data/mm9/bed/ucsc.10 catDir /san/sanvol1/scratch/mm9/ucscGenes/scop/out | \ hmmPfamToTab -eValCol -scoreCol stdin scopPlusScore.tab scopCollapse scopPlusScore.tab /cluster/store12/scop/model.tab \ ucscScop.tab scopDesc.tab knownToSuper.tab hgLoadSqlTab mm9 knownToSuper ~/kent/src/hg/lib/knownToSuper.sql \ knownToSuper.tab hgLoadSqlTab mm9 ucscScop ~/kent/src/hg/lib/ucscScop.sql ucscScop.tab hgLoadSqlTab mm9 scopDesc ~/kent/src/hg/lib/scopDesc.sql scopDesc.tab # XXX - ccds is not yet available for Mm9 according to Mark # Regenerate ccdsKgMap table # /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=mm9 -loadDb \ # mm9.ccdsGene knownGene ccdsKgMap # Map old to new mapping - maybe next time, this is first genes on mm9 # hgsql mm9 -N -e 'select * from knownGene' > knownGene_1.gp # genePredToBed knownGene_1.gp >knownGene_1.bed # cat refSeq/*.bed mrna/*.bed | txGeneExplainUpdate1 knownGene_1.bed \ # ucscGenes.bed stdin abWalk.bed kg2ToKg3.bed # hgLoadSqlTab $tempDb kg1ToKg2 ~/kent/src/hg/lib/kg2ToKg3.sql kg2ToKg3.bed # Build kgSpAlias table, which combines content of both kgAlias and kgProtAlias tables. 
hgsql mm9 -N -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm9 -N -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp sort -u j.tmp > kgSpAlias.tab rm j.tmp hgLoadSqlTab mm9 kgSpAlias ~/kent/src/hg/lib/kgSpAlias.sql ./kgSpAlias.tab ######################################################################### # Building PROTEOME BROWSER TABLES (DONE - 2007-09-12 - Hiram) # These are instructions for building tables # needed for the Proteome Browser. # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap2 table # ARE REBUILT. # This build is based on proteins DBs dated 070202. # Create the working directory ssh hgwdev mkdir /cluster/data/mm9/bed/ucsc.10/pb cd /cluster/data/mm9/bed/ucsc.10/pb # Build the pepMwAa table hgsql proteins070202 -N -e \ "select info.acc, molWeight, aaSize from sp070202.info, sp070202.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab hgLoadSqlTab mm9 pepMwAa ~/kent/src/hg/lib/pepMwAa.sql ./pepMwAa.tab # Build the pepPi table hgsql proteins070202 -e \ "select info.acc from sp070202.info, sp070202.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.list hgsql mm9 -N \ -e 'select proteinID from knownGene where proteinID like "%-%"' \ | sort -u >> protAcc.list pbCalPi protAcc.list sp070202 pepPi.tab hgLoadSqlTab mm9 pepPi ~/kent/src/hg/lib/pepPi.sql ./pepPi.tab # Calculate and load pep distributions pbCalDist sp070202 proteins070202 10090 mm9 hgLoadSqlTab mm9 pepExonCntDist ~/kent/src/hg/lib/pepExonCntDist.sql \ ./pepExonCntDist.tab hgLoadSqlTab mm9 pepCCntDist ~/kent/src/hg/lib/pepCCntDist.sql \ ./pepCCntDist.tab hgLoadSqlTab mm9 pepHydroDist ~/kent/src/hg/lib/pepHydroDist.sql \ ./pepHydroDist.tab hgLoadSqlTab mm9 pepMolWtDist ~/kent/src/hg/lib/pepMolWtDist.sql \ ./pepMolWtDist.tab hgLoadSqlTab mm9 pepResDist ~/kent/src/hg/lib/pepResDist.sql \ ./pepResDist.tab 
hgLoadSqlTab mm9 pepIPCntDist ~/kent/src/hg/lib/pepIPCntDist.sql \ ./pepIPCntDist.tab hgLoadSqlTab mm9 pepPiDist ~/kent/src/hg/lib/pepPiDist.sql ./pepPiDist.tab # Calculate frequency distributions pbCalResStd sp070202 10090 mm9 # Create pbAnomLimit and pbResAvgStd tables hgLoadSqlTab mm9 pbAnomLimit ~/kent/src/hg/lib/pbAnomLimit.sql \ ./pbAnomLimit.tab hgLoadSqlTab mm9 pbResAvgStd ~/kent/src/hg/lib/pbResAvgStd.sql \ ./pbResAvgStd.tab hgsql -N -e "select * from pbStamp;" mm8 > pbStamp.tab hgLoadSqlTab mm9 pbStamp ~/kent/src/hg/lib/pbStamp.sql \ ./pbStamp.tab # Turn on protein and gene sorter hgsql -e 'update dbDb set hgNearOk=1,hgPbOk=1 where name="mm9";' \ hgcentraltest # Add mm9 to gdbPdb, pointing to proteins070202 mysql> insert into gdbPdb values('mm9','proteins070202'); ############################################################################ # BUILD KNOWN GENE LIST FOR GOOGLE. (DONE - 2007-10-03 - Hiram) cd /cluster/data/mm9/bed rm -rf knownGeneList/mm9 # Run hgKnownGeneList to generate the tree of HTML pages # under ./knownGeneList/mm9 hgKnownGeneList mm9 # copy over to /usr/local/apache/htdocs rm -rf /usr/local/apache/htdocs/knownGeneList/mm9 rsync -a --progress ./knownGeneList/mm9/ \ /usr/local/apache/htdocs/knownGeneList/mm9/ # if this is a new listing, add it to the top level # knownGeneLists.html file ############################################################################ # SGP GENES (DONE - 2007-10-01 - Hiram) ssh kkstore06 mkdir /cluster/data/mm9/bed/sgp cd /cluster/data/mm9/bed/sgp # They don't do chrM (we could just let that one fail ...) 
for C in `awk '{print $1}' /cluster/data/mm9/chrom.sizes | grep -v chrM` do wget --timestamping \ "http://genome.imim.es/genepredictions/M.musculus/mmJul2007/SGP/humangp200603/${C}.gtf" \ -O "${C}.gtf" done ssh hgwdev cd /cluster/data/mm9/bed/sgp ldHgGene -gtf -genePredExt mm9 sgpGene chr*.gtf # Read 35983 transcripts in 290486 lines in 34 files # 35983 groups 32 seqs 1 sources 3 feature types # 35983 gene predictions featureBits mm9 -enrichment refGene:CDS sgpGene # refGene:CDS 1.165%, sgpGene 1.439%, both 1.005%, cover 86.28%, # enrich 59.96x featureBits mm8 -enrichment refGene:CDS sgpGene # refGene:CDS 1.186%, sgpGene 1.455%, both 1.025%, cover 86.47%, # enrich 59.42x featureBits mm9 -enrichment knownGene:CDS sgpGene # knownGene:CDS 1.278%, sgpGene 1.439%, both 1.080%, cover 84.53%, # enrich 58.74x featureBits mm8 -enrichment knownGene:CDS sgpGene # knownGene:CDS 1.109%, sgpGene 1.455%, both 0.931%, cover 83.98%, # enrich 57.71x ##################################################################### # LOAD GENEID GENES (DONE - 2007-10-01 - Hiram) ssh kkstore06 mkdir -p /cluster/data/mm9/bed/geneid/download cd /cluster/data/mm9/bed/geneid/download bash awk '{print $1}' ../../../chrom.sizes | while read C do echo $C wget --timestamping \ "http://genome.imim.es/genepredictions/M.musculus/mmJul2007/geneid_v1.2/${C}.gtf" \ -O ${C}.gtf wget --timestamping \ "http://genome.imim.es/genepredictions/M.musculus/mmJul2007/geneid_v1.2/${C}.prot" \ -O ${C}.prot done exit # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end ssh hgwdev cd /cluster/data/mm9/bed/geneid ldHgGene -genePredExt -gtf mm9 geneid download/*.gtf # Read 36708 transcripts in 287399 lines in 35 files # 36708 groups 34 seqs 1 sources 3 feature types # 36708 gene predictions # the chr16_random file is empty, do not attempt to use it hgPepPred mm9 generic geneidPep \ `ls download/*-fixed.prot | grep -v chr16_random` featureBits mm9 -enrichment refGene 
geneid # refGene 1.975%, geneid 1.590%, both 0.956%, cover 48.39%, enrich 30.44x featureBits mm8 -enrichment refGene geneid # refGene 2.010%, geneid 1.592%, both 0.974%, cover 48.44%, enrich 30.43x featureBits mm7 -enrichment refGene geneid # refGene 2.002%, geneid 1.579%, both 0.952%, cover 47.57%, enrich 30.12x featureBits mm9 -enrichment knownGene geneid # knownGene 2.686%, geneid 1.590%, both 1.047%, cover 38.97%, enrich 24.52x featureBits mm8 -enrichment knownGene geneid # knownGene 2.130%, geneid 1.592%, both 0.900%, cover 42.23%, enrich 26.53x featureBits mm7 -enrichment knownGene geneid # knownGene 2.058%, geneid 1.579%, both 0.859%, cover 41.72%, enrich 26.42x ######################################################################### # BLASTZ/CHAIN/NET Orangutan ponAbe2 (DONE - 2007-09-21 - Hiram) ssh kkstore02 # use a screen to control this job screen mkdir /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19 cd /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19 cat << '_EOF_' > DEF # mouse vs orangutan BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=0 # QUERY: Orangutan ponAbe2 SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=10000 BASE=/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -stop=load -chainMinScore=3000 \ -chainLinearGap=medium -bigClusterHub=pk > do.log 2>&1 & # real 62m34.156s # some pk kluster difficulties, fixup and complete manually # Completed: 104880 of 104880 jobs # CPU time in finished jobs: 7142978s 119049.64m 1984.16h 82.67d 0.227 y # IO & Wait Time: 556393s 9273.21m 154.55h 6.44d 0.018 y # Average job time: 73s 1.22m 0.02h 0.00d # Longest finished job: 507s 8.45m 0.14h 0.01d # Submission to last job: 65973s 1099.55m 18.33h 0.76d time nice -n +19 doBlastzChainNet.pl 
-verbose=2 `pwd`/DEF \ -continue=cat -chainMinScore=3000 \ -chainLinearGap=medium -bigClusterHub=pk > cat.log 2>&1 & # real 166m20.442s cat fb.mm9.chainPonAbe2Link.txt # 914561309 bases of 2620346127 (34.902%) in intersection # And, for the swap mkdir /cluster/data/ponAbe2/bed/blastz.mm9.swap cd /cluster/data/ponAbe2/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19/DEF \ -chainMinScore=3000 -swap -chainLinearGap=medium \ -bigClusterHub=pk > swap.log 2>&1 & # real 102m23.209s cat fb.ponAbe2.chainMm9Link.txt # 948458190 bases of 3093572278 (30.659%) in intersection # create the syntenic maf nets: ssh hgwdev cd /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19 time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=syntenicNet -syntenicNet -chainMinScore=3000 \ -chainLinearGap=medium -bigClusterHub=pk > syntenicNet.log 2>&1 & # real 22m16.544s ######################################################################## # BLASTZ/CHAIN/NET Frog X. 
tropicalis xenTro2 (DONE - 2007-09-23 - Hiram) ssh kkstore04 screen # use screen to manage this job # XXX note for next time, missing the TMPDIR in the DEF file mkdir /cluster/data/mm9/bed/blastzXenTro2.2007-09-19 cd /cluster/data/mm9/bed/blastzXenTro2.2007-09-19 cat << '_EOF_' > DEF # Mouse (mm9) vs frog (xenTro2) BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=0 # QUERY: Frog xenTro2 - single chunk big enough to run two of the # largest scaffolds in one job SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit SEQ2_LEN=/cluster/data/xenTro2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=10000 BASE=/cluster/data/mm9/bed/blastzXenTro2.2007-09-19 '_EOF_' # << emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 & # real 1050m55.259s # after kk difficulties, finishing the first kluster run manually time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # Completed: 126539 of 126540 jobs # Crashed: 1 jobs # CPU time in finished jobs: 15750656s 262510.93m 4375.18h 182.30d 0.499 y # IO & Wait Time: 843281s 14054.69m 234.24h 9.76d 0.027 y # Average job time: 131s 2.19m 0.04h 0.00d # Longest finished job: 2039s 33.98m 0.57h 0.02d # Submission to last job: 79275s 1321.25m 22.02h 0.92d # A single job kept having trouble, finished it on kolossus: ssh kolossus cd /cluster/data/mm9/bed/blastzXenTro2.2007-09-19/run.blastz time nice -n +19 /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \ /scratch/data/mm9/mm9.2bit:chr2:80000000-90000000 qParts/part008.lst ../DEF \ ../psl/mm9.2bit:chr2:80000000-90000000/mm9.2bit:chr2:80000000-90000000_part008.lst.psl # continuing after that time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat 
-bigClusterHub=kk -chainMinScore=5000 \ -chainLinearGap=loose `pwd`/DEF > cat.out 2>&1 & # real 62m17.627s cat fb.mm9.chainXenTro2Link.txt # 82054987 bases of 2620346127 (3.131%) in intersection # Then to swap over to xenTro2 mkdir /cluster/data/xenTro2/bed/blastz.mm9.swap cd /cluster/data/xenTro2/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=kk -chainMinScore=5000 \ /cluster/data/mm9/bed/blastzXenTro2.2007-09-19/DEF \ -chainLinearGap=loose > swap.out 2>&1 & # real 47m53.428s ssh hgwdev cd /cluster/data/mm9/bed/blastz.xenTro2.2007-09-19 time nice -n +19 featureBits mm9 chainXenTro2Link \ > fb.mm9.chainXenTro2Link 2>&1 & # 68050843 bases of 2567283971 (2.651%) in intersection cd /cluster/data/xenTro2/bed/blastz.mm9.swap time nice -n +19 featureBits xenTro2 chainMm8Link \ > fb.xenTro2.chainMm8Link 2>&1 # 72840135 bases of 1359412157 (5.358%) in intersection ######################################################################### ## BLASTZ Lizard anoCar1 - (DONE - 2007-09-21 - Hiram) ssh kkstore04 mkdir /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19 cd /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19 cat << '_EOF_' > DEF # Mouse (mm9) vs lizard (anoCar1) BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=0 # QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit SEQ2_LEN=/cluster/data/anoCar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=10000 BASE=/cluster/data/mm9/bed/blastzAnoCar1.2007-09-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -qRepeats=windowmaskerSdust \ -chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 & # real 911m49.918s # after kk difficulties, finishing the first kluster run manually # 
Completed: 86355 of 86355 jobs # CPU time in finished jobs: 11171051s 186184.18m 3103.07h 129.29d 0.354 y # IO & Wait Time: 662082s 11034.70m 183.91h 7.66d 0.021 y # Average job time: 137s 2.28m 0.04h 0.00d # Longest finished job: 1467s 24.45m 0.41h 0.02d # Submission to last job: 62938s 1048.97m 17.48h 0.73d # continuing time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ DEF -chainMinScore=5000 \ -continue=cat -qRepeats=windowmaskerSdust \ -chainLinearGap=loose -bigClusterHub=kk -verbose=2 > cat.log 2>&1 & # real 31m44.652s cat fb.mm9.chainAnoCar1Link.txt # 89239796 bases of 2620346127 (3.406%) in intersection # and for the swap mkdir /cluster/data/anoCar1/bed/blastz.mm9.swap cd /cluster/data/anoCar1/bed/blastz.mm9.swap time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19/DEF -chainMinScore=5000 \ -swap -qRepeats=windowmaskerSdust \ -chainLinearGap=loose -bigClusterHub=kk -verbose=2 > swap.log 2>&1 & # real 29m12.291s cat fb.anoCar1.chainMm9Link.txt # 85923556 bases of 1741478929 (4.934%) in intersection ######################################################################### # BLASTZ Chicken galGal3 (DONE - 2007-09-25 - Hiram) ssh kkstore03 screen # use screen to control this job mkdir /cluster/data/mm9/bed/blastzGalGal3.2007-09-21 cd /cluster/data/mm9/bed/blastzGalGal3.2007-09-21 # This partitioning is too large to run on kk, must run this on pk # or change the partitioning cat << '_EOF_' > DEF # mouse vs chicken BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_SMSK=/scratch/data/mm9/notInOthers SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom SEQ2_DIR=/scratch/hg/galGal3/nib SEQ2_LEN=/cluster/data/galGal3/chrom.sizes SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep 
SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzGalGal3.2007-09-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 & # real 587m53.468s # Completed: 16680 of 17168 jobs # Crashed: 488 jobs # CPU time in finished jobs: 7758569s 129309.48m 2155.16h 89.80d 0.246 y # IO & Wait Time: 190128s 3168.80m 52.81h 2.20d 0.006 y # Average job time: 477s 7.94m 0.13h 0.01d # Longest finished job: 6501s 108.35m 1.81h 0.08d # Submission to last job: 271554s 4525.90m 75.43h 3.14d # the kk cluster could not complete some of these jobs. A recovery job # list was created from the remaining jobs and completed on pk # Completed: 488 of 488 jobs # CPU time in finished jobs: 1226144s 20435.73m 340.60h 14.19d 0.039 y # IO & Wait Time: 6875s 114.58m 1.91h 0.08d 0.000 y # Average job time: 2527s 42.11m 0.70h 0.03d # Longest finished job: 3872s 64.53m 1.08h 0.04d # Submission to last job: 11739s 195.65m 3.26h 0.14d # continuing time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -continue=cat -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \ > cat.log 2>&1 & # real 18m35.814s cat fb.mm9.chainGalGal3Link.txt # 97711788 bases of 2620346127 (3.729%) in intersection # and the swap mkdir /cluster/data/galGal3/bed/blastz.mm9.swap cd /cluster/data/galGal3/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 -verbose=2 \ /cluster/data/mm9/bed/blastzGalGal3.2007-09-21/DEF \ -swap -chainLinearGap=loose -bigClusterHub=pk > swap.log 2>&1 & # real 12m54.737s cat fb.galGal3.chainMm9Link.txt # 84990797 bases of 1042591351 (8.152%) in intersection ######################################################################### # BLASTZ Platypus ornAna1 - (DONE - 2007-09-21 - 2007-09-25 - Hiram) ssh kkstore05 mkdir /cluster/data/mm9/bed/blastzOrnAna1.2007-09-21 cd /cluster/data/mm9/bed/blastzOrnAna1.2007-09-21 cat << '_EOF_' > DEF # mouse 
vs. platypus BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=0 # QUERY: ornAna1 SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzOrnAna1.2007-09-21 TMPDIR=/scratch/tmp '_EOF_' # << emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 & # real 912m18.732s cat fb.mm9.chainOrnAna1Link.txt # 141953739 bases of 2620346127 (5.417%) in intersection # and the swap mkdir /cluster/data/ornAna1/bed/blastz.mm9.swap cd /cluster/data/ornAna1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 -verbose=2 \ /cluster/data/mm9/bed/blastzOrnAna1.2007-09-21/DEF \ -swap -chainLinearGap=loose -bigClusterHub=kk > swap.log 2>&1 & # real 123m16.632s cat fb.ornAna1.chainMm9Link.txt # 135570580 bases of 1842236818 (7.359%) in intersection ######################################################################### # Blastz Chimp panTro2 - (DONE - 2007-09-24 - 2007-09-25 - Hiram) ssh kkstore04 mkdir /cluster/data/mm9/bed/blastzPanTro2.2007-09-24 cd /cluster/data/mm9/bed/blastzPanTro2.2007-09-24 cat << '_EOF_' > DEF # Mouse vs Chimp BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_SMSK=/scratch/data/mm9/notInOthers SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chimp PanTro2 SEQ2_DIR=/scratch/hg/panTro2/nib SEQ2_LEN=/cluster/data/panTro2/chrom.sizes SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent SEQ2_CHUNK=50000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzPanTro2.2007-09-24 TMPDIR=/scratch/tmp '_EOF_' # << emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ DEF > 
blastz.out 2>&1 & # real 701m23.446s cat fb.mm9.chainPanTro2Link.txt # 987180081 bases of 2620346127 (37.674%) in intersection # and the swap mkdir /cluster/data/panTro2/bed/blastz.mm9.swap cd /cluster/data/panTro2/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzPanTro2.2007-09-24/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap > swap.log 2>&1 & # real 87m25.448s cat fb.panTro2.chainMm9Link.txt # 997050630 bases of 2909485072 (34.269%) in intersection # create syntenic maf nets: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet -continue=syntenicNet DEF > syntenicNet.out 2>&1 & # real 25m13.118s ######################################################################### # Blastz Horse equCab1 - (DONE - 2007-09-24 - 2007-09-25 - Hiram) ssh kkstore05 mkdir /cluster/data/mm9/bed/blastzEquCab1.2007-09-24 cd /cluster/data/mm9/bed/blastzEquCab1.2007-09-24 cat << '_EOF_' > DEF # Mouse vs Horse # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse EquCab1 SEQ2_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ2_LEN=/cluster/data/equCab1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzEquCab1.2007-09-24 TMPDIR=/scratch/tmp '_EOF_' # << emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ DEF > blastz.out 2>&1 & # real 1582m34.597s cat fb.mm9.chainEquCab1Link.txt # 911418189 bases of 2620346127 (34.782%) in intersection # and the swap mkdir /cluster/data/equCab1/bed/blastz.mm9.swap cd /cluster/data/equCab1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ /cluster/data/mm9/bed/blastzEquCab1.2007-09-24/DEF \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ > swap.out 2>&1 & # real ~110m 
cat fb.equCab1.chainMm9Link.txt # 901367656 bases of 2421923695 (37.217%) in intersection # create the syntenic maf nets time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=syntenicNet -syntenicNet DEF > syntenicNet.out 2>&1 & # real 29m40.546s ######################################################################### # Blastz Cow bosTau3 (DONE - 2007-09-25 - Hiram) ssh kkstore05 screen # use a screen to control this job mkdir /cluster/data/mm9/bed/blastzBosTau3.2007-09-25 cd /cluster/data/mm9/bed/blastzBosTau3.2007-09-25 cat << '_EOF_' > DEF # Mouse vs Cow # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau3 SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit SEQ2_LEN=/cluster/data/bosTau3/chrom.sizes SEQ2_LIMIT=100 SEQ2_CHUNK=50000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzBosTau3.2007-09-25 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -chainMinScore=3000 -verbose=2 \ DEF -bigClusterHub=pk -chainLinearGap=medium > do.log 2>&1 & # real 733m40.065s cat fb.mm9.chainBosTau3Link.txt # 690515959 bases of 2620346127 (26.352%) in intersection # and for the swap mkdir /cluster/data/bosTau3/bed/blastz.mm9.swap cd /cluster/data/bosTau3/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -chainMinScore=3000 -verbose=2 \ /cluster/data/mm9/bed/blastzBosTau3.2007-09-25/DEF \ -swap -bigClusterHub=pk -chainLinearGap=medium > swap.log 2>&1 & # real 100m20.707s cat fb.bosTau3.chainMm9Link.txt # 707779988 bases of 2731807384 (25.909%) in intersection # create the syntenic maf nets time nice -n +19 doBlastzChainNet.pl -chainMinScore=3000 -verbose=2 \ -syntenicNet -continue=syntenicNet \ DEF -bigClusterHub=pk -chainLinearGap=medium > syntenicNet.log 2>&1 & # real 16m28.741s ######################################################################### # Blastz Opossum monDom4 
(DONE - 2007-09-25 - 2007-09-27 - Hiram) ssh kkstore04 screen # use screen to manage this job mkdir /cluster/data/mm9/bed/blastzMonDom4.2007-09-25 cd /cluster/data/mm9/bed/blastzMonDom4.2007-09-25 # the opossum chroms are too large to work with on the kk, must run this # on the pk kluster cat << '_EOF_' > DEF # Mouse vs. opossum BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom4 SEQ2_DIR=/scratch/hg/monDom4/monDom4.2bit SEQ2_LEN=/cluster/data/monDom4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzMonDom4.2007-09-25 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 & # real 811m19.320s # problem on kki run, monDom4 wasn't distributed on the Iservers to # /scratch/hg/monDom4/ - straighten that up, and finish that run, then # continuing time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -continue=chainMerge -chainLinearGap=loose \ -bigClusterHub=pk -verbose=2 > chainMerge.log 2>&1 & # real 158m9.287s cat fb.mm9.chainMonDom4Link.txt # 255535025 bases of 2620346127 (9.752%) in intersection # and for the swap mkdir /cluster/data/monDom4/bed/blastz.mm9.swap cd /cluster/data/monDom4/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 -verbose=2 \ /cluster/data/mm9/bed/blastzMonDom4.2007-09-25/DEF \ -swap -chainLinearGap=loose \ -bigClusterHub=pk > swap.log 2>&1 & # real 59m19.005s cat fb.monDom4.chainMm9Link.txt # 254018516 bases of 3501643220 (7.254%) in intersection ######################################################################### # Blastz Tenrec echTel1 (DONE - 2007-09-25 - 2007-09-27 - Hiram) ssh kkstore02 screen # use a screen to control this job mkdir /cluster/data/mm9/bed/blastzEchTel1.2007-09-25 
cd /cluster/data/mm9/bed/blastzEchTel1.2007-09-25 cat << '_EOF_' > DEF BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - Tenrec echTel1 SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=800 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzEchTel1.2007-09-25 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -bigClusterHub=kk -chainLinearGap=medium DEF > do.log 2>&1 & # real 2721m33.204s cat fb.mm9.chainEchTel1Link.txt # 291920039 bases of 2620346127 (11.141%) in intersection # and for the swap mkdir /cluster/data/echTel1/bed/blastz.mm9.swap cd /cluster/data/echTel1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ /cluster/data/mm9/bed/blastzEchTel1.2007-09-25/DEF \ -swap -bigClusterHub=kk -chainLinearGap=medium > swap.log 2>&1 & # real 520m9.198s cat fb.echTel1.chainMm9Link.txt # 298656963 bases of 2111581369 (14.144%) in intersection # create syntenic maf nets time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ DEF -continue=syntenicNet -bigClusterHub=kk \ -syntenicNet -chainLinearGap=medium > syntenicNet.log 2>&1 & # real 3m4.285s # create reciprocal best chains/nets ssh hgwdev cd /cluster/data/mm9/bed/blastzEchTel1.2007-09-25 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 echTel1 \ > rbest.log 2>&1 & # real 34m12.936s ######################################################################### # Blastz Tree Shrew tupBel1 (DONE - 2007-09-27 - 2007-10-01 - Hiram) ssh kkstore05 screen # use screen to control this job mkdir /cluster/data/mm9/bed/blastzTupBel1.2007-09-27 cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27 cat << '_EOF_' > DEF # Mouse vs. 
Tree Shrew BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Tree shrew tupBel1 SEQ2_DIR=/san/sanvol1/scratch/tupBel1/tupBel1.2bit SEQ2_LEN=/cluster/data/tupBel1/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzTupBel1.2007-09-27 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \ > chainMerge.log 2>&1 & # real 1262m32.699s # the load should fail due to missing repeat masker tables in tupBel1 time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \ > net.log 2>&1 & # real 69m41.901s # and indeed it did, Loading the net track ssh hgwdev cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27/axtChain cp -p noClass.net mm9.tupBel1.net time nice -n +19 netFilter -minGap=10 mm9.tupBel1.net \ | hgLoadNet -warn mm9 netTupBel1 stdin cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27 time nice -n +19 featureBits mm9 chainTupBel1Link \ > fb.mm9.chainTupBel1Link.txt 2>&1 & cat fb.mm9.chainTupBel1Link.txt # 552865662 bases of 2620346127 (21.099%) in intersection # and, to finish it all off, with syntenic net time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -continue=download -bigClusterHub=pk \ -syntenicNet -chainLinearGap=medium DEF > syntenicNet.log 2>&1 & # real 14m42.816s # create reciprocal best chains/nets ssh hgwdev cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 tupBel1 \ > rbest.log 2>&1 & # real 41m12.278s ######################################################################### # Blastz Bush Baby otoGar1 (DONE - 2007-09-27 - 2007-09-28 - Hiram) ssh kkstore05 screen # use screen to control this job mkdir 
/cluster/data/mm9/bed/blastzOtoGar1.2007-09-27 cd /cluster/data/mm9/bed/blastzOtoGar1.2007-09-27 cat << '_EOF_' > DEF # Mouse vs. Bush Baby BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Bush baby otoGar1 SEQ2_DIR=/san/sanvol1/scratch/otoGar1/otoGar1.2bit SEQ2_LEN=/cluster/data/otoGar1/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzOtoGar1.2007-09-27 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \ > chainMerge.log 2>&1 & # real 873m23.531s time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \ > net.log 2>&1 & # real 67m7.172s cat fb.mm9.chainOtoGar1Link.txt # 601932945 bases of 2620346127 (22.972%) in intersection # and run the syntenicNet and cleanup time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \ -syntenicNet > syntenicNet.log 2>&1 & # real 13m57.573s # create reciprocal best chains/nets ssh hgwdev cd /cluster/data/mm9/bed/blastzOtoGar1.2007-09-27 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 otoGar1 \ > rbest.log 2>&1 & # real 40m1.428s ######################################################################### # Blastz Armadillo dasNov1 (DONE - 2007-09-27 - 2007-10-02 - Hiram) ssh kkstore04 screen # use screen to control this job mkdir /cluster/data/mm9/bed/blastzDasNov1.2007-09-27 cd /cluster/data/mm9/bed/blastzDasNov1.2007-09-27 cat << '_EOF_' > DEF # Mouse vs. 
Armadillo BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Armadillo dasNov1 SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit SEQ2_LEN=/cluster/data/dasNov1/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzDasNov1.2007-09-27 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \ > load.log 2>&1 & # real 3607m35.169s cat fb.mm9.chainDasNov1Link.txt # 433593082 bases of 2620346127 (16.547%) in intersection time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \ -syntenicNet > syntenicNet.log 2>&1 & # real 15m7.642s # create reciprocal best chains/nets ssh hgwdev cd /cluster/data/mm9/bed/blastzDasNov1.2007-09-27 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 dasNov1 \ > rbest.log 2>&1 & # real 39m18.156s ######################################################################### # Blastz Rabbit oryCun1 (DONE - 2007-09-28 - 2007-09-29 - Hiram) ssh kkstore04 screen # use screen to control this job mkdir /cluster/data/mm9/bed/blastzOryCun1.2007-09-28 cd /cluster/data/mm9/bed/blastzOryCun1.2007-09-28 cat << '_EOF_' > DEF # Mouse vs. 
Rabbit BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rabbit oryCun1 SEQ2_DIR=/scratch/hg/oryCun1/oryCun1.2bit SEQ2_LEN=/cluster/data/oryCun1/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzOryCun1.2007-09-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \ > chainMerge.log 2>&1 & # real 2126m59.162s time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \ > load.log 2>&1 & # real 53m28.279s cat fb.mm9.chainOryCun1Link.txt # 496428446 bases of 2620346127 (18.945%) in intersection time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \ -syntenicNet > syntenicNet.log 2>&1 & # real 9m27.321s # create reciprocal best chains/nets ssh hgwdev cd /cluster/data/mm9/bed/blastzOryCun1.2007-09-28 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 oryCun1 \ > rbest.log 2>&1 & # real 37m32.151s ######################################################################### # Blastz Cat felCat3 (DONE - 2007-09-28 - 2007-09-29 - Hiram) ssh kkstore05 screen # use screen to control this job mkdir /cluster/data/mm9/bed/blastzFelCat3.2007-09-28 cd /cluster/data/mm9/bed/blastzFelCat3.2007-09-28 cat << '_EOF_' > DEF # Mouse vs. 
Cat BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cat felCat3 SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit SEQ2_LEN=/cluster/data/felCat3/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzFelCat3.2007-09-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \ > chainMerge.log 2>&1 & # real 1597m21.032s time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \ > load.log 2>&1 & # real 39m30.078s cat fb.mm9.chainFelCat3Link.txt # 499894253 bases of 2620346127 (19.077%) in intersection time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \ -syntenicNet > syntenicNet.log 2>&1 & # real 9m42.624s # create reciprocal best chains/nets ssh hgwdev cd /cluster/data/mm9/bed/blastzFelCat3.2007-09-28 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 felCat3 \ > rbest.log 2>&1 & # real 36m40.000s ######################################################################### # Blastz Elephant loxAfr1 (DONE - 2007-09-28 - 2007-10-02 - Hiram) ssh kkstore04 screen # use screen to control this job mkdir /cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28 cd /cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28 cat << '_EOF_' > DEF # Mouse vs. 
Elephant BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Elephant loxAfr1 SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit SEQ2_LEN=/cluster/data/loxAfr1/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \ > load.log 2>&1 & # real 2981m3.302s # had two failed jobs in that state where their results existed, # but parasol thought they were not done. Continuing, and now # all the way to syntenicNet. Will probably fail during the load # since not everything is there for db loxAfr1 time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -continue=cat -bigClusterHub=pk -chainLinearGap=medium DEF \ -syntenicNet > syntenicNet.log 2>&1 & # real 166m4.710s # it did get through everything to a successful completion cat fb.mm9.chainLoxAfr1Link.txt # 473014688 bases of 2620346127 (18.052%) in intersection # create reciprocal best chains/nets ssh hgwdev cd /cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 loxAfr1 \ > rbest.log 2>&1 & # real 41m56.201s ######################################################################### # Blastz Hedgehog eriEur1 (DONE - 2007-09-28 - 2007-10-02 - Hiram) ssh kkstore05 screen # use screen to control this job mkdir /cluster/data/mm9/bed/blastzEriEur1.2007-09-28 cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28 cat << '_EOF_' > DEF # Mouse vs. 
Hedgehog BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Hedgehog eriEur1 SEQ2_DIR=/san/sanvol1/scratch/eriEur1/eriEur1.2bit SEQ2_LEN=/cluster/data/eriEur1/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzEriEur1.2007-09-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \ > load.log 2>&1 & # failed during the load since the db eriEur1 does not exist ssh hgwdev cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28/axtChain cp -p noClass.net mm9.eriEur1.net time nice -n +19 netFilter -minGap=10 mm9.eriEur1.net \ | hgLoadNet -warn mm9 netEriEur1 stdin cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28 time nice -n +19 featureBits mm9 chainEriEur1Link \ > fb.mm9.chainEriEur1Link.txt 2>&1 & cat fb.mm9.chainEriEur1Link.txt # 262604655 bases of 2620346127 (10.022%) in intersection # continuing through syntenic nets (actually unneeded) time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \ -continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \ -syntenicNet > syntenicNet.log 2>&1 & # create reciprocal best chains/nets ssh hgwdev cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 eriEur1 \ > rbest.log 2>&1 & # real 33m27.296s ######################################################################### # Blastz Shrew sorAra1 (DONE - 2007-09-28 - 2007-10-01 - Hiram) ssh kkstore05 screen # use screen to control this job mkdir /cluster/data/mm9/bed/blastzSorAra1.2007-09-28 cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28 cat << '_EOF_' > DEF # Mouse vs. 
# Shrew
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Shrew sorAra1
SEQ2_DIR=/san/sanvol1/scratch/sorAra1/sorAra1.2bit
SEQ2_LEN=/cluster/data/sorAra1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzSorAra1.2007-09-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    # First pass, stopping at the chainMerge step (the next invocation
    # resumes with -continue=net).  Runs in the background; stdout and
    # stderr are both captured in chainMerge.log.
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
        -stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
        > chainMerge.log 2>&1 &
    #   real    2478m57.242s

    # Second pass: continue from the net step, stopping at load.
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
        -continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
        > load.log 2>&1 &
    #   real    15m55.272s
    # as expected, fails during load since there is no sorAra1 database

    # load nets without class
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28/axtChain
    cp -p noClass.net mm9.sorAra1.net
    time nice -n +19 netFilter -minGap=10 mm9.sorAra1.net \
        | hgLoadNet -warn mm9 netSorAra1 stdin
    cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
    # featureBits runs in the foreground here so that the result file is
    # complete before it is displayed by the cat below.
    time nice -n +19 featureBits mm9 chainSorAra1Link \
        > fb.mm9.chainSorAra1Link.txt 2>&1
    cat fb.mm9.chainSorAra1Link.txt
    #   250412778 bases of 2620346127 (9.556%) in intersection

    # and, to finish it all off, with syntenic net
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
        -continue=download -bigClusterHub=pk \
        -syntenicNet -chainLinearGap=medium DEF > syntenicNet.log 2>&1 &
    #   real    3m49.961s

    # create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 sorAra1 \
        > rbest.log 2>&1 &
    #   real    27m3.076s

#########################################################################
## 30-Way Multiz (DONE - 2007-10-01 - Hiram)
## The blastz alignments for this 30-way are documented at:
## http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
##
    ssh hgwdev
    mkdir
/cluster/data/mm9/bed/multiz30way cd /cluster/data/mm9/bed/multiz30way # take the 28-way tree from hg18 and insert the two new genomes. # rearrange to get mm9 on the top of the graph # paste this tree into the on-line phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to create the image for the tree diagram cat << '_EOF_' > mm9OnTop.fullNames.nh (((((((( (((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607, GuineaPig_cavPor2:0.202990):0.034350, Rabbit_oryCun1:0.208548):0.014587, ((((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037, Orangutan_ponAbe2:0.02):0.013037,Rhesus_rheMac2:0.031973):0.0365, Marmoset_calJac1:0.07):0.0365,Bushbaby_otoGar1:0.151185):0.015682, TreeShrew_tupBel1:0.162844):0.006272):0.019763, ((Shrew_sorAra1:0.248532,Hedgehog_eriEur1:0.222255):0.045693, (((Dog_canFam2:0.101137,Cat_felCat3:0.098203):0.048213, Horse_equCab1:0.099323):0.007287, Cow_bosTau3:0.163945):0.012398):0.018928):0.030081, (Armadillo_dasNov1:0.133274,(Elephant_loxAfr1:0.103030, Tenrec_echTel1:0.232706):0.049511):0.008424):0.213469, Opossum_monDom4:0.320721):0.088647, Platypus_ornAna1:0.488110):0.118797, (Chicken_galGal3:0.395136,Lizard_anoCar1:0.513962):0.093688):0.151358, Frog_xenTro2:0.778272):0.174596, (((Tetraodon_tetNig1:0.203933,Fugu_fr2:0.239587):0.203949, (Stickleback_gasAcu1:0.314162,Medaka_oryLat1:0.501915):0.055354):0.346008, Zebrafish_danRer5:0.730028):0.174596); '_EOF_' # << happy emacs # create a species list from that file: sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' mm9OnTop.fullNames.nh \ | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \ | sed -e "s/.*_//; s/:.*//" | sort > species.list # verify that has 30 db names in it # create a stripped down nh file for use in autoMZ run echo \ `sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' mm9OnTop.fullNames.nh \ | sed -e "s/ / /g"` > tree.30.nh # that looks like, as a single line: (((((((( (((mm9 rn4) cavPor2) oryCun1) ((((((hg18 panTro2) ponAbe2) rheMac2) calJac1) otoGar1) tupBel1)) ((sorAra1 eriEur1) 
(((canFam2 felCat3) equCab1) bosTau3))) (dasNov1 (loxAfr1 echTel1))) monDom4) ornAna1) (galGal3 anoCar1)) xenTro2) (((tetNig1 fr2) (gasAcu1 oryLat1)) danRer5)) # verify all blastz's exist cat << '_EOF_' > listMafs.csh #!/bin/csh -fe cd /cluster/data/mm9/bed/multiz30way foreach db (`cat species.list`) set bdir = /cluster/data/mm9/bed/blastz.$db if (-e $bdir/mafRBestNet/chr1.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/chr1.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/chr1.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end '_EOF_' # << happy emacs chmod +x ./listMafs.csh # see what it says, shouldn't be anything with "mafs not found" ./listMafs.csh # copy net mafs to cluster-friendly storage, splitting chroms # into 50MB chunks to improve run-time # NOTE: splitting will be different for scaffold-based reference assemblies ssh hgwdev mkdir /cluster/data/mm9/bed/multiz30way/run.split cd /cluster/data/mm9/bed/multiz30way/run.split # this works by examining the rmsk table for likely repeat areas # that won't be used in blastz mafSplitPos mm9 50 mafSplit.bed ssh kki cd /cluster/data/mm9/bed/multiz30way/run.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set db = $1 set sdir = /san/sanvol1/scratch/mm9/splitStrictMafNet mkdir -p $sdir if (-e $sdir/$db) then echo "directory $sdir/$db already exists -- remove and retry" exit 1 endif set bdir = /cluster/data/mm9/bed/blastz.$db if (!
-e $bdir) then echo "directory $bdir not found" exit 1 endif mkdir -p $sdir/$db if (-e $bdir/mafRBestNet) then set mdir = $bdir/mafRBestNet else if (-e $bdir/mafSynNet) then set mdir = $bdir/mafSynNet else if (-e $bdir/mafNet) then set mdir = $bdir/mafNet else echo "$bdir maf dir not found" exit 1 endif echo $mdir foreach f ($mdir/*) set c = $f:t:r:r echo " $c" nice mafSplit mafSplit.bed $sdir/$db/ $f end echo "gzipping $sdir/$db mafs" nice gzip $sdir/$db/* endif echo $mdir > $db.done '_EOF_' # << happy emacs chmod +x doSplit.csh grep -v mm9 ../species.list > split.list cat << '_EOF_' > template #LOOP doSplit.csh $(path1) {check out line+ $(path1).done} #ENDLOOP '_EOF_' gensub2 split.list single template jobList para create jobList # 29 jobs # start these gently, this is a good load on the san filesystem para try # let that run to a couple completions, a few minutes, then again: para try # etc ... # Completed: 29 of 29 jobs # CPU time in finished jobs: 9476s 157.94m 2.63h 0.11d 0.000 y # IO & Wait Time: 1531s 25.51m 0.43h 0.02d 0.000 y # Average job time: 380s 6.33m 0.11h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1081s 18.02m 0.30h 0.01d # Submission to last job: 1391s 23.18m 0.39h 0.02d # ready for the multiz run ssh pk cd /cluster/data/mm9/bed/multiz30way # actually, the result directory here should be maf.split instead of maf mkdir -p maf run cd run mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # list chrom chunks, any db dir will do; better would be for the # splitter to generate this file # We temporarily use __ instead of . 
# to delimit chunk in filename
# so we can use $(root) to get basename
    # Build the list of distinct chunk names from the split maf file names:
    # strip the .maf.gz suffix and rewrite the first '.' as '__'.  The list
    # is written straight to chromChunks.list, the file that wc and gensub2
    # read below.
    find /san/sanvol1/scratch/mm9/splitStrictMafNet -type f \
        | while read F; do basename $F; done \
        | sed -e 's/.maf.gz//' -e 's/\./__/' | sort -u > chromChunks.list
    wc -l chromChunks.list
    #   75

    # autoMultiz.csh <chunk> <result.maf>
    #   Copies the tree and species list into a per-chunk directory under
    #   /scratch/tmp, gathers each non-mm9 species' pairwise maf for the
    #   chunk (gzipped or plain; a header-only maf when neither exists),
    #   runs autoMZ there, copies the chunk's maf to <result.maf> and
    #   removes the scratch directory.  The '__' in the chunk name is
    #   turned back into '.' to locate the pairwise input files.
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = mm9
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet
rm -fr $tmp
mkdir -p $tmp
cp ../tree.30.nh ../species.list $tmp
pushd $tmp
foreach s (`cat species.list`)
    set c2 = `echo $c | sed 's/__/./'`
    set in = $pairs/$s/$c2.maf
    set out = $db.$s.sing.maf
    if ($s == mm9) then
        continue
    endif
    if (-e $in.gz) then
        zcat $in.gz > $out
    else if (-e $in) then
        cp $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.30.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x autoMultiz.csh

    # one parasol job per chunk name in chromChunks.list
    cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/mm9/bed/multiz30way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << emacs

    gensub2 chromChunks.list single template jobList
    para create jobList
    # 75 jobs
    # three of these jobs failed with memory allocation error:
    # maf_project.v12: Ran out of memory trying to allocate 64.
# autoMZ.v1: command 'maf_project /scratch/tmp/mm9/multiz.chr10__1/_MZ_16482_lef # t.maf19 mm9 > /scratch/tmp/mm9/multiz.chr10__1/_MZ_16482_U1' failed # the 72 jobs run time: # Completed: 72 of 75 jobs # CPU time in finished jobs: 501143s 8352.38m 139.21h 5.80d 0.016 y # IO & Wait Time: 22628s 377.14m 6.29h 0.26d 0.001 y # Average job time: 7275s 121.24m 2.02h 0.08d # Longest finished job: 15957s 265.95m 4.43h 0.18d # Submission to last job: 16473s 274.55m 4.58h 0.19d # performed a para recover on the jobList and used the kki kluster # to run the last three jobs: # Completed: 3 of 3 jobs # CPU time in finished jobs: 50762s 846.03m 14.10h 0.59d 0.002 y # IO & Wait Time: 1795s 29.92m 0.50h 0.02d 0.000 y # Average job time: 17519s 291.98m 4.87h 0.20d # Longest finished job: 17887s 298.12m 4.97h 0.21d # Submission to last job: 17887s 298.12m 4.97h 0.21d # put the split maf results back together into single chroms ssh kkstore06 cd /cluster/data/mm9/bed/multiz30way # here is where the result directory maf should have already been maf.split mv maf maf.split mkdir maf # going to sort out the redundant header garbage to leave a cleaner maf for C in `ls maf.split | sed -e "s#__.*##" | sort -u` do echo ${C} head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \ sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf done # load tables for a look ssh hgwdev mkdir -p /gbdb/mm9/multiz30way/maf ln -s /cluster/data/mm9/bed/multiz30way/maf/*.maf \ /gbdb/mm9/multiz30way/maf cd /cluster/data/mm9/bed/multiz30way # this generates a large 1 Gb multiz30way.tab file in the directory # where it is running. Best to run this over in scratch.
cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/mm9/multiz30way/maf mm9 multiz30way # real 11m38.695s # Loaded 15881850 mafs in 34 files from /gbdb/mm9/multiz30way/maf # load summary table time nice -n +19 cat /gbdb/mm9/multiz30way/maf/*.maf \ | hgLoadMafSummary mm9 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz30waySummary stdin # Created 5648546 summary blocks from 154642836 components and 15872991 # mafs from stdin # real 19m44.355s # Gap Annotation # prepare bed files with gap info ssh kkstore06 mkdir /cluster/data/mm9/bed/multiz30way/anno cd /cluster/data/mm9/bed/multiz30way/anno mkdir maf run for DB in `cat ../species.list` do CDIR="/cluster/data/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done cd run rm -f nBeds sizes for DB in `grep -v mm9 ../../species.list` do echo "${DB} " ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done ssh kki cd /cluster/data/mm9/bed/multiz30way/anno/run cat << '_EOF_' > doAnno.csh #!/bin/csh -ef set dir = /cluster/data/mm9/bed/multiz30way set c = $1 cat $dir/maf/${c}.maf | \ nice mafAddIRows -nBeds=nBeds stdin /cluster/data/mm9/mm9.2bit $2 '_EOF_' # << happy emacs chmod +x doAnno.csh cat << '_EOF_' > template #LOOP ./doAnno.csh $(root1) {check out line+ /cluster/data/mm9/bed/multiz30way/anno/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs # there is no 16_random maf file cut -f1 /cluster/data/mm9/chrom.sizes | grep -v 16_random > chrom.list gensub2 chrom.list single template jobList para create jobList para try # Crashed: 1 jobs # CPU time in finished jobs: 18129s 302.15m 5.04h 0.21d 0.001 y # IO & Wait Time: 10273s 171.22m 2.85h 0.12d 0.000 y # Average job time: 861s 14.34m 0.24h 0.01d # Longest finished job: 4376s 72.93m 1.22h 0.05d # one job was too large for this memory: # job: 
./doAnno.csh chr1 /cluster/data/mm9/bed/multiz30way/anno/maf/chr1.maf # needLargeMem: Out of memory - request size 1129396 bytes, errno: 12 # going to hgwdev for this one: ssh hgwdev cd /cluster/data/mm9/bed/multiz30way/anno/run time ./doAnno.csh chr1 ../maf/chr1.maf # real 17m34.550s ssh hgwdev cd /cluster/data/mm9/bed/multiz30way/anno mkdir -p /gbdb/mm9/multiz30way/anno/maf ln -s /cluster/data/mm9/bed/multiz30way/anno/maf/*.maf \ /gbdb/mm9/multiz30way/anno/maf # by loading this into the table multiz30way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/mm9/multiz30way/anno/maf \ mm9 multiz30way # Loaded 16799995 mafs in 34 files from /gbdb/mm9/multiz30way/anno/maf # real 18m12.171s # This step may be useless. The original mafs should have the same # summary. cat /cluster/data/mm9/chrom.sizes | \ awk '{if ($2 > 1000000) { print $1 }}' | while read C do echo /gbdb/mm9/multiz30way/anno/maf/$C.maf done | xargs cat | \ hgLoadMafSummary mm9 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz30waySummary stdin # Created 5648546 summary blocks from 154642836 components and 16790208 # mafs from stdin # by loading this into the table multiz30waySummary, it will replace # the previously loaded table with the unannotated mafs # real 30m26.542s ############################################################################# ## Annotate 30-way multiple alignment with gene annotations ## (DONE - 2007-10-18 - Hiram) # Gene frames ## survey all genomes to see what type of gene track to use ssh hgwdev mkdir /cluster/data/mm9/bed/multiz30way/frames cd /cluster/data/mm9/bed/multiz30way/frames # dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`egrep -v "sorAra1|eriEur1|cavPor2" ../species.list`) echo -n "${db}: " echo -n "Tables: " set tables = `hgsql $db -N -e "show tables 
like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \ $table == "knownGene") then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql mm9 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql mm9 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh # given this output, manually sorted for this display: # hg18: Tables: ensGene: 43569, knownGene: 56722, mgcGenes: 29028, refGene: 25902, Mrnas: 208990 # mm9: Tables: knownGene: 49409, mgcGenes: 22947, refGene: 21004, Mrnas: 5092390 # rn4: Tables: ensGene: 33745, knownGene: 8202, mgcGenes: 5400, refGene: 14333, Mrnas: 34471 # canFam2: Tables: ensGene: 25568, refGene: 833, Mrnas: 1708 # danRer5: Tables: ensGene: 31740, mgcGenes: 13037, refGene: 12879, Mrnas: 33184 # fr2: Tables: ensGene: 22102, Mrnas: 1098 # gasAcu1: Tables: ensGene: 28840, Mrnas: 2326 # monDom4: Tables: ensGene: 33878, refGene: 163, Mrnas: 398 # ornAna1: Tables: ensGene: 25981, refGene: 3, Mrnas: 141 # oryLat1: Tables: ensGene: 23087, Mrnas: 980 # panTro2: Tables: ensGene: 32852, refGene: 26160, Mrnas: 1277 # rheMac2: Tables: ensGene: 38561, refGene: 412, Mrnas: 3169 # bosTau3: Tables: mgcGenes: 9617, refGene: 10287, Mrnas: 26808 # equCab1: Tables: refGene: 304, Mrnas: 1396 # felCat3: Tables: refGene: 401, Mrnas: 882 # galGal3: Tables: refGene: 4210, Mrnas: 31217 # xenTro2: Tables: mgcGenes: 6255, refGene: 7086, Mrnas: 19155 # anoCar1: Tables: Mrnas: 12 # calJac1: Tables: Mrnas: 949 # dasNov1: Tables: Mrnas: 18 # echTel1: Tables: Mrnas: 0 # loxAfr1: Tables: Mrnas: 12 # oryCun1: Tables: Mrnas: 3786 # otoGar1: Tables: Mrnas: 0 # ponAbe2: Tables: Mrnas: 2 # tetNig1: Tables: Mrnas: 
99495 # tupBel1: Tables: Mrnas: 47 # use knownGene for hg18, mm9 # use ensGene for rn4, canFam2, danRer5, fr2, gasAcu1, monDom4, ornAna1, # oryLat1, panTro2, rheMac2 # use refGene for bosTau3, xenTro2 # use Mrnas for galGal3, tetNig1 # barely can use Mrnas for equCab1, felCat3, anoCar1, dasNov1, # loxAfr1, oryCun1, ponAbe2 # no annotations for calJac1, echTel1, otoGar1, sorAra1, eriEur1, cavPor2 # tupBel1 mkdir genes # knownGene for DB in hg18 mm9 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # ensGene for DB in rn4 canFam2 danRer5 fr2 gasAcu1 monDom4 \ ornAna1 oryLat1 panTro2 rheMac2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # refGene for DB in bosTau3 xenTro2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from refGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # and finally, using the mrna tables # use Mrnas for galGal3 tetNig1 equCab1 felCat3 anoCar1 dasNov1 # loxAfr1 oryCun1 ponAbe2 for DB in galGal3 tetNig1 equCab1 felCat3 anoCar1 dasNov1 \ loxAfr1 oryCun1 ponAbe2 do tmpExt=`mktemp temp.XXXXXX` tmpMrnaCds=${DB}.mrna-cds.${tmpExt} tmpMrna=${DB}.mrna.${tmpExt} tmpCds=${DB}.cds.${tmpExt} hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ $DB > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > 
${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \ genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz rm -f $tmpExt echo "${DB} done" done ################################################## # redmine GB - Feature #480 - missing self frames on multiz # NwayFrames tables (DONE 2010-07-29) # re-run the genePredToMafFrames with mm9 genes/mm9 ssh kkstore06 cd /cluster/data/mm9/bed/multiz30way/frames # leaving out calJac1, echTel1, otoGar1, sorAra1, eriEur1, cavPor2 # tupBel1 since no gene preds there time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames mm9 stdin stdout mm9 genes/mm9.gp.gz rn4 genes/rn4.gp.gz hg18 genes/hg18.gp.gz rheMac2 genes/rheMac2.gp.gz ponAbe2 genes/ponAbe2.gp.gz panTro2 genes/panTro2.gp.gz equCab1 genes/equCab1.gp.gz dasNov1 genes/dasNov1.gp.gz oryCun1 genes/oryCun1.gp.gz felCat3 genes/felCat3.gp.gz canFam2 genes/canFam2.gp.gz loxAfr1 genes/loxAfr1.gp.gz bosTau3 genes/bosTau3.gp.gz monDom4 genes/monDom4.gp.gz ornAna1 genes/ornAna1.gp.gz galGal3 genes/galGal3.gp.gz anoCar1 genes/anoCar1.gp.gz xenTro2 genes/xenTro2.gp.gz gasAcu1 genes/gasAcu1.gp.gz danRer5 genes/danRer5.gp.gz tetNig1 genes/tetNig1.gp.gz fr2 genes/fr2.gp.gz oryLat1 genes/oryLat1.gp.gz | gzip > multiz30way.mafFrames.gz) > frames.log 2>&1 # see what it looks like in terms of number of annotations per DB: zcat multiz30way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n 67 loxAfr1 79 dasNov1 116 ponAbe2 491 anoCar1 1807 tetNig1 2429 felCat3 4892 equCab1 9156 oryCun1 85568 bosTau3 118192 galGal3 129442 xenTro2 185607 mm9 208239 rn4 224420 rheMac2 226866 panTro2 228563 hg18 243074 canFam2 329523 danRer5 334418 ornAna1 347708 oryLat1 369267 monDom4 374016 gasAcu1 380839 fr2 # load the resulting file ssh hgwdev cd /cluster/data/mm9/bed/multiz30way/frames time nice -n +19 hgLoadMafFrames mm9 multiz30wayFrames \ multiz30way.mafFrames.gz # real 
0m43.134s # enable the trackDb entries: # frames multiz30wayFrames # irows on ############################################################################# # phastCons 30-way (DONE - 2007-10-16 - Hiram) # split 30way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh kki mkdir /cluster/data/mm9/bed/multiz30way/msa.split cd /cluster/data/mm9/bed/multiz30way/msa.split mkdir -p /san/sanvol1/scratch/mm9/multiz30way/cons/ss cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/mm9/bed/multiz30way/maf set WINDOWS = /san/sanvol1/scratch/mm9/multiz30way/cons/ss pushd $WINDOWS set c = $1 rm -fr $c mkdir $c twoBitToFa -seq=$c /scratch/data/mm9/mm9.2bit /scratch/tmp/mm9.$c.fa # need to truncate odd-ball scaffold/chrom names that include dots # as phastCons utils can't handle them set CLEAN_MAF = /scratch/tmp/$c.clean.maf.$$ perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $CLEAN_MAF /cluster/bin/phast/$MACHTYPE/msa_split $CLEAN_MAF -i MAF \ -M /scratch/tmp/mm9.$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 rm -f $CLEAN_MAF /scratch/tmp/mm9.$c.fa popd date >> $c.done '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../maf | sed -e "s/.maf//" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc - # completed shorter jobs in a few hours, there is a problem of swapping # going on here, two of these jobs on a single node can consume all of its # memory and then some. 
Three jobs failed to complete, finish them up # manually on hgwdev, the processes grow to over 8 Gb in memory for chr1, # chr11 and chr2 # Estimate phastCons parameters time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \ /san/sanvol1/scratch/mm9/multiz30way/cons/ss/chrY/chrY.1-10000000.ss \ --tree "(((((((((((mm9,rn4),cavPor2),oryCun1),((((((hg18,panTro2),ponAbe2),rheMac2),calJac1),otoGar1),tupBel1)),((sorAra1,eriEur1),(((canFam2,felCat3),equCab1),bosTau3))),(dasNov1,(loxAfr1,echTel1))),monDom4),ornAna1),(galGal3,anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat1)),danRer5))" \ --out-root starting-tree # real 107m46.703s # Tried this on chr13 too: # real 4619m42.984s # that is almost 77 hours on hgwdev == 3.2 days # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.400 # This 0.400 is used in the --gc argument below # got 0.404 with chrM.starting-tree.mod # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh pk mkdir -p /cluster/data/mm9/bed/multiz30way/cons/run.cons cd /cluster/data/mm9/bed/multiz30way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. It is one of: # all glires placentals cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.2007-05-04 set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set tmp = /scratch/tmp/$f set cons = /cluster/data/mm9/bed/multiz30way/cons mkdir -p $tmp set san = /san/sanvol1/scratch/mm9/multiz30way/cons if (-s $cons/$grp/$grp.non-inf) then cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf . cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp else cp -p $cons/$grp/$grp.mod . 
cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c sleep 4 touch $san/$grp/pp/$c $san/$grp/bed/$c rm -f $san/$grp/pp/$c/$f.pp rm -f $san/$grp/bed/$c/$f.bed mv $tmp/$f.pp $san/$grp/pp/$c mv $tmp/$f.bed $san/$grp/bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/all/bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/mm9/multiz30way/cons ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \ /cluster/data/mm9/bed/multiz30way/cons/run.cons/ss.list popd # run for all species cd .. mkdir -p all run.cons/all cd all cp ../../chrY.starting-tree.mod all.mod # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/all/bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../ss.list single template jobList para create jobList para try ... check ... push ... etc. 
# Completed: 294 of 294 jobs # CPU time in finished jobs: 25724s 428.73m 7.15h 0.30d 0.001 y # IO & Wait Time: 8951s 149.19m 2.49h 0.10d 0.000 y # Average job time: 118s 1.97m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 226s 3.77m 0.06h 0.00d # Submission to last job: 582s 9.70m 0.16h 0.01d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/mm9/multiz30way/cons/all cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/mm9/bed/multiz30way/cons/all # load into database ssh hgwdev cd /cluster/data/mm9/bed/multiz30way/cons/all time nice -n +19 hgLoadBed mm9 phastConsElements30way mostConserved.bed # Loaded 2782368 elements of size 5 # real 1m15.673s # compare with previous tracks hgsql mm9 -s -N -e "select count(*) from phastConsElements30way" # 2782368 hgsql mm8 -s -N -e "select count(*) from phastConsElements17way" # 1883370 # Try for 5% overall cov, and 70% CDS cov # --rho .31 --expected-length 45 --target-coverage .3 # chrY mod tree featureBits mm9 -enrichment refGene:cds phastConsElements30way # refGene:cds 1.167%, phastConsElements30way 4.789%, # both 0.582%, cover 49.90%, enrich 10.42x featureBits mm9 -enrichment knownGene:cds phastConsElements30way # knownGene:cds 1.278%, phastConsElements30way 4.789%, # both 0.627%, cover 49.03%, enrich 10.24x # --rho .31 --expected-length 45 --target-coverage .3 elim non-autho # chr13 mod tree featureBits mm9 -enrichment refGene:cds mostConserved.bed # refGene:cds 1.167%, mostConserved.bed 4.128%, # both 0.614%, cover 52.59%, enrich 12.74x # --rho .31 --expected-length 45 --target-coverage .3 elim non-autho # 28-way mod tree adjusted to 30-way featureBits mm9 -enrichment refGene:cds mostConserved.bed # refGene:cds 1.167%, mostConserved.bed 5.841%, both 0.862%, cover # 73.90%, enrich 12.65x featureBits mm8 
-enrichment refGene:cds phastConsElements17way # refGene:cds 1.188%, phastConsElements17way 5.398%, # both 0.832%, cover 70.05%, enrich 12.98x featureBits mm8 -enrichment knownGene:cds phastConsElements17way # knownGene:cds 1.109%, phastConsElements17way 5.398%, # both 0.776%, cover 69.99%, enrich 12.97x # Create merged posterior probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. cd /san/sanvol1/scratch/mm9/multiz30way/cons/all cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons30wayScores for D in pp/chr* do C=${D/pp\/} out=phastCons30wayScores/${C}.data.gz echo "${D} > ${C}.data.gz" ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \ gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # Create merged posterior probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. cd /san/sanvol1/scratch/mm9/multiz30way/cons/all # sort by chromName, chromStart so that items are in numerical order # for wigEncode for D in pp/chr* do ls $D/*.pp | sort -n -t\. -k2 done | xargs cat \ | wigEncode -noOverlap stdin phastCons30way.wig phastCons30way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # Load gbdb and database with wiggle.
ssh hgwdev cd /cluster/data/mm9/bed/multiz30way/cons/all ln -s `pwd`/phastCons30way.wib /gbdb/mm9/multiz30way/phastCons30way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \ phastCons30way phastCons30way.wig # real 0m42.728s # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/mm9/bed/multiz30way time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm9 phastCons30way > histogram.data 2>&1 # real 28m24.388s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm9 Histogram phastCons30way track" set xlabel " phastCons30way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ### Create a phastCons data set for Euarchontoglires # setup euarchontoglires-only run ssh pk cd /cluster/data/mm9/bed/multiz30way/cons mkdir euarchontoglires run.cons/euarchontoglires cd euarchontoglires # euarchontoglires-only: exclude all but these for phastCons tree: /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \ --prune-all-but=mm9,rn4,cavPor2,oryCun1,hg18,panTro2,ponAbe2,rheMac2,calJac1,otoGar1,tupBel1 \ > euarchontoglires.mod # and place the removed ones in the non-inf file so phastCons will # truly ignore them: echo "sorAra1,eriEur1,canFam2,felCat3,equCab1,bosTau3,dasNov1,loxAfr1,echTel1,monDom4,ornAna1,galGal3,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat1,danRer5" \ > euarchontoglires.non-inf cd ../run.cons/euarchontoglires # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template 
#LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires/bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../ss.list single template jobList para create jobList para try ... check ... push ... etc. # Three of these jobs fail to produce any output: # chr5_random/chr5_random.1-357350.bed # chr7_random/chr7_random.1-362490.bed # chrY_random/chrY_random.50000001-58682461.bed # Completed: 291 of 294 jobs # Crashed: 3 jobs # CPU time in finished jobs: 17184s 286.40m 4.77h 0.20d 0.001 y # IO & Wait Time: 30139s 502.31m 8.37h 0.35d 0.001 y # Average job time: 163s 2.71m 0.05h 0.00d # Longest finished job: 296s 4.93m 0.08h 0.00d # Submission to last job: 2775s 46.25m 0.77h 0.03d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/mm9/bed/multiz30way/cons/euarchontoglires # load into database ssh hgwdev cd /cluster/data/mm9/bed/multiz30way/cons/euarchontoglires time nice -n +19 hgLoadBed mm9 phastConsElements30wayEuarch \ mostConserved.bed # Loaded 1021674 elements of size 5 # real 0m23.402s # verify coverage featureBits mm9 phastConsElements30wayEuarch # 103492546 bases of 2620346127 (3.950%) in intersection # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # currently doesn't matter where this is performed, the san is the same # network distance from all machines. # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires mkdir downloads for D in pp/chr* do C=${D/pp\//} ls $D/*.pp | sort -n -t\. 
-k2 | xargs cat | gzip -c \ > downloads/${C}.euarchontoglires.pp.data.gz echo $D $C done done # Create merged posterior probability file and wiggle track data files cd /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires ls downloads/chr*.data.gz | xargs zcat \ | wigEncode -noOverlap stdin phastCons30wayEuarch.wig phastCons30wayEuarch.wib # Converted stdin, upper limit 1.00, lower limit 0.00 ## load table with wiggle data ssh hgwdev cd /cluster/data/mm9/bed/multiz30way/cons/euarchontoglires cp -p /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires/*.wi? . ln -s `pwd`/phastCons30wayEuarch.wib \ /gbdb/mm9/multiz30way/phastCons30wayEuarch.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \ phastCons30wayEuarch phastCons30wayEuarch.wig # real 0m44.161s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm9 phastCons30wayEuarch > histogram.data 2>&1 # real 3m22.364s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm9 Histogram phastCons30wayEuarch track" set xlabel " phastCons30wayEuarch score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ### Create a phastCons data set for Placentals # setup placental-only run ssh pk cd /cluster/data/mm9/bed/multiz30way/cons mkdir placental run.cons/placental cd placental # placental-only: exclude all but these for phastCons tree: /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
--prune-all-but=mm9,rn4,cavPor2,oryCun1,hg18,panTro2,ponAbe2,rheMac2,calJac1,otoGar1,tupBel1,sorAra1,eriEur1,canFam2,felCat3,equCab1,bosTau3,dasNov1,loxAfr1,echTel1 \ > placental.mod # and place the removed ones in the non-inf file so phastCons will # truly ignore them: echo "monDom4,ornAna1,galGal3,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat1,danRer5" \ > placental.non-inf cd ../run.cons/placental # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/placental/bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../ss.list single template jobList para create jobList para try ... check ... push ... etc. # One of these jobs fails to produce any output: # chr5_random/chr5_random.1-357350.bed # Completed: 293 of 294 jobs # Crashed: 1 jobs # CPU time in finished jobs: 21121s 352.01m 5.87h 0.24d 0.001 y # IO & Wait Time: 33985s 566.42m 9.44h 0.39d 0.001 y # Average job time: 188s 3.13m 0.05h 0.00d # Longest finished job: 324s 5.40m 0.09h 0.00d # Submission to last job: 3511s 58.52m 0.98h 0.04d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/mm9/multiz30way/cons/placental cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/mm9/bed/multiz30way/cons/placental # load into database ssh hgwdev cd /cluster/data/mm9/bed/multiz30way/cons/placental time nice -n +19 hgLoadBed mm9 phastConsElements30wayPlacental \ mostConserved.bed # Loaded 1990870 elements of size 5 # real 0m48.084s # verify coverage featureBits mm9 phastConsElements30wayPlacental # 111626429 bases of 2620346127 (4.260%) in intersection # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # 
currently doesn't matter where this is performed, the san is the same # network distance from all machines. # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/mm9/multiz30way/cons/placental mkdir downloads for D in pp/chr* do C=${D/pp\//} ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \ > downloads/${C}.placental.pp.data.gz echo $D $C done done # Create merged posterior probability file and wiggle track data files cd /san/sanvol1/scratch/mm9/multiz30way/cons/placental ls downloads/chr*.data.gz | xargs zcat \ | wigEncode -noOverlap stdin phastCons30wayPlacental.wig \ phastCons30wayPlacental.wib # Converted stdin, upper limit 1.00, lower limit 0.00 ## load table with wiggle data ssh hgwdev cd /cluster/data/mm9/bed/multiz30way/cons/placental cp -p /san/sanvol1/scratch/mm9/multiz30way/cons/placental/*.wi? . ln -s `pwd`/phastCons30wayPlacental.wib \ /gbdb/mm9/multiz30way/phastCons30wayPlacental.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \ phastCons30wayPlacental phastCons30wayPlacental.wig # real 0m44.585s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm9 phastCons30wayPlacental > histogram.data 2>&1 # real 28m24.388s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm9 Histogram phastCons30wayPlacental track" set xlabel " phastCons30wayPlacental score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png &
############################################################################# ## Downloads for 30way Conservation (DONE - 2007-11-01 - Hiram) ssh kkstore06 mkdir /cluster/data/mm9/bed/multiz30way/downloads/phastCons30wayScores cd /cluster/data/mm9/bed/multiz30way/downloads/phastCons30wayScores mkdir placental euarchontoglires all cd all cp -p \ /san/sanvol1/scratch/mm9/multiz30way/cons/all/phastCons30wayScores/*.data.gz . cd ../placental cp -p \ /san/sanvol1/scratch/mm9/multiz30way/cons/placental/downloads/*.data.gz . cd ../euarchontoglires cp -p \ /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires/downloads/*.data.gz . # rebuilt 2007-12-27 to fix difficulty in mafFrags when species.lst # did not have mm9 as the first one # upstream mafs (mafFrags takes a while) ssh kkstore06 cd /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf # bash script #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits mm9 refGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags mm9 multiz30way \ stdin stdout \ -orgs=/cluster/data/mm9/bed/multiz30way/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done md5sum up*.gz >> md5sum.txt ssh kkstore06 mkdir /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf_qual cp -p ../../../qual/maf/*.maf . time nice -n +19 gzip *.maf # real 77m3.592s time nice -n +19 md5sum *.gz > md5sum.txt # real 4m52.044s mkdir /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf cp -p ../../../anno/maf/*.maf . 
time nice -n +19 gzip *.maf # real 86m2.341s time nice -n +19 md5sum *.gz > md5sum.txt # real 4m30.087s # create syn.net files for downloads for those organisms which # used the mafSynNet in the multiz30way ssh kkstore06 cd /cluster/data/mm9/bed for DB in rn4 hg18 rheMac2 ponAbe2 panTro2 equCab1 canFam2 bosTau3 do cd /cluster/data/mm9/bed/blastz.${DB}/axtChain time nice -n +19 netFilter -syn mm9.${DB}.net.gz \ | gzip -c > mm9.${DB}.syn.net.gz ls -og mm9.${DB}.syn.net.gz md5sum mm9.${DB}.syn.net.gz >> md5sum.txt done for DB in calJac1 cavPor2 tupBel1 otoGar1 dasNov1 oryCun1 felCat3 \ loxAfr1 eriEur1 sorAra1 echTel1 do cd /cluster/data/mm9/bed/blastz.${DB}/axtChain ls -l mm9.${DB}.rbest.net.gz md5sum mm9.${DB}.rbest.net.gz >> md5sum.txt md5sum mm9.${DB}.rbest.chain.gz >> md5sum.txt grep rbest md5sum.txt done # create symlinks to make everything show up ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/mm9 for DB in ?n4 ?g18 ?heMac2 ?onAbe2 ?anTro2 ?quCab1 ?anFam2 ?osTau3 do ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.syn.net.gz ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.syn.net.gz \ vs${DB}/ ls -Lld vs${DB}/mm9.*.syn.net.gz done for DB in ?alJac1 ?avPor2 ?upBel1 ?toGar1 ?asNov1 ?ryCun1 ?elCat3 \ ?oxAfr1 ?riEur1 ?orAra1 ?chTel1 do ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz \ vs${DB}/ ls -Lld vs${DB}/mm9.${DB}.rbest.net.gz grep rbest vs${DB}/md5sum.txt done for DB in ?alJac1 ?avPor2 ?upBel1 ?toGar1 ?asNov1 ?ryCun1 ?elCat3 \ ?oxAfr1 ?riEur1 ?orAra1 ?chTel1 do ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz \ vs${DB}/ ls -Lld vs${DB}/mm9.${DB}.rbest.net.gz ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.chain.gz ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.chain.gz \ vs${DB}/ ls -Lld 
vs${DB}/mm9.${DB}.rbest.chain.gz grep rbest vs${DB}/md5sum.txt done ########################################################################### # # BUILD miRNA TRACK (DONE - 2007-10-05 - Fan) # updated data from: Michel.Weber@ibcg.biotoul.fr # notify them when done. ssh hgwdev cd /cluster/data/mm9/bed mkdir miRNA-2007-10-05 cd miRNA-2007-10-05 # save the miRNAtrack-mm9.txt file from email cat miRNAtrack-mm9.txt|sed -e 's/ /\t/g' > miRNA.tab hgLoadBed mm9 miRNA miRNA.tab # Add the miRNA section to makeDb/trackDb/mouse/mm9/trackDb.ra vi ~/src/hg/makeDb/trackDb/mouse/mm9/trackDb.ra # check previous release track before update featureBits mm8 miRNA #33398 bases of 2567283971 (0.001%) in intersection featureBits mm9 miRNA #39718 bases of 2620346127 (0.002%) in intersection ########################################################################### # RE-BUILD miRNA TRACK (DONE - 2008-05-29 - Fan) # updated data from: Michel.Weber@ibcg.biotoul.fr # notify them when done. ssh hgwdev cd /cluster/data/mm9/bed mkdir miRNA-2008-05-28 cd miRNA-2008-05-28 # save the mouse_miRNA_may2008.doc as mouse_miRNA_may2008.txt # and replace all blanks with tabs. cp mouse_miRNA_may2008.txt miRNA.tab hgLoadBed mm9 miRNA miRNA.tab # check previous release track before update featureBits mm8 miRNA #33398 bases of 2567283971 (0.001%) in intersection featureBits mm9 miRNA #43236 bases of 2620346127 (0.002%) in intersection ############################################################################# # N-SCAN gene predictions (nscanGene) - (2006-08-30 markd) # obtained NSCAN predictions from michael brent's group # at WUSTL cd /cluster/data/mm9/bed/nscan/ wget http://mblab.wustl.edu/predictions/mouse/mm9/mm9.gtf wget http://mblab.wustl.edu/predictions/mouse/mm9/mm9.prot.fa wget http://mblab.wustl.edu/predictions/mouse/mm9/readme.txt bzip2 mm9.* chmod a-w * mv ardor.wustl.edu/jeltje/mm9/chr_ptx .
rm -rf ardor.wustl.edu rm chr_*/index.html* gzip chr_*/* chmod a-w chr_*/*.gz # load track ldHgGene -bin -gtf -genePredExt mm9 nscanGene mm9.gtf.bz2 hgPepPred mm9 generic nscanPep mm9.prot.fa.bz2 rm *.tab # update trackDb; need a mm9-specific page to describe informants mouse/mm9/nscanGene.html (copy from hg18 and edit) mouse/mm9/trackDb.ra # changed search regex to termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9] ######################################################################### # CPGISLANDS (DONE - 2007-10-25 - Hiram) ssh hgwdev mkdir /cluster/data/mm9/bed/cpgIsland cd /cluster/data/mm9/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make # gcc readseq.c cpg_lh.c -o cpglh.exe cd ../.. ln -s hg3rdParty/cpgIslands/cpglh.exe . # cpglh.exe requires hard-masked (N) .fa's. # make the hard masked sequences from these soft masked sequences ssh kkstore06 time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa do echo "maskOutFa ${CHR} hard ${CHR}.masked" nice -n +19 maskOutFa ${CHR} hard ${CHR}.masked done # about 2 minutes # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. 
cd /cluster/data/mm9/bed/cpgIsland time for F in ../../*/chr*.fa.masked do FA=${F/*\/} C=${FA/.fa.masked/} echo "./cpglh.exe ${FA} > ${C}.cpg" nice -n +19 ./cpglh.exe ${F} > ${C}.cpg done > cpglh.out 2>&1 & # about 3 minutes # Several chroms have 0 results: # -rw-rw-r-- 1 0 Oct 25 11:11 chr16_random.cpg # -rw-rw-r-- 1 0 Oct 25 11:12 chr3_random.cpg # -rw-rw-r-- 1 0 Oct 25 11:12 chr5_random.cpg # -rw-rw-r-- 1 0 Oct 25 11:13 chr7_random.cpg # -rw-rw-r-- 1 0 Oct 25 11:13 chrM.cpg # -rw-rw-r-- 1 0 Oct 25 11:13 chrX_random.cpg # -rw-rw-r-- 1 0 Oct 25 11:13 chrY.cpg # Transform cpglh output to bed + cat << '_EOF_' > filter.awk { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << happy emacs awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed ssh hgwdev cd /cluster/data/mm9/bed/cpgIsland hgLoadBed mm9 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Loaded 15963 elements of size 10 featureBits mm9 cpgIslandExt # 10496250 bases of 2620346127 (0.401%) in intersection featureBits mm8 cpgIslandExt # 10456823 bases of 2567283971 (0.407%) in intersection featureBits mm7 cpgIslandExt # 10439328 bases of 2583394090 (0.404%) in intersection featureBits mm6 cpgIslandExt # 10432360 bases of 2597150411 (0.402%) in intersection featureBits mm5 cpgIslandExt # 10422989 bases of 2615483787 (0.399%) in intersection featureBits mm4 cpgIsland # 11109692 bases of 2627444668 (0.423%) in intersection featureBits mm3 cpgIsland # 10102968 bases of 2505900260 (0.403%) in intersection ############################################################################# # LIFTOVER (DROPUNDER) TO MM8 (DONE - 2007-11-05 - Hiram) ssh kkstore06 screen # use a screen to control this job # -debug run to create run dir, preview scripts... 
doSameSpeciesLiftOver.pl -debug mm9 mm8 \ -ooc /san/sanvol1/scratch/mm9/11.ooc # Real run: cd /cluster/data/mm9/bed/blat.mm8.2007-11-05 time nice -n +19 doSameSpeciesLiftOver.pl mm9 mm8 \ -ooc /san/sanvol1/scratch/mm9/11.ooc > do.log 2>&1 & ######################################################################## # ANNOTATE 30-WAY ALIGNMENT WITH QUALITY DATA (2007-11-07 rico at bx.psu.edu) # # The basic idea here is to create a qac file which has quality data for each # (chromosome/scaffold/etc) and then index the qac file. Once this is done, # mafAddQRows can be used to add the quality data to a given maf. The agp # files are used so that gaps can be represented in the qac files as a special # value. ## create .qac and .qdx files for each species in the 30-way alignment ## results are stored in /cluster/store12/rico/quality o human (hg18) Unable to find quality data. o chimp (panTro2) in.agp = cat /cluster/data/panTro2/wustl/*.agp > all.agp in.qac = /cluster/data/panTro2/bed/quality/qac/panTro2.qac qacAddGapIdx in.agp in.qac panTro2.qac panTro2.qdx o rhesus (rheMac2) in.agp: /cluster/data/rheMac2/downloads (cat v1.edit4.chrome.ctgs.final.fix.agp; sed -e 's/^ChrUr/chrUr/' v1.edit4.ChrUr.ctgs.agp ) > all.agp in.qa = /cluster/data/rheMac2/qual/rheMac2.qual.qv.gz qaAgpToQacIdx in.agp in.qa rheMac2.qac rheMac2.qdx o bushbaby (otoGar1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/bushbaby/otoGar1 in.agp = assembly.agp in.qa = Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx in.agp in.qa otoGar1.qac otoGar1.qdx o treeshrew (tupBel1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/treeShrew/tupBel1 in.agp = assembly.agp in.qa = Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx in.agp in.qa tupBel1.qac tupBel1.qdx o rat (rn4) in.agp: /cluster/data/rn4 #!/bin/sh rm -f rn4.agp for chrom in `awk '{print $1}' chrom.sizes` do num=`echo $chrom | cut -dr -f2- | cut -d_ -f1` if [ -f "$num/${chrom}.agp" ]; then cat $num/${chrom}.agp >> rn4.agp else echo "Missing agp file for 
$chrom" exit 1 fi done in.qa: /cluster/data/rn4/downloads #!/bin/sh rm -f rn4.qa for file in *.qual.gz do echo -n "Processing $file ... " chrom=`echo $file | sed -e 's/^Rnor3.4//;s/\.fa\.qual\.gz$//' | tr '-' '_'` (echo ">$chrom" ; gzip -dc $file | tail +2) >> rn4.qa echo "done." done qaAgpToQacIdx in.agp in.qa rn4.qac rn4.qdx o mouse (mm9) Unable to find quality data. o guinea pig (cavPor2) in.agp = /cluster/data/cavPor2/downloads/assembly.agp in.qa = /cluster/data/cavPor2/downloads/Draft_v2.agp.chromosome.qual.gz qaAgpToQacIdx in.agp in.qa cavPor2.qac cavPor2.qdx o rabbit (oryCun1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/rabbit/oryCun1 in.agp = assembly.agp in.qa = Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx in.agp in.qa oryCun1.qac oryCun1.qdx o shrew (sorAra1) in.agp = /cluster/data/sorAra1/downloads/assembly.agp in.qa = /cluster/data/sorAra1/downloads/Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx in.agp in.qa sorAra1.qac sorAra1.qdx o hedgehog (eriEur1) in.agp = /cluster/data/eriEur1/downloads/assembly.agp in.qa = /cluster/data/eriEur1/downloads/Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx in.agp in.qa eriEur1.qac eriEur1.qdx o dog (canFam2) in.agp = /cluster/store9/canFam2/broad/UCSC_Dog2.0.agp in.qac = /cluster/store9/canFam2/bed/quality/chrom.qac qacAddGapIdx in.agp in.qac canFam2.qac canFam2.qdx o cat (felCat3) in.agp = /cluster/data/felCat3/downloads/assembly.agp in.qa = /cluster/data/felCat3/downloads/Draft_v3.agp.chromosome.qual.gz qaAgpToQacIdx in.agp in.qa felCat3.qac felCat3.qdx o horse (equCab1) in.agp = /cluster/data/equCab1/downloads/assembly.agp in.qa = /cluster/data/equCab1/downloads/Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx in.agp in.qa equCab1.qac equCab1.qdx o cow (bosTau3) in.agp = /cluster/data/bosTau3/fixup/UCSC.agp in.qac = /cluster/data/bosTau3/fixup/chrom.qac qacAddGapIdx in.agp in.qac bosTau3.qac bosTau3.qdx o armadillo (dasNov1) /cluster/data/dasNov1/broad combineQuals assembly.agp.gz assembly.quals.gz 
combined.quals qaAgpToQacIdx assembly.agp.gz combined.quals dasNov1.qac dasNov1.qdx o elephant (loxAfr1) /cluster/data/loxAfr1/broad combineQuals assembly.agp assembly.quals.gz combined.quals qaAgpToQacIdx assembly.agp combined.quals loxAfr1.qac loxAfr1.qdx o tenrec (echTel1) /cluster/data/echTel1/broad combineQuals assembly.agp assembly.quals.gz combined.quals qaAgpToQacIdx assembly.agp combined.quals echTel1.qac echTel1.qdx o opossum (monDom4) /cluster/data/monDom4/broad.mit.edu in.qa = gzip -dc Monodelphis4.0.agp.chromosome.qual.gz \ | sed -e 's/^>\([^.]*\)\.1-.*/>chr\1/;/^>.*Monodelphis4.0)/d' > monDom4.qa in.agp = Monodelphis4.0.agp qaAgpToQacIdx in.agp in.qa monDom4.qac monDom4.qdx o platypus (ornAna1) Unable to find quality data. o chicken (galGal3) Unable to find quality data. o lizard (anoCar1) in.agp = /cluster/data/anoCar1/downloads/assembly.agp in.qac = /cluster/data/anoCar1/downloads/scaffold.lifted.qac qacAddGapIdx in.agp in.qac anoCar1.qac anoCar1.qdx o frog (xenTro2) Unable to find quality data. o tetraodon (tetNig1) Unable to find quality data. o fugu (fr2) Unable to find quality data. o stickleback (gasAcu1) in.agp = /cluster/data/gasAcu1/downloads/UCSC.gasAcu1.agp in.qa = /cluster/data/gasAcu1/downloads/UCSC.gasAcu1.qual qaAgpToQacIdx in.agp in.qa gasAcu1.qac gasAcu1.qdx o medaka (oryLat1) in.agp = /cluster/data/oryLat1/downloads/chr.agp.txt-fixed in.qac = /cluster/data/oryLat1/bed/qual/fixed.chroms.qac qacAddGapIdx in.agp in.qac oryLat1.qac oryLat1.qdx o zebrafish (danRer5) Unable to find quality data. o orangutan (ponAbe2) Unable to find quality data. o marmoset (calJac1) Unable to find quality data.
## copy all .qac and .qdx files to the san cp *.{qac,qdx} /san/sanvol1/rico/quality ## create species list (species.lst) containing the following anoCar1 /san/sanvol1/rico/quality bosTau3 /san/sanvol1/rico/quality canFam2 /san/sanvol1/rico/quality cavPor2 /san/sanvol1/rico/quality dasNov1 /san/sanvol1/rico/quality echTel1 /san/sanvol1/rico/quality equCab1 /san/sanvol1/rico/quality eriEur1 /san/sanvol1/rico/quality felCat3 /san/sanvol1/rico/quality gasAcu1 /san/sanvol1/rico/quality loxAfr1 /san/sanvol1/rico/quality monDom4 /san/sanvol1/rico/quality oryCun1 /san/sanvol1/rico/quality oryLat1 /san/sanvol1/rico/quality otoGar1 /san/sanvol1/rico/quality panTro2 /san/sanvol1/rico/quality rheMac2 /san/sanvol1/rico/quality rn4 /san/sanvol1/rico/quality sorAra1 /san/sanvol1/rico/quality tupBel1 /san/sanvol1/rico/quality ## the following script will add quality data to each of the mafs cat > addQData << 'EOF' #!/bin/sh INPUT_DIR=/cluster/data/mm9/bed/multiz30way/anno/maf OUTPUT_DIR=/cluster/data/mm9/bed/multiz30way/qual/maf for maf in `ls -1Sr ${INPUT_DIR}/*.maf` do file=`basename $maf` mafAddQRows species.lst $maf ${OUTPUT_DIR}/$file done 'EOF' # << emacs ######################################################################### ### IGTC (Int'l GeneTrap Consortium) (DONE - 2007-10-01 - angie) ### Doug Stryke in Tom Ferrin's lab ### NOTE -- the igtc track is automatically updated on hgwdev by the ### scripts monthlyUpdateIgtc.csh and updateIgtc.pl in ### kent/src/hg/utils/automation/ . 
######################################################################### # Load CCDS (2007-12-12 markd) # import ccds database as described in ccds.txt set db=mm9 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene # ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap # build initial version of ccdsMgcMap table, updated by nightly genbank update /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap checkTableCoords ${db} -verbose=2 ccdsGene joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner ############################################################################ # Reload CCDS (2008-02-01 markd) # import ccds database as described in ccds.txt set db=mm9 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene # ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords ${db} -verbose=2 ccdsGene # update all.joiner to include ${db} in ccdsDb joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # request push of ccdsGene ccdsInfo ccdsKgMap # << emacs ############################################################################ # dbSNP BUILD 128 (DONE 2/8/08 angie) # updated snp128ExceptionDesc (tweaked wording) 3/11/08 # Set up build directory ssh kkstore06 mkdir -p /cluster/store3/dbSNP128/{mouse,shared} # dbSNP 128 field encodings (*.bcp.gz) were already downloaded -- # see hg18.txt.
########################## DOWNLOAD ############################# cd /cluster/data/dbSNP/128/mouse mkdir data schema rs_fasta # Get data from NCBI (anonymous FTP) wget ftp://ftp.ncbi.nih.gov/snp/00readme.txt cd /cluster/data/dbSNP/128/mouse/data alias wg wget --timestamping set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/database # ContigLoc table has coords, orientation, loc_type, and refNCBI allele wg $ftpSnpDb/organism_data/b128_SNPContigLoc_37_1.bcp.gz wg $ftpSnpDb/organism_data/b128_SNPContigLocusId_37_1.bcp.gz wg $ftpSnpDb/organism_data/b128_ContigInfo_37_1.bcp.gz # MapInfo has alignment weights wg $ftpSnpDb/organism_data/b128_SNPMapInfo_37_1.bcp.gz # SNP has univar_id, validation status and heterozygosity wg $ftpSnpDb/organism_data/SNP.bcp.gz # Get schema cd /cluster/data/dbSNP/128/mouse/schema wg $ftpSnpDb/organism_schema/mouse_10090_table.sql.gz # Get fasta files # using headers of fasta files for molType, class, observed cd /cluster/data/dbSNP/128/mouse/rs_fasta wg ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/rs_fasta/\*.gz ########################## LOAD NCBI TABLES ############################# # Simplify names of data files -- strip version & extras to get # local canonical table names. 
cd /cluster/data/dbSNP/128/mouse/data foreach f (*.bcp.gz) set new = `echo $f \ | sed -e 's/^b128_SNP//; s/^b128_//; s/_37_1//; s/.bcp//;'` mv $f $new echo $new end # Extract just the tables that we need from the NCBI msSQL table # creation file, and get CREATE statements from # mouse_10090_table.sql for our 5 tables cd /cluster/data/dbSNP/128/mouse/schema zcat mouse_10090_table.sql.gz \ | perl -we '$/ = "\nGO\n\n\n"; \ while (<>) { \ next unless /^CREATE TABLE \[(b128_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_37_1)?\]/; \ s/b128_(SNP)?//; s/_37_1//; \ s/[\[\]]//g; s/GO\n\n\n/;/; s/smalldatetime/datetime/g; \ s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \ s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \ s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \ s/(image|varchar\s+\(\d+\))/BLOB/g; \ print; \ }' \ > table.sql # load on kolossus or a small cluster machine (mysql5 is OK for this). ssh kkr3u00 hgsql '' -e 'create database mm9snp128' cd /cluster/data/dbSNP/128/mouse/schema hgsql mm9snp128 < table.sql cd ../data foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP) zcat $t.gz \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable mm9snp128 $t placeholder stdin end # There were some warnings (many cleared up by the perl substitution) # but no rows were dropped. I eyeballed a few examples, seemed OK, # e.g. no value given for a field where NULL is OK. 
foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP) echo -n "${t}:\t" hgsql -N -B mm9snp128 -e 'select count(*) from '$t end #ContigInfo: 13636 #ContigLoc: 31733892 #ContigLocusId: 12883378 #MapInfo: 28464204 #SNP: 14380527 # compare contig list in mm9.ctgPos vs ContigInfo (for the reference # strain, not the alts included in ContigInfo) ssh hgwdev hgsql mm9 -NBe 'select * from ctgPos;' \ | sed -re 's/^(N[A-Z]_[0-9]+)\.[0-9]+/\1/;' \ > ctgPos.tab awk '{print $1;}' ctgPos.tab | sort > /tmp/1 # Take a look at the group_label values and choose a set that matches # the reference assembly: hgsql mm9snp128 -NBe 'select distinct(group_label) from ContigInfo' # Looks like just ref_strain will do. hgsql mm9snp128 -NBe 'select contig_acc from ContigInfo \ where group_label = "C57BL/6J"' \ | sort > /tmp/2 diff /tmp/1 /tmp/2 # No diff, good. # Make sure there are no orient != 0 contigs among those selected. hgsql mm9snp128 -NBe \ 'select count(*) from ContigInfo where orient != 0 and \ group_label = "C57BL/6J";' #0 #################### EXTRACT INFO FROM NCBI TABLES #################### mkdir -p /scratch/snp/128/mouse cd /scratch/snp/128/mouse time hgsql mm9snp128 -e \ 'alter table ContigLoc add index (ctg_id); \ alter table ContigInfo add index (ctg_id);' #0.002u 0.001s 6:18.71 0.0% 0+0k 0+0io 1pf+0w time hgsql mm9snp128 -e \ 'alter table ContigInfo add index (group_label(9));' #0.002u 0.002s 0:00.35 0.0% 0+0k 0+0io 1pf+0w # Since there is only one group_label for mouse, just use snp_id # as key. If there is more than one group_label to pick up, then # don't use this as a template -- use hg18.txt. hgsql mm9snp128 -NBe \ 'select snp_id, ContigInfo.contig_acc, asn_from, asn_to, \ loc_type, orientation, allele, phys_pos_from \ from ContigLoc, ContigInfo \ where ContigLoc.ctg_id = ContigInfo.ctg_id and \ ContigInfo.group_label = "C57BL/6J";' \ | sort \ > ucscContigLoc.txt # took ~7 minutes # The IDs are non-unique (can be multiply mapped). 
This is OK if # everything else that we relate to these uniquely maps to snp_id. wc -l ucscContigLoc.txt #16232825 ucscContigLoc.txt awk '{print $1;}' ucscContigLoc.txt | uniq | wc -l #14304640 # SNP -> valid, avHet, avHetSE # SNP has only snp_id as identifier, nothing relating to assembly. hgsql mm9snp128 -NBe \ 'select snp_id, validation_status, avg_heterozygosity, het_se \ from SNP;' \ | sort \ > ucscSNP.txt # Check ID uniqueness: wc -l ucscSNP.txt #14380527 ucscSNP.txt awk '{print $1;}' ucscSNP.txt | uniq | wc -l #14380527 # ContigLocusId -> func # ContigLocusId has only snp_id as an identifier (it gives one # example contig if the SNP is on multiple contigs). # The sort options and awk are to convert multiple entries with different # function classes for the same SNP into one entry per SNP with a list # of function classes. hgsql mm9snp128 -NBe \ 'select snp_id, fxn_class from ContigLocusId;' \ | sort -u -k1,1 -k2,2n \ | awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \ else { if (prevId) {print prevId "\t" prevFunc;} \ prevFunc = $2 ","; }} \ {prevId = $1;} \ END {print prevId "\t" prevFunc;}' \ > ucscFunc.txt # Check ID uniqueness: wc -l ucscFunc.txt #5878591 ucscFunc.txt awk '{print $1;}' ucscFunc.txt | sort -u | wc -l #5878591 # MapInfo -> weight # MapInfo needs assembly+snp_ids in order to have unique IDs. time hgsql mm9snp128 -e \ 'alter table MapInfo add index (assembly(9));' #0.000u 0.004s 2:22.64 0.0% 0+0k 0+0io 0pf+0w hgsql mm9snp128 -NBe \ 'select snp_id, weight from MapInfo where assembly = "C57BL/6J";' \ | sort \ > weight.txt # ~1 minute # Check ID uniqueness: wc -l weight.txt #14304640 weight.txt awk '{print $1;}' weight.txt | uniq | wc -l #14304640 awk '{print $2;}' weight.txt | sort -n | uniq -c #13954580 1 # 113119 2 # 169755 3 # 67186 10 # SNPs w/weight 0 and 10 will be discarded later. 
# fasta headers -> observed, molType, class zcat /cluster/data/dbSNP/128/mouse/rs_fasta/rs_ch*.fas.gz \ | grep '^>gnl' \ | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \ | sort \ > ucscGnl.txt # ~4 minutes wc -l ucscGnl.txt #14380527 ucscGnl.txt awk '{print $1;}' ucscGnl.txt | uniq | wc -l #14380527 ############### JOIN NCBI COLUMNS TO GET UCSC SNP COLUMNS ################ # Join files by ID. time join -a 1 -e MISSING -t ' ' ucscContigLoc.txt weight.txt \ > ucscCL+w.txt #26.811u 4.091s 1:02.59 49.3% 0+0k 0+0io 0pf+0w wc -l ucscCL+w.txt #16232825 ucscCL+w.txt # Same as ucscContigLoc.txt above, good. # Any missing weights? grep MISSING ucscCL+w.txt | head # No output, good. # Join the files with SNP-only IDs. time join -e MISSING -t ' ' ucscGnl.txt ucscSNP.txt \ > ucscG+S.txt #16.591u 1.935s 0:28.44 65.1% 0+0k 0+0io 0pf+0w wc -l ucscG+S.txt #14380527 ucscG+S.txt # Same as ucscSNP.txt and ucscGnl.txt above. grep MISSING ucscG+S.txt | wc -l #0 time join -a 1 -e MISSING -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 \ -t ' ' ucscG+S.txt ucscFunc.txt \ > ucscG+S+F.txt #17.438u 2.115s 0:24.83 78.6% 0+0k 0+0io 0pf+0w wc -l ucscG+S+F.txt #14380527 ucscG+S+F.txt grep MISSING ucscG+S+F.txt | wc -l #8501936 # Not surprising -- ucscFunc.txt has only 5878591 lines. expr 14380527 - 5878591 #8501936 # Final join -- treat ContigLoc as authoritative (since it has coords). # Arrange columns in same order as in the SNP table, with extras for # checking at the end (phys_pos_from). # chr chrS chrE name strand refN obs molT cls val aH aHSE fxn locT wt ... 
time join -a 1 -e MISSING -t ' ' \ -o '1.2 1.3 1.4 1.1 1.6 1.7 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.5 1.9 1.8' \ ucscCL+w.txt ucscG+S+F.txt \ > ucscNcbiSnp.ctg.txt #41.401u 6.045s 1:02.04 76.4% 0+0k 0+0io 0pf+0w wc -l ucscNcbiSnp.ctg.txt #16232825 ucscNcbiSnp.ctg.txt grep MISSING ucscNcbiSnp.ctg.txt | awk '{print $4;}' | uniq | wc -l #8432812 # a bit less than the 8501936 missing FUNC's above... perhaps some # of those did not have any mappings in ucscContigLoc.txt. # Lift the map contig coordinates to chrom coordinates (~2m); sed -re 's/\t(N[A-Z]_[0-9]+)\.[0-9]+\t/\t\1\t/;' \ /cluster/data/mm9/jkStuff/mm9.contigs.lift > liftContigs.lft time liftUp ucscNcbiSnp.bed liftContigs.lft warn ucscNcbiSnp.ctg.txt #131.007u 7.438s 2:26.48 94.5% 0+0k 0+0io 0pf+0w wc -l ucscNcbiSnp.bed #16232825 ucscNcbiSnp.bed # At this point, move back from /scratch to /cluster/data. nice gzip ucscNcbiSnp.bed cp -p ucscNcbiSnp.bed.gz /cluster/data/dbSNP/128/mouse/ # Translate NCBI's encoding into UCSC's, and perform a bunch of # checks. This is where developer involvement is most likely as # NCBI extends the encodings used in dbSNP. cd /cluster/data/dbSNP/128/mouse/ gunzip ucscNcbiSnp.bed.gz time snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/mm9/mm9.2bit \ snp128 #count of snps with weight 0 = 0 #count of snps with weight 1 = 13954580 #count of snps with weight 2 = 226238 #count of snps with weight 3 = 712684 #count of snps with weight 10 = 1339323 #Found no errors. #162.963u 9.783s 3:02.77 94.5% 0+0k 0+0io 1pf+0w wc -l snp* # 14893502 snp128.bed # 22 snp128.sql # 0 snp128Errors.bed # 18 snp128ExceptionDesc.tab # 1898314 snp128Exceptions.bed # Make one big fasta file. (note: snp126 skipped chrUn... but it's small # compared to chr1, chr2 etc.) # Some of the fasta files have SNPs that were not mapped to the reference # assembly. Make sure there is no overlap with snp128.bed, and then # move them out of the way.
zcat rs_fasta/rs_chNotOn.fas.gz \ | perl -we 'while (<>) { \ next unless /^>gnl/; s/^>gnl.dbSNP.(rs\d+).*/$1/; print; }' \ | sort | grep -Fwf - snp128.bed | head ^chNotOn^chAltOnly # No output from either command -- good. mkdir rs_fasta/omitted mv rs_fasta/rs_ch{AltOnly,NotOn}.fas.gz rs_fasta/omitted/ zcat rs_fasta/rs_ch*.fas.gz \ | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \ > snp128.fa # Check for duplicates. grep ^\>rs snp128.fa | sort > /scratch/tmp/seqHeaders wc -l /scratch/tmp/seqHeaders #14304640 /scratch/tmp/seqHeaders uniq /scratch/tmp/seqHeaders | wc -l #14304640 # Use hgLoadSeq to generate .tab output for sequence file offsets, # and keep only the columns that we need: acc and file_offset. # Index it and translate to snpSeq table format. time hgLoadSeq -test placeholder snp128.fa #42.866u 4.977s 0:48.09 99.4% 0+0k 0+0io 4pf+0w cut -f 2,6 seq.tab > snp128Seq.tab rm seq.tab ssh hgwdev # Load up main track tables. cd /cluster/data/dbSNP/128/mouse time nice hgLoadBed -tab -noSort -onServer -tmpDir=/scratch/tmp \ mm9 snp128 -sqlTable=snp128.sql snp128.bed #Loaded 14893502 elements of size 17 #67.395u 12.818s 8:43.01 15.3% 0+0k 0+0io 0pf+0w sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125Exceptions.sql \ > snp128Exceptions.sql time nice hgLoadBed -tab -onServer -tmpDir=/scratch/tmp \ mm9 snp128Exceptions -sqlTable=snp128Exceptions.sql \ snp128Exceptions.bed #Loaded 1898314 elements of size 5 #8.925u 1.354s 0:52.66 19.5% 0+0k 0+0io 0pf+0w sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \ > snp128ExceptionDesc.sql # 3/11/08: reloaded snp128ExceptionDesc (tweaked wording) hgLoadSqlTab mm9 snp128ExceptionDesc snp128ExceptionDesc.sql \ snp128ExceptionDesc.tab # Load up sequences. 
sed -e 's/snpSeq/snp128Seq/' ~/kent/src/hg/lib/snpSeq.sql \ > snp128Seq.sql mkdir -p /gbdb/mm9/snp ln -s /cluster/data/dbSNP/128/mouse/snp128.fa /gbdb/mm9/snp/snp128.fa time nice hgLoadSqlTab mm9 snp128Seq snp128Seq.sql snp128Seq.tab #0.000u 0.003s 3:02.66 0.0% 0+0k 0+0io 0pf+0w # Put in a link where one would expect to find the track build dir... ln -s /cluster/data/dbSNP/128/mouse /cluster/data/mm9/bed/snp128 ######################################################################### # BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2008-03-11,12 - Hiram) ssh kkstore06 screen # use a screen to manage this multi-day job mkdir /cluster/data/mm9/bed/blastzBosTau4.2008-03-11 cd /cluster/data/mm9/bed/blastzBosTau4.2008-03-11 cat << '_EOF_' > DEF BLASTZ_M=50 # TARGET: Mouse mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau4 SEQ2_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=200 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzBosTau4.2008-03-11 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet > do.log 2>&1 & # real 460m51.297s cat fb.mm9.chainBosTau4Link.txt # 690095394 bases of 2620346127 (26.336%) in intersection mkdir /cluster/data/bosTau4/bed/blastz.mm9.swap cd /cluster/data/bosTau4/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzBosTau4.2008-03-11/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet > swap.log 2>&1 & # real 117m39.571s cat fb.bosTau4.chainMm9Link.txt # 707444627 bases of 2731830700 (25.896%) in intersection ####################################################################### # BLASTZ/CHAIN/NET Lamprey petMar1 (DONE -
2008-04-14 - Hiram) ssh kkstore06 screen # use screen to control this job mkdir /cluster/data/mm9/bed/blastzPetMar1.2008-04-14 cd /cluster/data/mm9/bed/blastzPetMar1.2008-04-14 cat << '_EOF_' > DEF # Mouse vs. Lamprey # using the "distant" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Lamprey petMar1 SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit SEQ2_LEN=/scratch/data/petMar1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzPetMar1.2008-04-14 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzPetMar1.2008-04-14/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -bigClusterHub=pk > do.log 2>&1 & cat fb.mm9.chainPetMar1Link.txt # 29113438 bases of 2620346127 (1.111%) in intersection # That is OK, now for the swap: mkdir /cluster/data/petMar1/bed/blastz.mm9.swap cd /cluster/data/petMar1/bed/blastz.mm9.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/mm9/bed/blastzPetMar1.2008-04-14/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -bigClusterHub=pk > swap.log 2>&1 & # real 33m29.076s cat fb.petMar1.chainMm9Link.txt # 26052507 bases of 831696438 (3.132%) in intersection ####################################################################### # BLASTZ/CHAIN/NET Lancelet braFlo1 (DONE - 2008-04-14 - Hiram) ssh kkstore06 screen # use screen to control this job mkdir /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14 cd /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14 cat << '_EOF_' > DEF # Mouse vs.
Lancelet # using the "distant" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Lancelet braFlo1 - largest chunk big enough for largest scaffold # Largest scaffold 7,200,735 - 3032 scaffolds + chrM SEQ2_DIR=/scratch/data/braFlo1/braFlo1.2bit SEQ2_LEN=/scratch/data/braFlo1/chrom.sizes SEQ2_CTGDIR=/scratch/data/braFlo1/braFlo1UnScaffolds.2bit SEQ2_CTGLEN=/scratch/data/braFlo1/braFlo1UnScaffolds.sizes SEQ2_LIFT=/scratch/data/braFlo1/braFlo1.lift SEQ2_CHUNK=10000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -bigClusterHub=kk > do.log 2>&1 & # real 408m36.691s cat fb.mm9.chainBraFlo1Link.txt # 26725980 bases of 2620346127 (1.020%) in intersection # That is OK, now for the swap: mkdir /cluster/data/braFlo1/bed/blastz.mm9.swap cd /cluster/data/braFlo1/bed/blastz.mm9.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -bigClusterHub=kk > swap.log 2>&1 & # real 12m23.402s cat fb.braFlo1.chainMm9Link.txt # 31517169 bases of 923355587 (3.413%) in intersection ########################################################################### # LOAD Transcriptome data (DONE - 2008-05-06 - Hiram) # data from Christian Iseli 'Christian.Iseli at licr.org' ssh hgwdev mkdir /cluster/data/mm9/bed/transcriptome cd /cluster/data/mm9/bed/transcriptome wget --timestamping ftp://ftp.licr.org/pub/MTr.gtf.gz wget --timestamping
ftp://ftp.licr.org/pub/txg.tar.gz gtfToGenePred -genePredExt MTR.gtf.gz MTr.gp hgLoadGenePred mm9 transcriptome -genePredExt MTr.gp tar xvzf txg.tar.gz # Do a little data cleanup and transformation and # load splice graphs into database. sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql \ > sibTxGraph.sql cat txg/*.txg | txgToAgx stdin stdout \ | hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql mm9 sibTxGraph stdin # Loaded 52065 elements of size 18 # Create sibAltEvents track for analysed alt-splices. cat txg/*.txg \ | txgAnalyze stdin /cluster/data/mm9/mm9.2bit stdout \ | awk '$2 >= 0' | sort | uniq > sibAltEvents.bed hgLoadBed mm9 sibAltEvents sibAltEvents.bed ############################################################################# # BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-17 - larrym) ssh kkstore04 screen # use screen to control this multi-day job mkdir /cluster/data/mm9/bed/blastz.equCab2.2008-04-15 cd /cluster/data/mm9/bed/blastz.equCab2.2008-04-15 cat << '_EOF_' > DEF # Mouse vs. 
Horse BLASTZ_M=50 # TARGET: Mouse MM9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/cluster/data/equCab2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastz.equCab2.2008-04-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab2/blastz.hg18 >>& do.log & ln -s blastz.equCab2.2008-04-15 /cluster/data/mm9/bed/blastz.equCab2 ############################################################################ # Reload CCDS from CCDS.20080502 dump (2008-05-03 markd) # import ccds database as described in ccds.txt set db=mm9 set ncbiBld=37.1 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ${ncbiBld} ccdsInfo ccdsGene # ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords ${db} -verbose=2 ccdsGene # update all.joiner to include ${db} in ccdsDb joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # request push of ccdsGene ccdsInfo ccdsKgMap # << emacs ############################################################################ # update vega genes to version 31 (v49 of Ensembl genes) # (DONE - 2008-05-15 - Hiram) mkdir /cluster/data/mm9/bed/vega31_49 cd /cluster/data/mm9/bed/vega31_49 wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/vega/mouse/gtf_file.gz" wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/vega/mouse/CHANGELOG.gz" wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/mouse/catalog.txt" wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/vega/mouse/pep/Mus_musculus.VEGA.apr.pep.tot.fa.gz" # processing similar to the same processing for Ensembl genes, # from /cluster/data/mm9/bed/ensGene.49/process/doProcess.csh cp -p /cluster/data/mm9/bed/ensGene.49/process/randoms.mm9.lift . zcat gtf_file.gz \ | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \ | liftUp -type=.gtf stdout randoms.mm9.lift carry stdin \ | gzip > allGenes.gtf.gz gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \ | gzip > mm9.allGenes.gp.gz /cluster/home/hiram/kent/src/hg/utils/automation/extractGtf.pl \ infoOut.txt > ensGtp.tab genePredCheck -db=mm9 mm9.allGenes.gp.gz # checked: 54208 failed: 0 zcat allGenes.gtf.gz | grep -i pseudo > pseudo.gtf zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf gtfToGenePred -genePredExt pseudo.gtf pseudo.gp gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp genePredCheck -db=mm9 pseudo.gp # checked: 3989 failed: 0 genePredCheck -db=mm9 not.pseudo.gp # checked: 50219 failed: 0 hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp ############################################################################ # BLASTZ/CHAIN/NET 2X Ground squirrel: speTri0 (In progress 2008-05-16 kate) ssh kkstore06 cd /cluster/data/mm9/bed mkdir blastzSpeTri0.2008-05-16 cd blastzSpeTri0.2008-05-16 cat << '_EOF_' > DEF # Mouse vs. 
Ground squirrel BLASTZ_M=50 # TARGET: Mouse MM9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Ground squirrel speTri0 SEQ2_DIR=/scratch/data/speTri0/speTri0.2bit SEQ2_LEN=/cluster/data/speTri0/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=500 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzSpeTri0.2008-05-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium >& do.log & ln -s blastzSpeTri0.2008-05-16 /cluster/data/mm9/bed/blastz.speTri0 # create reciprocal best chains/nets ssh hgwdev cd /cluster/data/mm9/bed/blastz.speTri0 /cluster/bin/scripts/doRecipBest.pl mm9 speTri0 >&! rbest.log & # failed coverage check, shouldn't be fatal ? # resume creating axt's and maf's # use axtChain/doRecipBest.csh to create resume.csh ssh kkstore06 cd /cluster/data/mm9/bed/blastz.speTri0/axtChain csh resume.csh >&! resume.log & ssh hgwdev cd /cluster/data/mm9/bed/blastz.speTri0 featureBits mm9 chainSpeTri0Link > fb.mm9.chainSpeTri0Link.txt cat fb.mm9.chainSpeTri0Link.txt # 673393210 bases of 2620346127 (25.699%) in intersection ################# # Rodent multiz (mouse, guinea pig, ground squirrel) # for Jurgen Schmitz (2008-06-07 kate) # Redo with unfiltered net mafs, to maximize squirrel sequence ssh kkstore06 mkdir /cluster/data/mm9/bed/multiz3way cd /cluster/data/mm9/bed/multiz3way mkdir mafLinks mkdir mafLinks/cavPor3 cd mafLinks/cavPor3 # high quality mammalian genome, so use syntenic net ln -s ../../../blastz.cavPor3/mafSynNet/*.maf.gz . mkdir ../speTri0 cd ../speTri0 # low coverage genome, so use reciprocal best #ln -s ../../../blastz.speTri0/mafRBestNet/*.maf.gz . # redo with unfiltered, to get more squirrel sequence ln -s ../../../blastz.speTri0/maftNet/*.maf.gz . 
# Copy MAFs to kluster-friendly disk mkdir -p /san/sanvol1/scratch/mm9/multiz3way cd /san/sanvol1/scratch/mm9/multiz3way rsync -a --copy-links --progress \ /cluster/data/mm9/bed/multiz3way/mafLinks/ . # get latest PSU utilities mkdir penn set p=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $p/{autoMZ,multiz,maf_project} penn # the autoMultiz cluster run ssh pk cd /cluster/data/mm9/bed/multiz3way # create species list and stripped down tree for autoMZ cat > tree.nh << 'EOF' ((mm9 cavPor3) speTri0) 'EOF' cat > species.lst << 'EOF' mm9 cavPor3 speTri0 'EOF' mkdir run maf cd run cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = mm9 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/$db/multiz3way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz3way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP ./autoMultiz $(root1) {check out line+ /cluster/data/mm9/bed/multiz3way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs awk '{print $1}' /cluster/data/mm9/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # 35 jobs para try para check #Completed: 35 of 35 jobs #CPU time in finished jobs: 6086s 101.43m 1.69h 0.07d 0.000 y #IO & Wait Time: 240s 4.00m 0.07h 0.00d 0.000 y #Average job time: 181s 3.01m 0.05h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 502s 8.37m 0.14h 0.01d #Submission to last job: 506s 8.43m 0.14h 0.01d ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/mm9 mkdir 
multizRodent3way cd multizRodent3way ln -s /cluster/data/mm9/bed/multiz3way/maf . cat > README.txt << 'EOF' This directory contains multiple alignments of 2 rodent genome assemblies to the mouse genome (mm9, Jul. 2007): _ guinea pig Cavia porcellus Feb. 2008, cavPor3 _ ground squirrel Spermophilus tridecemlineatus Jun. 2006, speTri0 'EOF' # << emacs ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details.
############################################################################ ######################################################################### # ORegAnno - Open Regulatory Annotations # loaded July 7, 2008 # updated Sept 29, 2008 # loaded by Belinda Giardine, in same manner as hg18 ORegAnno track ############################################################################ # JAX/MGI TRACKS (DONE 6/22/11 angie) # last mm9 MGI tracks update (working 2012-10-23 - Chin) # in /hive/data/genomes/mm9/bed/jax/2012_10 # # Previously done 9/20/10 in /hive/data/genomes/mm9/bed/jax/2010_09 (pushed) # Previously done 8/20/09 in /hive/data/genomes/mm9/bed/jax/2009_08 (pushed) # Previously done 6/11/09 in /hive/data/genomes/mm9/bed/jax/2009_06 (pushed) # Previously done 4/24/09 in /hive/data/genomes/mm9/bed/jax/2009_04 (not pushed) # Previously done 9/24/08 in /cluster/data/mm9/bed/jax/2008_09 mkdir -p /hive/data/genomes/mm9/bed/jax/2012_10 cd /hive/data/genomes/mm9/bed/jax/2012_10 wget --timestamping ftp://ftp.informatics.jax.org/pub/gbrowse/\* wget --timestamping ftp://ftp.informatics.jax.org/pub/reports/MGI_PhenotypicAllele.rpt # Oops, June 2011 has a file that ends in "gff" not ".gff": mv MP_0010768_mortalitygff MP_0010768_mortality.gff # And June 2011 got some stale files from the ftp site: rm -f MP_0005393_skin_coat_nails.gff \ MP_0005392_touch_vibrissae.gff \ MP_0005374_lethality-embryonic_perinatal.gff \ MP_0005373_lethality-postnatal.gff \ MP_0005372_life_span-post-weaning_aging.gff # Jax Rep Transcript track # SEQ_RepTransGenomic_rpt.gff --> jaxRepTranscript{,Alias} # -- names like AK016604_4933401J01Rik, NM_001011874_AY534250 # -- aliases ~ MGI:\d+ ~/kent/src/hg/jaxMgi/parseRepTranscript.pl SEQ_RepTransGenomic_from_models.gff \ > jaxRepTranscript.gff # Jax Allele track # AL_*.gff --> jaxAllele{,Info} # -- bed12Source -- add type from filename # -- names like NM_011283_Rp1h, XM_129721_Slc9a2 # -- Info: name, mgiID, source {"Gene trapped", ...} cd 
/hive/data/genomes/mm9/bed/jax/2012_10 # 2012_10 Note: Need to modify source and feature fields -- # MGI_GTDNA: "Gene Trap" to "MGI_DNA_GTRAP", "Gene" to "genetrap_DNA". # MGI_GTRNA: "Gene Trap" to "MGI_RNA_GTRAP" cat MGI_GTDNA.gff | awk 'BEGIN {FS = "\t"; RS = "\n"} {print $2}' | sort -u # Gene Trap cat MGI_GTDNA.gff | awk 'BEGIN {FS = "\t"; RS = "\n"} {print $3}' | sort -u # Gene cat MGI_GTRNA.gff | awk 'BEGIN {FS = "\t"; RS = "\n"} {print $2}' | sort -u # Gene Trap cat MGI_GTRNA.gff | awk 'BEGIN {FS = "\t"; RS = "\n" } {print $3}' | sort -u # exon cat MGI_GTDNA.gff | sed -e 's/Gene Trap/MGI_DNA_GTRAP/' \ -e 's/Gene/genetrap_DNA/' > MGI_GTDNA_GBrowse.gff cat MGI_GTRNA.gff | sed -e 's/Gene Trap/MGI_RNA_GTRAP/' \ > MGI_GTRNA_GBrowse.gff XXXX 2012-11-13 stop here due to error from following Can't modify single ref constructor in scalar assignment at -e line 3, near ""25,25,112";" Execution of -e aborted due to compilation errors. Reading stdin rm -f jaxGeneTrap.bed jaxAlleleInfo.tab fixJaxAllele.sql foreach f (MGI_GT[DR]NA_GBrowse.gff) echo $f:t:r | sed -e 's/MGI_//; s/_GBrowse//' ~/kent/src/hg/jaxMgi/parseAllele.pl $f \ | ldHgGene mm9 placeholder stdin -nobin -out=stdout \ | /cluster/bin/scripts/genePredToBed \ | perl -wpe 'chomp; @w = split; $w[3] =~ s/\|\|(\w+)$// || die; $source = $1; \ $w[8] = ($source eq "GeneTrappedDna") ? "218,112,214" : \ ($source eq "GeneTrappedRna3") ? "50,205,50" : "25,25,112"; \ $_ = join("\t", @w, $source) . 
"\n";' \ >> jaxGeneTrap.bed if ($status) then echo "\nERRORS - FixMe\n" break endif end mv jaxAlleleInfo.tab jaxGeneTrapInfo.tab mv fixJaxAllele.sql fixJaxGeneTrap.sql cut -f 13 jaxGeneTrap.bed | sort | uniq -c # 298285 GeneTrappedDna # 186042 GeneTrappedRna3 # 35389 GeneTrappedRna5 rm -f jaxAllele.bed jaxAlleleInfo.tab fixJaxAllele.sql foreach f (AL_{IND,OTHER,SPON,TARG,TRANS}.gff) echo $f:t:r | sed -e 's/AL_//;' ./parseAllele.pl $f \ | ldHgGene mm9 placeholder stdin -nobin -out=stdout \ | /cluster/bin/scripts/genePredToBed \ | perl -wpe 'chomp; @w = split; $w[3] =~ s/\|\|(\w+)$// || die; \ $_ = join("\t", @w, $1) . "\n";' \ >> jaxAllele.bed if ($status) then echo "\nERRORS - FixMe\n" break endif end cut -f 13 jaxAllele.bed | sort | uniq -c #TARG #Missing > for mRNA name NM_016893_Fut8 jaxPhenotype{,Alias} # -- bed12Source -- add type from filename # -- names like NM_001001488_Atp8b1 rm -f jaxPhenotype.bed jaxPhenotypeAlias.tab fixJaxPhenotype.sql foreach f (MP_*.gff) set type = `echo $f:t:r \ | perl -wpe 's/MP_[0-9]*_//; s/[_-](\w)/\u$1/g; s/^(\w)/\u$1/; \ s@AdiposeTissue@Adipose@ || \ s@BehaviorNeurological@Behavior@ || \ s@CardiovascularSystem@Cardiovascular@ || \ s@DigestiveAlimentary@Digestive@ || \ s@EndocrineExocrineGland@Gland@ || \ s@GrowthSize@Growth Size@ || \ s@HearingEar@Hearing/Ear@ || \ s@HematopoieticSystem@Hematopoietic@ || \ s@HomeostasisMetabolism@Homeostasis@ || \ s@ImmuneSystem@Immune@ || \ s@LimbsDigitsTail@Limbs and Tail@ || \ s@LiverBiliarySystem@Liver and Bile@ || \ s@NervousSystem@Nervous System@ || \ s@RenalUrinarySystem@Renal/Urinary@ || \ s@ReproductiveSystem@Reproductive@ || \ s@RespiratorySystem@Respiratory@ || \ s@TasteOlfaction@Taste/Smell@ || \ s@Tumorigenesis@Tumorigenesis@ || \ s@VisionEye@Vision/Eye@ || \ m/^Craniofacial|Cellular|Embryogenesis|Integument|Mortality|Muscle|Normal|Other|Pigmentation|Skeleton$/ || \ die "\n\nUnrecognized phenotype $_\n\n\t";'` echo $type if ("$type" == "") break 
~/kent/src/hg/jaxMgi/parsePhenotype.pl $f \ | ldHgGene mm9 placeholder stdin -nobin -out=stdout \ | /cluster/bin/scripts/genePredToBed \ | sed -e 's/^/chr/; s@$@'"\t$type"'@;' \ >> jaxPhenotype.bed end sort -u jaxPhenotypeAlias.tab > tmp mv tmp jaxPhenotypeAlias.tab # Jax QTL track # QTL*.gff --> jaxQtl2 (or 3?)... but we're missing MIT SSLP marker # and CM distance for 2, or those plus flanking markers for 3... dos2unix MGI_QTL.gff # Compare against the previous update to see if we need to reload: if (`cmp MGI_QTL.gff ../2010_09/MGI_QTL.gff` != 0) then echo MGI_QTL.gff changed, updating... perl -wpe 'chomp; s/\s*$//; \ ($c, undef, undef, $start, $end, undef, $strand, undef, $info) = \ split("\t"); \ if ($info =~ /QTL (\S+); Dbxref "(MGI:\d+)"; Alias .*; Note "([^"]+)"/) { \ ($name, $mgiID, $desc) = ($1, $2, $3); \ } else { die "parse\n$info"; } \ if ($start > $end) { $tmp = $end; $end = $start; $start = $tmp; } \ $start-- unless $start == 0; \ s/^.*$/chr$c\t$start\t$end\t$name\t1000\t$strand\t\t$mgiID\t$desc\t0.0\n/;' \ MGI_QTL.gff > jaxQtl.bed else echo No change to MGI_QTL.gff endif #MGI_QTL.gff ../2010_09/MGI_QTL.gff differ: char 99310, line 780 # Extract phenotype-allele relationships: # Make a file for the one code not already in a filename: cp /dev/null MP_0003012_no_phenotypic_analysis # Wrote a script to extract the phenotype-allele relationships -- # it uses the filenames to map MP:* codes to our phenotype names. ~/kent/src/hg/jaxMgi/parsePhenotypicAllele.pl MGI_PhenotypicAllele.rpt > jaxAllelePheno.tab # The file "err" has messages about missing data (no gene name in # PhenotypicAllele.rpt, or gene/mgiId not found in jaxAlleleInfo). 
wc -l jaxAllelePheno.tab err # 15147 jaxAllelePheno.tab # 11778 err # Load tables # jaxRepTranscript ldHgGene mm9 jaxRepTranscript jaxRepTranscript.gff # 38436 groups 22 seqs 1 sources 1 feature types hgsql mm9 < fixJaxRepTranscript.sql hgLoadSqlTab mm9 jaxRepTranscriptAlias \ ~/kent/src/hg/lib/genericAlias.sql jaxRepTranscriptAlias.tab checkTableCoords mm9 jaxRepTranscript # jaxAllele hgLoadBed -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed12Source.sql \ mm9 jaxAllele jaxAllele.bed #Loaded 16901 elements of size 13 checkTableCoords mm9 jaxAllele # fixJaxAllele.sql is empty so don't need to do this: # hgsql mm9 < fixJaxAllele.sql hgLoadSqlTab mm9 jaxAlleleInfo \ ~/kent/src/hg/lib/jaxAlleleInfo.sql jaxAlleleInfo.tab # jaxGeneTrap hgLoadBed -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed12Source.sql \ mm9 jaxGeneTrap jaxGeneTrap.bed #Loaded 519716 elements of size 13 checkTableCoords mm9 jaxGeneTrap hgsql mm9 < fixJaxGeneTrap.sql hgLoadSqlTab mm9 jaxGeneTrapInfo \ ~/kent/src/hg/lib/jaxAlleleInfo.sql jaxGeneTrapInfo.tab # jaxPhenotype hgLoadBed -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed12Source.sql \ -tab mm9 jaxPhenotype jaxPhenotype.bed #Loaded 37122 elements of size 13 checkTableCoords mm9 jaxPhenotype # fixJaxPhenotype.sql is empty so don't need to execute it. # hgsql mm9 < fixJaxPhenotype.sql hgLoadSqlTab mm9 jaxPhenotypeAlias \ ~/kent/src/hg/lib/genericAlias.sql jaxPhenotypeAlias.tab # jaxQtl if (`cmp MGI_QTL.gff ../2010_09/MGI_QTL.gff` != 0) then echo MGI_QTL.gff changed, updating... 
hgLoadBed -tab -notItemRgb -noBin \ -sqlTable=$HOME/kent/src/hg/lib/jaxQtl.sql \ mm9 jaxQtl jaxQtl.bed #Loaded 1890 elements of size 10 endif checkTableCoords -verbose=2 mm9 jaxQtl #mm9.jaxQtl item Ath13 chr14:51915898-165887941: chromEnd > chromSize 125194864 #mm9.jaxQtl item Ity2 chr11:145756703-145756947: chromEnd > chromSize 121843856 hgsql mm9 -e 'update jaxQtl set chromEnd = 125194864 where chrom = "chr14" and chromEnd = 165887941' hgsql mm9 -e 'delete from jaxQtl where chrom = "chr11" and chromStart > 121843856' checkTableCoords -verbose=2 mm9 jaxQtl # No output, good. # phenotype-allele relationships hgLoadSqlTab mm9 jaxAllelePheno \ ~/kent/src/hg/lib/jaxAllelePheno.sql jaxAllelePheno.tab # Check joiner (noTimes to avoid flood of refGene/knownGene timestamp warnings): foreach t (jaxRepTranscript jaxPhenotype jaxGeneTrap) echo $t runJoiner.csh mm9 $t noTimes end ############################################################################ # WOLD RNA-seq # # wig files: bed format, 25mers ave mm9Brain.wig #min 1, max=12989, median, 6 #7.4M reads woldRnaSeqBrain ########################################################################## # Fix equCab2 nets and chains to remove duplicate scaffold_34 (DONE - 2008-08-19 - larrym) fixChainNetEquCab2 hg18 deleted: 3100 from chr1_chainEquCab2 deleted: 7362 from chr10_chainEquCab2 deleted: 8472 from chr11_chainEquCab2 deleted: 1078 from chr12_chainEquCab2 deleted: 2227 from chr13_chainEquCab2 deleted: 2 from chr13_random_chainEquCab2 deleted: 3605 from chr14_chainEquCab2 deleted: 6773 from chr15_chainEquCab2 deleted: 3400 from chr16_chainEquCab2 deleted: 0 from chr16_random_chainEquCab2 deleted: 3741 from chr17_chainEquCab2 deleted: 3 from chr17_random_chainEquCab2 deleted: 334 from chr18_chainEquCab2 deleted: 5620 from chr19_chainEquCab2 deleted: 5 from chr1_random_chainEquCab2 deleted: 23003 from chr2_chainEquCab2 deleted: 1265 from chr3_chainEquCab2 deleted: 0 from chr3_random_chainEquCab2 deleted: 2567 from 
chr4_chainEquCab2 deleted: 0 from chr4_random_chainEquCab2 deleted: 967 from chr5_chainEquCab2 deleted: 0 from chr5_random_chainEquCab2 deleted: 3419 from chr6_chainEquCab2 deleted: 10493 from chr7_chainEquCab2 deleted: 0 from chr7_random_chainEquCab2 deleted: 1284 from chr8_chainEquCab2 deleted: 1 from chr8_random_chainEquCab2 deleted: 10185 from chr9_chainEquCab2 deleted: 1 from chr9_random_chainEquCab2 deleted: 4 from chrM_chainEquCab2 deleted: 8 from chrUn_random_chainEquCab2 deleted: 1585 from chrX_chainEquCab2 deleted: 3 from chrX_random_chainEquCab2 deleted: 19 from chrY_chainEquCab2 deleted: 70 from chrY_random_chainEquCab2 deleted: 18173 from netEquCab2 ######################################################################### # BLASTZ/CHAIN/NET oryLat2 (DONE - 2008-08-25,27 - Hiram) ssh kkstore06 screen # use a screen to manage this longish running job mkdir /cluster/data/mm9/bed/blastzOryLat2.2008-08-25 cd /cluster/data/mm9/bed/blastzOryLat2.2008-08-25 cat << '_EOF_' > DEF # Mouse vs. 
Medaka BLASTZ=/cluster/bin/penn/x86_64/lastz # typical parameters for a genome that is distant from human BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp) SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes SEQ2_CHUNK=40000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzOryLat2.2008-08-25 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 > do.log 2>&1 & # real 124m28.816s # problems with memk today, continuing: time doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -continue=cat -qRepeats=windowmaskerSdust \ -smallClusterHub=pk -bigClusterHub=pk -verbose=2 > cat.log 2>&1 & # the kluster is acting up, took several attempts to get one of the # simple cat jobs done, not sure why it was having trouble, continuing: time doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -continue=chainRun -qRepeats=windowmaskerSdust \ -smallClusterHub=pk -bigClusterHub=pk -verbose=2 > chainRun.log 2>&1 & time doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -continue=chainMerge -qRepeats=windowmaskerSdust \ -smallClusterHub=pk -bigClusterHub=pk -verbose=2 > chainMerge.log 2>&1 & # real 14m58.355s cat fb.mm9.chainOryLat2Link.txt # 50975949 bases of 2620346127 (1.945%) in intersection cd /cluster/data/mm9/bed ln -s blastzOryLat2.2008-08-25 blastz.oryLat2 # That is OK, now for the swap: mkdir /cluster/data/oryLat2/bed/blastz.mm9.swap cd /cluster/data/oryLat2/bed/blastz.mm9.swap time doBlastzChainNet.pl -verbose=2 -swap \ 
/cluster/data/mm9/bed/blastzOryLat2.2008-08-25/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust \ -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 & # real 15m26.642s cat fb.oryLat2.chainMm9Link.txt # 45837267 bases of 700386597 (6.545%) in intersection ####################################### # Wold RNA-seq data (Done Jul 30 mikep) # df . #Filesystem 1K-blocks Used Available Use% Mounted on #kkstore06-10:/export/cluster/store4 # 2402304448 2183573728 96700640 96% /cluster/store4 ssh kkstore06 cd /cluster/store4/mm9/bed/woldRnaSeq/ # naming convention: woldRnaSeq (Signal) Tissue Replicate # rename input wigs to convention mv mm9Brain.wig woldRnaSeqSignalBrain1.wigbed mv mm9Brain2.wig woldRnaSeqSignalBrain2.wigbed mv mm9Liver.wig woldRnaSeqSignalLiver1.wigbed mv mm9Liver2.wig woldRnaSeqSignalLiver2.wigbed mv mm9Muscle.wig woldRnaSeqSignalMuscle1.wigbed mv mm9Muscle2.wig woldRnaSeqSignalMuscle2.wigbed # wigEncode it all for T in Brain Liver Muscle do for R in 1 2 do wigEncode woldRnaSeqSignal${T}${R}.wigbed woldRnaSeqSignal${T}${R}.wig woldRnaSeqSignal${T}${R}.wib done done #Converted woldRnaSeqSignalBrain1.wigbed, upper limit 12989.00, lower limit 1.00 #Converted woldRnaSeqSignalBrain2.wigbed, upper limit 1482.24, lower limit 0.04 #Converted woldRnaSeqSignalLiver1.wigbed, upper limit 44652.00, lower limit 1.00 #Converted woldRnaSeqSignalLiver2.wigbed, upper limit 2567.53, lower limit 0.06 #Converted woldRnaSeqSignalMuscle1.wigbed, upper limit 60949.00, lower limit 1.00 #Converted woldRnaSeqSignalMuscle2.wigbed, upper limit 2726.96, lower limit 0.06 # Load on hgwdev ssh hgwdev for T in Brain Liver Muscle do for R in 1 2 do ln -s /cluster/data/mm9/bed/woldRnaSeq/woldRnaSeqSignal${T}${R}.wib /gbdb/mm9/wib/ hgLoadWiggle mm9 woldRnaSeqSignal${T}${R} woldRnaSeqSignal${T}${R}.wig done done rm wiggle.tab # do the beds for F in data/*beds*tgz do echo "untaring $F" tar zxvf $F done # How many records in the beds? 
wc -l *bed # 8868804 mm9Brain1.multi.bed # 856281 mm9Brain1.splices.bed # 14488584 mm9Brain1.uniqs.bed # 16180919 mm9Brain2.multi.bed # 54100 mm9Brain2.spike.bed # 1570776 mm9Brain2.splices.bed # 26519333 mm9Brain2.uniqs.bed # 12794917 mm9Liver1.multi.bed # 1030969 mm9Liver1.splices.bed # 13133048 mm9Liver1.uniqs.bed # 17783124 mm9Liver2.multi.bed # 414618 mm9Liver2.spike.bed # 1372984 mm9Liver2.splices.bed # 17673014 mm9Liver2.uniqs.bed # 12048985 mm9Muscle1.multi.bed # 1150895 mm9Muscle1.splices.bed # 13936012 mm9Muscle1.uniqs.bed # 16033642 mm9Muscle2.multi.bed # 589787 mm9Muscle2.spike.bed # 1347749 mm9Muscle2.splices.bed # 16632816 mm9Muscle2.uniqs.bed # 194481357 total # Just do the splices ones for T in Brain Liver Muscle do for R in 1 2 do egrep -v "^track" mm9${T}${R}.splices.bed | gawk -v OFS="\t" '{print $1,$2,$3,$4,$5,$6,$2,$3,0,$10,$11,$12}' > woldRnaSeqSplices${T}${R}.bed hgLoadBed mm9 woldRnaSeqSplices${T}${R} woldRnaSeqSplices${T}${R}.bed done done rm bed.tab ######################################################################### ### Affy MOE430 version 2 (DONE - 2008-09-25,10-02 - Hiram) # Align probes from MOE430v2 chip. 
# Data was picked up manually from the Affymetrix WEB site # while logged in to the Affymetrix system, from the page: # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430-20 # found links to the following files: -rw-r--r-- 1 51429336 Dec 1 2003 Mouse430_2.probe_fasta -rw-r--r-- 1 163849 Dec 2 2003 Mouse430_2_control -rw-r--r-- 1 89662619 Dec 2 2003 Mouse430_2.consensus -rw-r--r-- 1 30999528 Dec 2 2003 Mouse430_2.target -rw-r--r-- 1 24828845 Jun 12 2006 Mouse430_2.link.psl -rw-r--r-- 1 119301329 Aug 18 2006 Mouse430_2_ortholog.csv -rw-rw-rw- 1 95467111 Jul 7 22:05 Mouse430_2.na26.annot.csv -rw-r--r-- 1 3188 Jul 8 13:23 3prime-IVT.AFFX_README.NetAffx-CSV-Files.txt # placed into: /hive/data/genomes/mm9/bed/affyMOE430v2/affyData # The GNF folks pointed to data available at: # http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE10246 ssh memk # cat ../affyData/Mouse430_2.probe_fasta \ # | sed -e "s/probe:Mouse430_2:/MOE320v2_/; s/:.*//" > MOE430v2_probes.fa # cat ../affyData/Mouse430_2.target \ # | sed -e "s/target:Mouse430_2:/MOE320v2_/; s/;.*//" > MOE430v2_target.fa mkdir /hive/data/genomes/mm9/bed/affyMOE430v2/run cd /hive/data/genomes/mm9/bed/affyMOE430v2/run mkdir psl cut -f1 ../../../chrom.sizes > genome.list cat ../affyData/Mouse430_2.consensus \ | sed -e "s/consensus:Mouse430_2://; s/;.*//" > affyMOE430v2.fa ls -1 /hive/data/genomes/mm9/bed/affyMOE430v2/run/affyMOE430v2.fa \ > probe.list cat << '_EOF_' > template #LOOP blat -fine -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib $(path2) {check out line+ psl/$(root1).psl} #ENDLOOP '_EOF_' # << happy emacs gensub2 genome.list probe.list template jobList para create jobList para try ... check ... push ... etc. 
para time # Completed: 35 of 35 jobs # CPU time in finished jobs: 22222s 370.36m 6.17h 0.26d 0.001 y # IO & Wait Time: 104s 1.74m 0.03h 0.00d 0.000 y # Average job time: 638s 10.63m 0.18h 0.01d # Longest finished job: 1580s 26.33m 0.44h 0.02d # Submission to last job: 1589s 26.48m 0.44h 0.02d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyMOE430v2.psl. pslSort dirs raw.psl tmp psl pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl \ ../affyMOE430v2.psl /dev/null # Load probes and alignments from MOE430v2 into database. ssh hgwdev cd /hive/data/genomes/mm9/bed/affyMOE430v2 mkdir /projects/compbio/data/microarray/affyMOE430v2 cp -p run/affyMOE430v2.fa /projects/compbio/data/microarray/affyMOE430v2 ln -s /projects/compbio/data/microarray/affyMOE430v2/affyMOE430v2.fa \ /gbdb/hgFixed/affyProbes hgLoadPsl mm9 affyMOE430v2.psl hgLoadSeq mm9 /gbdb/hgFixed/affyProbes/affyMOE430v2.fa # 45037 sequences pslToBed affyMOE430v2.psl affyMOE430v2Probes.bed hgLoadBed -tmpDir=/scratch/tmp mm9 affyMOE430v2Probes affyMOE430v2Probes.bed # Loaded 46193 elements of size 12 # this is temporary, for use with bedMergeExpData below # Create a similarly formatted file to the one used in MOE430 zcat geoData/GSE10246_series_matrix.txt.gz \ | egrep "^\"1|source_name|Sample_title" \ | sed -e "s/\!Sample_title/#Probe Set/; s#\!Sample_source_name_ch1##;" \ | sed -e "s/\"//g" > gnfMOE430v2.AD.txt # create gnfMouseAtlas3AllExps and gnfMouseAtlas3All tables in hgFixed hgGnfMicroarray gnfMouseAtlas3AllExps gnfMouseAtlas3All \ gnfMOE430v2.AD.txt -chip=affyMOE430v2 # 182 experiments # from that table, create median ratio table # create table gnfMOE430v2AllRatio in hgFixed from hgFixed.gnfMOE430v2All # and classification file ../hgMedianMicroarray/gnfMOE430v2.ra hgRatioMicroarray gnfMouseAtlas3All gnfMouseAtlas3AllRatio \ -clump=$HOME/kent/src/hg/makeDb/hgMedianMicroarray/gnfMOE430v2.ra # add those ratios to the probe locations to make a bed 15 microarray type 
bedMergeExpData hgFixed.gnfMouseAtlas3AllRatio mm9.affyMOE430v2Probes \ gnfMouseAtlas3AllRatio.bed # no longer need this table # do not need this table for the genome browser display hgsql -e "drop table affyMOE430v2Probes;" mm9 hgLoadBed mm9 gnfMouseAtlas3 gnfMouseAtlas3AllRatio.bed hgMapToGene mm9 gnfMouseAtlas3 knownGene \ knownToGnfMouseAtlas3 '-type=bed 12' time hgExpDistance mm9 hgFixed.gnfMouseAtlas3AllRatio \ hgFixed.gnfMouseAtlas3AllExps gnfMouseAtlas3Distance \ -lookup=knownToGnfMouseAtlas3 # Have 45036 elements in hgFixed.gnfMouseAtlas3AllRatio # Got 39872 unique elements in hgFixed.gnfMouseAtlas3AllRatio # Loaded gnfMouseAtlas3Distance # real 34m56.844s # user 58m1.892s # sys 1m44.821s # Take the median value over multiple replicants creating # hgFixed.gnfMouseAtlas3MedianRatio and gnfMouseAtlas3MedianExps cd ../hgMedianMicroarray hgMedianMicroarray hgFixed gnfMouseAtlas3AllRatio gnfMouseAtlas3AllExps \ $HOME/kent/src/hg/makeDb/hgMedianMicroarray/gnfMOE430v2.ra \ gnfMouseAtlas3MedianRatio gnfMouseAtlas3MedianExps -minExps=1 # Also make a median version of the absolute measurements hgMedianMicroarray hgFixed gnfMouseAtlas3All gnfMouseAtlas3AllExps \ $HOME/kent/src/hg/makeDb/hgMedianMicroarray/gnfMOE430v2.ra \ gnfMouseAtlas3AllMedian gnfMouseAtlas3AllMedianExps -minExps=1 time hgExpDistance mm9 hgFixed.gnfMouseAtlas3MedianRatio \ hgFixed.gnfMouseAtlas3MedianExps gnfMouseAtlas3MedianDistance \ -lookup=knownToGnfMouseAtlas3 # Have 45037 elements in hgFixed.gnfMouseAtlas3MedianRatio # Got 39872 unique elements in hgFixed.gnfMouseAtlas3MedianRatio XXX - working Mon Nov 24 10:01:43 PST 2008 # real 16m5.102s # user 41m54.581s # sys 1m28.595s # 182 experiments # Convert these to ratios using the median of medians of non-cancerous # cell types as the denominator as so: cd ~/src/hg/makeDb/hgRatioMicroarray cd ../hgMedianMicroarray # create tables gnfMOE430v2MedianRatio gnfMOE430v2MedianExps in hgFixed hgMedianMicroarray hgFixed gnfMOE430v2AllRatio 
gnfMOE430v2AllExps \ gnfMOE430v2.ra gnfMOE430v2MedianRatio gnfMOE430v2MedianExps -minExps=1 # Also make a median version of the absolute measurements # create gnfMOE430v2Median hgMedianMicroarray hgFixed gnfMOE430v2All gnfMOE430v2AllExps \ gnfMOE430v2.ra gnfMOE430v2Median gnfMOE430v2MedianExps -minExps=1 cd /hive/data/genomes/mm9/bed/affyMOE430v2 # Load up microarray track hgMapMicroarray gnfMOE430v2.bed hgFixed.gnfMOE430v2MedianRatio \ affyMOE430v2.psl # Loaded 45037 rows of expression data from hgFixed.gnfMOE430v2MedianRatio # Mapped 44106, multiply-mapped 2087, missed 0, unmapped 931 hgLoadBed mm9 gnfMOE430v2 gnfMOE430v2.bed # Loaded 46193 elements of size 15 ####################################### hgExpDistance mm9 hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m # Convert these to ratios using the median of medians of non-cancerous # cell types as the denominator as so: cd ~/src/hg/makeDb/hgRatioMicroarray hgRatioMicroarray gnfMouseAtlas2All gnfMouseAtlas2AllRatio -clump=../hgMedianMicroarray/gnfMouseAtlas2.ra # Take the median value over multiple replicants and put in this table: cd ../hgMedianMicroarray hgMedianMicroarray hgFixed gnfMouseAtlas2AllRatio gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2MedianRatio gnfMouseAtlas2MedianExps -minExps=1 # Also make a median version of the absolute measurements hgMedianMicroarray hgFixed gnfMouseAtlas2All gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2Median gnfMouseAtlas2MedianExps -minExps=1 ############################################################################ # hgPal downloads ssh hgwdev screen bash rm -rf /cluster/data/mm9/bed/multiz30way/pal mkdir /cluster/data/mm9/bed/multiz30way/pal cd /cluster/data/mm9/bed/multiz30way/pal cat > order.lst < ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst 
stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # real 196m7.752s # user 11m26.917s # sys 3m41.587s zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment pd=/usr/local/apache/htdocs/goldenPath/$db/$mz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz30way gp=knownGene db=mm9 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 216m43.721s # user 18m33.552s # sys 5m42.639s zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc pd=/usr/local/apache/htdocs/goldenPath/$db/$mz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz # now do the canonical set cd /cluster/data/mm9/bed/multiz30way/pal mz=multiz30way gp=knownCanonical db=mm9 for j in `awk '{print $1}' 
/cluster/data/mm9/chrom.sizes` do echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed done mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 192m17.168s # user 10m28.659s # sys 3m53.467s rm *.known.bed zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc db=mm9 mz=multiz30way gp=knownCanonical pd=/usr/local/apache/htdocs/goldenPath/$db/$mz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ############################################################################# # MOUSE TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08) # (to build the affyExonTissues track, see the steps outlined in hg18.txt) ############################################################################# ######################################################################## ## AFFY ALL EXON PROBESETS (HG18/MM9/RN4) (DONE 2009-01-29, Andy) ## (instructions are in hg18.txt) ######################################################################## ################################################ # AUTOMATE 
UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: mm9.upstreamGeneTbl = refGene mm9.upstreamMaf = multiz30way /hive/data/genomes/mm9/bed/multiz30way/species.list ############################################################################# # MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08) (REDONE 2/24/11) ssh hgwdev mkdir /cluster/data/mm9/bed/mrnaPcr cd /cluster/data/mm9/bed/mrnaPcr genePredToBed /cluster/data/mm9/bed/ucsc.12/ucscGenes.gp > ucscGenes.bed hgsql mm9 -NBe 'select kgId,geneSymbol from kgXref' \ | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \ > idSub.txt subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed sequenceForBed -keepName -db=mm9 -bedIn=ucscGenesIdSubbed.bed \ -fastaOut=stdout \ | faToTwoBit stdin kgTargetSeq.2bit cut -f 1-10 /cluster/data/mm9/bed/ucsc.12/ucscGenes.gp \ | genePredToFakePsl mm9 stdin kgTargetAli.psl /dev/null # Load up the UCSC Genes target PSL table and put 2bit in /gbdb:: cd /cluster/data/mm9/bed/mrnaPcr hgLoadPsl mm9 kgTargetAli.psl mkdir /gbdb/mm9/targetDb ln -s /cluster/data/mm9/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/mm9/targetDb/kgTargetSeq12.2bit # Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on # /gbdb/mm9/targetDb/kgTargetSeq12.2bit . 
ssh hgwdev # Add records to hgcentraltest blatServers and targetDb: hgsql hgcentraltest -e \ 'INSERT into blatServers values ("mm9Kg", "blat13", 17805, 0, 1);' hgsql hgcentraltest -e \ 'INSERT into targetDb values("mm9Kg", "UCSC Genes", \ "mm9", "kgTargetAli", "", "", \ "/gbdb/mm9/targetDb/kgTargetSeq.2bit", 1, now(), "");' ############################################################################# # TEST BLASTZ with Rn5 (DONE - 2008-11-26,30 - Hiram) mkdir /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26 cd /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26 cat << '_EOF_' > DEF # mouse vs rat # Specially tuned blastz parameters from Webb Miller BLASTZ=blastz BLASTZ_ABRIDGE_REPEATS=0 BLASTZ_O=600 BLASTZ_E=55 BLASTZ_Y=15000 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/scratch/data/blastz/mouse_rat.q # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat Rn5 SEQ2_DIR=/scratch/data/rn5/rn5.2bit SEQ2_LEN=/scratch/data/rn5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm9/bed/blastzRn5.2008-11-26 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen cd /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=medium \ -stop=net `pwd`/DEF > do.log 2>&1 & # real 403m22.371s time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \ -debug -chainMinScore=5000 -chainLinearGap=medium \ -continue=load -stop=load `pwd`/DEF > load.log 2>&1 & # real 44m59.528s cat fb.mm9.chainRn5BlastzLink.txt # 1751593467 bases of 2620346127 (66.846%) in intersection cat /cluster/data/mm9/bed/blastzRn4.2007-08-31/fb.mm9.chainRn4Link.txt # 1713186474 bases of 2620346127 (65.380%) in intersection mkdir /hive/data/genomes/rn5/bed/blastz.mm9.swap cd 
/hive/data/genomes/rn5/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26/DEF \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=medium \ -swap -stop=net > swap.log 2>&1 & # real 63m51.690s time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26/DEF \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=medium \ -debug -swap -continue=load -stop=load > load.log 2>&1 & cat fb.rn5.chainMm9BlastzLink.txt # 1901280009 bases of 3372561689 (56.375%) in intersection ############################################################################# # AFFY EXON PROBE LIFT MM8->MM9 (DONE, 2008-12-17 Andy) ssh hgwdev cd /hive/data/genomes/mm9/bed mkdir affyMoEx1 cd affyMoEx1/ echo "select * from affyMoEx1Probe" | \ hgsql mm8 | tail +2 | cut -f2- > mm8.affyMoEx1Probe.bed liftOver mm8.affyMoEx1Probe.bed /gbdb/mm8/liftOver/mm8ToMm9.over.chain.gz \ affyMoEx1Probe.bed unmapped.txt grep Partially unmapped.txt | wc -l #199 grep Split unmapped.txt | wc -l #190 grep Deleted unmapped.txt | wc -l #354 wc -l mm8.affyMoEx1Probe.bed #4549897 ## Out of 4.5 million probes in mm8, we've lost 743 in different ways ## attempting to lift. That's an acceptable number. hgLoadBed mm9 affyMoEx1Probe{,.bed} echo "select * from affyMoEx1Transcript" | \ hgsql mm8 | tail +2 | cut -f2- > mm8.affyMoEx1Transcript.bed liftOver mm8.affyMoEx1Transcript.bed /gbdb/mm8/liftOver/mm8ToMm9.over.chain.gz \ affyMoEx1Transcript.bed unmapped.txt hgLoadBed mm9 affyMoEx1Transcript{,.bed} ## Put unlifted IDs into a downloadable file. 
mkdir /usr/local/apache/htdocs/goldenPath/mm9/unlifted grep -A1 Deleted unmapped.txt | grep "^chr" > affyMoEx1Probe.mm8Deleted.bed grep -A1 Partially unmapped.txt | grep "^chr" > affyMoEx1Probe.mm8PartiallyDeleted.bed grep -A1 Split unmapped.txt | grep "^chr" > affyMoEx1Probe.mm8Split.bed grep -A1 Deleted unmappedTranscripts.txt | grep "^chr" > affyMoEx1Transcript.mm8Deleted.bed grep -A1 Partially unmappedTranscripts.txt | grep "^chr" > affyMoEx1Transcript.mm8PartiallyDeleted.bed cp affyMoEx1*.mm8*.bed /usr/local/apache/htdocs/goldenPath/mm9/unlifted ## mm8 and mm9 track descriptions differ: ## 1. Copy mouse/trackDb.ra setting to mouse/mm9/trackDb.ra and add ## origAssembly mm8 line. ## 2. Make a new paragraph in a new affyMouseExon.html in mm9 to include ## details about the lift and how many didn't lift. ############################################################################# # HUMAN (hg18) PROTEINS TRACK (DONE braney 2009-04-07) # bash if not using bash shell already ssh kolossus mkdir /cluster/data/mm9/blastDb cd /cluster/data/mm9 awk '{if ($2 > 1000000) print $1}' mm9Chroms_RandomContigs.hard.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst mm9Chroms_RandomContigs.hard.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' mm9Chroms_RandomContigs.hard.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst mm9Chroms_RandomContigs.hard.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y cd blastDb for i in *.fa do /hive/data/outside/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 2712 mkdir -p /cluster/data/mm9/bed/tblastn.hg18KG cd /cluster/data/mm9/bed/tblastn.hg18KG echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 2712 query.lst # we want around 250000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(250000/`wc query.lst | awk '{print $1}'`\) # 36727/(250000/2712) = 398.414496 mkdir -p kgfa split -l 398 
/cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/mm9/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/hive/data/outside/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/mm9/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh swarm cd /cluster/data/mm9/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 252216 of 252216 jobs # CPU time in finished jobs: 14882096s 248034.93m 4133.92h 172.25d 0.472 y # IO & Wait Time: 1019014s 16983.57m 283.06h 11.79d 0.032 y # Average job time: 63s 1.05m 0.02h 0.00d # Longest finished job: 184s 3.07m 0.05h 0.00d # Submission to last job: 15667s 261.12m 4.35h 0.18d ssh swarm cd /cluster/data/mm9/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS ../blastOut/kg?? 
> chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining para create chainSpec para try, check, push, check etc. # Completed: 93 of 93 jobs # CPU time in finished jobs: 5736s 95.59m 1.59h 0.07d 0.000 y # IO & Wait Time: 21289s 354.82m 5.91h 0.25d 0.001 y # Average job time: 291s 4.84m 0.08h 0.00d # Longest finished job: 472s 7.87m 0.13h 0.01d # Submission to last job: 496s 8.27m 0.14h 0.01d cd /cluster/data/mm9/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort u.*.psl m60* | uniq > ../unliftBlastHg18KG.psl cd .. pslCheck unliftBlastHg18KG.psl liftUp -nohead temp.psl ../../jkStuff/mm9.contigs.lift carry unliftBlastHg18KG.psl sort -T /tmp -k 14,14 -k 16,16n -k 17,17n temp.psl > blastHg18KG.psl rm temp.psl pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/mm9/bed/tblastn.hg18KG hgLoadPsl mm9 blastHg18KG.psl # check coverage featureBits mm9 blastHg18KG # 30285278 bases of 2620346127 (1.156%) in intersection featureBits mm9 knownGene:cds blastHg18KG -enrichment # knownGene:cds 1.278%, blastHg18KG 1.156%, both 0.969%, cover 75.86%, enrich 65.64x featureBits mm9 refGene:cds blastHg18KG -enrichment # refGene:cds 1.205%, blastHg18KG 1.156%, both 0.940%, cover 78.04%, enrich 67.52x rm -rf blastOut #end tblastn ############################################################################# # LASTZ Swap Human Hg19 (DONE - 2009-05-14 - Hiram) # the original cd /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13 cat fb.hg19.chainMm9Link.txt # 1022734273 bases of 2897316137 (35.299%) in intersection # and the swap mkdir /hive/data/genomes/mm9/bed/blastz.hg19.swap cd /hive/data/genomes/mm9/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13/DEF \ -swap -noLoadChainSplit -syntenicNet \ 
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 131m58.763s cat fb.mm9.chainHg19Link.txt # 1013880568 bases of 2620346127 (38.693%) in intersection ############################################################################# # RE-BUILD miRNA TRACK (DONE, 2009-06-09-2009-06-11, hartera) # The miRNA track from miRBase is out of date so update the track. mkdir -p /hive/data/genomes/mm9/bed/miRNA-2009-06-09 cd /hive/data/genomes/mm9/bed/miRNA-2009-06-09 # Download GFF file of latest miRNA annotations from miRBase at the # Wellcome Trust Sanger Institute (WTSI). This is Release 13.0. # (March 2009) wget --timestamping \ ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/CURRENT/genomes/mmu.gff # Re-format, need to add "chr" to the beginning of each line. sed -e 's/^/chr/' mmu.gff > mmMirBaseFormat.gff # Remove extra "chr" in comment lines perl -pi.bak -e 's/chr#/#/' mmMirBaseFormat.gff # Change chrMT to chrM perl -pi.bak -e 's/chrMT/chrM/' mmMirBaseFormat.gff # Remove all but ID name in last field sed -e 's/\";//g' mmMirBaseFormat.gff | sed -e 's/ID=\"/transcript_id=/g' \ | sed -e 's/ACC=\"MI[0-9]*\s//' > mmMirBaseFormatIdOnly.gff # Load into database. ldHgGene -exon=miRNA mm9 miRNARel13 mmMirBaseFormatIdOnly.gff # Does not load as mmu-mir-692-2 is on two chroms, chr4 and chr13. # These are alignments not genePreds so convert to BED for loading into # the database. sed -e 's/\";//g' mmMirBaseFormat.gff | sed -e 's/ID=\"//g' \ | sed -e 's/ACC=\"MI[0-9]*\s//' > mmMirBaseFormatIdOnly.gff # chr1 . miRNA 20669091 20669163 . + # . mmu-mir-206 # use score 960 for + strand and 480 for - strand. This will show # up black on the track for + strand and grey for - strand. # Re-do below and re-load track as appears off by 1 compared to # Ensembl track and other miRNA resources (2009-06-11) # Confirmed with Sam Griffiths-Jones that the coordinates in the # GFF file are 1-based. (2009-06-12). 
awk 'BEGIN {FS="\t"} {OFS="\t"} \ {if ($0 !~ /#/ && $7 == "+") print $1, $4-1, $5, $9, 960, $7; \ else if ($0 !~ /#/ && $7 == "-") print $1, $4-1, $5, $9, 480, $7;}' \ mmMirBaseFormatIdOnly.gff > mmMirBaseFormatIdOnly.bed # Remove previous table hgsql -e 'drop table miRNA' mm9 hgLoadBed mm9 miRNA mmMirBaseFormatIdOnly.bed # Reading mmMirBaseFormatIdOnly.bed # Loaded 568 elements of size 6 # Sorted # Creating table definition for miRNARel13 # Saving bed.tab # Loading mm9 hgsql -e 'select count(*) from miRNA;' mm9 # 568 # The previous version had 493 miRNAs. hgsql -e 'select count(distinct name) from miRNA;' mm9 # 541 # The previous version had 466 unique miRNAs. ############################################################################ # Re-Run equCab2 alignment (DONE - 2009-06-29,07-02 - Hiram) mkdir /hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29 cd /hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29 cat << '_EOF_' > DEF # Mouse vs. Horse BLASTZ_M=50 # TARGET: Mouse MM9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 360m10.094s time doBlastzChainNet.pl `pwd`/DEF \ -continue=chainMerge -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 & # real 225m4.178s cat fb.mm9.chainEquCab2Link.txt # 912421053 bases of 2620346127 (34.821%) in intersection mkdir 
/hive/data/genomes/equCab2/bed/blastz.mm9.swap cd /hive/data/genomes/equCab2/bed/blastz.mm9.swap time doBlastzChainNet.pl \ /hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29/DEF \ -swap -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 122m25.314s cat fb.equCab2.chainMm9Link.txt # 902295813 bases of 2428790173 (37.150%) in intersection ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ # VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-07-30 - 2009-09-09, hartera) # Needs updating as the current version is build 31 from May 2008. # 2009-08-03 (hartera) - Added code to register track handler for # vegaGeneComposite. # 2009-08-15 - 2009-08-16 (hartera) - Added code to allow use of radio buttons # on the configuration page for the track item labels. Modified code so it # can be shared with Ensembl to create the links to Vega transcript, gene # and protein reports on the details pages. # 2009-08-22 - Finished code for adding Vega report URLs to the details pages. # Loaded the vegaGtp table. # 2009-09-01 and 2009-09-03 (hartera). Loaded a vegaPep table for the protein # sequence link on the details pages. # 2009-09-04 Re-load all tables as some reverted to the older version during # mySQL 5 upgrade. # 2009-09-08 - 2009-09-09 Code change to change message on details page when # no protein is available and change to trackDb to make vegaGene items a # darker blue colour.
Reloaded vegaPep after removing proteins whose # transcripts are not in vegaGtp to make all.joiner happy. mkdir -p /hive/data/genomes/mm9/bed/vega35 cd /hive/data/genomes/mm9/bed/vega35 # Download the VEGA genes for mouse from the ftp site # This file is from 03/17/09. wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/vega/mouse/gtf_file.gz" # add chr in front of chromosome names and lift up the randoms # processing similar to the same processing for Ensembl genes, # from /cluster/data/mm9/bed/ensGene.49/process/doProcess.csh cp -p /cluster/data/mm9/bed/ensGene.49/process/randoms.mm9.lift . zcat gtf_file.gz \ | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \ | liftUp -type=.gtf stdout randoms.mm9.lift carry stdin \ | gzip > allGenes.gtf.gz # Got 189 lifts in randoms.mm9.lift gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \ | gzip > mm9.allGenes.gp.gz /cluster/home/hartera/kent/src/hg/utils/automation/extractGtf.pl \ infoOut.txt > ensGtp.tab genePredCheck -db=mm9 mm9.allGenes.gp.gz # checked: 59381 failed: 0 zcat allGenes.gtf.gz | grep -i pseudo > pseudo.gtf zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf # Modify the GTF files so that the gene name goes into the # name2 field of the genePred. perl -pi.bak -e 's/gene_id/other_gene_id/' *pseudo.gtf perl -pi.bak -e 's/gene_name/gene_id/' *pseudo.gtf gtfToGenePred -genePredExt pseudo.gtf pseudo.gp gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp genePredCheck -db=mm9 pseudo.gp # checked: 4305 failed: 0 genePredCheck -db=mm9 not.pseudo.gp # checked: 55076 failed: 0 hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp # clean up rm *.bak # 2009-08-03 (hartera) # Added code to src/hg/hgTracks/simpleTracks.c to register a track # handler for vegaGeneComposite that is now used for this data. This used # vegaGeneMethods to display the name2 field (gene) as the item label in # the track.
# 2009-08-15 - 2009-08-16 (hartera) # Information extracted from the attributes in the GTF file as ensGtp so # change name to vegaGtp. mv ensGtp.tab vegaGtp.tab # ensGtp table definition is in ~/kent/src/hg/lib/ensGtp.sql # There is an index on the protein field so it can not be NULL. # If there is no protein, the gene name is given. # Added code to hgTracks.c and hgTrackUi.c to allow the use of # radio buttons on the track configuration page to select the # gene name, accession or both to be displayed in the track. # The gene name is displayed by default. # Added code to hgc.c so that Ensembl and Vega can share code to # create links on the details pages to the Vega reports for transcript, # gene and protein through these IDs. Created new function # printEnsemblOrVegaCustomUrl(). # 2009-08-22 (hartera) # Loaded the vegaGtp table. Use ensGtp.sql to create the table. # vegaGtp associates geneId/transcriptId/proteinId # for the links to Vega reports from the details page. cd /hive/data/genomes/mm9/bed/vega35 cp ~/kent/src/hg/lib/ensGtp.sql . # 11 of the gene names for noncoding transcripts are too long for the # protein ID field so change this field in ensGtp.sql to allow 40 chars # instead of 20 and re-load the table. hgsql -e 'drop table vegaGtp;' mm9 hgLoadSqlTab mm9 vegaGtp ensGtp.sql vegaGtp.tab # Loaded successfully # Added code to hgc.c to use printEnsemblOrVegaCustomUrl() in # doVegaGene() to add the links to Vega reports on the details pages. # Code was added so that there is no protein sequence link on the details # page if there is none available e.g. noncoding. # 2009-09-01 (hartera) # Coding genes are displaying the message that there is no protein # prediction available. Need to add a vegaPep table.
cd /hive/data/genomes/mm9/bed/vega35 # Download the protein FASTA file for Vega35 wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/mouse/pep/*.tot.fa.gz" # from the Ensembl process: zcat Mus_musculus.VEGA.mar.pep.tot.fa.gz \ | sed -e 's/^>.* Transcript:/>/;' | gzip > vegaPep.txt.gz zcat vegaPep.txt.gz \ | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \ | sed -e '/^$/d; s/*$//' | sort > vegaPep.mm9.fa.tab # Load table (2009-09-03, hartera) hgPepPred mm9 tab vegaPep vegaPep.mm9.fa.tab # Add vegaPep to the trackDb.ra entry for the vegaGeneComposite track # in the type line for src/hg/makeDb/trackDb/mouse/mm9/trackDb.ra. # Check that the vegaPep table looks ok and then check protein-coding and # noncoding transcript details pages for protein links. # 2009-09-04, hartera # Re-load tables after upgrade to mySQL 5 as they had reverted back to # tables with the previous Vega dataset. cd /hive/data/genomes/mm9/bed/vega35 hgsql -e 'drop table vegaGene;' mm9 hgsql -e 'drop table vegaPseudoGene;' mm9 hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp hgsql -e 'drop table vegaGtp;' mm9 hgLoadSqlTab mm9 vegaGtp ensGtp.sql vegaGtp.tab hgsql -e 'drop table vegaPep;' mm9 hgPepPred mm9 tab vegaPep vegaPep.mm9.fa.tab # 2009-09-08 (hartera). Changed message in code for details page when no # protein sequence is available to be more explanatory. "Non-protein # coding gene or gene fragment, no protein prediction available." Changed # the colouring for the vegaGene subtrack to be darker blue so there is # more of a contrast between vegaGene and vegaPseudoGene subtracks. # 2009-09-09 (hartera) - re-loaded vegaPep table with only those proteins # that have a transcript ID in vegaGtp. # all.joiner is complaining as there are about 1,000 extra proteins in # vegaPep that do not have transcripts in vegaGtp. Decided to remove these # and e-mailed the HAVANA group to ask about the discrepancy. 
cd /hive/data/genomes/mm9/bed/vega35 awk '{print $2}' vegaGtp.tab | sort | uniq > vegaGtp.tx.ids awk '{print $1}' vegaPep.mm9.fa.tab | sort | uniq > vegaPep.tx.ids wc -l *.tx.ids # 59381 vegaGtp.tx.ids # 30956 vegaPep.tx.ids # Number of transcripts that have a protein ID: hgsql -Ne 'select transcript from vegaGtp where protein like "OTTMUSP%";' \ mm9 | sort | uniq > vegaGtpWithProt.tx.ids wc -l vegaGtpWithProt.tx.ids # 29902 vegaGtpWithProt.tx.ids # find those that are common to both. comm -12 vegaGtp.tx.ids vegaPep.tx.ids > pepandGtp.tx.ids wc -l pepandGtp.tx.ids # 29902 pepandGtp.tx.ids comm -12 pepandGtp.tx.ids vegaGtpWithProt.tx.ids | wc -l # 29902 # Therefore all the vegaGtp transcripts with a protein ID are in the # protein FASTA file. hgsql -Ne 'select * from vegaPep as p, vegaGtp as g where g.protein \ like "OTTMUSP%" and p.name = g.transcript;' mm9 \ > vegaPepOnlyInGtp.mm9.fa.tab wc -l vegaPepOnlyInGtp.mm9.fa.tab # 29902 vegaPepOnlyInGtp.mm9.fa.tab hgsql -e 'drop table vegaPep;' mm9 hgPepPred mm9 tab vegaPep vegaPepOnlyInGtp.mm9.fa.tab ############################################################################ # Blastz Elephant loxAfr3 (DONE - 2009-08-12 - Hiram) mkdir /hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12 cd /hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12 cat << '_EOF_' > DEF # Mouse vs. 
Elephant BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Elephant loxAfr3 SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/lastzLoxAfr3.2009-08-12 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & # real 498m44.261s cat fb.mm9.chainLoxAfr3Link.txt # 684326090 bases of 2620346127 (26.116%) in intersection # trying syntenic nets time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 & # about 20 minutes mkdir /hive/data/genomes/loxAfr3/bed/blastz.mm9.swap cd /hive/data/genomes/loxAfr3/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12/DEF \ -swap -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -syntenicNet > swap.log 2>&1 & # real 123m9.342s cat fb.loxAfr3.chainMm9Link.txt # 673856452 bases of 3118565340 (21.608%) in intersection ######################################################################### ## NIA Mouse Gene Index - (DONE, Fan, 9/9/09) # NOTE FOR NEXT TIME: this track fails pslCheck because every row in the # NIAGene table has a tSize of 198000000. Future tables should contain the # proper chromosome lengths in the tSize field. 
(Brooke, 2/22/10) ssh hgwdev mkdir -p /cluster/data/mm9/bed/NIAGene090903 cd /cluster/data/mm9/bed ln -s NIAGene090903 NIAGene cd NIAGene wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex/mm9/download/T-fasta.ff.gz wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex/mm9/download/T-psl.txt.gz gzip -d *.gz cut -f 1-21 T-psl.txt >NIAGene.tab hgLoadPsl mm9 NIAGene.tab mkdir /gbdb/mm9/NIAGene ln -s /cluster/data/mm9/bed/NIAGene/T-fasta.fa /gbdb/mm9/NIAGene/T-fasta.fa hgLoadSeq mm9 /gbdb/mm9/NIAGene/T-fasta.fa #Creating seq.tab file #Adding /gbdb/mm9/NIAGene/T-fasta.fa #257758 sequences #Updating seq table #Warning: load of seq did not go as planned: 257758 record(s), 0 row(s) skipped, 257758 warning(s) loading ./seq.tab #Advisory lock has been released #All done # not sure what the warnings are about, but the track seems working. # Create/edit/check in NIAGene.html and trackDb.ra under kent/src/hg/makeDb/trackDb/mouse/mm9 ##################################################################### # LASTZ Tetraodon TetNig2 (DONE - 2009-09-15 - Hiram) mkdir /hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15 cd /hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15 cat << '_EOF_' > DEF # mouse vs tetraodon BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Tetraodon TetNig2 - single chunk big enough to single largest item SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 
doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -qRepeats=windowmaskerSdust \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # about 124 minutes cat fb.mm9.chainTetNig2Link.txt # 45642112 bases of 2620346127 (1.742%) in intersection # running the swap mkdir /hive/data/genomes/tetNig2/bed/blastz.mm9.swap cd /hive/data/genomes/tetNig2/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15/DEF \ -qRepeats=windowmaskerSdust \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap > swap.log 2>&1 & # real 10m34.797s cat fb.tetNig2.chainMm9Link.txt # 41176381 bases of 302314788 (13.620%) in intersection ############################################################################## # BUILD REST TRACK (DONE 9/16/09, Fan) mkdir /hive/data/genomes/mm9/bed/REST cd /hive/data/genomes/mm9/bed/REST # Receive bed data file, REST_ChIP_PET_mm9.bed, # from Rory JOHNSON [johnsonrb@gis.a-star.edu.sg]. hgLoadBed mm9 REST REST_ChIP_PET_mm9.bed # Discovered mm9's extFile and history tables were out of sync. # Bob and Hiram fixed the problem. Reload and it was successful. # Created REST.html based on Rory's original doc and later updates. # Added track definition and search term into trackDb/mouse/mm9/trackDb.ra # Fix the 0 base problem. (Fan 9/20/09, per Rory's email) hgsql mm9 -e 'update rest set chromStart = chromStart -1' ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details.
############################################################################ # ADD LINK TO GENENETWORK (DONE. 11/06/09 Fan). # Received geneNetwork ID list file, GN_mouse_RefSeq.txt, for mm9 from # GeneNetwork, Zhou Xiaodong [xiaodong.zhou@gmail.com]. ssh hgwdev mkdir -p /cluster/data/mm9/bed/geneNetwork cd /cluster/data/mm9/bed/geneNetwork hgsql mm9 < ~/src/hg/lib/geneNetworkId.sql hgsql mm9 -e \ 'load data local infile "GN_mouse_RefSeq.txt" into table geneNetworkId' ######################################################################### # LASTZ/CHAIN/NET swap danRer6 (DONE - 2009-12-18 - Galt) # original alignment to danRer6 cd /hive/data/genomes/danRer6/bed/lastzMm9.2009-12-17 cat fb.danRer6.chainMm9Link.txt # 77099032 bases of 1506896106 (5.116%) in intersection # running the swap - DONE - 2009-12-18 mkdir /hive/data/genomes/mm9/bed/blastz.danRer6.swap cd /hive/data/genomes/mm9/bed/blastz.danRer6.swap time nice +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer6/bed/lastzMm9.2009-12-17/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap >& swap.log & # real 183m21.102s cat fb.mm9.chainDanRer6Link.txt # 73444297 bases of 2620346127 (2.803%) in intersection ####################################################################### # Vega gene update (DONE - 2010-01-15 - Hiram) # lookup version number at the Vega WEB site: # http://vega.sanger.ac.uk/index.html # and FTP site: # ftp://ftp.sanger.ac.uk/pub/vega/ cd /hive/data/genomes/mm9 # step wise to verify operation doEnsGeneUpdate.pl -vegaGene -ensVersion=37 -stop=download mm9.ensGene.ra doEnsGeneUpdate.pl -vegaGene -ensVersion=37 \ -continue=process -stop=process mm9.ensGene.ra doEnsGeneUpdate.pl -vegaGene -ensVersion=37 \ -continue=load -stop=load mm9.ensGene.ra doEnsGeneUpdate.pl -vegaGene -ensVersion=37 \ -continue=cleanup mm9.ensGene.ra featureBits mm9 vegaGene # 53838752 bases of 2620346127 (2.055%) in 
intersection featureBits mm9 vegaPseudoGene # 3060300 bases of 2620346127 (0.117%) in intersection ######################################################################## # Blastz Rabbit oryCun2 (DONE - 2010-01-15 - Hiram) ssh hgwdev screen # use screen to control this job mkdir /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15 cd /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15 cat << '_EOF_' > DEF # Mouse vs. Rabbit BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rabbit at chunk 20,000,000 all but 36 contigs can fit in a single job SEQ2_DIR=/scratch/data/oryCun2/oryCun2.2bit SEQ2_LEN=/scratch/data/oryCun2/chrom.sizes SEQ2_CTGDIR=/scratch/data/oryCun2/oryCun2.contigs.2bit SEQ2_CTGLEN=/scratch/data/oryCun2/oryCun2.contigs.sizes SEQ2_LIFT=/hive/data/genomes/oryCun2/contigs/oryCun2.contigs.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & cat fb.mm9.chainOryCun2Link.txt # 670229789 bases of 2620346127 (25.578%) in intersection # 496428446 bases of 2620346127 (18.945%) in intersection time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \ -continue=syntenicNet -bigClusterHub=swarm \ -syntenicNet > syntenicNet.log 2>&1 & # about 20 minutes # create reciprocal best chains/nets ssh hgwdev cd /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15 # this needs blastz.oryCun2 symlink to function time nice -n +19 doRecipBest.pl mm9 oryCun2 > rbest.log 2>&1 & # real 37m32.151s mkdir /hive/data/genomes/oryCun2/bed/blastz.mm9.swap cd /hive/data/genomes/oryCun2/bed/blastz.mm9.swap time nice -n +19 
doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15/DEF \ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \ -swap -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 84m6.571s cat fb.oryCun2.chainMm9Link.txt # 669602734 bases of 2604023284 (25.714%) in intersection ######################################################################### # ailMel1 Panda alignment (DONE - 2010-02-04 - Hiram) mkdir /hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04 cd /hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04 cat << '_EOF_' > DEF # Mouse vs. Panda # parameters from the Panda paper supplemental where they describe # their lastz parameters BLASTZ_K=2200 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_H=2000 BLASTZ_C=2 BLASTZ_T=2 # our usual M BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Panda SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 501m27.760s cat fb.mm9.chainAilMel1Link.txt # 749595031 bases of 2620346127 (28.607%) in intersection mkdir /hive/data/genomes/ailMel1/bed/blastz.mm9.swap cd /hive/data/genomes/ailMel1/bed/blastz.mm9.swap time doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04/DEF \ -swap -noLoadChainSplit -bigClusterHub=swarm -smallClusterHub=memk \ -workhorse=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 54m57.140s cat fb.ailMel1.chainMm9Link.txt # 739076250 bases of 2245312831 (32.916%) in intersection 
############################################################################ # susScr1 Pig BLASTZ/CHAIN/NET (DONE - 2010-01-21,22 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21 cd /hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21 cat << '_EOF_' > DEF # Pig vs. Mouse BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Pig SusScr1 SEQ2_DIR=/scratch/data/susScr1/susScr1.2bit SEQ2_LEN=/scratch/data/susScr1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 875m26.114s cat fb.mm9.chainSusScr1Link.txt # 616833828 bases of 2620346127 (23.540%) in intersection mkdir /hive/data/genomes/susScr1/bed/blastz.mm9.swap cd /hive/data/genomes/susScr1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 69m27.221s cat fb.susScr1.chainMm9Link.txt # 656445475 bases of 2231332019 (29.419%) in intersection ######################################################################### # CRG MAPABILITY (2010-02-05 - 2010-02-09, hartera, DONE) # Data was provided by Thomas Derrien (thomas.derrien.crg.es) and Paolo Ribeca # from the Guigo lab at the Center for Genomic Regulation (CRG) in Barcelona # on 2010-02-04. 
# Data was produced using their GEM mapper aligner taking a sliding k-mer # window of the mouse genome that were mapped back onto the genome with up # to 2 mismatches. For each window, a mappability score is computed # S = 1/(nb of match_found) and the BigWig index was created according to # this score. # 2010-02-09. Loaded database and added data to /gbdb/ # Added trackDb entry for the Mapability track. # 2010-04-02. Replaced the Mapability 40mer subtrack bigWig file with a new one # provided by CRG as the old file had regions with missing data. # 2010-04-28. Received new data from Thomas Derrien. Downloaded data and # added it to /gbdb/. A bug was found in a library used by bedGraphToBigWig so # sent a new binary to data providers and they re-created the bigWig files. mkdir -p /hive/data/genomes/mm9/bed/crgMapability cd /hive/data/genomes/mm9/bed/crgMapability cat << 'EOF' > temp #!/bin/tcsh -ef http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-36_mm9.bw.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-50_mm9.bw.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-75_mm9.bw.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-100_mm9.bw.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-40_mm9.bw.bz2 'EOF' awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \ temp > download.csh rm temp chmod +x download.csh ./download.csh >& download.log & # Add the data to /gbdb/ and load the file names into tables (2010-01-26) cd /hive/data/genomes/mm9/bed/crgMapability bunzip2 *.bz2 # Add data to gbdb mkdir -p /gbdb/mm9/bbi/ # Symlink files with names as crgMapabilityAlignXmer.bw to /gbdb/mm9/bbi # and load file name into a table - one per dataset. Each table # represents a subtrack.
foreach f (`ls *.bw`) echo $f set g=`echo $f | cut -d "-" -f2` set num=`echo $g | cut -d "_" -f1` set mer=`echo "${num}mer"` set nf=`echo "crgMapabilityAlign${mer}.bw"` echo $nf ln -s `pwd`/${f} /gbdb/mm9/bbi/${nf} hgsql mm9 -e "drop table if exists crgMapabilityAlign${mer}; \ create table crgMapabilityAlign${mer} (fileName varchar(255) not null); \ insert into crgMapabilityAlign${mer} values ('/gbdb/mm9/bbi/${nf}');" end # Added a trackDb entry for this mapability track in # kent/src/hg/makeDb/trackDb/mouse/mm9/trackDb.ra # use bigWigInfo to check min and max values. Created a mapability.html # description page. # 2010-04-02, hartera # QA found regions of missing data for the 40mer subtrack. Wrote to the # data providers and they said that the original output has no missing # data so they recreated the bigWig file for the 40mer subtrack and a # link to the new file was sent on 2010-04-02. cd /hive/data/genomes/mm9/bed/crgMapability wget --timestamping \ "http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-40_mm9.bw.gz" gunzip M.musculus.genome.mm9.mappability-40_mm9.bw.gz # Remove old file from /gbdb/mm9/bbi and add new file. rm /gbdb/mm9/bbi/crgMapabilityAlign40mer.bw ln -s `pwd`/M.musculus.genome.mm9.mappability-40_mm9.bw \ /gbdb/mm9/bbi/crgMapabilityAlign40mer.bw # Downloaded and added new bigWig files to /gbdb/mm9/bbi # (2010-04-30 and 2010-05-01, hartera). New files were created as # there was a bug in the older version of bedGraphToBigWig.
cd /hive/data/genomes/mm9/bed/crgMapability rm temp download.csh download.log cat << 'EOF' > temp #!/bin/tcsh -ef http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-100_mm9.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-36_mm9.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-40_mm9.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-50_mm9.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-75_mm9.bz2 'EOF' awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \ temp > download.csh rm temp chmod +x download.csh ./download.csh >& download.log & # Add data to /gbdb/. The file names in /gbdb/ are the same as before # so the tables do not need to be reloaded. cd /hive/data/genomes/mm9/bed/crgMapability bunzip2 *.bz2 # File names do not have a *.bw extension so re-name foreach f (`ls *mm9`) echo $f set g=${f}.bw echo $g mv $f $g end # Then symlink to /gbdb/ foreach f (`ls *.bw`) echo $f set g=`echo $f | cut -d "-" -f2` set num=`echo $g | cut -d "_" -f1` set mer=`echo "${num}mer"` set nf=`echo "crgMapabilityAlign${mer}.bw"` echo $nf rm /gbdb/mm9/bbi/${nf} ln -s `pwd`/${f} /gbdb/mm9/bbi/${nf} end ##################################################################### # tRNAs track (2010-03-12, Fan RE-BUILT) # ssh hgwdev cd /hive/data/genomes/mm9/bed mkdir tRNAs cd tRNAs # Get data files from /projects/lowelab/users/lowe/Browser/vertebrates/ cp -p /projects/lowelab/users/lowe/Browser/vertebrates/mm9-tRNAs.bed . cp -p \ /projects/lowelab/users/lowe/Browser/vertebrates/mm9_tRNAs_images.tar . hgsql mm9 -e 'drop table if exists tRNAs' hgLoadBed -tab mm9 tRNAs mm9-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql mkdir gif cd gif tar -xvf ../mm9_tRNAs_images.tar mv images/*.gif . 
rm -rf images mkdir /hive/data/gbdb/mm9/RNA-img rm /hive/data/gbdb/mm9/RNA-img/* cp -p * /hive/data/gbdb/mm9/RNA-img ##################################################################### # LASTZ/CHAIN/NET Marmoset calJac3 (DONE - 2010-02-12 - Hiram) # use a screen to control this job screen mkdir /hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12 cd /hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12 cat << '_EOF_' > DEF # mouse vs marmoset BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Marmoset (calJac3) SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit SEQ2_LEN=/scratch/data/calJac3/chrom.sizes SEQ2_LIMIT=75 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 `pwd`/DEF \ -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # real 445m42.381s cat fb.mm9.chainCalJac3Link.txt # 859869647 bases of 2620346127 (32.815%) in intersection mkdir /hive/data/genomes/calJac3/bed/blastz.mm9.swap cd /hive/data/genomes/calJac3/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 90m38.739s cat fb.calJac3.chainMm9Link.txt # 861811978 bases of 2752505800 (31.310%) in intersection ####################################################################### # felCat4 Cat BLASTZ/CHAIN/NET (DONE - 2010-06-07 - Chin) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07 cd /hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07 cat << '_EOF_' > DEF # mouse vs.
cat # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Cat (felCat4) SEQ2_DIR=/scratch/data/felCat4/felCat4.2bit SEQ2_LEN=/scratch/data/felCat4/chrom.sizes SEQ2_LIMIT=50 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noDbNameCheck \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \ > do.log 2>&1 & # real 1272m46.726s # doBlastzChainNet from step chainRun after para stop, para freeBatch # After para stop para freeBatch in # /hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07/axtChain/run] # rm the run directory, and use memk/swarm this time time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue chainRun \ -syntenicNet -noDbNameCheck \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do_chainRun.log 2>&1 & # real 337m31.606s # *** All done ! Elapsed time: 337m32s # *** Make sure that goldenPath/mm9/vsFelCat4/README.txt is accurate. # *** Add {chain,net}FelCat4 tracks to trackDb.ra if necessary. cat fb.mm9.chainFelCat4Link.txt # 637007193 bases of 2620346127 (24.310%) in intersection # swap mkdir /hive/data/genomes/felCat4/bed/blastz.mm9.swap cd /hive/data/genomes/felCat4/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07/DEF \ -swap -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 176m42.490s # *** All done ! Elapsed time: 176m42s # *** Make sure that goldenPath/felCat4/vsMm9/README.txt is accurate.
# *** Add {chain,net}Mm9 tracks to trackDb.ra if necessary. # real ???? 125m37.926s cat fb.felCat4.chainMm9Link.txt # 616529959 bases of 1990635005 (30.972%) in intersection ##################################################################### # susScr2 Pig BLASTZ/CHAIN/NET (DONE - 2010-03-26,27 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/mm9/bed/lastzSusScr2.2010-03-26 cd /hive/data/genomes/mm9/bed/lastzSusScr2.2010-03-26 cat << '_EOF_' > DEF # Pig vs. Mouse BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Pig SusScr2 SEQ2_DIR=/scratch/data/susScr2/susScr2.2bit SEQ2_LEN=/scratch/data/susScr2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm9/bed/lastzSusScr2.2010-03-26 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # Elapsed time: 717m25s cat fb.mm9.chainSusScr2Link.txt # 616615408 bases of 2620346127 (23.532%) in intersection mkdir /hive/data/genomes/susScr2/bed/blastz.mm9.swap cd /hive/data/genomes/susScr2/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzSusScr2.2010-03-26/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 63m4s cat fb.susScr2.chainMm9Link.txt # 656444411 bases of 2231298548 (29.420%) in intersection ############################################################################ # Vega aka Havana gene update (DONE - 2010-04-07 - Hiram) # Stephen Trevanion st3 sanger ac uk # lookup version number at the Vega WEB site: # http://vega.sanger.ac.uk/index.html # and FTP 
site: # ftp://ftp.sanger.ac.uk/pub/vega/ cd /hive/data/genomes/mm9 # step wise to verify operation doEnsGeneUpdate.pl -vegaGene -ensVersion=38 -stop=download mm9.ensGene.ra doEnsGeneUpdate.pl -vegaGene -ensVersion=38 \ -continue=process -stop=process mm9.ensGene.ra # genePredCheck -db=mm9 vegaPseudo.gp.gz # checked: 4377 failed: 0 # genePredCheck -db=mm9 not.vegaPseudo.gp.gz # checked: 57096 failed: 0 # genePredCheck -db=mm9 mm9.allGenes.gp.gz # checked: 61473 failed: 0 doEnsGeneUpdate.pl -vegaGene -ensVersion=38 \ -continue=load -stop=load mm9.ensGene.ra # "identical to previous version 37" doEnsGeneUpdate.pl -vegaGene -ensVersion=38 \ -continue=cleanup mm9.ensGene.ra featureBits mm9 vegaGene # 53838752 bases of 2620346127 (2.055%) in intersection featureBits mm9 vegaPseudoGene # 3060300 bases of 2620346127 (0.117%) in intersection ##################################################################### # oviAri1 Sheep BLASTZ/CHAIN/NET (DONE - 2010-04-16 - Chin) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/mm9/bed/lastzOviAri1.2010-04-16 cd /hive/data/genomes/mm9/bed/lastzOviAri1.2010-04-16 cat << '_EOF_' > DEF # Sheep vs. 
Mouse BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Sheep OviAri1 SEQ2_DIR=/scratch/data/oviAri1/oviAri1.2bit SEQ2_LEN=/scratch/data/oviAri1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm9/bed/lastzOviAri1.2010-04-16 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 269m58.488s cat fb.mm9.chainOviAri1Link.txt # 406407377 bases of 2620346127 (15.510%) in intersection # and the swap mkdir /hive/data/genomes/oviAri1/bed/blastz.mm9.swap cd /hive/data/genomes/oviAri1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzOviAri1.2010-04-16/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 35m25.217s cat fb.oviAri1.chainMm9Link.txt # 383753361 bases of 1201271277 (31.946%) in intersection ####################################################################### ############################################################################# # ucscRetro track (2010-04-12, baertsch DONE) mkdir -p /hive/users/baertsch/retro/mm9 cd /hive/users/baertsch/retro/mm9 wget http://compbio.soe.ucsc.edu/retrogene/retroFinder-1.16.tar.gz tar xvfz retroFinder-1.16.tar.gz cd retroFinder-1.16/src/pslPseudo make cd ../../.. 
cat << '_EOF_' > DEF RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 " DB=mm9 SCORETHRESH=550 GENOMENAME='Mus musculus' GBDB=mm MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz/ TMPMRNA=/hive/users/baertsch/mrnaBlastz/$DB TMPEST=/hive/users/baertsch/est/$DB EST=all_est SPLICED_EST=intronEst SPLIT_EST=0 SPLIT_SPLICED_EST=1 SCRIPT=/hive/users/baertsch/retro/$DB/retroFinder-1.16/scripts GENOME=/hive/data/genomes/ RETRODIR=$GENOME/$DB/bed/retro BASE=/hive/users/baertsch/retro OUTDIR=/hive/users/baertsch/retro/$DB/ RESULT=$OUTDIR/result LOG=$OUTDIR/log OUT=$OUTDIR/out OVERLAPDIR=$OUTDIR/run.o VERSION=2 TABLE=ucscRetroInfo$VERSION ALIGN=ucscRetroAli$VERSION LOCAL=/scratch/data/$DB NIB=$LOCAL/nib RMSK=x NET1=netHg19 NET2=netCanFam2 NET3=netRn4 GENE1=knownGene GENE2=refGene GENE3=ensGene CLUSTER=swarm SPECIES="hg18 mm9" ROOTDIR="~/public_html/retro/mm9Nov09" EXPDIR=exp GENEPFAM=knownGene PFAM=knownToPfam PFAMIDFIELD=name PFAMDOMAIN=value ARRAY=gnfAtlas2 AFFYPROBE=affyGnf1m ARRAYMEDIAN=hgFixed.gnfMouseAtlas2Median ARRAYRATIO=hgFixed.gnfMouseAtlas2AllRatio ARRAYABS=hgFixed.gnfMouseAtlas2All ARRAYEXP=hgFixed.gnfMouseAtlas2MedianExps ARRAYEXPALL=hgFixed.gnfMouseAtlas2AllExps ARRAYLOOKUP=knownToGnfAtlas2 ARRAYPSLS="/hive/data/genomes/mm9/bed/geneAtlas2/affyGnf1m.psl" ALTSPLICE=sibTxGraph SPLITBYAGE=splitRetrosByAgeMouse PDB=proteins090821 '_EOF_' # << happy emacs #add ./retroFinder-1.16/scripts to PATH retroFinder-1.16/scripts/filterMrna.sh DEF retroFinder-1.16/scripts/filterEst.sh DEF nohup retroFinder-1.16/scripts/ucscRetroStep1.sh DEF #check cluster job nohup retroFinder-1.16/scripts/ucscRetroStep2.sh DEF nohup retroFinder-1.16/scripts/ucscRetroStep3.sh DEF #check cluster job nohup retroFinder-1.16/scripts/ucscRetroStep4.sh DEF nohup retroFinder-1.16/scripts/ucscRetroStep5.sh DEF # Load the track nohup retroFinder-1.16/scripts/ucscRetroStep6.sh DEF #add ucscRetroAli to trackDb.ra ################################################################ # ADD KEGG TABLES (DONE, 
Fan, 6/18/10) mkdir -p /hive/data/genomes/mm9/bed/pathways/kegg cd /hive/data/genomes/mm9/bed/pathways/kegg wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab cat map_title.tab | sed -e 's/\t/\tmmu\t/' > j.tmp cut -f 2 j.tmp >j.mmu cut -f 1,3 j.tmp >j.1 paste j.mmu j.1 |sed -e 's/\t//' > keggMapDesc.tab rm j.mmu j.1 rm j.tmp hgsql mm9 -e 'drop table keggMapDesc' hgsql mm9 < ~/kent/src/hg/lib/keggMapDesc.sql hgsql mm9 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc' wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/mmu/mmu_pathway.list cat mmu_pathway.list| sed -e 's/path://'|sed -e 's/:/\t/' > j.tmp hgsql mm9 -e 'drop table keggPathway' hgsql mm9 < ~/kent/src/hg/lib/keggPathway.sql hgsql mm9 -e 'load data local infile "j.tmp" into table keggPathway' hgsql mm9 -N -e \ 'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \ >keggPathway.tab hgsql mm9 -e 'delete from keggPathway' hgsql mm9 -e 'load data local infile "keggPathway.tab" into table keggPathway' rm j.tmp ################################################################ # Add KEGG column to mm9 Gene Sorter (Done, Fan, 6/18/2010) mkdir -p /hive/data/genomes/mm9/bed/geneSorter cd /hive/data/genomes/mm9/bed/geneSorter hgsql mm9 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' |sort -u|sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab hgsql mm9 -e 'drop table knownToKeggEntrez' hgsql mm9 < ~/kent/src/hg/lib/knownToKeggEntrez.sql hgsql mm9 -e 'load data local infile "knownToKeggEntrez.tab" into table knownToKeggEntrez' ############################################################################# # Update BLASTTAB blast tables (DONE - 2010-08-06 - Fan) ssh hgwdev mkdir -p /hive/data/genomes/mm9/bed/hgNearBlastp/100806 cd /hive/data/genomes/mm9/bed/hgNearBlastp/100806 # Get the proteins used by all hgNear organisms: pepPredToFa hg19 knownGenePep hg19.known.faa pepPredToFa mm9 knownGenePep mm9.known.faa 
pepPredToFa rn4 knownGenePep rn4.known.faa pepPredToFa danRer6 ensPep danRer6.ensPep.faa pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa pepPredToFa ce6 sangerPep ce6.sangerPep.faa pepPredToFa sacCer2 sgdPep sacCer2.sgdPep.faa cat << '_EOF_' > config.ra # Latest mouse vs. other Gene Sorter orgs: # human, rat, zebrafish, worm, yeast, fly targetGenesetPrefix known targetDb mm9 queryDbs hg19 rn4 danRer6 dm3 ce6 sacCer2 recipBest danRer6 dm3 ce6 sacCer2 mm9Fa /hive/data/genomes/mm9/bed/hgNearBlastp/100806/mm9.known.faa hg19Fa /hive/data/genomes/mm9/bed/hgNearBlastp/100806/hg19.known.faa rn4Fa /hive/data/genomes/mm9/bed/hgNearBlastp/100806/rn4.known.faa danRer6Fa /hive/data/genomes/mm9/bed/hgNearBlastp/100806/danRer6.ensPep.faa dm3Fa /hive/data/genomes/mm9/bed/hgNearBlastp/100806/dm3.flyBasePep.faa ce6Fa /hive/data/genomes/mm9/bed/hgNearBlastp/100806/ce6.sangerPep.faa sacCer2Fa /hive/data/genomes/mm9/bed/hgNearBlastp/100806/sacCer2.sgdPep.faa buildDir /hive/data/genomes/mm9/bed/hgNearBlastp/100806 scratchDir /hive/data/genomes/mm9/bed/hgNearBlastp/100806/tmp '_EOF_' doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log # *** All done! 
# *** Check these tables in mm9: # *** knownBlastTab hgBlastTab rnBlastTab drBlastTab dmBlastTab ceBlastTab scBlastTab ######################################################################### # BUILD CGAP PATHWAY TABLES, DONE, Fan 7/6/2010 ssh hgwdev mkdir -p /hive/data/genomes/mm9/bed/cgap/100706 cd /hive/data/genomes/mm9/bed/cgap/100706 # get data file from data source wget --timestamping -O Mm_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Mm_GeneData.dat" # parse the data file hgCGAP Mm_GeneData.dat hgsql mm9 -e "drop table cgapBiocPathway" hgsql mm9 -e "drop table cgapBiocDesc" hgsql mm9 -e "drop table cgapAlias" hgsql mm9 < ~/kent/src/hg/hgCGAP/cgapBiocPathway.sql hgsql mm9 -e 'LOAD DATA local INFILE "cgapBIOCARTA.tab" into table cgapBiocPathway;' hgsql mm9 < ~/kent/src/hg/hgCGAP/cgapBiocDesc.sql cat cgapBIOCARTAdesc.tab|sort -u > cgapBIOCARTAdescSorted.tab hgsql mm9 -e 'LOAD DATA local INFILE "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc;' hgsql mm9 < ~/kent/src/hg/hgCGAP/cgapAlias.sql cat cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab|sort -u > cgapAlias.tab hgsql mm9 -e 'LOAD DATA local INFILE "cgapAlias.tab" into table cgapAlias' ######################################################################### # phyloP conservation for 30-way (DONE - 2010-07-15 - Hiram) # # Vertebrate, Placental, Euarchontoglires # # split SS files into 1M chunks, this business needs smaller files # to complete ssh swarm mkdir /hive/data/genomes/mm9/bed/multiz30way/consPhyloP cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP mkdir ss run.split cd run.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/mm9/bed/multiz30way/maf.split/$c.maf set WINDOWS = /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/run.split/ss/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != 
$NL ) then /cluster/bin/phast.build/cornellCVS/phast.2009-10-19/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x doSplit.csh ls -1S -r ../../maf.split | sed -e "s/.maf//" > maf.list cat << '_EOF_' > template #LOOP doSplit.csh $(path1) {check out exists+ done/$(path1).done} #ENDLOOP '_EOF_' # << happy emacs mkdir ss done ssh memk cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/run.split gensub2 maf.list single template jobList para -ram=8g create jobList # Completed: 75 of 75 jobs # CPU time in finished jobs: 9843s 164.05m 2.73h 0.11d 0.000 y # IO & Wait Time: 2938s 48.97m 0.82h 0.03d 0.000 y # Average job time: 170s 2.84m 0.05h 0.00d # Longest finished job: 393s 6.55m 0.11h 0.00d # Submission to last job: 678s 11.30m 0.19h 0.01d # run phyloP with --method LRT ssh swarm mkdir /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/run.phyloP cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.410 /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \ ../../cons/all/all.mod 0.410 > all.mod grep BACKGROUND ../../cons/euarchontoglires/euarchontoglires.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.410 /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \ ../../cons/euarchontoglires/euarchontoglires.mod 0.410 \ > euarchontoglires.mod grep BACKGROUND ../../cons/placental/placental.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.410 /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \ ../../cons/placental/placental.mod 0.410 > placental.mod cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin set f = $1 set out = $2 set cName = $f:r:r set chrDir = $f:r set n 
= $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/mm9/bed/multiz30way/consPhyloP set tmp = $cons/tmp/$grp/$f rm -fr $tmp mkdir -p $tmp set ssSrc = "$cons/run.split/ss/$chrDir/$f" set useGrp = "$grp.mod" ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $f.wigFix popd > /dev/null mkdir -p $out:h sleep 4 mv $tmp/$f.wigFix $out rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhyloP.csh # Create list of chunks find ../run.split/ss -type f | sed -e "s/.ss$//; s#^../run.split/ss/##" \ > ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(file1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/all cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ... 
para time
# Completed: 2685 of 2685 jobs
# CPU time in finished jobs:     641900s   10698.34m   178.31h    7.43d  0.020 y
# IO & Wait Time:                 19012s     316.86m     5.28h    0.22d  0.001 y
# Average job time:                 246s       4.10m     0.07h    0.00d
# Longest finished job:             484s       8.07m     0.13h    0.01d
# Submission to last job:         31192s     519.87m     8.66h    0.36d

ssh hgwdev
# Work in the mm9 "all species" run directory where wigFix/ was produced.
# The relative paths used below (../../../../chrom.sizes here, and
# ../all/mkDown.csh from the sibling run directories) resolve from this
# location.
cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/all

# the sed | sort | sed trick gets the files sorted so that coordinates
# and chromosomes are in chrom and chromStart order and thus wigEncode
# sees a proper incoming data stream sorted by coordinates.
find ./wigFix -type f \
  | sed -e "s#^./##; s/\./ /g; s/-/ - /g" \
  | sort -k1,1 -k2,2n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list

cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \
  | wigEncode stdin phyloP30way.wig \
      phyloP30way.wib > wigEncode.log 2>&1 &
# Converted stdin, upper limit 5.04, lower limit -10.12

# good test to make sure no overlapping coordinates, bigWig:
# consumes massive amount of memory, in bash raise your memory limits:
ulimit -d 188743680
ulimit -v 188743680
time cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \
  | wigToBigWig stdin ../../../../chrom.sizes phyloP30way.bw &

# if you wanted to use the bigWig file, loading bigWig table:
ln -s `pwd`/phyloP30way.bw /gbdb/mm9/bbi
hgsql mm9 -e 'drop table if exists phyloP30wayAll; \
    create table phyloP30wayAll \
    (fileName varchar(255) not null); \
    insert into phyloP30wayAll values ("/gbdb/mm9/bbi/phyloP30way.bw");'

# loading the wiggle table:
ln -s `pwd`/phyloP30way.wib /gbdb/mm9/multiz30way
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
  phyloP30wayAll phyloP30way.wig
# real    0m32.778s

# create download files:
# mkDown.csh appends every chunk listed in wigFile.list to
# downloads/<chrom>.wigFix, stripping the "__N" suffix from the chrom name
# in the wiggle declaration lines.
cat << '_EOF_' > mkDown.csh
#!/bin/csh -fe
foreach F (`cat wigFile.list`)
    set C = $F:h:t:r
    cat $F | sed -e "s/__[0-9]//" >> downloads/${C}.wigFix
end
'_EOF_'
# << happy emacs
chmod +x ./mkDown.csh
mkdir downloads
time ./mkDown.csh
# real    16m19.683s
time gzip downloads/chr*.wigFix
# real    47m11.017s

wigTableStats.sh mm9 phyloP30wayAll
# db.table min max mean count sumData
# mm9.phyloP30wayAll -10.116 5.038 0.119587 1914580285 2.28959e+08
#   stdDev viewLimits
#   0.760605 viewLimits=-3.68344:3.92261

# that range is: 10.116+5.038 = 15.154
# (hBinSize below is that range / hBinCount: 15.154 / 1000)
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
  -hBinSize=0.015154 -hBinCount=1000 -hMinVal=-10.116 -verbose=2 \
  -db=mm9 phyloP30wayAll > histogram.data 2>&1
# real    8m15.623s

# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm9 Histogram phyloP30way track, all 30 vertebrates"
set xlabel " phyloP30way score, all 30 vertebrates"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.04]
set xrange [-2:2]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
    "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs

display histo.png &

###################### Running the euarchontoglires #######################
mkdir /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/euarchontoglires
cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/euarchontoglires
rm -fr wigFix
mkdir wigFix

gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
para create jobList
para try ... check ... push ... etc ...
para time
# Completed: 2685 of 2685 jobs
# CPU time in finished jobs:     127142s    2119.04m    35.32h    1.47d  0.004 y
# IO & Wait Time:                 53995s     899.91m    15.00h    0.62d  0.002 y
# Average job time:                  67s       1.12m     0.02h    0.00d
# Longest finished job:             125s       2.08m     0.03h    0.00d
# Submission to last job:           277s       4.62m     0.08h    0.00d

cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/euarchontoglires
# the sed | sort | sed
# trick gets the files sorted so that coordinates and chromosomes
# are in chrom and chromStart order and thus wigEncode sees a proper
# incoming data stream sorted by coordinates.
find ./wigFix -type f \ | sed -e "s#^./##; s/\./ /g; s/-/ - /g" \ | sort -k1,1 -k2,2n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \ | wigEncode stdin phyloP30wayEuarchontoglires.wig \ phyloP30wayEuarchontoglires.wib > wigEncode.log 2>&1 & cat wigEncode.log # Converted stdin, upper limit 1.13, lower limit -7.49 # good test to make sure no overlapping coordinates, bigWig: # consumes massive amount of memory, in bash raise your memory limits: ulimit -d 188743680 ulimit -v 188743680 cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \ | wigToBigWig stdin ../../../../chrom.sizes \ phyloP30wayEuarchontoglires.bw # XXX running Fri Jul 9 14:33:29 PDT 2010 # if you wanted to use the bigWig file, loading bigWig table: ln -s `pwd`/phyloP30wayEuarchontoglires.bw /gbdb/mm9/bbi hgsql mm9 -e 'drop table if exists phyloP30wayEuarchontoglires; \ create table phyloP30wayEuarchontoglires \ (fileName varchar(255) not null); \ insert into phyloP30wayEuarchontoglires values ("/gbdb/mm9/bbi/phyloP30wayEuarchontoglires.bw");' # loading the wiggle table: ln -s `pwd`/phyloP30wayEuarchontoglires.wib /gbdb/mm9/multiz30way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \ phyloP30wayEuarch phyloP30wayEuarchontoglires.wig # real 0m39.041s # clean garbage: rm -f wiggle.tab # create download files: mkdir downloads time ../all/mkDown.csh # real 18m44.186s time gzip downloads/chr*.wigFix # real 32m11.301s wigTableStats.sh mm9 phyloP30wayEuarchontoglires # db.table min max mean count # mm9.phyloP30wayEuarchontoglires -7.486 1.126 0.0662017 1914580285 # 1.26749e+08 0.594433 viewLimits=-2.90596:1.126 # that range is: 7.486+1.126 = 8.612 # Create histogram to get an overview of all the data, using the # numbers from wigTableStats above: time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.008612 -hBinCount=1000 -hMinVal=-7.486 -verbose=2 \ -db=mm9 phyloP30wayEuarchontoglires > histogram.data 2>&1 # real 8m15.623s # create plot of 
# histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small size 1000,600 x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm9 Histogram phyloP30wayEuarchontoglires track"
set xlabel " phyloP30wayEuarchontoglires score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0.0:0.07]
set xrange [-2.0:1.13]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
    "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs

display histo.png &

###################### Running the placental #######################
mkdir /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/placental
cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/placental
rm -fr wigFix
mkdir wigFix

gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
para create jobList
para try ... check ... push ... etc ...
para time
# Completed: 2685 of 2685 jobs
# CPU time in finished jobs:     237516s    3958.60m    65.98h    2.75d  0.008 y
# IO & Wait Time:                 45828s     763.80m    12.73h    0.53d  0.001 y
# Average job time:                 106s       1.76m     0.03h    0.00d
# Longest finished job:             196s       3.27m     0.05h    0.00d
# Submission to last job:           426s       7.10m     0.12h    0.00d

cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/placental
# list the wigFix chunks in chrom, chromStart order for wigEncode
find ./wigFix -type f \
  | sed -e "s#^./##; s/\./ /g; s/-/ - /g" \
  | sort -k1,1 -k2,2n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list

cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \
  | wigEncode stdin phyloP30wayPlacental.wig phyloP30wayPlacental.wib \
      > wigEncode.log 2>&1 &
# Converted stdin, upper limit 2.06, lower limit -9.46

# good test to make sure no overlapping coordinates, bigWig:
# consumes massive amount of memory, in bash raise your memory limits:
ulimit -d 188743680
ulimit -v 188743680
cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \
  | wigToBigWig stdin ../../../../chrom.sizes phyloP30wayPlacental.bw \
      > bigEncode.log 2>&1 &

# loading bigWig table:
ln -s `pwd`/phyloP30wayPlacental.bw /gbdb/mm9/bbi
hgsql mm9 -e 'drop table if exists phyloP30wayPlacental; \
    create table phyloP30wayPlacental \
    (fileName varchar(255) not null); \
    insert into phyloP30wayPlacental values ("/gbdb/mm9/bbi/phyloP30wayPlacental.bw");'

# loading the wiggle table:
ln -s `pwd`/phyloP30wayPlacental.wib /gbdb/mm9/multiz30way
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
  phyloP30wayPlacental phyloP30wayPlacental.wig
# real    0m35.726s

# create download files:
mkdir downloads
time ../all/mkDown.csh
# real    18m52.778s
time gzip downloads/chr*.wigFix
# real    30m55.550s

wigTableStats.sh mm9 phyloP30wayPlacental
# db.table min max mean count sumData stdDev viewLimits
# mm9.phyloP30wayPlacental -9.46 2.058 0.07797 1914580285 1.4928e+08
#   stdDev viewLimits
#   0.668819 viewLimits=-3.26613:2.058

# that range is: 9.46+2.058 = 11.518
# (hBinSize below is that range / hBinCount: 11.518 / 1000)
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
  -hBinSize=0.011518 -hBinCount=1000 -hMinVal=-9.46 -verbose=2 \
  -db=mm9 phyloP30wayPlacental > histogram.data 2>&1
# real    8m15.623s

# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm9 Histogram phyloP30wayPlacental track"
set xlabel " phyloP30wayPlacental score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.03]
set xrange [-2.5:2.5]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
    "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs

display histo.png &

#############################################################################
# Agilent arrays (2010-12-01 Andy)
cd /hive/data/genomes/mm9/bed/agilentProbes/
# FTP download from ftp.agilent.com
using given user/pass from Anniek De-witte # (anniek_de-witte@agilent.com) # downloaded files are gzipped beds. The files are typically located in a # directory called "FOR_UCSC" or something like that. The user/pass and the # directory are deleted after it's confirmed they're received, so it's not # too helpful to mention specifics here. ftp -u user -p password ftp.agilent.com > cd directory > get 027411_D_BED_20100308.bed.gz > get 027414_D_BED_20100318.bed # unzip everything gunzip 027*.bed.gz ln -s 027414_D_BED_20100318.bed agilentCgh1x1m.ct.bed ln -s 027411_D_BED_20100308.bed agilentCgh4x180k.ct.bed for bed in agilent*.bed; do tail -n +2 $bed | hgLoadBed mm9 ${bed%.ct.bed} stdin done rm bed.tab ########################################################################## # Build targetScanS track - (DONE - 2010-12-14 galt) # requested by: George Bell gbell at wi.mit.edu ssh hgwdev mkdir -p /cluster/data/mm9/bed/targetScanS cd /cluster/data/mm9/bed/targetScanS wget --timestamping http://www.targetscan.org/mmu_50/ucsc/mm9/mm9Cons_ALL_CHRS.BED hgLoadBed mm9 targetScanS mm9Cons_ALL_CHRS.BED # Loaded 38961 elements of size 6 featureBits mm9 targetScanS # 253088 bases of 2620346127 (0.010%) in intersection # Create/edit/check in targetScans.html and trackDb.ra under # kent/src/hg/makeDb/trackDb/mouse/mm9 ######################################################################### # LASTZ Mouse mm9 (DONE - 2010-12-17 - hiram) mkdir /hive/data/genomes/mm9/bed/lastzDanRer7.2010-12-17 cd /hive/data/genomes/mm9/bed/lastzDanRer7.2010-12-17 cat << '_EOF_' > DEF # mouse vs zebrafish BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Zebrafish danRer7 SEQ2_DIR=/scratch/data/danRer7/danRer7.2bit SEQ2_LEN=/scratch/data/danRer7/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=40 
BASE=/hive/data/genomes/mm9/bed/lastzDanRer7.2010-12-17
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs

# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  `pwd`/DEF \
  -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  > do.log 2>&1 &
# Elapsed time: 254m42s
# mm9-side coverage: the denominator below (2620346127) is the mm9 base
# count quoted by the other fb.mm9.* results in this file.
cat fb.mm9.chainDanRer7Link.txt
# 68190354 bases of 2620346127 (2.602%) in intersection

# and the swap to danRer7
mkdir /hive/data/genomes/danRer7/bed/blastz.mm9.swap
cd /hive/data/genomes/danRer7/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  /hive/data/genomes/mm9/bed/lastzDanRer7.2010-12-17/DEF \
  -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  -swap > swap.log 2>&1 &
# real    16m8.672s
cat fb.danRer7.chainMm9Link.txt
# 71960602 bases of 1409770109 (5.104%) in intersection

#########################################################################
# YALE PSEUDOPIPE PSEUDOGENE PREDICTIONS BASED ON ENSEMBL 60
# (hartera, 2010-12-23 - 2010-12-24, DONE)
# FTP site e-mailed on 2010-12-22 by Suganthi Balasubramanian
# (suganthi.bala@yale.edu) from the Gerstein lab. Data is from their
# PseudoPipe pipeline and it is based on proteins from Ensembl Build 60
# (pseudogene data from December 2010?).
# NOTE: this data will be replaced before being released to the RR as there
# is a problem with the IDs.
mkdir -p /hive/data/genomes/mm9/bed/pseudoYale60
cd /hive/data/genomes/mm9/bed/pseudoYale60
# Go to http://tables.pseudogene.org/set.py?id=Mouse60 and click on the
# download link to download Mouse60.txt and copy the file to this directory.
# Header from data file.
# Columns named in the Mouse60.txt header line, in order:
#    1 ID                 2 Chromosome        3 Start Coordinate
#    4 Stop Coordinate    5 Strand            6 Parent Protein
#    7 Protein Start      8 Protein Stop      9 Parent Gene
#   10 Fraction          11 Num Insertions   12 Num Deletions
#   13 Num Shifts        14 Num Stops        15 E Value
#   16 Identity          17 PolyA            18 Disablements
#   19 Exons             20 Introns          21 Class
#   22 Sequence          23 Link
# urls are of type:
# http://tables.pseudogene.org/mouse60/ so this can be added to the
# trackDb as for the previous track. Just update the Ensembl 59 trackDb entry.

# Get list of chroms:
awk '{print $2}' Mouse60.txt | sort | uniq
# chromosomes are 1-19, X, Y

# Convert data to genePred:
cat << '_EOF_' > formatPseudogenesToGenePred
#!/usr/bin/awk -f
# Parse Yale pseudogene data file and write one genePred line per record.
# Exon coordinates (field 19) are in this format:
# [[28688544, 28688864], [28689678, 28691174], [28694308, 28694460], [28701327, 28701749]]

# Fields are tab-separated; set the separators once, before any input.
BEGIN { FS = "\t"; OFS = "\t" }

# Ignore header line
/^ID/ { next; }

# Parse the data lines
{
  # Strip the brackets so field 19 is a plain comma-separated list of
  # start,end,start,end,... coordinates.
  gsub(/\[/, "", $19);
  gsub(/\]/, "", $19);
  # split() returns the number of coordinates; two coordinates per exon.
  nCoords = split($19, exons, ",");
  count = nCoords / 2;
  # Write out genePred. Add chr in front of chrom only if not haplotype.
  # txStart is converted from 1-based to 0-based; cdsStart and cdsEnd are 0.
  if ($2 !~ /HSCHR/)
    {
    printf "%s\tchr%s\t%c\t%d\t%d\t0\t0\t%d\t", $1, $2, $5, $3-1, $4, count;
    }
  else
    {
    printf "%s\t%s\t%c\t%d\t%d\t0\t0\t%d\t", $1, $2, $5, $3-1, $4, count;
    }
  # get list of exon starts, convert from 1-based to 0-based
  for (i = 1; i <= nCoords; i += 2)
    {
    printf "%d,", exons[i] - 1;
    }
  printf "\t";
  # get list of exon ends
  for (i = 2; i <= nCoords; i += 2)
    {
    printf "%d,", exons[i];
    }
  printf "\n";
}
'_EOF_'
# << emacs
chmod +x formatPseudogenesToGenePred

# format the Yale pseudogenes data to genePred.
./formatPseudogenesToGenePred Mouse60.txt > pseudoYaleMouse60.gp
# Load the genePred file into mm9
hgLoadGenePred mm9 pseudoYale60 pseudoYaleMouse60.gp
# Didn't load. There are 4 invalid genePreds:
# Error: invalid genePred: PGOMOU00000130313 exon 1 overlaps previous exon
# Error: invalid genePred: PGOMOU00000139101 exon 1 overlaps previous exon
# Error: invalid genePred: PGOMOU00000136201 exon 1 overlaps previous exon
# Error: invalid genePred: PGOMOU00000128816 exon 1 overlaps previous exon
# Error: 4 invalid genePreds, database unchanged
# File didn't load into database.
# Make a file of these ids - invalidIds
grep -f invalidIds -vw pseudoYaleMouse60.gp > pseudoYaleMouse60NoInvalidGps.gp
wc -l pseudoYale*gp
# 19086 pseudoYaleMouse60.gp
# 19082 pseudoYaleMouse60NoInvalidGps.gp
# Then re-load database
hgLoadGenePred mm9 pseudoYale60 pseudoYaleMouse60NoInvalidGps.gp

# 2010-12-24
# Add trackDb.ra entry for track, add a search and make sure
# there is a description page, copy over from the gencodeYalePseudoBuild59
# html.
cp /hive/users/hartera/GencodeWG/ccds/trunk/gencode/browser/trackDb/human/hg19/gencodeYalePseudoBuild59.html \
  ~/kent/src/hg/makeDb/trackDb/mouse/mm9/pseudoYale60.html
# Edit this for mouse and add the list of 4 IDs of genes that were removed due
# to overlapping exon coordinates. Commit to git.

# Check pseudogene types in data:
tail -n +2 Mouse60.txt | tawk '{print $21}' | sort | uniq
#Ambiguous
#Duplicated
#Processed
# Build class table for colouring pseudogenes by type.
# copy over class table definition from a previous set of Yale pseudogenes.
cp -p /hive/groups/gencode/browser/hg19/gencodeYalePseudoBuild59/gencodeYalePseudoBuild59Class.sql \ pseudoYale60Class.sql # Make the class table file: tail -n +2 Mouse60.txt \ | tawk '{print $1, $21, "Yale"}' | sort > pseudoYale60Class.txt # load table hgLoadSqlTab mm9 pseudoYale60Class \ pseudoYale60Class.sql pseudoYale60Class.txt hgsql -e 'select distinct(class) from pseudoYale60Class;' mm9 +------------+ | class | +------------+ | Ambiguous | | Processed | | Duplicated | +------------+ # Add these classes to the trackDb.ra entry for the geneClasses field and # to the list of classes with colours. # Added the classes to: # ~/kentJuly2010/kent/src/hg/makeDb/trackDb/tagTypes.tab # e.g. gClass_Processed genePred ######################################################################### # Refresh mm9.knownToVisiGene # # After we fixed another issue on hgwdev and rebuilt knownToVisiGene # it picked up the new mm9 ucsc genes. Because that has not # yet been released to RR, I remade it on hgwbeta. ssh hgwbeta knownToVisiGene mm9 ######################################################################### # SEGMENTAL DUPLICATIONS (REBUILT 9/26/2011 Fan) # corrected data file sent by email from John Huddleston [jlhudd@uw.edu].
mkdir /cluster/data/mm9/bed/genomicSuperDups/09262011 cd /cluster/data/mm9/bed/genomicSuperDups/09262011 wget --timestamping ftp://mesh.gs.washington.edu/pub/UCSC/mm9genomicSuperDups.fixed.tab.gz gzip -d mm9genomicSuperDups.fixed.tab.gz awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' mm9genomicSuperDups.fixed.tab \ | hgLoadBed mm9 genomicSuperDups stdin \ -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql ##################################################################### # GRC Incident database (DONE - 2011-02-10 - Hiram) # used to be NCBI Incident - changed to GRC Incident 2012-04-12 # this procedure is run as a cron job in Hiram's account: # 43 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo # using the two scripts there: runUpdate.sh and update.sh # which are checked into the source tree as files: # src/hg/utils/automation/grcIncidentUpdate.sh # src/hg/utils/automation/grcRunIncidentUpdate.sh # they fetch the XML files from NCBI, convert them to SQL text # files, construct a bigBed file, and push it to genomewiki if # it is an update from previous # the table in the dataBase is: grcIncidentDb # which is the URL to the bb file, a single row: # http://genomewiki.ucsc.edu/images/e/e1/Mm9.grciIncidentDb.bb ######################################################################### # KOMP/IKMC (KNOCKOUT MOUSE PROJECT became Int'l Knockout Mouse Cons) (DONE 5/23/12 angie) # done 5/23/12 w/files emailed from Carol # done 8/2/11 w/files emailed from Carol # done 3/22/11 w/files emailed from Carol # renamed to ikmc 3/25/10 at Carol's request # done 3/12/10 w/files emailed from Carol 3/12 # done 12/8/09 w/files emailed from Carol 12/7 # done 7/24/09 w/files emailed from Carol 7/24 # done 5/7/09 w/files emailed from Carol Bult 5/7 # done 2/12/09 w/files emailed from Carol Bult 2/12 # done 10/21/08 w/files emailed from Carol Bult 10/18 ssh hgwdev mkdir -p /hive/data/genomes/mm9/bed/ikmc/2012_05 cd /hive/data/genomes/mm9/bed/ikmc/2012_05 #
Save files emailed from Carol Bult as # 20120518_ikmc.gff.gz # Make bed12 with itemRgb: zcat 20120518_ikmc.gff.gz \ | perl -we \ 'while (<>) { \ s/\r?\n$//; \ ($chr, undef, $ctr, $s, $e, undef, undef, undef, $id, $col, $n) = split("\t"); \ if ($s eq "") { warn "$_\n"; s/^.*//; next; } # Some lines have no coords. \ $col = ($col eq "Yellow") ? "255,215,0" : \ ($col eq "Green") ? "0,240,0" : \ ($col eq "Blue") ? "0,0,200" : "0,0,0"; \ $s--; \ $id =~ s/^MGI:\d+; ([\w ]+); .*/$1/ || die "Cant parse id \"$id\""; \ $id =~ s/ //g; \ my $geneId = join("|", $chr, $ctr, "${n}_$id"); \ push @{$geneBlks{$geneId}}, [$s, $e, $col] unless $e <= 0; \ } \ warn "Got " . scalar(keys %geneBlks) . " genes.\n"; \ foreach my $geneId (keys %geneBlks) { \ my @blks = @{$geneBlks{$geneId}}; \ my ($chrom, $center, $name) = split(/\|/, $geneId); \ my $blkCount = @blks; \ @blks = sort {$a->[0] <=> $b->[0]} @blks; \ my $chromStart = $blks[0]->[0]; \ my $chromEnd = $blks[$blkCount-1]->[1]; \ my $color = $blks[0]->[2]; \ my $blkStarts = ""; \ my $blkSizes = ""; \ foreach my $blk (@blks) { \ my ($start, $end, $col) = @{$blk}; \ $blkStarts .= ($start - $chromStart) . ","; \ $blkSizes .= ($end - $start) . ","; \ if ($col ne $color) { die "Blocks of $geneId of colors $color and $col"; } \ } \ print join("\t", $chrom, $chromStart, $chromEnd, $name, 0, ".", $chromStart, \ $chromStart, $color, $blkCount, $blkSizes, $blkStarts) . "\n"; \ }' \ | sort -k 1,1 -k 2n,2n > ikmc.bed #Got 51058 genes. # No stderr empty-coord warnings this time (no unmapped items). # Make an alias-style table with associated info (MGI ID and status): zcat 20120518_ikmc.gff.gz \ | perl -wpe 's/\r?\n$//; @w = split("\t"); \ if ($w[3] eq "") { s/^.*//; next; } # Some lines have no coords. \ if ($w[4] <= 0) { s/^.*//; next; } # A few lines have end=0. 
\ $w[8] =~ m/^(MGI:\d+); ([\w ]+); (\w.*)/ || die; \ ($mgi, $designId, $status) = ($1, $2, $3); \ $designId =~ s/ //g; \ $_ = "$w[10]_$designId\t$mgi,$w[2],$status\n";' \ | sort -u > ikmcExtra.tab wc -l ikmcExtra.tab #51058 ikmcExtra.tab # Load 'em up: hgLoadBed mm9 ikmc ikmc.bed #Read 51058 elements of size 12 from ikmc.bed hgLoadSqlTab mm9 ikmcExtra $HOME/kent/src/hg/lib/genericAlias.sql ikmcExtra.tab checkTableCoords -verbose=2 mm9 ikmc #mm9.ikmc item Tekt3_41479 chr11:62887195-62896116: blocks 3 and 4 overlap. #mm9.ikmc item Tekt3_41478 chr11:62887195-62896116: blocks 3 and 4 overlap. #mm9.ikmc item Tekt3_41477 chr11:62887195-62896116: blocks 3 and 4 overlap. #mm9.ikmc item Tekt3_41476 chr11:62887195-62896116: blocks 3 and 4 overlap. #mm9.ikmc item Cbx1_93671 chr11:96659010-96669485: blocks 1 and 2 overlap. #mm9.ikmc item Cbx3_93731 chr6:51423841-51433715: blocks 1 and 2 overlap. # Carol talked to the Sanger folks about those... pls waive. hgsql mm9 -e 'delete from ikmc where name in \ ("Tekt3_41476", "Tekt3_41477", "Tekt3_41478", "Tekt3_41479", \ "Cbx1_93671", "Cbx3_93731")' hgsql mm9 -e 'delete from ikmcExtra where name in \ ("Tekt3_41476", "Tekt3_41477", "Tekt3_41478", "Tekt3_41479", \ "Cbx1_93671", "Cbx3_93731")' # Note from July '09: Carol noticed some very long items and is asking # Sanger about them. 
Here's how to check it ourselves next time: hgsql mm9 -e 'select name, (chromEnd-chromStart) as length from ikmc \ where chromEnd - chromStart > 1000000 order by length desc;' #+------------------+---------+ #| name | length | #+------------------+---------+ #| Ptprd_VG12763 | 2270724 | #| Cntnap2_VG19736 | 2241309 | #| Pcdh15_VG15967 | 1546970 | #| Magi2_VG18895 | 1477753 | #| Gpc5_VG15750 | 1432965 | #| Naaladl2_VG19786 | 1339345 | #| Agbl4_VG16439 | 1266664 | #| Prkg1_VG15918 | 1195585 | #| Ptprt_VG10147 | 1139158 | #| Nrg3_VG19738 | 1104239 | #| Anks1b_VG16505 | 1099314 | #| Erbb4_VG18672 | 1075874 | #| Nrxn1_VG16178 | 1059162 | #+------------------+---------+ runJoiner.csh mm9 ikmc # mm9.ikmcExtra.name - hits 51052 of 51052 ok ######################################################################### # LASTZ Turkey MelGal1 ( DONE - 2011-03-30 - Chin) mkdir /hive/data/genomes/mm9/bed/lastzMelGal1.2011-03-30 cd /hive/data/genomes/mm9/bed/lastzMelGal1.2011-03-30 cat << '_EOF_' > DEF # Turkey vs Mouse # TARGET: Mouse mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Turkey melGal1 - single chunk big enough to run entire chrom SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit SEQ2_LEN=/scratch/data/melGal1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm9/bed/lastzMelGal1.2011-03-30 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 # real 71m8.450s cat fb.mm9.chainMelGal1Link.txt # 62597891 bases of 2620346127 (2.389%) in intersection cd /hive/data/genomes/mm9/bed ln -s lastzMelGal1.2011-03-30 lastz.melGal1 # running the swap mkdir /hive/data/genomes/melGal1/bed/blastz.mm9.swap cd /hive/data/genomes/melGal1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ 
/hive/data/genomes/mm9/bed/lastzMelGal1.2011-03-30/DEF \ -swap \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 # real 6m49.871s cat fb.melGal1.chainMm9Link.txt # 50649368 bases of 935922386 (5.412%) in intersection cd /hive/data/genomes/melGal1/bed ln -s blastz.mm9.swap lastz.mm9 ############################################################################ # Nuclear Lamina (working 2011-04-04 - Chin) # The track is based on: # "Molecular maps of the reorganization of genome-nuclear lamina # interactions during differentiation" # Peric-Hupkes D, Meuleman W, Pagie L, Bruggeman SW et al. # Mol Cell 2010 May 28;38(4):603-13. PMID: 20513434 # GEO Accession Series GSE17051 # Main Contact: # Bas van Steensel # Division of Gene Regulation, room B4.042 # Netherlands Cancer Institute # Plesmanlaan 121 # 1066 CX Amsterdam # The Netherlands # Phone +31.20.5122040 # http://research.nki.nl/vansteensellab # Download data mkdir /hive/data/outside/vansteensel cd /hive/data/outside/vansteensel # Raw GEO data wget --timestamping \ ftp://ftp.ncbi.nih.gov/pub/geo/DATA/supplementary/series/GSE17051/GSE17051_RAW.tar # SOFT formatted family file(s) wget --timestamping \ ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/by_series/GSE17051/GSE17051_family.soft.gz # MINiML formatted family file(s) wget --timestamping \ ftp://ftp.ncbi.nih.gov/pub/geo/DATA/MINiML/by_series/GSE17051/GSE17051_family.xml.tgz # Series Matrix File(s) wget --timestamping \ ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SeriesMatrix/GSE17051/GSE17051_series_matrix.txt.gz # wiggle data from vansteensel lab: # Embryonic Stem Cells wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/ES.gff.gz wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/ES.wig.gz # Neuronal Precursor Cells wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/NP.gff.gz wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/NP.wig.gz #
Astrocytes wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/AC.gff.gz wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/AC.wig.gz # NIH3T3 Cells wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/EF.gff.gz wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/EF.wig.gz mkdir /hive/data/genomes/mm9/bed/nuclearLamina cd /hive/data/genomes/mm9/bed/nuclearLamina cp /hive/data/outside/vansteensel/*.gff.gz . cp /hive/data/outside/vansteensel/*.wig.gz . gunzip *.gz # The wiggle files contain 51 sets of duplicates (all on chr18). # Per the data provider's instruction, they were fixed by taking # the average. cat << '_EOF_' > quickCheckWig.pl #!/usr/bin/env perl use strict; use warnings; my $chrom; my $curPos; my $curValue; my $prevPos; my $prevValue; my $lineNum = 0; my $tf1; my $tf2; open(FH, $ARGV[0]) or die("Error: cannot open file '$ARGV[0]'\n"); while (my $line = <FH>) { $lineNum += 1; if ($line =~ m/^browser/ ) { next; } elsif ($line =~ m/track/ ) { next; } elsif ($line =~ m/^variableStep/ ) { ($tf1, $chrom, $tf2) = split('\s+', $line, 3); $chrom =~ s/^chrom=//; $curPos=0; $curValue=0; $prevPos=0; $prevValue=0; next; } elsif ($line =~ m/^[0-9]/ ) { $prevPos = $curPos; $prevValue = $curValue; ($curPos, $curValue)=split('\s+', $line, 2); $curValue =~ s/\n//; $prevValue = $curValue; if ($curPos == $prevPos) { printf("Duplicate %s %s on %s at line # %s \n", $curPos, $curValue, $chrom, $lineNum); next; } } } close (FH); '_EOF_' #<< happy emacs chmod +x quickCheckWig.pl # check for duplicates in wiggles for WIG in AC EF ES NP do ./quickCheckWig.pl ${WIG}.wig > ${WIG}.dup.list 2>&1 done cat << '_EOF_' > fixDupWig.pl #!/usr/bin/env perl use strict; use warnings; my $chrom; my $curPos; my $curValue; my $prevPos; my $prevValue; my $tf1; my $tf2; sub resetAll { $curPos=0; $curValue=0; $prevPos=0; $prevValue=0; } my $lineNum = 0; my $prtPos=0; my $prtValue=0; resetAll(); open(FH, $ARGV[0]) or die("Error: cannot open file '$ARGV[0]'\n"); while (my $line
= <FH>) { $lineNum += 1; if ($line =~ m/^browser/ ) { printf("%s", $line); next; } elsif ($line =~ m/track/ ) { printf("%s", $line); next; } elsif ($line =~ m/^variableStep/ ) { # get chrom number ($tf1, $chrom, $tf2) = split('\s+', $line, 3); $chrom =~ s/^chrom=//; &resetAll(); printf ("%s",$line); next; } elsif ($line =~ m/^[0-9]/ ) { chomp($line); ($curPos, $curValue)=split('\s+', $line, 2); if ($prevPos == 0) { $prevPos = $curPos; $prevValue = $curValue; next; } elsif ($prevPos == $curPos) { $prevValue=($prevValue+$curValue)/2; next; } else { printf("%s\t%s\n", $prevPos, $prevValue); $prevPos=$curPos; $prevValue=$curValue; } } } close (FH); '_EOF_' #<< happy emacs chmod +x fixDupWig.pl # fix duplicates in wiggles by taking average of duplicates for WIG in AC EF ES NP do ./fixDupWig.pl ${WIG}.wig > ${WIG}_Fixed.wig done # tested by loading **_Fixed.wig as custom tracks ######################################################################### # LASTZ Lizard AnoCar2 (DONE - 2011-04-25 - Hiram) # XXX FYI: the date on this directory is incorrect, it was done 04-25 mkdir /hive/data/genomes/mm9/bed/lastzAnoCar2.2011-04-19 cd /hive/data/genomes/mm9/bed/lastzAnoCar2.2011-04-19 cat << '_EOF_' > DEF # mouse vs lizard BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Lizard anoCar2 SEQ2_DIR=/scratch/data/anoCar2/anoCar2.2bit SEQ2_LEN=/scratch/data/anoCar2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=40 BASE=/hive/data/genomes/mm9/bed/lastzAnoCar2.2011-04-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \ -bigClusterHub=swarm -qRepeats=windowmaskerSdust > do.log 2>&1 &
# real 289m10.549s cat fb.mm9.chainAnoCar2Link.txt # 88067954 bases of 2620346127 (3.361%) in intersection # running the swap - DONE - 2011-04-19 mkdir /hive/data/genomes/anoCar2/bed/blastz.mm9.swap cd /hive/data/genomes/anoCar2/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzAnoCar2.2011-04-19/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -syntenicNet -swap -qRepeats=windowmaskerSdust > swap.log 2>&1 & # real 14m28.747s cat fb.anoCar2.chainMm9Link.txt # 84738440 bases of 1701353770 (4.981%) in intersection ############################################################################## # BUILD mm9 GERP TRACK (DONE 4/25/11, Fan) ssh hgwdev mkdir /hive/data/genomes/mm9/bed/gerp cd /hive/data/genomes/mm9/bed/gerp # place the wig data file, All_mm9_RS.wig, here. ulimit -d 180000000 ulimit -v 180000000 wigToBigWig All_mm9_RS.wig /hive/data/genomes/mm9/chrom.sizes All_mm9_RS.bw ln -s `pwd`/All_mm9_RS.bw /gbdb/mm9/bbi/All_mm9_RS.bw hgsql mm9 -e 'drop table if exists allMm9RS_BW; \ create table allMm9RS_BW (fileName varchar(255) not null); \ insert into allMm9RS_BW values ("/gbdb/mm9/bbi/All_mm9_RS.bw");' # create corresponding trackDb.ra section and html description page. ############################################################################ # Nuclear Lamina (DONE 2011-04-04 - Chin) # # "Molecular maps of the reorganization of genome-nuclear lamina # interactions during differentiation" # Peric-Hupkes D, Meuleman W, Pagie L, Bruggeman SW et al. # Mol Cell 2010 May 28;38(4):603-13. 
PMID: 20513434 # GEO Accession Series GSE17051 # Main Contact: # Bas van Steensel # Division of Gene Regulation, room B4.042 # Netherlands Cancer Institute # Plesmanlaan 121 # 1066 CX Amsterdam # The Netherlands # Phone +31.20.5122040 # http://research.nki.nl/vansteensellab # Download data mkdir /hive/data/outside/vansteensel cd /hive/data/outside/vansteensel # Raw GEO data which we did not use at this time, # get them any way to keep data in sync for the future wget --timestamping \ ftp://ftp.ncbi.nih.gov/pub/geo/DATA/supplementary/series/GSE17051/GSE17051_RAW.tar # SOFT formatted family file(s) wget --timestamping \ ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/by_series/GSE17051/GSE17051_family.soft.gz # MINiML formatted family file(s) wget --timestamping \ ftp://ftp.ncbi.nih.gov/pub/geo/DATA/MINiML/by_series/GSE17051/GSE17051_family.xml.tgz # Series Matrix File(s) wget --timestamping \ ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SeriesMatrix/GSE17051/GSE17051_series_matrix.txt.gz # wiggle data from vansteensel lab (2011-04-26): # Embryonic Stem Cells wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/ES.gff.gz wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/ES.wig.gz # Neuronal Precursor Cells wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/NP.gff.gz wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/NP.wig.gz # Astrocytes wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/AC.gff.gz wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/AC.wig.gz # NIH3T3 Cells wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/EF.gff.gz wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/EF.wig.gz ### new wiggles without duplicates from Wouter mkdir /hive/data/outside/vansteensel/2011-04-26 cd /hive/data/outside/vansteensel/2011-04-26 for WIG in AC EF ES NP do wget --timestamping \ http://bioinformatics.nki.nl/~meuleman/Chin/${WIG}.wig.gz done mkdir /hive/data/genomes/mm9/bed/nuclearLamina/rawdata cd 
/hive/data/genomes/mm9/bed/nuclearLamina/rawdata cp /hive/data/outside/vansteensel/2011-04-26/*.wig.gz . gunzip *.gz # The wiggle files contain 51 sets of duplicates (all on chr18). # Per the data provider's instruction, they were fixed by taking # the average. cat << '_EOF_' > quickCheckWig.pl #!/usr/bin/env perl use strict; use warnings; my $chrom; my $curPos; my $curValue; my $prevPos; my $prevValue; my $lineNum = 0; my $tf1; my $tf2; open(FH, $ARGV[0]) or die("Error: cannot open file '$ARGV[0]'\n"); while (my $line = <FH>) { $lineNum += 1; if ($line =~ m/^browser/ ) { next; } elsif ($line =~ m/track/ ) { next; } elsif ($line =~ m/^variableStep/ ) { ($tf1, $chrom, $tf2) = split('\s+', $line, 3); $chrom =~ s/^chrom=//; $curPos=0; $curValue=0; $prevPos=0; $prevValue=0; next; } elsif ($line =~ m/^[0-9]/ ) { $prevPos = $curPos; $prevValue = $curValue; ($curPos, $curValue)=split('\s+', $line, 2); $curValue =~ s/\n//; $prevValue = $curValue; if ($curPos == $prevPos) { printf("Duplicate %s %s on %s at line # %s \n", $curPos, $curValue, $chrom, $lineNum); next; } } } close (FH); '_EOF_' #<< happy emacs chmod +x quickCheckWig.pl # check for duplicates in wiggles for WIG in AC EF ES NP do ./quickCheckWig.pl ${WIG}.wig > ${WIG}.dup.list 2>&1 done wc -l *.dup.list # all dup.list files are empty, so there are no duplicates in the wiggles.
rm *.dup.list # load the tracks on hgwdev ssh hgwdev cd /cluster/data/mm9/bed/nuclearLamina/ for WIG in AC EF ES NP do wigEncode ./rawdata/${WIG}.wig laminB1_${WIG}.wig \ laminB1_${WIG}.wib done # Converted ./rawdata/AC.wig, upper limit 4.29, lower limit -5.74 # Converted ./rawdata/EF.wig, upper limit 4.92, lower limit -5.44 # Converted ./rawdata/ES.wig, upper limit 4.62, lower limit -5.57 # Converted ./rawdata/NP.wig, upper limit 4.29, lower limit -6.00 for WIG in AC EF ES NP do hgLoadWiggle mm9 laminB1_${WIG} laminB1_${WIG}.wig done # Connected to database mm9 for track laminB1_AC # Creating wiggle table definition in mm9.laminB1_AC # Saving wiggle.tab # Loading mm9 # ... ... rm wiggle.tab for WIG in AC EF ES NP do ln -s /cluster/data/mm9/bed/nuclearLamina/laminB1_${WIG}.wib \ /gbdb/mm9/wib/ done # Create the laminB1 supertrack definitions in mm9/trackDb.ra # copy the description html to the right place cp /hive/data/outside/vansteensel/2011-05-11/mouse_laminB1_DamID.html \ /cluster/home/chinhli/kent/src/hg/makeDb/trackDb/mouse/mm9/laminB1Mm9.html cp /hive/data/outside/vansteensel/2011-05-11/MolCell2010_cartoon.png \ /cluster/home/chinhli/kent/src/hg/htdocs/images/laminB1Mm9.png # edit/rescale the html and png files # add the new html and image files to git # collect wiggle stats for track definition: for WIG in AC EF ES NP do wigTableStats.sh mm9 laminB1_${WIG} done # db.table min max mean count sumData stdDev viewLimits mm9.laminB1_AC -5.742 4.293 -0.00201324 2102030 -4231.89 1.14068 viewLimits=-5.70541:4.293 # db.table min max mean count sumData stdDev viewLimits mm9.laminB1_EF -5.444 4.922 -0.00154509 2102030 -3247.82 1.04232 viewLimits=-5.21314:4.922 # db.table min max mean count sumData stdDev viewLimits mm9.laminB1_ES -5.572 4.617 -0.00133816 2102030 -2812.85 0.929746 viewLimits=-4.65007:4.617 # db.table min max mean count sumData stdDev viewLimits mm9.laminB1_NP -5.998 4.292 -0.00148136 2102030 -3113.87 1.059 viewLimits=-5.29649:4.292 # Create the laminB1
supertrack definitions in mm9/trackDb.ra # copy the description html to the right place cp /hive/data/outside/vansteensel/2011-05-11/mouse_laminB1_DamID.html \ /cluster/home/chinhli/kent/src/hg/makeDb/trackDb/mouse/mm9/laminB1Mm9.html # scale down to 32% convert -size 32% \ /hive/data/outside/vansteensel/2011-05-11/MolCell2010_cartoon.png \ -resize 32% \ /cluster/home/chinhli/kent/src/hg/htdocs/images/laminB1Mm9.png # edit/rescale the html and png files # add the new html and image files to git # cd ~/kent/src/hg/htdocs and make # cd ~/kent/src/hg/makeDb/trackDb and make DBS=mm9 ######################################################################### # LASTZ Cow BosTau6 (DONE - 2011-05-17 - Chin) mkdir /hive/data/genomes/mm9/bed/lastzBosTau6.2011-05-17 cd /hive/data/genomes/mm9/bed/lastzBosTau6.2011-05-17 cat << '_EOF_' > DEF # mouse vs cow # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Mouse mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau6 SEQ2_DIR=/scratch/data/bosTau6/bosTau6.2bit SEQ2_LEN=/scratch/data/bosTau6/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm9/bed/lastzBosTau6.2011-05-17 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 211m26.412s cat fb.mm9.chainBosTau6Link.txt # 699351036 bases of 2620346127 (26.689%) in intersection # Create link cd /hive/data/genomes/mm9/bed ln -s lastzBosTau6.2011-05-17 lastz.bosTau6 # and the swap mkdir /hive/data/genomes/bosTau6/bed/blastz.mm9.swap cd /hive/data/genomes/bosTau6/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzBosTau6.2011-05-17/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk
-bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 53m5.237s cat fb.bosTau6.chainMm9Link.txt # 688894115 bases of 2649682029 (25.999%) in intersection cd /hive/data/genomes/bosTau6/bed ln -s blastz.mm9.swap lastz.mm9 ############################################################################ # NUMTS TRACK (DONE 2011-06-03 - Chin) mkdir -p /hive/data/outside/Numts/mm9 cd /hive/data/outside/Numts/mm9 wget http://193.204.182.50/files/mm9/all_mouse_tracks.txt wget http://193.204.182.50/files/mm9/MMS_NumtS.html wget http://193.204.182.50/files/bam/MMS_NumtS.fasta.sorted.bam wget http://193.204.182.50/files/bam/MMS_NumtS.fasta.sorted.bam.bai mkdir /cluster/data/mm9/bed/NumtS cd /cluster/data/mm9/bed/NumtS cp /hive/data/outside/Numts/mm9/*.* . # split the all_mouse_tracks.txt into 3 bed files # mmsNumtSAssembled.bed, mmsNumtS.bed, and mmsNumtSMitochondrion.bed cat all_mouse_tracks.txt | awk ' /^track name/ {print $_}' > tracks.list cat all_mouse_tracks.txt | awk ' /^track type/ {print $_}' >> tracks.list # load the 3 bed files to mm9 hgLoadBed mm9 numtSAssembled mmsNumtSAssembled.bed hgLoadBed mm9 numtS mmsNumtS.bed hgLoadBed mm9 numtSMitochondrion mmsNumtSMitochondrion.bed # Make /gbdb/ links and load bam mkdir /gbdb/mm9/NumtS ln -s `pwd`/MMS_NumtS.fasta.sorted.bam{,.bai} /gbdb/mm9/NumtS/ hgBbiDbLink mm9 bamMmsNumtSSorted /gbdb/mm9/NumtS/MMS_NumtS.fasta.sorted.bam # setup trackDb for mm9 ############################################################################## # LASTZ X.
tropicalis XenTro3 (DONE - 2011-09-20 - Hiram) mkdir /hive/data/genomes/mm9/bed/lastzXenTro3.2011-09-20 cd /hive/data/genomes/mm9/bed/lastzXenTro3.2011-09-20 cat << '_EOF_' > DEF # Mouse (mm9) vs frog (xenTro3) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse mm9 SEQ1_DIR=/scratch/data/mm9/mm9.2bit SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Frog xenTro3 SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/mm9/bed/lastzXenTro3.2011-09-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # real 382m43.129s cat fb.mm9.chainXenTro3Link.txt # 81920795 bases of 2620346127 (3.126%) in intersection # running the swap - DONE - 2011-09-21 mkdir /hive/data/genomes/xenTro3/bed/blastz.mm9.swap cd /hive/data/genomes/xenTro3/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzXenTro3.2011-09-20/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap > swap.log 2>&1 & # real 57m36.529s cat fb.xenTro3.chainMm9Link.txt # 89770014 bases of 1358334882 (6.609%) in intersection ######################################################################### # LASTZ Cow BosTau7 (working - 2012-01-22 - Chin) mkdir /hive/data/genomes/mm9/bed/lastzBosTau7.2012-01-22 cd /hive/data/genomes/mm9/bed/lastzBosTau7.2012-01-22 cat << '_EOF_' > DEF # mouse vs cow # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Mouse mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau7
SEQ2_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ2_LEN=/scratch/data/bosTau7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm9/bed/lastzBosTau7.2012-01-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 190m44.307s cat fb.mm9.chainBosTau7Link.txt # 695010371 bases of 2620346127 (26.524%) in intersection # Create link cd /hive/data/genomes/mm9/bed ln -s lastzBosTau7.2012-01-22 lastz.bosTau7 # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.mm9.swap cd /hive/data/genomes/bosTau7/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzBosTau7.2012-01-22/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 51m44.505s cat fb.bosTau7.chainMm9Link.txt # 711305079 bases of 2804673174 (25.361%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.mm9.swap lastz.mm9 ############################################################################ # POLYA-SEQ TRACK (from Adnan Derti, Merck) (DONE, Andy 2012-02-06) # (see hg19.txt for multi-species build notes) ############################################################################## # construct liftOver to mm10 (DONE - 2012-02-09 - Hiram) screen # manage this longish running job in a screen mkdir /hive/data/genomes/mm9/bed/blat.mm10.2012-02-09 cd /hive/data/genomes/mm9/bed/blat.mm10.2012-02-09 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/mm9/11.ooc \ -debug -dbHost=hgwdev -workhorse=hgwdev mm9 mm10 > do.log 2>&1 # if that is OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` 
-bigClusterHub=swarm \ -ooc=/scratch/data/mm9/11.ooc \ -dbHost=hgwdev -workhorse=hgwdev mm9 mm10 > do.log 2>&1 # real 83m43.240s # verify this file exists: og -L /gbdb/mm9/liftOver/mm9ToMm10.over.chain.gz # -rw-rw-r-- 1 535855 Feb 9 12:07 /gbdb/mm9/liftOver/mm9ToMm10.over.chain.gz # and try out the conversion on genome-test from mm9 to mm10 ############################################################################ ##########################################################################pubStart # Publications track (DONE - 04-27-12 - Max) # article download and conversion is run every night on hgwdev: # 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh # the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/ # then converts them to text into /hive/data/outside/literature/{pmc,elsevier} # all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py # data processing was run manually like this export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/ # pmc cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc ssh swarm cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat steps:annot-tables exit pubBlat load # elsevier cd 
/hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier ssh swarm cd /hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat steps:annot-tables exit pubBlat load #--pubEnd ############################################################################ # orfeome 2012-03-16 (markd) enabled ORFeome tracks in etc/genbank.conf and reload genbank ############################################################################