# This file describes how we made the browser database on the mouse
# genome, June 2004 build. - Mm5
#
#
# NOTE: There is a new chrMT sequence in build 32
# >gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion
#
# Will have to beware of this NC_ contig in the processing since
# all previous builds had only NT_ contigs
#
# NOTE: The README_PREBUILD file for this assembly mentions several
# differences from the previous release (build 30):
# 1. seq_contig.md - new first line is a comment containing column names
#    Also, last two columns (group label and weight) have been swapped
#    Also, some lines have id with CONTIG: prepended, and upper-case
#    feature type (CONTIG)
# 2. contig.idmap - has an additional column "contig label"
# This required changing the jkStuff ncbi* utilities (7/1/03 KRR)
#
# DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2004-06-27 - Fan)
ssh kksilo
mkdir -p /cluster/store6/mm5/ncbi
ln -s /cluster/store6/mm5 /cluster/data
cd /cluster/data/mm5/ncbi
mkdir chrfasta contigfasta
ftp ftp.ncbi.nih.gov
# user hgpguest, password from /cse/faculty/kent/buildHg6.doc
cd mouse_33
prompt
bin
mget *
quit
gunzip *.agp.gz
# compress chrY.fa (at the NCBI site, this one file somehow was not compressed)
cd chrfasta
gzip chrY.fa
cd ..
# use chrMT.fa.gz from mm4 instead because its first line format is correct
cp -p /cluster/store6/mm4/ncbi/chrfasta/chrMT.fa.gz chrfasta
cp -p /cluster/store6/mm4/ncbi/contigfasta/chrMT.fa.gz contigfasta

# Fix the troubles caused by chrMT being released later, separately
# Fixed allcontig.agp
# add the last line of .../mm4/ncbi/allcontig.agp to allcontig.agp
# Fixed allrefcontig.chr.agp
# add the last line of .../mm4/ncbi/allrefcontig.chr.agp to allrefcontig.chr.agp
# Fix contig.idmap
cat contig.idmap chrMT/contig.idmap >new.idmap
mv new.idmap contig.idmap
# Fix seq_contig.md
# Edit seq_contig.md to add 3 lines (from mm4) in its middle before Un|...
10090 MT 0 0 + start -1 CONTIG C57BL/6J 10
10090 MT 1 16299 + NC_005089 GI:34538597 CONTIG C57BL/6J na
10090 MT 16299 16299 + end -2 CONTIG C57BL/6J 10
# ctg_coords, contig_overlaps.agp and sequence.inf not fixed.

# Check chromosome files (DONE - 2004-06-27 - Fan)
cd chrfasta
foreach f (*.fa.gz)
echo $f:r >> faSize.out
gunzip $f
/cluster/bin/i386/faSize $f:r >> faSize.out
echo $f:r done
end
/cluster/bin/i386/faSize *.fa >> faSize.out
grep "^>" *.fa > ../chrfasta.all.fa.headers
gzip *.fa
cd ../contigfasta
gunzip *.fa.gz
grep "^>" *.fa > ../contigfasta.all.fa.headers
gzip *.fa

# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS
# (DONE - 2004-06-27 - Fan)
ssh kksilo
cd /cluster/data/mm5
gunzip ncbi/allrefcontig.chr.agp.gz
# splitFaIntoContigs doesn't do the right thing with agp lines arriving in a
# different order than the fasta chrom sequences, so split up the agp
# into one per chrom.
foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y MT Un)
mkdir $c
perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
./ncbi/allrefcontig.chr.agp \
> $c/chr$c.agp
gunzip -c ./ncbi/chrfasta/chr$c.fa.gz \
| perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' \
| splitFaIntoContigs $c/chr$c.agp \
stdin /cluster/data/mm5 -nSize=5000000
end
# gzip ncbi/chrfasta/chr*.fa

# CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2004-06-27 - Fan)
ssh kksilo
cd /cluster/data/mm5/ncbi
gunzip seq_contig.md.gz
# reorder random contigs in allrefcontig agp file to match seq_contig.md
# this is required by the ncbiToRandomAgps scripts
# had to fix up ncbiToRandomAgps from previous use to match the
# lines better, and to do the MT/NC_ mitochondrion thing
mkdir /cluster/store6/mm5/jkStuff
# copy scripts used from previous trial mm5 build
cd /cluster/data/mm5
cp -p ~/mm50/jkStuff/* jkStuff
cd /cluster/data/mm5/ncbi
../jkStuff/ncbiFixAgp allrefcontig.chr.agp > \
allrefcontig.chr.ordered.agp
# Edit MANUALLY ../jkStuff/ncbiToRandomAgps, to change build 32 to build 33.
../jkStuff/ncbiToRandomAgps seq_contig.md allrefcontig.chr.ordered.agp \
contig.idmap ..
# creating ../mm5/1/chr1_random.agp...
# ... creating ../mm5/Un/chrUn_random.agp...
# The chrUn_random.agp created by this is too large with the 5000
# gaps.  It will work with 1000 gaps, so fix up the chrUn_random agp:
../jkStuff/ncbiToRandomAgps -gapLen 1000 -chrom Un \
seq_contig.md allrefcontig.chr.ordered.agp contig.idmap ..

ssh kksilo
cd /cluster/data/mm5
foreach c (?{,?})
if (-e $c/chr${c}_random.ctg.agp) then
echo building $c/chr${c}_random.fa
gunzip -c ./ncbi/contigfasta/chr$c.fa.gz \
| perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' \
> ./tmp.fa
agpToFa -simpleMulti $c/chr${c}_random.ctg.agp chr${c}_random \
$c/chr${c}_random.fa ./tmp.fa
rm tmp.fa
endif
end
# building 1/chr1_random.fa
# ... etc ...
# building Un/chrUn_random.fa
# Writing 102265694 bases to Un/chrUn_random.fa
# Clean these up to avoid confusion later... they're easily rebuilt
# with the ncbiToRandomAgps script above
rm ?/*.ctg.agp ??/*.ctg.agp

# BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS (DONE 2004-06-27 - Fan)
ssh kksilo
cd /cluster/data/mm5
foreach c (?{,?})
if (-e $c/chr${c}_random.agp) then
splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . \
-nSize=5000000
mkdir -p $c/lift
mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst
mv ${c}_random/lift/ordered.lft $c/lift/random.lft
mv ${c}_random/lift/ordered.lst $c/lift/random.lst
rmdir ${c}_random/lift
rm ${c}_random/chr${c}_random.{agp,fa}
mv ${c}_random/* $c
rmdir ${c}_random
endif
end
# This has a lot of output.  It is difficult to see if anything
# goes wrong.

# Fixup chrMT name to be chrM (DONE - 2004-06-27 - Fan)
ssh kksilo
cd /cluster/data/mm5
mv MT MT.ncbi
mkdir M
mkdir M/chrM_1
mkdir M/lift
cd MT.ncbi
bash
find . -type f | while read FN
do
NF=`echo $FN | sed -e "s/MT/M/g"`
sed -e "s/chrMT/chrM/g" $FN > ../M/$NF
done

# MAKE LIFTALL.LFT (DONE - 2004-06-27 - Fan)
cd /cluster/data/mm5
cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft

# 7:40 PM 6/27/04, used dark blue color above.
# Now changed to use dark pink color for things done.

# CREATING DATABASE (DONE 2004-06-27 - Fan)
# First, clean out mm5 tables built by the previous trial build.
# Rename all mm5.* tables to mm5_old4.*,
# then drop database mm5

o - Create the database.
ssh hgwdev
hgsql -e 'create database mm5;' ''
# if you need to delete this database:  !!! WILL DELETE EVERYTHING !!!
# hgsql -e "drop database mm5;" mm5

o - Use df to make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
Filesystem Size Used Avail Use% Mounted on
/dev/sdc1 1.8T 383G 1.3T 24% /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2004-06-27 - Fan)
# Use any of the newest databases to ensure that the organization
# of the grp table is up to date
ssh hgwdev
hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" mm5

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE - 2004-06-27 - Fan)
# Create (unmasked) nib files
ssh kksilo
cd /cluster/data/mm5
mkdir -p unmaskedNib
foreach f (?{,?}/chr?{,?}{,_random}.fa)
echo $f:t:r
faToNib $f unmaskedNib/$f:t:r.nib
end

# Create symbolic links from /gbdb/mm5/nib to real nib files
# These unmasked nib files are temporary, just to get the browser
# up and running immediately.  After the masking is done and masked
# sequence is created, these nibs will be replaced with the masked
# nibs
ssh hgwdev
mkdir -p /gbdb/mm5/nib
cd /gbdb/mm5/nib
ln -s /cluster/data/mm5/unmaskedNib/chr*.nib .

# Load /gbdb nib paths into database and save size info.
ssh hgwdev
cd /cluster/data/mm5
hgsql mm5 < ~/kent/src/hg/lib/chromInfo.sql
hgNibSeq -preMadeNib mm5 /gbdb/mm5/nib ?{,?}/chr?{,?}{,_random}.fa
# 3164952073 total bases
# NOTE: mm4 was 2952612207, an increase of 212 Mb (~7.2%)
hgsql -N -e "select chrom,size from chromInfo;" mm5 > chrom.sizes
# check the resulting file chrom.sizes

# Store o+o info in database.
cd /cluster/data/mm5/ncbi
gunzip sequence.inf
cd /cluster/data/mm5
ln -s ncbi ffa
# remove so as not to confuse hgGoldGap -- they are easily regenerated
rm */chr*.ctg.agp
# to undo/redo:
# jkStuff/dropSplitTable.csh gap
# jkStuff/dropSplitTable.csh gold
/cluster/bin/i386/hgGoldGapGl mm5 /cluster/data/mm5 .
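# (Optional sanity check, not part of the original procedure: confirm that the
# gold and gap tables together tile each chromosome.  This assumes the split
# chrN_gold / chrN_gap table layout that hgGoldGapGl creates here.)
hgsql -N -e "select size from chromInfo where chrom='chr1';" mm5
hgsql -N -e "select max(chromEnd) from chr1_gold;" mm5
hgsql -N -e "select max(chromEnd) from chr1_gap;" mm5
# The larger of the two max(chromEnd) values should equal the chromInfo size;
# repeat per chromosome if anything looks off.  The featureBits numbers below
# give the genome-wide view of the same thing.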
featureBits mm5 gold # 2615483787 bases of 2615483787 (100.000%) in intersection featureBits mm4 gold # 2627444668 bases of 2627444668 (100.000%) in intersection featureBits mm5 gap # 549468286 bases of 2615483787 (21.008%) in intersection featureBits mm4 gap # 325167539 bases of 2627444668 (12.376%) in intersection featureBits mm3 gap # 202319873 bases of 2505900260 (8.074%) in intersection # Make and load GC percent table (DONE - 2004-06-27 - Fan) # NOT REQUIRED, been replaced by gc5Base procedure below ssh hgwdev mkdir -p /cluster/data/mm5/bed/gcPercent cd /cluster/data/mm5/bed/gcPercent hgsql mm5 < ~/kent/src/hg/lib/gcPercent.sql hgGcPercent mm5 ../../unmaskedNib # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR MM5 (DONE - 2004-06-27 - Fan) # using the Mm3 position blatted onto Mm5: # Enter mm5 into hgcentraltest.dbDb so test browser knows about it: hgsql -e 'INSERT INTO dbDb \ (name, description, nibPath, organism, defaultPos, \ active, orderKey, genome, scientificName, htmlPath, \ hgNearOk, hgPbOk, sourceName) \ VALUES("mm5", "May 2004", "/gbdb/mm5/nib", "Mouse", \ "chr6:121658238-121674165", \ 1, 20, "Mouse", "Mus musculus", "/gbdb/mm5/html/description.html",\ 0, 0, "NCBI Build 33");' \ -h genome-testdb hgcentraltest # If you need to delete that entry: hgsql -e 'delete from dbDb where name="mm5";' -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add mm5 in all the right places and do make update make alpha cvs commit makefile # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR MM5 (DONE - 2004-07-14 Fan) ssh hgwdev # Make one big 2bit file as well, and make a link to it in # /gbdb/mm5/nib because hgBlat looks there: cd /cluster/data/mm5 faToTwoBit */chr*.fa mm5.2bit ln -s /cluster/data/mm5/mm5.2bit /gbdb/mm5/nib/ hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("mm5", "snort", "17778", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("mm5", "snort", "17779", "0", "1");' \ -h genome-testdb hgcentraltest # REPEAT MASKING (Working on 2004-06-27 Fan) # TRF simpleRepeat below can be run at the same time # Split contigs, run RepeatMasker, lift results # * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make # RepeatMasker runs manageable on the cluster ==> results need lifting. # * For the NCBI assembly we repeat mask on the sensitive mode setting # (RepeatMasker -m -s -ali) #- Split contigs into 500kb chunks: ssh kksilo cd /cluster/data/mm5 foreach d ( */chr?{,?}{,_random}_?{,?} ) cd $d set contig = $d:t faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -maxN=500000 cd ../.. end # ... # 11 pieces of 11 written # 1 pieces of 1 written # ... #- Make the run directory and job list: cd /cluster/data/mm5 cat << '_EOF_' > jkStuff/RMMouse #!/bin/csh -fe cd $1 pushd . 
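# Stage the 500 kb chunk on the compute node's local /tmp, run RepeatMasker
# on it there, then copy the .out (plus .align, .tbl and .cat when present)
# back to the contig directory and clean up the temporary space.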
/bin/mkdir -p /tmp/mm5/$2 /bin/cp $2 /tmp/mm5/$2 cd /tmp/mm5/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -species mus $2 popd /bin/cp /tmp/mm5/$2/$2.out ./ if (-e /tmp/mm5/$2/$2.align) /bin/cp /tmp/mm5/$2/$2.align ./ if (-e /tmp/mm5/$2/$2.tbl) /bin/cp /tmp/mm5/$2/$2.tbl ./ if (-e /tmp/mm5/$2/$2.cat) /bin/cp /tmp/mm5/$2/$2.cat ./ /bin/rm -fr /tmp/mm5/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/mm5/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/mm5 '_EOF_' chmod +x jkStuff/RMMouse mkdir -p RMRun rm -f RMRun/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) foreach f ( $d/chr*_?{,?}_?{,?}.fa ) set f = $f:t echo /cluster/data/mm5/jkStuff/RMMouse \ /cluster/data/mm5/$d $f \ '{'check out line+ /cluster/data/mm5/$d/$f.out'}' \ >> RMRun/RMJobs end end #- Do the run ssh kk cd /cluster/data/mm5/RMRun para create RMJobs para try, para check, para check, para push, para check,... [kk:RMRun> para check 6885 jobs in batch 8 jobs (including everybody's) in Parasol queue. Checking finished jobs. ranOk: 6885 total jobs in batch: 6885 [kk:RMRun> para time 6885 jobs in batch 8 jobs (including everybody's) in Parasol queue. Checking finished jobs Completed: 6885 of 6885 jobs CPU time in finished jobs: 40084305s 668071.74m 11134.53h 463.94d 1.271 y IO & Wait Time: 122589s 2043.16m 34.05h 1.42d 0.004 y Average job time: 5840s 97.33m 1.62h 0.07d Longest job: 9804s 163.40m 2.72h 0.11d Submission to last job: 46771s 779.52m 12.99h 0.54d # Done 11:57 AM 6/28/04 #- Lift up the split-contig .out's to contig-level .out's ssh kksilo cd /cluster/data/mm5 foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. end #- Lift up the contig-level .out's to chr-level ssh kksilo cd /cluster/data/mm5 ./jkStuff/liftOut5.csh # This one error is OK # Can not find Un/lift/ordered.lft . #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/mm5 # to redo: # ./jkStuff/dropSplitTable.csh rmsk # make sure there's no chrUn -- rm Un/chrUn.fa.out hgLoadOut mm5 ?/*.fa.out ??/*.fa.out # VERIFY REPEATMASKER RESULTS (DONE - 2004-06-28 Fan) # Run featureBits on mm5 and on a comparable genome build, and compare: ssh hgwdev featureBits mm5 rmsk #1137310280 bases of 2615483787 (43.484%) in intersection #featureBits mm4 rmsk 1130883581 bases of 2627444668 (43.041%) in intersection #featureBits mm3 rmsk 1080265553 bases of 2505900260 (43.109%) in intersection #cd /cluster/data/mm5 #awk '{print $1}' chrom.sizes | sed -e "s/chr//" | grep -v random > chrom.lst # SIMPLE REPEAT TRACK (DONE - 2004-06-29 Fan) # TRF can be run in parallel with RepeatMasker on the file server # since it doesn't require masked input sequence. ssh kksilo mkdir /cluster/data/mm5/bed/simpleRepeat cd /cluster/data/mm5/bed/simpleRepeat mkdir trf rm -f jobs.csh echo '#\!/bin/csh -fe' > jobs.csh # create job list of 5MB chunks foreach f \ (/cluster/data/mm5/?{,?}/chr?{,?}_[0-9]*/chr?{,?}_?{,?}.fa \ /cluster/data/mm5/?{,?}/chr*_random_?{,?}/chr*_random_?{,?}.fa) set fout = $f:t:r.bed echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end chmod +x jobs.csh wc jobs.csh # 640 3836 90839 jobs.csh ./jobs.csh >&! jobs.log & # in bash: ./jobs.csh > jobs.log 2>&1 & tail -f jobs.log # Done 3:07 PM 6/29/04, took about 6 hours. 
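# (Optional check, not in the original run log: before lifting, confirm that
# every 5 MB chunk listed in jobs.csh produced a bed file under trf/.)
grep -c trfBig jobs.csh
ls trf/*.bed | wc -l
# The two counts should match; a shortfall points at a chunk whose trfBig job
# needs to be rerun.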
# When job is done lift output files liftUp simpleRepeat.bed /cluster/data/mm5/jkStuff/liftAll.lft warn trf/*.bed # Load into the database ssh hgwdev cd /cluster/data/mm5/bed/simpleRepeat hgLoadBed mm5 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # Loaded 1150615 elements of size 16 featureBits mm5 simpleRepeat # 81414259 bases of 2615483787 (3.113%) in intersection featureBits mm4 simpleRepeat # 82600648 bases of 2627444668 (3.144%) in intersection featureBits mm3 simpleRepeat # 75457193 bases of 2505900260 (3.011%) in intersection # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2004-06-29 - Fan) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kksilo cd /cluster/data/mm5/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords cd /cluster/data/mm5 mkdir -p bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` else echo "WARNING NO FILE: $c/lift/ordered.lst" endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # NOTE: ignore warning about non-existent Un/Lift/ordered.lift # since there is no chrUn # MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF # (Working on - 2004-06-29 Fan) ssh kksilo cd /cluster/data/mm5 #- Soft-mask (lower-case) the contig and chr .fa's ./jkStuff/makeFaMasked.csh >&! maskFa.out & # bash: ./jkStuff/makeFaMasked.csh > maskFa.out 2>&1 & tail -100f maskFa.out #- Make hard-masked .fa.masked files as well: ./jkStuff/makeHardMasked.csh Edited ./jkStuff/makeNib.csh to comment out "if ..." and "endif" as below: #!/bin/csh -fe mkdir -p nib mixedNib maskedNib foreach i (?{,?}) cd $i # foreach j (chr$i{,_random}.fa) foreach j (*.fa) # if (-e "${j}") set r = $j:r /cluster/bin/i386/faToNib $j ../nib/$r.nib /cluster/bin/i386/faToNib -softMask $j ../mixedNib/$r.nib /cluster/bin/i386/faToNib -hardMask $j ../maskedNib/$r.nib # endif echo done $j end cd .. 
end #- Rebuild the nib, mixedNib, maskedNib files: ./jkStuff/makeNib.csh # ignore complaints about missing chrUn # Redo symbolic links from /gbdb/mm5/nib to # mixed (RM and TRF) soft-masked nib files ssh hgwdev rm -fr /gbdb/mm5/nib/* ln -s /cluster/data/mm5/mixedNib/chr*.nib /gbdb/mm5/nib # Copy data to /cluster/bluearc for cluster runs ssh kksilo # masked contigs rm -fr /cluster/bluearc/scratch/mus/mm5/trfFa mkdir -p /cluster/bluearc/scratch/mus/mm5/trfFa cp -p /cluster/data/mm5/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa \ /cluster/bluearc/scratch/mus/mm5/trfFa # masked chrom nibs cd /cluster/data/mm5 rm -fr /cluster/bluearc/scratch/mus/mm5/softNib mkdir -p /cluster/bluearc/scratch/mus/mm5/softNib cp -p mixedNib/chr*.nib /cluster/bluearc/scratch/mus/mm5/softNib rm -fr /cluster/bluearc/scratch/mus/mm5/hardNib mkdir -p /cluster/bluearc/scratch/mus/mm5/hardNib cp -p maskedNib/chr*.nib /cluster/bluearc/scratch/mus/mm5/hardNib # fasta files rm -fr /cluster/bluearc/scratch/mus/mm5/fasta mkdir -p /cluster/bluearc/scratch/mus/mm5/fasta cp -p ?/*.fa ??/*.fa /cluster/bluearc/scratch/mus/mm5/fasta # RepeatMasker *.out files rm -rf /cluster/bluearc/scratch/mus/mm5/rmsk mkdir -p /cluster/bluearc/scratch/mus/mm5/rmsk cp -p ?{,?}/chr?{,?}{,_random}.fa.out /cluster/bluearc/scratch/mus/mm5/rmsk # lift file, for mrna processing cp -p jkStuff/liftAll.lft /cluster/bluearc/scratch/mus/mm5 #above was done 6/29/04 4:50PM # also copy to iservers ssh kkr1u00 #cd ~/mm5 cd /cluster/bluearc/scratch/mus/mm5 mkdir /iscratch/i/mus/mm5 cp -p liftAll.lft /iscratch/i/mus/mm5 mkdir -p /iscratch/i/mus/mm5/softNib cp -p /cluster/bluearc/scratch/mus/mm5/softNib/chr*.nib /iscratch/i/mus/mm5/softNib mkdir -p /iscratch/i/mus/mm5/trfFa cd /cluster/store6/mm5 cp ?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa /cluster/bluearc/scratch/mus/mm5/trfFa /cluster/bin/scripts/iSync ssh kkr1u00 mkdir /iscratch/i/mus/mm5 cd /iscratch/i/mus rsync -arlv /cluster/bluearc/scratch/mus/mm5 . #wrote 8660800915 bytes read 15380 bytes 17729409.00 bytes/sec #total size is 10242205742 speedup is 1.18 cd /iserver/kkr1u00/i/mus/mm5 mv trfFa maskedContigs cd /cluster/bluearc/scratch/mus/mm5 mv trfFa maskedContigs # PREPARE CLUSTER FOR BLASTZ RUN (DONE - 2004-06-29 - Fan) ssh kksilo mkdir -p /cluster/bluearc/scratch/mus/mm5/rmsk.spec cd /cluster/bluearc/scratch/mus/mm5/rmsk.spec ln -s ../rmsk/*.out . # NOTE: DON't leave indentations in the script below. cat << '_EOF_' > runArian.sh #!/bin/sh for FN in *.out do echo ${FN} /cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \ ${FN} -query mouse -comp human -comp rat done '_EOF_' chmod +x runArian.sh ./runArian.sh cd /cluster/bluearc/scratch/mus/mm5 mkdir linSpecRep.notInHuman mkdir linSpecRep.notInRat foreach f (rmsk.spec/*.out_hum_rat) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 1 $f > \ linSpecRep.notInHuman/$base.out.spec end foreach f (rmsk.spec/*.out_hum_rat) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 2 $f > \ linSpecRep.notInRat/$base.out.spec end cp rmsk.spec /iscratch/i/mus/mm5 -Rp cp linSpecRep.notInRat /iscratch/i/mus/mm5 -Rp cp linSpecRep.notInHuman /iscratch/i/mus/mm5 -Rp /cluster/bin/scripts/iSync # Request rsync /cluster/bluearc/scratch/mus/mm5 to the KiloKluster # GC5BASE WIGGLE TRACK (DONE - 2004-06-24 - Hiram) # This previously was a script that ran through each nib. # Recently transformed into a mini cluster run. 
ssh kki mkdir /cluster/data/mm5/bed/gc5Base cd /cluster/data/mm5/bed/gc5Base mkdir wigData5 dataLimits5 wigData5_1K dataLimits5_1K cat << '_EOF_' > kkRun.sh #!/bin/sh NIB=$1 chr=${NIB/.nib/} chrom=${chr#chr} hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \ /cluster/data/mm5/mixedNib | \ grep -w GC | \ awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \ wigAsciiToBinary -dataSpan=5 -chrom=${chr} \ -wibFile=wigData5/gc5Base_${chrom} \ -name=${chrom} stdin 2> dataLimits5/${chr} '_EOF_' # << this line makes emacs coloring happy chmod +x kkRun.sh ls /cluster/data/mm5/mixedNib > nibList cat << '_EOF_' > gsub #LOOP ./kkRun.sh $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 nibList single gsub jobList para create jobList para try, check, ... etc # Completed: 43 of 43 jobs # CPU time in finished jobs: 4969s 82.81m 1.38h 0.06d 0.000 y # IO & Wait Time: 611s 10.19m 0.17h 0.01d 0.000 y # Average job time: 130s 2.16m 0.04h 0.00d # Longest job: 370s 6.17m 0.10h 0.00d # Submission to last job: 598s 9.97m 0.17h 0.01d # load the .wig files back on hgwdev: ssh hgwdev cd /cluster/data/mm5/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base mm5 gc5Base wigData5/*.wig # and symlink the .wib files into /gbdb mkdir /gbdb/mm5/wib/gc5Base ln -s `pwd`/wigData5/*.wib /gbdb/mm5/wib/gc5Base # And then the zoomed data view ssh kki cd /cluster/data/mm5/bed/gc5Base mkdir wigData5_1K dataLimits5_1K cat << '_EOF_' > kkRunZoom.sh #!/bin/sh NIB=$1 chr=${NIB/.nib/} chrom=${chr#chr} hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \ /cluster/data/mm5/mixedNib | \ grep -w GC | \ awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \ wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \ -chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \ -name=${chrom} stdin 2> dataLimits5_1K/${chr} '_EOF_' # << this line makes emacs coloring happy chmod +x kkRunZoom.sh cat << '_EOF_' > gsubZoom #LOOP ./kkRunZoom.sh $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 nibList single gsubZoom jobListZoom para create jobListZoom para try ... check ... etc ... # Completed: 43 of 43 jobs # CPU time in finished jobs: 4878s 81.29m 1.35h 0.06d 0.000 y # IO & Wait Time: 488s 8.14m 0.14h 0.01d 0.000 y # Average job time: 125s 2.08m 0.03h 0.00d # Longest job: 378s 6.30m 0.10h 0.00d # Submission to last job: 665s 11.08m 0.18h 0.01d # Then load these .wig files into the same database as above ssh hgwdev cd /cluster/data/mm5/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base \ -oldTable mm5 gc5Base wigData5_1K/*.wig # and symlink these .wib files into /gbdb ln -s `pwd`/wigData5_1K/*.wib /gbdb/mm5/wib/gc5Base # GC5BASE WIGGLE TRACK (DONE - 2004-07-01 - Hiram) # This previously was a script that ran through each nib. # Recently transformed into a mini cluster run. 
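# Each cluster job below computes 5-base-window GC percent for one chromosome
# with hgGcPercent on the soft-masked nibs, reformats the output to
# position/value pairs with awk, and packs it into a binary .wib file with
# wigAsciiToBinary; the zoomed view additionally pipes through wigZoom to
# produce the 1000-base-span data.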
ssh kki mkdir /cluster/data/mm5/bed/gc5Base cd /cluster/data/mm5/bed/gc5Base mkdir wigData5 dataLimits5 wigData5_1K dataLimits5_1K cat << '_EOF_' > kkRun.sh #!/bin/sh NIB=$1 chr=${NIB/.nib/} chrom=${chr#chr} hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \ /cluster/data/mm5/mixedNib | \ grep -w GC | \ awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \ wigAsciiToBinary -dataSpan=5 -chrom=${chr} \ -wibFile=wigData5/gc5Base_${chrom} \ -name=${chrom} stdin 2> dataLimits5/${chr} '_EOF_' # << this line makes emacs coloring happy chmod +x kkRun.sh ls /cluster/data/mm5/mixedNib > nibList cat << '_EOF_' > gsub #LOOP ./kkRun.sh $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 nibList single gsub jobList para create jobList para try, check, ... etc # Completed: 43 of 43 jobs # CPU time in finished jobs: 4857s 80.94m 1.35h 0.06d 0.000 y # IO & Wait Time: 121s 2.02m 0.03h 0.00d 0.000 y # Average job time: 116s 1.93m 0.03h 0.00d # Longest job: 335s 5.58m 0.09h 0.00d # Submission to last job: 516s 8.60m 0.14h 0.01d # load the .wig files back on hgwdev: ssh hgwdev cd /cluster/data/mm5/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base mm5 gc5Base wigData5/*.wig # and symlink the .wib files into /gbdb mkdir /gbdb/mm5/wib mkdir /gbdb/mm5/wib/gc5Base ln -s `pwd`/wigData5/*.wib /gbdb/mm5/wib/gc5Base # And then the zoomed data view ssh kki cd /cluster/data/mm5/bed/gc5Base mkdir wigData5_1K dataLimits5_1K cat << '_EOF_' > kkRunZoom.sh #!/bin/sh NIB=$1 chr=${NIB/.nib/} chrom=${chr#chr} hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \ /cluster/data/mm5/mixedNib | \ grep -w GC | \ awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \ wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \ -chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \ -name=${chrom} stdin 2> dataLimits5_1K/${chr} '_EOF_' # << this line makes emacs coloring happy chmod +x kkRunZoom.sh cat << '_EOF_' > gsubZoom #LOOP ./kkRunZoom.sh $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 nibList single gsubZoom jobListZoom para create jobListZoom para try ... check ... etc ... # Completed: 43 of 43 jobs # CPU time in finished jobs: 4819s 80.31m 1.34h 0.06d 0.000 y # IO & Wait Time: 82s 1.37m 0.02h 0.00d 0.000 y # Average job time: 114s 1.90m 0.03h 0.00d # Longest job: 336s 5.60m 0.09h 0.00d # Submission to last job: 500s 8.33m 0.14h 0.01d # Then load these .wig files into the same database as above ssh hgwdev cd /cluster/data/mm5/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base \ -oldTable mm5 gc5Base wigData5_1K/*.wig # and symlink these .wib files into /gbdb ln -s `pwd`/wigData5_1K/*.wib /gbdb/mm5/wib/gc5Base # BLASTZ HG17 (WORKING - 2004-07-06 - Hiram) ssh kk mkdir -p /cluster/data/mm5/bed/blastz.hg17.2004-07-06 cd /cluster/data/mm5/bed ln -s blastz.hg17.2004-07-06 blastz.hg17 cd blastz.hg17 cat << '_EOF_' > DEF # mouse vs. 
human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
# not used
SEQ1_RMSK=/scratch/mus/mm5/rmsk
# not used
SEQ1_FLAG=-rodent
SEQ1_SMSK=/scratch/mus/mm5/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Human
SEQ2_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# RMSK not currently used
SEQ2_RMSK=
# FLAG not currently used
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.hg17
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy

# prepare first cluster run
ssh kk
cd /cluster/data/mm5/bed/blastz.hg17
# OK to use this script here, it is generic, works anywhere
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 46717 of 46717 jobs
# CPU time in finished jobs: 16171136s 269518.93m 4491.98h 187.17d 0.513 y
# IO & Wait Time: 534501s 8908.35m 148.47h 6.19d 0.017 y
# Average job time: 358s 5.96m 0.10h 0.00d
# Longest job: 5263s 87.72m 1.46h 0.06d
# Submission to last job: 30066s 501.10m 8.35h 0.35d

# Second cluster run lifts the raw alignments into the lav dir; running it on
# the big cluster would bring the file server to its knees.  Run this on the
# small cluster.
ssh kki
cd /cluster/data/mm5/bed/blastz.hg17
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 2186s 36.43m 0.61h 0.03d 0.000 y
# IO & Wait Time: 1804s 30.07m 0.50h 0.02d 0.000 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest job: 82s 1.37m 0.02h 0.00d
# Submission to last job: 3895s 64.92m 1.08h 0.05d

# Third cluster run to convert lav's to axt's
# Does not work on kki since /scratch on the iservers is not the
# same as /scratch on the other clusters.
ssh kk
cd /cluster/data/mm5/bed/blastz.hg17
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 2099s 34.98m 0.58h 0.02d 0.000 y
# IO & Wait Time: 6862s 114.37m 1.91h 0.08d 0.000 y
# Average job time: 208s 3.47m 0.06h 0.00d
# Longest job: 1276s 21.27m 0.35h 0.01d
# Submission to last job: 1291s 21.52m 0.36h 0.01d

# translate sorted axt files into psl
ssh kksilo
cd /cluster/data/mm5/bed/blastz.hg17
mkdir -p pslChrom
set tbl = "blastzHg17"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# This takes more than an hour.  You can shorten this by changing
# that command to a simple echo, putting the results into a file,
# splitting the file into four parts and running the four files as shell
# scripts on kksilo to have four processes running at the same
# time.  Load on kksilo gets up to about 20 which is reasonable.

# Load database tables
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.hg17/pslChrom
bash   # for tcsh users
for F in chr*_blastzHg17.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${F}
echo "${F} done"
done
# this is a 40 minute job
# exit bash if you are tcsh

# featureBits on blastzMm3 or 4 will not work on hgwdev, runs out of
# memory.
But if you reset your ~/.hg.conf to use the read-only # user and contact the hgwdev host, then use the x86_64 featureBits # featureBits mm5 blastzHg17 # 1057836001 bases of 2615483787 (40.445%) in intersection # featureBits mm4 blastzHg16 # 1068995521 bases of 2627444668 (40.686%) in intersection # CHAIN MM5 BLASTZ (DONE - 2004-07-02 - Hiram) # The axtChain is best run on the small kluster, or the kk9 kluster ssh kki mkdir -p /cluster/data/mm5/bed/blastz.hg17/axtChain/run1 cd /cluster/data/mm5/bed/blastz.hg17/axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.hg17/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # May need -minScore=5000 for all chroms if chr19 won't finish on kolossus cat << '_EOF_' > doChain #!/bin/csh axtChain $1 /iscratch/i/mus/mm5/softNib \ /iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain # 46 jobs gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... # Completed: 43 of 43 jobs # CPU time in finished jobs: 5354s 89.23m 1.49h 0.06d 0.000 y # IO & Wait Time: 10543s 175.72m 2.93h 0.12d 0.000 y # Average job time: 370s 6.16m 0.10h 0.00d # Longest job: 1694s 28.23m 0.47h 0.02d # Submission to last job: 1694s 28.23m 0.47h 0.02d # now on the file server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17/axtChain time chainMergeSort run1/chain/*.chain > all.chain # real 4m53.428s # user 4m3.040s # sys 0m29.440s time chainSplit chain all.chain # real 4m34.674s # user 3m38.370s # sys 0m29.990s # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/axtChain/chain bash # for tcsh users for I in *.chain do c=${I/.chain/} hgLoadChain mm5 ${c}_chainHg17 $I echo done $c done # exit bash if you are tcsh # This is a 50 minute job # featureBits mm5 chainHg17 # 2507720521 bases of 2615483787 (95.880%) in intersection # featureBits mm4 chainHg16 # 2558968088 bases of 2627444668 (97.394%) in intersection # NET MM5 (WORKING - 2004-07-02 - Hiram) ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17/axtChain mkdir preNet cd chain bash # for tcsh users for I in *.chain do echo preNetting $I /cluster/bin/i386/chainPreNet $I /cluster/data/mm5/chrom.sizes \ /cluster/data/hg17/chrom.sizes ../preNet/$I done # exit bash if you are tcsh # 7 minute job cd .. mkdir n1 cd preNet bash # for tcsh users for I in *.chain do n=${I/.chain/}.net echo primary netting $I $n /cluster/bin/i386/chainNet $I -minSpace=1 /cluster/data/mm5/chrom.sizes \ /cluster/data/hg17/chrom.sizes ../n1/$n /dev/null done # exit bash if you are tcsh # 5 minute job cd .. 
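# (Optional sanity check, not part of the original pipeline: confirm one
# pre-netted chain and one net per chromosome before the netSyntenic step.)
ls chain | wc -l
ls preNet | wc -l
ls n1 | wc -l
# All three counts should agree (43 chromosome files for mm5).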
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 2546110464, utime 16327 s/100, stime 3546 ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/axtChain time netClass hNoClass.net mm5 hg17 human.net \ -tNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInHuman \ -qNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInMouse # real 9m45.271s # user 6m47.170s # sys 1m20.440s # If things look good do ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with time netFilter -syn human.net > humanSyn.net # real 12m3.701s # user 8m44.180s # sys 1m1.610s # Load the nets into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/axtChain netFilter -minGap=10 human.net | hgLoadNet mm5 netHg17 stdin netFilter -minGap=10 humanSyn.net | hgLoadNet mm5 syntenyNetHg17 stdin # check results # featureBits mm5 netHg17 # 2504056038 bases of 2615483787 (95.740%) in intersection # featureBits mm4 netHg16 # 2553137690 bases of 2627444668 (97.172%) in intersection # featureBits mm5 syntenyNetHg17 # 2460442823 bases of 2615483787 (94.072%) in intersection # featureBits mm4 syntenyNetHg16 # 2495783103 bases of 2627444668 (94.989%) in intersection # Add entries for net and chain to mouse/hg17 trackDb # make net ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17/axtChain mkdir humanNet time netSplit human.net humanNet # real 4m46.190s # user 3m27.740s # sys 0m38.900s # extract axt's from net, and convert to maf's ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17/axtChain mkdir ../axtNet ../mafNet cat > makeMaf.csh << '_EOF_' #!/bin/csh -ef foreach f (humanNet/chr*.net) set c = $f:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt humanNet/$c.net chain/$c.chain \ /cluster/data/mm5/nib /cluster/data/hg17/nib stdout | \ axtSort stdin ../axtNet/$c.axt axtToMaf ../axtNet/$c.axt \ /cluster/data/mm5/chrom.sizes /cluster/data/hg17/chrom.sizes \ ../mafNet/$c.maf -tPrefix=mm5. -qPrefix=hg17. echo "Complete: $c.net -> axtNet/$c.axt -> mafNet/$c.maf" end '_EOF_' # << for emacs csh makeMaf.csh >&! makeMaf.log & tail -100f makeMaf.log # real 39m53.316s # user 20m2.530s # sys 4m40.120s ssh hgwdev mkdir /cluster/data/mm5/bed/blastz.hg17/axtBest cd /cluster/data/mm5/bed/blastz.hg17/axtBest ln -s ../axtNet/chr*.axt . # copy net axt's to download area ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtNet cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtNet gzip *.axt XXX - running 2004-07-13 14;18 # add README.txt file to dir (use previous assembly's copy as template) # 32 minute gzip # Convert those axt files to psl ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo -n "processing $c.axt -> ${c}_blastzBesthg17.psl ..." 
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestHg17.psl echo "Done" end # Load tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/pslBest for I in chr*BestHg17.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I} echo "done ${I}" done # check results # featureBits mm5 blastzBestHg17 # 1020692679 bases of 2615483787 (39.025%) in intersection # featureBits mm4 blastzBestHg16 # 1030510540 bases of 2627444668 (39.221%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm5/axtBest/Hg17 cd /gbdb/mm5/axtBest/Hg17 ln -s /cluster/data/mm5/bed/blastz.hg17/axtNet/chr*.axt . cd /cluster/data/mm5/bed/blastz.hg17/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/mm5/axtBest/Hg17/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('hg17','Blastz Best in Genome','$chr','$f');" \ >>! axtInfoInserts.sql end hgsql mm5 < ~/kent/src/hg/lib/axtInfo.sql # table axtInfo may already exist, ignore create error. hgsql mm5 < axtInfoInserts.sql # MM5 TO CANFAM1 LIFTOVER CHAIN (DONE 1/7/05 Andy) ssh kolossus cd /cluster/data/mm5/bed/blastz.canFam1/axtChain mkdir net netSplit dog.net net mkdir over for file in chain/*.chain; do chrom=`basename $file .chain` netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over cat over/$chrom.over >> /cluster/data/mm5/bed/liftOver/mm5ToCanFam1.chain done rm -rf over/ ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/mm5/liftOver cp /cluster/data/mm5/bed/liftOver/mm5ToCanFam1.chain . gzip mm5ToCanFam1.chain mkdir -p /gbdb/mm5/liftOver ln -s /cluster/data/mm5/bed/liftOver/mm5ToCanFam1.chain /gbdb/mm5/liftOver/mm5ToCanFam1.over.chain hgAddLiftOverChain -multiple mm5 canFam1 # ADD CHAIN AND NET TO VSHG17 DOWNLOAD AREAS (DONE Sept. 8th, 2004, Heather) ssh hgwdev cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/all.chain.gz \ /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.chain.gz cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/human.net.gz \ /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.net.gz cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17 md5sum *.gz */*.gz > md5sum.txt # Update the README.txt # LIFTOVER CHAIN TO MM6 (DONE 4/20/2005 Andy) ssh kkstore cd /cluster/data/mm6 mkdir liftSplits/ cat << _EOF_ > split.csh #!/bin/tcsh set liftDir = /cluster/data/mm6/liftSplits cd /cluster/data/mm6 foreach n (\`ls ?{,?}/*.fa\`) set d = \$n:h set c = \$n:t:r echo \$c faSplit -lift=\$liftDir/lift/\$c.lft size /cluster/data/mm6/\$d/\$c.fa -oneFile 3000 \$liftDir/split/\$c end _EOF_ chmod +x split.csh ./split.csh # kkstore not mounting /panasas ... weird. ssh hgwdev cd /cluster/data/mm6 cp -r liftSplits/ /panasas/store/mm6 ssh kk cd /cluster/data/mm5 makeLoChain-align mm5 /scratch/mus/mm5/softNib \ mm6 /panasas/store/mm6/liftSplits/split # Created parasol job in bed/blat.mm6.2005-04-20/run cd bed/blat.mm6.2005-04-20/run/ para create spec para push # para time was complicated by the fact I redid some hippos (mostly chrUn_random # alignments) on kk9. Basically, it took about a day. # In the end, the chrUn_random vs. chrUn_random just took wayyyyyy too long. # Later, if a more rigorous chain file is desired, it can be made after rerunning # that blat. # Lifting ssh kksilo cd /cluster/data/mm5/bed/blat.mm6 makeLoChain-lift mm5 mm6 /panasas/store/mm6/liftSplits/lift \ > lift.log & tail -f lift.log # OK so I remember this problem with makeLoChain-lift: it always stops with chr1. # I'll just do it manually. 
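# (The loop below does what makeLoChain-lift would have done: for each mm6
# chromosome it lifts the split-sequence psls in raw/ up to whole-chromosome
# mm6 coordinates with the liftSplits lift files, writing one psl per
# chromosome into ../psl/.)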
cd raw/ for nib in `ls /cluster/data/mm6/nib`; do chrom=${nib%.nib} echo $chrom liftUp -pslQ ../psl/${chrom}.psl /panasas/store/mm6/liftSplits/lift/${chrom}.lft warn chr*_${chrom}.psl echo done $chrom done ssh kk9 cd /cluster/data/mm5/bed ln -s blat.mm6.2005-04-20 blat.mm6.2005-04-22 makeLoChain-chain mm5 /cluster/data/mm5/nib mm6 /cluster/data/mm6/nib cd /cluster/data/mm5/bed/blat.mm5.2005-02-08/chainRun para try para check para push para time #Completed: 40 of 40 jobs #CPU time in finished jobs: 27315s 455.25m 7.59h 0.32d 0.001 y #IO & Wait Time: 67093s 1118.22m 18.64h 0.78d 0.002 y #Average job time: 2360s 39.34m 0.66h 0.03d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 11656s 194.27m 3.24h 0.13d #Submission to last job: 31329s 522.15m 8.70h 0.36d # That looks weird but I think it was because 8 jobs crashed because there was no disk space. # I freed up some space but then there wasn't much room for the netting stage. # It crashed twice when I tried it using the script makeLoChain-net after the # chainMergeSort/split. I figured out that it needed more memory. So I ran it manually on # kolossus ssh kolossus mkdir -p /tmp/andy cd /tmp/andy cp -r /cluster/data/mm5/bed/blat.mm6/chainRaw . rm -rf /cluster/data/mm5/bed/blat.mm6/chainRaw mkdir chain chainMergeSort chainRaw/*.chain | chainSplit chain stdin mkdir net over cd chain for c in *.chain; do echo ${c%.chain}; chainNet $c /cluster/data/mm5/chrom.sizes \ /cluster/data/mm6/chrom.sizes ../net/${c%.chain}.net /dev/null echo done $c done for chain in *; do c=${chain%.chain} netChainSubset ../net/$c.net $chain ../over/$c.over done cd ../over/ cat * >> ../mm5ToMm6.chain cd ../ cp mm5ToMm6.chain /cluster/data/mm5/bed/liftOver/ cd /cluster/data/mm5/bed/liftOver mv mm5ToMm6.chain mm5ToMm6.over.chain ssh hgwdev ln -s /cluster/data/mm5/bed/liftOver/mm5ToMm6.over.chain /gbdb/mm5/liftOver/mm5ToMm6.over.chain hgAddLiftOverChain mm5 mm6 /gbdb/mm5/liftOver/mm5ToMm6.over.chain cd /usr/local/apache/htdocs/goldenPath/mm5/liftOver cp /gbdb/mm5/liftOver/mm5ToMm6.over.chain . gzip mm5ToMm6.over.chain # MAKING HUMAN SYNTENY (DONE - 2004-07-13 - Hiram) ssh hgwdev mkdir /cluster/data/mm5/bed/syntenyHg17 cd /cluster/data/mm5/bed/syntenyHg17 # Copy all the needed scripts from /cluster/data/hg16/bed/syntenyRn3 cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl . ./syntenicBest.pl -db=mm5 -table=blastzBestHg17 > synBest.out 2>&1 ./smooth.pl > smooth.out 2>&1 ./joinsmallgaps.pl > joingaps.out 2>&1 ./fillgap.pl -db=mm5 -table=blastzBestHg17 > fillgap.out 2>&1 ./synteny2bed.pl > syn2bed.out 2>&1 # The five commands above # real 168m43.627s # user 0m18.680s # sys 0m4.990s # Used to load this in syntenyHg17, but that type is misleading to # the table browser and fails the checkTableCoords check. 
# Better to use this ensRatMusHom type: # Need a new name here for the Hg17 to not conflict with the # others sed -e 's/ensPhusionBlast/ensRatMusHg17/g' \ $HOME/kent/src/hg/lib/ensPhusionBlast.sql \ > ensRatMusHg17.sql hgLoadBed mm5 ensRatMusHg17 ucsc100k.bed -sqlTable=ensRatMusHg17.sql # featureBits mm5 ensRatMusHg17 # 2366463967 bases of 2615483787 (90.479%) in intersection # featureBits mm4 syntenyHg16 # 2299774191 bases of 2627444668 (87.529%) in intersection # MAKING MOUSE AXTTIGHT FROM AXTBEST (DONE - 2004-07-13 - Hiram) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh kksilo cd /cluster/data/mm5/bed/blastz.hg17/axtNet mkdir -p ../axtTight bash # for tcsh users for I in *.axt do echo "axtNet/$I -> ../axtTight/$I" subsetAxt $I ../axtTight/$I \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 done # exit bash if you are tcsh # An 8 minute job # translate to psl cd ../axtTight mkdir ../pslTight bash # for tcsh users for I in *.axt do C=${I/.axt/} axtToPsl $I ../S1.len ../S2.len ../pslTight/${C}_blastzTightHg17.psl echo "Done: $I -> ${C}_blastzTightHg17.psl" done # exit bash if you are tcsh # Load tables into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/pslTight for I in chr*TightHg17.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I} echo "done ${I}" done # Compare results with previous assembly: # featureBits mm5 blastzTightHg17 # 168148800 bases of 2615483787 (6.429%) in intersection # featureBits mm4 blastzTightHg16 # 170163839 bases of 2627444668 (6.476%) in intersection # copy axt's to download area ssh hgwdev cd /cluster/data/mm5/bed/blastz.hg17/axtTight mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtTight cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtTight cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtTight gzip *.axt # add README.txt file to dir (use previous assembly's copy as template) # 4 minute gzip #### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-07-13 - Fan) # PLEASE NOTE THAT THE ENSEMBLXREF3 TABLE IS BUILT USING ENSMART DATA OF MOUSE BUILD 32. # THIS TABLE IS NEEDED TO SUPPORT SUPERFAMILY TRACK OF THE PROTEOME BROWSER. # WHEN ENSEMBL FINISHES THEIR MOUSE BUILD 33 RELEASE, WE NEED TO REBUILD THIS # TABLE. # Get the ensembl gene/protein cross-reference data from # http://www.ensembl.org/Multi/martview?species=Mus_musculus # Follow this sequence through the pages: # Page 1) Make sure that the Mus musculus choice is selected. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Feature" box, select Ensembl gene, transcript, and peptid IDs, SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC # Page 4) Choose "Text, tab separated". choose gzip compression. hit export. # Save as ensXref sed ensXref.tsv -e 's/\./\t/g' > ensemblXref3.tab hgsql mm5 -e "drop table ensemblXref3" hgsql mm5 < ~/src/hg/lib/ensemblXref3.sql hgsql mm5 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines' # CPGISLANDS (DONE - 2004-07-13 - Fan) ssh hgwdev mkdir -p /cluster/data/mm5/bed/cpgIsland cd /cluster/data/mm5/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make # gcc readseq.c cpg_lh.c -o cpglh.exe mv cpglh.exe /cluster/data/mm5/bed/cpgIsland/ # cpglh.exe requires hard-masked (N) .fa's. # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. 
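# (Optional pre-flight check, not in the original log: make sure a hard-masked
# .fa.masked file exists for every chromosome-level .fa before running
# cpglh.exe over ../../*/chr*.fa.masked.)
ls /cluster/data/mm5/*/chr*.fa | wc -l
ls /cluster/data/mm5/*/chr*.fa.masked | wc -l
# The two counts should be identical; a missing .fa.masked means
# makeHardMasked.csh needs to be rerun for that chromosome.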
ssh kksilo cd /cluster/data/mm5/bed/cpgIsland foreach f (../../*/chr*.fa.masked) set fout=$f:t:r:r.cpg echo running cpglh on $f to $fout ./cpglh.exe $f > $fout end # the warnings: # Bad char 0x52 = 'R' at line 117472, base 5873535, sequence chr14 # Bad char 0x53 = 'S' at line 120651, base 6032462, sequence chr14 # Bad char 0x53 = 'S' at line 120652, base 6032546, sequence chr14 # real 21m47.823s # user 18m30.810s # sys 1m13.420s # Transform cpglh output to bed + cat << '_EOF_' > filter.awk { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk chr*.cpg > cpgIsland.bed ssh hgwdev cd /cluster/data/mm5/bed/cpgIsland hgLoadBed mm5 cpgIslandExt -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Loaded 16238 elements of size 10 # Sorted # Saving bed.tab # Loading mm5 # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 2004-07-14 Fan) ssh kksilo cd /cluster/data/mm5 # Build the .zip files cp /cluster/data/rn3/jkStuff/zipAll.sh jkStuff # edit this zipAll.sh to produce output to /cluster/data/mm5/bigZips jkStuff/zipAll.sh > zipAll.log # bash: ./jkStuff/zipAll.sh > zipAll.log 2>&1 & tail -f zipAll.log mkdir zip mv *.zip zip cd zip # Look at zipAll.log to make sure all file lists look reasonable. # Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test # 46 chromAgp.zip.test # 45 chromFa.zip.test # 45 chromFaMasked.zip.test # 45 chromOut.zip.test # 45 chromTrf.zip.test # 641 contigAgp.zip.test # 641 contigFa.zip.test # 641 contigFaMasked.zip.test # 641 contigOut.zip.test # 641 contigTrf.zip.test #3431 total ssh hgwdev cd /cluster/data/mm5/jkStuff # create generic copy program cat << '_EOF_' > cpToWeb.sh #!/bin/sh if [ $# -ne 1 ]; then echo "usage: cpToWeb.sh " echo -e "\texample: cpToWeb.sh mm5" exit 255 fi GP=/usr/local/apache/htdocs/goldenPath/$1 mkdir -p ${GP} mkdir -p ${GP}/chromosomes for f in ../?/*.fa ../??/*.fa do BN=`basename ${f}` zip -j ${GP}/chromosomes/${BN}.zip ${f} echo "zipped: ${BN}" done mkdir -p ${GP}/bigZips for Z in *.zip do cp -p ${Z} ${GP}/bigZips echo "copied: ${Z}" done '_EOF_' # << this line keeps emacs coloring happy chmod +x cpToWeb.sh cd /cluster/data/mm5/zip ../jkStuff/cpToWeb.sh mm5 cd /usr/local/apache/htdocs/goldenPath/mm5 # Take a look at bigZips/* and chromosomes/*, update their README.txt's # Make the upstream sequence files. # NOTE: must be redone due to bad gap track cd bigZips featureBits mm5 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa rm upstream1000.fa featureBits mm5 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa rm upstream2000.fa featureBits mm5 refGene:upstream:5000 -fa=upstream5000.fa zip upstream5000.zip upstream5000.fa rm upstream5000.fa # mrna zips -- auto dump process takes care of this # MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN (DONE 7/15/04 angie) # In an email 2/13/04, Arian said we could treat all human repeats as # lineage-specific for human-chicken blastz. Do the same for mouse. # Scripts expect *.out.spec filenames, so set that up: ssh kkr1u00 cd /cluster/data/mm5 mkdir /iscratch/i/mus/mm5/linSpecRep.notInChicken foreach f (/iscratch/i/mus/mm5/rmsk/chr*.fa.out) cp -p $f /iscratch/i/mus/mm5/linSpecRep.notInChicken/$f:t:r:r.out.spec end iSync # Use these the next time we run human-chicken blastz. 
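# (Optional sanity check, not in the original log: every chrN.fa.out under the
# iserver rmsk directory should now have a matching chrN.out.spec copy.)
ls /iscratch/i/mus/mm5/rmsk/chr*.fa.out | wc -l
ls /iscratch/i/mus/mm5/linSpecRep.notInChicken/chr*.out.spec | wc -l
# The counts should match before the chicken blastz run below is started.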
# BLASTZ CHICKEN (GALGAL2) (DONE 7/19/04 angie) ssh kk mkdir /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 ln -s blastz.galGal2.2004-07-15 /cluster/data/mm5/bed/blastz.galGal2 cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 # Use human-chicken params: set L=10000 (higher threshold on blastz's # outer loop) and abridge repeats. cat << '_EOF_' > DEF # mouse vs. chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse SEQ1_DIR=/scratch/mus/mm5/softNib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInChicken SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken SEQ2_DIR=/iscratch/i/galGal2/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastz.galGal2.2004-07-15 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # first cluster run: raw blastz alignments ssh kk bash # if a csh/tcsh user cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 source DEF mkdir $RAW run.0 /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j sh ./xdir.sh cd run.0 sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList para create jobList para try, check, push, check, .... #Completed: 51491 of 51491 jobs #Average job time: 357s 5.95m 0.10h 0.00d #Longest job: 1015s 16.92m 0.28h 0.01d #Submission to last job: 89841s 1497.35m 24.96h 1.04d # second cluster run: lift raw alignments -> lav dir ssh kki bash # if a csh/tcsh user cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 source DEF mkdir run.1 lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList cd run.1 wc -l jobList para create jobList para try, check, push, etc ... #Completed: 341 of 341 jobs #Average job time: 11s 0.18m 0.00h 0.00d #Longest job: 55s 0.92m 0.02h 0.00d #Submission to last job: 245s 4.08m 0.07h 0.00d # third run: lav -> axt # NOTE: use axtRescore here because we used a non-default BLASTZ_Q matrix # and abridged repeats (Penn State's restore_rpts program rescores with # default matrix, oops). 
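# (The per-chromosome do.csh created below implements that: it concatenates
# the lav files, pipes them through lavToAxt, rescores with
# axtRescore -scoreScheme=HoxD55.q, sorts into axtChrom/ with axtSort, and
# writes the matching psl with axtToPsl.)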
ssh kki cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 mkdir axtChrom pslChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t set path = (/cluster/bin/x86_64 $path) cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin \ /iscratch/i/mus/mm5/softNib /iscratch/i/galGal2/nib stdout \ | axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q stdin stdout \ | axtSort stdin ../../axtChrom/$chr.axt axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl '_EOF_' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList para try, check, push, check #Completed: 43 of 43 jobs #Average job time: 38s 0.63m 0.01h 0.00d #Longest job: 160s 2.67m 0.04h 0.00d #Submission to last job: 233s 3.88m 0.06h 0.00d # CHAIN CHICKEN BLASTZ (DONE 7/19/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \ -minScore=5000 $1 \ /iscratch/i/mus/mm5/softNib \ /iscratch/i/galGal2/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... #Completed: 43 of 43 jobs #Average job time: 60s 1.00m 0.02h 0.00d #Longest job: 355s 5.92m 0.10h 0.00d #Submission to last job: 355s 5.92m 0.10h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=5000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain mm5 ${c}_chainGalGal2 $i end featureBits mm5 chainGalGal2Link #78951466 bases of 2615483787 (3.019%) in intersection featureBits hg17 chainGalGal2Link #103882699 bases of 2866216770 (3.624%) in intersection # NET CHICKEN BLASTZ (DONE 7/19/04 angie) ssh kksilo cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain netClass -noAr noClass.net mm5 galGal2 chicken.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain rm noClass.net # Make a 'syntenic' subset of these with netFilter -syn chicken.net > chickenSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain netFilter -minGap=10 chicken.net | hgLoadNet mm5 netGalGal2 stdin netFilter -minGap=10 chickenSyn.net | hgLoadNet mm5 syntenyNetGalGal2 stdin # Add entries for chainGalGal2, netGalGal2, 
syntenyNetGalGal2 to # mouse/mm5 trackDb # GENERATE GALGAL2 MAF FOR MULTIZ FROM NET (DONE 7/19/04 angie) ssh kksilo cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain netSplit chicken.net net cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/mm5/mixedNib \ /cluster/data/galGal2/nib stdout \ | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/mm5/chrom.sizes /cluster/data/galGal2/chrom.sizes \ $maf -tPrefix=mm5. -qPrefix=galGal2. end # XENOPUS BLASTZ/CHAIN/NET (DONE 9/24/04 jk) # see makeXenTro1.doc and search for zb.mm5 # The results of this are also symlinked under mm5/bed # MAKE VSGALGAL2 DOWNLOADABLES (DONE 7/19/04 angie) ssh kksilo cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15 gzip axtNet/*.axt cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain ln all.chain chicken.chain zip /cluster/data/mm5/zip/chicken.chain.zip chicken.chain rm chicken.chain zip /cluster/data/mm5/zip/chicken.net.zip chicken.net zip /cluster/data/mm5/zip/chickenSyn.net.zip chickenSyn.net ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsGalGal2 cd /usr/local/apache/htdocs/goldenPath/mm5/vsGalGal2 mv /cluster/data/mm5/zip/chicken*.zip . cp -pR /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtNet . md5sum *.zip axtNet/* > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 7/15/04 angie) ssh kkr1u00 cd /cluster/bluearc/scratch/mus/mm5/rmsk # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. # Even though we already have the mouse-human linSpecReps, # extractLinSpecReps requires two columns of DateRepsinRMoutput.pl # additions. So add human, then ignore it. # Dog in extra column 1, Human in extra column 2 foreach outfl ( *.out ) echo "$outfl" /cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \ ${outfl} -query mouse -comp dog -comp human end # Now extract dog (extra column 1), ignore human. cd /iscratch/i/mus/mm5 mkdir linSpecRep.notInDog foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/*.out_dog_hum) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 1 $f > \ linSpecRep.notInDog/$base.out.spec end # Clean up. rm /cluster/bluearc/scratch/mus/mm5/rmsk/*.out_dog_hum iSync # BLASTZ DOG (CANFAM1) (DONE 7/16/04 angie) ssh kk mkdir /cluster/data/mm5/bed/blastz.canFam1.2004-07-15 ln -s blastz.canFam1.2004-07-15 /cluster/data/mm5/bed/blastz.canFam1 cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15 # Use default (Human-Mouse) settings for starters. cat << '_EOF_' > DEF # mouse vs. 
dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.canFam1.2004-07-15
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# first cluster run: raw blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, check, push, check, ....
# cluster was mobbed...
#Completed: 93775 of 93775 jobs
#Average job time: 187s 3.11m 0.05h 0.00d
#Longest job: 3907s 65.12m 1.09h 0.05d
#Submission to last job: 76763s 1279.38m 21.32h 0.89d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 341 of 341 jobs
#Average job time: 98s 1.63m 0.03h 0.00d
#Longest job: 281s 4.68m 0.08h 0.00d
#Submission to last job: 2102s 35.03m 0.58h 0.02d
# third run: lav -> axt
# (if non-default BLASTZ_Q is used in the future, put axtRescore in
# the pipe after lavToAxt)
ssh kki
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
  /iscratch/i/mus/mm5/softNib /iscratch/i/canFam1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
  ../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
  echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 43 of 43 jobs
#Average job time: 671s 11.18m 0.19h 0.01d
#Longest job: 2398s 39.97m 0.67h 0.03d
#Submission to last job: 2417s 40.28m 0.67h 0.03d
# CHAIN DOG BLASTZ (DONE 7/16/04 angie)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChrom/*.axt \
  > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
  /iscratch/i/mus/mm5/softNib \
  /iscratch/i/canFam1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 43 of 43 jobs #Average job time: 537s 8.96m 0.15h 0.01d #Longest job: 2071s 34.52m 0.58h 0.02d #Submission to last job: 2071s 34.52m 0.58h 0.02d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=5000 /tmp/score.$f:t:r echo "" end # Lots of chaff with scores in the 3000's. Many very-high-scoring # chains. So filter the chain down somewhat... mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered # Load chains into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm5 ${c}_chainCanFam1 $i end # mouse-dog gets significantly less coverage than human-dog: featureBits mm5 -chrom=chr1 chainCanFam1Link #63386139 bases of 185739816 (34.126%) in intersection featureBits hg17 -chrom=chr1 chainCanFam1Link #123999291 bases of 222827847 (55.648%) in intersection # mouse-dog isn't a whole lot less than mouse-human though: featureBits mm5 -chrom=chr1 chainHg17Link #75492250 bases of 185739816 (40.644%) in intersection # NET DOG BLASTZ (DONE 7/16/04 angie) ssh kolossus cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain netClass -noAr noClass.net mm5 canFam1 dog.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain rm noClass.net # Make a 'syntenic' subset of these with netFilter -syn dog.net > dogSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain netFilter -minGap=10 dog.net | hgLoadNet mm5 netCanFam1 stdin netFilter -minGap=10 dogSyn.net | hgLoadNet mm5 syntenyNetCanFam1 stdin # Add entries for chainCanFam1, netCanFam1 to mouse/mm5 trackDb # MAKE VSCANFAM1 DOWNLOADABLES (DONE 7/19/04 angie) ssh kksilo cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15 gzip axtNet/chr*.axt cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain ln all.chain dog.chain zip /cluster/data/mm5/zip/dog.chain.zip dog.chain rm dog.chain zip /cluster/data/mm5/zip/dog.net.zip dog.net zip /cluster/data/mm5/zip/dogSyn.net.zip dogSyn.net ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsCanFam1 cd /usr/local/apache/htdocs/goldenPath/mm5/vsCanFam1 mv /cluster/data/mm5/zip/dog*.zip . cp -pR /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtNet . md5sum *.zip axtNet/* > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
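# (Editor's note, not a command from the original 2004 build: before pushing the
# vsCanFam1 downloads it can be handy to re-verify the copied files against the
# checksum file just written; "md5sum -c" is standard GNU coreutils and assumed
# to be available on hgwdev.)
cd /usr/local/apache/htdocs/goldenPath/mm5/vsCanFam1
md5sum -c md5sum.txt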
# GENERATE CANFAM1 MAF FOR MULTIZ FROM NET (DONE 7/19/04 angie) ssh kksilo cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain netSplit dog.net net cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/mm5/nib \ /cluster/data/canFam1/nib stdout \ | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/mm5/chrom.sizes /cluster/data/canFam1/chrom.sizes \ $maf -tPrefix=mm5. -qPrefix=canFam1. end ### MAKE THE affyU74 TRACK - needed for the Gene Sorter # (DONE - 2004-07-16 - Fan) # MAKE THE affyU74 TRACK using Affy consensus sequences instead of # target sequences. Recalculate alignments and load data ---------------------------------- # Load up semi-local disk with target sequences for Affy mouse U74 chips. ssh kkr1u00 mkdir -p /iscratch/i/affy # This /projects filesystem is not available on kkr1u00 # but it is on kk ssh kk cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy ssh kkr1u00 iSync # Run cluster job to do alignments ssh kk mkdir /cluster/data/mm5/bed/affyU74.2004-07-16 cd /cluster/data/mm5/bed/affyU74.2004-07-16 mkdir run cd run mkdir psl echo /scratch/mus/mm5/maskedContigs/*.fa | wordLine stdin > genome.lst ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.lst affy.lst gsub jobList para create jobList para try # do usual para check/para push etc. until the job is done. # Completed: 1917 of 1917 jobs # CPU time in finished jobs: 14240s 237.34m 3.96h 0.16d 0.000 y # IO & Wait Time: 7946s 132.43m 2.21h 0.09d 0.000 y # Average job time: 12s 0.19m 0.00h 0.00d # Longest job: 40s 0.67m 0.01h 0.00d # Submission to last job: 307s 5.12m 0.09h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU74.psl. ssh kksilo cd /cluster/data/mm5/bed/affyU74.2004-07-16/run pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least minAli = 0.95. # minAli = 0.97 too high. low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null # Processed 44630 alignments liftUp ../all_affyU74.psl ../../../jkStuff/liftAll.lft warn contig.psl # Sort by chromosome and load into database. ssh hgwdev cd /cluster/data/mm5/bed/affyU74.2004-07-16 pslSortAcc nohead chrom temp all_affyU74.psl cat chrom/*.psl > affyU74.psl # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table hgLoadPsl mm5 affyU74.psl rm -fr chrom temp run ## MAKE THE affyGnfU74 TRACKs (DONE - 2004-07-18 - Fan) # Make bed files and load consensus sequences for Affy U74 chip set. # Fix broken symlinks to microarray data after directory structure changed # (DONE, 2005-05-03, hartera) ---------------------------------- #This needs to be done after affyU74 is already made. 
ssh hgwdev mkdir -p /cluster/data/mm5/bed/affyGnf.2004-07-16 cd /cluster/data/mm5/bed/affyGnf.2004-07-16 # may need to build this command in src/hg/affyGnf affyPslAndAtlasToBed ../affyU74.2004-07-16/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \ affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2 affyPslAndAtlasToBed ../affyU74.2004-07-16/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \ affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2 affyPslAndAtlasToBed ../affyU74.2004-07-16/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \ affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2 # edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table hgLoadBed mm5 affyGnfU74A affyGnfU74A.bed hgLoadBed mm5 affyGnfU74B affyGnfU74B.bed hgLoadBed mm5 affyGnfU74C affyGnfU74C.bed # Add in sequence data for U74 tracks. # Copy consensus sequence to /gbdb if it isn't already # [THE SYM LINKS WERE ALREADY DONE.] mkdir -p /gbdb/hgFixed/affyProbes cd /gbdb/hgFixed/affyProbes # fix broken symlinks after directory structure changed # /projects/compbiodata ----> /projects/compbio/data rm U74* # make correct symlinks (hartera, 2005-05-03) ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa . # used perl -pi.bak -e 's/;/ /' to remove ";" after probe name # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4. # reload sequences with prefix removed so acc matches name used in # other dependent tables hgLoadSeq -abbr=U74Av2: mm5 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa hgLoadSeq -abbr=U74Bv2: mm5 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa hgLoadSeq -abbr=U74Cv2: mm5 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa ### GNF ATLAS 2 [DONE Fan 7/18/2004] # Align probes from GNF1M chip. ssh kk cd /cluster/data/mm5/bed mkdir -p geneAtlas2/run/psl cd geneAtlas2/run mkdir -p /cluster/bluearc/geneAtlas2 cp /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /cluster/bluearc/geneAtlas2 ls -1 /scratch/mus/mm5/maskedContigs/ > genome.lst ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc /scratch/mus/mm5/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub gensub2 genome.lst mrna.lst gsub spec para create spec para try para check para push para time # Completed: 639 of 639 jobs # CPU time in finished jobs: 58174s 969.57m 16.16h 0.67d 0.002 y # IO & Wait Time: 4833s 80.55m 1.34h 0.06d 0.000 y # Average job time: 99s 1.64m 0.03h 0.00d # Longest job: 189s 3.15m 0.05h 0.00d # Submission to last job: 1749s 29.15m 0.49h 0.02d # Do sort, best in genome filter, and convert to chromosome coordinates # to create gnf1h.psl. pslSort dirs raw.psl tmp psl pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp ../affyGnf1m.psl ../../../jkStuff/liftAll.lft warn contig.psl rm -r contig.psl raw.psl psl # Load probes and alignments from GNF1H into database. 
ssh hgwdev
cd /cluster/data/mm5/bed/geneAtlas2
ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
hgLoadPsl mm5 affyGnf1m.psl
hgLoadSeq mm5 /gbdb/hgFixed/affyProbes/gnf1m.fa
# Load up track
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
  affyGnf1m.psl
# Note that the unmapped 5000 records are from all-N sequences.
hgLoadBed mm5 gnfAtlas2 gnfAtlas2.bed
# MOUSE AFFYMETRIX MOE430 TRACK (DONE, 2004-07-19, Fan)
mkdir -p /projects/compbio/data/microarray/affyMouse
# Download MOE430A and MOE430B consensus sequences from Affymetrix web site
# http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
unzip MOE430*_consensus.zip
# check for duplicate probes: there are none, all have unique names
# remove "consensus:" and ";" from FASTA headers to shorten probeset
# names for database
sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
  /cluster/bluearc/affy/
# THE ABOVE WAS ALREADY DONE BY RACHEL 4/16/04.
# Set up cluster job to align MOE430 consensus sequences to mm5
ssh kkr1u00
cd /cluster/data/mm5/bed
mkdir -p affyMOE430
cd affyMOE430
mkdir -p /iscratch/i/affy
cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy
iSync
ssh kk
cd /cluster/data/mm5/bed/affyMOE430
ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst
ls -1 /scratch/mus/mm5/maskedContigs/ > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc /scratch/mus/mm5/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
# Actually do the job with usual para try/check/push/time etc.
# para time
# Completed: 639 of 639 jobs
# CPU time in finished jobs: 24369s 406.14m 6.77h 0.28d 0.001 y
# IO & Wait Time: 2263s 37.72m 0.63h 0.03d 0.000 y
# Average job time: 42s 0.69m 0.01h 0.00d
# Longest job: 63s 1.05m 0.02h 0.00d
# Submission to last job: 671s 11.18m 0.19h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyMOE430.psl
pslSort dirs raw.psl tmp psl
# only use alignments that cover 30% of sequence and have at least
# 95% identity in aligned region.
# low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyMOE430.psl ../../jkStuff/liftAll.lft warn contig.psl
# Load alignments and sequences into database
ssh hgwdev
cd /cluster/data/mm5/bed/affyMOE430
# shorten names in psl file
sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
mv affyMOE430.psl.bak affyMOE430.psl
# load track into database
hgLoadPsl mm5 affyMOE430.psl
# 1 warning on loading: Blat error so that 1449824_at has a
# negative entry (-195) in the qBaseInsert field.
# Loading into the database forces this to 0.
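# (Editor's hedged sketch, not part of the original run: a quick way to spot the
# negative qBaseInsert entries mentioned above before loading.  In a PSL line,
# field 6 is qBaseInsert and field 10 is qName; matching a leading "-" avoids
# tripping over any psLayout header lines.)
awk '$6 ~ /^-/ {print $10, $6}' affyMOE430.psl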
# Add consensus sequences for MOE430 # Copy sequences to gbdb is they are not there already mkdir -p /gbdb/hgFixed/affyProbes ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ /gbdb/hgFixed/affyProbes hgLoadSeq -abbr=MOE430 mm5 /gbdb/hgFixed/affyProbes/MOE430_all.fa # Clean up rm batch.bak contig.psl raw.psl # BELOW TWO THINGS WERE DONE BY RACHEL ALREDAY FOR MM4 # add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/ # add affyMOE430.html file and then do make alpha to add to trackDb table ######## MAKING GENE SORTER TABLES ####### (STARTED - 2004-07-15 - Hiram) # These are instructions for building the # Gene Sorter. Don't start these until # there is a knownGene track. and the affy tracks # Cluster together various alt-splicing isoforms. # Creates the knownIsoforms and knownCanonical tables ssh hgwdev cd /tmp hgClusterGenes mm5 knownGene knownIsoforms knownCanonical # You may need to build this binary in src/hg/near/hgClusterGenes # Got 24603 clusters, from 41208 genes in 43 chromosomes # featureBits mm5 knownCanonical # 853516995 bases of 2615483787 (32.633%) in intersection # featureBits mm4 knownCanonical # 840021165 bases of 2627444668 (31.971%) in intersection # featureBits mm3 knownCanonical # 825943052 bases of 2505900260 (32.960%) in intersection # ! ! ! Can not do featureBits on knownIsoforms # Extract peptides from knownGenes into fasta file # and create a blast database out of them. ssh hgwdev mkdir -p /cluster/data/mm5/bed/geneSorter/blastp cd /cluster/data/mm5/bed/geneSorter/blastp pepPredToFa mm5 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /cluster/bluearc/blast229/formatdb -i known.faa -t known -n known # Copy over database to bluearc scratch mkdir /cluster/bluearc/scratch/mus/mm5/blastp cp -p /cluster/data/mm5/bed/geneSorter/blastp/known.* \ /cluster/bluearc/scratch/mus/mm5/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm5/bed/geneSorter/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/mm5/bed/geneSorter/blastp/self cd /cluster/data/mm5/bed/geneSorter/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/scratch/mus/mm5/blastp/known \ -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 120685s 2011.42m 33.52h 1.40d 0.004 y # IO & Wait Time: 22722s 378.69m 6.31h 0.26d 0.001 y # Average job time: 19s 0.31m 0.01h 0.00d # Longest job: 147s 2.45m 0.04h 0.00d # Submission to last job: 705s 11.75m 0.20h 0.01d # Load into database. This takes about an hour. ssh hgwdev cd /cluster/data/mm5/bed/geneSorter/blastp/self/run/out hgLoadBlastTab mm5 knownBlastTab *.tab # Scanning through 7739 files # Loading database with 8017562 rows # real 17m9.104s # user 3m8.980s # sys 0m28.800s # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes an hour.) 
# DONE (04-07-18 Fan) hgMapToGene mm5 affyGnf1m knownGene knownToGnf1m hgExpDistance mm5 hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnf1m # Create table that maps between known genes and RefSeq hgMapToGene mm5 refGene knownGene knownToRefSeq # may need to build this command in src/hg/near/hgMapToGene # Create a table that maps between known genes and # the nice affy expression data. hgMapToGene mm5 affyU74 knownGene knownToU74 hgMapToGene mm5 affyMOE430 knownGene knownToMOE430 hgMapToGene mm5 affyMOE430 -prefix=A: knownGene knownToMOE430A # Format and load Rinn et al sex expression data mkdir /cluster/data/mm5/bed/rinnSex cd !$ hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \ ../affyMOE430/affyMOE430.psl hgLoadBed mm5 rinnSex rinnSex.bed # Format and load the GNF data mkdir /cluster/data/mm5/bed/affyGnf95 cd /cluster/data/mm5/bed/affyGnf95 affyPslAndAtlasToBed -newType ../affyU95.psl \ /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceeding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. # Everything below this seems to create tables OK. # hgsql mm5 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. These commands take about 15 minutes each # The affyGnfU74?Exps arguments appear to be unused in hgExpDistance hgExpDistance mm5 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \ -lookup=knownToU74 # Got 13593 unique elements in affyGnfU74A hgExpDistance mm5 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \ -lookup=knownToU74 # Got 8512 unique elements in affyGnfU74B hgExpDistance mm5 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \ -lookup=knownToU74 # Got 2318 unique elements in affyGnfU74C # C.ELEGANS BLASTP FOR GENE SORTER (DONE 7/20/04 Fan) # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to iscratch/i # if it doesn't exist already: ssh eieio mkdir /cluster/data/ce2/bed/blastp cd /cluster/data/ce2/bed/blastp # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/ # to find out the latest version. Then use that in place of 128 below. wget -O wormPep128.faa \ ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep128/wormpep128 formatdb -i wormPep128.faa -t wormPep128 -n wormPep128 ssh kkr1u00 if (-e /iscratch/i/ce2/blastp) then rm -r /iscratch/i/ce2/blastp endif mkdir -p /iscratch/i/ce2/blastp cp /cluster/data/ce2/bed/blastp/wormPep128.p?? /iscratch/i/ce2/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm5/bed/blastp/ce2/run/out cd /cluster/data/mm5/bed/blastp/ce2/run # Make blast script cat > blastSome < gsub <split.lst #ls -1S ../../split/*.fa > split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 54871s 914.51m 15.24h 0.64d 0.002 y # IO & Wait Time: 26157s 435.95m 7.27h 0.30d 0.001 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest job: 41s 0.68m 0.01h 0.00d # Submission to last job: 210s 3.50m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm5/bed/blastp/ce2/run/out hgLoadBlastTab mm5 ceBlastTab -maxPer=1 *.tab # HUMAN BLASTP FOR GENE SORTER (DONE 7/20/04 Fan) # Make human ortholog column using blastp on human known genes. 
# First make human protein database and copy it to iscratch/i
# if it doesn't exist already:
mkdir /cluster/data/hg17/bed/blastp
cd /cluster/data/hg17/bed/blastp
pepPredToFa hg17 knownGenePep known.faa
formatdb -i known.faa -t known -n known
ssh kkr1u00
if (-e /iscratch/i/hg17/blastp) then
  rm -r /iscratch/i/hg17/blastp
endif
mkdir -p /iscratch/i/hg17/blastp
cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/hg17/run/out
cd /cluster/data/mm5/bed/blastp/hg17/run
# Make blast script
cat > blastSome < gsub <split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 125830s 2097.17m 34.95h 1.46d 0.004 y
# IO & Wait Time: 22740s 379.00m 6.32h 0.26d 0.001 y
# Average job time: 19s 0.32m 0.01h 0.00d
# Longest job: 137s 2.28m 0.04h 0.00d
# Submission to last job: 301s 5.02m 0.08h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/hg17/run/out
hgLoadBlastTab mm5 hgBlastTab -maxPer=1 *.tab
# ZEBRAFISH BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to iscratch/i
# if it doesn't exist already:
ssh kkstore
mkdir /cluster/data/danRer1/bed/blastp
cd /cluster/data/danRer1/bed/blastp
wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH3.may.pep.fa.gz
zcat Dan*.pep.fa.gz > ensembl.faa
formatdb -i ensembl.faa -t ensembl -n ensembl
ssh kkr1u00
if (-e /iscratch/i/danRer1/blastp) then
  rm -r /iscratch/i/danRer1/blastp
endif
mkdir -p /iscratch/i/danRer1/blastp
cp /cluster/data/danRer1/bed/blastp/ensembl.p?? /iscratch/i/danRer1/blastp
iSync
# THE ABOVE IS ALREADY DONE BY ANGIE
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/danRer1/run/out
cd /cluster/data/mm5/bed/blastp/danRer1/run
# Make blast script
cat > blastSome < gsub <split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 96773s 1612.89m 26.88h 1.12d 0.003 y
# IO & Wait Time: 29356s 489.26m 8.15h 0.34d 0.001 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest job: 73s 1.22m 0.02h 0.00d
# Submission to last job: 282s 4.70m 0.08h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/danRer1/run/out
hgLoadBlastTab mm5 drBlastTab -maxPer=1 *.tab
# YEAST BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on
# RefSeq. First make protein database and copy it to iscratch/i
# if it doesn't exist already:
mkdir /cluster/data/sacCer1/bed/blastp
cd /cluster/data/sacCer1/bed/blastp
wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
zcat orf_trans.fasta.gz > sgdPep.faa
formatdb -i sgdPep.faa -t sgdPep -n sgdPep
#ABOVE WAS ALREADY DONE BY JIM
ssh kkr1u00
# Note: sacCer1 is a name conflict with SARS coronavirus... oh well,
# fortunately we won't be looking for homologs there. :)
if (-e /iscratch/i/sacCer1/blastp) then
  rm -r /iscratch/i/sacCer1/blastp
endif
mkdir -p /iscratch/i/sacCer1/blastp
cp /cluster/data/sacCer1/bed/blastp/sgdPep.p??
/iscratch/i/sacCer1/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm5/bed/blastp/sacCer1/run/out cd /cluster/data/mm5/bed/blastp/sacCer1/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 16348s 272.46m 4.54h 0.19d 0.001 y # IO & Wait Time: 23063s 384.39m 6.41h 0.27d 0.001 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest job: 14s 0.23m 0.00h 0.00d # Submission to last job: 203s 3.38m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm5/bed/blastp/sacCer1/run/out hgLoadBlastTab mm5 scBlastTab -maxPer=1 *.tab # DM1 BLASTP FOR GENE SORTER (DONE 7/20/04 Fan) # Make Drosophila melanagaster ortholog column using blastp on FlyBase. # First make protein database and copy it to iscratch/i # if it doesn't exist already: # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data ssh kkr1u00 if (-e /iscratch/i/dm1/blastp) then rm -r /iscratch/i/dm1/blastp endif mkdir -p /iscratch/i/dm1/blastp cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp iSync # THE ABOVE IS ALREADY DONE BY ANGIE # Make parasol run directory ssh kk mkdir -p /cluster/data/mm5/bed/blastp/dm1/run/out cd /cluster/data/mm5/bed/blastp/dm1/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 64033s 1067.22m 17.79h 0.74d 0.002 y # IO & Wait Time: 20868s 347.79m 5.80h 0.24d 0.001 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest job: 45s 0.75m 0.01h 0.00d # Submission to last job: 351s 5.85m 0.10h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm5/bed/blastp/dm1/run/out hgLoadBlastTab mm5 dmBlastTab -maxPer=1 *.tab # Create table that maps between known genes and LocusLink (DONE 7/20/04 Fan) hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm5 \ > refToLl.txt hgMapToGene mm5 refGene knownGene knownToLocusLink -lookup=refToLl.txt # row count is 30303 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt mm5 knownGene name proteinID Pfam knownToPfam # row count is 29069 # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene mm5 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' # Create table that maps between known genes and genePix database (DONE 3/15/05 JK) knownToGenePix mm5 # ENABLE GENE SORTER FOR mm5 IN HGCENTRALTEST (DONE 7/20/04 Fan) echo "update dbDb set hgNearOk = 1 where name = 'mm5';" \ | hgsql -h genome-testdb hgcentraltest # RAT BLASTP FOR GENE SORTER (DONE 4/20/05 Fan) # Make RAT ortholog column using blastp on RAT known genes. # First make RAT protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/rn3/bed/blastp cd /cluster/data/rn3/bed/blastp pepPredToFa rn3 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/rn3/blastp) then rm -r /iscratch/i/rn3/blastp endif mkdir -p /iscratch/i/rn3/blastp cp /cluster/data/rn3/bed/blastp/known.p?? 
/iscratch/i/rn3/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/rn3/run/out
cd /cluster/data/mm5/bed/blastp/rn3/run
# Make blast script
cat > blastSome < gsub <split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
Completed: 7739 of 7739 jobs
CPU time in finished jobs: 24369s 406.14m 6.77h 0.28d 0.001 y
IO & Wait Time: 21867s 364.46m 6.07h 0.25d 0.001 y
Average job time: 6s 0.10m 0.00h 0.00d
Longest running job: 0s 0.00m 0.00h 0.00d
Longest finished job: 25s 0.42m 0.01h 0.00d
Submission to last job: 276s 4.60m 0.08h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/rn3/run/out
hgLoadBlastTab mm5 rnBlastTab -maxPer=1 *.tab
# END OF GENE SORTER STUFF
#############################################################################
# BLASTZ RAT RN3 (DONE - 2004-07-15 - Fan)
# NOTE: THIS IS RE-DONE. SEE THE SAME SECTION OF 2004-08-30. Fan.
ssh kk
mkdir -p /cluster/data/mm5/bed/blastz.rn3.2004-07-14
cd /cluster/data/mm5/bed
ln -s blastz.rn3.2004-07-14 blastz.rn3
cd blastz.rn3
cat << '_EOF_' > DEF
# rat vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.rn3
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/mm5/bed/blastz.rn3
bash
source ./DEF
# script copied over from /cluster/data/hg17/jkStuff/BlastZ_run0.sh
# it is a generic script and works for any assembly
cp /cluster/data/hg17/jkStuff/BlastZ_run0.sh \
  /cluster/data/mm5/jkStuff/BlastZ_run0.sh
/cluster/data/mm5/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
Completed: 41943 of 41943 jobs
CPU time in finished jobs: 16854319s 280905.31m 4681.76h 195.07d 0.534 y
IO & Wait Time: 448464s 7474.41m 124.57h 5.19d 0.014 y
Average job time: 413s 6.88m 0.11h 0.00d
Longest job: 9358s 155.97m 2.60h 0.11d
Submission to last job: 73416s 1223.60m 20.39h 0.85d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/mm5/bed/blastz.rn3
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
# fixup machine check, should be kki, not kk
cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh \
  /cluster/data/mm5/jkStuff/BlastZ_run1.sh
vi /cluster/data/mm5/jkStuff/BlastZ_run1.sh
/cluster/data/mm5/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs # CPU time in finished jobs: 7859s 130.98m 2.18h 0.09d 0.000 y # IO & Wait Time: 104771s 1746.19m 29.10h 1.21d 0.003 y # Average job time: 330s 5.50m 0.09h 0.00d # Longest job: 1625s 27.08m 0.45h 0.02d # Submission to last job: 8535s 142.25m 2.37h 0.10d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/mm5/bed/blastz.rn3 bash source ./DEF # The copy of this in mm4 was broken, use the hg17 one instead cp /cluster/data/hg17/jkStuff/BlastZ_run2.sh \ /cluster/data/mm5/jkStuff/BlastZ_run2.sh # vi /cluster/data/mm5/jkStuff/BlastZ_run2.sh /cluster/data/mm5/jkStuff/BlastZ_run2.sh cd run.2 #edited gsub to change /scratch/mus/mm5 to /cluster/bluearc/scratch/mus/mm5 # and recreated jobList by: gensub2 chrom.list single gsub jobList para create jobList para try, check, push, etc ... # Completed: 42 of 43 jobs # Crashed: 1 jobs # CPU time in finished jobs: 2050s 34.17m 0.57h 0.02d 0.000 y # IO & Wait Time: 143135s 2385.58m 39.76h 1.66d 0.005 y # Average job time: 3457s 57.61m 0.96h 0.04d # Longest job: 14460s 241.00m 4.02h 0.17d # Submission to last job: 14849s 247.48m 4.12h 0.17d # Note: previous numbers were: # Completed: 46 of 46 jobs # CPU time in finished jobs: 426s 7.09m 0.12h 0.00d 0.000 y # IO & Wait Time: 7283s 121.39m 2.02h 0.08d 0.000 y # Average job time: 168s 2.79m 0.05h 0.00d # Longest job: 642s 10.70m 0.18h 0.01d # Submission to last job: 642s 10.70m 0.18h 0.01d # probably due to data on bluearc instead of on kki nodes. # One job failed consistently because short of memory error # went to kkr4u00 to run the following job: # Per Angie's advice, created /cluster/bin/scripts/blastz-chromlav2axtLargeMem # by from /cluster/bin/scripts/blastz-chromlav2axt and changed /cluster/bin/i386 # to /cluster/bin/x86_64 and then ran: /cluster/bin/scripts/blastz-chromlav2axtLargeMem /cluster/data/mm5/bed/blastz.rn3/lav/chr2 /cluster/data/mm5/bed/blastz.rn3/axtChrom/chr2.axt /cluster/bluearc/scratch/mus/mm5/softNib /iscratch/i/rn3/bothMaskedNibs # It worked! # translate sorted axt files into psl ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3 mkdir pslChrom set tbl = "blastzRn3" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # That takes about 2 hours # Load database tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3/pslChrom bash for I in *.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I} echo "done: ${I}" done # Check results # featureBits hg16 blastzRn3 # 1013603401 bases of 2865248791 (35.376%) in intersection # featureBits mm5 blastzRn3 ran out of memory. # So check a few specific chromosomes # featureBits mm5 blastzRn3 -chrom=chr17 # 61029084 bases of 86658738 (70.425%) in intersection # featureBits mm4 blastzRn3 -chrom=chr17 # 62824556 bases of 89616841 (70.104%) in intersection # featureBits mm5 blastzRn3 -chrom=chr18 # 61442155 bases of 86685738 (70.879%) in intersection # featureBits mm4 blastzRn3 -chrom=chr18 # 57158006 bases of 81388777 (70.228%) in intersection # CHAIN RN3 BLASTZ (DONE - 2004-07-22 - Fan) # NOTE: THIS IS RE-DONE. SEE THE SAME SECTION OF 2004-08-30. Fan. 
# The axtChain is best run on the small kluster, or the kk9 kluster ssh kki mkdir -p /cluster/data/mm5/bed/blastz.rn3/axtChain/run1 cd /cluster/data/mm5/bed/blastz.rn3/axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.rn3/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/mus/mm5/softNib \ /iscratch/i/rn3/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain # 46 jobs gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... # Completed: 43 of 43 jobs # CPU time in finished jobs: 18318s 305.30m 5.09h 0.21d 0.001 y # IO & Wait Time: 41906s 698.44m 11.64h 0.49d 0.001 y # Average job time: 1401s 23.34m 0.39h 0.02d # Longest job: 5598s 93.30m 1.55h 0.06d # Submission to last job: 5635s 93.92m 1.57h 0.07d # now on the file server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3/axtChain time chainMergeSort run1/chain/*.chain > all.chain & # real 26m14.694s # user 16m16.190s # sys 2m19.520s time chainSplit chain all.chain & # real 26m29.801s # user 15m40.780s # sys 2m40.610s # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm5 ${c}_chainRn3 $i echo done $c end # featureBits mm4 chainRn3Link -chrom=chr16 # 67474802 bases of 95076222 (70.969%) in intersection # featureBits mm5 chainRn3Link -chrom=chr16 # 66703715 bases of 92679592 (71.972%) in intersection # featureBits mm4 chainRn3Link -chrom=chr17 # 61932430 bases of 89616841 (69.108%) in intersection # featureBits mm5 chainRn3Link -chrom=chr17 # 60676019 bases of 86658738 (70.017%) in intersection # NET RN3 (DONE - 2004-07-23 - Fan) # NOTE: THIS IS RE-DONE. SEE THE SAME SECTION OF 2004-08-31. Fan. ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/mm5/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/mm5/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 1850904576, utime 9294 s/100, stime 2079 # The netClass operations requires an "ancientRepeat" table to exist # in either mm5 or rn3. So, create the table: ssh hgwdev mkdir -p /cluster/data/mm5/bed/ancientRepeat cd /cluster/data/mm5/bed/ancientRepeat # mysqldump needs write permission to this directory # and you need to use your read/write enabled user with password chmod 777 . hgsqldump --all --tab=. mm4 ancientRepeat chmod 775 . hgsql mm5 < ancientRepeat.sql mysqlimport -u -p mm5 ancientRepeat.txt # This is a hand curated table obtained from Arian. 
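# (Editor's hedged sketch, not part of the original run: netClass only needs the
# ancientRepeat table to exist and be non-empty in mm5 or rn3, so a quick row
# count after the import above is a cheap sanity check.)
hgsql mm5 -e 'select count(*) from ancientRepeat;'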
ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3/axtChain time netClass hNoClass.net mm5 rn3 rat.net \ -tNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInRat \ -qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse # 508.060u 89.340s 12:10.36 81.7% 0+0k 0+0io 201pf+0w # If things look good do ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with time netFilter -syn rat.net > ratSyn.net # real 5m5.494s # user 3m52.710s # sys 0m32.670s # Load the nets into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3/axtChain netFilter -minGap=10 rat.net | hgLoadNet mm5 netRn3 stdin netFilter -minGap=10 ratSyn.net | hgLoadNet mm5 syntenyNetRn3 stdin # real 8m50.781s # user 4m59.660s # sys 0m52.840s # check results # featureBits mm4 netRn3 # 96806381 bases of 95076222 (101.820%) in intersection # featureBits mm5 netRn3 # 2638255333 bases of 2615483787 (100.871%) in intersection # featureBits mm4 syntenyNetRn3 # 96760405 bases of 95076222 (101.771%) in intersection # featureBits mm5 syntenyNetRn3 # 2600849289 bases of 2615483787 (99.440%) in intersection # Add entries for net and chain to mouse/mm5 trackDb # make net ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3/axtChain mkdir ratNet time netSplit rat.net ratNet # real 5m28.037s # user 3m58.150s # sys 0m37.870s # extract axts from net mkdir ../axtNet foreach n (ratNet/chr*.net) set c=$n:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt ratNet/$c.net chain/$c.chain \ /cluster/data/mm5/nib \ /cluster/data/rn3/nib ../axtNet/$c.axt echo "Complete: $c.net -> axtNet/$c.axt" end # sort axt's and convert to maf format mkdir ../mafNet cat << 'EOF' > makeMaf.csh foreach f (../axtNet/chr*.axt) set c=$f:t:r echo $c.axt mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt rm ../axtNet/$c.unsorted.axt axtToMaf ../axtNet/$c.axt \ /cluster/data/mm5/chrom.sizes /cluster/data/rn3/chrom.sizes \ ../mafNet/$c.maf -tPrefix=mm5. -qPrefix=rn3. end 'EOF' #csh makeMaf.csh >&! makeMaf.log & csh makeMaf.csh > makeMaf.log & tail -100f makeMaf.log # THE ABOVE DID NOT WORK. TRIED THE FOLLOWING: foreach f (../axtNet/chr*.axt) set c=$f:t:r echo $c.axt mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt rm ../axtNet/$c.unsorted.axt axtToMaf ../axtNet/$c.axt \ /cluster/data/mm5/chrom.sizes /cluster/data/rn3/chrom.sizes \ ../mafNet/$c.maf -tPrefix=mm5. -qPrefix=rn3. end ssh hgwdev mkdir -p /cluster/data/mm5/bed/blastz.rn3/axtBest cd /cluster/data/mm5/bed/blastz.rn3/axtBest ln -s ../axtNet/chr*.axt . 
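# (Editor's hedged sketch, not part of the original run: confirm every per-chromosome
# net axt is non-empty before it is linked into axtBest and copied to the download
# area in the next step.)
foreach f (../axtNet/chr*.axt)
  if (-z $f) echo "EMPTY: $f"
end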
# copy net axt's to download area ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet cd /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet gzip *.axt # add README.txt file to dir (use previous assembly's copy as template) # Convert those axt files to psl ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestRn3.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestRn3.psl echo "Done: ${c}_blastzBestRn3.psl" end # Load tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3/pslBest bash for I in chr*BestRn3.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I} echo "done ${I}" done # check results # featureBits mm5 blastzBestRn3 # 1778653886 bases of 2615483787 (68.005%) in intersection # featureBits mm4 blastzBestRn3 # 1780774716 bases of 2627444668 (67.776%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm5/axtBest/Rn3 cd /gbdb/mm5/axtBest/Rn3 ln -s /cluster/data/mm5/bed/blastz.rn3/axtNet/chr*.axt . cd /cluster/data/mm5/bed/blastz.rn3/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/mm5/axtBest/Rn3/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql mm5 < ~/kent/src/hg/lib/axtInfo.sql # table axtInfo may already exist, ignore create error. hgsql mm5 < axtInfoInserts.sql # BLASTZ RN3 CLEAN UP (DONE - 2004-07-26 - Fan) # NOTE: THIS IS RE-DONE. SEE THE SAME SECTION OF 2004-08-31. Fan. ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3 nice rm -rf raw & nice rm axtChain/run1/chain/* & nice gzip {axt,psl}Chrom/* lav/*/* axtChain/{all.chain,*.net} & # MAKE VSRN3 DOWNLOADABLES (DONE 9/14/04 Fan) ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3/axtChain ln all.chain rat.chain foreach f (rat.chain rat.net) gzip -c $f > $f.gz end rm rat.chain # Make chain-format of raw alignments ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3 mkdir blastzECF foreach f (axtChrom/chr*.axt) set chr = $f:t:r axtToChain $f S1.len S2.len stdout \ | gzip -c - > blastzECF/$chr.ecf.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsRn3 cd /usr/local/apache/htdocs/goldenPath/mm5/vsRn3 mv /cluster/data/mm5/bed/blastz.rn3/axtChain/rat*.gz . cp -p /cluster/data/mm5/bed/blastz.rn3/axtChain/all.chain.gz \ /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/rat.chain.gz md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # Not for pushing -- handle separately. mv /cluster/data/mm5/bed/blastz.rn3/blastzECF . cd blastzECF md5sum *.gz > md5sum.txt # BLASTZ ZEBRAFISH (DANRER1) (DONE, 2004-07-29, hartera) ssh kkr1u00 # blastz requires lineage-specific repeats # Treat all repeats as lineage-specific. 
mkdir -p /iscratch/i/mm5/linSpecRep.notInZebrafish foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/chr*.fa.out) cp -p $f /iscratch/i/mm5/linSpecRep.notInZebrafish/$f:t:r:r.out.spec end mkdir -p /iscratch/i/danRer1/linSpecRep.notInMouse foreach f (/iscratch/i/danRer1/rmsk/chr*.fa.out) cp -p $f /iscratch/i/danRer1/linSpecRep.notInMouse/$f:t:r:r.out.spec end iSync ssh kk mkdir -p /cluster/data/mm5/bed/blastz.danRer1.2004-07-27 ln -s /cluster/data/mm5/bed/blastz.danRer1.2004-07-27 \ /cluster/data/mm5/bed/blastz.danRer1 cd /cluster/data/mm5/bed/blastz.danRer1 # use same parameters as for danRer1-hg17 cat << '_EOF_' > DEF # mouse (mm5) vs zebrafish (danRer1) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from hg16-fr1 and danRer1-hg17. BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse (mm5) SEQ1_DIR=/cluster/bluearc/scratch/mus/mm5/softNib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInZebrafish SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer1) SEQ2_DIR=/iscratch/i/danRer1/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/danRer1/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastz.danRer1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.mm5-danRer1.2004-07-27 # setup cluster run # copy shell scripts for blastz runs if not there already cp -p /cluster/data/danRer1/jkStuff/BlastZ* /cluster/data/mm5/jkStuff/ # edit BlastZ_run0.sh # replace line 22: /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/ # this is the directory for the latest version of blastz-run # source the DEF file bash . ./DEF /cluster/data/mm5/jkStuff/BlastZ_run0.sh cd run.0 # check batch looks ok then para try, check, push, check, .... # para time # Completed: 57970 of 57970 jobs # CPU time in finished jobs: 18228826s 303813.77m 5063.56h 210.98d 0.578 y # IO & Wait Time: 1019215s 16986.92m 283.12h 11.80d 0.032 y # Average job time: 332s 5.53m 0.09h 0.00d # Longest job: 2211s 36.85m 0.61h 0.03d # Submission to last job: 45422s 757.03m 12.62h 0.53d # Took about 12 hours to run and output is 1.7G # second cluster run to convert the .out's to .lav's cd /cluster/data/mm5/bed/blastz.danRer1 bash # if a csh/tcsh user . ./DEF /cluster/data/mm5/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# para time # Checking finished jobs # Completed: 341 of 341 jobs # CPU time in finished jobs: 4536s 75.60m 1.26h 0.05d 0.000 y # IO & Wait Time: 65931s 1098.85m 18.31h 0.76d 0.002 y # Average job time: 207s 3.44m 0.06h 0.00d # Longest job: 636s 10.60m 0.18h 0.01d # Submission to last job: 1282s 21.37m 0.36h 0.01d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/mm5/bed/blastz.danRer1 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /cluster/bluearc/scratch/mus/mm5/softNib \ /iscratch/i/danRer1/nib stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/mm5/bed/blastz.danRer1/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... # para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 246s 4.10m 0.07h 0.00d 0.000 y # IO & Wait Time: 4985s 83.08m 1.38h 0.06d 0.000 y # Average job time: 122s 2.03m 0.03h 0.00d # Longest job: 446s 7.43m 0.12h 0.01d # Submission to last job: 653s 10.88m 0.18h 0.01d # translate sorted axt files into psl ssh kolossus cd /cluster/data/mm5/bed/blastz.danRer1 mkdir -p pslChrom set tbl = "blastzDanRer1" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.danRer1/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl mm5 $f end # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1 -enrichment #refGene:cds 0.763%,blastzDanRer1 2.918%,both 0.512%,cover 67.12%,enrich 23.00x # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L4000 -enrichment # refGene:cds 0.763%, blastzDanRer1L4000 17.878%, both 0.581%, cover 76.18%, # enrich 4.26x # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L5000 -enrichment # refGene:cds 0.763%,blastzDanRer1L5000 6.013%,both 0.540%,cover 70.81%, # enrich 11.78x # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L6500 -enrichment # refGene:cds 0.763%, blastzDanRer1L6500 2.386%, both 0.495%, cover 64.91%, # enrich 27.20x # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L7000 -enrichment # refGene:cds 0.763%, blastzDanRer1L7000 2.062%, both 0.480%, cover 62.87%, # enrich 30.50x # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1HumanParams -enrichment # refGene:cds 0.763%,blastzDanRer1HumanParams 1.661%,both 0.502%, cover 65.82%, # enrich 39.64x # row counts: 172167 blastzDanRer1, # 2288714 blastzDanRer1HumanParams, # 3373525 blastzDanRer1L4000 # 700927 blastzDanRer1L5000 # 13719318 blastzDanRer1L3000 # 103190 blastzDanRer1L6500 # 76758 blastzDanRer1L7000 # Do test runs - repeat above using L=4000 and then try the mm5-hg17 parameters # also L=2000, L=3000 and L=5000. Use only mm5 chr1 for tests. # L=2000 and L=3000 lavToAxt crashed so re-do on kolossus. L2000 crashed again # probably ran out of memory. # The orginal blastzDanRer1 with L= 6000 looks best: good coverage and # enrichment without too many alignments in the database table. # RESCORE DANRER1 BLASTZ ALIGNMENTS (DONE, 2004-08-02, hartera) # Low scores can occur with repeats abridged and using the # HoxD55.q matrix. 
PSU's restore_rpts program rescored alignments # with the default matrix instead of the BLASTZ_Q matrix. # Rescore them here so the chainer sees the higher scores: ssh kolossus cd /cluster/data/mm5/bed/blastz.danRer1 mkdir axtChrom.rescore foreach f (axtChrom/chr*.axt) axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \ $f axtChrom.rescore/$f:t end mv axtChrom axtChrom.orig mv axtChrom.rescore axtChrom # psl files and blastz tables will be the same regardless of score so # no need to reload # CHAIN DANRER1 BLASTZ (DONE, 2004-08-03, hartera) # FILTERED WITH A HIGHER MINSCORE THRESHOLD (DONE, 2004-08-04, hartera) # RELOADED TABLES (DONE, 2004-08-18, hartera) # removed all chainDanRer1 and chainDanRer1Link tables, some extra tables had # been accidentally loaded with this name from a different genome so there # were duplicate chain ids causing joinerCheck to complain. # Re do chains with rescored blastz danRer1 # Run axtChain on little cluster ssh kki cd /cluster/data/mm5/bed/blastz.danRer1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.danRer1/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Make our own linear gap file with reduced gap penalties, # in hopes of getting longer chains: cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize^V 11 smallSize^V 111 position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111 qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000 '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=../../chickenHumanTuned.gap \ -minScore=5000 $1 \ /cluster/bluearc/scratch/mus/mm5/softNib \ /iscratch/i/danRer1/nib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 2260s 37.67m 0.63h 0.03d 0.000 y # IO & Wait Time: 863s 14.38m 0.24h 0.01d 0.000 y # Average job time: 73s 1.21m 0.02h 0.00d # Longest job: 342s 5.70m 0.10h 0.00d # Submission to last job: 36951s 615.85m 10.26h 0.43d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer1/axtChain chainMergeSort run1/chain/*.chain > all.chain # filter again to use minScore of 7500 (see featureBits below) (2004-08-04) mv all.chain all.chain.filt5k chainFilter -minScore=7500 all.chain.unfiltered > all.chain # remove old chains rm -r chain chainSplit chain all.chain gzip all.chain.filt5k # take a look at score distr's,try also with smaller bin size. foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist.out textHistogram -binSize=10000 /tmp/score.$f:t:r >> hist.out echo "" end # also hist5000.out has bin size 5000. 
looks good so load into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.danRer1/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm5 ${c}_chainDanRer1 $i echo done $c end # featureBits still shows good coverage and enrichment # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Link -enrichment # refGene:cds 0.763%, chainDanRer1Link 2.246%, both 0.508%, cover 66.61%, # enrich 29.65x # Human Parameters Blastz Chain with minScore = 5,000 filter: # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1HPLink -enrichment # refGene:cds 0.763%, chainDanRer1HPLink 1.208%, both 0.484%, cover 63.43%, # enrich 52.49x # L=5000 Blastz Chain with minScore = 5,000 filter: # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1L5kLink -enrichment # refGene:cds 0.763%, chainDanRer1L5kLink 4.137%, both 0.534%, cover 69.96%, # enrich 16.91x # L=5000 Blastz Chain with minScore =10,000 filter: # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1L5k10kLink -enrichment # refGene:cds 0.763%, chainDanRer1L5k10kLink 1.038%, both 0.448%, cover 58.69%, # enrich 56.54x # filter too stringent, coverage has dropped a lot # with less filtering of blastzDanRer1 where minScore =3000 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Filt3kLink -enrichment # refGene:cds 0.763%, chainDanRer1Filt3kLink 2.487%, both 0.509%, cover 66.78%, # enrich 26.86x # with more filtering, minScore = 6000 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Filt6kLink -enrichment # refGene:cds 0.763%, chainDanRer1Filt6kLink 2.172%, both 0.508%, cover 66.54%, # enrich 30.64x # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Filt7500Link -enrichment # refGene:cds 0.763%, chainDanRer1Filt75kLink 2.022%, both 0.504%, cover 66.10%,# enrich 32.70x # rows in database table: # chr1_blastzDanRer1Link: 515119 # chr1_chainDanRer1L5kLink: 1241480 # chr1_chainDanRer1L5k10kLink: 74963 # chr1_chainDanRer1HPLink: 309740 # chr1_chainDanRer1Filt3k: 594057 # chr1_chainDanRer1Filt6kLink: 479368 # chr1_chainDanRer1Filt7500Link: 378954 # Using the original parameters is a good compromise between high coverage # and high enrichment but a filter of 7500 on the score produces only a tiny # reduction in coverage with higher enrichment as there are a lot less # alignments of low score of the same regions or other low scoring alignments. # NET DANRER1 BLASTZ (DONE, 2004-08-04, hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer1/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. 
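# (Editor's hedged sketch, not part of the original run: a quick check that chainNet
# wrote one net per chain before they are combined by netSyntenic below.)
ls chain/*.chain | wc -l
ls n1/*.net | wc -l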
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 103493632, utime 668 s/100, stime 127 # Add classification info using db tables: cd /cluster/data/mm5/bed/blastz.danRer1/axtChain # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish mkdir -p /cluster/bluearc/danRer1/linSpecRep.notInMouse cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \ /cluster/bluearc/mm5/linSpecRep.notInZebrafish cp /iscratch/i/danRer1/linSpecRep.notInMouse/* \ /cluster/bluearc/danRer1/linSpecRep.notInMouse ssh hgwdev cd /cluster/data/mm5/bed/blastz.danRer1/axtChain time netClass noClass.net mm5 danRer1 danRer1.net \ -tNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish \ -qNewR=/cluster/bluearc/danRer1/linSpecRep.notInMouse -noAr # 77.700u 46.610s 3:05.75 66.9% 0+0k 0+0io 215pf+0w netFilter -minGap=10 danRer1.net | hgLoadNet mm5 netDanRer1 stdin # EXTRACT AXTs AND MAFs FROM ZEBRAFISH (danRer1) NET # (DONE, 2004-08-04, hartera) ssh eieio # create axts cd /cluster/data/mm5/bed/blastz.danRer1/axtChain netSplit danRer1.net danRer1Net mkdir -p ../axtNet cat > axtNet.csh << 'EOF' foreach f (danRer1Net/chr*.net) set c = $f:t:r echo "axtNet on $c" netToAxt danRer1Net/$c.net chain/$c.chain \ /cluster/data/mm5/mixedNib \ /cluster/data/danRer1/nib ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" end 'EOF' chmod +x axtNet.csh csh axtNet.csh >&! axtNet.log & tail -100f axtNet.log # sort axts before making mafs - must be sorted for multiz cd /cluster/data/mm5/bed/blastz.danRer1 mv axtNet axtNet.unsorted mkdir axtNet foreach f (axtNet.unsorted/*.axt) set c = $f:t:r echo "Sorting $c" axtSort $f axtNet/$c.axt end # create maf ssh eieio cd /cluster/data/mm5/bed/blastz.danRer1 cd axtNet mkdir ../mafNet cat > makeMaf.csh << 'EOF' foreach f (chr*.axt) set maf = $f:t:r.danRer1.maf echo translating $f to $maf axtToMaf $f \ /cluster/data/mm5/chrom.sizes /cluster/data/danRer1/chrom.sizes \ ../mafNet/$maf -tPrefix=mm5. -qPrefix=danRer1. end 'EOF' chmod +x makeMaf.csh csh makeMaf.csh >&! 
makeMaf.log & tail -100f makeMaf.log # BLASTZ DANRER1 CLEAN UP (DONE, 2004-08-04, hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer1 nice rm -rf raw & nice rm -rf lav & nice rm -rf axtChrom.orig & nice rm axtChain/run1/chain/* & nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} & # unzip all.chain.gz and danRer1.net.gz to make vsDanRer1 downloadables # then zip these again (hartera, 2004-09-10) # UPDATE BACEND SEQUENCES (DONE - 2004-07-20 - Fan) # Download new files ssh kksilo mkdir –p /cluster/data/mm5/bed/bacends/ncbi cd /cluster/data/mm5/bed/bacends/ncbi wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/AllBACends.mfa.gz wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/cl_acc_gi_len.gz wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/README gunzip AllBACends.mfa.gz gunzip cl_acc_gi_len.gz # Convert fa file cat << '_EOF_' > convert.pl #!/usr/local/bin/perl -w use strict; while (my $line = <>) { if (substr($line,0,1) ne ">") { print $line; } else { my @fields = split(/\|/, $line); my $printed = 0; for (my $i = 0; $i < $#fields; $i++) { if ($fields[$i] eq "gb") { (my $name, my $vers) = split(/\./,$fields[$i+1]); print ">$name\n"; $i= $#fields; $printed = 1; } } if (!$printed) { die("Failed for $line\n"); } } } '_EOF_' chmod +x convert.pl ./convert.pl < AllBACends.mfa > BACends.fa # Create new pairs files convertBacEndPairInfo cl_acc_gi_len # Split file into pieces and copy to cluster to propagate ssh kksilo cd /cluster/data/mm5/bed/bacends/ncbi /cluster/bin/i386/faSplit sequence BACends.fa 100 BACends rm -rf /cluster/bluearc/scratch/mus/mm5/bacEnds mkdir /cluster/bluearc/scratch/mus/mm5/bacEnds mv BACends???.fa /cluster/bluearc/scratch/mus/mm5/bacEnds cp -p BACends.fa /cluster/bluearc/scratch/mus/mm5/bacEnds # Ask for propagation from sysadmin # Load the sequences (change bacends.# to match correct location) ssh hgwdev mkdir /gbdb/mm5/bacends cd /gbdb/mm5/bacends ln -s /cluster/data/mm5/bed/bacends/ncbi/BACends.fa . cd /tmp hgLoadSeq mm5 /gbdb/mm5/bacends/BACends.fa #Adding /gbdb/mm5/bacends/BACends.fa #452237 sequences #Updating seq table # One additional step 9/10/04 Fan. # Create a composite index to speed up hgTracks display when BAC Ends track selected. hgsql mm5 -e 'create index bacIndex2 on all_bacends(bin, qName(8));' # This will take hours. #All done # BACEND SEQUENCE ALIGNMENTS (DONE - 2004-07-23 - Fan) # (alignments done without RepeatMasking) # We need an ooc file for this genome ssh kksilo mkdir /cluster/data/mm5/ooc cd /cluster/data/mm5/ooc ls ../unmaskedNib/chr*.nib > nib.list blat -makeOoc=11.ooc -repMatch=1024 nib.list nib.list output.psl # Wrote 26077 overused 11-mers to 11.ooc # Did not end using this. Used an old one instead. 
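# (For reference: an .ooc file simply lists the 11-mers that occur more
# than -repMatch times in the genome; blat skips those over-represented
# tiles when seeding alignments.  The BAC end runs below pass the older
# /scratch/hg/h/mouse11.ooc to blat instead, as noted.)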
# Create full sequence alignments ssh kk cd /cluster/data/mm5/bed/bacends /cluster/bin/scripts/splitContigList -scratch /iscratch/i/mus/mm5/maskedContigs 1 # allow blat to run politely in /tmp while it writes output, then # copy results to results file: cat << '_EOF_' > runBlat.sh #!/bin/sh path1=$1 path2=$2 root1=$3 root2=$4 result=$5 rm -fr /tmp/${root1}_${root2} mkdir /tmp/${root1}_${root2} pushd /tmp/${root1}_${root2} /cluster/bin/i386/blat ${path1} ${path2} -ooc=/scratch/hg/h/mouse11.ooc \ ${root1}.${root2}.psl popd rm -f ${result} mv /tmp/${root1}_${root2}/${root1}.${root2}.psl ${result} rm -fr /tmp/${root1}_${root2} '_EOF_' # << this line keeps emacs coloring happy chmod +x runBlat.sh cat << '_EOF_' > template #LOOP ./runBlat.sh {check in exists $(path1)} {check in exists $(path2)} $(root1) $(root2) {check out line+ bacEnds.out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy #ls -1S /iscratch/i/mm5/bacEnds/BACends???.fa > bacEnds.lst ls -1S /scratch/mus/mm5/bacEnds/BACends???.fa > bacEnds.lst mkdir bacEnds.out # create results directories for each to avoid the all result files in # one directory problem foreach f (`cat bacEnds.lst`) set b = $f:t:r echo $b mkdir bacEnds.out/$b end gensub2 contig.lst bacEnds.lst template jobList para create jobList # 62622 jobs written to batch para try, check, push, etc ... # Completed: 62622 of 62622 jobs # CPU time in finished jobs: 3760354s 62672.57m 1044.54h 43.52d 0.119 y # IO & Wait Time: 3216480s 53608.00m 893.47h 37.23d 0.102 y # Average job time: 111s 1.86m 0.03h 0.00d # Longest job: 2841s 47.35m 0.79h 0.03d # Submission to last job: 9395s 156.58m 2.61h 0.11d # Compile alignments and lift the files. # First attempt failed due to /cluster/store6 ran out of space. # Redoing it 7/22/04. ssh kksilo cd /cluster/data/mm5/bed/bacends mkdir /cluster/store8/fanTemp time pslSort dirs raw.psl /cluster/store8/fanTemp bacEnds.out/* \ > time.out & # This may take over over 14 hours! ssh kolossus cd /cluster/data/mm5/bed/bacends time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons raw.psl bacEnds.psl /dev/null # Processed 562840490 alignments rmdir temp # You will want to keep this file around until later processing is # proven correct rm raw.psl # 72 Gb ! It takes a while even to remove it. ssh kksilo cd /cluster/data/mm5/bed/bacends time /cluster/bin/scripts/lifter -psl -mouse /cluster/data/mm5 bacEnds.psl # real 130m36.149s # user 82m38.180s # sys 10m59.580s cp -p ~booch/clusterJobs/bacends/split.pl . cp -p ~booch/clusterJobs/bacends/header . 
time ./split.pl header < bacEnds.psl.lifted # real 2m16.354s # user 0m36.390s # sys 0m42.290s cp -p bacEnds.psl.lifted bacEnds.psl.lifted.save time pslSort dirs bacEnds.psl.lifted temp split # real 17m2.353s # user 14m17.040s # sys 1m38.560s rmdir temp rm -r split # Copy files to final destination and remove mkdir /cluster/data/mm5/bacends cp -p bacEnds.psl.lifted /cluster/data/mm5/bacends # BACEND PAIRS TRACK (DONE 2004-07-27 - Fan) ssh kolossus cd /cluster/data/mm5/bacends bash time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.psl.lifted \ ../bed/bacends/ncbi/bacEndPairs.txt all_bacends bacEnds # create header required by "rdb" tools echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header # edit header to make sure \t is/become tab character cat header bacEnds.pairs | row score ge 300 | sorttbl chr start | headchg - del > bacEndPairs.bed cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch bacEnds.orphan \ | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed # The following took too long, break it into 3 steps. # extractPslLoad -noBin bacEnds.psl.lifted bacEndPairs.bed \ # bacEndPairsBad.bed | sorttbl tname tstart | headchg -del > bacEnds.load.psl extractPslLoad -noBin bacEnds.psl.lifted bacEndPairs.bed \ bacEndPairsBad.bed >j1.out cat j1.out| sorttbl tname tstart >j2.out cat j2.out | headchg -del > bacEnds.load.psl rm j1.out j2.out # load into database ssh hgwdev cd /cluster/data/mm5/bacends # edit bacEndPairs.bed to fix one ID that has a blank character in it. hgLoadBed mm5 bacEndPairs bacEndPairs.bed \ -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql # Loaded 168535 # note - this track isn't pushed to RR, just used for assembly QA hgLoadBed mm5 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 43182 #hgLoadPsl mm5 -nobin -table=all_bacends bacEnds.load.psl # NOTE: truncates file to 0 if -nobin is used hgLoadPsl mm5 -table=all_bacends bacEnds.load.psl # load of all_bacends did not go as planned: 14426473 record(s), 0 row(s) skipped, 4519 warning(s) loading psl.tab # featureBits mm5 all_bacends # 268502414 bases of 2615483787 (10.266%) in intersection # featureBits mm4 all_bacends # 243096171 bases of 2627444668 (9.252%) in intersection # featureBits mm5 bacEndPairs # 2567958504 bases of 2615483787 (98.183%) in intersection # featureBits mm4 bacEndPairs # 2549945356 bases of 2627444668 (97.050%) in intersection # featureBits mm5 bacEndPairsBad # 541027882 bases of 2615483787 (20.686%) in intersection # featureBits mm4 bacEndPairsBad # 1074505863 bases of 2627444668 (40.895%) in intersection # BLASTZ FUGU (FR1) (WORKING 7/28/04 kate) # Using Angie's hg17/fugu as a model # Treat all mouse repeats as lineage-specific (same as chicken, so just # reuse linSpecRep.Chicken). ssh kkr1u00 ln -s /iscratch/i/mus/mm5/linSpecRep.notInChicken \ /iscratch/i/mus/mm5/linSpecRep.notInFugu iSync ssh kk cd /cluster/data/mm5/bed mkdir blastz.fr1.2004-07-28 ln -s blastz.fr1.2004-07-28 blastz.fr1 cd blastz.fr1 cat << '_EOF_' > DEF # mouse vs. 
fugu export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from human-chicken, except L=6000 (more relaxed) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse SEQ1_DIR=/iscratch/i/mus/mm5/softNib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInFugu SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Fugu SEQ2_DIR=/iscratch/i/fr1/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/fr1/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastz.fr1.2004-07-28 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # first cluster run: raw blastz alignments ssh kk bash # if a csh/tcsh user cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28 source DEF mkdir $RAW run.0 /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j sh ./xdir.sh cd run.0 sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList para create jobList para try, check, push, check, .... # GOT HERE #Completed: 93775 of 93775 jobs #Average job time: 187s 3.11m 0.05h 0.00d #Longest job: 3907s 65.12m 1.09h 0.05d #Submission to last job: 76763s 1279.38m 21.32h 0.89d # second cluster run: lift raw alignments -> lav dir ssh kki bash # if a csh/tcsh user cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28 source DEF mkdir run.1 lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList cd run.1 wc -l jobList para create jobList para try, check, push, etc ... #Completed: 341 of 341 jobs #Average job time: 98s 1.63m 0.03h 0.00d #Longest job: 281s 4.68m 0.08h 0.00d #Submission to last job: 2102s 35.03m 0.58h 0.02d # third run: lav -> axt # (if non-default BLASTZ_Q is used in the future, put axtRescore in # the pipe after lavToAxt) ssh kki cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28 mkdir axtChrom pslChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t cat `ls -1 *.lav | sort -g` \ | $HOME/bin/x86_64/lavToAxt stdin \ /iscratch/i/mus/mm5/softNib /iscratch/i/fr1/nib stdout \ | $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt $HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl '_EOF_' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList para try, check, push, check #Completed: 43 of 43 jobs #Average job time: 671s 11.18m 0.19h 0.01d #Longest job: 2398s 39.97m 0.67h 0.03d #Submission to last job: 2417s 40.28m 0.67h 0.03d # CHAIN FUGU BLASTZ (WORKING 7/16/04 kate) # Run axtChain on little cluster ssh kki cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chainchimpSuperQuals ls -1S /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/mus/mm5/softNib \ /iscratch/i/fr1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 43 of 43 jobs #Average job time: 537s 8.96m 0.15h 0.01d #Longest job: 2071s 34.52m 0.58h 0.02d #Submission to last job: 2071s 34.52m 0.58h 0.02d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=5000 /tmp/score.$f:t:r echo "" end # Lots of chaff with scores in the 3000's. Many very-high-scoring # chains. So filter the chain down somewhat... mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered # Load chains into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm5 ${c}_chainFr1 $i end # mouse-fugu gets significantly less coverage than human-fugu: featureBits mm5 -chrom=chr1 chainFr1Link #63386139 bases of 185739816 (34.126%) in intersection featureBits hg17 -chrom=chr1 chainFr1Link #123999291 bases of 222827847 (55.648%) in intersection # mouse-fugu isn't a whole lot less than mouse-human though: featureBits mm5 -chrom=chr1 chainHg17Link #75492250 bases of 185739816 (40.644%) in intersection featureBits mm5 -chrom=chr1 chainCanFam1Link #63386139 bases of 185739816 (34.126%) in intersection # NET FUGU BLASTZ (WORKING 7/16/04 kate) ssh kolossus cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain netClass -noAr noClass.net mm5 fr1 fugu.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain rm noClass.net # Make a 'syntenic' subset of these with netFilter -syn fugu.net > fuguSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain netFilter -minGap=10 fugu.net | hgLoadNet mm5 netFr1 stdin netFilter -minGap=10 fuguSyn.net | hgLoadNet mm5 syntenyNetFr1 stdin # Add entries for chainFr1, netFr1 to mouse/mm5 trackDb # MAKE VSFR1 DOWNLOADABLES (WORKING 7/19/04 kate) ssh kksilo cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28 gzip axtNet/chr*.axt cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain ln all.chain fugu.chain zip /cluster/data/mm5/zip/fugu.chain.zip fugu.chain rm fugu.chain zip /cluster/data/mm5/zip/fugu.net.zip fugu.net zip /cluster/data/mm5/zip/fuguSyn.net.zip fuguSyn.net ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsFr1 cd /usr/local/apache/htdocs/goldenPath/mm5/vsFr1 mv /cluster/data/mm5/zip/fugu*.zip . cp -pR /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtNet . md5sum *.zip axtNet/* > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
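# (Quick check of the downloadables, sketch only: re-verify the checksums
# just written, from the download directory.)
# cd /usr/local/apache/htdocs/goldenPath/mm5/vsFr1
# md5sum -c md5sum.txt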
# GENERATE FR1 MAF FOR MULTIZ FROM NET (WORKING 7/19/04 kate) ssh kksilo cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain netSplit fugu.net net cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/mm5/nib \ /cluster/data/fr1/nib stdout \ | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.mc.maf axtToMaf $f \ /cluster/data/mm5/chrom.sizes /cluster/data/fr1/chrom.sizes \ $maf -tPrefix=mm5. -qPrefix=fr1. end # BLASTZ FR1 CLEAN UP (WORKING - 2004-07-28 - kate) ssh kksilo cd /cluster/data/mm5/bed/blastz.fr1 nice rm -rf raw & nice rm axtChain/run1/chain/* & nice gzip {axt,psl}Chrom/* lav/*/* axtChain/{all.chain,*.net} & # CONSERVATION TRACK - MULTIZ AND PHASTCONS (WORKING 2004-07-29 kate) ssh kksilo set multizDir = multiz.2004-07-29 set workingDir = /cluster/bluearc/mm5/$multizDir ln -s $workingDir /cluster/bluearc/mm5/multiz5way mkdir -p $workingDir mkdir -p /cluster/data/mm5/bed/$multizDir cd /cluster/data/mm5/bed/$multizDir # wrapper script for multiz # NOTE: first arg is pairwise, 2nd arg is multiple (to add to) # NOTE: next time, modify script so it only needs one arg -- saves the # multiple dirname in a file for use by the next run cat << 'EOF' > doMultiz.csh #!/bin/csh -fe mkdir -p $3:h /cluster/bin/penn/multiz $1 $2 - > $3 'EOF' # << for emacs cat << 'EOF' > gsub #LOOP ../doMultiz.csh {check in line /cluster/bluearc/mm5/multiz.2004-07-29/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/mm5/multiz.2004-07-29/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/mm5/multiz.2004-07-29/$(root1)$(dir1)/$(root2).maf} #ENDLOOP 'EOF' # << for emacs chmod +x doMultiz.csh # copy mafs to bluearc -- rat ssh kksilo set workingDir = /cluster/bluearc/mm5/multiz.2004-07-29 mkdir $workingDir/rn3 cp /cluster/data/mm5/bed/blastz.rn3/mafNet/chr*.maf $workingDir/rn3 ls $workingDir/rn3/*.maf > chrom.lst # human mkdir $workingDir/hg17 cp /cluster/data/mm5/bed/blastz.hg17/mafNet/chr*.maf $workingDir/hg17 # dog mkdir $workingDir/canFam1 cp /cluster/data/mm5/bed/blastz.canFam1/mafNet/chr*.maf $workingDir/canFam1 # chicken mkdir $workingDir/galGal2 cp /cluster/data/mm5/bed/blastz.galGal2/mafNet/chr*.maf $workingDir/galGal2 # first multiz - add in human to mouse/rat # ssh kki set multizDir = multiz.2004-07-29 set workingDir = /cluster/bluearc/mm5/$multizDir cd /cluster/data/mm5/bed/$multizDir mkdir run.hg17 cd run.hg17 echo "hg17/rn3" > species.lst gensub2 species.lst ../chrom.lst ../gsub jobList para create jobList # 43 jobs para try, check, push, check cd .. # dog mkdir run.canFam1 cd run.canFam1 echo "canFam1/rn3hg17" > species.lst gensub2 species.lst ../chrom.lst ../gsub jobList para create jobList para try, check, push, check cd .. # chicken mkdir run.galGal2 cd run.galGal2 echo "galGal2/rn3hg17canFam1" > species.lst gensub2 species.lst ../chrom.lst ../gsub jobList # no alignment file for chr18_random -- create one so we can create jobList para create jobList para try, check, push, check cd .. 
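# (For reference: with the species.lst entry "galGal2/rn3hg17canFam1" the
# gsub template above expands, per chromosome, to roughly
#   ../doMultiz.csh galGal2/chrN.maf rn3hg17canFam1/chrN.maf \
#       rn3hg17canFam1galGal2/chrN.maf
# i.e. the pairwise chicken maf is folded into the existing 4-way multiple
# alignment, producing the 5-way rn3hg17canFam1galGal2 directory used
# below.  Paths shown are abbreviated; the real jobList carries the full
# /cluster/bluearc/mm5/multiz.2004-07-29 prefixes.)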
# copy 5-way mafs to build directory ssh kksilo set multizDir = multiz.2004-07-29 set workingDir = /cluster/bluearc/mm5/$multizDir ln -s $workingDir/rn3hg17canFam1galGal2 $workingDir/maf cd /cluster/data/mm5/bed/multiz.2004-07-29 mkdir maf cp $workingDir/maf/*.maf maf # PHYLO-HMM CONSERVATION FOR 5-WAY MULTIZ (DONE 2004-07-29 kate) # updated 09-13-04 acs ssh kksilo set path = ($path /cluster/bin/phast) cd /cluster/data/mm5/bed/multiz.2004-07-29 mkdir cons cd cons #break up the genome-wide MAFs into pieces mkdir /cluster/bluearc/mm5/chrom cd /cluster/data/mm5 foreach f (?{,?}/*.fa) echo $f cp $f /cluster/bluearc/mm5/chrom end ssh kki cd /cluster/data/mm5/bed/multiz.2004-07-29/cons mkdir run.split cd run.split set WINDOWS = /cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/sh PHAST=/cluster/bin/phast FA_SRC=/cluster/bluearc/mm5/chrom WINDOWS=/cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS maf=$1 c=`basename $maf .maf` echo $c mkdir -p /scratch/msa_split ${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O mm5,rn3,hg17,canFam1,galGal2 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -B 5000 [ $? -eq 0 ] || exit 1 echo "Copying..." cd /scratch/msa_split for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done [ $? -eq 0 ] || exit 1 rm -f /scratch/msa_split/$c.*.ss echo "Done copying" echo "Done" >> ${WINDOWS}/$c.done 'EOF' # << for emacs chmod +x doSplit.sh rm -f jobList foreach file (/cluster/bluearc/mm5/multiz.2004-07-29/maf/*.maf) set c = $file:t:r echo "doSplit.sh $file {check out line+ $WINDOWS/$c.done}" >> jobList end para create jobList # 43 jobs para try para check para push #CPU time in finished jobs: 4354s 72.57m 1.21h 0.05d 0.000 y #IO & Wait Time: 6102s 101.70m 1.70h 0.07d 0.000 y #Average job time: 243s 4.05m 0.07h 0.00d #Longest job: 728s 12.13m 0.20h 0.01d #Submission to last job: 1300s 21.67m 0.36h 0.02d cd .. # generate conservation scoring using phastCons ssh kk cd /cluster/data/mm5/bed/multiz.2004-07-29/cons mkdir run.cons cd run.cons # skip parameter estimation step: use parameters already estimated for # hg17 (see makeHg17.doc) cp /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements/ave.cons.mod /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements/ave.noncons.mod . cat << 'EOF' > doPhastCons.sh #!/bin/sh mkdir -p /cluster/bluearc/mm5/phastCons/POSTPROBS /cluster/bluearc/mm5/phastCons/ELEMENTS pref=`basename $1 .ss.gz` chr=`echo $pref | awk -F\. '{print $1}'` tmpfile=/scratch/phastCons.$$ zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 12 --target-coverage 0.15 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/mm5/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile gzip -c $tmpfile > /cluster/bluearc/mm5/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile EOF chmod u+x doPhastCons.sh rm -fr /cluster/bluearc/mm5/phastCons/POSTPROBS /cluster/bluearc/mm5/phastCons/ELEMENTS rm -f jobs.lst for f in /cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs.lst ; done # run cluster job para create, ... 
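# (For one window a job looks like this -- window name is illustrative:)
# ./doPhastCons.sh \
#   /cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS/chr19.3000001-4000000.ss.gz
# which writes POSTPROBS/chr19.3000001-4000000.pp.gz and an ELEMENTS bed
# for that window.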
# took about 10 minutes # combine predictions and transform scores to be in 0-1000 interval # do in a way that avoids limits on numbers of args find /cluster/bluearc/mm5/phastCons/ELEMENTS -name "*.bed" > files rm -f splitfiles* all.raw.bed split files splitfiles for s in splitfiles* ; do awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' `cat $s` >> all.raw.bed ; done /cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed rm files splitfiles* hgLoadBed mm5 phastConsElements all.bed # check coverage featureBits mm5 phastConsElements #135605549 bases of 2615483787 (5.185%) in intersection # This should be close enough. If necessary, you can rerun the # steps above with a different target coverage. When hitting the # target is important, you may want to perform several iterations # using a representative subset of the entire dataset (in human, chr1 # seems to work pretty well) # set up wiggle mkdir -p /cluster/bluearc/mm5/phastCons/wib cat << 'EOF' > doWigAsciiToBinary.sh #!/bin/sh chr=$1 zcat `ls /cluster/bluearc/mm5/phastCons/POSTPROBS/$chr.*.pp.gz | sort -t\. -k2,2n` | wigAsciiToBinary -chrom=$chr -wibFile=/cluster/bluearc/mm5/phastCons/wib/${chr}_phastCons stdin EOF chmod u+x doWigAsciiToBinary.sh rm -f jobs2.lst for chr in `ls /cluster/bluearc/mm5/phastCons/POSTPROBS | awk -F\. '{print $1}' | sort -u` ; do echo doWigAsciiToBinary.sh $chr >> jobs2.lst ; done # run a little wigAsciiToBinary cluster job ssh kk, etc. # copy wibs and wigs from bluearc rsync -av /cluster/bluearc/mm5/phastCons/wib . # load track hgLoadWiggle mm5 phastCons -pathPrefix=/gbdb/mm5/phastCons/wib \ wib/chr*_phastCons.wig mkdir -p /gbdb/mm5/phastCons/wib rm -f /gbdb/mm5/phastCons/wib/chr*phastCons.wib ln -s /cluster/data/mm5/bed/multiz.2004-07-29/cons/run.cons/wib/*.wib /gbdb/mm5/phastCons/wib chmod 775 . wib /gbdb/mm5/phastCons /gbdb/mm5/phastCons/wib chmod 664 wib/*.wib # move postprobs over and clean up bluearc rsync -av /cluster/bluearc/mm5/phastCons/POSTPROBS . # (people sometimes want the raw scores) rm -r /cluster/bluearc/mm5/phastCons/ELEMENTS /cluster/bluearc/mm5/phastCons/POSTPROBS /cluster/bluearc/mm5/phastCons/wib # load data for track name "multiz5way" # load multiz maf tables ssh hgwdev cd /cluster/data/mm5/bed/multiz.2004-07-29 set mafDir = /gbdb/mm5/multiz5way/maf set table = multiz5way mkdir -p $mafDir/$table ln -s `pwd`/maf/*.maf $mafDir/$table cd maf hgLoadMaf mm5 -warn multiz5way -pathPrefix=$mafDir/$table # load blastz maf tables # TODO: change mafWiggle to use db names instead of species names # in speciesOrder ssh hgwdev cd /cluster/data/mm5/bed ln -s multiz.2004-07-29 multiz5way cat > multiz5way/loadMaf.csh << 'EOF' set mafDir = /gbdb/mm5/multiz5way/maf foreach s (rn3 hg17 canFam1 galGal2) set O = `echo "select genome from dbDb where name='$s'" | \ hgsql -s -h genome-testdb hgcentraltest` set o = $O:l set table = ${o}_netBlastz mkdir -p $mafDir/$table ln -s `pwd`/blastz.$s/mafNet/*.maf $mafDir/$table echo $o hgLoadMaf mm5 -warn ${o}_netBlastz -pathPrefix=$mafDir/$table end 'EOF' # <&! 
multiz5way/loadMaf.log & # track multiz5way # shortLabel Conservation # longLabel Rat/Human/Dog/Chicken Multiz Alignments & PhyloHMM Cons # group compGeno # priority 149 # visibility pack #color 0, 10, 100 # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons # yLineOnOff Off # autoScale Off # pairwise netBlastz # speciesOrder rat human dog chicken # MULTIZ DOWNLOAD FILES (DONE kate 2004-08-03) ssh kksilo cd /cluster/data/mm5/bed/multiz5way # multiz mkdir gzMaf foreach f (maf/*.maf) gzip -c $f > gzMaf/$f:t.gz echo $f end ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/multiz5way cd /usr/local/apache/htdocs/goldenPath/mm5/multiz5way mv /cluster/data/mm5/bed/multiz5way/gzMaf/* . rmdir /cluster/data/mm5/bed/multiz5way/gzMaf md5sum *.gz > md5sum.txt # make a README.txt file # PHASTCONS SCORES DOWNLOADABLES (DONE 10/11/04 angie) ssh kksilo mkdir /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2 cd /cluster/data/mm5/bed/multiz5way/cons/run.cons/POSTPROBS foreach chr (`awk '{print $1;}' /cluster/data/mm5/chrom.sizes`) echo $chr zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` \ | gzip -c \ > /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2/$chr.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/mm5/phastCons # Doh! /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2 is 8.6G now -- too much # to dump on hgwdev's / which is at 94%. Instead of doing this: #mv /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2 . # make symbolic links: mkdir /usr/local/apache/htdocs/goldenPath/mm5/phastCons/mzRn3Hg17Cf1Gg2 cd /usr/local/apache/htdocs/goldenPath/mm5/phastCons/mzRn3Hg17Cf1Gg2 ln -s /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2/* . md5sum *.gz > md5sum.txt # make a README.txt. # PREP FOR LIFTOVER CHAINS TO THIS ASSEMBLY (2004-08-02 kate) # split into 3K chunks ssh kksilo cd /cluster/data/mm5 set liftDir = /iscratch/i/mm5/liftOver/liftSplit mkdir -p $liftDir cd $liftDir mkdir -p split lift cat > split.csh << 'EOF' set liftDir = /iscratch/i/mm5/liftOver/liftSplit cd /cluster/data/mm5 foreach n (`ls ?{,?}/*.fa`) set d = $n:h set c = $n:t:r echo $c faSplit -lift=$liftDir/lift/$c.lft size \ /cluster/data/mm5/$d/$c.fa -oneFile 3000 $liftDir/split/$c end 'EOF' # << for emacs csh split.csh >&! split.log & tail -100f split.log ssh kkr1u00 iSync # LOAD GENEID GENES (DONE 8/2/04 Fan) # reloaded 3/16/04 with -gtf instead of -exon=CDS (nec. now! for stop_codon) mkdir -p /cluster/data/mm5/bed/geneid/download cd /cluster/data/mm5/bed/geneid/download foreach f (/cluster/data/mm5/*/chr*.fa) set chr = $f:t:r wget \ http://genome.imim.es/genepredictions/M.musculus/mmMay2004/geneid_v1.2/$chr.gtf wget \ http://genome.imim.es/genepredictions/M.musculus/mmMay2004/geneid_v1.2/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. ldHgGene -genePredExt -gtf mm5 geneid download/*.gtf hgPepPred mm5 generic geneidPep download/*-fixed.prot # PRODUCING GENSCAN PREDICTIONS (DONE 08-03-04 Fan) ssh hgwdev mkdir /cluster/data/mm5/bed/genscan cd /cluster/data/mm5/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). 
ssh kki cd /cluster/data/mm5/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f genome.list touch genome.list foreach f ( `ls -1S /cluster/data/mm5/*/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try, check, push, check, ... # Completed: 638 of 639 jobs # Crashed: 1 jobs # CPU time in finished jobs: 386282s 6438.03m 107.30h 4.47d 0.012 y # IO & Wait Time: 3735s 62.25m 1.04h 0.04d 0.000 y # Average job time: 611s 10.19m 0.17h 0.01d # Longest job: 22687s 378.12m 6.30h 0.26d # Submission to last job: 33710s 561.83m 9.36h 0.39d # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". /cluster/bin/i386/gsBig /cluster/data/mm5/19/chr19_1/chr19_1.fa.masked gtf/chr19_1.fa.gtf -trans=pep/chr19_1.fa.pep -subopt=subopt/chr19_1.fa.bed - exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat - tmp=/tmp -window=1200000 # Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/mm5/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/mm5/bed/genscan # Reloaded without -genePredExt 1/6/05: ldHgGene mm5 -gtf genscan genscan.gtf hgPepPred mm5 generic genscanPep genscan.pep hgLoadBed mm5 genscanSubopt genscanSubopt.bed # MITOPRED DATA FOR HGGENE (DONE 8/10/04 angie) ssh hgwdev mkdir /cluster/data/mm5/bed/mitopred cd /cluster/data/mm5/bed/mitopred wget http://mitopred.sdsc.edu/data/mus_30.out perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' mus_30.out > mitopred.tab cat > mitopred.sql << '_EOF_' # Prediction of nuclear-encoded mito. 
proteins from http://mitopred.sdsc.edu/ CREATE TABLE mitopred ( name varchar(10) not null, # SwissProt ID confidence varchar(8) not null, # Confidence level #Indices PRIMARY KEY(name(6)) ); '_EOF_' # << this line makes emacs coloring happy hgsql mm5 < mitopred.sql hgsql mm5 -e 'load data local infile "mitopred.tab" into table mitopred' # STS MARKERS TRACK (RE-BUILT - 2004-08-24- Fan) ssh kksilo mkdir -p /cluster/data/mm5/bed/STSmarkers/downloads cd /cluster/data/mm5/bed/STSmarkers/downloads # these files appear to be new almost every day wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases # these map files appear to be old, 2002 Data wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/* # Picks up files: # 345184 Feb 20 2002 10090.MGD.txt # 173294 Jun 27 2002 10090.WI_Mouse_Genetic.txt # 240637 Jun 27 2002 10090.WI_Mouse_YAC.txt # 390088 Jun 27 2002 10090.Whitehead-MRC_RH.txt # If these files have not been changing, then no need to worry about # them. We are just picking them up to see if they have changed # since the last time we worked on this. # these reports from jax.org appear to be changing daily wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt # compare them with previous versions. Before this these were # in /cluster/store5/mouseMarker/orig # these newly picked up files: sum -r 10090* # 48882 338 10090.MGD.txt # 24176 381 10090.Whitehead-MRC_RH.txt # 62367 170 10090.WI_Mouse_Genetic.txt # 50616 235 10090.WI_Mouse_YAC.txt sum -r *.rpt # 21267 4442 MRK_Dump2.rpt # 51274 3743 MRK_Sequence.rpt # 35293 2315 PRB_PrimerSeq.rpt sum -r UniSTS* # 40884 10502 UniSTS.aliases # 14407 2931 UniSTS_mouse.sts # the previous copies cd /cluster/store5/mouseMarker/orig sum -r 10090* # 48882 338 10090.MGD.txt # 24176 381 10090.Whitehead-MRC_RH.txt # 62367 170 10090.WI_Mouse_Genetic.txt # 50616 235 10090.WI_Mouse_YAC.txt sum -r *.rpt # 36880 4160 MRK_Dump2.rpt # 02447 3132 MRK_Sequence.rpt # 57914 2220 PRB_PrimerSeq.rpt sum -r UniSTS* # 36201 8843 UniSTS.aliases # 58524 970 UniSTS_mouse.alias # 42464 2291 UniSTS_mouse.sts # back to our work area, update the bed file # to do this we need a new UniSTS_mouse.alias file # it is created by a combination of information from several # of the above files ! AND ! the previous stsInfoMouse.bed file cp /cluster/data/mm4/bed/STSmarkers/downloads/*.sh . -p cp /cluster/data/mm4/bed/STSmarkers/downloads/*.pl . -p # This process has been captured in the script: # /cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh # which uses a couple of perl scripts in that same directory. 
# briefly it is: # cd /cluster/data/mm5/bed/STSmarkers/downloads # ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0 # grep MGI: UniSTS.aliases > MGI.aliases # ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \ # stsInfoAliases.txt # ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases # cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \ # | sort -n > UniSTS_mouse.alias /cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh # with that, we can create a new stsInfoMouse.bed file: bash cd /cluster/data/mm5/bed/STSmarkers /cluster/store5/mouseMarker/code/updateBed.pl \ /cluster/store5/mouseMarker/stsInfoMouse.bed \ downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \ downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \ downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile # Yontao updated /cluster/store5/mouseMarker/code/cleanInfo.pl 8/10/04 /cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed # copy the stsInfoMouse.bed file from working dir to the marker info storage fold. # added 2 new steps by Yontao mv /cluster/store5/mouseMarker/stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed_mm3 cp -p stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed # comparing to Mm4, this file was used there: # /cluster/store6/mm4/bed/STSmarkers # a wc of it shows: # 56406 786036 6425721 stsInfoMouse.bed # Now we have: # 58488 790056 6602318 stsInfoMouse.bed # and from that, create new primer fa, epcr, etc: /cluster/store5/mouseMarker/code/luConvertPrimerToFa \ stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info # the mouseC.fa file will be empty wc mouse?.* # 0 0 0 mouseC.fa # 286740 286686 6474893 mouseP.fa # 32232 161234 2044810 mouseP.info # 318972 447920 8519703 total # the equivalent Mm4 versions: # 0 0 0 mouseC.fa # 258307 258245 5815248 mouseP.fa # 29906 149545 1890926 mouseP.info # copy the primers over to the bluearc for the kluster run cp -p mouseP.fa /cluster/bluearc/scratch/mus/mm5 cp -p mouseP.info /cluster/bluearc/scratch/mus/mm5 # CLUSTER RUN FOR THE STS PRIMERS ssh kk mkdir -p /cluster/data/mm5/bed/STSmarkers/primer mkdir -p /cluster/data/mm5/bed/STSmarkers/ePCR cd /cluster/data/mm5/bed/STSmarkers/primer # the mouseP.fa comes from above echo "/cluster/bluearc/scratch/mus/mm5/mouseP.fa" > primers.lst # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. cat << '_EOF_' > template #LOOP /cluster/bin/i386/blat.2 $(path1) $(path2) -ooc=/scratch/hg/h/mouse11.ooc -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl} #ENDLOOP '_EOF_' mkdir primers.out /cluster/bin/scripts/splitContigList -mouse -scratch \ /cluster/bluearc/scratch/mus/mm5/maskedContigs 1 /cluster/bin/i386/gensub2 contig.lst primers.lst template jobList para create jobList para try para check para push ... etc ... 
# Completed: 639 of 639 jobs # CPU time in finished jobs: 334066s 5567.76m 92.80h 3.87d 0.011 y # IO & Wait Time: 72565s 1209.42m 20.16h 0.84d 0.002 y # Average job time: 636s 10.61m 0.18h 0.01d # Longest job: 800s 13.33m 0.22h 0.01d # Submission to last job: 1090s 18.17m 0.30h 0.01d # on the file server ssh kksilo cd /cluster/data/mm5/bed/STSmarkers/primer /cluster/bin/i386/pslSort dirs primers.psl temp primers.out rmdir temp # comparing results to Mm4: wc primers.psl # 5719969 120119288 590806241 primers.psl # Mm4 wc primers.psl /cluster/data/mm4/bed/STSmarkers/primer/primers.psl # 5745617 120657896 592135728 primers.psl # another kluster run ssh kk cd /cluster/data/mm5/bed/STSmarkers/ePCR ls -1S /cluster/bluearc/scratch/mus/mm5/maskedContigs > contig.lst # Edit this list to get full path names! mkdir epcr.out cat << '_EOF_' > template #LOOP /cluster/bin/scripts/luRunEpcr $(path1) $(path2) epcr.out/$(num2).epcr #ENDLOOP '_EOF_' # the mouseP.info was created above echo "/cluster/bluearc/scratch/mus/mm5/mouseP.info" > epcr.lst gensub2 epcr.lst contig.lst template jobList para create jobList para try para check para push ... etc ... # Completed: 639 of 639 jobs # CPU time in finished jobs: 146365s 2439.41m 40.66h 1.69d 0.005 y # IO & Wait Time: 67691s 1128.19m 18.80h 0.78d 0.002 y # Average job time: 335s 5.58m 0.09h 0.00d # Longest job: 427s 7.12m 0.12h 0.00d # Submission to last job: 485s 8.08m 0.13h 0.01d ssh hgwdev cd /cluster/data/mm5/bed/STSmarkers/ePCR # all those results become all.epcr cat epcr.out/*.epcr > all.epcr # comparing results to Mm4: wc *.epcr # 55677 222708 2945623 all.epcr wc /cluster/store6/mm4/bed/STSmarkers/ePCR/*.epcr # 74705 298820 3971712 /cluster/store6/mm4/bed/STSmarkers/ePCR/all.epcr cd /cluster/data/mm5/bed/STSmarkers/primer /cluster/bin/scripts/filterSTSPrimers \ -mouse ../stsInfoMouse.bed primers.psl \ ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat # The output should show an increasing count: # Reading name info # Reading primer info # Processing file # 100000 # 200000 # 300000 # ... 
# 5700000 # Determining ePCR not found # wc primers.psl.filter.blat # 33476 702996 3442402 primers.psl.filter.blat # Mm4: wc primers.psl.filter.blat # 32729 687309 3331894 primers.psl.filter.blat # create accession_info.rdb (chrM added to Terry's script for mouse) touch empty_sequence.inf /cluster/bin/scripts/compileAccInfo -mouse \ /cluster/data/mm5 empty_sequence.inf # works with two seemingly errors: # cat: /cluster/data/mm5/11/chr11_random.agp: No such file or directory # cat: /cluster/data/mm5/M/chrM_random.agp: No such file or directory mv accession_info.rdb accession_info.rdb.tmp /cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > \ accession_info.rdb rm accession_info.rdb.tmp # comparing results to Mm4: # Mm5 wc accession_info.rdb # 131845 1450299 9681940 accession_info.rdb # Mm4 wc accession_info.rdb # 86935 956289 6374930 accession_info.rdb # # 219652 1885501 11875772 total # wc /cluster/data/mm5/?/*.agp /cluster/data/mm5/??/*.agp # 252515 2152346 13568720 total # creates epcr.not.found.nomatch and epcr.not.found.psl /cluster/bin/scripts/epcrToPsl -mouse \ epcr.not.found ../mouseP.info \ accession_info.rdb /cluster/data/mm5 # Comparing results to Mm4: # Mm5 wc epcr* # 463 1852 17080 epcr.not.found # 61 732 5845 epcr.not.found.nomatch # 402 8442 39011 epcr.not.found.psl # Mm4 wc epcr* # 328 1312 12011 epcr.not.found # 57 684 5474 epcr.not.found.nomatch # 266 5586 25711 epcr.not.found.psl # there is a single error being propagated here from the file # /cluster/store5/mouseMarker/stsInfoMouse.bed which has an error # at line 53958: 62943 D2J3 91947 D2J3 CAACCAGCTCAC CAACCAGCTCAC 1825, 1025BP 0 MUS MUSCULUS # The value '1825,' is incorrect. Should be a small integer here. # to work around this problem, I'm manually eliminating this problem # from the epcr.not.found.psl file where it has now become four bad # lines: # 24 0 0 0 1 1801 1 1789 + 27119 1825 0 1825chr11_16 0 1115413 1117226 2 12,12, 0,1813, 1115413,1117214, # 24 0 0 0 1 1801 1 1789 + 27119 1825 0 1825chr11_16 0 1115413 1117226 2 12,12, 0,1813, 1115413,1117214, 216a219,220 # 24 0 0 0 1 1801 1 1789 + 62943 1825, 0 1825,chr11_16 0 1115413 1117226 2 12,12, 0,1813, 1115413,1117214, # 24 0 0 0 1 1801 1 1789 + 62943 1825, 0 1825,chr11_16 0 1115413 1117226 2 12,12, 0,1813, 1115413,1117214, # taking those four lines out. 
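# (One non-interactive way to drop those four lines, sketch only -- the
# pattern keys on the bogus 1825/"1825," qSize field shown above; check
# the match count with grep -c before overwriting.)
# egrep -v '(27119|62943)[[:space:]]+1825' epcr.not.found.psl > tmp.psl
# mv tmp.psl epcr.not.found.psl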
cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter # lift those primers (added chrM to this lifter script for mouse) # creates primers.psl.filter.lifted /cluster/bin/scripts/lifter -mouse -psl \ /cluster/data/mm5 primers.psl.filter # wc primers.psl.filter.lifted # 33691 707511 3601164 primers.psl.filter.lifted # create primers.psl.filter.lifted.initial bash PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/extractPslInfo primers.psl.filter.lifted # wc primers.psl.filter.lifted.initial # 33689 202134 1799016 primers.psl.filter.lifted.initial # create primers.psl.filter.lifted.initial.acc /cluster/bin/scripts/findAccession -agp \ -mouse primers.psl.filter.lifted.initial /cluster/data/mm5 # wc primers.psl.filter.lifted.initial.acc # 33689 235823 2158029 primers.psl.filter.lifted.initial.acc # this needs to be -rat as that specifies how to scan the # stsInfoMouse.bed file and it does not work if you use -mouse /cluster/bin/scripts/getStsId -rat \ ../stsInfoMouse.bed primers.psl.filter.lifted.initial.acc \ > primers.initial.acc.trans # wc primers.initial.acc.trans # 33689 235823 1834889 primers.initial.acc.trans sort -k 4n primers.initial.acc.trans > primers.final rm primers.psl.filter.lifted.initial.acc primers.initial.acc.trans # comparing results to Mm4: # Mm5 wc primers.final # 33689 235823 1834889 primers.final # Mm4 wc primers.final # 32983 230881 1771293 primers.final cd /cluster/data/mm5/bed/STSmarkers # stsMarkers.final is empty for mouse touch stsMarkers.final dummy bash PATH=/cluster/bin/scripts:$PATH \ /cluster/bin/scripts/combineSeqPrimerPos \ stsMarkers.final primer/primers.final > stsMarkers_pos.rdb # Comparing results to Mm4 # Mm5 wc stsMarkers_pos.rdb # 32085 224595 1862816 stsMarkers_pos.rdb # Mm4 wc stsMarkers_pos.rdb # 31270 218890 1869417 stsMarkers_pos.rdb /projects/cc/hg/ytlu/bin/script/perl/createStsBed \ stsInfoMouse.bed stsMarkers_pos.rdb 500 > stsMapMouse.bed # wc stsMapMouse.bed # 29069 301535 2123622 stsMapMouse.bed # loading STS markers tables ssh hgwdev cd /cluster/data/mm5/bed/STSmarkers cp -p /cluster/store6/mm4/bed/STSmarkers/ucscAlias.pl . bash ./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings # wc ucscStsAlias.tab # 126624 379859 3037850 ucscStsAlias.tab hgsql -e "drop table stsAlias;" mm5 hgsql mm5 < ~/kent/src/hg/lib/stsAlias.sql hgsql -e \ 'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm5 hgsql -e "drop table stsMapMouseNew;" mm5 hgsql mm5 < ~/kent/src/hg/lib/stsMapMouseNew.sql hgsql -e \ 'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm5 hgsql -e "drop table stsInfoMouseNew;" mm5 hgsql mm5 < ~/kent/src/hg/lib/stsInfoMouseNew.sql hgsql -e \ 'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm5 hgLoadPsl -nobin -table=all_sts_primer mm5 primer/primers.psl.filter.lifted # load primer sequences mkdir /gbdb/mm5/stsMarker ln -s /cluster/data/mm5/bed/STSmarkers/mouseP.fa \ /gbdb/mm5/stsMarker/mouseP.fa # PLEASE NOTE THAT THE -replace option is used because this is a rebuild, # otherwise there will be a problem that the seq and extFile tables # will be out of sync. hgLoadSeq -replace mm5 /gbdb/mm5/stsMarker/mouseP.fa # Adding /gbdb/mm5/stsMarker/mouseP.fa # 32232 sequences # DONE - 2004-08-24 17:02 # QA repush 2006-02-08 seq table to remove old STS sequences with no extFile reference (Jen) Heather found problem found on rr. RR table matched dev and beta was correct, so no joinerCheck errors for the mismatch were flagged for review. 
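# (A quick consistency check for the seq/extFile sync issue noted above --
# sketch only, assuming the standard seq/extFile schema:)
# hgsql mm5 -e 'select count(*) from seq left join extFile \
#   on seq.extFile = extFile.id where extFile.id is null'
# should return 0 once the stale rows are gone.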
# BLASTZ RAT RN3 (RE-DONE - 2004-08-30 - Fan) # !!! PLEASE NOTE AS OF 9/2/04, THE 8/30/04-8/31/04 REBUILD OF BLASTZ, CHAIN, AND NET # FOR MM5-RN3 IS NO LONG USED FOR MM5. THE OLD MM5-RN3 CHAIN AND NET BUILD OF 7/14/04 # IS REVERSE PUSHED FROM RR BACK TO HGWDEV. # Reason for rebuild is to use more stringent blastz parameters to reduce size # of output files. # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=50000 # scoring matrix # BLASTZ_Q=/cluster/data/blastz/mus_rat.q # MAKE SURE TO INCLUDE THE RESCORE STEP TO CORRECT A BLASTZ PROBLEM. # (axtRescore -scoreScheme=/cluster/data/blastz/mus_rat.q ...) ssh kk mkdir -p /cluster/data/mm5/bed/blastz.rn3.2004-08-29 cd blastz.rn3.2004-08-29 cat << '_EOF_' > DEF # rat vs. mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=50000 BLASTZ_T=2 # scoring matrix BLASTZ_Q=/cluster/data/blastz/mus_rat.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Mouse SEQ1_DIR=/scratch/mus/mm5/softNib # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInRat SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Rat SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastz.rn3.2004-08-29 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # prepare first cluster run ssh kk cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29 bash source ./DEF # script copied over from /cluster/data/hg17/jkStuff/BlastZ_run0.sh # it is a generic script and works for any assembly cp -p /cluster/data/hg17/jkStuff/BlastZ_run0.sh \ /cluster/data/mm5/jkStuff/BlastZ_run0.sh /cluster/data/mm5/jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... # Completed: 41943 of 41943 jobs # CPU time in finished jobs: 4656727s 77612.11m 1293.54h 53.90d 0.148 y # IO & Wait Time: 460782s 7679.70m 128.00h 5.33d 0.015 y # Average job time: 122s 2.03m 0.03h 0.00d # Longest job: 2042s 34.03m 0.57h 0.02d # Submission to last job: 8307s 138.45m 2.31h 0.10d # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kki cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29 # script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh # fixup machine check, should be kki, not kk cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh \ /cluster/data/mm5/jkStuff/BlastZ_run1.sh vi /cluster/data/mm5/jkStuff/BlastZ_run1.sh /cluster/data/mm5/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # Completed: 341 of 341 jobs # CPU time in finished jobs: 1293s 21.54m 0.36h 0.01d 0.000 y # IO & Wait Time: 2113s 35.22m 0.59h 0.02d 0.000 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest job: 54s 0.90m 0.01h 0.00d # Submission to last job: 719s 11.98m 0.20h 0.01d # NOTE: BlastZ_run2.sh is not used here. Instead Angie's approach # (using Rescore) is adopted here. # third run: lav -> axt # NOTE: use axtRescore here because we used a non-default BLASTZ_Q matrix # and abridged repeats (Penn State's restore_rpts program rescores with # default matrix, oops). 
ssh kki cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29 # mv old subdirectories mv axtChrom axtChrom.old mv run.2 run.2.old mkdir axtChrom pslChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t set path = (/cluster/bin/x86_64 $path) cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin \ /iscratch/i/mus/mm5/softNib /iscratch/i/rn3/bothMaskedNibs stdout \ | axtRescore -scoreScheme=/cluster/data/blastz/mus_rat.q stdin stdout \ | axtSort stdin ../../axtChrom/$chr.axt axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl '_EOF_' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList para try, check, push, check # Completed: 43 of 43 jobs # CPU time in finished jobs: 498s 8.31m 0.14h 0.01d 0.000 y # IO & Wait Time: 3367s 56.11m 0.94h 0.04d 0.000 y # Average job time: 90s 1.50m 0.02h 0.00d # Longest job: 299s 4.98m 0.08h 0.00d # Submission to last job: 685s 11.42m 0.19h 0.01d # CHAIN RAT BLASTZ (RE-DONE 8/30/04 Fan) # Run axtChain on little cluster ssh kki cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/mus_rat.q \ -minScore=5000 $1 \ /iscratch/i/mus/mm5/softNib \ /iscratch/i/rn3/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # Completed: 43 of 43 jobs # CPU time in finished jobs: 3145s 52.42m 0.87h 0.04d 0.000 y # IO & Wait Time: 989s 16.48m 0.27h 0.01d 0.000 y # Average job time: 96s 1.60m 0.03h 0.00d # Longest job: 280s 4.67m 0.08h 0.00d # Submission to last job: 1219s 20.32m 0.34h 0.01d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=5000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain mm5 ${c}_chainRn3 $i end featureBits mm5 chainRn3Link # 1677291680 bases of 2615483787 (64.129%) in intersection nice featureBits hg17 chainRn3Link # 982059013 bases of 2866216770 (34.263%) in intersection # NET RAT BLASTZ (RE-DONE 8/31/04 Fan) ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin hNoClass.net # memory usage 1710399488, utime 7360 s/100, stime 1891 # The above adapted from Angie's approach # The netClass operations requires an "ancientRepeat" table to exist # in either mm5 or rn3. 
So, create the table: ssh hgwdev mkdir -p /cluster/data/mm5/bed/ancientRepeat cd /cluster/data/mm5/bed/ancientRepeat # mysqldump needs write permission to this directory # and you need to use your read/write enabled user with password chmod 777 . hgsqldump --all --tab=. mm4 ancientRepeat chmod 775 . hgsql mm5 < ancientRepeat.sql mysqlimport -u -p mm5 ancientRepeat.txt # This is a hand curated table obtained from Arian. # The ancientRepeat table was loaded during the first build of NET RAT BLASTZ. ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain time netClass hNoClass.net mm5 rn3 rat.net \ -tNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInRat \ -qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse # 491.210u 96.250s 12:27.37 78.6% 0+0k 0+0io 249pf+0w # If things look good do ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain rm -r hNoClass.net # Make a 'syntenic' subset of these with time netFilter -syn rat.net > ratSyn.net # 216.290u 34.220s 4:27.60 93.6% 0+0k 0+0io 119pf+0w # Load the nets into database ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain netFilter -minGap=10 rat.net | hgLoadNet mm5 netRn3 stdin netFilter -minGap=10 ratSyn.net | hgLoadNet mm5 syntenyNetRn3 stdin # check results # featureBits mm4 netRn3 # 96806381 bases of 95076222 (101.820%) in intersection # featureBits mm5 netRn3 # 2601384082 bases of 2615483787 (99.461%) in intersection # featureBits mm4 syntenyNetRn3 # 96760405 bases of 95076222 (101.771%) in intersection # featureBits mm5 syntenyNetRn3 # 2575035774 bases of 2615483787 (98.454%) in intersection # Add entries for net and chain to mouse/mm5 trackDb # make net ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain mkdir ratNet time netSplit rat.net ratNet # 218.990u 29.290s 4:27.86 92.6% 0+0k 0+0io 190pf+0w # extract axts from net mkdir ../axtNet foreach n (ratNet/chr*.net) set c=$n:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt ratNet/$c.net chain/$c.chain \ /cluster/data/mm5/nib \ /cluster/data/rn3/nib ../axtNet/$c.axt echo "Complete: $c.net -> axtNet/$c.axt" end # sort axt's and convert to maf format mkdir ../mafNet foreach f (../axtNet/chr*.axt) set c=$f:t:r echo $c.axt mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt rm ../axtNet/$c.unsorted.axt axtToMaf ../axtNet/$c.axt \ /cluster/data/mm5/chrom.sizes /cluster/data/rn3/chrom.sizes \ ../mafNet/$c.maf -tPrefix=mm5. -qPrefix=rn3. end ssh hgwdev mkdir -p /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtBest cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtBest ln -s ../axtNet/chr*.axt . 
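# (Note: axtBest here is just symlinks to the net axt's -- the net already
# keeps only the best-in-genome chain for each region, so no separate
# axtBest run is needed for the downstream steps.)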
# copy net axt's to download area ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet cd /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet nice gzip *.axt # add README.txt file to dir (use previous assembly's copy as template) # Convert those axt files to psl ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestRn3.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestRn3.psl echo "Done: ${c}_blastzBestRn3.psl" end # Load tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/pslBest bash for I in chr*BestRn3.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I} echo "done ${I}" done # check results # featureBits mm5 blastzBestRn3 # 1674716868 bases of 2615483787 (64.031%) in intersection # featureBits mm4 blastzBestRn3 # 1780774716 bases of 2627444668 (67.776%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm5/axtBest/Rn3 cd /gbdb/mm5/axtBest/Rn3 rm * ln -s /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtNet/chr*.axt . ssh hgwdev cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/mm5/axtBest/Rn3/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end # these axtInfo file entries should be appended to the table, # not replacing it. The previous hg17 entries are needed -- bob kuhn hgsql mm5 -e 'drop table mm5.axtInfo;' hgsql mm5 < ~/kent/src/hg/lib/axtInfo.sql hgsql mm5 < axtInfoInserts.sql cd /cluster/data/mm5/bed rm blastz.rn3 ln -s blastz.rn3.2004-08-29 blastz.rn3 # BLASTZ RN3 CLEAN UP (RE-DONE - 2004-08-31 - Fan) ssh kksilo cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29 nice rm -rf raw nice rm axtChain/run1/chain/* # do the following later, after rn3-mm5 net and chain done. nice gzip {axt,psl}Chrom/* lav/*/* axtChain/{all.chain,*.net} & # The above line done on 9/7/04. Fan. : # CREATE CYTOBAND TRACK (DONE - 2004-09-7 - Fan) # Should be done after NCBI updated their MapViewer to the latest release. ssh hgwdev cd /cluster/data/mm5 mkdir cytoBand cd cytoBand # Get file from NCBI wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.33/ideogram.gz gunzip ideogram # Create bed file /cluster/bin/scripts/createNcbiCytoBand ideogram # Load the bed file hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm5 cytoBand cytoBand.bed # Make cytoBandIdeo track for ideogram gif on hgTracks page. # For mouse cytoBandIdeo is just a replicate of the cytoBand track. # Make the cytoBand track (above) and then: echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" | hgsql mm5 # REBUILD CYTOBAND TRACK (DONE - 2004-09-15 - Fan) # NCBI updated the ideogram.gz file and also changed its format, # added a new density field after stein. 
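# (After re-downloading ideogram.gz below, a quick look at the file is
# enough to confirm the extra density column before re-running
# createNcbiCytoBand, e.g.:)
# zcat ideogram.gz | head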
ssh hgwdev cd /cluster/data/mm5 mv cytoBand cytoBand.old mkdir cytoBand cd cytoBand # Get file from NCBI wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.33/ideogram.gz gunzip ideogram # Create bed file /cluster/bin/scripts/createNcbiCytoBand ideogram # Load the bed file hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm5 cytoBand cytoBand.bed # Make cytoBandIdeo track for ideogram gif on hgTracks page. # For mouse cytoBandIdeo is just a replicate of the cytoBand track. # First, drop the cytoBandIdeo table in mm5. # Make the cytoBand track (above) and then: echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"|hgsql mm5 # ADD MAP CONTIGS TRACK (DONE - 2004-09-07 - Fan) ssh hgwdev mkdir -p /cluster/data/mm5/bed/ctgPos cd /cluster/data/mm5/bed/ctgPos # hgCtgPos uses the lift files... but mouse lift files are for the # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs # from the assembly. (In the future, we should go with the NT's!) # So... just for this release, go straight from the seq_contig.md # to the table def'n: contig, size, chrom, chromStart, chromEnd cat << '_EOF_' > parseSeqContig.pl #!/usr/local/bin/perl -w use strict; while (<>) { if (/^\d+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(N[TC]_\d+)\s+(\S+)\s+contig\s+\S+\s+\S+\s*$/i) { my $chr=$1; my $start=$2; $start -= 1; my $end=$3; my $ctg=$5; if ($chr !~ /N/ ) { print "$ctg\t" . ($end-$start) . "\tchr$chr\t$start\t$end\n"; } } } '_EOF_' chmod +x parseSeqContig.pl ./parseSeqContig.pl ../../ncbi/seq_contig.md > ctgPos.tab hgsql mm5 < ~/kent/src/hg/lib/ctgPos.sql echo "load data local infile 'ctgPos.tab' into table ctgPos" | hgsql mm5 # Note: the info is there in seq_contig.md to also do the _random's, # but we'd have to do some more work: duplicate the gaps of 50000 between # contigs for all _random's except chrUn_random (1000 between). # featureBits mm5 ctgPos # 2557516950 bases of 2615483787 (97.784%) in intersection # featureBits mm4 ctgPos # 2554101163 bases of 2627444668 (97.209%) in intersection # featureBits mm3 ctgPos # 2500661074 bases of 2505900260 (99.791%) in intersection # RELOAD MAP CONTIGS TRACK (DONE - 2005-Mar-03 - Heather) # /cluster/data/mm5/ncbi/seq_contig.md contains more than just C57BL/6J. # Filter those out. ssh hgwdev cd /cluster/data/mm5/bed/ctgPos cp /cluster/data/mm5/ncbi/seq_contig.md . grep C57BL seq_contig.md > contig.C57BL # contig.C57BL has 41061 lines (252 lines fewer than seq_contig.md) ./parseSeqContig.pl contig.C57BL > ctgPosFiltered.tab # ctgPosFiltered.tab has 302 rows (227 fewer than ctgPos.tab) echo "delete from ctgPos" | hgsql mm5 echo "load data local infile 'ctgPosFiltered.tab' into table ctgPos" | hgsql mm5 # echo "update ctgPos set chrom = "chrM" where chrom = "chrMT" | hgsql mm5 # featureBits mm5 ctgPos # 2557064874 bases of 2615483787 (97.766%) in intersection # FUGU BLAT ALIGNMENTS (DONE 2004-09-08 Fan) ssh kk mkdir /cluster/data/mm5/bed/blatFr1 cd /cluster/data/mm5/bed/blatFr1 ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst ls -1S /scratch/mus/mm5/softNib/*.nib > mouse.lst cat << '_EOF_' > gsub #LOOP blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy mkdir psl gensub2 mouse.lst fugu.lst gsub spec para create spec para try, check, push, check, ... 
Completed: 24854 of 24854 jobs CPU time in finished jobs: 8215774s 136929.56m 2282.16h 95.09d 0.261 y IO & Wait Time: 1415723s 23595.39m 393.26h 16.39d 0.045 y Average job time: 388s 6.46m 0.11h 0.00d Longest job: 46761s 779.35m 12.99h 0.54d Submission to last job: 46761s 779.35m 12.99h 0.54d # Sort alignments: ssh kksilo cd /cluster/data/mm5/bed/blatFr1 pslCat -dir psl | pslSortAcc nohead chrom temp stdin # Processed 1116383 lines into 5 temp files # lift query side to Fugu browser chrUn coordinates liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl # load into database: ssh hgwdev cd /cluster/data/mm5/bed/blatFr1 hgLoadPsl -fastLoad -table=blatFr1 mm5 all.psl # Processing all.psl # load of blatFr1 did not go as planned: 1116383 record(s), 0 row(s) skipped, 1 warning(s) loading psl.tab # a record is already in trackDb as type xeno psl fr1, with colorChromDefault off # BLASTZ TETRAODON (tetNig1) (DONE, 2004-09-08, hartera) ssh kkr1u00 # blastz requires lineage-specific repeats # Treat all repeats as lineage-specific. mkdir -p /iscratch/i/mm5/linSpecRep.notInTetraodon foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/chr*.fa.out) cp -p $f /iscratch/i/mm5/linSpecRep.notInTetraodon/$f:t:r:r.out.spec end mkdir -p /iscratch/i/tetNig1/linSpecRep.notInMouse foreach f (/iscratch/i/tetNig1/rmsk/chr*.fa.out) cp -p $f /iscratch/i/tetNig1/linSpecRep.notInMouse/$f:t:r:r.out.spec end iSync ssh kksilo # more space on store8 than store6 mkdir -p /cluster/store8/mm5/blastz.tetNig1.2004-09-02 ln -s /cluster/store8/mm5/blastz.tetNig1.2004-09-02 \ /cluster/data/mm5/bed ln -s /cluster/data/mm5/bed/blastz.tetNig1.2004-09-02 \ /cluster/data/mm5/bed/blastz.tetNig1 ssh kk cd /cluster/data/mm5/bed/blastz.tetNig1 # use same parameters as for danRer1-mm5 cat << '_EOF_' > DEF # mouse (mm5) vs Tetraodon (tetNig1) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from hg16-fr1 and danRer1-hg17. BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse (mm5) SEQ1_DIR=/iscratch/i/mus/mm5/test SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInTetraodon SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Tetraodon (tetNig1) SEQ2_DIR=/iscratch/i/tetNig1/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/tetNig1/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastz.tetNig1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.mm5-tetNig1.2004-09-02 # setup cluster run # copy shell scripts for blastz runs if not there already cp -p /cluster/data/danRer1/jkStuff/BlastZ* /cluster/data/mm5/jkStuff/ # edit BlastZ_run0.sh # replace line 22: /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/ # this is the directory for the latest version of blastz-run # source the DEF file bash . ./DEF /cluster/data/mm5/jkStuff/BlastZ_run0.sh cd run.0 # check batch looks ok then para try, check, push, check, .... 
# para time # Completed: 19437 of 19437 jobs # CPU time in finished jobs: 4681483s 78024.71m 1300.41h 54.18d 0.148 y # IO & Wait Time: 176260s 2937.67m 48.96h 2.04d 0.006 y # Average job time: 250s 4.17m 0.07h 0.00d # Longest job: 790s 13.17m 0.22h 0.01d # Submission to last job: 5475s 91.25m 1.52h 0.06d # second cluster run to convert the .out's to .lav's ssh kki cd /cluster/data/mm5/bed/blastz.tetNig1 bash # if a csh/tcsh user . ./DEF /cluster/data/mm5/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # para time # Completed: 341 of 341 jobs # CPU time in finished jobs: 262s 4.37m 0.07h 0.00d 0.000 y # IO & Wait Time: 981s 16.35m 0.27h 0.01d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest job: 9s 0.15m 0.00h 0.00d # Submission to last job: 108s 1.80m 0.03h 0.00d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/mm5/bed/blastz.tetNig1 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /iscratch/i/mus/mm5/softNib \ /iscratch/i/tetNig1/nib stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/mm5/bed/blastz.tetNig1/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... # para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 41s 0.68m 0.01h 0.00d 0.000 y # IO & Wait Time: 414s 6.90m 0.12h 0.00d 0.000 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest job: 28s 0.47m 0.01h 0.00d # Submission to last job: 396s 6.60m 0.11h 0.00d # translate sorted axt files into psl ssh kolossus cd /cluster/data/mm5/bed/blastz.tetNig1 mkdir -p pslChrom set tbl = "blastzTetNig1" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.tetNig1/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl mm5 $f end # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1 -enrichment # refGene:cds 0.765%, blastzTetNig1 1.709%, both 0.519%, cover 67.80%, # enrich 39.67x # default with H=2000 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1H2000 -enrichment # refGene:cds 0.765%, blastzTetNig1H2000 1.239%, both 0.502%, cover 65.59%, # enrich 52.92x # blastzDanRer1 with L=8000 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1L8k -enrichment # refGene:cds 0.765%, blastzTetNig1L8k 1.333%, both 0.444%, cover 58.05%, # enrich 43.56x # too much drop in coverage # H=2000, L=4000 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1H2kL4k -enrichment # refGene:cds 0.765%, blastzTetNig1H2kL4k 1.166%, both 0.489%, cover 63.91%, # enrich 54.81x # H=2000, L=6000 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1H2kL6k -enrichment # refGene:cds 0.765%, blastzTetNig1H2kL6k 1.014%, both 0.437%, cover 57.15%, # enrich 56.36x # too much drop in coverage # number of rows in table # blastzTetNig1 38196 # blatzTetNig1H2000 38314 # blastzTetNig1L8k 24749 # blastzTetNig1H2kL4k 31433 # blastzTetNig1H2kL6k 21389 # use blastzTetNig1 as this has the best coverage. enrich is quite high too. 
# featureBits -chrom=chr1 hg17 refGene:cds blastzFr1 -enrichment # refGene:cds 1.246%, blastzFr1 2.319%, both 0.833%, cover 66.87%, enrich 28.83x # similar coverage to blastzFr1 for hg17 # RESCORE TETNIG1 BLASTZ (DONE, 2004-09-08, hartera) # Low scores can occur with repeats abridged and using the # HoxD55.q matrix. PSU's restore_rpts program rescored alignments # with the default matrix instead of the BLASTZ_Q matrix. # Rescore them here so the chainer sees the higher scores: ssh kolossus cd /cluster/data/mm5/bed/blastz.tetNig1 mkdir axtChrom.rescore foreach f (axtChrom/chr*.axt) axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \ $f axtChrom.rescore/$f:t end mv axtChrom axtChrom.orig mv axtChrom.rescore axtChrom # CHAIN TETRAODON (TETNIG1) BLASTZ (DONE, 2004-09-08, hartera) # Re do chains with rescored blastz Hg17 # Run axtChain on little cluster ssh kki cd /cluster/data/mm5/bed/blastz.tetNig1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.tetNig1/axtChrom/*.axt \ > input.lst # Reuse gap penalties from hg16 vs chicken run. cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize^V 11 smallSize^V 111 position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111 qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000 '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' cat << '_EOF_' > doChain #!/bin/csh axtChain -linearGap=../../chickenHumanTuned.gap $1 \ /iscratch/i/mus/mm5/softNib \ /iscratch/i/tetNig1/nib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 524s 8.74m 0.15h 0.01d 0.000 y # IO & Wait Time: 140s 2.33m 0.04h 0.00d 0.000 y # Average job time: 15s 0.26m 0.00h 0.00d # Longest job: 25s 0.42m 0.01h 0.00d # Submission to last job: 632s 10.53m 0.18h 0.01d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's,try also with larger bin size. 
foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # not a large amount of changes with score < 5000 # load chr1 into database to check ssh hgwdev cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain/chain hgLoadChain mm5 chr1_chainTetNig1 chr1.chain # featureBits -chrom=chr1 mm5 refGene:cds chainTetNig1Link -enrichment # refGene:cds 0.765%, chainTetNig1Link 1.563%, both 0.512%, cover 66.84%, # enrich 42.76x # try filtering with minScore=5000 ssh kksilo cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain chainSplit chainFilt5k all.chain ssh hgwdev cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain/chainFilt5k hgLoadChain mm5 chr1_chainTetNig1Filt5k chr1.chain # featureBits -chrom=chr1 mm5 refGene:cds chainTetNig1Filt5kLink -enrichment # refGene:cds 0.765%, chainTetNig1Filt5kLink 1.398%, both 0.504%, cover 65.91%, # enrich 47.13x # chr1_chainTetNig1 21782 # chr1_chainTetNig1Filt5k 9670 # loses very little in coverage so use filtering with minScore=5000 # remove chain rm -r chain mv chainFilt5k chain rm all.chain.unfiltered ssh hgwdev # remove test tables hgsql -e "drop table chr1_chainTetNig1Filt5k;" mm5 hgsql -e "drop table chr1_chainTetNig1Filt5kLink;" mm5 # load chains into database cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm5 ${c}_chainTetNig1 $i echo done $c end # NET TETRAODON (tetNig1) BLASTZ (DONE, 2004-09-08, hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. 
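# (Optional check, a minimal sketch assuming the layout above: confirm that
#  chainNet wrote a net for every chain before they are combined with
#  netSyntenic below.)
bash
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
for f in chain/*.chain; do
    c=`basename $f .chain`
    [ -s n1/$c.net ] || echo "no net produced for $c"
done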
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 69083136, utime 402 s/100, stime 37 # Add classification info using db tables: cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/mm5/linSpecRep.notInTetraodon mkdir -p /cluster/bluearc/tetNig1/linSpecRep.notInMouse cp /iscratch/i/mm5/linSpecRep.notInTetraodon/* \ /cluster/bluearc/mm5/linSpecRep.notInTetraodon cp /iscratch/i/tetNig1/linSpecRep.notInMouse/* \ /cluster/bluearc/tetNig1/linSpecRep.notInMouse ssh hgwdev cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain # there is no ancient repeats table for rodent vs fish so use -noAr flag time netClass noClass.net mm5 tetNig1 tetNig1.net \ -tNewR=/cluster/bluearc/mm5/linSpecRep.notInTetraodon \ -qNewR=/cluster/bluearc/tetNig1/linSpecRep.notInMouse -noAr # 59.490u 37.630s 2:41.82 60.0% 0+0k 0+0io 216pf+0w netFilter -minGap=10 tetNig1.net | hgLoadNet mm5 netTetNig1 stdin # featureBits mm5 refGene:cds netTetNig1 -enrichment # refGene:cds 0.921%, netTetNig1 23.633%, both 0.725%, cover 78.70%, # enrich 3.33x # MAKE VSTETNIG1 DOWNLOADABLES (DONE, 2004-09-10, hartera) ssh kksilo # zip chains and nets cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain cp all.chain tetNig1.chain zip -j /cluster/data/mm5/zip/tetNig1.chain.zip tetNig1.chain rm tetNig1.chain zip -j /cluster/data/mm5/zip/tetNig1.net.zip tetNig1.net ssh hgwdev # copy chains and nets to downloads area set gp = /usr/local/apache/htdocs/goldenPath/mm5 mkdir -p $gp/vsTetNig1 cd $gp/vsTetNig1 mv /cluster/data/mm5/zip/tetNig1*.zip . md5sum *.zip > md5sum.txt # move axt files to downloads area and zip cd /cluster/data/mm5/bed/blastz.tetNig1/axtChrom mkdir -p $gp/vsTetNig1/axtChrom cp -p *.axt $gp/vsTetNig1/axtChrom cd $gp/vsTetNig1/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # MAKE VSDANRER1 DOWNLOADABLES (DONE, 2004-09-10, hartera) ssh kksilo # zip chains and nets cd /cluster/data/mm5/bed/blastz.danRer1/axtChain gunzip all.chain.gz cp all.chain danRer1.chain zip -j /cluster/data/mm5/zip/danRer1.chain.zip danRer1.chain rm danRer1.chain gunzip danRer1.net.gz zip -j /cluster/data/mm5/zip/danRer1.net.zip danRer1.net ssh hgwdev # copy chains and nets to downloads area set gp = /usr/local/apache/htdocs/goldenPath/mm5 mkdir -p $gp/vsDanRer1 cd $gp/vsDanRer1 mv /cluster/data/mm5/zip/danRer1*.zip . md5sum *.zip > md5sum.txt # move axt files to downloads area and zip cd /cluster/data/mm5/bed/blastz.danRer1/axtChrom mkdir -p $gp/vsDanRer1/axtChrom cp -p *.axt $gp/vsDanRer1/axtChrom cd $gp/vsDanRer1/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # add the axtNet *.axt in blastz.danRer1/axtNet cd /cluster/data/mm5/bed/blastz.danRer1/axtNet set gp = /usr/local/apache/htdocs/goldenPath/mm5 mkdir -p $gp/vsDanRer1/axtNet nice cp -p *.axt $gp/vsDanRer1/axtNet cd $gp/vsDanRer1/axtNet nice gzip *.axt md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # BLASTZ TETNIG1 CLEAN UP (DONE, 2004-09-10, hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.tetNig1 nice rm -rf raw & nice rm -rf lav & nice rm -rf axtChrom.orig & nice rm axtChain/run1/chain/* & nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} & # SGP GENES (REDONE 5/24/05 angie) # Originally loaded 9/17/04; user noticed chrX was missing; IMIM folks # regenerated & we reloaded. 
ssh kksilo mkdir /cluster/data/mm5/bed/sgp cd /cluster/data/mm5/bed/sgp foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genome.imim.es/genepredictions/M.musculus/mmMay2004/SGP/humangp200405/$chr.gtf wget http://genome.imim.es/genepredictions/M.musculus/mmMay2004/SGP/humangp200405/$chr.prot end # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf cp /dev/null sgpPep.fa foreach f (chr*.prot) nice perl -wpe 's/^(>chr\S+)/$1.1/' $f >> sgpPep.fa end ssh hgwdev cd /cluster/data/mm5/bed/sgp ldHgGene -gtf -genePredExt mm5 sgpGene chr*.gtf hgPepPred mm5 generic sgpPep sgpPep.fa # SGP GENES (UPDATE 1/18/2006) sgpPep table dropped, replaced by hgc generated protein seq in browser # MAKE mm5-hg17 OVER.CHAIN FOR LIFTOVER (DONE 2004-09-24 braney) ssh kolossus mkdir -p /cluster/data/mm5/bed/bedOver/mm5Tohg17 cd /cluster/data/mm5/bed/bedOver/mm5Tohg17 set chainDir = /cluster/data/mm5/bed/blastz.hg17/axtChain netSplit $chainDir/human.net net mkdir subset foreach f ($chainDir/chain/*.chain) echo subsetting $f:t:r netChainSubset net/$f:t:r.net $f subset/$f:t end cat subset/*.chain > /cluster/data/mm5/bed/bedOver/mm5Tohg17.chain hgAddLiftOverChain -multiple mm5 hg17 # miRNA track (DONE - 2004-09-30 - Fan) # data from: Sam Griffiths-Jones # and Michel.Weber@ibcg.biotoul.fr # notify them when done. cd /cluster/data/mm5/bed mkdir miRNA cd miRNA wget --timestamping \ ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/mmu.bed grep -v "tion" mmu.bed | sed -e "s/ /\t/g" > mm5.bed # check previous release track before update nice featureBits mm4 miRNA # 17782 bases of 2627444668 (0.001%) in intersection hgLoadBed mm5 miRNA mm5.bed # entry in trackDb/trackDb.ra already there # and verify similar numbers after: nice featureBits mm5 miRNA # 17957 bases of 2615483787 (0.001%) in intersection # BLASTZSELF Done (Tue Oct 19 18:06:45 PDT 2004) sugnet # blastzSelf run for mm5. This took about a week due to # being busy with other things and some crashed jobs in a # few places. Think all of the instructions ended up here. # based off of Hiram's instructions for blastzSelf in hg16 & hg17 mkdir -p /cluster/store6/mm5/bed/blastzSelf cd /cluster/store6/mm5/bed/blastzSelf # Create the definitions file. cat << '_EOF_' > DEF # mouse vs. mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Mouse SEQ1_DIR=/scratch/mus/mm5/softNib # RMSK not currently used SEQ1_RMSK=/scratch/mus/mm5/rmsk # FLAG not currently used SEQ1_FLAG=-rodent SEQ1_SMSK=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInMouse SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Mouse SEQ2_DIR=/scratch/mus/mm5/softNib # RMSK not currently used SEQ2_RMSK=/scratch/mus/mm5/rmsk # FLAG not currently used SEQ2_FLAG=-rodent SEQ2_SMSK=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastzSelf DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy ssh kk cd /cluster/store6/mm5/bed/blastzSelf /cluster/data/hg17/jkStuff/BlastZ_run0.sh cd run.0 para try, push, check # on mini-cluster, otherwise I/O gets very demanding.... 
ssh kki
cd /cluster/store6/mm5/bed/blastzSelf
mkdir -p run.1
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > run.1/jobList
cd run.1
wc -l jobList
# 341 jobList
head jobList
para create jobList
para try

# Third cluster run to convert lav's to axt's
mkdir run.2
cd run.2
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/data/mm5/bed/blastzSelf/lav/$(root1) {check out line+ /cluster/data/mm5/bed/blastzSelf/axtChrom/$(root1).axt} /scratch/mus/mm5/softNib /scratch/mus/mm5/softNib
#ENDLOOP
'_EOF_'
ls -1S /cluster/data/mm5/bed/blastzSelf/lav > chrom.list
gensub2 chrom.list single gsub jobList
para create jobList
para push
# This seems to beat up on the file server a little, load up to 56 on kksilo
# Number of jobs died, unsure why. Try them on kksilo:
ssh kksilo
cat << '_EOF_' > doStragglers.csh
#!/bin/tcsh
cd /cluster/store6/mm5/bed/blastzSelf
set base=/cluster/data/mm5/bed/blastzSelf
set seq1_dir=/cluster/data/mm5/nib
set seq2_dir=/cluster/data/mm5/nib
foreach c (lav/chr17 lav/chr2 lav/chr3 lav/chr7 lav/chrUn_random lav/chrX lav/chrY)
    echo "Doing $c"
    pushd $c
    set chr=$c:t
    set out=axtChrom/$chr.axt
    echo "Translating $chr lav to $out"
    foreach d (*.lav)
        set smallout=$d.axt
        lavToAxt $d $seq1_dir $seq2_dir stdout \
          | axtDropSelf stdin stdout \
          | axtSort stdin $smallout
    end
    cat `ls -1 *.lav.axt | sort -g` > $base/$out
    popd
end
'_EOF_'
# Need to drop overlaps to eliminate diagonals
foreach f (axtChrom/chr*.axt)
    set c=$f:t:r
    echo "doing $c"
    /cluster/bin/i386/axtDropOverlap axtChrom/$c.axt chromSizes.tab chromSizes.tab \
        /cluster/store6/mm5/bed/blastzSelf/axtChromDropped/$c.axt
    echo "Done: $c"
end
cd axtChromDropped
gzip *.axt
# Translate to psls
cd /cluster/data/mm5/bed/blastzSelf
mkdir pslChrom
set tbl = "blastzSelf"
foreach f (axtChrom/chr*.axt)
    set c=$f:t:r
    echo "Processing chr $c"
    zcat /cluster/data/mm5/bed/blastzSelf/axtChromDropped/${c}.axt.gz | \
        /cluster/bin/i386/axtToPsl stdin S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load files into the database
/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 *_blastzSelf.psl
# end BLASTZSELF

# CREATE kgSpAlias TABLE FOR PB (Done 10/20/04)
hgsql mm5 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql mm5 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >mm5.kgSpAlias.tab
rm j.tmp
hgsql mm5 -e 'drop table kgSpAlias';
hgsql mm5 < ~/src/hg/lib/kgSpAlias.sql
hgsql mm5 -e 'load data local infile "mm5.kgSpAlias.tab" into table kgSpAlias'

# ECGENE TRACK (DONE, 2004-10-29, hartera)
ssh kksilo
mkdir -p /cluster/data/mm5/bed/ECgene.2004-10-29
ln -s /cluster/data/mm5/bed/ECgene.2004-10-29 \
    /cluster/data/mm5/bed/ECgene
cd /cluster/data/mm5/bed/ECgene
wget \
    "http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_mm5_low_gene.txt.gz"
wget \
    "http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_mm5_low_pep.txt.gz"
gunzip *.gz
# load database
ssh hgwdev
cd /cluster/data/mm5/bed/ECgene
ldHgGene -predTab mm5 ECgene v1.2_mm5_low_gene.txt
# 343337 gene predictions
hgPepPred mm5 tab ECgenePep v1.2_mm5_low_pep.txt
rm *.tab
nice gzip *.txt

## NIA Mouse Gene Index - (DONE - 2004-11-16 Fan)
# requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
# pick up data
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/NIAGene
cd /cluster/data/mm5/bed/NIAGene
wget --timestamp http://lgsun.grc.nia.nih.gov/temp/NIA-Mouse-GeneIndex4-Transcript-to-Genome.psl
wget --timestamping \
http://lgsun.grc.nia.nih.gov/temp/NIA-Mouse-GeneIndex4-Transcripts.fasta hgLoadPsl mm5 -table=NIAGene NIA-Mouse-GeneIndex4-Transcript-to-Genome.psl mkdir /gbdb/mm5/NIAGene ln -s /cluster/data/mm5/bed/NIAGene/NIA-Mouse-GeneIndex4-Transcripts.fasta \ /gbdb/mm5/NIAGene/NIA-Mouse-GeneIndex4-Transcripts.fasta hgLoadSeq mm5 /gbdb/mm5/NIAGene/NIA-Mouse-GeneIndex4-Transcripts.fasta Added and edited NIAGene.html and trackDb.ra under kent/src/hg/makeDb/trackDb/mouse/mm5 # CREATE jaxQTL3 (MOUSE QTL) TRACK (DONE - 2004-11-18 Fan) cd /cluster/data/mm5/bed mkdir qtl.2004-11-08 ln -s qtl.2004-11-08 qtl cd qtl # Get the raw data file, mouse_qtl_100804.txt, sent by Carol Bult [cjb@informatics.jax.org]. hgsql mm5 -e 'drop table jaxQtlRaw' hgsql mm5 < ~/src/hg/lib/jaxQtlRaw.sql hgsql mm5 -e 'load data local infile "mouse_qtl_100804.txt" into table jaxQtlRaw ignore 1 lines' # Make sure hgJaxQtl binary executable exist. hgJaxQtl is under ~/src/hg/hgJaxQtl hgJaxQtl mm5 wc jaxQTL3.tab # 981 15310 105164 jaxQTL3.tab hgLoadBed -nobin -tab -sqlTable=$HOME/src/hg/lib/jaxQTL3.sql mm5 jaxQTL3 jaxQTL3.tab # TWINSCAN (DONE 11/29/04 angie) ssh kksilo mkdir /cluster/data/mm5/bed/twinscan cd /cluster/data/mm5/bed/twinscan foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genes.cs.wustl.edu/predictions/mouse/mm5_11-24-04/chr_gtf/$chr.gtf wget http://genes.cs.wustl.edu/predictions/mouse/mm5_11-24-04/chr_ptx/$chr.ptx end # Add '.a' to end of protein fasta id's, to match gtf transcript_id's: perl -wpe 's/^(>\S+).*/$1.a/' *.ptx > twinscanPep.fa # load. ssh hgwdev cd /cluster/data/mm5/bed/twinscan ldHgGene -gtf -genePredExt mm5 twinscan chr*.gtf hgPepPred mm5 generic twinscanPep twinscanPep.fa featureBits -enrichment mm5 refGene twinscan #refGene 1.551%, twinscan 1.245%, both 0.783%, cover 50.46%, enrich 40.52x # Create mm5GeneList.html (to be used by Google). # This step was done 12/08/04. cd /cluster/data/mm5/bed mkdir geneList cd geneList wget -O mm5GeneList.html "http://hgwdev-fanhsu.cse.ucsc.edu/cgi-bin/hgGeneList?db=mm5" cp -p mm5GeneList.html /usr/local/apache/htdocs/goldenPath # Check this html file into CVS. # BLASTZ ZEBRAFISH (danRer2) (DONE, 2004-12-12, hartera) ssh kkr1u00 # blastz requires lineage-specific repeats # Treat all repeats as lineage-specific. 
# this directory of mouse repeats exists already mkdir -p /iscratch/i/mm5/linSpecRep.notInZebrafish foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/chr*.fa.out) cp -p $f /iscratch/i/mm5/linSpecRep.notInZebrafish/$f:t:r:r.out.spec end mkdir -p /iscratch/i/danRer2/linSpecRep.notInMouse foreach f (/iscratch/i/danRer2/rmsk/chr*.fa.out) cp -p $f /iscratch/i/danRer2/linSpecRep.notInMouse/$f:t:r:r.out.spec end iSync ssh kk mkdir -p /cluster/data/mm5/bed/blastz.danRer2.2004-12-10 ln -s /cluster/data/mm5/bed/blastz.danRer2.2004-12-10 \ /cluster/data/mm5/bed/blastz.danRer2 cd /cluster/data/mm5/bed/blastz.danRer2 # use same parameters as for danRer[1|2]-hg17 and for hg16-fr1 and mm5-danRer1 # and similar to those originally used for hg17-galGal2 cat << '_EOF_' > DEF # mouse (mm5) vs zebrafish (danRer2) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer1 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse (mm5) SEQ1_DIR=/cluster/bluearc/scratch/mus/mm5/softNib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInZebrafish SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer2) SEQ2_DIR=/iscratch/i/danRer2/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/danRer2/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm5/bed/blastz.danRer2 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.mm5-danRer2.2004-12-10 # setup cluster run # copy shell scripts for blastz runs if not there already cp -p /cluster/data/danRer1/jkStuff/BlastZ* /cluster/data/mm5/jkStuff/ # edit BlastZ_run0.sh # replace line 22: /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/ # this is the directory for the latest version of blastz-run # source the DEF file bash . ./DEF /cluster/data/mm5/jkStuff/BlastZ_run0.sh cd run.0 # check batch looks ok then para try, check, push, check, .... # para time # Completed: 58993 of 58993 jobs # CPU time in finished jobs: 17513361s 291889.35m 4864.82h 202.70d 0.555 y # IO & Wait Time: 1506128s 25102.13m 418.37h 17.43d 0.048 y # Average job time: 322s 5.37m 0.09h 0.00d # Longest job: 2552s 42.53m 0.71h 0.03d # Submission to last job: 50001s 833.35m 13.89h 0.58d # output is 864M # second cluster run to convert the .out's to .lav's ssh kki cd /cluster/data/mm5/bed/blastz.danRer2 bash # if a csh/tcsh user . ./DEF /cluster/data/mm5/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# para time # Checking finished jobs # Completed: 341 of 341 jobs # CPU time in finished jobs: 689s 11.48m 0.19h 0.01d 0.000 y # IO & Wait Time: 1305s 21.76m 0.36h 0.02d 0.000 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest job: 14s 0.23m 0.00h 0.00d # Submission to last job: 250s 4.17m 0.07h 0.00d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/mm5/bed/blastz.danRer2 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /cluster/bluearc/scratch/mus/mm5/softNib \ /iscratch/i/danRer2/nib stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/mm5/bed/blastz.danRer2/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... # para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 82s 1.37m 0.02h 0.00d 0.000 y # IO & Wait Time: 1429s 23.82m 0.40h 0.02d 0.000 y # Average job time: 35s 0.59m 0.01h 0.00d # Longest job: 91s 1.52m 0.03h 0.00d # Submission to last job: 1421s 23.68m 0.39h 0.02d # translate sorted axt files into psl ssh kolossus cd /cluster/data/mm5/bed/blastz.danRer2 mkdir -p pslChrom set tbl = "blastzDanRer2" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/mm5/bed/blastz.danRer2/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl mm5 $f end # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1 -enrichment #refGene:cds 0.763%,blastzDanRer1 2.918%,both 0.512%,cover 67.12%,enrich 23.00x # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer2 -enrichment # refGene:cds 0.780%, blastzDanRer2 2.816%, both 0.529%, cover 67.89%, # enrich 24.11x # RESCORE DANRER2 BLASTZ ALIGNMENTS (DONE, 2004-12-12, hartera) # Low scores can occur with repeats abridged and using the # HoxD55.q matrix. PSU's restore_rpts program rescored alignments # with the default matrix instead of the BLASTZ_Q matrix. 
# Rescore them here so the chainer sees the higher scores: ssh kolossus cd /cluster/data/mm5/bed/blastz.danRer2 mkdir axtChrom.rescore foreach f (axtChrom/chr*.axt) axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \ $f axtChrom.rescore/$f:t end mv axtChrom axtChrom.orig mv axtChrom.rescore axtChrom # psl files and blastz tables will be the same regardless of score so # no need to reload # CHAIN ZEBRAFISH (danRer2) BLASTZ (DONE, 2004-12-13, hartera) # APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE PRIMARILY THE RESULTS OF # REPEATS AND DEGENERATE DNA (DONE, 2004-12-22, hartera) # Make chains with rescored blastz danRer2 # Run axtChain on little cluster ssh kki cd /cluster/data/mm5/bed/blastz.danRer2 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/mm5/bed/blastz.danRer2/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Make our own linear gap file with reduced gap penalties, # in hopes of getting longer chains: cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize^V 11 smallSize^V 111 position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111 qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000 '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=../../chickenHumanTuned.gap $1 \ /cluster/bluearc/scratch/mus/mm5/softNib \ /iscratch/i/danRer1/nib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 1797s 29.95m 0.50h 0.02d 0.000 y # IO & Wait Time: 575s 9.59m 0.16h 0.01d 0.000 y # Average job time: 55s 0.92m 0.02h 0.00d # Longest job: 133s 2.22m 0.04h 0.00d # Submission to last job: 514s 8.57m 0.14h 0.01d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer2/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's,try also with smaller bin size. foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # filter on minScore = 5000 mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain.filt5k # remove old chains rm -r chain chainSplit chain all.chain.filt5k # remove repeats from chains and reload into database # (2004-12-22, hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer2/axtChain mv chain chainRaw mkdir chain cd chainRaw foreach f (*.chain) set c = $f:r echo $c nice chainAntiRepeat /cluster/bluearc/scratch/mus/mm5/softNib \ /cluster/bluearc/danRer2/nib $f \ ../chain/$c.chain end cd .. 
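# (Optional, a minimal sketch using the chainRaw/ and chain/ directories above:
#  record how many chains chainAntiRepeat dropped per chromosome before merging.)
bash
cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
for f in chainRaw/*.chain; do
    c=`basename $f`
    echo $c `grep -c "^chain" chainRaw/$c` `grep -c "^chain" chain/$c`
done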
chainMergeSort ./chain/*.chain > all.chain.antirepeat chainSplit chainAR all.chain.antirepeat # load filtered chains with chains removed that are mostly due to repeats ssh hgwdev cd /cluster/data/mm5/bed/blastz.danRer2/axtChain/chainAR foreach i (*.chain) set c = $i:r hgLoadChain mm5 ${c}_chainDanRer2 $i echo done $c end # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2 -enrichment # refGene:cds 0.780%, chainDanRer2 22.478%, both 0.604%, cover 77.48%, # enrich 3.45x # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2Link -enrichment # refGene:cds 0.780%, chainDanRer2Link 2.164%, both 0.526%, cover 67.43%, # enrich 31.17x # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1 -enrichment # refGene:cds 0.780%, chainDanRer1 20.053%, both 0.593%, cover 75.99%, # enrich 3.79x # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Link -enrichment # refGene:cds 0.780%, chainDanRer1Link 2.022%, both 0.512%, cover 65.64%, # enrich 32.47x # after chainAntiRepeat: # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2Link -enrichment # refGene:cds 0.785%, chainDanRer2Link 2.058%, both 0.530%, cover 67.53%, # enrich 32.81x # NET ZEBRAFISH (danRer2) BLASTZ (DONE, 2004-12-13, hartera) # RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-22,hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer2/axtChain rm -r preNet mkdir preNet cd chainAR foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 105357312, utime 632 s/100, stime 117 # Add classification info using db tables: cd /cluster/data/mm5/bed/blastz.danRer2/axtChain # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish mkdir -p /cluster/bluearc/danRer2/linSpecRep.notInMouse cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \ /cluster/bluearc/mm5/linSpecRep.notInZebrafish cp /iscratch/i/danRer2/linSpecRep.notInMouse/* \ /cluster/bluearc/danRer2/linSpecRep.notInMouse ssh hgwdev cd /cluster/data/mm5/bed/blastz.danRer2/axtChain time netClass noClass.net mm5 danRer2 zfishdanRer2.net \ -tNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish \ -qNewR=/cluster/bluearc/danRer2/linSpecRep.notInMouse -noAr # 87.010u 56.100s 5:15.16 45.4% 0+0k 0+0io 207pf+0w netFilter -minGap=10 zfishdanRer2.net | hgLoadNet mm5 netDanRer2 stdin # featureBits mm5 refGene:cds netDanRer2 -enrichment # refGene:cds 0.938%, netDanRer2 21.447%, both 0.714%, cover 76.17%, # enrich 3.55x # featureBits mm5 refGene:cds netDanRer1 -enrichment # refGene:cds 0.938%, netDanRer1 19.993%, both 0.702%, cover 74.87%, # enrich 3.74x # after chainAntiRepeat: # featureBits mm5 refGene:cds netDanRer2 -enrichment # refGene:cds 0.942%, netDanRer2 21.161%, both 0.717%, cover 76.14%, # enrich 3.60x # add trackDb.ra entries and html for details pages # TIGR GENE INDEX (DONE 2004-12-13 Fan) mkdir -p /cluster/data/mm5/bed/tigr cd /cluster/data/mm5/bed/tigr wget ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/TGI_track_MouseGenome_mm5_05-2004.tgz tar xvzf TGI*.tgz foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (mouse cow human pig rat) echo $o 
setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ssh hgwdev cd /cluster/data/mm5/bed/tigr hgsql mm5 -e "drop table tigrGeneIndex" hgsql mm5 < ~/kent/src/hg/lib/tigrGeneIndex.sql foreach f (*.gff) echo Processing $f ... /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC mm5 tigrGeneIndex $f hgsql mm5 -e "select count(*) from tigrGeneIndex" end # Total of 354491 entries created in tigrGeneIndex table. hgsql mm5 -e "update tigrGeneIndex set cdsStart = txStart;" hgsql mm5 -e "update tigrGeneIndex set cdsEnd = txEnd;" checkTableCoords mm5 tigrGeneIndex gzip *.gff *TCs # TIGR GENE INDEX (RE-DONE 2004-12-21 Fan) # This track is re-done due to an error (no strand info) in the original files provided by TIGR. cd /cluster/data/mm5/bed mv tigr tigr_old_wrong mkdir -p /cluster/data/mm5/bed/tigr cd /cluster/data/mm5/bed/tigr wget --timestamp ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/TGI_track_MouseGenome_mm5_12-2004.tgz tar xvzf TGI*.tgz foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (mouse cow human pig rat) echo $o setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ssh hgwdev cd /cluster/data/mm5/bed/tigr hgsql mm5 -e "drop table tigrGeneIndex" hgsql mm5 < ~/kent/src/hg/lib/tigrGeneIndex.sql foreach f (*.gff) echo Processing $f ... /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC mm5 tigrGeneIndex $f hgsql mm5 -e "select count(*) from tigrGeneIndex" end # Total of 385814 entries created in tigrGeneIndex table. hgsql mm5 -e "update tigrGeneIndex set cdsStart = txStart;" hgsql mm5 -e "update tigrGeneIndex set cdsEnd = txEnd;" checkTableCoords mm5 tigrGeneIndex gzip *.gff *TCs #### LOAD ENSEMBL GENES (DONE - 2004-12-17 Fan) # ADDDED STABLE URL TO TRACKDB BLOCK (V27, DEC 2004) (2008-01-11, rhead) # needed for Gene Sorter procedure below # Ensembl released Mouse build 33 the week of Dec 4 2004 mkdir /cluster/data/mm5/bed/ensembl cd /cluster/data/mm5/bed/ensembl Get the ensembl gene data from http://www.ensembl.org/ Go to the EnsMart link Choose Mus musculus as the organism Follow this sequence through the pages: Page 1) Choose the Ensembl Genes choice. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. Page 3) Choose the "Structures" tab. Page 4) Choose GTF as the ouput, choose gzip compression , name the output file ensGeneMm5.gtf.gz and then hit Export # Ensembl handles random chromosomes differently than us, so we # strip this data. Fortunately it just loses a couple of genes. zcat ensGene.gtf.gz | grep -v ^6_DR51 | grep -v _NT_ > unrandom.gtf # Let's see how much it loses: # None. 
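# (Optional, a minimal sketch to make the "how much it loses" check concrete;
#  file names follow the zcat above.  Counts distinct transcript ids before
#  and after the random-contig filtering.)
zcat ensGene.gtf.gz | sed -n 's/.*transcript_id "\([^"]*\)".*/\1/p' | sort -u | wc -l
sed -n 's/.*transcript_id "\([^"]*\)".*/\1/p' unrandom.gtf | sort -u | wc -l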
# Add "chr" to front of each line in the gene data gtf file to make # it compatible with ldHgGene sed -e "s/^/chr/" unrandom.gtf > ensGene.gtf # (should also fixup chrMT name here too - 2005-02-28 - Hiram) # sed -e "s/^/chr/" unrandom.gtf | sed -e "s/chrMT/chrM/" > ensGene.gtf ldHgGene mm5 ensGene ensGene.gtf # Read 31035 transcripts in 551352 lines in 1 files # 31035 groups 22 seqs 1 sources 4 feature types # 31035 gene predictions # save space, gzip them: gzip unrandom.gtf gzip ensGene.gtf # The name on chrM was incorrect, fixed (2005-02-28 - Hiram) hgsql mm5 -e 'update ensGene set chrom="chrM" where chrom="chrMT";' # Load Ensembl peptides: Get the ensembl protein data from http://www.ensembl.org/ Go to the EnsMart link Choose Mus musculus as the organism Follow this sequence through the pages: Page 1) Choose the Ensembl Genes choice. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. Page 3) Choose the "Sequences" tab. Page 4) Choose Transcripts/Proteins and peptide Only as the output, choose text/fasta and gzip compression, name the file ensGeneMm5.pep.gz and then hit export. #delete * at end of each protein bash zcat ensGeneMm5.pep.gz | sed "s/\*$//" > ensembl.pep ~matt/bin/fixPep.pl ensembl.pep fixPep_ensembl.pep hgPepPred mm5 generic ensPep fixPep_ensembl.pep # # The chrMT (chrM) peptides as obtained via EnsMart have only # aa's of: X (2005-02-28 - Hiram) # These 13 peptides were fixed up manually by fetching each # one individually by following the 13 links from our browser # to the ensemble protein, asking it to dump the protein # sequence, cut and paste that answer to a local file. # The 13 peptides were dropped from ensPep table via: hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082392.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082396.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082402.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082405.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082407.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082408.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082409.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082411.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082413.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082414.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082418.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082419.1";' hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082421.1";' # Then explicitly reloaded with SQL statements such as: INSERT into ensPep (name, seq) VALUES ('ENSMUST00000082407.1', 'MPQLDTSTWFITIISSMITLFILFQLKVSSQTFPLAPSPKSLTTMKVKTPWELKWTKIYLPHSLPQQ'); # The 13 SQL statements were left in the file: # /cluster/data/mm5/bed/ensembl/chrMPep.sql # loaded via: hgsql mm5 < chrMPep.sql # The following files were "touched" on the RR/MGC after the chrMT/M # change to prevent false errors with joinerCheck. J.Jackson 2005-03-01 # mm5.superfamily.name # mm5.ensGtp.transcript # mm5.ensPep.name # mm5.knownToEnsembl.value # mm5.sfDescription.name # Load ensGtp table. # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" tab. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format, gzip. 
#    Name the result file ensGtpMm5.tab.gz
gunzip ensGtpMm5.tab.gz
hgsql mm5 < ~/kent/src/hg/lib/ensGtp.sql
hgsql -N -e 'load data local infile "ensGtpMm5.tab" into table ensGtp ignore 1 lines;' mm5
# Create knownToEnsembl column
hgMapToGene mm5 ensGene knownGene knownToEnsembl
# Compress everything to save space
gzip *.tab
gzip *.pep

#### RE-BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-11-17 - Fan)
# PLEASE NOTE THAT THE ENSEMBLXREF3 TABLE IS RE-BUILT USING ENSMART DATA OF MOUSE BUILD 33.
# THIS TABLE IS NEEDED TO SUPPORT SUPERFAMILY TRACK OF THE PROTEOME BROWSER.
# Get the ensembl gene/protein cross-reference data from
# http://www.ensembl.org/Multi/martview?species=Mus_musculus
# Follow this sequence through the pages:
# Page 1) Make sure that the Mus musculus choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Feature" box, select Ensembl gene, transcript, and peptide IDs,
#         SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
# Page 4) Choose "Text, tab separated". Choose gzip compression. Hit export.
#         Save as ensXref
zcat ensXref.tsv.gz|sed -e 's/\./\t/g' > ensemblXref3.tab
hgsql mm5 -e "drop table ensemblXref3"
hgsql mm5 < ~/src/hg/lib/ensemblXref3.sql
hgsql mm5 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines'

# CREATE SUPERFAMILY TRACK (DONE 2004-12-17 - Fan)
mkdir /cluster/data/mm5/bed/superfamily
cd /cluster/data/mm5/bed/superfamily
hgSuperfam mm5 superfam041128 > sf.log
wc *
# It is normal that many proteins do not have corresponding Superfamily entries.
# Load the sfDescription table.
hgsql mm5 < ~/src/hg/lib/sfDescription.sql
hgsql mm5 -e 'LOAD DATA local INFILE "sfDescription.tab" into table mm5.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed mm5 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
cat /cluster/data/superfamily/041128/ass_28-Nov-2004.tab | hgKnownToSuper mm5 mm stdin
# created 21899 records output

# MAKE VSDANRER2 DOWNLOADABLES (DONE, 2004-12-14, hartera)
# REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat
# (DONE, 2004-12-22, hartera)
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.danRer2/axtChrom
set gp = /usr/local/apache/htdocs/goldenPath/mm5
mkdir -p $gp/vsDanRer2/axtChrom
cp -p *.axt $gp/vsDanRer2/axtChrom
cd $gp/vsDanRer2/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# copy chains and nets to downloads area
# re-make chains and net downloadables (2004-12-22, hartera)
rm $gp/vsDanRer2/zebrafish*.gz $gp/vsDanRer2/md5sum.txt
cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
gzip -c all.chain.antirepeat > \
    /cluster/data/mm5/zip/zebrafishDanRer2.chain.gz
gzip -c zfishdanRer2.net > /cluster/data/mm5/zip/zebrafishDanRer2.net.gz
cd $gp/vsDanRer2
mv /cluster/data/mm5/zip/zebrafish*.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
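# (Optional, a minimal sketch: after staging the vsDanRer2 downloads, re-verify
#  the checksums in place; paths follow the $gp layout used above.)
cd /usr/local/apache/htdocs/goldenPath/mm5/vsDanRer2
md5sum -c md5sum.txt
cd axtChrom
md5sum -c md5sum.txt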
# BLASTZ DANRER2 CLEANUP (DONE, 2004-12-14, hartera) # RE-DONE (DONE, 2004-12-22, hartera) ssh kksilo cd /cluster/data/mm5/bed/blastz.danRer2 nice rm axtChain/run1/chain/* & nice rm -fr axtChain/n1 axtChain/noClass.net & nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net & nice gzip axtChain/all.chain.antirepeat axtChain/all.chain.filt5k axtChain/chainAR/*.chain & nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet & # MOUSE PHOTOGRAPH added to gateway page # Obtained from Jackson Labs press office via email: # # Subject: Re: mouse press photographs # Date: Wed, 29 Dec 2004 14:26:15 -0500 # From: Joyce Peterson # To: Hiram Clawson # References: <41D2FF0B.3090207@soe.ucsc.edu> # Hi, Hiram. You may use the attached photo, noting credit to "The # Jackson Laboratory." # # Cheers, # --Joyce # # Joyce Peterson # Public Information Manager # The Jackson Laboratory # 610 Main Street, Mailbox 664 # Bar Harbor, ME 04609-1526 # Tel. 207-288-6058 # Mobile 207-266-5745 # E-mail joyce@jax.org # http://www.jax.org/news # # Original from this email placed into /cluster/data/mm5/html/C57BL_6J.JPG ssh hgwdev cd /cluster/data/mm5/html # view that image in 'display' to determine crop edges, then: convert -crop 890x690+330+70 -quality 80 -sharpen 0 \ -normalize C57BL_6J.JPG mm.jpg convert -geometry 300x200 -quality 80 mm.jpg Mus_musculus.jpg rm -f mm.jpg cp -p Mus_musculus.jpg /usr/local/apache/htdocs/images # add links to this image in the description.html page, request push # ANDY LAW CPGISSLANDS (DONE 1/14/05 angie) # See notes about this in makeGalGal2.doc. # Running only on masked sequence. ssh kksilo mkdir /cluster/data/mm5/bed/cpgIslandGgfAndy cd /cluster/data/mm5/bed/cpgIslandGgfAndy cp /dev/null cpgIslandGgfAndyMasked.bed foreach f (../../?{,?}/chr*.fa.masked) set chr = $f:t:r:r echo preproc masked $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.masked.preproc echo running on $chr masked /cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyMasked.bed end # load into database: ssh hgwdev cd /cluster/data/mm5/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql hgLoadBed mm5 cpgIslandGgfAndyMasked -tab -noBin \ -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed featureBits mm5 cpgIslandExt #10422989 bases of 2615483787 (0.399%) in intersection featureBits mm5 cpgIslandGgfAndyMasked #38305840 bases of 2615483787 (1.465%) in intersection wc -l ../cpgIsland/cpgIsland.bed cpgIslandGgfAndyMasked.bed # 16238 ../cpgIsland/cpgIsland.bed # 67737 cpgIslandGgfAndyMasked.bed # 1/26/05: Make better island names in cpgIslandGgfAndyMasked, # for Dave Burt's cross-species island comparisons. ssh kksilo cd /cluster/data/mm5/bed/cpgIslandGgfAndy mv cpgIslandGgfAndyMasked.bed cpgIslandGgfAndyMasked.bed.orig perl -wpe '@w=split("\t"); $w[3] = "mm5.$w[0]." . ($w[1]+1) . 
".$w[2]"; \ $_ = join("\t", @w);' \ cpgIslandGgfAndyMasked.bed.orig \ > cpgIslandGgfAndyMasked.bed ssh hgwdev cd /cluster/data/mm5/bed/cpgIslandGgfAndy hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \ mm5 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed # MAKE MM5-RN3 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie) ssh kolossus set chainDir = /cluster/data/mm5/bed/blastz.rn3/axtChain mkdir -p /cluster/data/mm5/bed/bedOver mkdir /tmp/mm5ToRn3 foreach f ($chainDir/ratNet/chr*.net.gz) set chr = $f:t:r:r echo $chr netChainSubset $f $chainDir/chain/$chr.chain.gz \ /tmp/mm5ToRn3/$chr.chain end cat /tmp/mm5ToRn3/*.chain \ > /cluster/data/mm5/bed/bedOver/mm5ToRn3.over.chain rm -r /tmp/mm5ToRn3 # MAKE MM5-GALGAL2 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie) ssh kolossus set chainDir = /cluster/data/mm5/bed/blastz.galGal2/axtChain mkdir -p /cluster/data/mm5/bed/bedOver netChainSubset $chainDir/chicken.net $chainDir/all.chain \ /cluster/data/mm5/bed/bedOver/mm5ToGalGal2.over.chain # UPDATE kgSpAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan) # Add new mm5 protein display IDs to the alias table to support user search ssh hgwdev mkdir -p /cluster/data/mm5/bed/pb/newDisplayId cd /cluster/data/mm5/bed/pb/newDisplayId hgsql proteome -e 'select mm5.kgSpAlias.kgID, mm5.kgSpAlias.SpID, spOldNew.newDisplayId from spOldNew, mm5.kgSpAlias where spOldNew.acc=mm5.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >mm5.tab # get rid of the header line at the end of the file vi mm5.tab hgsql mm5 -e 'load data local infile "mm5.tab" into table mm5.kgSpAlias' # UPDATE kgProtAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan) # Add new mm5 protein display IDs to the alias table to support user search ssh hgwdev cd /cluster/data/mm5/bed/pb/newDisplayId hgsql proteome -e 'select mm5.kgSpAlias.kgID,spOldNew.oldDisplayId,spOldNew.newDisplayId from spOldNew, mm5.kgSpAlias where spOldNew.acc=mm5.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >mm5.kgProtAlias.tab # get rid of the header line at the end of the file vi mm5.kgProtAlias.tab hgsql mm5 -e 'load data local infile "mm5.kgProtAlias.tab" into table mm5.kgProtAlias' # BLASTZ/CHAIN/NET BOSTAU1 (DONE 2/21/05 angie) ssh kksilo mkdir /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19 cd /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19 cat << '_EOF_' > DEF # mouse vs. cow # TARGET # Mouse SEQ1_DIR=/scratch/mus/mm5/softNib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/mm5/chrom.sizes # QUERY # Cow SEQ2_DIR=/iscratch/i/bosTau1/nib/bosTau1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=0 SEQ2_LEN=/iscratch/i/bosTau1/chrom.sizes BASE=/cluster/data/mm5/bed/blastz.bosTau1.2005-02-19 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /cluster/bluearc/mouseVsCow >& do.log & tail -f do.log # kksilo was rebooted so original invocation of doBlastzChainNet.pl # was killed in the middle of the cluster run. I watched the job # progress and restarted 70 failed jobs like this: ssh kk cd /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19/run.blastz para check para push para check ... 
# When the batch was complete: para time > run.time # (doBlastzChainNet.pl uses run.time as a checkpoint) # Then to continue the run: ssh kksilo cd /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19 doBlastzChainNet.pl -continue=cat DEF \ -blastzOutRoot /cluster/bluearc/mouseVsCow >>& do.log & tail -f do.log # For some reason the script got hung waiting for tty input; I # foregrounded it, hit return a few times, and it eventually completed. # That should be fixed in a future version of doBlastzChainNet.pl. ln -s blastz.bosTau1.2005-02-19 /cluster/data/mm5/bed/blastz.bosTau1 # Add chainBosTau1 and netBosTau1 to mm5/trackDb.ra # Add /usr/local/apache/htdocs/goldenPath/mm5/vsBosTau1/README.txt # LOAD SNPS (Done; March 3, 2005; Heather) # directory structure ssh hgwdev cd /cluster/bluearc/snp mkdir mm5.heather cd mm5.heather mkdir det loc seq str xml # get data ftp ftp.ncbi.nih.gov cd snp/mouse/XML prompt mget ds_ch*.xml.gz # make sure script is current (should add makefile so general build does this) cp -f /cluster/home/heather/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts # build jobList for parsing touch jobList foreach file ( ds_ch*.xml.gz ) set out = $file:t:r echo /cluster/bin/scripts/parseDbSnpXML $file /cluster/bluearc/snp/mm5.heather $out.contig >> jobList end # do the parsing ssh kk cd /cluster/bluearc/snp/mm5.heather para create jobList para try para check para push # output goes to det, loc, seq, str and xml directories # concatenate details ssh hgwdev zcat det/ds_ch*.xml.contig.det.gz > in.bed # couldn't find contig-based lift file from mm5 # generate from ctgPos echo "select chromStart, chrom, contig, size, chrom from ctgPos;" > ctgPos.sql hgsql mm5 < ctgPos.sql > ctgPos.out # edit ctgPos.out to put in proper format -- next time write script for this # lift # expect warnings from non-reference assemblies (limited to first 10) liftUp out.bed ctgPos.out warn in.bed # load (exception column will be empty for all rows) hgLoadBed mm5 snp out.bed -sqlTable=/cluster/home/heather/kent/src/hg/lib/snp.sql # generate exceptions 1-20; drop 7 and 9 as they will be changing cd /usr/local/apache/htdocs/qa/test-results/snpException mkdir mm5 cd mm5 snpException mm5 0 mm5snpException # Invariant 1 has 0 exceptions, written to this file: mm5snpException.01.bed # Invariant 2 has 0 exceptions, written to this file: mm5snpException.02.bed # Invariant 3 has 0 exceptions, written to this file: mm5snpException.03.bed # Invariant 4 has 0 exceptions, written to this file: mm5snpException.04.bed # Invariant 5 has 0 exceptions, written to this file: mm5snpException.05.bed # Invariant 6 has 3 exceptions, written to this file: mm5snpException.06.bed # Invariant 7 has 1 exceptions, written to this file: mm5snpException.07.bed # Invariant 8 has 0 exceptions, written to this file: mm5snpException.08.bed # Invariant 9 has 22 exceptions, written to this file: mm5snpException.09.bed # Invariant 10 has 0 exceptions, written to this file: mm5snpException.10.bed # Invariant 11 has 0 exceptions, written to this file: mm5snpException.11.bed # Invariant 12 has 0 exceptions, written to this file: mm5snpException.12.bed # Invariant 13 has 0 exceptions, written to this file: mm5snpException.13.bed # Invariant 14 has 0 exceptions, written to this file: mm5snpException.14.bed # Invariant 15 has 0 exceptions, written to this file: mm5snpException.15.bed # Invariant 16 has 0 exceptions, written to this file: mm5snpException.16.bed # Invariant 17 has 0 exceptions, written to this file: mm5snpException.17.bed # Invariant 18 has 
# generate exceptions 1-20; drop 7 and 9 as they will be changing
cd /usr/local/apache/htdocs/qa/test-results/snpException
mkdir mm5
cd mm5
snpException mm5 0 mm5snpException
# Invariant 1 has 0 exceptions, written to this file: mm5snpException.01.bed
# Invariant 2 has 0 exceptions, written to this file: mm5snpException.02.bed
# Invariant 3 has 0 exceptions, written to this file: mm5snpException.03.bed
# Invariant 4 has 0 exceptions, written to this file: mm5snpException.04.bed
# Invariant 5 has 0 exceptions, written to this file: mm5snpException.05.bed
# Invariant 6 has 3 exceptions, written to this file: mm5snpException.06.bed
# Invariant 7 has 1 exceptions, written to this file: mm5snpException.07.bed
# Invariant 8 has 0 exceptions, written to this file: mm5snpException.08.bed
# Invariant 9 has 22 exceptions, written to this file: mm5snpException.09.bed
# Invariant 10 has 0 exceptions, written to this file: mm5snpException.10.bed
# Invariant 11 has 0 exceptions, written to this file: mm5snpException.11.bed
# Invariant 12 has 0 exceptions, written to this file: mm5snpException.12.bed
# Invariant 13 has 0 exceptions, written to this file: mm5snpException.13.bed
# Invariant 14 has 0 exceptions, written to this file: mm5snpException.14.bed
# Invariant 15 has 0 exceptions, written to this file: mm5snpException.15.bed
# Invariant 16 has 0 exceptions, written to this file: mm5snpException.16.bed
# Invariant 17 has 0 exceptions, written to this file: mm5snpException.17.bed
# Invariant 18 has 3634 exceptions, written to this file: mm5snpException.18.bed
# Invariant 19 has 0 exceptions, written to this file: mm5snpException.19.bed
# Invariant 20 has 0 exceptions, written to this file: mm5snpException.20.bed
# Invariant 21 has no query string
# Invariant 22 has no query string
# Invariant 23 has no query string
# Invariant 24 has no query string
mv mm5snpException.07.bed mm5snpException.07.bed.notused
mv mm5snpException.09.bed mm5snpException.09.bed.notused
# snpValid
cd /cluster/bluearc/snp/mm5.heather/seq
nice snpValid mm5 . >& snpValid.out &
tail -20 snpValid.out
# Grand Totals:
# matches:             494545
# mismatches:          246 (exceptionId #22)
# missing from flanks: 0 (exceptionId #23)
# rev compl matches:   56285
# not rptd strand:     1 (exceptionId #24)
# assembly = -:        0
# nib in gap:          0 (must be 0)
# Total rows in snp:   494791
# no dna found for:    0
# Total goodExact:     493886
# Total badExact:      534 (exceptionId #21)
# copy 21-24 exceptions to location of 1-20
cp *bed /usr/local/apache/htdocs/qa/test-results/snpException/mm5
# add exception data to snp table
cp ../build124/updateExceptionList.pl .
tail +3 mm5snpException.* | awk '/rs/ {printf "%s\t%d\t%d\n",$4,$2,$5}' | sort -k1,2n > exceptionList.txt
updateExceptionList.pl < exceptionList.txt > updateExceptionList.sql
hgsql mm5 < updateExceptionList.sql

# HUMAN BLASTP FOR GENE SORTER (RE-DONE 7/28/05 Fan)
# Make human ortholog column using blastp on human known genes.
# First make human protein database and copy it to iscratch/i
# if it doesn't exist already:
# NOTE: THE SECTION BELOW WAS ALREADY DONE.
cd /cluster/data/hg17/bed/blastp
pepPredToFa hg17 knownGenePep known.faa
formatdb -i known.faa -t known -n known
ssh kkr1u00
if (-e /iscratch/i/hg17/blastp) then
    rm -r /iscratch/i/hg17/blastp
endif
mkdir -p /iscratch/i/hg17/blastp
cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp
iSync
# THE SECTION ABOVE WAS ALREADY DONE PREVIOUSLY.
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/hg17/run/out
cd /cluster/data/mm5/bed/blastp/hg17/run
# Make blast script (blastSome), the gensub2 template (gsub) and split.lst.
# The here-doc bodies of blastSome and gsub did not survive in this doc;
# a hedged sketch of what they contained follows this section.
cat > blastSome
cat > gsub
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
Completed: 7739 of 7739 jobs
CPU time in finished jobs:     113019s    1883.65m    31.39h    1.31d  0.004 y
IO & Wait Time:                 22145s     369.08m     6.15h    0.26d  0.001 y
Average job time:                  17s       0.29m     0.00h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:             124s       2.07m     0.03h    0.00d
Submission to last job:           495s       8.25m     0.14h    0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/hg17/run/out
hgLoadBlastTab mm5 hgBlastTab -maxPer=1 *.tab
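# The blastSome script and gsub template referenced above were not preserved
# in this doc.  The sketch below is modeled on gene-sorter blastp runs in
# other assembly docs and is NOT the exact text used here; the blastall
# path, BLASTMAT setting and split-file location are assumptions.
cat > blastSome <<'_EOF_'
#!/bin/sh
# run one chunk of mouse known-gene peptides against the hg17 known-gene
# protein database; tabular output (-m 8), best hit only (-b 1)
BLASTMAT=/iscratch/i/blast/data /scratch/blast/blastall \
    -p blastp -d /iscratch/i/hg17/blastp/known -i $1 -o $2 \
    -e 0.01 -m 8 -b 1
_EOF_
chmod a+x blastSome
cat > gsub <<'_EOF_'
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
_EOF_
# list the split known-gene peptide fasta files, biggest first, for gensub2
# (here given with the path already prepended, so no edit of split.lst needed)
ls -1S ../../../geneSorter/blastp/split/kg*.fa > split.lst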
# KNOWN GENES
# This was built using ~/kent/src/hg/protein/KGprocess.sh
# and it was not documented.

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
ssh hgwdev
cd /cluster/data/mm5/bed/
mkdir -p kgMm5/index
cd kgMm5/index
hgKgGetText mm5 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/data/mm5/bed/kgMm5/index/knownGene.ix /gbdb/mm5/knownGene.ix
ln -s /cluster/data/mm5/bed/kgMm5/index/knownGene.ixx /gbdb/mm5/knownGene.ixx

# RE-BUILD cgapAlias TABLE
# ORIGINALLY TABLE WAS BUILT BY THE KNOWN GENES PROCESS
# cgapAlias table has replicate rows so remove (DONE, 2005-07-26, hartera)
# RELOADED cgapAlias AGAIN AS TOO MANY ROWS REMOVED BEFORE (hartera, 2005-10-06)
ssh hgwdev
cd /cluster/store6/kgDB/bed/kgMm5B
# DO TABLE RELOAD AGAIN AS sort -nu REMOVES MORE ROWS THAN sort -u
# OR sort -n | uniq.
# USE sort -n then uniq TO SORT ON THE IDs AND THEN UNIQ
# (hartera, 2005-10-06)
sort -n cgapAlias.tab | uniq > cgapAliasSorted.tab
hgsql mm5 -e "drop table cgapAlias"
hgsql mm5 < ~/kent/src/hg/lib/cgapAlias.sql
hgsql mm5 -e 'load data local infile "cgapAliasSorted.tab" \
    into table cgapAlias'
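# The sort subtlety noted above, illustrated (not part of the build): with
# -n and no key, sort -u decides uniqueness by the numeric comparison alone,
# so rows sharing a leading ID but differing in later fields collapse to one
# row; sort -n | uniq only drops fully identical adjacent lines.
printf '10\tfoo\n10\tbar\n' | sort -nu        # one row survives
printf '10\tfoo\n10\tbar\n' | sort -n | uniq  # both rows survive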
# Create table that maps between known genes and visiGene database (DONE 2005-10-10 galt)
knownToVisiGene mm5
#Made hashes of image: geneImageHash 2117, locusLinkImageHash 780, refSeqImageHash 780,
#genbankImageHash 1301
#knownToLocusLink 30303, knownToRefSeq 30291, knownToGene 266841

# RIKEN CAGE STUFF (DONE 11-16-2005 Andy)
# Make download area.
ssh hgwdev
cd /cluster/data/mm5/bed
mkdir rikenCageCtss
cd rikenCageCtss/
wget -r http://fantom31p.gsc.riken.jp/cage_analysis/export/mm5/
# stupid thing didn't work.  Tried tinkering with wget almost every way possible.
# Finally just did it the hard way.
wget -O /dev/stdout http://fantom31p.gsc.riken.jp/cage_analysis/export/mm5/ 2> /dev/null | egrep ".sql|.bz2" | grep href | sed 's/^.*href=\"//;s/\".*$//' > files.lst
rm -rf fantom*
for f in `cat files.lst`; do
    wget http://fantom31p.gsc.riken.jp/cage_analysis/export/mm5/$f
done
bunzip2 *.bz2
# Make the simple table of the CAGE-related TSSs.
awk 'BEGIN{FS="\t"};{printf("%s\t%s\t%s\t%s\t%s\t1000\t%s\n",$9,$4,$7,$8,$1,($6 == "F") ? "+" : "-")}' \
    tss_summary.tsv | grep "^CAGE" | cut -f2- > basicCAGE.bed
# Make CAGE wiggle tracks for plus and minus strands
awk 'BEGIN{FS="\t"}; {if ($4=="F") printf("%s\t%s\t%d\t%s\n", $2, $5, $5+1, $6)}' \
    ctss_summary.tsv | wigEncode stdin ctssForward.wig ctssForward.wib
awk 'BEGIN{FS="\t"}; {if ($4=="R") printf("%s\t%s\t%d\t%s\n", $2, $5-1, $5, $6)}' \
    ctss_summary.tsv | wigEncode stdin ctssReverse.wig ctssReverse.wib
mkdir wiggle
mv ctss*.wi{g,b} wiggle/
# Load stuff up:
hgLoadBed mm5 rikenCageTc basicCAGE.bed
ln -s /cluster/data/mm5/bed/rikenCAGE/wiggle /gbdb/mm5/wib/ctssForward
ln -s /cluster/data/mm5/bed/rikenCAGE/wiggle /gbdb/mm5/wib/ctssReverse
hgLoadWiggle mm5 ctssForward ctssForward.wig
hgLoadWiggle mm5 ctssReverse ctssReverse.wig
# OK make them bedGraphs instead.
cd ../
rm -rf wiggle/
rm /gbdb/mm5/wib/ctss*
hgsql mm5 -e 'drop table ctssForward'
hgsql mm5 -e 'drop table ctssReverse'
awk 'BEGIN{FS="\t"}; {if ($4=="F") printf("%s\t%s\t%d\t%s\n", $2, $5, $5+1, $6)}' \
    ctss_summary.tsv | hgLoadBed -strict -bedGraph=4 mm5 rikenCageCtssPlus stdin
awk 'BEGIN{FS="\t"}; {if ($4=="R") printf("%s\t%s\t%d\t%s\n", $2, $5-1, $5, $6)}' \
    ctss_summary.tsv | hgLoadBed -strict -bedGraph=4 mm5 rikenCageCtssMinus stdin
# track html:
cp rikenCageCtss.html ~/kent/src/hg/makeDb/trackDb/mouse/
# trackDb:
track rikenCageTc
shortLabel Riken CAGE TC
longLabel Riken CAGE - Associated Transcript Clusters
group genes
priority 47.5
visibility hide
type bed 6 .

track rikenCageCtss
compositeTrack on
shortLabel Riken CAGE
longLabel Riken CAGE - Predicted Gene Start Sites
group genes
priority 47.51
visibility hide
type bedGraph 4
maxHeightPixels 128:16:16
minLimit 1
maxLimit 4316
viewLimits 1.0:10.0
windowingFunction mean
autoScale Off
origAssembly hg16

    track rikenCageCtssPlus
    subTrack rikenCageCtss
    shortLabel Riken CAGE +
    longLabel Riken CAGE Plus Strand - Predicted Gene Start Sites
    priority 1
    color 109,51,43

    track rikenCageCtssMinus
    subTrack rikenCageCtss
    shortLabel Riken CAGE -
    longLabel Riken CAGE Minus Strand - Predicted Gene Start Sites
    priority 2
    color 43,51,109

# MYTOUCH FIX - jen - 2006-01-24
sudo mytouch mm5 geneidPep 0408071900.00
sudo mytouch mm5 genscanPep 0501071300.00
sudo mytouch mm5 superfamily 0503011100.00
sudo mytouch mm5 ensGtp 0503011100.00
sudo mytouch mm5 knownToEnsembl 0503011100.00
sudo mytouch mm5 sfDescription 0503011100.00

############################################################################
# Mm7 to Mm5 liftOver creation (DONE - 2006-02-22 - 2006-02-24 - Hiram)
#   instructions lifted from Andy's sequence in makeMm7.doc

######## LIFTOVER PREPARATION
# Split up mm5
ssh kkr1u00
cd /iscratch/i/mm5
mkdir liftSplits
mkdir liftSplits/split
mkdir liftSplits/lift
for fa in /cluster/data/mm5/?/*.fa /cluster/data/mm5/??/*.fa
do
    c=`basename $fa .fa`
    echo $c
    faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 \
        liftSplits/split/$c
done
mkdir biggerSplits
mkdir biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chrX.fa 10 chrX_
rm chr{1,X}.fa
for R in 2 3 4 5 6 7 8
do
    rsync -a --progress /iscratch/i/mm5/ kkr${R}u00:/iscratch/i/mm5/
done

######## LIFTOVER BLATING
# MM7
ssh kk
cd /cluster/data/mm7
/cluster/bin/scripts/makeLoChain-align mm7 /scratch/hg/mm7/nib mm5 \
    /iscratch/i/mm5/biggerSplits/split
cd bed/blat.mm5.2006-02-22/run
# target is Mm7
# query is Mm5
cat << '_EOF_' > blat.csh
#!/bin/csh -fe
set target=$1
set query=$2
set output=$3
set chain=$4
set tPart=$target:t:r
set qPart=$query:t:r
set tmpDir=/scratch/tmp/${chain}.${tPart}_${qPart}
set tmpOutput=$tmpDir/$output:t
mkdir -p $tmpDir
sleep 2
/cluster/bin/$MACHTYPE/blat $target $query $tmpOutput \
    -tileSize=11 -minScore=100 -minIdentity=98 -fastMap \
    -ooc=/iscratch/i/mm5/11.ooc
mkdir -p `dirname $output`
cp $tmpOutput $output
rm $tmpOutput
rmdir --ignore-fail-on-non-empty $tmpDir
'_EOF_'
# happy emacs
chmod +x blat.csh
sed 's#^blat#./blat.csh#; s/\}.*$/}/; s/$/ mm7ToMm5/' spec > jobList
para create jobList
para -maxNode=200 -priority=25 push
para time
# Completed: 2451 of 2451 jobs
# CPU time in finished jobs:    1266001s   21100.02m   351.67h   14.65d  0.040 y
# IO & Wait Time:                 13972s     232.87m     3.88h    0.16d  0.000 y
# Average job time:                 522s       8.70m     0.15h    0.01d
# Longest finished job:            6769s     112.82m     1.88h    0.08d
# Submission to last job:         26506s     441.77m     7.36h    0.31d

######## LIFTOVER CHAINING
# LIFTING
ssh kki
cd /cluster/data/mm7/bed/blat.mm5.2006-02-22
cat << '_EOF_' > mm5SplitLift.sh
#!/bin/bash
for C in chr1 chrX
do
    echo joining $C
    for P in `ls *_${C}_[0-9]*.psl | sed -e "s/_chr.*//" | sort -u`
    do
        echo "${P}_${C}.psl"
        tail --lines=+6 -q ${P}_${C}_[0-9]*.psl > ${P}_${C}.psl
    done
    for f in *_${C}.psl; do
        cat /san/sanvol1/scratch/andy/psl.header $f > tmp
        mv tmp $f
    done
done
echo Lifting...
for C in `awk '{print $1}' /cluster/data/mm5/chrom.sizes`; do
    echo "lifting $C ..."
    liftUp -pslQ ../psl/${C}.psl \
        /iscratch/i/mm5/biggerSplits/lift/${C}.lft error chr*_${C}.psl
    echo done $C
done
'_EOF_'
# happy emacs
chmod +x mm5SplitLift.sh
cat << "EOF" > mm5ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/andy/mm5Lifts
pushd /scratch/andy/mm5Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
EOF
chmod +x mm5ChainMergeSplit.sh
cd /cluster/data/mm7/bed/blat.mm5.2006-02-22/raw
../mm5SplitLift.sh
cd ../
mkdir chainRun chainRaw
cd chainRun
cat << '_EOF_' > template
#LOOP
axtChain -linearGap=medium -verbose=0 -psl $(path1) /scratch/hg/mm7/nib /cluster/data/mm5/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'_EOF_'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single template jobList
para create jobList
para push
para time
# Completed: 43 of 43 jobs
# CPU time in finished jobs:       7259s     120.98m     2.02h    0.08d  0.000 y
# IO & Wait Time:                  1086s      18.10m     0.30h    0.01d  0.000 y
# Average job time:                 194s       3.23m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1088s      18.13m     0.30h    0.01d
# Submission to last job:          2289s      38.15m     0.64h    0.03d

ssh kkstore02
cd /cluster/data/mm7/bed/blat.mm5.2006-02-22
mkdir chain
time chainMergeSort chainRaw/* | chainSplit chain stdin
# real    29m42.365s
mkdir net over
cd chain
for c in *.chain
do
    echo ${c%.chain}
    nice chainNet $c /cluster/data/mm7/chrom.sizes \
        /cluster/data/mm5/chrom.sizes ../net/${c%.chain}.net /dev/null
    echo done $c
done
# real    15m33.593s
for chain in *.chain
do
    c=${chain%.chain}
    nice netChainSubset ../net/$c.net $chain ../over/$c.over
done
# real    10m48.898s

########## FINISHING
ssh kkstore02
cd /cluster/data/mm7/bed/blat.mm5.2006-02-22/over
cat * > ../mm7ToMm5.over.chain
cd ..
gzip mm7ToMm5.over.chain
rm -rf psl net chain chainRaw over
ssh hgwdev
cd /cluster/data/mm7/bed
ln -s blat.mm5.2006-02-22 blat.mm5
ln -s `pwd`/blat.mm5/mm7ToMm5.over.chain.gz liftOver/mm7ToMm5.over.chain.gz
ln -s `pwd`/liftOver/mm7ToMm5.over.chain.gz \
    /gbdb/mm7/liftOver/mm7ToMm5.over.chain.gz
ln -s `pwd`/liftOver/mm7ToMm5.over.chain.gz \
    /usr/local/apache/htdocs/goldenPath/mm7/liftOver/mm7ToMm5.over.chain.gz
hgAddLiftOverChain mm7 mm5 /gbdb/mm7/liftOver/mm7ToMm5.over.chain.gz
############################################################################
# UPDATED mm5.knownToVisiGene (2006-03-21 galt)
ssh hgwdev
knownToVisiGene mm5

#######################################################################
## LIFTOVER To Mm8 (DONE - 2006-05-15 - 2006-06-05 - Hiram)
ssh kkr1u00
# do not need to run this command since /cluster/data/mm8/split10k
# already exists from previous liftOver jobs (mm7 to mm8)
# $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
#   mm8 /cluster/data/mm8/nib
# as it says, DO THIS NEXT:
ssh kk
# if bin/scripts is not in your PATH, add it for this command:
PATH=$PATH:/cluster/bin/scripts \
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
    mm5 /cluster/data/mm5/nib mm8 /iscratch/i/mm8/split10k \
    /cluster/data/mm8/11.ooc
# as it says, DO THIS NEXT:
cd /cluster/data/mm5/bed/blat.mm8.2006-05-15/run
para try, check, push, check, ...
# Completed: 1462 of 1462 jobs
# CPU time in finished jobs:    3990246s   66504.10m  1108.40h   46.18d  0.127 y
# IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
# Average job time:                2371s      39.51m     0.66h    0.03d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           24307s     405.12m     6.75h    0.28d
# Submission to last job:       1474509s   24575.15m   409.59h   17.07d
# as it says, DO THIS NEXT:
# this does the liftUp and makes the psl files
# kkr1u00 is down these days
ssh kkr3u00
cd /cluster/data/mm5/bed
ln -s blat.mm8.2006-05-15 blat.mm8
# edit this script to allow use on kkr3u00
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh mm5 mm8
# real    16m5.091s
# as it says, DO THIS NEXT:
# this prepares the batch to run for the chaining
ssh kki
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
    mm5 /cluster/data/mm5/nib mm8 /cluster/data/mm8/nib
# as it says, DO THIS NEXT:
# running the chain batch
cd /cluster/data/mm5/bed/blat.mm8.2006-05-15/chainRun
para try, check, push, check, ...
# Completed: 34 of 34 jobs
# CPU time in finished jobs:       6893s     114.88m     1.91h    0.08d  0.000 y
# IO & Wait Time:                  7183s     119.72m     2.00h    0.08d  0.000 y
# Average job time:                 414s       6.90m     0.12h    0.00d
# Longest finished job:            1130s      18.83m     0.31h    0.01d
# Submission to last job:          1130s      18.83m     0.31h    0.01d
ssh kkstore03
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh mm5 mm8
# Created /cluster/data/mm5/bed/liftOver/mm5ToMm8.over.chain.gz
# as it says, DO THIS NEXT:
ssh hgwdev
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh mm5 mm8
# It says this:
#   Now, add link for
#   /usr/local/apache/htdocs/goldenPath/mm5/liftOver/mm5ToMm8.over.chain
#   to hgLiftOver
# But I believe that link was already done:
cd /gbdb/mm5/liftOver
ls -og mm5ToMm8*
# lrwxrwxrwx  1 53 Jun  5 16:10 mm5ToMm8.over.chain.gz ->
#       /cluster/data/mm5/bed/liftOver/mm5ToMm8.over.chain.gz
# (an example use of this chain file with liftOver is sketched at the end
#  of this file)

#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
# File emailed from Xinwei She
mkdir /cluster/data/mm5/bed/genomicSuperDups
cd /cluster/data/mm5/bed/genomicSuperDups
sed -e 's/\t_\t/\t-\t/' mm5_genomicSuperDup.tab \
    | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
    | hgLoadBed mm5 genomicSuperDups stdin \
        -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql

##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna mm5
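# Example use of the mm5ToMm8 liftOver chain produced above (not part of the
# original build; the BED file names are hypothetical).  liftOver can read
# the gzipped chain directly:
liftOver mm5Regions.bed /gbdb/mm5/liftOver/mm5ToMm8.over.chain.gz \
    mm5Regions.mm8.bed mm5Regions.unmapped.bed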