# for emacs: -*- mode: sh; -*- # This file describes browser build for the mouse # genome, February 2006, ncbi mouse_36 - Mm8 # # "$Id: mm8.txt,v 1.76 2010/01/07 20:47:49 rhead Exp $" # # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+-------------------------------------+ # | tableName | type | # +-------------+-------------------------------------+ # | knownGene | genePred knownGenePep knownGeneMrna | # | refGene | genePred refPep refMrna | # | xenoRefGene | genePred xenoRefPep xenoRefMrna | # | mgcGenes | genePred | # | ensGene | genePred ensPep | # | genscan | genePred genscanPep | # +-------------+-------------------------------------+ ####################################################################### # DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2006-02-14 - Hiram) # # Examine disk space issues, find some goodly amount of space ssh kkstore01 mkdir /cluster/store9/mm8 ln -s /cluster/store9/mm8 /cluster/data/mm8 cd /cluster/data/mm8 mkdir ncbi cd ncbi cp -p /cluster/data/mm7/ncbi/.wgetrc . WGETRC=`pwd`/.wgetrc export WGETRC wget --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=4 --no-parent \ --no-host-directories --cut-dirs=1 \ ftp://ftp-private.ncbi.nih.gov/mouse_36 # Downloaded: 2,201,934,141 bytes in 50 files # real 44m48.975s # The pre-release sequence, Feb 27th: mkdir /cluster/data/mm8/pre_release cd /cluster/data/mm8/pre_release # The .wgetrc is the anonymous user WGETRC=`pwd`/.wgetrc export WGETRC wget --timestamping --force-directories --directory-prefix=. 
\ --dont-remove-listing --recursive --level=4 --no-parent \ --no-host-directories --cut-dirs=3 \ ftp://ftp.ncbi.nih.gov/genomes/M_musculus/pre_release # Fixup the agp and seq_contig.md files to add chrM # No chrM or chrMT was delivered. Copy from previous assembly ssh kkstore01 cd /cluster/data/mm8/ncbi/chrfasta cp -p /cluster/data/mm7/ncbi/chrfasta/chrM.fa.gz . cd ../contigfasta cp -p /cluster/data/mm7/ncbi/contigfasta/chrM.fa.gz . # with a fixed up header line to be like all the others: # >lcl|chrM.fa gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion, complete genome cd /cluster/data/mm8 zcat ncbi/allrefcontig.chr.agp.gz > allrefcontig.chr.agp echo -e "chrM\t1\t16299\t1\tF\tAY172335.1\t1\t16299\t+" >> \ allrefcontig.chr.agp gzip allrefcontig.chr.agp # I don't see allcontig.agp being used anywhere else ? # zcat ncbi/allcontig.agp.gz > allcontig.agp # echo -e "NC_005089\t1\t16299\t1\tF\tAY172335\t\t1\t16299\t+" >> \ # allcontig.agp # gzip allcontig.agp zcat ncbi/seq_contig.md.gz | egrep -v "Celera|129_substrain" \ | sed -e "238i\ 10090\tM\t0\t0\t+\tstart\t-1\tCONTIG\tC57BL/6J\t10\n\ 10090\tM\t1\t16299\t+\tNC_005089\tGI:34538597\tCONTIG\tC57BL/6J\tna\n\ 10090\tM\t16299\t16299\t+\tend\t-2\tCONTIG\tC57BL/6J\t10" > seq_contig.md # (curiously, this sed command would not work on hgwdev, # only when logged into kkstore01 ?) # The line number 238 was found by checking the contents of # ncbi/seq_contig.md.gz (after the egrep filter) and it was # the line starting with: # 10090 Un|NT_039877 1 35798 # Wanted this chrM information before that line. 
# summarize sequence counts mkdir faCounts time faCount ncbi/chrfasta/chr*.fa.gz > faCounts/chrfasta.faCount 2>&1 & # about 1.5 minutes time faCount ncbi/contigfasta/chr*.fa.gz > \ faCounts/contigfasta.faCount 2>&1 & # about 3 minutes time zcat ncbi/chrfasta/chr*.fa.gz | grep "^>" > \ faCounts/chrfasta.headers 2>&1 & time zcat ncbi/contigfasta/chr*.fa.gz | grep "^>" > \ faCounts/contigfasta.headers 2>&1 & # about 2 minutes each for the above two zcat/greps ############################################################################# # BREAK UP SEQUENCE INTO 5 MB CHUNKS at NON-BRIDGED CONTIGS # (DONE - 2006-02-14 - Hiram) ######### Are these necessary ? They may no longer be needed. ######### TRF can run on full chroms on the kki kluster # It would be better to use . in place of the /cluster/data/mm8 # for the outputDir argument to splitFaIntoContigs so this script # is independent of specific locations, thus it works in . ssh kkstore01 cd /cluster/data/mm8 for F in ncbi/chrfasta/chr*.fa.gz do CHR=`basename ${F} | sed -e "s/.fa.gz//; s/chr//"` echo ${CHR} ${F} mkdir -p "${CHR}" zcat allrefcontig.chr.agp.gz | \ perl -we "while(<>){if (/^chr${CHR}\t/) {print;}}" > \ ${CHR}/chr${CHR}.agp zcat ncbi/chrfasta/chr${CHR}.fa.gz | \ perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' | \ splitFaIntoContigs ${CHR}/chr${CHR}.agp \ stdin /cluster/data/mm8 -nSize=5000000 done # The above loop takes about 5 minutes # Some of these in the chr1 directory got overwritten on 2006-02-27 # during an attempt to verify that the pre-release directory at # NCBI was the same as what we worked with here. 
#############################################################################
# CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2006-02-14 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/mm8/jkStuff
    cd /cluster/data/mm8
    mkdir Un tmp
    cp -p /cluster/data/mm7/jkStuff/ncbiFixAgp ./jkStuff
    zcat allrefcontig.chr.agp.gz | ./jkStuff/ncbiFixAgp /dev/stdin > \
        allrefcontig.chr.ordered.agp

    # Set the appropriate release number here, this one is 36
    # (the copy fetched from mm7 has buildNum = 35)
    # Fetch the script from the previous assembly
    sed -e "s/buildNum = 35/buildNum = 36/" \
        /cluster/data/mm7/jkStuff/ncbiToRandomAgps > \
        jkStuff/ncbiToRandomAgps
    chmod +x jkStuff/ncbiToRandomAgps
    #   NOTE ! * ! This mm8 contig.idmap now includes the celera assembly
    #   Filter that out for use here.
    #   There were two broken lines that began _36 - they were removed
    #   after I reported them and the contig.idmap.gz file here was
    #   updated later.
    zcat ncbi/contig.idmap.gz | grep ref_strain | grep -v "^_36" \
        | ./jkStuff/ncbiToRandomAgps seq_contig.md \
                allrefcontig.chr.ordered.agp \
                /dev/stdin . 2> dbg
    # "? ??" globs every one- and two-character name in this directory
    # (e.g. the Un directory made above); the -s test skips those
    # without a non-empty chrN_random.ctg.agp
    for C in ? ??
    do
        if [ -s ${C}/chr${C}_random.ctg.agp ]; then
            echo "building ${C}/chr${C}_random.fa"
            rm -f ./tmp.fa
            # rewrite ">lcl|Mm... <rest>" contig headers to ">Mm..."
            zcat ncbi/contigfasta/chr${C}.fa.gz | \
                perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' > ./tmp.fa
            agpToFa -verbose=2 -simpleMulti \
                ${C}/chr${C}_random.ctg.agp chr${C}_random \
                ${C}/chr${C}_random.fa ./tmp.fa
            rm -f ./tmp.fa
        fi
    done > tmp/agpToFa.out 2>&1
    #   the above loop takes about 3 minutes, examine the tmp/agpToFa.out
    #   record for any errors

    #   We need the lift information from these random.ctg.agp files
    cp -p /cluster/data/mm7/jkStuff/agpToLift.pl ./jkStuff
    for AGP in ?/*_random.ctg.agp ??/*_random.ctg.agp
    do
        CHR=`dirname ${AGP}`
        echo ${CHR}
        mkdir -p ${CHR}/lift
        ./jkStuff/agpToLift.pl ${AGP} > ${CHR}/lift/ctg_random.lft
    done
    # Clean these up to avoid confusion later... they're easily rebuilt
    # with the ncbiToRandomAgps script above
    rm ?/*_random.ctg.agp ??/*_random.ctg.agp
    gzip seq_contig.md allrefcontig.chr.ordered.agp

#############################################################################
# BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS
#   (DONE 2006-02-14 - Hiram)
    ssh kkstore01
    cd /cluster/data/mm8
    for C in ? ??
    do
        if [ -s ${C}/chr${C}_random.fa ]; then
            splitFaIntoContigs -nSize=5000000 ${C}/chr${C}_random.agp \
                ${C}/chr${C}_random.fa .
            # move the split results from the ${C}_random work directory
            # into ${C}/, renaming the lift files from ordered.* / oOut.lst
            # to random.* / rOut.lst
            mkdir -p ${C}/lift
            rm -f ${C}/lift/rOut.lst ${C}/lift/random.lft ${C}/lift/random.lst
            mv ${C}_random/lift/oOut.lst ${C}/lift/rOut.lst
            mv ${C}_random/lift/ordered.lft ${C}/lift/random.lft
            mv ${C}_random/lift/ordered.lst ${C}/lift/random.lst
            rmdir ${C}_random/lift
            rm ${C}_random/chr${C}_random.agp ${C}_random/chr${C}_random.fa
            rm -rf ${C}/chr${C}_random_*
            mv ${C}_random/chr${C}_random_* ${C}
            rmdir ${C}_random
        fi
    done > tmp/split.out 2>&1
    #   the above loop takes less than a minute
    #   scan the tmp/split.out file for possible errors

#############################################################################
# MAKE LIFTALL.LFT (DONE - 2006-02-14 - Hiram)
    ssh kkstore01
    cd /cluster/data/mm8
    cat ?/lift/*.lft ??/lift/*.lft > jkStuff/liftAll.lft

#############################################################################
# CREATING DATABASE (DONE - 2006-02-14 - Hiram)
    ssh kkstore01
    cd /cluster/data/mm8
    faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
        mm8.2bit
    # chrom.sizes: twoBitInfo output sorted numerically, descending,
    # starting at the second field
    twoBitInfo mm8.2bit stdout | sort -rn +1 > chrom.sizes
    # chrom.lst: the non-random names with the "chr" prefix removed
    grep -v random chrom.sizes | cut -f1 | sed -e "s/chr//" > chrom.lst
    twoBitInfo mm8.2bit stdout |
        awk '{printf "%s\t%s\t/gbdb/mm8/mm8.2bit\n", $1,$2}' > chromInfo.tab

    ssh hgwdev
    cd /cluster/data/mm8
    hgsql -e "create database mm8;" mysql
    #   Make sure we have enough room (eventually ~ 70Gb) for mysql tables:
    df -h | grep mysql
    #   /dev/sda1   472G  225G  223G  51% /var/lib/mysql2
    #   /dev/sdc1   1.8T  1.5T  190G  89% /var/lib/mysql

    # CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2006-02-14 - Hiram)
    #   Use any of the newest databases to ensure that the organization
    #   of the grp table is up to date
    ssh hgwdev
    hgsql mm8 -e "create table grp (PRIMARY KEY(NAME)) select * from hg18.grp"

    hgsql mm8 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql mm8 -e 'load data local infile "chromInfo.tab" into table chromInfo;'

    #   Enter mm8 into dbDb and defaultDb so test browser knows about it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("mm8", "Feb 2006", "/gbdb/mm8", "Mouse", \
        "chr6:28912411-28925620", 1, 22, "Mouse", \
        "Mus musculus", "/gbdb/mm8/html/description.html", 0, 0, \
        "NCBI Build 36");' -h localhost hgcentraltest
    #   Reset default position to be like Mm7, 2006-03-09 - Hiram
    hgsql -e \
    'update dbDb set defaultPos="chr12:50258170-50263946" where name="mm8";' \
        hgcentraltest

    #   Do *NOT* set default genome on genome-test until ready for release
    # hgsql hgcentraltest \
    #   -e 'update defaultDb set name="mm8" where genome="Mouse";'

    #   start a new entry in the trackDb hierarchy
    #   (the two "vi ... - ..." lines are manual editing steps, not
    #   commands meant to be pasted as-is)
    cd $HOME/kent/src/hg/makeDb/trackDb/mouse
    mkdir mm8
    cvs add mm8
    cd mm8
    cp ../mm7/description.html .
    vi description.html - fixup text for this assembly
    cvs add description.html
    cvs commit
    cd ../..
    vi makefile - add mm8 to the list
    mkdir /cluster/data/mm8/html
    mkdir /gbdb/mm8
    ln -s /cluster/data/mm8/html /gbdb/mm8/html
    ln -s /cluster/data/mm8/mm8.2bit /gbdb/mm8/mm8.2bit
    cp -p mouse/mm8/description.html /gbdb/mm8/html
    make DBS=mm8

#############################################################################
# GOLD GAP tracks (DONE - 2006-02-14 - Hiram)
    ssh hgwdev
    cd /cluster/data/mm8
    #   make sure these tmp contig agp files are gone, easily generated
    #   as above with jkStuff/ncbiToRandomAgps
    mkdir ffa
    zcat ncbi/sequence.inf.gz > ffa/sequence.inf
    hgGoldGapGl -chromLst=chrom.lst mm8 /cluster/data/mm8 .
# compare gold and gap track coverage for mm8 with the earlier assemblies
# (the "# N bases of M ..." lines are the recorded featureBits output)
    featureBits mm8 gold
    # 2567283971 bases of 2567283971 (100.000%) in intersection
    featureBits mm7 gold
    # 2583394090 bases of 2583394090 (100.000%) in intersection
    featureBits mm6 gold
    # 2597150411 bases of 2597150411 (100.000%) in intersection
    featureBits mm5 gold
    # 2615483787 bases of 2615483787 (100.000%) in intersection
    featureBits mm4 gold
    # 2627444668 bases of 2627444668 (100.000%) in intersection

    featureBits mm8 gap
    # 97171117 bases of 2567283971 (3.785%) in intersection
    featureBits mm7 gap
    # 264323239 bases of 2583394090 (10.232%) in intersection
    featureBits mm6 gap
    # 482483041 bases of 2597150411 (18.577%) in intersection
    featureBits mm5 gap
    # 549468286 bases of 2615483787 (21.008%) in intersection
    featureBits mm4 gap
    # 325167539 bases of 2627444668 (12.376%) in intersection

#############################################################################
# GC5BASE (DONE - 2006-02-14 - Hiram)
    ssh kkstore01
    mkdir -p /cluster/data/mm8/bed/gc5Base
    cd /cluster/data/mm8/bed/gc5Base
    time hgGcPercent -wigOut -doGaps -file=stdout -win=5 mm8 \
        /cluster/data/mm8 | wigEncode stdin gc5Base.wig gc5Base.wib
    #   Calculating gcPercent with window size 5
    #   Using twoBit: /cluster/data/mm8/mm7.2bit
    #   File stdout created
    #   Converted stdin, upper limit 100.00, lower limit 0.00
    #   runs for about 14 minutes

    #   load database
    ssh hgwdev
    cd /cluster/data/mm8/bed/gc5Base
    mkdir /gbdb/mm8/wib
    ln -s `pwd`/gc5Base.wib /gbdb/mm8/wib
    time hgLoadWiggle -pathPrefix=/gbdb/mm8/wib mm8 gc5Base gc5Base.wig
    #   29 second load time

    #   verify index is correct:
    hgsql mm8 -e "show index from gc5Base;"
    #   should see good numbers in Cardinality column

#############################################################################
# DISTRIBUTE SEQUENCE TO INTERMEDIATE SERVERS FOR KLUSTER RUNS
#   (DONE - 2006-02-14 - Hiram)
    ssh kkstore01
    cd /cluster/data/mm8

    #   break up into 500,000 sized chunks for repeat masker runs
    TOP=`pwd`
    export TOP
    for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
        ??/chr??_random_[0-9]*
    do
        ctg=`basename ${CTG_DIR}`
        cd ${CTG_DIR}
        faSplit size ${ctg}.fa 500000 ${ctg}_ -lift=${ctg}.lft -maxN=500000
        cd ${TOP}
    done > tmp/ctg_split.out 2>&1
    #   about 3 minutes, check the tmp/ctg_split.out for anything unusual

    #   make a list of the contigs
    #   (one path per split piece, relative to /cluster/data/mm8, so it
    #   can be given to rsync --files-from below)
    TOP=`pwd`
    export TOP
    for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
        ??/chr??_random_[0-9]*
    do
        ctg=`basename ${CTG_DIR}`
        cd ${CTG_DIR}
        ls ${ctg}_* | while read F
        do
            echo ${CTG_DIR}/${F}
        done
        cd ${TOP}
    done > contig500K.lst
    #   count 'em
    wc -l contig500K.lst
    #   5772 contig500K.lst

    mkdir /cluster/bluearc/scratch/hg/mm8
    mkdir /cluster/bluearc/scratch/hg/mm8/contigs
    rsync -a --progress --files-from=contig500K.lst . \
        /cluster/bluearc/scratch/hg/mm8/contigs/
    #   verify the contig copy above functioned OK
    cd /cluster/bluearc/scratch/hg/mm8
    find ./contigs -type f | wc -l
    #   5772

#############################################################################
# SIMPLE REPEAT TRACK (DONE - 2006-02-14 Hiram)
    # TRF can be run in parallel with RepeatMasker
    # since it doesn't require masked input sequence.
    ssh kkr1u00
    mkdir /iscratch/i/mm8
    cd /iscratch/i/mm8
    mkdir fa
    cd fa
    cp -p /cluster/data/mm8/?/*.fa .
    cp -p /cluster/data/mm8/??/*.fa .
    # copy /iscratch/i/mm8 to kkr2u00 through kkr8u00
    for R in 2 3 4 5 6 7 8
    do
        rsync -a --progress /iscratch/i/mm8/ kkr${R}u00:/iscratch/i/mm8/
    done

    ssh kki
    mkdir /cluster/data/mm8/bed/simpleRepeat
    cd /cluster/data/mm8/bed/simpleRepeat
    mkdir trf
    # runTrf <fa file name> <output bed path>: copies the fasta file to
    # /scratch/tmp, runs trfBig there, and copies the bed result back.
    # The lone "#" after the csh shebang is an empty comment line; the
    # "set path1" line is live code (the script later runs "cp $path1").
    cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = /iscratch/i/mm8/fa/$1
set inputFN = $1
set outpath = $2
set outputFN = $2:t
mkdir -p /scratch/tmp/$outputFN
cp $path1 /scratch/tmp/$outputFN
pushd .
cd /scratch/tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/scratch/tmp
popd
rm -f $outpath
cp -p /scratch/tmp/$outputFN/$outputFN $outpath
rm -fr /scratch/tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /scratch/tmp/$outputFN
'_EOF_'
    # << happy emacs
    chmod +x runTrf

    cat << '_EOF_' > template
#LOOP
./runTrf $(path1) {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << keep emacs coloring happy

    ls -1S /iscratch/i/mm8/fa > genome.lst
    gensub2 genome.lst single template jobList
    para create jobList
    para try ... check ... push ... etc
    para time
# Completed: 34 of 34 jobs
# CPU time in finished jobs:      14385s     239.75m     4.00h    0.17d  0.000 y
# IO & Wait Time:                   794s      13.24m     0.22h    0.01d  0.000 y
# Average job time:                 446s       7.44m     0.12h    0.01d
# Longest finished job:            1437s      23.95m     0.40h    0.02d
# Submission to last job:          1685s      28.08m     0.47h    0.02d

    #   Load into the database
    ssh hgwdev
    cd /cluster/data/mm8/bed/simpleRepeat
    cat trf/chr*.bed > simpleRepeat.bed
    # The table definition is taken from $HOME/kent/src/hg/lib, the same
    # source-tree location used for the other .sql files in this doc
    # (the original notes recorded $HOME/src/hg/lib/simpleRepeat.sql).
    hgLoadBed -strict mm8 simpleRepeat simpleRepeat.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    #   Loaded 1141941 elements of size 16
    featureBits mm8 simpleRepeat
    #   77752377 bases of 2567283971 (3.029%) in intersection
    featureBits mm7 simpleRepeat
    #   77021175 bases of 2583394090 (2.981%) in intersection
    featureBits mm6 simpleRepeat
    #   83220723 bases of 2597150411 (3.204%) in intersection
    featureBits mm5 simpleRepeat
    #   81414259 bases of 2615483787 (3.113%) in intersection
    featureBits mm4 simpleRepeat
    #   82600648 bases of 2627444668 (3.144%) in intersection
    featureBits mm3 simpleRepeat
    #   75457193 bases of 2505900260 (3.011%) in intersection

###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
    ssh hgwdev
    cd /cluster/data/mm8/bed
    mkdir microsat
    cd microsat
    # keep simpleRepeat rows where field 5 is 2 or 3, field 6 >= 15,
    # field 8 is 100 and field 9 is 0; print fields 1-3 plus a name built
    # as "<field 6 as integer>x<field 16>"
    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
        ../simpleRepeat/simpleRepeat.bed > microsat.bed
# load microsat.bed into the mm8 microsat table
    /cluster/bin/i386/hgLoadBed mm8 microsat microsat.bed

#############################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2006-02-14 - Hiram)
    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kkstore01
    cd /cluster/data/mm8/bed/simpleRepeat
    mkdir trfMask
    for F in trf/chr*.bed
    do
        # ${F/trf\//} is ${F} with the leading "trf/" removed
        echo "${F} -> ${F/trf\//}"
        awk '{if ($5 <= 12) print;}' ${F} > trfMask/${F/trf\//}
    done

#############################################################################
# REPEATMASKER RUN (after contigs have been distributed to bluearc FS)
#   (DONE - 2006-02-14 - 2006-02-15 - Hiram)
    #   Record RM version used:
    cat /cluster/bluearc/RepeatMasker060120/Libraries/version
    #   RM database version 20060120

    ssh pk
    #- Make the run directory and job list:
    mkdir /cluster/data/mm8/RMRun
    cd /cluster/data/mm8/RMRun
    # RMMouse <contig path as listed in contig500K.lst>: copies one split
    # contig to /scratch/tmp, runs RepeatMasker on it there, and copies
    # the .out (and .align/.tbl/.cat when present) back to the contig's
    # directory under /cluster/data/mm8
    cat << '_EOF_' > ../jkStuff/RMMouse
#!/bin/csh -fe

set C = $1:h
set F = $1:t
set R = $F:r
cd /cluster/data/mm8/$C
/bin/mkdir -p /scratch/tmp/mm8/$R
/bin/cp /cluster/bluearc/scratch/hg/mm8/contigs/$1 /scratch/tmp/mm8/$R
pushd /scratch/tmp/mm8/$R
/cluster/bluearc/RepeatMasker060120/RepeatMasker -ali -s -species mus $F
popd
/bin/cp /scratch/tmp/mm8/$R/$R.fa.out ./
if (-e /scratch/tmp/mm8/$R/$R.fa.align) /bin/cp /scratch/tmp/mm8/$R/$R.fa.align ./
if (-e /scratch/tmp/mm8/$R/$R.fa.tbl) /bin/cp /scratch/tmp/mm8/$R/$R.fa.tbl ./
if (-e /scratch/tmp/mm8/$R/$R.fa.cat) /bin/cp /scratch/tmp/mm8/$R/$R.fa.cat ./
/bin/rm -fr /scratch/tmp/mm8/$R/*
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/mm8/$R
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/mm8
'_EOF_'
    # << happy emacs
    chmod +x ../jkStuff/RMMouse

    cat << '_EOF_' > template
#LOOP
../jkStuff/RMMouse $(path1) {check out line ../$(dir1)/$(root1).fa.out}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 ../contig500K.lst single template jobList
    para create jobList
    wc -l jobList
    # 5772 jobList
    para try ... check ... push ... etc
    # NOTE(review): the summary below reports 6172 jobs while jobList has
    # 5772 lines - the recorded numbers do not agree; confirm which is right
# Completed: 6172 of 6172 jobs
# CPU time in finished jobs:   26381042s  439684.03m  7328.07h  305.34d  0.837 y
# IO & Wait Time:                 46088s     768.13m    12.80h    0.53d  0.001 y
# Average job time:                4282s      71.36m     1.19h    0.05d
# Longest finished job:            6370s     106.17m     1.77h    0.07d
# Submission to last job:        127318s    2121.97m    35.37h    1.47d

    #- Lift up the split-contig .out's to contig-level .out's
    ssh kkstore01
    cd /cluster/data/mm8
    for D in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
        ??/chr??_random_[0-9]*
    do
        CONTIG=`basename ${D}`
        liftUp ${D}/${CONTIG}.fa.out ${D}/${CONTIG}.lft error \
            ${D}/${CONTIG}_[0-9]*.fa.out
    done > tmp/RM.lift.outs 2>&1
    #   real    2m32.275s
    #   scan tmp/RM.lift.outs for unusual errors or difficulties

    # second lift: contig-level .out files up to chromosome level, using
    # each directory's lift/ordered.lft and, when present, lift/random.lft
    cat << '_EOF_' > jkStuff/liftRM_out_to_chr.sh
#!/bin/sh
for C in ? ??
do
    echo "lifting ${C}"
    cd ${C}
    if [ -s lift/ordered.lft ]; then
        liftUp chr${C}.fa.out lift/ordered.lft error `cat lift/oOut.lst`
    else
        echo "WARNING: Can not find ${C}/lift/ordered.lft"
    fi
    if [ -s lift/random.lft ]; then
        liftUp chr${C}_random.fa.out lift/random.lft error `cat lift/rOut.lst`
    fi
    cd ..
done
'_EOF_'
    # << happy emacs
    chmod +x jkStuff/liftRM_out_to_chr.sh
    ./jkStuff/liftRM_out_to_chr.sh > tmp/liftRM_out_to_chr.out 2>&1
    #   real    0m24.873s
    #   scan the results tmp/liftRM_out_to_chr.out
    #   there is a single: WARNING: Can not find Un/lift/ordered.lft
    #   which is OK

    #   List the final .out files, nothing should be size 0:
    ls -og */*.fa.out | sort -k3,3nr

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/mm8
    hgLoadOut mm8 ?/chr?.fa.out ??/chr??.fa.out ?/chr?_random.fa.out \
        ??/chr??_random.fa.out > tmp/hgLoadOut.out 2>&1
    #   about 7 minutes, there are always a few of these errors:

    #   verify everything seems normal compared with previous builds
    featureBits mm8 rmsk
    #   1087735582 bases of 2567283971 (42.369%) in intersection
    featureBits mm7 rmsk
    #   1092611581 bases of 2583394090 (42.294%) in intersection
    featureBits mm6 rmsk
    #   1110222842 bases of 2597150411 (42.748%) in intersection
    featureBits mm5 rmsk
    #   1137310280 bases of 2615483787 (43.484%) in intersection
    featureBits mm4 rmsk
    #   1130883581 bases of 2627444668 (43.041%) in intersection
    featureBits mm3 rmsk
    #   1080265553 bases of 2505900260 (43.109%) in intersection

#############################################################################
# PROCESS REPEAT MASKER AND SIMPLE REPEATS INTO MASKED SEQUENCE
#   (DONE - 2006-02-16 - Hiram)
    ssh kkstore01
    cd /cluster/data/mm8
    # each chrom .fa is rewritten in place twice: maskOutFa -soft with its
    # RepeatMasker .out, then -softAdd with its trfMask bed file
    time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
        # FA = file name without its directory, C = FA without ".fa"
        FA=${CHR#*\/}
        C=${FA%.fa}
        echo -n "repeat masking ${C} ... "
        /cluster/bin/i386/maskOutFa -soft ${CHR} ${CHR}.out ${CHR}
        echo -n "adding simpleRepeats ... "
        /cluster/bin/i386/maskOutFa -softAdd ${CHR} \
            bed/simpleRepeat/trfMask/${C}.bed ${CHR}
        echo "done - ${CHR}"
    done > tmp/addRM_and_Simple.out 2>&1
    #   about 4 minutes
    # you will note the usual warnings about troublesome coordinates
    # in the repeat masker outputs - even more than when they were lifted.
# and make the hard masked sequences from these soft masked sequences
    time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
        echo "maskOutFa ${CHR} hard ${CHR}.masked"
        /cluster/bin/i386/maskOutFa ${CHR} hard ${CHR}.masked
    done > tmp/hardMask.out 2>&1
    #   about 2 minutes

    #   rebuild the 2bit file (from the now soft-masked chrom .fa files)
    time faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
        mm8Soft.2bit
    #   2 minutes
    #   verify the sequence is still the same size as before:
    #   (the checksum of the sorted twoBitInfo listing must equal the
    #   checksum of the existing chrom.sizes)
    twoBitInfo mm8Soft.2bit stdout | sort -rn +1 | sum -r
    #   20673     1
    sum -r chrom.sizes
    #   20673     1
    #   Let's see how much is masked:
    time twoBitToFa mm8Soft.2bit stdout | faSize stdin
    #   2664455088 bases (97171400 N's 2567283688 real 1477933003 upper
    #   1089350685 lower) in 34 sequences in 1 files
    #   and bc says:
    #   1089350685/2664455088 = .408845
    #   1089350685/2567283688 = .424320

    #   replace the former unmasked 2bit file with this new one:
    rm mm8.2bit; mv mm8Soft.2bit mm8.2bit

    #   check the browser, make sure it is functioning OK

    #   Generate fasta file for random contigs
    #   THIS IS OPTIONAL STUFF, not really needed, well, it is used in
    #   genscan to make the gene names there look pretty.  This script
    #   has been checked into the source tree in hg/utils/lft2BitToFa.pl
    #   use it from there next time
    cp -p /cluster/data/mm7/jkStuff/lft2BitToFa.pl ./jkStuff
    mkdir randomContigs
    for L in ?/lift/ctg_random.lft ??/lift/ctg_random.lft
    do
        # D is the directory name: ${L} with "/lift..." removed
        D=${L/\/lift*}
        echo $L $D
        ./jkStuff/lft2BitToFa.pl mm8.2bit ${L} \
            > randomContigs/chr${D}_random.ctg.fa
    done
    #
    #   Verify these *.ctg.fa files have the same bases as the ordinary
    #   chr*_random.fa files:
    faSize ?/chr?_random.fa ??/chr??_random.fa
    #   20361100 bases (3250000 N's 17111100 real 7094373 upper 10016727 lower)
    #   in 12 sequences in 12 files
    faSize randomContigs/*.ctg.fa
    #   17111100 bases (0 N's 17111100 real 7094373 upper 10016727 lower)
    #   in 77 sequences in 12 files
    #   Note the number of real, upper and lower bases are the same

    #   This random contig business isn't actually needed
    #   Create a 2bit file with the full chrom sequences and these
    #   random contigs for use in blastz:
    # faToTwoBit ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \
    #   mm8Chroms_RandomContigs.2bit
    #   Copy to bluearc unit for kluster runs
    # cp -p mm8.2bit /cluster/bluearc/mm8
    # cp -p mm8Chroms_RandomContigs.2bit /cluster/bluearc/mm8
    #   And the lift file to go with it
    # cat ?/lift/ctg_random.lft ??/lift/ctg_random.lft \
    #   > jkStuff/Chroms_RandomContigs.lft
    # cp -p jkStuff/Chroms_RandomContigs.lft /cluster/bluearc/mm8

    #   create full chrom nibs for blastz SEQ1 target with Lin Spec Repeats
    mkdir nib
    for FA in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
        # B = file name without directory and without ".fa"
        B=${FA/*\/}
        B=${B/.fa/}
        echo faToNib -softMask ${FA} nib/${B}.nib
        rm -f nib/${B}.nib
        faToNib -softMask ${FA} nib/${B}.nib
    done
    mkdir /cluster/bluearc/scratch/hg/mm8/nib
    cp -p nib/*.nib /cluster/bluearc/scratch/hg/mm8/nib
    cp -p chrom.sizes /cluster/bluearc/scratch/hg/mm8
    cp -p mm8.2bit /cluster/bluearc/scratch/hg/mm8
    #   The contigs over there are no longer needed
    rm -fr /cluster/bluearc/scratch/hg/mm8/contigs

    #   after lineage specific repeats are created below, this business
    #   can be pushed to the kluster kk nodes and over to the Iservers

#############################################################################
# PREPARE "bigZips" files for public release
#   (DONE 2006-02-16 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/mm8/downloads
    mkdir /cluster/data/mm8/downloads/bigZips
    mkdir /cluster/data/mm8/downloads/chromosomes
    cd /cluster/data/mm8/downloads/chromosomes
    # copy every chrom .fa into this directory; the trailing "." is the
    # destination (the notes had it fused onto the last source pattern as
    # "chr??_random.fa.", which leaves cp without a destination directory)
    cp -p ../../?/chr?.fa ../../??/chr??.fa \
        ../../?/chr?_random.fa ../../??/chr??_random.fa .
    gzip chr*.fa
    #   12 minutes
    #   copy previous release README.txt
    #   (taken from mm7: goldenPath/mm8 is only created further down in
    #   this doc, as symlinks to these downloads directories, so the mm8
    #   path recorded in the notes could not be the previous release)
    scp hgwdev:/usr/local/apache/htdocs/goldenPath/mm7/chromosomes/README.txt .
    #   edit it to bring it up to date

    cd /cluster/data/mm8/downloads/bigZips
    #   copy previous release README.txt (from mm7, see the note above)
    scp hgwdev:/usr/local/apache/htdocs/goldenPath/mm7/bigZips/README.txt .
    #   edit README.txt to indicate proper version of sequence and
    #   RepeatMasker

    cd /cluster/data/mm8
    cp -p ?/chr*.fa ??/chr*.fa downloads/chromosomes
    tar cvzf downloads/bigZips/chromAgp.tar.gz ?/chr*.agp ??/chr*.agp
    tar cvzf downloads/bigZips/chromFa.tar.gz ?/chr*.fa ??/chr*.fa
    #   12 minutes
    tar cvzf downloads/bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked \
        ??/chr*.fa.masked
    tar cvzf downloads/bigZips/chromOut.tar.gz ?/chr*.fa.out ??/chr*.fa.out
    cd /cluster/data/mm8/bed/simpleRepeat
    tar cvzf ../../downloads/bigZips/chromTrf.tar.gz ./trfMask

    # get GenBank native mRNAs and refGene (DONE 2006-02-23)
    #   after the genbank run was complete
    ssh hgwdev
    cd /cluster/data/genbank
    time ./bin/i386/gbGetSeqs -db=mm8 -native GenBank mrna \
        /cluster/data/mm8/downloads/bigZips/mrna.fa
    #   2 minutes
    cd /cluster/data/mm8/downloads/bigZips
    gzip mrna.fa
    cd /cluster/data/mm8/downloads/bigZips
    # write upstream1000/2000/5000.fa.gz from the refGene track
    for I in 1000 2000 5000
    do
        echo "upstream${I} working ... "
        featureBits mm8 refGene:upstream:${I} -fa=stdout \
            | gzip -c > upstream${I}.fa.gz
        echo "upstream${I} done"
    done
    #   real    11m25.493s

    ssh kkstore01
    cd /cluster/data/mm8/downloads/bigZips
    cp -p ../../mm8.2bit .
md5sum *.gz *.2bit README.txt > md5sum.txt ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/mm8 ln -s /cluster/data/mm8/downloads/bigZips \ /usr/local/apache/htdocs/goldenPath/mm8/bigZips ln -s /cluster/data/mm8/downloads/chromosomes \ /usr/local/apache/htdocs/goldenPath/mm8/chromosomes ############################################################################# # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2006-02-16 - Hiram) ssh kkr1u00 mkdir /iscratch/i/mm8/rmsk cd /cluster/data/mm8 cp -p */chr*.fa.out /iscratch/i/mm8/rmsk cd /iscratch/i/mm8 for R in 2 3 4 5 6 7 8 do rsync -a --progress /iscratch/i/mm8/ kkr${R}u00:/iscratch/i/mm8/ done cd rmsk ssh kki mkdir /cluster/data/mm8/linSpecRep cd /cluster/data/mm8/linSpecRep ls -1S /iscratch/i/mm8/rmsk > fa.list cat << '_EOF_' > mkLSR.csh #!/bin/csh -fe pushd /iscratch/i/mm8/rmsk /cluster/bluearc/RepeatMasker060120/DateRepeats \ $1 -query mouse -comp human -comp rat -comp dog -comp cow \ -comp rabbit popd /bin/cp -p /iscratch/i/mm8/rmsk/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus_oryctolagus-cuniculus . '_EOF_' # << happy emacs chmod +x mkLSR.csh cat << '_EOF_' > template #LOOP ./mkLSR.csh $(path1) {check out line+ $(path1)_homo-sapiens_rattus_canis-familiaris_bos-taurus_oryctolagus-cuniculus} #ENDLOOP '_EOF_' # << happy emacs gensub2 fa.list single template jobList para try ... check ... push ... etc... 
# timing summary for the lineage-specific repeat (DateRepeats) cluster run
    para time
# Completed: 34 of 34 jobs
# CPU time in finished jobs:       1338s      22.29m     0.37h    0.02d  0.000 y
# IO & Wait Time:                   112s       1.87m     0.03h    0.00d  0.000 y
# Average job time:                  43s       0.71m     0.01h    0.00d
# Longest finished job:              92s       1.53m     0.03h    0.00d
# Submission to last job:           181s       3.02m     0.05h    0.00d

    ssh kkstore01
    cd /cluster/data/mm8/linSpecRep
    mkdir notInHuman notInRat notInDog notInCow notInRabbit
    # The number given to extractRepeats selects one comparison species,
    # in the order they were passed to DateRepeats with -comp:
    #   1 human, 2 rat, 3 dog, 4 cow, 5 rabbit
    for F in chr*.out_homo-sapiens*
    do
        # B = chrom name: ${F} with ".fa.out" and everything after it removed
        B=${F/.fa.out*/}
        echo $B
        /cluster/bin/scripts/extractRepeats 1 ${F} > \
            notInHuman/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 2 ${F} > \
            notInRat/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 3 ${F} > \
            notInDog/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 4 ${F} > \
            notInCow/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 5 ${F} > \
            notInRabbit/${B}.out.spec
    done
    #   NOTE: as originally run, the rabbit extraction used column 4 (the
    #   cow column) by mistake and was flagged with XXXXX markers in these
    #   notes; it is corrected to column 5 above.
    #   This wasn't a problem for this build, as we're not using rabbit
    #   anyway (see below)

    #   In the original run the notInHuman, notInDog, notInCow and
    #   notInRabbit files ended up being identical (notInRabbit was
    #   extracted from the cow column, so it necessarily matched notInCow).
    #   Only the notInRat was different than them
    #   To check identical
    find . -name "*.out.spec" | \
        while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
        | sort -k1,1n | sort -t"/" -k3,3
    #   Copy to scratch/hg for use in kluster runs
    #   (the notInHuman files are published as notInOthers)
    mkdir /cluster/bluearc/scratch/hg/mm8/linSpecRep
    mkdir /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInRat
    mkdir /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
    cp -p notInHuman/* /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
    cp -p notInRat/* /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInRat

    # Request this scratch/hg/mm8 directory push to the kk nodes
    #   and we can do the Iservers simply:
    ssh kkr1u00
    cd /iscratch/i/mm8
    #   no longer need these two directories
    rm -fr fa rmsk
    rsync -a --progress /cluster/bluearc/scratch/hg/mm8/ .
# copy /iscratch/i/mm8 from this host to kkr2u00 through kkr8u00
    for R in 2 3 4 5 6 7 8
    do
        rsync -a --progress /iscratch/i/mm8/ kkr${R}u00:/iscratch/i/mm8/
    done

############################################################################
#  BLATSERVERS ENTRY (DONE - 2006-02-16 - Hiram)
#   After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    # two rows for mm8 on blat17: port 17784 (isTrans=1, canPcr=0) and
    # port 17785 (isTrans=0, canPcr=1)
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("mm8", "blat17", "17784", "1", "0"); \
        INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("mm8", "blat17", "17785", "0", "1");' \
            hgcentraltest
    #   test it with some sequence

#########################################################################
# CPGISLANDS (DONE - 2006-02-16 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/mm8/bed/cpgIsland
    cd /cluster/data/mm8/bed/cpgIsland

    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    #   gcc readseq.c cpg_lh.c -o cpglh.exe
    cd ../..
    ln -s hg3rdParty/cpgIslands/cpglh.exe .

    # cpglh.exe requires hard-masked (N) .fa's.
    # There may be warnings about "bad character" for IUPAC ambiguous
    # characters like R, S, etc.  Ignore the warnings.
ssh kkstore01
cd /cluster/data/mm8/bed/cpgIsland
# Run cpglh.exe over every hard-masked chromosome.  The echoed command
# lines and any warnings from cpglh.exe are collected in cpglh.out.
for F in ../../*/chr*.fa.masked; do
    FA=${F##*/}
    C=${FA%.fa.masked}
    # log the command exactly as it is run (full path, not just the basename)
    echo "./cpglh.exe ${F} > ${C}.cpg"
    ./cpglh.exe "${F}" > "${C}.cpg"
done > cpglh.out 2>&1 &
#	about 3 minutes 20 seconds
#	Several chroms have 0 results:
# -rw-rw-r--  1       0 Feb 16 15:19 chr10_random.cpg
# -rw-rw-r--  1       0 Feb 16 15:20 chr15_random.cpg
# -rw-rw-r--  1       0 Feb 16 15:22 chr8_random.cpg
# -rw-rw-r--  1       0 Feb 16 15:22 chr9_random.cpg
# -rw-rw-r--  1       0 Feb 16 15:22 chrM.cpg
# -rw-rw-r--  1       0 Feb 16 15:22 chrX_random.cpg
# -rw-rw-r--  1       0 Feb 16 15:22 chrY.cpg

# Transform cpglh output to bed +
# (the start coordinate is shifted down by one; width, CpG count and
# percentages are derived from the cpglh columns)
# NOTE: with a quoted opener (<< '_EOF_') bash expects the bare word
# _EOF_ as the terminator line; a quoted terminator never matches.
cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
_EOF_
# << happy emacs
awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed

ssh hgwdev
cd /cluster/data/mm8/bed/cpgIsland
hgLoadBed -strict mm8 cpgIslandExt -tab -noBin \
    -sqlTable="${HOME}/kent/src/hg/lib/cpgIslandExt.sql" cpgIsland.bed
#	Reading cpgIsland.bed
#	Loaded 15963 elements of size 10
featureBits mm8 cpgIslandExt
#	10456823 bases of 2567283971 (0.407%) in intersection
featureBits mm7 cpgIslandExt
#	10439328 bases of 2583394090 (0.404%) in intersection
featureBits mm6 cpgIslandExt
#	10432360 bases of 2597150411 (0.402%) in intersection
featureBits mm5 cpgIslandExt
#	10422989 bases of 2615483787 (0.399%) in intersection
featureBits mm4 cpgIsland
#	11109692 bases of 2627444668 (0.423%) in intersection
featureBits mm3 cpgIsland
#	10102968 bases of 2505900260 (0.403%) in intersection

#########################################################################
# ANDY LAW CPGISLANDS (DONE - 2006-02-16 - Hiram)
# See notes in makeGalGal2.doc and makeCanFam2.doc
ssh kkstore01
mkdir /cluster/data/mm8/bed/cpgIslandGgfAndy
cd /cluster/data/mm8/bed/cpgIslandGgfAndy
# Build the preProcGgfAndy program in
# kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE
# Use masked sequence since this is a mammal...
# The perl step rewrites each island line as 10 tab-separated bed columns;
# the chromosome name is spliced into the perl program from the shell
# variable C.
for F in ../../*/chr*.fa.masked; do
    FA=${F##*/}
    C=${FA%.fa.masked}
    echo preproc and run on masked "${C} ${F}" >&2
    ~/bin/"${MACHTYPE}"/preProcGgfAndy "${F}" \
    | /cluster/home/angie/ggf-andy-cpg-island.pl \
    | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g1,$oE) = split("\t"); $s--;
                 $gc=$c+$g1;  $pCpG=(100.0 * 2 * $cpg / $n);
                 $pGc=(100.0 * $gc / $n);
                 $_="'"${C}"'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" .
                     "$pCpG\t$pGc\t$oE\n";'
done | sort -k1,1 -k2,2n > cpgIslandGgfAndyMasked.bed

# load into database:
ssh hgwdev
cd /cluster/data/mm8/bed/cpgIslandGgfAndy
# reuse the cpgIslandExt table definition under the new table name
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
    "${HOME}/kent/src/hg/lib/cpgIslandExt.sql" > cpgIslandGgfAndyMasked.sql
hgLoadBed -strict mm8 cpgIslandGgfAndyMasked -tab -noBin \
    -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
#	Loaded 67442 elements of size 10
featureBits mm8 cpgIslandExt
#	10456823 bases of 2567283971 (0.407%) in intersection
featureBits mm7 cpgIslandExt
#	10439328 bases of 2583394090 (0.404%) in intersection

featureBits mm8 cpgIslandGgfAndyMasked
#	38850121 bases of 2567283971 (1.513%) in intersection
featureBits mm7 cpgIslandGgfAndyMasked
#	38774242 bases of 2583394090 (1.501%) in intersection

wc -l ../cpgIsland/cpgIsland.bed *bed
#	15963 ../cpgIsland/cpgIsland.bed
#	67442 cpgIslandGgfAndyMasked.bed

#########################################################################
# BLASTZ HUMAN Hg18 (DONE - 2006-02-16 - 2006-02-18 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastzHg18.2006-02-16
cd /cluster/data/mm8/bed
ln -s blastzHg18.2006-02-16 blastz.hg18
cd blastzHg18.2006-02-16
#	Started this before the rsync to /scratch/hg/mm8/ had completed,
#	hence the /cluster/bluearc/scratch/hg/mm8/ location is used
#	here.
cat << '_EOF_' > DEF
# mouse vs human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm8
SEQ1_DIR=/cluster/bluearc/scratch/hg/mm8/nib
SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_LEN=/cluster/bluearc/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Human Hg18 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/scratch/hg/hg18/nib
SEQ2_SMSK=/scratch/hg/hg18/linSpecRep/notInMouse
SEQ2_LEN=/scratch/hg/hg18/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzHg18.2006-02-16
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
# (terminator above is the bare word: bash strips the quotes from the
# opener's delimiter, so a quoted terminator line would never match)

# establish a screen to control this job
screen
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    "$(pwd)/DEF" > blastz.out 2>&1 &
#	Started 2006-02-16 16:15
#	failed due to pk node difficulties, finish the run.blastz
#	manually
# Completed: 3724 of 3724 jobs
# CPU time in finished jobs:    5190293s   86504.89m  1441.75h   60.07d  0.165 y
# IO & Wait Time:                259150s    4319.16m    71.99h    3.00d  0.008 y
# Average job time:                1463s      24.39m     0.41h    0.02d
# Longest finished job:           10621s     177.02m     2.95h    0.12d
# Submission to last job:         74153s    1235.88m    20.60h    0.86d

#	continuing (resume at the "cat" step after the manual batch finish)
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -continue=cat "$(pwd)/DEF" > cat.out 2>&1 &
#	Done 2006-02-17 15:02

#	Then to swap over to Hg18
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    "$(pwd)/DEF" > swap.out 2>&1 &
#	Started 2006-02-17 15:30

ssh hgwdev
time nice -n +19 featureBits mm8 chainHg18Link
#	984380268 bases of 2567283971 (38.343%) in intersection
time nice -n +19 featureBits hg18 chainMm8Link
#	994530182 bases of 2881515245 (34.514%) in intersection
#########################################################################
# BLASTZ RAT Rn4 (DONE - 2006-02-16 - 2006-02-18 - Hiram)
# Stage the rn4 sequence on kkr1u00, then mirror it to kkr2u00 .. kkr8u00
ssh kkr1u00
cd /iscratch/i/rn4
rsync -a --progress /cluster/data/rn4/linSpecRep.notInMouse/ \
    ./linSpecRep.notInMouse
rsync -a --progress /cluster/data/rn4/nib/ ./nib/
cp -p /cluster/data/rn4/chrom.sizes .

for R in 2 3 4 5 6 7 8; do
    rsync -a --progress /iscratch/i/rn4/ "kkr${R}u00:/iscratch/i/rn4/"
done

ssh kk
mkdir /cluster/data/mm8/bed/blastzRn4.2006-02-16
cd /cluster/data/mm8/bed
ln -s blastzRn4.2006-02-16 blastz.rn4
cd blastzRn4.2006-02-16
#	Started this before the rsync to /scratch/hg/mm8/ had completed,
#	hence the /cluster/bluearc/scratch/hg/mm8/ location is used
#	here.
cat << '_EOF_' > DEF
# mouse vs rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm8
SEQ1_DIR=/cluster/bluearc/scratch/hg/mm8/nib
SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInRat
SEQ1_LEN=/cluster/bluearc/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/iscratch/i/rn4/nib
SEQ2_SMSK=/iscratch/i/rn4/linSpecRep.notInMouse
SEQ2_LEN=/iscratch/i/rn4/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzRn4.2006-02-16
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
# (bare terminator: bash matches the delimiter with its quotes removed)

# establish a screen to control this job
screen
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
    "$(pwd)/DEF" > blastz.out 2>&1 &
#	Started 2006-02-16 16:15
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
    -continue=cat "$(pwd)/DEF" > cat.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
    -swap "$(pwd)/DEF" > swap.out 2>&1 &

time nice -n +19 featureBits mm8 chainRn4Link
#	1770319811 bases of 2567283971 (68.957%) in intersection
time nice -n +19 featureBits rn4 chainMm8Link
#	1791093685 bases of 2571531505 (69.651%) in intersection

##############################################################################
# CLONE ENDS - BACEND TRACK (DONE - 2006-02-17 - Hiram)
ssh kkstore01
cd /cluster/data/mm8
#	check disk space: 73Gb free
df -h .
#	Filesystem            Size  Used Avail Use% Mounted on
#	/export/cluster/store5
#	                      1.5T  1.3T   73G  95% /cluster/store5
mkdir -p bed/cloneend/ncbi
cd bed/cloneend/ncbi

# quote the URL so the '*' is handed to wget instead of being subject to
# pathname expansion by the local shell
wget --timestamping \
    'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/mus_musculus/*'

cd /cluster/data/mm8/bed/cloneend
# seems like the *.mfa files were split just for convenience
# concatenate
for F in ncbi/*.mfa.gz; do
    zcat "${F}"
done | gzip > all.mfa.gz

# Convert the title line of the all.mfa file
# (convert.pl keeps sequence lines as-is and replaces each ">" header with
# ">name", where name is the field following "gb" or "dbj" with its
# ".version" suffix dropped; it dies on a header with neither tag)
cat << '_EOF_' > convert.pl
#!/usr/bin/env perl

use strict;
use warnings;

while (my $line = <>)
{
if ($line !~ m/^>/) {
    print $line
} else {
    my @fields = split('\|', $line);
    my $fieldCount = scalar(@fields);
    my $printed = 0;
    for (my $i = 0; $i < $fieldCount; $i++) {
        if ($fields[$i] eq "gb" || $fields[$i] eq "dbj") {
            (my $name, my $vers) = split(/\./,$fields[$i+1]);
            print ">$name\n";
            $i= $fieldCount;
            $printed = 1;
        }
    }
    if (!$printed) {
        die("Failed for $line\n");
    }
}
}
_EOF_
# << happy emacs
chmod +x convert.pl

zcat all.mfa.gz | ./convert.pl | gzip > cloneEnds.fa.gz

#	make sure nothing got broken:
faSize all.mfa.gz
#	498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
#	lower) in 789466 sequences in 1 files

faSize cloneEnds.fa.gz
#	498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
#	lower) in 789466 sequences in 1 files
#	identical numbers, curiously, these are exactly the same numbers
#	as were seen during the build of Mm7.  Do these things not
#	change with time ?
# concatenate the text files, too
for F in ncbi/*.txt.gz; do
    zcat "${F}"
done | gzip > all.txt.gz

# generate cloneEndPairs.txt and cloneEndSingles.txt
cp -p /cluster/data/mm7/bed/cloneend/convertTxt.pl .
zcat all.txt.gz | ./convertTxt.pl stdin
#	Reading in end info
#	Writing out pair info
#	Writing out singleton info
#	354485 pairs and 78423 singles

#	faSplit does not function correctly if given a .gz source file
#	AND, we need the unzipped file for sequence loading below
gunzip cloneEnds.fa.gz
# split
mkdir splitdir
cd splitdir
faSplit sequence ../cloneEnds.fa 100 cloneEnds
#	Check to ensure no breakage:
cat *.fa | faSize stdin
#	498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
#	lower) in 789466 sequences in 1 files
#	same numbers as before

#	Copy to san for cluster runs
mkdir /san/sanvol1/scratch/mm8/cloneEnds
cp -p *.fa /san/sanvol1/scratch/mm8/cloneEnds
# "./" keeps a file name starting with "-" from being read as an option
rm ./*
cd ..
rmdir splitdir
#	may as well remove the previous assembly copy:
rm -fr /san/sanvol1/scratch/mm7/cloneEnds

# load sequences
ssh hgwdev
mkdir /gbdb/mm8/cloneend
cd /gbdb/mm8/cloneend
ln -s /cluster/data/mm8/bed/cloneend/cloneEnds.fa .
cd /tmp
hgLoadSeq mm8 /gbdb/mm8/cloneend/cloneEnds.fa
#	Advisory lock created
#	Creating .tab file
#	Adding /gbdb/mm8/cloneend/cloneEnds.fa
#	789466 sequences
#	Updating seq table
#	Advisory lock has been released
#	All done

############################################################################
# BACEND SEQUENCE ALIGNMENTS (DONE - 2006-02-17 - 2006-02-22 - Hiram)
ssh kkstore01
mkdir /cluster/data/mm8/noMask
cd /cluster/data/mm8/

#	Need an unmasked sequence for this work
#	Copy the header line unchanged and upper-case everything after it.
for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa; do
    # a pattern that matches nothing is left as literal text; skip it
    [ -e "${CHR}" ] || continue
    C=${CHR##*/}
    printf 'working %s ... ' "${C}"
    head -n 1 "${CHR}" > "noMask/${C}"
    # "tail +2" is the obsolete spelling; the character classes are quoted
    # so the shell cannot treat them as glob patterns
    tail -n +2 "${CHR}" | tr '[:lower:]' '[:upper:]' >> "noMask/${C}"
    echo "done"
done

mkdir /san/sanvol1/scratch/mm8/noMask
time cp --verbose -p noMask/chr*.fa /san/sanvol1/scratch/mm8/noMask

#	Size of mouse non-gap genome: 2567283971
#	Size of Hg18 non-gap genome: 2881515245
#	Adjusting the 1024 number from typical human ooc generation:
#	1024 * (2567283971 / 2881515245) = 912
time blat mm8.2bit \
    /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=912
#	Wrote 29643 overused 11-mers to 11.ooc
#	real    2m13.206

#	Copy over to the san
cp -p 11.ooc /san/sanvol1/scratch/mm8

#	and for the kluster run
ssh pk
mkdir /cluster/data/mm8/bed/bacends
cd /cluster/data/mm8/bed/bacends
mkdir out

#	allow blat to run politely in /tmp while it writes output, then
#	copy results to results file:
#	(runBlat.sh uses pushd/popd, which are bash builtins, hence the
#	bash shebang; the here-doc terminator is the bare word _EOF_)
cat << '_EOF_' > runBlat.sh
#!/bin/bash
root1=$1
root2=$2
result=$3
rm -fr "/scratch/tmp/${root1}_${root2}"
mkdir "/scratch/tmp/${root1}_${root2}"
pushd "/scratch/tmp/${root1}_${root2}"
/cluster/bin/x86_64/blat "/san/sanvol1/scratch/mm8/noMask/${root1}.fa" \
	"/san/sanvol1/scratch/mm8/cloneEnds/${root2}.fa" \
	-ooc=/san/sanvol1/scratch/mm8/11.ooc "${root1}.${root2}.psl"
popd
mkdir -p "out/${root2}"
rm -f "${result}"
mv "/scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl" "${result}"
rm -fr "/scratch/tmp/${root1}_${root2}"
_EOF_
# << happy emacs
chmod +x runBlat.sh

cat << '_EOF_' > template
#LOOP
./runBlat.sh $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
_EOF_
# << emacs happy

# list the inputs largest-first (ls -S sorts by size)
ls -1S /san/sanvol1/scratch/mm8/cloneEnds/cloneEnds???.fa > bacEnds.lst
ls -1S /san/sanvol1/scratch/mm8/noMask/chr*.fa > contig.lst
gensub2 contig.lst bacEnds.lst template jobList
para create jobList
#	3322 jobs written to batch
para try, check, push, etc ...
# Completed: 3332 of 3332 jobs
# CPU time in finished jobs:     649465s   10824.42m   180.41h    7.52d  0.021 y
# IO & Wait Time:                 11633s     193.88m     3.23h    0.13d  0.000 y
# Average job time:                 198s       3.31m     0.06h    0.00d
# Longest finished job:            1326s      22.10m     0.37h    0.02d
# Submission to last job:        429201s    7153.35m   119.22h    4.97d

ssh kkstore01
cd /cluster/data/mm8/bed/bacends
screen
mkdir temp
# merge the per-job psl files under out/ into a single raw.psl
time pslSort dirs raw.psl temp out/* > pslSort.out 2>&1 &
#	real    22m4.019s
#	-rw-rw-r--  1 8422362557 Feb 22 15:35 raw.psl

time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons \
    raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
#	real    6m15.981s
#	-rw-rw-r--  1  197029888 Feb 22 15:37 bacEnds.psl

#	utilize the scripts from the previous build
cp -p /cluster/data/mm7/bed/bacends/split.pl .
cp -p /cluster/data/mm7/bed/bacends/header .

time ./split.pl header < bacEnds.psl
#	real    0m26.983s
mv bacEnds.psl bacEnds.psl.save
time pslSort dirs bacEnds.psl temp split
#	real    2m19.131s
#	-rw-rw-r--  1 1227866614 Feb 22 15:48 bacEnds.psl

#	Copy files to final destination and remove
mkdir /cluster/data/mm8/bacends
cp -p bacEnds.psl /cluster/data/mm8/bacends

############################################################################
# BACEND PAIRS TRACK (DONE - 2006-02-22 - Hiram)
ssh kolossus
cd /cluster/data/mm8/bacends

time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
    -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
    -mismatch -verbose bacEnds.psl \
    ../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds
#	real    0m47.401s

# create header required by "rdb" tools
# (two tab-separated lines: column names, then the column formats)
printf 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes\n' \
    > header
printf '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10\n' >> header

# keep rows with score >= 300, sort by chr/start, then drop the header
cat header bacEnds.pairs \
    | /cluster/bin/scripts/row score ge 300 \
    | /cluster/bin/scripts/sorttbl chr start \
    | /cluster/bin/scripts/headchg -del > bacEndPairs.bed
#	-rw-rw-r--  1   23816801 Feb 22 15:52 bacEndPairs.bed

cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
    bacEnds.orphan \
    | /cluster/bin/scripts/row score ge 300 \
    | /cluster/bin/scripts/sorttbl chr start \
    | /cluster/bin/scripts/headchg -del > bacEndPairsBad.bed
#	-rw-rw-r--  1    6843775 Feb 22 15:54 bacEndPairsBad.bed

/cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
    bacEndPairsBad.bed > j1.out
/cluster/bin/scripts/sorttbl tname tstart < j1.out > j2.out
/cluster/bin/scripts/headchg -del < j2.out > bacEnds.load.psl
#	-rw-rw-r--  1  983668200 Feb 22 16:04 bacEnds.load.psl
rm j1.out j2.out

# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort -u
#	result should be the scores, no extraneous strings:
#	1000
#	300
#	375
#	500
#	750
# edit the file and fix it if it has a bad name.

# load into database
ssh hgwdev
cd /cluster/data/mm8/bacends
hgLoadBed -strict -notItemRgb mm8 bacEndPairs bacEndPairs.bed \
    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
#	Loaded 235440 elements of size 11
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed -strict -notItemRgb mm8 bacEndPairsBad bacEndPairsBad.bed \
    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
#	Loaded 95099 elements of size 11
# NOTE: truncates file to 0 if -nobin is used
time hgLoadPsl mm8 -table=all_bacends bacEnds.load.psl
#	load of all_bacends did not go as planned: 8132116 record(s), 0 row(s)
#	skipped, 1 warning(s) loading psl.tab
#	skipped, 1 warning(s) loading psl.tab
#	real    20m45.055s

featureBits mm8 all_bacends
#	327086559 bases of 2567283971 (12.741%) in intersection
featureBits mm7 all_bacends
#	334161740 bases of 2583394090 (12.935%) in intersection
featureBits mm6 all_bacends
#	336981828 bases of 2597150411 (12.975%) in intersection
featureBits mm5 all_bacends
#	268502414 bases of 2615483787 (10.266%) in intersection
featureBits mm4 all_bacends
#	243096171 bases of 2627444668 (9.252%) in intersection

featureBits mm8 bacEndPairs
#	2572527283 bases of 2567283971 (100.204%) in intersection
featureBits mm7 bacEndPairs
#	2578837424 bases of 2583394090 (99.824%) in intersection
featureBits mm6 bacEndPairs
#	2570768812 bases of 2597150411 (98.984%) in intersection
featureBits mm5 bacEndPairs
#	2567958504 bases of 2615483787 (98.183%) in intersection
featureBits mm4 bacEndPairs
#	2549945356 bases of 2627444668 (97.050%) in intersection

featureBits mm8 bacEndPairsBad
#	879222026 bases of 2567283971 (34.247%) in intersection
featureBits mm7 bacEndPairsBad
#	954662115 bases of 2583394090 (36.954%) in intersection
featureBits mm6 bacEndPairsBad
#	1006314997 bases of 2597150411 (38.747%) in intersection
featureBits mm5 bacEndPairsBad
#	541027882 bases of 2615483787 (20.686%) in intersection
featureBits mm4 bacEndPairsBad
#	1074505863 bases of 2627444668 (40.895%) in intersection

#########################################################################
# GENBANK auto update (DONE - 2006-02-17 - 2006-02-23 - Hiram)
# align with revised genbank process. drop xeno ESTs.
ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvs update -d -P etc # edit etc/genbank.conf to add mm8, it is a copy of mm7 with changes: # mm8 mm8.serverGenome = /cluster/data/mm8/mm8.2bit mm8.clusterGenome = /scratch/hg/mm8/mm8.2bit mm8.ooc = /cluster/data/mm8/11.ooc mm8.align.unplacedChroms = chrUn_random mm8.lift = /cluster/data/mm8/jkStuff/liftAll.lft mm8.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} mm8.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} mm8.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} mm8.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} mm8.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} mm8.downloadDir = mm8 mm8.refseq.mrna.xeno.load = yes mm8.refseq.mrna.xeno.loadDesc = yes mm8.mgcTables.default = full mm8.mgcTables.mgc = all # check that into CVS, then # update /cluster/data/genbank/ make etc-update ssh kkstore04 cd /cluster/data/genbank nice bin/gbAlignStep -initial mm8 & # var/build/logs/2006.02.17-16:10:17.mm8.initalign.log # the parasol batch job on kk broke down in: # /cluster/bluearc/genbank/work/initial.mm8/align # go to kk and this directory and get the batch finished nice bin/gbAlignStep -continue=finish -initial mm8 & # var/build/logs/2006.02.22-20:26:54.mm8.initalign.log # load database when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad mm8 & # var/dbload/hgwdev/logs/2006.02.23-10:21:36.dbload.log # real 228m59.734s ######################################################################### # BLASTZ rheMac2 (DONE - 2006-02-17 - Hiram) ssh pk mkdir /cluster/data/mm8/bed/blastz.rheMac2.2006-02-17 cd /cluster/data/mm8/bed ln -s blastz.rheMac2.2006-02-17 blastz.rheMac2 cd blastz.rheMac2 cat << '_EOF_' > DEF # mouse vs macaca mulatta export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin ALIGN=blastz-run 
BLASTZ=blastz.v7.x86_64

# TARGET - mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY - macaca mulatta - big enough chunk to do whole chroms at once
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/rheMac2.2bit
SEQ2_LEN=/san/sanvol1/scratch/rheMac2/rheMac2.sizes
SEQ2_CHUNK=250000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastz.rheMac2.2006-02-17
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
# (the here-doc terminator is the bare word _EOF_: bash compares against
# the delimiter with its quotes removed, so a quoted terminator never
# matches)

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    "$(pwd)/DEF" > blastz.out 2>&1 &
#	Started 2006-02-17 16:42
#	crashed due to no copies of mm8 in /scratch/hg/mm8/ on the
#	Iservers.  Fix that up and get the chain run done.  Continuing.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -continue=chainMerge "$(pwd)/DEF" > chainMerge.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -swap "$(pwd)/DEF" > swap.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -swap -continue=net "$(pwd)/DEF" > swap.net.out 2>&1 &
#	failed during a san hiccup, finish that off, then:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -swap -continue=load "$(pwd)/DEF" > swap.load.out 2>&1 &

time nice -n +19 featureBits mm8 chainRheMac2Link
#	891310108 bases of 2567283971 (34.718%) in intersection
time nice -n +19 featureBits rheMac2 chainMm8Link
#	877906099 bases of 2646704109 (33.170%) in intersection

#########################################################################
# BLASTZ canFam2 (DONE - 2006-02-18 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.canFam2.2006-02-18
cd /cluster/data/mm8/bed
ln -s blastz.canFam2.2006-02-18 blastz.canFam2
cd blastz.canFam2

cat << '_EOF_' > DEF
# mouse vs dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Dog CanFam2 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse
SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzCanFam2.2006-02-18
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    "$(pwd)/DEF" > blastz.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -swap "$(pwd)/DEF" > swap.out 2>&1 &

time nice -n +19 featureBits mm8 chainCanFam2Link
#	828741604 bases of 2567283971 (32.281%) in intersection
time nice -n +19 featureBits canFam2 chainMm8Link
#	816262344 bases of 2384996543 (34.225%) in intersection

#########################################################################
# BLASTZ bosTau2 (DONE - 2006-02-18 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.bosTau2.2006-02-18
cd /cluster/data/mm8/bed
ln -s blastz.bosTau2.2006-02-18 blastz.bosTau2
cd blastz.bosTau2

cat << '_EOF_' > DEF
# mouse vs cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Cow (bosTau2)
# large enough chunk to do chroms in one piece
SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit
SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes
SEQ2_CHUNK=150000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzBosTau.2006-02-18
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
# (the here-doc terminator is the bare word _EOF_: bash compares against
# the delimiter with its quotes removed, so a quoted terminator never
# matches)

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    "$(pwd)/DEF" > blastz.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -swap "$(pwd)/DEF" > swap.out 2>&1 &

time nice -n +19 featureBits mm8 chainBosTau2Link
#	688859641 bases of 2567283971 (26.832%) in intersection
time nice -n +19 featureBits bosTau2 chainMm8Link
#	683178156 bases of 2812203870 (24.293%) in intersection

#########################################################################
# BLASTZ galGal2 (DONE - 2006-02-18 - Hiram)
ssh kk
mkdir /cluster/data/mm8/bed/blastz.galGal2.2006-02-18
cd /cluster/data/mm8/bed
ln -s blastz.galGal2.2006-02-18 blastz.galGal2
cd blastz.galGal2

cat << '_EOF_' > DEF
# mouse vs chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Chicken galGal2 - single chunk big enough for whole chroms at once
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes
SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzGalGal2.2006-02-18
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
    "$(pwd)/DEF" > blastz.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
    -continue=cat "$(pwd)/DEF" > cat.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
    -continue=net "$(pwd)/DEF" > net.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
    -swap "$(pwd)/DEF" > swap.out 2>&1 &

time nice -n +19 featureBits mm8 chainGalGal2Link
#	65517358 bases of 2567283971 (2.552%) in intersection
time nice -n +19 featureBits galGal2 chainMm8Link
#	57074100 bases of 1054197620 (5.414%) in intersection

#########################################################################
# BLASTZ dasNov1 (DONE - 2006-02-19 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.dasNov1.2006-02-19
cd /cluster/data/mm8/bed
ln -s blastz.dasNov1.2006-02-19 blastz.dasNov1
cd blastz.dasNov1

# NOTE: each here-doc below is closed by the bare word _EOF_; bash
# compares against the delimiter with its quotes removed, so a quoted
# terminator line would never match.
cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY - Armadillo dasNov1
SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit
SEQ2_LEN=/scratch/hg/dasNov1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzDasNov1.2006-02-19
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    "$(pwd)/DEF" > blastz.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -continue=cat "$(pwd)/DEF" > cat.out 2>&1 &

time nice -n +19 featureBits mm8 chainDasNov1Link
#	431944142 bases of 2567283971 (16.825%) in intersection

#########################################################################
# BLASTZ echTel1 (DONE - 2006-02-19 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.echTel1.2006-02-19
cd /cluster/data/mm8/bed
ln -s blastz.echTel1.2006-02-19 blastz.echTel1
cd blastz.echTel1

cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY - Tenrec echTel1
SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzEchTel1.2006-02-19
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    "$(pwd)/DEF" > blastz.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    -continue=cat "$(pwd)/DEF" > cat.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    -continue=chainRun "$(pwd)/DEF" > chain.out 2>&1 &

time nice -n +19 featureBits mm8 chainEchTel1Link
#	292970406 bases of 2567283971 (11.412%) in intersection

#########################################################################
# BLASTZ fr1 (DONE - 2006-02-19 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.fr1.2006-02-19
cd /cluster/data/mm8/bed
ln -s blastz.fr1.2006-02-19 blastz.fr1
cd blastz.fr1

cat << '_EOF_' > DEF
# mouse vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7

# Reuse parameters from human-chicken, except L=6000 (more relaxed)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Fugu - chunk big enough to run the whole chrom at once
SEQ2_DIR=/san/sanvol1/scratch/fr1/nib
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CHUNK=400000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzFr1.2006-02-19
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    "$(pwd)/DEF" > blastz.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    -swap "$(pwd)/DEF" > swap.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    -swap -continue=net "$(pwd)/DEF" > swap.net.out 2>&1 &

time nice -n +19 featureBits mm8 chainFr1Link
#	48949500 bases of 2567283971 (1.907%) in intersection
time nice -n +19 featureBits fr1 chainMm8Link
#	42671288 bases of 315518167 (13.524%) in intersection

#########################################################################
# BLASTZ loxAfr1 (DONE - 2006-02-19 - Hiram)
ssh kk
mkdir /cluster/data/mm8/bed/blastz.loxAfr1.2006-02-19
cd /cluster/data/mm8/bed
ln -s blastz.loxAfr1.2006-02-19 blastz.loxAfr1
cd blastz.loxAfr1

cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin

BLASTZ=blastz.v7

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY - Elephant loxAfr1
SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit
SEQ2_LEN=/scratch/hg/loxAfr1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzLoxAfr1.2006-02-19
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
# (the here-doc terminator is the bare word _EOF_: bash compares against
# the delimiter with its quotes removed, so a quoted terminator never
# matches)

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
    "$(pwd)/DEF" > blastz.out 2>&1 &
#	failed during the cat, fixed the script
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
    -continue=chainRun "$(pwd)/DEF" > chain.out 2>&1 &

time nice -n +19 featureBits mm8 chainLoxAfr1Link
#	472168702 bases of 2567283971 (18.392%) in intersection

#########################################################################
# BLASTZ tetNig1 (DONE - 2006-02-19 - Hiram)
ssh kk
mkdir /cluster/data/mm8/bed/blastz.tetNig1.2006-02-19
cd /cluster/data/mm8/bed
ln -s blastz.tetNig1.2006-02-19 blastz.tetNig1
cd blastz.tetNig1

cat << '_EOF_' > DEF
# Mouse vs tetraodon
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Tetraodon TetNig1 - single chunk big enough to run whole chroms
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzTetNig1.2006-02-19
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
    "$(pwd)/DEF" > blastz.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
    -continue=net "$(pwd)/DEF" > net.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
    -swap "$(pwd)/DEF" > swap.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
    -swap -continue=net "$(pwd)/DEF" > swap-net.out 2>&1 &

time nice -n +19 featureBits mm8 chainTetNig1Link
#	50358792 bases of 2567283971 (1.962%) in intersection
time nice -n +19 featureBits tetNig1 chainMm8Link
#	47024263 bases of 342403326 (13.734%) in intersection

#########################################################################
# BLASTZ oryCun1 (DONE - 2006-02-21 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.oryCun1.2006-02-21
cd /cluster/data/mm8/bed
ln -s blastz.oryCun1.2006-02-21 blastz.oryCun1
cd blastz.oryCun1

# NOTE: each here-doc below is closed by the bare word _EOF_; bash
# compares against the delimiter with its quotes removed, so a quoted
# terminator line would never match.
cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY - Rabbit oryCun1
SEQ2_DIR=/scratch/hg/oryCun1/oryCun1.2bit
SEQ2_LEN=/scratch/hg/oryCun1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzOryCun1.2006-02-21
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    "$(pwd)/DEF" > blastz.out 2>&1 &

time nice -n +19 featureBits mm8 chainOryCun1Link
#	496060619 bases of 2567283971 (19.322%) in intersection

#########################################################################
# BLASTZ xenTro1 (DONE - 2006-02-21 - Hiram)
ssh kk
mkdir /cluster/data/mm8/bed/blastz.xenTro1.2006-02-21
cd /cluster/data/mm8/bed
ln -s blastz.xenTro1.2006-02-21 blastz.xenTro1
cd blastz.xenTro1

cat << '_EOF_' > DEF
# mouse vs. frog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Frog xenTro1 - single chunk big enough to run two of the
#	largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit
SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/cluster/data/mm8/bed/blastzXenTro1.2006-02-21
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
    "$(pwd)/DEF" > blastz.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
    -continue=cat "$(pwd)/DEF" > cat.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
    -swap "$(pwd)/DEF" > swap.out 2>&1 &

time nice -n +19 featureBits mm8 chainXenTro1Link
#	62015601 bases of 2567283971 (2.416%) in intersection
time nice -n +19 featureBits xenTro1 chainMm8Link
#	59307185 bases of 1381238994 (4.294%) in intersection

#########################################################################
# BLASTZ monDom4 (DONE - 2006-02-23 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.monDom4.2006-02-23
cd /cluster/data/mm8/bed
ln -s blastz.monDom4.2006-02-23 blastz.monDom4
cd blastz.monDom4

cat << '_EOF_' > DEF
# Mouse vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=20
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse (mm8)
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000

# QUERY: Opossum monDom4
SEQ2_DIR=/cluster/bluearc/scratch/hg/monDom4/monDom4.2bit
SEQ2_LEN=/cluster/bluearc/scratch/hg/monDom4/chrom.sizes
SEQ2_CHUNK=50000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzMonDom4.2006-02-23
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    "$(pwd)/DEF" > blastz.out 2>&1 &

time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    -swap "$(pwd)/DEF" > swap.out 2>&1 &

time nice -n +19 featureBits mm8 chainMonDom4Link
#	211663336 bases of 2567283971 (8.245%) in intersection
time nice -n +19 featureBits monDom4 chainMm8Link
#	210933035 bases of 3501643220 (6.024%) in intersection

#	Something caused the loaded chains and nets on Mm8 to disappear.
# to reload them (DONE - Hiram - 2006-07-18) # recover the individual chain files ssh kkstore04 cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain nice chainSplit chain mm8.monDom4.all.chain.gz ssh hgwdev cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain/chain foreach f (*.chain) set c = $f:r echo hgLoadChain mm8 ${c}_chainMonDom4 $f hgLoadChain mm8 ${c}_chainMonDom4 $f end time netFilter -minGap=10 mm8.monDom4.net.gz \ | hgLoadNet -verbose=0 mm8 netMonDom4 stdin # clean up ssh kkstore04 cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain rm -fr chain ######################################################################### # BLASTZ panTro1 (DONE - 2006-02-23 - Hiram) ssh pk mkdir /cluster/data/mm8/bed/blastz.panTro1.2006-02-23 cd /cluster/data/mm8/bed ln -s blastz.panTro1.2006-02-23 blastz.panTro1 cd blastz.panTro1 cat << '_EOF_' > DEF # mouse vs chimp export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_M=50 # TARGET: Mouse Mm7 SEQ1_DIR=/scratch/hg/mm8/nib SEQ1_LEN=/scratch/hg/mm8/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Chimp PanTro1 SEQ2_DIR=/scratch/hg/panTro1/nib SEQ2_LEN=/scratch/hg/panTro1/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm8/bed/blastzPanTro1.2006-02-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > blastz.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap `pwd`/DEF > swap.out 2>&1 & time nice -n +19 featureBits mm8 chainPanTro1Link # 901276629 bases of 2567283971 (35.106%) in intersection time nice -n +19 featureBits panTro1 chainMm8Link # 901976621 bases of 2733948177 (32.992%) in intersection ######################################################################### # 
BLASTZ danRer4 (DONE - 2006-04-26 - 2006-04-28 - Hiram) ssh pk mkdir /cluster/data/mm8/bed/blastzDanRer4.2006-04-26 cd /cluster/data/mm8/bed ln -s blastzDanRer4.2006-04-26 blastz.danRer4 cd blastz.danRer4 cat << '_EOF_' > DEF # mouse vs zebrafish export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse Mm8 SEQ1_DIR=/scratch/hg/mm8/nib SEQ1_LEN=/scratch/hg/mm8/chrom.sizes SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer4) # large enough chunk to do complete chroms at once SEQ2_DIR=/san/sanvol1/scratch/danRer4/chromNib SEQ2_LEN=/san/sanvol1/scratch/danRer4/chromNib.sizes SEQ2_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers SEQ2_CHUNK=100000000 SEQ2_LAP=0 BASE=/cluster/data/mm8/bed/blastzDanRer4.2006-04-26 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs cd /cluster/data/mm8/bed/blastzDanRer4.2006-04-26 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net `pwd`/DEF > net.out 2>&1 & # swap, see also makeDanRer4.doc time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF > swap.out 2>&1 & time nice -n +19 featureBits mm8 chainDanRer4Link \ > fb.mm8.chainDanRer4Link 2>&1 & cat fb.mm8.chainDanRer4Link # 54036008 bases of 2567283971 (2.105%) in intersection time nice -n +19 featureBits danRer4 chainMm8Link \ > fb.danRer4.chainDanRer4Link 2>&1 & cat fb.danRer4.chainDanRer4Link # 58145856 bases of 1626093931 (3.576%) in 
intersection ######################################################################### # BLASTZ danRer4 (DONE - 2006-04-26 - 2006-04-28 - Hiram) # REMAKE THIS USING ALL CHROMS FOR danRer4 (2006-05-22 - ). ssh pk mkdir /cluster/data/mm8/bed/blastzDanRer4.2006-05-22 cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22 # ln -s blastzDanRer4.2006-04-26 blastz.danRer4 cat << '_EOF_' > DEF # mouse vs zebrafish export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse Mm8 SEQ1_DIR=/scratch/hg/mm8/nib SEQ1_LEN=/scratch/hg/mm8/chrom.sizes SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer4) # large enough chunk to do complete chroms at once SEQ2_DIR=/san/sanvol1/scratch/danRer4/nib SEQ2_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes SEQ2_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers SEQ2_CHUNK=300000000 SEQ2_LAP=0 BASE=/cluster/data/mm8/bed/blastzDanRer4.2006-05-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs chmod +x DEF cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF >& blastz.out & # 0.118u 0.107s 4:05:08.71 0.0% 0+0k 0+0io 0pf+0w time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net `pwd`/DEF >& net.out & # 0.121u 0.072s 4:48.04 0.0% 0+0k 0+0io 0pf+0w cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22 # swap, see also makeDanRer4.doc time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& swap.out & # 0.129u 0.109s 5:02.55 0.0% 0+0k 0+0io 0pf+0w
ssh hgwdev cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22 featureBits mm8 chainDanRer4Link >& fb.mm8.chainDanRer4Link & cat fb.mm8.chainDanRer4Link # 55147954 bases of 2567283971 (2.148%) in intersection featureBits danRer4 chainMm8Link >& fb.danRer4.chainDanRer4Link & cat fb.danRer4.chainDanRer4Link # 60721886 bases of 1626093931 (3.734%) in intersection featureBits -chrom=chr1 mm8 refGene:cds chainDanRer4Link -enrichment # refGene:cds 0.856%, chainDanRer4Link 1.867%, both 0.584%, # cover 68.16%, enrich 36.51x featureBits -chrom=chr1 mm8 refGene:cds chainDanRer3Link -enrichment # refGene:cds 0.856%, chainDanRer3Link 1.760%, both 0.492%, cover 57.49%, # enrich 32.67x featureBits -chrom=chr1 danRer4 refGene:cds chainMm8Link -enrichment # refGene:cds 0.746%, chainMm8Link 3.807%, both 0.566%, cover 75.86%, # enrich 19.93x featureBits -chrom=chr1 danRer3 refGene:cds chainMm8Link -enrichment # refGene:cds 0.786%, chainMm8Link 4.581%, both 0.612%, cover 77.88%, # enrich 17.00x # Higher coverage than for danRer3 chains on mm8 and similar coverage # for mm8 chains on danRer4 as on danRer3 so that is good. 
######################################################################### # BLASTZ danRer3 (DONE - 2006-02-28 - Hiram) ssh pk mkdir /cluster/data/mm8/bed/blastz.danRer3.2006-02-28 cd /cluster/data/mm8/bed ln -s blastz.danRer3.2006-02-28 blastz.danRer3 cd blastz.danRer3 cat << '_EOF_' > DEF # mouse vs zebrafish export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse Mm8 SEQ1_DIR=/scratch/hg/mm8/nib SEQ1_LEN=/scratch/hg/mm8/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer3) # large enough chunk to do complete chroms at once SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes SEQ2_CHUNK=100000000 SEQ2_LAP=0 BASE=/cluster/data/mm8/bed/blastzDanRer3.2006-02-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # real 216m23.425s time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF > swap.out 2>&1 & time nice -n +19 featureBits mm8 chainDanRer3Link # 53125783 bases of 2567283971 (2.069%) in intersection time nice -n +19 featureBits danRer3 chainMm8Link # 54831876 bases of 1630323462 (3.363%) in intersection ############################################################################# # STS MARKERS DATA DOWNLOAD (DONE - 2006-02-23 - 2006-02-28 - Hiram) ### *** PLEASE NOTE - STS markers redone 2006-08-29 - look for section: ## redoing STS markers track to get them more correct ### later in this file ssh kkstore01 mkdir -p /cluster/data/mm8/bed/STSmarkers/downloads cd /cluster/data/mm8/bed/STSmarkers/downloads # these files appear 
to be new almost every day wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases # The new feature in the .aliases file this time are names with # spaces in them ! This changes our parsing business below, # hopefully the spaces in the names won't cause trouble elsewhere. wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/* # these reports from jax.org appear to be changing daily wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt ls -ogrt # -rw-rw-r-- 1 676 Mar 11 2004 README # -rw-rw-r-- 1 396858 Jan 28 2005 10090.MGI.txt # -rw-rw-r-- 1 390139 Mar 16 2005 10090.WI_MRC_RH.txt # -rw-rw-r-- 1 240688 Mar 16 2005 10090.WI-YAC.txt # -rw-rw-r-- 1 173344 Mar 16 2005 10090.WI-Genetic.txt # -rw-rw-r-- 1 25691253 Jan 13 16:42 UniSTS.aliases # -rw-rw-r-- 1 4140920 Feb 22 18:43 UniSTS_mouse.sts # -rw-rw-r-- 1 4576611 Feb 23 02:22 MRK_Dump2.rpt # -rw-rw-r-- 1 2549974 Feb 23 02:23 PRB_PrimerSeq.rpt # -rw-rw-r-- 1 4531489 Feb 23 02:23 MRK_Sequence.rpt # I note the UniSTS.aliases file is over twice as big as was in # Mm7 build. I wonder what got into it ... # What got into it was that it was completely broken. It appeared # to have a vast section of itself duplicated again in the file. # It was cleaned up via: echo -e "#Unique ID\tAliases" > uniqueSTS.aliases grep -v "^#" UniSTS.aliases | sort -n | uniq >> uniqueSTS.aliases mv UniSTS.aliases UniSTS.aliases.broken mv uniqueSTS.aliases UniSTS.aliases # back to our work area, update the bed file # to do this we need a new UniSTS_mouse.alias file # it is created by a combination of information from several # of the above files ! AND ! 
the previous stsInfoMouse.bed file cd /cluster/data/mm8/bed/STSmarkers/downloads cp -p /cluster/data/mm7/bed/STSmarkers/downloads/*.sh . cp -p /cluster/data/mm7/bed/STSmarkers/downloads/*.pl . # There is a line in the fetchAllAliases.sh script that needs to # be updated, it must point to the previous bed file: # BEDFile=/cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed # Next time, this should read: # BEDFile=/cluster/data/mm8/bed/STSmarkers/stsInfoMouse.bed # This process has been captured in the script: # /cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh # which uses a couple of perl scripts in that same directory. # briefly it is: # ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0 # grep MGI: UniSTS.aliases > MGI.aliases # ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \ # stsInfoAliases.txt # ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases # cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \ # | sort -n > UniSTS_mouse.alias time ./fetchAllAliases.sh > fetchAllAliases.out 2>&1 # Here is a normal set of errors: # processing UniSTS_mouse.sts to find aliases # # ERROR: KNOWN(==OK) duplicate ID: '108991' encountered at line # # 2384 # processing MGI.aliases # fetching existing aliases from previous stsInfoMouse.bed file # found 27648 potential errors in # /cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed # to see the errors: grep ERROR stsInfoAliases.txt # verify those stsInfoMouse.bed aliases with UniSTS.aliases # those errors in the previous stsInfoMouse.bed file are an # accumulation of errors from a long long time ago in this chain # of processing. Some day it might be nice to fix them, but they # don't seem to bother anything, so they continue to be carried # forward, and a couple of new ones are added with each assembly. 
# with that, we can create a new stsInfoMouse.bed file: # Update the m m 7 directory name here to m m 8 # for the next build of m m 9 cd /cluster/data/mm8/bed/STSmarkers /cluster/store5/mouseMarker/code/updateBed.pl \ /cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed \ downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \ downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \ downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile # Yontao updated /cluster/store5/mouseMarker/code/cleanInfo.pl 8/10/04 /cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed # copy the stsInfoMouse.bed file from working dir to the marker # info storage fold. added 2 new steps by Yontao # be wary of the archive name here, check the directory and get # the name right here. mv /cluster/store5/mouseMarker/stsInfoMouse.bed \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 cp -p stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed # comparing to previous, numbers increase slightly each time wc /cluster/store5/mouseMarker/stsInfoMouse.bed \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5 # 60440 801181 6871232 /cluster/store5/mouseMarker/stsInfoMouse.bed # 59843 794642 6802825 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 # 58980 784786 6690105 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 # 58493 778055 6524821 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5 # and from that, create new primer fa, epcr, etc: /cluster/store5/mouseMarker/code/luConvertPrimerToFa \ stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info # the mouseC.fa file will be empty, should be more than last time wc mouse?.* # 0 0 0 mouseC.fa # 305991 305937 6910111 mouseP.fa # 34475 172467 2195057 mouseP.info # 340466 478404 9105168 total # the equivalent Mm7 files: # 0 0 0 mouseC.fa # 300968 300914 6798466 mouseP.fa # 33838 169275 2153113 mouseP.info # 334806 470189 8951579 total # 
the equivalent Mm6 files: # 0 0 0 mouseC.fa # 293305 293251 6624638 mouseP.fa # 32890 164528 2087271 mouseP.info # 326195 457779 8711909 total # the equivalent Mm5 files: # 0 0 0 mouseC.fa # 286740 286686 6474893 mouseP.fa # 32232 161234 2044810 mouseP.info # 318972 447920 8519703 total # copy the primers over to some filesystem close to the klusters # and split them up to have a small number of sequences in one file mkdir /cluster/bluearc/mm8/stsMarkers cp -p mouseP.fa /cluster/bluearc/mm8/stsMarkers cd /cluster/bluearc/mm8/stsMarkers cp -p /cluster/data/mm8/11.ooc . mkdir split # 400 files for 34,475 sequences, == about 80 sequences per file faSplit sequence mouseP.fa 400 split/mm_ # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. # This process could convert to a modern version of blat with the # filters as described, for example, in the STS markers build in Hg18 # CLUSTER RUN FOR THE STS PRIMERS ssh kk mkdir /cluster/data/mm8/bed/STSmarkers/primer mkdir /cluster/data/mm8/bed/STSmarkers/ePCR cd /cluster/data/mm8/bed/STSmarkers/primer mkdir out # interestingly, this blat2.2 binary did not function correctly # when given nib files. It has only about 1/4th of the number of # alignments as it gets when it used fa files for the target # sequence. 
ls -1S /cluster/bluearc/mm8/stsMarkers/split > primers.list ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list cat << '_EOF_' > runBlat2.csh #!/bin/csh -fe set primer = /cluster/bluearc/mm8/stsMarkers/split/$1 set fa = /cluster/bluearc/mm8/stsMarkers/chroms/$2 set ooc = /cluster/bluearc/mm8/stsMarkers/11.ooc set root2 = $2:r mkdir -p out/${root2} set out = $3 /cluster/bin/i386/blat.2 ${fa} ${primer} -ooc=${ooc} \ -minMatch=1 -minScore=0 -minIdentity=80 -oneOff ${out} '_EOF_' # << happy emacs chmod +x runBlat2.csh cat << '_EOF_' > template #LOOP ./runBlat2.csh $(path1) $(path2) {check out line+ out/$(root2)/$(root1).psl} #ENDLOOP '_EOF_' # << happy emacs gensub2 primers.list chr.list template jobList para create jobList para try ... check ... push ... etc ... # Completed: 12104 of 12104 jobs # CPU time in finished jobs: 1075037s 17917.28m 298.62h 12.44d 0.034 y # IO & Wait Time: 7444257s 124070.95m 2067.85h 86.16d 0.236 y # Average job time: 704s 11.73m 0.20h 0.01d # Longest finished job: 61869s 1031.15m 17.19h 0.72d # Submission to last job: 168538s 2808.97m 46.82h 1.95d # some of the jobs got stuck for unknown reasons. Had to find # them and kill them on their nodes. Their blat.2 process was # stuck and would not kill. Don't know what happened there. # on the file server ssh kkstore01 cd /cluster/data/mm8/bed/STSmarkers/primer time pslSort dirs primers.raw.psl temp out/chr* # -rw-rw-r-- 1 586124177 Feb 26 21:28 primers.raw.psl # filter alignments for (qEnd-qStart) vs. (tEnd-tStart) # should not be more than 100 bases different. 
# This filters out about 1,028,202 alignments, or # %17.4 = 100.0 * 1028202 / 5921712 time pslSort dirs stdout temp out/chr* | awk -F"\t" ' { if (((($13 - $12) - ($17 - $16)) > -100) && ((($13 - $12) - ($17 - $16)) < 100)) {print} } ' > primers.psl.100 rmdir temp wc -l *.100 *.psl # 5445367 primers.raw.psl # 4500528 primers.psl.100 # 944839 difference # a rough comparison with previous results: wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.100 # 4893510 102763628 510563575 primers.psl.100 wc primers.psl (unfiltered, Mm7) # 5921712 124355891 636898117 primers.psl wc /cluster/data/mm7/bed/STSmarkers/primer/primers.psl # 5724127 120206606 615248041 wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl # 5719969 120119288 590806241 wc /cluster/data/mm4/bed/STSmarkers/primer/primers.psl # 5745617 120657896 592135728 # another kluster run for the ePCR ssh pk cd /cluster/data/mm8/bed/STSmarkers/ePCR ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list # pick up e-PCR source from # ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/ # version 2.3.1 11 Feb 2005 # Had to add the following to both re-PCR_main.cpp and # e-PCR_main.cpp to get them to compile on kolossus: // max and min Copied from /usr/include/mysql/my_global.h #define max(a, b) ((a) >? (b)) #define min(a, b) ((a) <? (b)) cat << '_EOF_' > runPCR #!/bin/csh -fe /cluster/bin/x86_64/e-PCR /cluster/data/mm8/bed/STSmarkers/mouseP.info \ /cluster/bluearc/mm8/stsMarkers/chroms/$1 N=1 M=50 W=5 > $2 '_EOF_' # << happy emacs chmod +x runPCR cat << '_EOF_' > template #LOOP ./runPCR $(path1) {check out line+ out/$(num1).epcr} #ENDLOOP '_EOF_' # the mouseP.info was created above gensub2 chr.list single template jobList para create jobList para try para check para push ... etc ...
# STARTED 2006-02-27 16:24 # There is a single job that produces no output: ./runPCR chrX_random.fa out/30.epcr # WARNING: 96 STSs have primer shorter than W # WARNING: 21 STSs have ambiguities within W of 3' end # Not sure what's up with that # Completed: 33 of 34 jobs # Crashed: 1 jobs # CPU time in finished jobs: 67601s 1126.69m 18.78h 0.78d 0.002 y # IO & Wait Time: 1028s 17.13m 0.29h 0.01d 0.000 y # Average job time: 2080s 34.66m 0.58h 0.02d # Longest finished job: 5134s 85.57m 1.43h 0.06d # Submission to last job: 5134s 85.57m 1.43h 0.06d ssh kkstore01 cd /cluster/data/mm8/bed/STSmarkers/ePCR # all those results become all.epcr cat out/*.epcr > all.epcr # comparing to previous results: wc -l all.epcr # 58088 all.epcr wc -l /cluster/data/mm7/bed/STSmarkers/ePCR/all.epcr # 57709 /cluster/data/mm7/bed/STSmarkers/ePCR/all.epcr wc -l /cluster/data/mm6/bed/STSmarkers/ePCR/all.epcr # 55871 /cluster/data/mm6/bed/STSmarkers/ePCR/all.epcr wc /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr # 55677 222708 2945623 /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr wc /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr # 74705 298820 3971712 /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr # Mm4 seems to be out of whack cd /cluster/data/mm8/bed/STSmarkers/primer /cluster/bin/scripts/filterSTSPrimers \ -mouse ../stsInfoMouse.bed primers.psl.100 \ ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat # The output should show an increasing count: # Reading name info # Reading primer info # Processing file # 100000 # 200000 # 300000 # ... 
# 4500000 # Determining ePCR not found # wc -l primers.psl.filter.blat # 34026 primers.psl.filter.blat wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.blat # 33986 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.blat wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.blat # 33128 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.blat wc -l /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.blat # 33476 /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.blat # create accession_info.rdb touch empty_sequence.inf /cluster/bin/scripts/compileAccInfo -mouse \ /cluster/data/mm8 empty_sequence.inf # works with errors on missing randoms, etc...: # cat: /cluster/data/mm5/11/chr11_random.agp: No such file or directory # cat: /cluster/data/mm5/M/chrM_random.agp: No such file or directory mv accession_info.rdb accession_info.rdb.tmp /cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > \ accession_info.rdb rm accession_info.rdb.tmp # comparing results to previous # Continuing the trend that began with Mm7, the numbers in # accession_info.rdb continue to decrease. 
Even Mm8 has much less # fragments than did mm7: # e.g.: [hiram@kkstore01 /cluster/data] wc -l mm8/*/chr*.agp | tail -1 # 21910 total [hiram@kkstore01 /cluster/data] wc -l mm7/*/chr*.agp | tail -1 # 70125 total [hiram@kkstore01 /cluster/data] wc -l mm6/*/chr*.agp | tail -1 # 170812 total wc -l accession_info.rdb # 20385 accession_info.rdb wc -l /cluster/data/mm7/bed/STSmarkers/primer/accession_info.rdb # 44046 484510 3112816 accession_info.rdb wc /cluster/data/mm7/bed/STSmarkers/primer/accession_info.rdb # 93052 1023576 6824900 accession_info.rdb wc /cluster/data/mm5/bed/STSmarkers/primer/accession_info.rdb # 131845 1450299 9681940 wc /cluster/data/mm4/bed/STSmarkers/primer/accession_info.rdb # 86935 956289 6374930 # creates epcr.not.found.nomatch and epcr.not.found.psl # /cluster/bin/scripts/epcrToPsl # Fixed this script (in mm7) to make it not look for contigs in the usual # manner, we don't have those for this assembly sed -e "s/mm7/mm8/g" /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl \ > ./epcrToPsl chmod +x epcrToPsl ./epcrToPsl -mouse \ epcr.not.found ../mouseP.info \ accession_info.rdb /cluster/data/mm8 # Comparing results to previous: wc -l epcr* # 501 epcr.not.found # 0 epcr.not.found.nomatch # 501 epcr.not.found.psl # 158 epcrToPsl # 1160 total # Mm7 wc epcr* wc -l /cluster/data/mm7/bed/STSmarkers/primer/epcr* # 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found # 0 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.nomatch # 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.psl # 158 /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl # 1106 total # Mm6 wc epcr* wc -l /cluster/data/mm6/bed/STSmarkers/primer/epcr* # 472 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found # 63 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.nomatch # 404 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.psl # 158 /cluster/data/mm6/bed/STSmarkers/primer/epcrToPsl # 1097 total cat primers.psl.filter.blat epcr.not.found.psl > 
primers.psl.filter wc -l primers.psl.filter # 34527 primers.psl.filter wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter # 34460 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter # 33532 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter wc -l /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted # 33691 /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted # create primers.psl.filter.lifted.initial # if you do not run with scripts in your path, add the PATH business PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/extractPslInfo \ primers.psl.filter wc -l primers.psl.filter.initial # 34513 primers.psl.filter.initial wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial # 34443 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial # 33514 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial wc -l \ /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial # 33689 # create primers.psl.filter.lifted.initial.acc /cluster/bin/scripts/findAccession -agp \ -mouse primers.psl.filter.initial /cluster/data/mm8 # it complains about missing _random items, it is OK wc -l primers.psl.filter.initial.acc # 34513 primers.psl.filter.initial.acc wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial.acc # 34443 # this needs to be -rat as that specifies how to scan the # stsInfoMouse.bed file and it does not work if you use -mouse /cluster/bin/scripts/getStsId -rat \ ../stsInfoMouse.bed primers.psl.filter.initial.acc \ | sort -k4,4n > primers.final wc -l primers.final # 34513 primers.final wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.final # 34443 /cluster/data/mm7/bed/STSmarkers/primer/primers.final cd /cluster/data/mm8/bed/STSmarkers # stsMarkers.final is empty for mouse touch stsMarkers.final dummy 
# if you do not run with scripts in your path, add the PATH business PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/combineSeqPrimerPos \ stsMarkers.final primer/primers.final > stsMarkers_pos.rdb wc -l stsMarkers_pos.rdb # 33075 stsMarkers_pos.rdb wc -l /cluster/data/mm7/bed/STSmarkers/stsMarkers_pos.rdb # 32869 /cluster/data/mm7/bed/STSmarkers/stsMarkers_pos.rdb wc -l /cluster/data/mm6/bed/STSmarkers/stsMarkers_pos.rdb # 31889 /cluster/data/mm6/bed/STSmarkers/stsMarkers_pos.rdb wc -l /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb # 32085 /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb wc -l /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb # 31270 /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb /projects/cc/hg/ytlu/bin/script/perl/createStsBed \ stsInfoMouse.bed stsMarkers_pos.rdb 500 \ | sort -k1,1 -k2,2n > stsMapMouse.bed # Fixup --- 2006-04-12 - Hiram - it was found that column 12 had blanks # as the first character of the field. This isn't what is needed # here. Let's take those blanks out, turns out these were the # only blanks in the file: mv stsMapMouse.bed stsMapMouse_withBlanks.bed sed -e "s/ //" stsMapMouse_withBlanks.bed > stsMapMouse.bed wc stsMapMouse.bed # 29888 308263 2087726 stsMapMouse.bed wc /cluster/data/mm7/bed/STSmarkers/stsMapMouse.bed # 29079 301678 2097544 stsMapMouse.bed wc /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed # 29069 301535 2123622 /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed # loading STS markers tables ssh hgwdev cd /cluster/data/mm8/bed/STSmarkers cp -p /cluster/data/mm7/bed/STSmarkers/ucscAlias.pl . 
./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings # this does leave messages in ucscStsAlias.warnings but they seem # to be very similar to Mm6 with just a few new ones wc ucscStsAlias.tab (after applying filter to primers.psl above) # 144570 433667 3366815 ucscStsAlias.tab wc ucscStsAlias.tab (before applying filter to primers.psl above) # 144570 433667 3366815 ucscStsAlias.tab wc /cluster/data/mm7/bed/STSmarkers/ucscStsAlias.tab # 141585 424725 3284106 ucscStsAlias.tab wc /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab # 126624 379859 3037850 /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab # Use the drop tables if reloading # hgsql -e "drop table stsAlias;" mm8 hgsql mm8 < ~/kent/src/hg/lib/stsAlias.sql hgsql -e \ 'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm8 # reloaded stsMapMouseNew 2006-04-12 to remove blanks in col 12 - Hiram # hgsql -e "drop table stsMapMouseNew;" mm8 hgsql mm8 < ~/kent/src/hg/lib/stsMapMouseNew.sql hgsql -e \ 'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm8 # hgsql -e "drop table stsInfoMouseNew;" mm8 hgsql mm8 < ~/kent/src/hg/lib/stsInfoMouseNew.sql hgsql -e \ 'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm8 hgLoadPsl -nobin -table=all_sts_primer mm8 primer/primers.psl.filter # load of all_sts_primer did not go as planned: 34527 record(s), 0 # row(s) skipped, 19 warning(s) loading primer/primers.psl.filter # load primer sequences mkdir /gbdb/mm8/stsMarker ln -s /cluster/data/mm8/bed/STSmarkers/mouseP.fa \ /gbdb/mm8/stsMarker/mouseP.fa # PLEASE NOTE: if you are going to reload this business, use the # -replace option on this hgLoadSeq # hgLoadSeq -replace mm8 /gbdb/mm8/stsMarker/mouseP.fa # otherwise there will be a problem that the seq and extFile tables # will be out of sync.
hgLoadSeq mm8 /gbdb/mm8/stsMarker/mouseP.fa # Adding /gbdb/mm8/stsMarker/mouseP.fa # 33838 sequences featureBits mm8 all_sts_primer # 3746196 bases of 2567283971 (0.146%) in intersection featureBits mm7 all_sts_primer # 3757119 bases of 2583394090 (0.145%) in intersection featureBits mm6 all_sts_primer # 3677372 bases of 2597150411 (0.142%) in intersection featureBits mm8 stsMapMouseNew # 4801964 bases of 2567283971 (0.187%) in intersection featureBits mm7 stsMapMouseNew # 4805958 bases of 2583394090 (0.186%) in intersection featureBits mm6 stsMapMouseNew # 4638338 bases of 2597150411 (0.179%) in intersection hgsql -N mm8 -e "select count(*) from stsAlias;" # 141981 hgsql -N mm7 -e "select count(*) from stsAlias;" # 140649 hgsql -N mm6 -e "select count(*) from stsAlias;" # 137738 hgsql -N mm5 -e "select count(*) from stsAlias;" # 122944 hgsql -N mm8 -e "select count(*) from stsInfoMouseNew;" # 60440 hgsql -N mm7 -e "select count(*) from stsInfoMouseNew;" # 59843 hgsql -N mm6 -e "select count(*) from stsInfoMouseNew;" # 58980 hgsql -N mm5 -e "select count(*) from stsInfoMouseNew;" # 58493 # compare old and new name lists: awk '{print $4}' stsMapMouse.bed | sort -u > mm8.nameList awk '{print $4}' /cluster/data/mm7/bed/STSmarkers/stsMapMouse.bed | \ sort -u > mm7.nameList comm -12 mm?.nameList | wc -l # 28253 <- 28,253 names in common comm -23 mm7.nameList mm8.nameList | wc -l # 174 <- 174 unique to mm7 list comm -13 mm7.nameList mm8.nameList | wc -l # 445 <- 445 unique to mm8 list # previously, Mm6 vs Mm7: # 27320 <- 27,320 names in common # 188 <- 188 unique to mm6 list # 1107 <- 1,107 unique to mm7 list #################################################################################### # BUILD KNOWN GENES TABLES (STARTED 2/25/06, PART I DONE 2/27/06 Fan) # First build protein databases, sp060115 and proteins060115 # See makeProteins060115.doc for details.
# Create working subdirectories and temporary databases (kgMm8A) ssh hgwdev mkdir /cluster/store9/kg cd /cluster/store9/kg mkdir kgMm8A ln -s /cluster/store9/kg/kgMm8A /cluster/store6/kgDB/bed/kgMm8A ln -s /cluster/store9/kg/kgMm8A /cluster/data/mm8/bed/kgMm8A hgsql mm8 -e "create database kgMm8A" hgsql mm8 -e "create database kgMm8ATemp" mkdir /cluster/bluearc/kgDB/kgMm8A mkdir /cluster/bluearc/kgDB/kgMm8A/protBlat ln -s /cluster/bluearc/kgDB/kgMm8A/protBlat /cluster/store9/kg/kgMm8A/protBlat cd /cluster/store9/kg/kgMm8A/protBlat # Get all mouse protein sequences hgsql -N sp060115 -e \ 'select p.acc, p.val from protein p, accToTaxon x where x.taxon=10090 and p.acc=x.acc'\ |awk '{print ">" $1;print $2}' >mouseProt.fa hgsql -N sp060115 -e \ 'select v.varAcc, p.val from varAcc v, protein p, accToTaxon x where v.parAcc = p.acc and x.taxon=10090 and v.parAcc=x.acc'\ |awk '{print ">" $1;print $2}' \ >mouseVarProt.fa # append var proteins to mouseProt.fa cat mouseVarProt.fa >>mouseProt.fa # Prepare and perform cluster run for protein/genome alignment ssh pk cd /cluster/data/mm8/bed/kgMm8A/protBlat mkdir prot faSplit sequence mouseProt.fa 2000 prot/prot ls /cluster/bluearc/kgDB/kgMm8A/protBlat/prot/* > prot.lis ssh hgwdev cd /cluster/data/mm8/bed/kgMm8A/protBlat hgsql mm8 -N -e 'select chrom from chromInfo' > chrom.lis exit cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/blat -t=dnax -q=prot /cluster/data/mm8/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm8A/protBlat/result/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' mkdir result gensub2 chrom.lis prot.lis gsub jobList para create jobList para try para check para push para check ... # started 8:15 AM 2/25/06, done 3:12 AM 2/26/06. # Two jobs crashed due to empty result, push again and finished OK in a few minutes. 
# Completed: 67354 of 67354 jobs # CPU time in finished jobs: 12580047s 209667.46m 3494.46h 145.60d 0.399 y # IO & Wait Time: 237270s 3954.49m 65.91h 2.75d 0.008 y # Average job time: 190s 3.17m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 19991s 333.18m 5.55h 0.23d # Submission to last job: 68128s 1135.47m 18.92h 0.79d # collect BLAT results pslSort -nohead dirs raw.psl temp result pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 raw.psl protBlat.psl /dev/null ssh hgwdev cd /cluster/bluearc/kgDB/kgMm8A/protBlat hgLoadPsl mm8 protBlat.psl # create all_mrna.psl and tight_mrna.psl hgsql mm8 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \ all_mrna.psl tight_mrna.psl /dev/null # Save a copy of the following mm8 tables, to be used later to construct # kgMore and kgEvenmore all_mrna gbCdnaInfo gbExtFile gbLoaded gbSeq gbStatus refFlat refGene refLink refSeqAli refSeqStatus refSeqSummary xenoMrna xenoRefFlat xenoRefGene xenoRefSeqAli # Use overlapSelect to get protein and mRNA alignment overlaps overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \ -selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.stat overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \ -inFmt=psl tight_mrna.psl protBlat.psl protMrna.out # Create protein/mRNA pair and protein lists cut -f 10,31 protMrna.out|sort -u >spMrna.tab cut -f 10 protMrna.out|sort -u >protein.lis cp -p protein.lis /cluster/data/mm8/bed/kgMm8A # Load spMrna.tab into spMrna table in temp DB. hgsql kgMm8ATemp < ~/src/hg/lib/spMrna.sql hgsql kgMm8ATemp -e 'load data local infile "spMrna.tab" into table spMrna' hgsql kgMm8ATemp -e 'create index mrnaID on spMrna(mrnaID)' # Prepare and perform cluster run of protein/mRNA alignment # Get mRNA fa file. 
cd /cluster/data/mm8/bed/kgMm8A /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm8 \ -gbRoot=/cluster/data/genbank genbank mrna mrna.fa # Create mrnaSeq table in kgMm8ATemp DB. faToTab mrna.fa mrnaSeq.tab hgsql kgMm8ATemp -e 'drop table mrnaSeq' hgsql kgMm8ATemp <~/src/hg/lib/mrnaSeq.sql hgsql kgMm8ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq' # Prepare files for cluster run cd /cluster/bluearc/kgDB/kgMm8A ~/src/hg/protein/KG2.sh kgMm8A mm8 060115 # Perform cluster run of protein/mRNA alignment ~/src/hg/protein/KG3.sh kgMm8A mm8 060115 # Collect cluster run results cd kgBestMrna ls out | sed -e 's/prot/do1 prot/g' >doall # create do1 with the following 2 lines: cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protMrnaRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis wc protMrna.lis # Load BLAT results into temp DB. ssh hgwdev cd /cluster/store9/kg/kgMm8A/kgBestMrna hgsql kgMm8ATemp < ~/src/hg/lib/protMrnaBlat.sql hgsql kgMm8ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat' hgsql kgMm8ATemp -e 'create index tName on protMrnaBlat(tName)' # Create CDS files from protein/mRNA alignment results. hgsql kgMm8ATemp -N -e \ 'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\ |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds # Create protMrna.psl with proteinID_mrnaID as query ID. 
cut -f 22-30 ../protBlat/protMrna.out > j1.tmp cut -f 32-42 ../protBlat/protMrna.out > j2.tmp cut -f 10,31 ../protBlat/protMrna.out|sed -e 's/\t/_/g' >j3.tmp paste j1.tmp j3.tmp j2.tmp >protMrna.psl rm j1.tmp j2.tmp j3.tmp # Run mrnaToGene to create protMrna.gp bash mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log exit # move kgBestMrna to /san/sanvol1 to save space on store9 mv /cluster/store9/kg/kgMm8A/kgBestMrna/clusterRun /san/sanvol1/scratch/fan/mm8/kgMm8A/kgBestMrna ln -s /san/sanvol1/scratch/fan/mm8/kgMm8A/kgBestMrna/clusterRun \ /cluster/store9/kg/kgMm8A/kgBestMrna/clusterRun # Prepare refGene and all_mrna gp files. cd .. cp -p base/refGene.tab ref.gp hgsql mm8 -N -e \ 'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \ |sort -u > all_mrna.cds cat base/all_mrna.tab |cut -f 2-22 >all_mrna.psl bash mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log exit # Align proteins to RefSeq. 
overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp ref.stat overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp protRef.gp overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\ -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out cut -f 10,22 protRef.out | sort -u >spRef.tab cut -f 10 protRef.out | sort -u >protRef.lis hgsql kgMm8ATemp -e 'drop table spRef' hgsql kgMm8ATemp <~/src/hg/lib/spRef.sql hgsql kgMm8ATemp -e 'load data local infile "spRef.tab" into table spRef' # Prepare and perform cluster runs for protein/RefSeq alignments ~/src/hg/protein/KGRef2.sh kgMm8A mm8 060115 ~/src/hg/protein/KGRef3.sh kgMm8A mm8 060115 cd kgBestRef ls out | sed -e 's/prot/do1 prot/g' >doall cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protRefRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments. 
pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis wc protRef.lis hgsql kgMm8ATemp -e 'drop table protRefBlat' hgsql kgMm8ATemp < ~/src/hg/lib/protRefBlat.sql hgsql kgMm8ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat' hgsql kgMm8ATemp -e 'create index tName on protRefBlat(tName)' # Run gene-check to filter out invalid gp entries cd /cluster/data/mm8/bed/kgMm8A cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/data/mm8/nib kgCandidate0.gp kgCandidate0.check hgsql kgMm8ATemp -e 'drop table kgCandidate0' hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidate0.sql hgsql kgMm8ATemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0' hgsql kgMm8ATemp -e 'drop table geneCheck' hgsql kgMm8ATemp < ~/src/hg/lib/geneCheck.sql hgsql kgMm8ATemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines' # Run kgCheck to get all KG candidates that pass the KG gene check criteria kgCheck kgMm8ATemp mm8 kgCandidate0 geneCheck kgCandidate.tab hgsql kgMm8ATemp -e 'drop table kgCandidate' hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidate.sql hgsql kgMm8ATemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate' hgsql kgMm8ATemp -e 'create index alignID on kgCandidate(alignID)' # Construct the kgCandidateX table that has alignID in the name field. 
cut -f 2-10 kgCandidate.tab >j2.tmp cut -f 11 kgCandidate.tab >j1.tmp paste j1.tmp j2.tmp >kgCandidateX.tab hgsql kgMm8ATemp -e 'drop table kgCandidateX' hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidateX.sql hgsql kgMm8ATemp -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX' # Score protein/mRna and protein/RefSeq alignments kgResultBestMrna2 060115 kgMm8ATemp mm8 protMrnaBlat|sort -u >protMrnaBlatScore.tab kgResultBestRef2 060115 kgMm8ATemp mm8 protRefBlat|sort -u >protRefScore.tab # Combine scoring results and load them into temp DB. cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab hgsql kgMm8ATemp -e 'drop table protMrnaScore' hgsql kgMm8ATemp < ~/src/hg/lib/protMrnaScore.sql hgsql kgMm8ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore' hgsql kgMm8ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)' # Run kgGetCds to get CDS structure of each gene kgGetCds kgMm8ATemp 060115 kgCandidateX jY.tmp # G171564 does not have cds. # G171565 does not have cds. cat jY.tmp |sort -u >kgCandidateY.tab rm jY.tmp hgsql kgMm8ATemp -e 'drop table kgCandidateY' hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidateY.sql hgsql kgMm8ATemp -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY' # Run kgPickPrep to replace long cds structure string with cdsId. kgPickPrep kgMm8ATemp kgCandidateZ.tab hgsql kgMm8ATemp -e 'drop table kgCandidateZ' hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidateZ.sql hgsql kgMm8ATemp -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ' hgsql kgMm8ATemp -e 'create index cdsId on kgCandidateZ(cdsId)' # Run kgPick to pick a representative mrna/protein pair for each unique CDS structure. kgPick kgMm8ATemp mm8 sp060115 kg3.tmp dupSpMrna.tmp sort -u dupSpMrna.tmp >dupSpMrna.tab # Create put back list # gbGetSeqs2, a modified version of gbGetSeqs, outputs the RefSeq IDs at the beginning of each output line.
gbGetSeqs2 -gbRoot=/cluster/data/genbank db=mm8 -get=ra RefSeq mrna ref.ra cat ref.ra | sed -e 's/ /\t/' | sort -u >refRa.tab hgsql mm8 -e 'drop table refRa' hgsql mm8 < ~/src/hg/lib/refRa.sql hgsql mm8 -e 'load data local infile "refRa.tab" into table refRa ignore 1 lines' hgsql mm8 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \ >kgPutBack2.tab hgsql mm8 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%ribosomal frameshift%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \ >>kgPutBack2.tab hgsql mm8 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \ >>kgPutBack2.tab hgsql mm8 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \ >>kgPutBack2.tab hgsql mm8 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \ >>kgPutBack2.tab hgsql kgMm8ATemp -e 'drop table kgPutBack2' hgsql kgMm8ATemp < ~/src/hg/lib/kgPutBack2.sql hgsql kgMm8ATemp -e 'load data local infile "kgPutBack2.tab" into table kgPutBack2' kgPutBack kgMm8ATemp mm8 sp060115 kgPutBack2 kgPutBack2.gp # No matching protein found for NM_008523. # No matching protein found for NM_194444. # No matching protein found for NM_206941. 
# Sort KG genes to make the kg4.gp table file. cat kgPutBack2.gp kg3.tmp > kg4.tmp ~/kent/src/hg/protein/sortKg.pl kg4.tmp >knownGene.tab hgsql kgMm8ATemp -e 'drop table knownGene' hgsql kgMm8ATemp < ~/src/hg/lib/knownGene.sql hgsql kgMm8ATemp -e 'load data local infile "knownGene.tab" into table knownGene' # Load data into mm8 knownGene table. hgsql mm8 -e 'drop table knownGene' hgsql mm8 < ~/src/hg/lib/knownGene.sql hgsql mm8 -e 'load data local infile "knownGene.tab" into table knownGene' # Load dupSpMrna table after knownGene table is loaded so that joinerCheck does not complain. hgsql mm8 -e 'drop table dupSpMrna' hgsql mm8 < ~/src/hg/lib/dupSpMrna.sql hgsql mm8 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna' # Perform analysis on KG nice featureBits mm8 knownGene # 54684224 bases of 2567283971 (2.130%) in intersection nice featureBits mm8 knownGene:cds # 28459053 bases of 2567283971 (1.109%) in intersection nice featureBits mm8 refGene # 46256526 bases of 2567283971 (1.802%) in intersection nice featureBits mm8 refGene:cds # 27221018 bases of 2567283971 (1.060%) in intersection nice featureBits mm8 refGene knownGene # 43441486 bases of 2567283971 (1.692%) in intersection nice featureBits mm8 refGene:cds knownGene:cds # 25164531 bases of 2567283971 (0.980%) in intersection nice featureBits mm7 knownGene # 53165921 bases of 2583394090 (2.058%) in intersection nice featureBits mm7 knownGene:cds # 27531524 bases of 2583394090 (1.066%) in intersection nice featureBits mm7 refGene # 46425940 bases of 2583394090 (1.797%) in intersection nice featureBits mm7 refGene:cds # 27319308 bases of 2583394090 (1.057%) in intersection nice featureBits mm7 refGene knownGene # 41777202 bases of 2583394090 (1.617%) in intersection nice featureBits mm7 refGene:cds knownGene:cds # 24297646 bases of 2583394090 (0.941%) in intersection # Build knownGeneMrna and knownGenePep tables. 
kgPepMrna kgMm8ATemp mm8 060115 hgsql mm8 -e 'drop table knownGeneMrna' hgsql mm8 < ~/src/hg/lib/knownGeneMrna.sql hgsql mm8 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna' hgsql mm8 -e 'drop table knownGenePep' hgsql mm8 < ~/src/hg/lib/knownGenePep.sql hgsql mm8 -e 'load data local infile "knownGenePep.tab" into table knownGenePep' # Build kgXref table kgXref2 kgMm8ATemp 060115 mm8 hgsql mm8 -e 'drop table kgXref' hgsql mm8 < ~/src/hg/lib/kgXref.sql hgsql mm8 -e 'load data local infile "kgXref.tab" into table kgXref' # Build spMrna table hgsql mm8 -N -e 'select proteinID, name from knownGene' >kgSpMrna.tab hgsql mm8 -e 'drop table spMrna' hgsql mm8 <~/src/hg/lib/spMrna.sql hgsql mm8 -e 'load data local infile "kgSpMrna.tab" into table spMrna' # Build kgProtMap table ssh hgwdev cd /cluster/store9/kg/kgMm8A ln -s protBlat/tight_mrna.psl . ~/src/hg/protein/kgProtMap2.sh kgMm8A mm8 060115 ##################################### # Build alias tables. (DONE 2/28/06, Fan) ssh hgwdev cd /cluster/store9/kg/kgMm8A mkdir alias cd alias kgAliasM mm8 proteins060115 # kgAliasKgXref reads from mm8.knownGene.proteinID, # mm8.knownGene.name, mm8.kgXref.geneSymbol # to create kgAliasKgXref.tab kgAliasKgXref mm8 # kgAliasRefseq reads from mm8.knownGene.name, # mm8.knownGene.proteinID, mm8.kgXref.refseq # to create kgAliasRefseq.tab kgAliasRefseq mm8 hgsql sp060115 -N -e 'select name,gene.val from mm8.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \ | sort -u > kgAliasP.tab hgsql mm8 -N -e 'select name, name from knownGene' >kgAliasDup.tab hgsql mm8 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \ sort |uniq > kgAlias.tab hgsql -e "drop table kgAlias;" mm8 hgsql mm8 < ~/kent/src/hg/lib/kgAlias.sql hgsql mm8 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' # kgProtAlias reads from mm8.knownGene.name, # 
mm8.knownGene.proteinID, mm8.knownGene.alignID, # proteins060115.spXref3.accession, proteins060115.spSecondaryID, proteins060115.pdbSP.pdb # to create kgProtAlias.tab# kgProtAlias mm8 060115 hgsql mm8 -N -e \ 'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\ | sort -u >kgProtAliasNCBI.tab # include variant splice protein IDs hgsql mm8 -N -e \ 'select name, proteinID, parAcc from knownGene,sp060115.varAcc where varAcc=proteinID'\ |sort -u >kgProtAliasDup.tab # include duplicate protein IDs from dupSpMrna table hgsql mm8 -N -e \ 'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\ |sort -u >>kgProtAliasDup.tab # catch parent acc from dupProteinID too hgsql mm8 -N -e\ 'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp060115.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\ |sort -u >>kgProtAliasDup.tab cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab echo "`date` creating table kgProtAlias" hgsql mm8 -e "drop table kgProtAlias;" hgsql mm8 <~/src/hg/lib/kgProtAlias.sql; hgsql mm8 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;' # Build kgSpAlias table hgsql mm8 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm8 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >mm8.kgSpAlias.tab rm j.tmp hgsql mm8 -e 'drop table kgSpAlias'; hgsql mm8 < ~/src/hg/lib/kgSpAlias.sql hgsql mm8 -e 'load data local infile "mm8.kgSpAlias.tab" into table kgSpAlias' ############################################################################# # 17-WAY VAR_MULTIZ - ALIGNMENTS (DONE - 2006-02-28 - 2006-03-02 - Hiram) # (Re-DONE with panTro2 in place of panTro1 - 2006-04-19 - Hiram) # And again with xenTro2 in place of xenTro1 - 2006-04-24 # And again with danRer4 in place of danRer3 - 2006-05-02 ssh
kkstore04 mkdir /cluster/data/mm8/bed/multiz17way cd /cluster/data/mm8/bed/multiz17way # create tree diagram to guide work below. # This tree was constructed from one that Adam is using for # ENCODE work and a 27-way alignment. Took that file and # removed some of the entries, adding together the appropriate # distances. cat << '_EOF_' > 17way.nh ((((((((( (human_hg18:0.006690,chimp_panTro2:0.007571):0.024272, macaque_rheMac2:0.0592):0.023960, ((rat_rn4:0.081728,mouse_mm8:0.077017):0.229273, rabbit_oryCun1:0.206767):0.1065):0.023026, (cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505, armadillo_dasNov1:0.149862):0.015994, (elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400, monodelphis_monDom4:0.371073):0.189124, chicken_galGal2:0.454691):0.123297, xenopus_xenTro2:0.782453):0.156067, ((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961, zebrafish_danRer4:0.782561):0.156067); '_EOF_' # << happy emacs /cluster/bin/phast/draw_tree 17way.nh > 17way.ps /cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt grep -y mm8 17way.distances.txt | sort -k3,3n # Print out that file for reference, and use the calculated # distances in the table below to order the organisms and check # the button order on the browser. Zebrafish ends up before # tetraodon and fugu on the browser despite its distance. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. 
# # featureBits chainLink measures # chainMm8Link chain linearGap # distance on Mm8 on other minScore # 1 0.1587 - rat rn4 (% 68.957) (% 69.651) 3000 medium # 2 0.4677 - human hg18 (% 38.343) (% 34.514) 3000 medium # 3 0.4686 - chimp panTro2 (% 37.549) (% 33.614) 3000 medium # 4 0.4960 - macaque rheMac2 (% 34.718) (% 33.170) 3000 medium # 5 0.5131 - rabbit oryCun1 (% 19.322) (no swap ) 3000 medium # 6 0.6142 - armadillo dasNov1 (% 16.825) (no swap ) 3000 medium # 7 0.6230 - dog canFam2 (% 32.281) (% 34.255) 3000 medium # 8 0.6256 - elephant loxAfr1 (% 18.392) (no swap ) 3000 medium # 9 0.6344 - cow bosTau2 (% 26.832) (% 24.293) 3000 medium # 10 0.7805 - tenrec echTel1 (% 11.412) (no swap ) 5000 loose # 11 1.0698 - opossum monDom4 (% 8.245) (% 6.024) 5000 loose # 12 1.3425 - chicken galGal2 (% 2.552) (% 5.414) 5000 loose # 13 1.7936 - frog xenTro2 (% 2.651) (% 5.358) 5000 loose # 14 2.0157 - tetraodon tetNig1 (% 1.962) (% 13.734) 5000 loose # 15 2.0562 - fugu fr1 (% 1.907) (% 13.524) 5000 loose # 16 2.1059 - zebrafish danRer4 (% 2.105) (% 3.576) 5000 loose cd /cluster/data/mm8/bed/multiz17way # bash shell syntax here ... export H=/cluster/data/mm8/bed mkdir mafLinks for G in rn4 hg18 panTro2 rheMac2 oryCun1 dasNov1 canFam2 \ loxAfr1 bosTau2 echTel1 monDom4 galGal2 xenTro2 tetNig1 fr1 danRer4 do mkdir mafLinks/$G if [ ! -d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" exit 255 fi ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # Copy MAFs to some appropriate NFS server for kluster run ssh kkstore04 mkdir /san/sanvol1/scratch/mm8/multiz17way cd /san/sanvol1/scratch/mm8/multiz17way time rsync -a --copy-links --progress \ /cluster/data/mm8/bed/multiz17way/mafLinks/ . 
# We have about 5.9 Gb of data here, takes ~ 10 minutes to copy mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn # the autoMultiz cluster run ssh pk cd /cluster/data/mm8/bed/multiz17way/ # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 17way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst # the maf directory here is a symlink to a /cluster/store8 # directory to even out the data load on store9 and store8 on kkstore04 mkdir /cluster/store8/mm8/bed/multiz17way/maf ln -s /cluster/store8/mm8/bed/multiz17way/maf ./maf mkdir run cd run # NOTE: you need to set the db properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = mm8 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/$db/multiz17way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz17way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP autoMultiz $(root1) {check out line+ /cluster/store8/mm8/bed/multiz17way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs awk '{print $1}' /cluster/data/mm8/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # 34 jobs para try ... check ... push ... etc ... 
# Completed: 34 of 34 jobs # CPU time in finished jobs: 210573s 3509.55m 58.49h 2.44d 0.007 y # IO & Wait Time: 4870s 81.17m 1.35h 0.06d 0.000 y # Average job time: 6337s 105.61m 1.76h 0.07d # Longest finished job: 17786s 296.43m 4.94h 0.21d # Submission to last job: 41755s 695.92m 11.60h 0.48d # combine results into a single file for loading and gbdb reference ssh kkstore04 cd /cluster/data/mm8/bed/multiz17way # There used to be a mafFilter here with a minScore of 500, but it # turns out that the scores in these maf files are pretty much # useless. They range from very large negatives to very large # positives. time catDir maf > multiz17way.maf # real 10m17.400s # makes an 17 Gb file: # -rw-rw-r-- 1 17334936245 Apr 20 10:31 multiz17way.maf # Create per-chrom individual maf files for downloads # These are actually done after the annotation mafs are made ## re-done with corrected annotated mafs 2007-03-28 - Hiram ssh kkstore04 cd /cluster/data/mm8/bed/multiz17way mkdir mafDownloads time for M in anno/maf/chr*.maf do B=`basename $M` nice -n +19 cp -p ${M} mafDownloads/${B} nice -n +19 gzip mafDownloads/${B} echo ${B} done done # real 59m16.415s cd mafDownloads md5sum *.gz > md5sum.txt # deliver to downloads ssh hgwdev ln -s /cluster/data/mm8/bed/multiz17way/mafDownloads \ /usr/local/apache/htdocs/goldenPath/mm8/multiz17way # Load into database, actually annotation mafs are loaded later ssh hgwdev cd /cluster/data/mm8/bed/multiz17way mkdir /gbdb/mm8/multiz17way ln -s /cluster/data/mm8/bed/multiz17way/multiz17way.maf \ /gbdb/mm8/multiz17way time nice -n +19 hgLoadMaf mm8 multiz17way # Loaded 11601035 mafs in 1 files from /gbdb/mm8/multiz17way # real 27m29.960s time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 mm8 multiz17waySummary multiz17way.maf # Created 5782229 summary blocks from 65123362 components and # 11601035 mafs from multiz17way.maf # real 32m34.791s # Dropped unused indexes (2006-05-09 kate) # NOTE: this is not required in the 
future, as the loader # has been fixed to not generate these indexes hgsql mm8 -e "alter table multiz17waySummary drop index chrom_2" hgsql mm8 -e "alter table multiz17waySummary drop index chrom_3" # This was done for Mm7, same image can be reused # create tree image: # cat << '_EOF_' > species.nh # ((((((human,(mouse,rat)),(dog,cow)),opossum),chicken),frog),(tetraodon,zebrafish)) # '_EOF_' # /cluster/bin/phast/draw_tree -b -s species.nh > species10.ps # photoshop to enhance, reduce the amount of whitespace to make it # smaller, then save as jpg # cp species10.jpg /usr/local/apache/htdocs/images/phylo/Mm7_17way.jpg # creating upstream mafs (DONE - 2006-07-31 - Hiram) ssh hgwdev # data load balancing in the kkstore04 filesystems mkdir /cluster/store8/mm8/bed/multiz17way/upstreamMafs cd /cluster/data/mm8/bed/multiz17way ln -s /cluster/store8/mm8/bed/multiz17way/upstreamMafs ./upstreamMafs # rebuilt 2007-12-21 to fix difficulty in mafFrags when species.lst # did not have mm8 as the first one for S in 1000 2000 5000 do echo "making upstream${S}.maf" nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 mm8 \ refGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags mm8 multiz17way \ stdin stdout -orgs=species.lst \ | gzip -c > upstreamMafs/upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/mm8/multiz17way ln -s /cluster/data/mm8/bed/multiz17way/upstreamMafs/upstream*.maf.gz .
############################################################################ # ANNOTATE MULTIZ17WAY MAF AND LOAD TABLES (DONE - 2006-04-24 - Hiram) # RE-DONE 2006-05-03 with danRer4 in place of danRer3 ## Redone to correct usage of nBeds and sizes file (2007-03-28 - Hiram) ssh kolossus mkdir /cluster/data/mm8/bed/multiz17way/anno cd /cluster/data/mm8/bed/multiz17way/anno mkdir maf run cd run rm -f sizes nBeds twoBitInfo -nBed /cluster/data/mm8/mm8.{2bit,N.bed} for DB in `cat /cluster/data/mm8/bed/multiz17way/species.lst` do ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds echo ${DB}.len >> sizes echo $DB done echo '#!/bin/csh -ef' > jobs.csh echo date >> jobs.csh # do smaller jobs first so you can see some progress immediately: for F in `ls -1rS ../../maf/*.maf` do echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $F \ /cluster/data/mm8/mm8.2bit ../maf/`basename $F` >> jobs.csh echo "echo $F" >> jobs.csh done echo date >> jobs.csh chmod +x jobs.csh time ./jobs.csh > jobs.log 2>&1 & # to watch progress; tail -f jobs.log # real 218m16.272s # Load anno/maf ssh hgwdev cd /cluster/data/mm8/bed/multiz17way/anno/maf mkdir -p /gbdb/mm8/multiz17way/anno/maf ln -s /cluster/data/mm8/bed/multiz17way/anno/maf/*.maf \ /gbdb/mm8/multiz17way/anno/maf time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/mm8/multiz17way/anno/maf mm8 multiz17way # Loaded 12484442 mafs in 34 files from /gbdb/mm8/multiz17way/anno/maf # real 8m14.757s # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh hgwdev64 cd /cluster/data/mm8/bed/multiz17way/anno/maf time cat *.maf | \ nice -n +19 hgLoadMafSummary mm8 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz17waySummary stdin # Created 3153839 summary blocks from 65123362 components # and 12484442 mafs from stdin # real 13m25.961s ssh hgwdev cd /cluster/data/mm8/bed/multiz17way/anno/maf time nice -n +19 hgLoadSqlTab mm8 
multiz17waySummary \ ~/kent/src/hg/lib/mafSummary.sql multiz17waySummary.tab # real 0m53.525s rm *.tab ####################################################################### # MULTIZ17WAY MAF FRAMES (DONE - 2006-04-24 - 2006-04-25 - Hiram) # RE-DONE 2006-05-03 to replace danRer3 with danRer4 ssh hgwdev mkdir /cluster/data/mm8/bed/multiz17way/frames cd /cluster/data/mm8/bed/multiz17way/frames # The following is adapted from MarkD's Makefile used for mm7... #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using mrna table as genes: mkdir genes for qDB in oryCun1 panTro2 rheMac2 canFam2 bosTau2 danRer4 loxAfr1 \ tetNig1 fr1 # single danRer4 re-run 2006-05-03, removed danRer3 for qDB in danRer4 do tmpExt=`mktemp temp.XXXXXX` tmpMrnaCds=${qDB}.mrna-cds.${tmpExt} tmpMrna=${qDB}.mrna.${tmpExt} tmpCds=${qDB}.cds.${tmpExt} echo $qDB hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ ${qDB} > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \ stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz rm -f $tmpExt done # tried to use monDom4 in the above loop, but got this error: # (450211944 450214274) out of range (0 400000000) in binKeeperAdd # Which is interesting. This should be looked into to see why # this is here. 
# using knownGene for rn4 mm8 hg18 # using refGene for galGal2 # using mgcGenes for xenTro2 # no genes for monDom4 dasNov1 echTel1 # genePreds; (must keep only the first 10 columns for knownGene) for qDB in rn4 mm8 hg18 galGal2 xenTro2 do if [ $qDB = "xenTro2" ]; then geneTbl=mgcGenes elif [ $qDB = "galGal2" ]; then geneTbl=refGene else geneTbl=knownGene fi echo hgsql -N -e 'select * from '"$geneTbl ${qDB}" hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz rm -f $tmpExt done #------------------------------------------------------------------------ # create frames # beware, BASH syntax here ... # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd) clusterDir=/cluster/bluearc/mm8/multiz17wayFrames multizDir=/cluster/data/mm8/bed/multiz17way mafDir=$multizDir/mafDownloads geneDir=$multizDir/frames/genes clusterMafDir=${clusterDir}/maf clusterGeneDir=${clusterDir}/genes clusterFramesDir=${clusterDir}/mafFrames.kki # copy mafs to cluster storage mkdir $clusterDir ssh -x kkstore04 "rsync -av $mafDir/*.maf.gz $clusterMafDir/" # copy genes to cluster storage ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/" # run cluster jobs tmpExt=`mktemp temp.XXXXXX` paraDir=$multizDir/frames/para.${tmpExt} cd /cluster/data/mm8/bed/multiz17way/frames mkdir mafFrames $paraDir mkdir ${clusterFramesDir} for qDB in `cat /cluster/data/mm8/bed/multiz17way/species.lst` do mkdir ${clusterFramesDir}/${qDB} for C in `awk '{print $1;}' /cluster/data/mm8/chrom.sizes` do if [ -e ${clusterGeneDir}/${qDB}.gp.gz ]; then echo /cluster/bin/scripts/mkMafFrames.pl ${qDB} mm8 \ ${clusterGeneDir}/${qDB}.gp.gz ${clusterMafDir}/$C.maf.gz \ ${clusterFramesDir}/${qDB}/$C.mafFrames \ >> $paraDir/jobList fi done done rm -f $tmpExt ssh -x kki "cd ${paraDir} && para make jobList && para time" # Completed: 476 of 476 jobs # CPU time in finished jobs: 
6235s 103.91m 1.73h 0.07d 0.000 y # IO & Wait Time: 13538s 225.64m 3.76h 0.16d 0.000 y # Average job time: 42s 0.69m 0.01h 0.00d # Longest finished job: 237s 3.95m 0.07h 0.00d # Submission to last job: 1242s 20.70m 0.34h 0.01d # combine results from cluster for qDB in \ `sed -e "s/ dasNov1//; s/ echTel1//; s/ monDom4//;" ../species.lst` do ssh -x kolossus "cat ${clusterFramesDir}/${qDB}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${qDB}.mafFrames.gz" echo "${qDB}" done #------------------------------------------------------------------------ # load the database ssh hgwdev cd /cluster/data/mm8/bed/multiz17way/frames time nice -n +19 hgLoadMafFrames mm8 multiz17wayFrames \ mafFrames/*.mafFrames.gz # real 1m11.457s #------------------------------------------------------------------------ # clean up rm -rf ${clusterDir} ### # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd) ssh kkstore04 cd /cluster/data/mm8/bed/multiz17way/frames mv mafFrames/ mafFrames.old nice tcsh # easy way to get process niced (cat ../maf/*.maf | time genePredToMafFrames mm8 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer4 genes/danRer4.gp.gz fr1 genes/fr1.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz loxAfr1 genes/loxAfr1.gp.gz mm8 genes/mm8.gp.gz oryCun1 genes/oryCun1.gp.gz panTro2 genes/panTro2.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz tetNig1 genes/tetNig1.gp.gz xenTro2 genes/xenTro2.gp.gz bosTau2 genes/bosTau2.gp.gz | gzip >multiz17way.mafFrames.gz)>&log& ssh hgwdev cd /cluster/data/mm8/bed/multiz17way/frames hgLoadMafFrames mm8 multiz17wayFrames multiz17way.mafFrames.gz |&mail markd& ############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS # (DONE - 2006-03-02 - Hiram) # (RE-DONE - 2006-04-25 with panTro2 and xenTro2 - Hiram) # (RE-DONE - 2006-05-03 with danRer4 instead of danRer3 - Hiram) # Will skip this estimate for Mm8 since it was 
well done in Mm7 # and in Hg17, skip to the creation of the SS files # Estimate phastCons parameters ssh kkstore01 mkdir /cluster/data/mm8/bed/multiz17way/cons cd /cluster/data/mm8/bed/multiz17way/cons # Create a starting-tree.mod based on chr2 (the largest one) /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr2.maf \ --refseq ../../../2/chr2.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # 10 minutes /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "((((((((((hg18,panTro2),rheMac2),((rn4,mm8),oryCun1)),(bosTau2,canFam2)),dasNov1),(loxAfr1,echTel1)),monDom4),galGal2),xenTro2),((tetNig1,fr1),danRer4))" \ --out-root starting-tree # real 840m53.157s # That is 14 hours ! rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.407 # This 0.407 is used in the --gc argument below # CONTINUE HERE, no estimation required # Create big bad bloated SS files on san filesystem (takes ~ 2h 20m) # Increasing their size this time from 1,000,000 to 10,000,000 to # slow down the phastCons pk jobs ssh kkstore04 mkdir -p /san/sanvol1/scratch/mm8/cons/ss cd /san/sanvol1/scratch/mm8/cons/ss time for C in `awk '{print $1}' /cluster/data/mm8/chrom.sizes` do if [ -s /cluster/data/mm8/bed/multiz17way/maf/${C}.maf ]; then mkdir ${C} echo msa_split $C chrN=${C/chr/} chrN=${chrN/_random/} /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/mm8/bed/multiz17way/maf/${C}.maf \ --refseq /cluster/data/mm8/${chrN}/${C}.fa \ --in-format MAF --windows 4000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done & # real 94m49.273s # Again, going to SKIP this tuning business this time and use the # previous numbers. 
# Create a random list of 50 1 mb regions (do not use the _randoms) cd /san/sanvol1/scratch/mm8/cons/ss ls -1l chr*/chr*.ss | grep -v random | \ awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list # Set up parasol directory to calculate trees on these 50 regions ssh pk mkdir /san/sanvol1/scratch/mm8/cons/treeRun1 cd /san/sanvol1/scratch/mm8/cons/treeRun1 mkdir tree log # Tuning this loop should come back to here to recalculate # Create little script that calls phastCons with right arguments # --target-coverage of 0.20 is about right for mouse, will be # tuned exactly below cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/mm8/bed/multiz17way/cons/starting-tree.mod \ --gc 0.407 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-lengths 12 --target-coverage 0.17 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # Make cluster job and run it gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 50 of 50 jobs # CPU time in finished jobs: 354644s 5910.74m 98.51h 4.10d 0.011 y # IO & Wait Time: 352s 5.86m 0.10h 0.00d 0.000 y # Average job time: 7100s 118.33m 1.97h 0.08d # Longest finished job: 29358s 489.30m 8.15h 0.34d # Submission to last job: 29446s 490.77m 8.18h 0.34d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. 
This must be done separately for the conserved # and nonconserved models ssh kkstore01 cd /san/sanvol1/scratch/mm8/cons/treeRun1 ls -1 tree/chr*/*.cons.mod > cons.list time /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ../ave.cons.mod > cons_summary.txt 2>&1 & ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ../ave.noncons.mod > noncons_summary.txt cd .. cp -p ave.*.mod /cluster/data/mm8/bed/multiz17way/cons # measuring entropy # consEntropy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 \ ave.cons.mod ave.noncons.mod XXXX - does not work: 2005-11-28 [hiram@kkstore01 /san/sanvol1/scratch/mm8/cons] /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 ave.cons.mod ave.noncons.mod ERROR: with no separate source alignment, ss_from_msas expects sequences of positive length and no SS object. #Transition parameters:gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259 # Relative entropy: H=1.454874 bits/site # Required length: N=7.596943 sites # Total entropy: NH=11.052595 bits # consEntropy .20 12 ave.cons.mod.1 ave.noncons.mod.1 # Transition params: gamma=0.200000, omega=12.000000, mu=0.083333, nu=0.020833 # Relative entropy: H=1.454874 bits/site # Required length: N=6.629337 sites # Total entropy: NH=9.644850 bits # consEntropy .10 12 ave.cons.mod.2 ave.noncons.mod.2 # Transition params: gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259 # Relative entropy: H=1.527815 bits/site # Required length: N=7.205526 sites # Total entropy: NH=11.008713 bits # consEntropy .20 8 ave.cons.mod.3 ave.noncons.mod.3 # Transition params: gamma=0.200000, omega=8.000000, mu=0.125000, nu=0.031250 # Relative entropy: H=1.654878 bits/site # Required length: N=5.146793 sites # Total entropy: NH=8.517313 bits ### !!!
*** This one with .17 and 12 is the one that was finally used # consEntropy .17 12 ave.cons.mod.4 ave.noncons.mod.4 # Transition params: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068 # Relative entropy: H=1.478838 bits/site # Required length: N=6.753382 sites # Total entropy: NH=9.987159 bits # SKIP to here passing by the tuning numbers ssh pk # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/mm8/cons/consRun3 cd /san/sanvol1/scratch/mm8/cons cp /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod . # edit, change monDom2 to monDom4, hg17 to hg18, rheMac1 to # rheMac2, rn3 to rn4, mm7 to mm8 # danRer3 to danRer4 # It looks like: ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV TRAINING_LNL: -988246.132962 BACKGROUND: 0.295 0.205 0.205 0.295 RATE_MAT: -1.165221 0.315494 0.589884 0.259843 0.189778 -0.878194 0.208718 0.479698 0.444622 0.261535 -0.885604 0.179447 0.234867 0.720815 0.215191 -1.170872 TREE: (((((((((((((hg18:0.006690,panTro2:0.007571):0.024272,(colobus_monkey:0.015404,(baboon:0.008258,rheMac2:0.028617):0.008519):0.022120):0.023960,(dusky_titi:0.025662,(owl_monkey:0.012151,marmoset:0.029549):0.008236):0.027158):0.066101,(mouse_lemur:0.059024,galago:0.121375):0.032386):0.017073,((rn4:0.081728,mm8:0.077017):0.229273,oryCun1:0.206767):0.023340):0.023026,(((bosTau2:0.159182,canFam2:0.147731):0.004946,rfbat:0.138877):0.010150,(hedgehog:0.193396,shrew:0.261724):0.054246):0.024354):0.028505,dasNov1:0.149862):0.015994,(loxAfr1:0.104891,echTel1:0.259797):0.040371):0.218400,monDom4:0.371073):0.065268,platypus:0.468116):0.123856,galGal2:0.454691):0.123297,xenTro2:0.782453):0.156067,((tetNig1:0.199381,fr1:0.239894):0.492961,danRer4:0.782561):0.156067); cd /san/sanvol1/scratch/mm8/cons/consRun3 mkdir ppRaw bed # Create script to run phastCons with right parameters # These parameters: # --rho 0.28 --expected-length 14 --target-coverage 0.008 --quiet \ # were taken from Kate's 17-way in Hg17, removing the # --not-informative panTro2 since that 
isn't relevant here, nor # would be --not-informative rn4 - Jim says rn4 is far enough away # from mm8 that it is informative. # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../elliotsEncode.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss elliotsEncode.mod \ --rho 0.28 --expected-length 14 --target-coverage 0.008 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/elliotsEncode.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << happy emacs chmod a+x doPhast # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << '_EOF_' #LOOP doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. # These jobs are very fast and very I/O intensive, even on the san # they will hang it up as they work at full tilt. 
# Completed: 689 of 689 jobs # CPU time in finished jobs: 12806s 213.44m 3.56h 0.15d 0.000 y # IO & Wait Time: 16079s 267.98m 4.47h 0.19d 0.001 y # Average job time: 42s 0.70m 0.01h 0.00d # Longest finished job: 94s 1.57m 0.03h 0.00d # Submission to last job: 350s 5.83m 0.10h 0.00d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/mm8/cons/consRun3 # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less # if that looks right, then let it run: # FOR NEXT TIME - the result file should be named: # phastConsElements17way.bed since that is the name of the DB # table that it is loaded into. (instead of mostConserved.bed) find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/mm8/bed/multiz17way # Figure out how much is actually covered by the bed file as so: # Get the non-n genome size from faSize on all chroms: ssh kkstore01 cd /cluster/data/mm8 faSize ?{,?}/chr*.fa # 2664455088 bases (97171400 N's 2567283688 real 1477933003 upper # 1089350685 lower) in 34 sequences in 34 files cd /san/sanvol1/scratch/mm8/cons/consRun3 # The 2567283688 comes from the non-n genome as counted above. 
awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/2567283688\n",100.0*sum/2567283688,sum}' \ mostConserved.bed # --rho 0.28 --expected-length 14 --target-coverage 0.008 # % 5.40 = 100.0*138575691/2567283688 danRer4 instead of danRer3 # % 5.43 = 100.0*139309333/2567283688 panTro2 and xenTro2 # % 5.39 = 100.0*138300407/2567283688 panTro1 and xenTro1 # Aiming for 70% coverage in # the following featureBits measurement on CDS: # Beware of negative scores when too high. The lodToBedScore # will output an error on any negative scores. HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits mm8 \ -enrichment refGene:cds mostConserved.bed # --rho 0.28 --expected-length 14 --target-coverage 0.008 # with danRer4 instead of danRer3: # refGene:cds 1.062%, mostConserved.bed 5.398%, both 0.743%, cover # 69.99%, enrich 12.97x # with panTro2 and xenTro2: # refGene:cds 1.060%, mostConserved.bed 5.426%, both 0.740%, cover # 69.85%, enrich 12.87x # with panTro1 and xenTro1: # refGene:cds 1.060%, mostConserved.bed 5.387%, both 0.739%, cover # 69.71%, enrich 12.94x # Load most conserved track into database ssh hgwdev cd /cluster/data/mm8/bed/multiz17way # the copy was already done above # cp -p /san/sanvol1/scratch/mm8/cons/consRun3/mostConserved.bed .
time nice -n +19 hgLoadBed -strict mm8 phastConsElements17way \ mostConserved.bed # Loaded 1883370 elements of size 5 # real 2m54.033s # should measure the same as above time nice -n +19 featureBits mm8 -enrichment refGene:cds \ phastConsElements17way # with danRer4 in place of danRer3: # refGene:cds 1.062%, phastConsElements17way 5.398%, both 0.743%, # cover 69.99%, enrich 12.97x # with panTro2 and xenTro2: # refGene:cds 1.060%, phastConsElements 5.426%, both 0.740%, cover # 69.85%, enrich 12.87x # with panTro1 and xenTro1: # refGene:cds 1.060%, phastConsElements 5.387%, both 0.739%, cover # 69.71%, enrich 12.94x # Create merged posterior probability file and wiggle track data files ssh kkstore04 cd /san/sanvol1/scratch/mm8/cons/consRun3 # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode # This was verified above to be correct time nice -n +19 find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \ phastCons17.wig phastCons17.wib # real 15m59.846s # -rw-rw-r-- 1 1961998053 May 3 12:22 phastCons17.wib # -rw-rw-r-- 1 237229239 May 3 12:22 phastCons17.wig time nice -n +19 cp -p phastCons17.wi?
/cluster/data/mm8/bed/multiz17way/ # real 1m21.329s # prepare compressed copy of ascii data values for downloads ssh pk cd /san/sanvol1/scratch/mm8/cons/consRun3 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons17Scores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons17Scores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 18m15.212s # copy them for downloads ssh kkstore04 # this directory is actually a symlink from store9 to store8 to # avoid the data full problem on store9 mkdir /cluster/data/mm8/bed/multiz17way/phastCons17Scores cd /cluster/data/mm8/bed/multiz17way/phastCons17Scores cp -p /san/sanvol1/scratch/mm8/cons/consRun3/phastCons17Scores/* . ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/mm8 ln -s /cluster/data/mm8/bed/multiz17way/phastCons17Scores . # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/mm8/bed/multiz17way ln -s `pwd`/phastCons17.wib /gbdb/mm8/wib/phastCons17.wib time nice -n +19 hgLoadWiggle mm8 phastCons17 phastCons17.wig # real 2m55.836s # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/mm8/bed/multiz17way time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm8 phastCons17 > histogram.data 2>&1 # real 28m24.388s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm8 Histogram phastCons17 track" set xlabel " phastCons17 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # QA NOTE: (ASZ: 5/1/2006) changed name of phastConsElements table to phastConsElements17way # QA NOTE: (ASZ: 5/1/2006) changed name of phastCons17 table to phastCons17way # Hiram Note: phastCons17 never changed to phastCons17way at any time ######################################################################### # MAKE FOLDUTR TABLES (DONE 2006-02-28, Fan) # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev cd /cluster/data/mm8/bed rm rnaStruct mkdir /san/sanvol1/scratch/mm8/rnaStruct.2006-02-28 ln -s /san/sanvol1/scratch/mm8/rnaStruct.2006-02-28 rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa mm8 knownGene utr3 utr3/utr.fa utrFa mm8 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. 
ssh pk cd /cluster/data/mm8/bed/rnaStruct faSplit sequence utr3/utr.fa 4000 utr3/split/s faSplit sequence utr5/utr.fa 4000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < cgapBIOCARTAdescSorted.tab hgsql mm8 -e "drop table cgapAlias" hgsql mm8 -e "drop table cgapBiocDesc" hgsql mm8 -e "drop table cgapBiocPathway" hgsql mm8 <~/src/hg/lib/cgapAlias.sql hgsql mm8 <~/src/hg/lib/cgapBiocDesc.sql hgsql mm8 <~/src/hg/lib/cgapBiocPathway.sql hgsql mm8 -e 'load data local infile "cgapAlias.tab" into table cgapAlias' hgsql mm8 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc' hgsql mm8 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway' #################################################################################### # BUILD PROTEOME BROWSER TABLES FOR mm8 (DONE 3/8/06, Fan) # These are instructions for building tables needed for the Proteome Browser. # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table # ARE REBUILT. # This build is based on proteins DBs dated 060115. 
# Create the working directory ssh hgwdev mkdir /cluster/store9/kg/kgMm8A/pb-2006-03-08 cd /cluster/data/mm8/bed rm pb ln -s /cluster/store9/kg/kgMm8A/pb-2006-03-08 pb cd pb # Define pep* tables in mm8 DB cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql # First edit out pepPred table definition, then hgsql mm8 < pepAll.sql # Build the pepMwAa table hgsql proteins060115 -N -e \ "select info.acc, molWeight, aaSize from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab hgsql mm8 -e 'load data local infile "pepMwAa.tab" into table pepMwAa' # Build the pepPi table hgsql proteins060115 -e \ "select info.acc from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis hgsql mm8 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis pbCalPi protAcc.lis sp060115 pepPi.tab hgsql mm8 -e 'delete from pepPi' hgsql mm8 -e 'load data local infile "pepPi.tab" into table mm8.pepPi' # Calculate and load pep distributions pbCalDist sp060115 proteins060115 10090 mm8 >pbCalDist.out wc pbCalDist.out hgsql mm8 load data local infile "pepExonCntDist.tab" into table mm8.pepExonCntDist; load data local infile "pepCCntDist.tab" into table mm8.pepCCntDist; load data local infile "pepHydroDist.tab" into table mm8.pepHydroDist; load data local infile "pepMolWtDist.tab" into table mm8.pepMolWtDist; load data local infile "pepResDist.tab" into table mm8.pepResDist; load data local infile "pepIPCntDist.tab" into table mm8.pepIPCntDist; load data local infile "pepPiDist.tab" into table mm8.pepPiDist; quit # Calculate frequency distributions pbCalResStd sp060115 10090 mm8 # Create pbAnomLimit and pbResAvgStd tables hgsql mm8 -e "drop table pbAnomLimit" hgsql mm8 -e "drop table pbResAvgStd" hgsql mm8 < ~/src/hg/lib/pbAnomLimit.sql hgsql mm8 < ~/src/hg/lib/pbResAvgStd.sql hgsql mm8 -e 'load data local infile "pbResAvgStd.tab" into table mm8.pbResAvgStd;' hgsql
mm8 -e 'load data local infile "pbAnomLimit.tab" into table mm8.pbAnomLimit;' # Create pbStamp table for PB hgsql mm8 -e "drop table pbStamp" hgsql mm8 < ~/src/hg/lib/pbStamp.sql hgsql mm7 -N -e 'select * from pbStamp' > pbStamp.tab hgsql mm8 -e 'load data local infile "pbStamp.tab" into table mm8.pbStamp' # ENABLE PB FOR mm8 IN HGCENTRALTEST echo " insert into gdbPdb values('mm8', 'proteins060115')" \ | hgsql -h genome-testdb hgcentraltest echo "update dbDb set hgPbOk = 1 where name = 'mm8';" \ | hgsql -h genome-testdb hgcentraltest # Adjust drawing parameters for Proteome Browser stamps Now invoke Proteome Browser and adjust various drawing parameters (mostly the ymax of each stamp) if necessary, by updating the pbStamp.tab file and then delete and reload the pbStamp table. hgsql mm8 -e "drop table pbStamp" hgsql mm8 < ~/src/hg/lib/pbStamp.sql hgsql mm8 -e 'load data local infile "pbStamp.tab" into table mm8.pbStamp' # Perform preliminary review of Proteome Browser for mm8, then notify QA for formal review. # BUILD MISC STUFF FOR KG # Build mrnaRefseq table # First make sure the entrez DB is updated. (recently updated on 2/8/06). ssh hgwdev cd /cluster/store9/kg/kgMm8A hgsql entrez -N -e \ 'select mrna, refseq from entrezRefseq, entrezMrna, mm8.all_mrna where qName=mrna and entrezRefseq.geneID=entrezMrna.geneID' \ >mrnaRefseq1.tab.tab hgsql mm8 -N -e 'select name, name from refGene' >mrnaRefseq2.tab cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab hgsql mm8 -e 'drop table mrnaRefseq' hgsql mm8 < ~/src/hg/lib/mrnaRefseq.sql hgsql mm8 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq' # CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 3/8/06 Fan) # This depends on the go and uniProt databases as well as # the kgAlias and kgProAlias tables. The hgKgGetText takes # about 5 minutes when the database is not too busy. The rest # is real quick. 
ssh hgwdev cd /cluster/store9/kg/kgMm8A mkdir index cd index hgKgGetText mm8 knownGene.text ixIxx knownGene.text knownGene.ix knownGene.ixx ln -s /cluster/store9/kg/kgMm8A/index/knownGene.ix /gbdb/mm8/knownGene.ix ln -s /cluster/store9/kg/kgMm8A/index/knownGene.ixx /gbdb/mm8/knownGene.ixx # BUILD KNOWN GENE LIST FOR GOOGLE. # make knownGeneLists.html mm8GeneList.html mm5GeneList.html rm3GeneList.html cd /cluster/data/mm8/bed rm -rf knownGeneList/mm8 # Run hgKnownGeneList to generate the tree of HTML pages # under ./knownGeneList/mm8 hgKnownGeneList mm8 # copy over to /usr/local/apache/htdocs rm -rf /usr/local/apache/htdocs/knownGeneList/mm8 mkdir -p /usr/local/apache/htdocs/knownGeneList/mm8 cp -Rfp knownGeneList/mm8/* /usr/local/apache/htdocs/knownGeneList/mm8 ################################################################################## # Create description.html for mm8 mkdir -p ~/kent/src/hg/makeDb/trackDb/mouse/mm8 cd ~/kent/src/hg/makeDb/trackDb/mouse/mm8 cp ../hg17/description.html . vi description.html # Change release date and build number and change hg17 to mm8 # Check it into CVS mkdir -p /cluster/data/mm8/html cp -p description.html /cluster/data/mm8/html ln -s /cluster/data/mm8/html/description.html /gbdb/mm8/html/description.html # BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED 2006-03-08, DONE 2006-02-14 - Fan) # This should be done after KG tables are complete from known genes build # process. # # Cluster together various alt-splicing isoforms. # Creates the knownIsoforms and knownCanonical tables ssh hgwdev mkdir /cluster/data/mm8/bed/geneSorter.2006-03-08 # remove old symbolic link rm /cluster/data/mm8/bed/geneSorter ln -s /cluster/data/mm8/bed/geneSorter.2006-03-08 /cluster/data/mm8/bed/geneSorter cd /cluster/data/mm8/bed/geneSorter hgClusterGenes mm8 knownGene knownIsoforms knownCanonical # Extract peptides from knownGenes into fasta file # and create a blast database out of them. 
mkdir /cluster/data/mm8/bed/geneSorter/blastp cd /cluster/data/mm8/bed/geneSorter/blastp pepPredToFa mm8 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /scratch/blast/formatdb -i known.faa -t known -n known # This command is in /projects/compbio/bin/$MACH/formatdb # Copy over database to bluearc rm -fr /cluster/bluearc/mm8/blastp mkdir -p /cluster/bluearc/mm8/blastp cp -p /cluster/data/mm8/bed/geneSorter/blastp/known.* /cluster/bluearc/mm8/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm8/bed/geneSorter/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh pk mkdir /cluster/data/mm8/bed/geneSorter/blastp/self cd /cluster/data/mm8/bed/geneSorter/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/mm8/blastp/known -i $1 -o $2 \ -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod +x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para push para check Completed: 7730 of 7730 jobs CPU time in finished jobs: 35194s 586.56m 9.78h 0.41d 0.001 y IO & Wait Time: 29033s 483.89m 8.06h 0.34d 0.001 y Average job time: 8s 0.14m 0.00h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 43s 0.72m 0.01h 0.00d Submission to last job: 206s 3.43m 0.06h 0.00d # Load into database. 
This takes about 20 minutes ssh hgwdev cd /cluster/data/mm8/bed/geneSorter/blastp/self/run/out bash time hgLoadBlastTab mm8 knownBlastTab *.tab # Scanning through 7730 files # Loading database with 5270545 rows # real 13m30.534s cd /cluster/data/mm8/bed/geneSorter # Create table that maps between known genes and RefSeq hgMapToGene mm8 refGene knownGene knownToRefSeq # Create table that maps between known genes and LocusLink hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm8 > refToLl.txt hgMapToGene mm8 refGene knownGene knownToLocusLink -lookup=refToLl.txt hgsql -e "select count(*) from knownToLocusLink;" mm8 # 27636 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt mm8 knownGene name proteinID Pfam knownToPfam hgsql -e "select count(*) from knownToPfam;" mm8 # 29479 ############################################################################ ### MAKE THE affyU74 TRACK - needed for the Gene Sorter (DONE # # MAKE THE affyU74 TRACK using Affy consensus sequences instead of # target sequences. Recalculate alignments and load data ---------------------------------- # Load up semi-local disk with target sequences for Affy mouse U74 chips. 
# ssh kkr1u00 # mkdir -p /iscratch/i/affy # This /projects filesystem is not available on kkr1u00 # but it is on kk # ssh kk # cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy ssh kkr1u00 iSync # Run cluster job to do alignments ssh kk mkdir /cluster/data/mm8/ cd /cluster/data/mm8/bed/affyU74.2006-03-08 mkdir run cd run mkdir psl #echo /scratch/mus/mm8/maskedContigs/*.fa | wordLine stdin > genome.lst echo /scratch/hg/mm8/nib/*.nib | wordLine stdin > genome.lst ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.lst affy.lst gsub jobList para create jobList para try para check para push # Completed: 102 of 102 jobs # CPU time in finished jobs: 5846s 97.43m 1.62h 0.07d 0.000 y # IO & Wait Time: 367s 6.12m 0.10h 0.00d 0.000 y # Average job time: 61s 1.02m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 189s 3.15m 0.05h 0.00d # Submission to last job: 200s 3.33m 0.06h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU74.psl. ssh kk cd /cluster/data/mm8/bed/affyU74.2006-03-08/run pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least minAli = 0.95. # minAli = 0.97 too high. low minCover as a lot of n's in these sequences #pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl ../all_affyU74.psl /dev/null # Sort by chromosome and load into database. 
ssh hgwdev cd /cluster/data/mm8/bed/affyU74.2006-03-08 pslSortAcc nohead chrom temp all_affyU74.psl cat chrom/*.psl > affyU74.psl # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table mv affyU74.psl affyU74.psl.orig cut -f 1-9 affyU74.psl.orig >j1.tmp cut -f 10 affyU74.psl.orig | sed -e 's/:/\t/' | cut -f 2 > j2.tmp cut -f 11-21 affyU74.psl.orig >j3.tmp paste j1.tmp j2.tmp j3.tmp >affyU74.psl hgLoadPsl mm8 affyU74.psl rm -rf chrom temp run ## MAKE THE affyGnfU74 TRACKs (DONE 3/8/06, Fan) # Make bed files and load consensus sequences for Affy U74 chip set. #This needs to be done after affyU74 is already made. ssh hgwdev mkdir -p /cluster/data/mm8/bed/affyGnf.2006-03-08 cd /cluster/data/mm8/bed/affyGnf.2006-03-08 # may need to build this command in src/hg/affyGnf ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2006-03-08/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \ affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2 ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2006-03-08/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \ affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2 ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2006-03-08/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \ affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2 # edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" mkdir sav cp *.bed sav -p cat sav/affyGnfU74A.bed|sed -e "s/U74Av2://" >affyGnfU74A.bed cat sav/affyGnfU74B.bed|sed -e "s/U74Bv2://" >affyGnfU74B.bed cat sav/affyGnfU74C.bed|sed -e "s/U74Cv2://" >affyGnfU74C.bed # and reload data into table hgLoadBed -strict mm8 affyGnfU74A affyGnfU74A.bed hgLoadBed -strict mm8 affyGnfU74B affyGnfU74B.bed hgLoadBed -strict mm8 affyGnfU74C affyGnfU74C.bed # Add in sequence data for U74 tracks. # Copy consensus sequence to /gbdb if it isn't already # [THE SYM LINKS WERE ALREADY DONE.] 
# mkdir -p /gbdb/hgFixed/affyProbes cd /gbdb/hgFixed/affyProbes # fix broken symlinks after directory structure changed # /projects/compbiodata ----> /projects/compbio/data rm U74* # make correct symlinks (hartera, 2005-05-03) ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa . # used perl -pi.bak -e 's/;/ /' to remove ";" after probe name # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4. # reload sequences with prefix removed so acc matches name used in # other dependent tables hgLoadSeq -abbr=U74Av2: mm8 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa hgLoadSeq -abbr=U74Bv2: mm8 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa hgLoadSeq -abbr=U74Cv2: mm8 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa ### GNF ATLAS 2 (DONE 3/9/06, Fan) # Align probes from GNF1M chip. ssh kk cd /cluster/data/mm8/bed mkdir -p geneAtlas2/run/psl cd geneAtlas2/run echo /scratch/hg/mm8/nib/*.nib | wordLine stdin > genome.lst ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub gensub2 genome.lst mrna.lst gsub spec para create spec para try para check para push para time # Completed: 34 of 34 jobs # CPU time in finished jobs: 53165s 886.08m 14.77h 0.62d 0.002 y # IO & Wait Time: 241s 4.02m 0.07h 0.00d 0.000 y # Average job time: 1571s 26.18m 0.44h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3929s 65.48m 1.09h 0.05d # Submission to last job: 3929s 65.48m 1.09h 0.05d # Do sort, best in genome filter, and convert to chromosome coordinates # to create gnf1h.psl. 
pslSort dirs raw.psl tmp psl pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1m.psl /dev/null #rm -r contig.psl raw.psl psl # Load probes and alignments from GNF1H into database. ssh hgwdev cd /cluster/data/mm8/bed/geneAtlas2 # ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes hgLoadPsl mm8 affyGnf1m.psl hgLoadSeq mm8 /gbdb/hgFixed/affyProbes/gnf1m.fa # Load up track hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \ affyGnf1m.psl # Note that the unmapped 5000 records are from all-N sequences. hgLoadBed -strict mm8 gnfAtlas2 gnfAtlas2.bed # MOUSE AFFYMETRIX MOE430 TRACK (DONE Fan 2006-03-09) # mkdir -p /projects/compbio/data/microarray/affyMouse # Download MOE430A and MOE430B consensus sequences from Affymetrix web site # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430 # unzip MOE430*_consensus.zip # check for duplicate probes: there are none, all have unique names # check for duplicate probes: 100 from 136745_at to 1367551_a_at # remove "consensus:" and ";" from FASTA headers to shorten probeset # names for database # sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa # sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa # cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ # /cluster/bluearc/affy/ # THE ABOVE WAS ALREADY TBD) # Set up cluster job to align MOE430 consensus sequences to mm8 ssh kkr1u00 cd /cluster/data/mm8/bed mkdir -p affyMOE430 cd affyMOE430 # mkdir -p /iscratch/i/affy # cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy # iSync ssh kk cd /cluster/data/mm8/bed/affyMOE430 ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst echo /scratch/hg/mm8/nib/*.nib | wordLine stdin > genome.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 genome.lst 
affy.lst template.sub para.spec mkdir psl para create para.spec # Do the job with usual para try/check/push/time etc. # Completed: 34 of 34 jobs # CPU time in finished jobs: 9196s 153.26m 2.55h 0.11d 0.000 y # IO & Wait Time: 362s 6.04m 0.10h 0.00d 0.000 y # Average job time: 281s 4.69m 0.08h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 639s 10.65m 0.18h 0.01d # Submission to last job: 639s 10.65m 0.18h 0.01d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyMOE430.psl pslSort dirs raw.psl tmp psl # only use alignments that cover 30% of sequence and have at least # 95% identity in aligned region. # low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl affyMOE430.psl /dev/null # Load alignments and sequences into database ssh hgwdev cd /cluster/data/mm8/bed/affyMOE430 # shorten names in psl file sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak mv affyMOE430.psl.bak affyMOE430.psl # load track into database hgLoadPsl mm8 affyMOE430.psl # Add consensus sequences for MOE430 # Copy sequences to gbdb if they are not there already # mkdir -p /gbdb/hgFixed/affyProbes # ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ # /gbdb/hgFixed/affyProbes hgLoadSeq -abbr=MOE430 mm8 /gbdb/hgFixed/affyProbes/MOE430_all.fa # Clean up # rm batch.bak contig.psl raw.psl # BELOW TWO THINGS WERE DONE BY RACHEL ALREADY FOR MM4 # add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/ # add affyMOE430.html file and then do make alpha to add to trackDb table # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes an hour.)
hgMapToGene mm8 affyGnf1m knownGene knownToGnf1m hgExpDistance mm8 hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio Got 22937 unique elements in hgFixed.gnfMouseAtlas2MedianRatio # Create table that maps between known genes and RefSeq hgMapToGene mm8 refGene knownGene knownToRefSeq # may need to build this command in src/hg/near/hgMapToGene # Create a table that maps between known genes and # the nice affy expression data. hgMapToGene mm8 affyU74 knownGene knownToU74 hgMapToGene mm8 affyMOE430 knownGene knownToMOE430 hgMapToGene mm8 affyMOE430 -prefix=A: knownGene knownToMOE430A # Format and load Rinn et al sex expression data mkdir /cluster/data/mm8/bed/rinnSex cd /cluster/data/mm8/bed/rinnSex hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \ ../affyMOE430/affyMOE430.psl hgLoadBed mm8 rinnSex rinnSex.bed # Format and load the GNF data mkdir /cluster/data/mm8/bed/affyGnf95 cd /cluster/data/mm8/bed/affyGnf95 ~/src/hg/affyGnf/affyPslAndAtlasToBed -newType ../affyU95.psl \ /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. # Everything below this seems to create tables OK. # hgsql mm8 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes.
These commands take about 15 minutes each # The affyGnfU74?Exps arguments appear to be unused in # hgExpDistance cd /cluster/data/mm8/bed/geneSorter hgExpDistance mm8 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74 hgExpDistance mm8 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74 hgExpDistance mm8 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74 # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene mm8 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' # hgsql -e "select count(*) from knownToGnfAtlas2;" mm8 # row count changed to 22978 # Create expression distance table - takes about an hour hgExpDistance mm8 hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnfAtlas2 & # hgsql -e "select count(*) from gnfAtlas2Distance;" mm8 # row count changed to 22937000 # HGNEAR PROTEIN BLAST TABLES (DONE 3/14/06 Fan) ssh hgwdev mkdir /cluster/data/mm8/bed/hgNearBlastp cd /cluster/data/mm8/bed/hgNearBlastp cat << _EOF_ > config.ra # Latest mouse vs. other Gene Sorter orgs: # human, rat, zebrafish, worm, yeast, fly targetGenesetPrefix mouse targetDb mm8 queryDbs hg18 rn4 danRer3 ce2 sacCer1 dm2 mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa hg18Fa /cluster/data/hg18/bed/geneSorter/blastp/known.faa rn4Fa /cluster/data/rn4/bed/blastp/known.faa danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa dm2Fa /cluster/data/dm2/bed/flybase4.1/flybasePep.fa buildDir /cluster/data/mm8/bed/hgNearBlastp scratchDir /san/sanvol1/scratch/mm8HgNearBlastp _EOF_ doHgNearBlastp.pl config.ra >do.log # output was like this: ... 
Scanning through 671 files^M Loading database with 14470 rows^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/mm8.split^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/mm8.formatdb^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/hg18.split^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/hg18.formatdb^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/rn4.split^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/rn4.formatdb^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/danRer3.split^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/danRer3.formatdb^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/ce2.split^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/ce2.formatdb^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/sacCer1.split^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/sacCer1.formatdb^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/dm2.split^M # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/dm2.formatdb^M # ssh -x pk rmdir /san/sanvol1/scratch/mm8HgNearBlastp^M ^M *** All done!^M *** Check these tables in mm8:^M *** mouseBlastTab hgBlastTab rnBlastTab drBlastTab ceBlastTab scBlastTab dmBlastTab ^M *** and mmBlastTab in these databases:^M *** hg18 rn4 danRer3 ce2 sacCer1 dm2 ^M # MAKE ORGANISM-SPECIFIC HGNEARDATA FILES cd ~/kent/src/hg/near/hgNear/hgNearData mkdir -p Mouse/mm8 cd Mouse/mm8 cp ../mm7/otherOrgs.ra # Edit ortherOrgs.ra to reflect the latest genomes used in blastp jobs vi ortherOrgs.ra # then check it into CVS. 
# ENABLE HGNEAR FOR mm8 IN HGCENTRALTEST echo "update dbDb set hgNearOk = 1 where name = 'mm8';" \ | hgsql -h genome-testdb hgcentraltest # END OF HGNEAR STUFF ######################################################################### # BLASTZ panTro2 after chr9 re-masked (DONE - 2006-03-30 - Hiram) ssh pk mkdir /cluster/data/mm8/bed/blastzPanTro2.2006-03-28 cd /cluster/data/mm8/bed rm blastz.panTro2 ln -s blastzPanTro2.2006-03-28 blastz.panTro2 cd blastz.panTro2 cat << '_EOF_' > DEF # mouse vs chimp export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm7 SEQ1_DIR=/scratch/hg/mm8/nib SEQ1_LEN=/scratch/hg/mm8/chrom.sizes SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Chimp PanTro2 SEQ2_DIR=/scratch/hg/panTro2/nib SEQ2_LEN=/scratch/hg/panTro2/chrom.sizes SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent SEQ2_CHUNK=50000000 SEQ2_LAP=0 BASE=/cluster/data/mm8/bed/blastzPanTro2.2006-03-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > blastz.out 2>&1 & # broken during blastz run due to panassas failure time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=cat `pwd`/DEF > cat.out 2>&1 & # Do not have this measurement for the first time around, tables # got loaded again before I thought of that. 
time nice -n +19 featureBits mm8 chainPanTro2Link \ > fb.mm8.chainPanTro2Link # 963977790 bases of 2567283971 (37.549%) in intersection # For panTro1 this was: time nice -n +19 featureBits mm8 chainPanTro1Link \ > fb.mm8.chainPanTro1Link # 901276629 bases of 2567283971 (35.106%) in intersection ssh pk mv /cluster/data/panTro2/bed/blastz.mm8.swap \ /cluster/data/panTro2/bed/blastz.mm8.swap.2006-03-21 mkdir /cluster/data/panTro2/bed/blastz.mm8.swap cd /cluster/data/panTro2/bed/blastz.mm8.swap time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ /cluster/data/mm8/bed/blastzPanTro2.2006-03-28/DEF \ > blastz.out 2>&1 & # completed the downloads manually since they failed due to the # existing downloads. Then cleanup: ssh hgwbeta cd /cluster/data/panTro2/bed/blastz.mm8.swap time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=cleanup /cluster/data/mm8/bed/blastzPanTro2.2006-03-28/DEF \ > cleanup.out 2>&1 & time nice -n +19 featureBits panTro2 chainMm8Link \ > fb.panTro2.chainMm8Link 2>&1 & # 978002566 bases of 2909512873 (33.614%) in intersection # first time before the chr9 fix was: # 986978326 bases of 2909512873 (33.922%) in intersection ######################################################################### # BLASTZ panTro2 (DONE - 2006-03-15 - Hiram) ssh pk mkdir /cluster/data/mm8/bed/blastz.panTro2.2006-02-23 cd /cluster/data/mm8/bed ln -s blastz.panTro2.2006-02-23 blastz.panTro2 cd blastz.panTro2 cat << '_EOF_' > DEF # mouse vs chimp export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm7 SEQ1_DIR=/scratch/hg/mm8/nib SEQ1_LEN=/scratch/hg/mm8/chrom.sizes SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Chimp 
PanTro2 SEQ2_DIR=/scratch/hg/panTro2/nib SEQ2_LEN=/scratch/hg/panTro2/chrom.sizes SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent SEQ2_CHUNK=50000000 SEQ2_LAP=0 BASE=/cluster/data/mm8/bed/blastzPanTro2.2006-03-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > blastz.out 2>&1 & # broken during chain step due to missing files on the Iservers # completed chain run manually, then continuing time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 & # broken during loadUp due to script bug, ran loadUp.csh manually time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap `pwd`/DEF > swap.out 2>&1 & # mistakenly did PanTro1 here ... should have been PanTro2 time nice -n +19 featureBits mm8 chainPanTro1Link # 901276629 bases of 2567283971 (35.106%) in intersection time nice -n +19 featureBits panTro2 chainMm8Link \ > fb.panTro2.chainMm8Link 2>&1 # 986978326 bases of 2909512873 (33.922%) in intersection ############################################################################# # UPDATED mm8.knownToVisiGene (2006-03-15 galt) ssh hgwdev knownToVisiGene mm8 ############################################################################# # BLASTZ SELF (DONE - 2006-03-20 - 2006-03-22 - Hiram) # using chain min score of 10,000 to cut down on volume of data ssh pk mkdir /cluster/data/mm8/bed/blastzSelf.2006-03-20 cd /cluster/data/mm8/bed ln -s blastzSelf.2006-03-20 blastz.mm8 cd blastzSelf.2006-03-20 cat << '_EOF_' > DEF # mouse vs mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_M=200 # TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib SEQ1_LEN=/scratch/hg/mm8/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Mouse Mm8 SEQ2_DIR=/scratch/hg/mm8/nib SEQ2_LEN=/scratch/hg/mm8/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm8/bed/blastzSelf.2006-03-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs cd /cluster/data/mm8/bed/blastzSelf.2006-03-20 time /cluster/home/hiram/kent/src/utils/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & # broke during the load step due to doBlastz script changes, # finished the load manually, then: time /cluster/home/hiram/kent/src/utils/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \ -continue=download `pwd`/DEF > download.out 2>&1 & ssh kolossus cd /cluster/data/mm8/bed/blastzSelf.2006-03-20 time HGDB_CONF=~/.hg.conf.read-only featureBits mm8 \ chainSelfLink >fb.mm8.chainSelfLink 2>&1 cat fb.mm8.chainSelfLink # 362483673 bases of 2567283971 (14.119%) in intersection ############################################################################# # UPDATED mm8.knownToVisiGene (2006-04-05 galt) ssh hgwdev knownToVisiGene mm8 ############################################################################ # LIFTOVER (DROPUNDER) CHAINS TO MM7 (2006-04-06 kate) # Split (using makeLoChain-split) of mm7 is doc'ed in makeMm7.doc # Do what makeLoChain-split says to do next (start blat alignment) ssh kk cd /cluster/data/mm8/bed/liftOver makeLoChain-align mm8 /scratch/hg/mm8/nib mm7 \ /iscratch/i/mm7/split10k \ /cluster/bluearc/mm7/11.ooc >&! align.log & # Do what its output says to do next (start cluster job) cd /cluster/data/mm8/bed/blat.mm7.2006-04-06/run para shove para time >&! 
run.time #CPU time in finished jobs: 906023s 15100.39m 251.67h 10.49d 0.029 y #IO & Wait Time: 22074s 367.90m 6.13h 0.26d 0.001 y #Average job time: 343s 5.72m 0.10h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 4260s 71.00m 1.18h 0.05d #Submission to last job: 4965s 82.75m 1.38h 0.06d # lift alignments ssh kkr1u00 cd /cluster/data/mm8/bed/liftOver makeLoChain-lift mm8 mm7 >&! lift.log & # chain alignments ssh kki cd /cluster/data/mm8/bed/liftOver makeLoChain-chain mm8 /scratch/hg/mm8/nib \ mm7 /scratch/hg/mm7/nib >&! chain.log & # Do what its output says to do next (start cluster job) cd /cluster/data/mm8/bed/blat.mm7.2006-04-06/chainRun para shove para time >&! run.time #CPU time in finished jobs: 3884s 64.73m 1.08h 0.04d 0.000 y #IO & Wait Time: 594s 9.91m 0.17h 0.01d 0.000 y #Average job time: 86s 1.44m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 245s 4.08m 0.07h 0.00d #Submission to last job: 401s 6.68m 0.11h 0.00d # net alignment chains ssh kkstore03 cd /cluster/data/mm8/bed/liftOver makeLoChain-net mm8 mm7 >&! net.log & # load reference to over.chain into database table, # and create symlinks /gbdb and download area ssh hgwdev cd /cluster/data/mm8/bed/liftOver makeLoChain-load mm8 mm7 >&! load.log & # test by converting a region using the "convert" link on # the browser, and comparing to blat of the same region ############################################################################# # Create Allen Brain Atlas mapping. (DONE 2006-04-12 galt) # compile allenCollectSeq ssh hgwdev cd ~/kent/src/hg/makeDb/allenBrain/allenCollectSeq make # Set up directory ssh kk cd /cluster/data/mm8/bed mkdir allenBrain cd allenBrain # In /san/sanvol1/visiGene/offline/allenBrain/probesAndData/ # allen20051021.tab (converted from spreadsheet mailed by Susan Sunkin ) # probeSeq.20051027.fasta (also from Susan). 
# Create a list of probe sequences filling ones missing from probeSeq.20050127.fa # with some NCBI and TIGR files, and some downloaded one at a time. allenCollectSeq /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20051021.tab /san/sanvol1/visiGene/offline/allenBrain/probesAndData/probeSeq.20051027.fasta /cluster/data/mm7/bed/ncbiXm/ncbiNm.fa /cluster/data/mm7/bed/ncbiXm/ncbiXm.fa /cluster/data/mm6/bed/tigrMgiTc/tigrMgiTc.fa ~/kent/src/hg/makeDb/allenBrain/allenCollectSeq/extra.fa allProbes.fa allProbes.tab missing.tab allenBrainUrl.tab # Set up a blat run to align the probes. mkdir split faSplit sequence allProbes.fa 200 split/rp mkdir run cd run ls -1 ../split/*.fa > mrna.lst ls -1 /scratch/hg/mm8/nib/*.nib > genome.lst mkdir psl cat << '_EOF_' > gsub #LOOP blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 genome.lst mrna.lst gsub spec para create spec # Then do the usual para try/push/time/check until the run is finished # Then do sorting and near-best-in-genome step on file server ssh kkstore02 cd /cluster/data/mm8/bed/allenBrain/run pslSort dirs raw.psl tmp psl pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl # Clean up big files no longer needed rm raw.psl rm -r psl rm -r ../split # Load up database ssh hgwdev cd /cluster/data/mm8/bed/allenBrain # Make a new table that contains the URLs for the allen brain genes # Make this one first since all.joiner considers it the master table. hgsql mm8 < ~/kent/src/hg/lib/allenBrainUrl.sql hgsql mm8 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl;' # Make probe alignment table, and load sequence. 
hgLoadPsl mm8 allenBrainAli.psl mkdir /gbdb/mm8/allenBrain ln -s /cluster/data/mm8/bed/allenBrain/allProbes.fa /gbdb/mm8/allenBrain/allProbes.fa hgLoadSeq mm8 /gbdb/mm8/allenBrain/allProbes.fa # Make mapping between known genes and allenBrain hgMapToGene mm8 allenBrainAli -type=psl knownGene knownToAllenBrain ######################################################################### # BLASTZ HUMAN Hg17 (DONE - 2006-04-13 - 2006-04-19 - Hiram) ssh pk mkdir /cluster/data/mm8/bed/blastzHg17.2006-04-13 cd /cluster/data/mm8/bed ln -s blastzHg17.2006-04-13 blastz.hg17 cd blastzHg17.2006-04-13 cat << '_EOF_' > DEF # mouse vs human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm8 SEQ1_DIR=/scratch/hg/mm8/nib SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/mm8/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Human Hg17 - single chunk big enough to run each chrom by itself SEQ2_DIR=/scratch/hg/hg17/bothMaskedNibs SEQ2_SMSK=/scratch/hg/hg17/linSpecRep.notInMouse SEQ2_LEN=/cluster/data/hg17/chrom.sizes SEQ2_CHUNK=300000000 SEQ2_LAP=0 BASE=/cluster/data/mm8/bed/blastzHg17.2006-04-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen cd /cluster/data/mm8/bed/blastzHg17.2006-04-13 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > blastz.out 2>&1 & # real 656m20.633s # Then to swap over to Hg17 mkdir /cluster/data/hg17/bed/blastz.mm8.swap cd /cluster/data/hg17/bed ln -s blastz.mm8.swap blastz.mm8 cd blastz.mm8.swap time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap /cluster/data/mm8/bed/blastzHg17.2006-04-13/DEF \ > swap.out 2>&1 & ssh hgwdev time nice -n +19 featureBits mm8 chainHg17Link # 984380268 bases of 
2567283971 (38.343%) in intersection time nice -n +19 featureBits hg17 chainMm8Link # 994530172 bases of 2881515245 (34.514%) in intersection cd /cluster/data/mm8/bed/blastzHg17.2006-04-13 time nice -n +19 featureBits mm8 chainHg17Link > fb.mm8.chainHg17Link 2>&1 # 990554882 bases of 2567283971 (38.584%) in intersection time nice -n +19 featureBits hg17 chainMm8Link > fb.hg17.chainMm8Link 2>&1 # 997368618 bases of 2866216770 (34.797%) in intersection ######################################################################## # BLASTZ/CHAIN/NET XENTRO2 (DONE - 2006-04-20 - Hiram) ssh kk mkdir /cluster/data/mm8/bed/blastz.xenTro2.2006-04-20 cd /cluster/data/mm8/bed ln -s blastz.xenTro2.2006-04-20 blastz.xenTro2 cd blastz.xenTro2.2006-04-20 cat << '_EOF_' > DEF # mouse vs. frog BLASTZ=/cluster/bin/penn/blastz.v7 # Use same params as used for mammal-xenTro1 (see makeXenTro1.doc) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse mm8 SEQ1_DIR=/scratch/hg/mm8/nib SEQ1_LEN=/cluster/data/mm8/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Frog xenTro2 - single chunk big enough to run two of the # largest scaffolds in one job SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/mm8/bed/blastz.xenTro2.2006-04-20 '_EOF_' # << emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # XXX running 2006-04-20 # Then to swap over to xenTro2 mkdir /cluster/data/xenTro2/bed/blastz.mm8.swap cd /cluster/data/xenTro2/bed ln -s blastz.mm8.swap blastz.mm8 cd blastz.mm8.swap time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -swap /cluster/data/mm8/bed/blastz.xenTro2.2006-04-20/DEF \ > swap.out 2>&1 & ssh hgwdev cd 
/cluster/data/mm8/bed/blastz.xenTro2.2006-04-20 time nice -n +19 featureBits mm8 chainXenTro2Link \ > fb.mm8.chainXenTro2Link 2>&1 & # 68050843 bases of 2567283971 (2.651%) in intersection cd /cluster/data/xenTro2/bed/blastz.mm8.swap time nice -n +19 featureBits xenTro2 chainMm8Link \ > fb.xenTro2.chainMm8Link 2>&1 # 72840135 bases of 1359412157 (5.358%) in intersection ####################################################################### ## LIFTOVER To Mm7 (DONE - 2006-04-21 - 2006-04-24 - Hiram) ssh kkr1u00 $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \ mm7 /cluster/data/mm7/nib # as it says, DO THIS NEXT: ssh kk $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \ mm8 /scratch/hg/mm8/nib mm7 /iscratch/i/mm7/split10k \ /cluster/data/mm7/11.ooc # as it says, DO THIS NEXT: cd /cluster/data/mm8/bed/blat.mm7.2006-04-21/run para try, check, push, check, ... # Completed: 1360 of 1360 jobs # CPU time in finished jobs: 3890058s 64834.31m 1080.57h 45.02d 0.123 y # IO & Wait Time: 13326s 222.09m 3.70h 0.15d 0.000 y # Average job time: 2870s 47.84m 0.80h 0.03d # Longest finished job: 27224s 453.73m 7.56h 0.32d # Submission to last job: 80553s 1342.55m 22.38h 0.93d # as it says, DO THIS NEXT: # this does the liftUp and makes the psl files ssh kkr1u00 cd /cluster/data/mm8/bed ln -s blat.mm7.2006-04-21 blat.mm7 time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh mm8 mm7 # real 16m5.091s # as it says, DO THIS NEXT: # this prepares the batch to run for the chaining ssh kki time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \ mm8 /cluster/data/mm8/nib mm7 /cluster/data/mm7/nib # as it says, DO THIS NEXT: # running the chain batch cd /cluster/data/mm8/bed/blat.mm7.2006-04-21/chainRun para try, check, push, check, ...
Completed: 40 of 40 jobs # CPU time in finished jobs: 5381s 89.68m 1.49h 0.06d 0.000 y # IO & Wait Time: 2119s 35.32m 0.59h 0.02d 0.000 y # Average job time: 188s 3.12m 0.05h 0.00d # Longest finished job: 652s 10.87m 0.18h 0.01d # Submission to last job: 685s 11.42m 0.19h 0.01d ssh kkstore04 $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh mm8 mm7 # Created /cluster/data/mm8/bed/liftOver/mm8ToMm7.over.chain.gz # as it says, DO THIS NEXT: ssh hgwdev $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh mm8 mm7 # It says this: # Now, add link for # /usr/local/apache/htdocs/goldenPath/mm8/liftOver/mm8ToMm7.over.chain # to hgLiftOver # But I believe that link was already done: cd /gbdb/mm8/liftOver ls -og mm8ToMm7* # lrwxrwxrwx 1 53 Apr 24 12:32 mm8ToMm7.over.chain.gz -> \ # /cluster/data/mm8/bed/liftOver/mm8ToMm7.over.chain.gz ######################################################################## ## CYTOBAND - ideogram track (DONE - 2006-04-28 - Hiram) ssh hgwdev cd /cluster/data/mm8/pre_release # The .wgetrc is the anonymous user WGETRC=`pwd`/.wgetrc export WGETRC wget --timestamping \ ftp://ftp.ncbi.nih.gov/genomes/M_musculus/pre_release/ideogram mkdir /cluster/data/mm8/cytoBand cd /cluster/data/mm8/cytoBand # Create bed file $HOME/kent/src/utils/createNcbiCytoBand.pl \ /cluster/data/mm8/pre_release/ideogram # Load the bed file hgLoadBed -strict -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql mm8 \ cytoBand cytoBand.bed # Make cytoBandIdeo track for ideogram gif on hgTracks page. # For mouse cytoBandIdeo is just a replicate of the cytoBand track. 
hgsql -e "drop table cytoBandIdeo;" mm8 hgsql mm8 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" ######################################################################### # GENSCAN PREDICTIONS (DONE - 2006-05-03 - 2006-05-05 - Hiram) ssh kkstore04 # Create a 2bit file with the full chrom sequences and the # random contigs, all hard masked cat ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \ | maskOutFa stdin hard stdout \ | faToTwoBit stdin mm8Chroms_RandomContigs.hard.2bit # make sure it still has all the unmasked sequence in it: twoBitToFa mm8Chroms_RandomContigs.hard.2bit stdout \ | faSize stdin # 2661205088 bases (1183272085 N's 1477933003 real 1477933003 # upper 0 lower) in 99 sequences in 1 files twoBitToFa mm8.2bit stdout | faSize stdin # 2664455088 bases (97171400 N's 2567283688 real 1477933003 upper # 1089350685 lower) in 34 sequences in 1 files # note the 'real' bases are the same, the lowers have become N's # 1089350685 + 97171400 = 1186522085 # 1186522085 - 1183272085 = 3250000 == N's in gaps between contigs # And, make sure there aren't any sequences in this lot that have # become all N's with no sequence left in them: twoBitToFa mm8Chroms_RandomContigs.hard.2bit stdout \ | faCount stdin > chroms_randoms.faCount # the lowest three are: egrep -v "^#|^total" chroms_randoms.faCount \ | awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3 # MmUn_162590_36 1631 # Mm1_163269_36 1581 # MmUn_102813_36 1479 # creating 4,000,000 sized chunks, the chroms stay together as # single pieces. The contigs get grouped together into 4,000,000 # sized fasta files. You don't want to break these things up # because genscan will be doing its own internal 2.4 million # window on these pieces, and the gene names are going to be # constructed from the sequence name in these fasta files. 
The # gene names are much better when they are this simple chrN.M # numbering scheme, or in the case of a contig: contig_name.M # where the M is a sequence number that genscan will assign to # each gene it discovers. mkdir hardChunks twoBitToFa mm8Chroms_RandomContigs.hard.2bit stdout \ | faSplit about stdin 4000000 hardChunks/c_ rsync -a --progress hardChunks/ /cluster/bluearc/mm8/hardChunks/ ssh hgwdev mkdir /cluster/data/mm8/bed/genscan cd /cluster/data/mm8/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/mm8/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) # Since we split on gaps, we have no chunks like that. You can # verify with faCount on the chunks. ls -1S /cluster/bluearc/mm8/hardChunks/c_*.fa > genome.list # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << emacs gensub2 genome.list single template jobList para create jobList para try, check, push, check, ... 
# Completed: 673 of 673 jobs # CPU time in finished jobs: 76339s 1272.32m 21.21h 0.88d 0.002 y # IO & Wait Time: 2327s 38.78m 0.65h 0.03d 0.000 y # Average job time: 117s 1.95m 0.03h 0.00d # Longest finished job: 1993s 33.22m 0.55h 0.02d # Submission to last job: 7526s 125.43m 2.09h 0.09d # There was a failed job, going to kolossus and running with a # reduced window size: ssh kolossus cd /cluster/data/mm8/bed/genscan time /cluster/bin/x86_64/gsBig /cluster/bluearc/mm8/hardChunks/c_01.fa \ gtf/c_01.gtf -trans=pep/c_01.pep -subopt=subopt/c_01.bed \ -exe=hg3rdParty/genscanlinux/genscan \ -par=hg3rdParty/genscanlinux/HumanIso.smat \ -tmp=/scratch/tmp -window=2000000 # real 258m34.800s # cat and lift the results into single files ssh kkstore04 cd /cluster/data/mm8/bed/genscan cat gtf/c_*.gtf | liftUp -type=.gtf genscan.gtf \ ../../jkStuff/liftAll.lft carry stdin cat subopt/c_*.bed | liftUp -type=.bed genscanSubopt.bed \ ../../jkStuff/liftAll.lft carry stdin cat pep/c_*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/mm8/bed/genscan ldHgGene mm8 -gtf genscan genscan.gtf # Read 44899 transcripts in 323099 lines in 1 files # 44899 groups 34 seqs 1 sources 1 feature types # 44899 gene predictions hgPepPred mm8 generic genscanPep genscan.pep hgLoadBed -strict mm8 genscanSubopt genscanSubopt.bed # Loaded 530201 elements of size 6 # check the numbers time nice -n +19 featureBits mm8 genscan # 54455852 bases of 2567283971 (2.121%) in intersection time nice -n +19 featureBits mm8 knownGene:cds # 28459053 bases of 2567283971 (1.109%) in intersection featureBits mm7 genscan # 54864694 bases of 2583394090 (2.124%) in intersection time nice -n +19 featureBits mm7 knownGene:cds # 27531524 bases of 2583394090 (1.066%) in intersection featureBits mm6 genscan # 54894283 bases of 2597150411 (2.114%) in intersection featureBits mm5 genscan # 55024722 bases of 2615483787 (2.104%) in intersection featureBits mm4 genscan # 56164126 bases of 2627444668 (2.138%) 
in intersection featureBits mm3 genscan # 51697165 bases of 2505900260 (2.063%) in intersection featureBits mm8 genscanSubopt # 57048581 bases of 2567283971 (2.222%) in intersection featureBits mm7 genscanSubopt # 57512333 bases of 2583394090 (2.226%) in intersection featureBits mm6 genscanSubopt # 57856316 bases of 2597150411 (2.228%) in intersection featureBits mm5 genscanSubopt # 58474899 bases of 2615483787 (2.236%) in intersection featureBits mm4 genscanSubopt # 59601009 bases of 2627444668 (2.268%) in intersection featureBits mm3 genscanSubopt # 56085184 bases of 2505900260 (2.238%) in intersection ########################################################################## # BUILD NIBB IMAGE PROBES (in progress 2007-05-05 Jim) # Make directory on san for cluster job and copy in sequence ssh pk mkdir /san/sanvol1/scratch/mm8/nibbPics cd /san/sanvol1/scratch/mm8/nibbPics cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa . # Make parasol job dir and sequence list files mkdir run cd run mkdir psl ls -1 /scratch/hg/mm8/nib/*.nib > genome.lst echo ../nibbImageProbes.fa > mrna.lst # Create parasol gensub file cat << '_EOF_' > gsub #LOOP blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl #ENDLOOP '_EOF_' # Create parasol batch gensub2 genome.lst mrna.lst gsub spec para create spec # Do para try/push/time etc.
#Completed: 49 of 49 jobs
#CPU time in finished jobs:      12585s     209.74m     3.50h    0.15d  0.000 y
#IO & Wait Time:                   411s       6.86m     0.11h    0.00d  0.000 y
#Average job time:                 265s       4.42m     0.07h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            1145s      19.08m     0.32h    0.01d
#Submission to last job:          1195s      19.92m     0.33h    0.01d

# Make sort and filter
#   The final sed turns the nib path used as target name into a plain
#   chrom name: first strip the directory prefix, then the ".nib" suffix.
#   The dot is escaped so that only a literal ".nib" is removed, not any
#   character followed by "nib".
catDir psl | sort -k 10 \
    | pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
    | sort -k 14,14 -k 16,16n \
    | sed -e 's#/scratch/hg/mm8/nib/chr#chr#' -e 's/\.nib//' \
    > ../nibbImageProbes.psl

# Make bed file and copy in stuff
ssh hgwdev
cd /cluster/data/mm8/bed
mkdir nibbPics
cd nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
cp /san/sanvol1/scratch/mm8/nibbPics/nibbImageProbes.psl .

# Load into database
#   The /gbdb link points at the xenTro1 copy of the probe sequences.
ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa /gbdb/mm8/nibbImageProbes.fa
hgLoadSeq mm8 /gbdb/mm8/nibbImageProbes.fa
hgLoadPsl mm8 nibbImageProbes.psl

#############################################################################
# miRNA track (DONE - 2006-05-22 - Fan)
#   data from: Michel.Weber@ibcg.biotoul.fr
#   notify them when done.
ssh hgwdev
cd /cluster/data/mm8/bed
mkdir miRNA-2006-05-22
# work in the directory just created (name must match the mkdir above)
cd miRNA-2006-05-22
# save the mm8_miRNA_track_may2006.txt file from email
# convert the space separated columns to tab separated for hgLoadBed
sed -e 's/ /\t/g' mm8_miRNA_track_may2006.txt > miRNA.tab
hgLoadBed -strict mm8 miRNA miRNA.tab
# check previous release track before update
featureBits mm8 miRNA
# 28630 bases of 2567283971 (0.001%) in intersection
featureBits mm7 miRNA
# 20620 bases of 2583394090 (0.001%) in intersection
featureBits mm6 miRNA
# 21167 bases of 2597150411 (0.001%) in intersection
featureBits mm5 miRNA
# 17957 bases of 2615483787 (0.001%) in intersection

#########################################################################
# BLASTZ CHICKEN galGal3 (DONE 5/24/06 angie)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.galGal3.2006-05-23
cd /cluster/data/mm8/bed/blastz.galGal3.2006-05-23
# DEF is the parameter file handed to doBlastzChainNet.pl below
cat << '_EOF_' > DEF
# mouse vs chicken
BLASTZ=blastz.v7.x86_64

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_SMSK=/san/sanvol1/scratch/mm8/linSpecRep/notInNonMammal
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/san/sanvol1/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastz.galGal3.2006-05-23
'_EOF_'
# << emacs
# run in the background, logging to do.log, and follow the log
doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/gg3vsmm8 \
    -bigClusterHub=pk -smallClusterHub=pk \
    -chainMinScore=5000 -chainLinearGap=loose \
    >& do.log & tail -f do.log
ln -s blastz.galGal3.2006-05-23 /cluster/data/mm8/bed/blastz.galGal3

#########################################################################
# ADD LINK TO GENENETWORK (DONE. 5/31/06 Fan).
# Copy geneNetwork ID list from mm7 ssh hgwdev mkdir -p /cluster/data/mm8/bed/geneNetwork cd /cluster/data/mm8/bed/geneNetwork hgsql mm7 -N -e 'select * from geneNetworkId' > geneNetworkId.tab hgsql mm8 -e 'drop table geneNetworkId' hgsql mm8 < ~/src/hg/lib/geneNetworkId.sql hgsql mm8 -e \ 'load data local infile "geneNetworkId.tab" into table geneNetworkId' ############################################################################ # SGP GENES (DONE - 2006-06-12 - Hiram) ssh kkstore02 cd /cluster/data/mm8/bed ln -s /cluster/store8/mm8/bed/sgp . cd sgp # They don't do chrM for C in `awk '{print $1}' /cluster/data/mm8/chrom.sizes | grep -v chrM` do wget --timestamping \ "http://genome.imim.es/genepredictions/M.musculus/mmMar2006/SGP/humangp200603/${C}.gtf" \ -O "${C}.gtf" done ssh hgwdev cd /cluster/data/mm8/bed/sgp ldHgGene -gtf -genePredExt mm8 sgpGene chr*.gtf featureBits mm8 -enrichment refGene:CDS sgpGene # refGene:CDS 1.063%, sgpGene 1.455%, both 0.918%, cover 86.32%, # enrich 59.32x ######################################################################### # BUILD KNOWN GENE LIST FOR GOOGLE. (DONE. 6/6/06 Fan). cd /cluster/data/mm8/bed rm -rf knownGeneList/mm8 # Run hgKnownGeneList to generate the tree of HTML pages # under ./knownGeneList/mm8 hgKnownGeneList mm8 # copy over to /usr/local/apache/htdocs rm -rf /usr/local/apache/htdocs/knownGeneList/mm8 mkdir -p /usr/local/apache/htdocs/knownGeneList/mm8 cp -Rfp knownGeneList/mm8/* /usr/local/apache/htdocs/knownGeneList/mm8 ######################################################################### ### IGTC (Int'l GeneTrap Consortium) (DONE - 2006-06-12 - angie) ### fasta added 2006-06-21 ### Doug Stryke in Tom Ferrin's lab ### NOTE -- as of 2007-03-01 the igtc track will be automatically ### updated on hgwdev by the scripts monthlyUpdateIgtc.csh and ### updateIgtc.pl in kent/src/hg/utils/automation/ . 
ssh hgwdev mkdir /cluster/data/mm8/bed/igtc cd /cluster/data/mm8/bed/igtc wget http://www.genetrap.org/blattrack/genetrap_mm8.psl grep -v ^track genetrap_mm8.psl \ | hgLoadPsl mm8 -table=igtc stdin # Probe fasta is shared by all assemblies: wget http://www.genetrap.org/blattrack/genetrap.fasta mkdir /gbdb/mm8/igtc ln -s /cluster/data/mm8/bed/igtc/genetrap.fasta /gbdb/mm8/igtc/ hgLoadSeq -replace mm8 /gbdb/mm8/igtc/genetrap.fasta ######################################################################### # REGULATORY POTENTIAL (DONE - 2006-06-12 - Hiram) # download data from "James Taylor" ssh kkstore04 cd /cluster/data/mm8/bed mkdir /cluster/store8/mm8/bed/regPotential7X ln -s /cluster/store8/mm8/bed/regPotential7X . cd regPotential7X # This is a lot of data time for C in 1 2 3 4 5 6 7 8 9 X 10 11 12 13 14 15 16 17 18 19 do wget --timestamping \ "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_mm8/chr${C}.scores.truncated.bz2" done # real 79m32.840s wget --timestamping \ "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_mm8/trackDb.html" -O description.html time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X do bzcat chr${C}.scores.truncated.bz2 done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 22m28.583s # Loading the table on hgwdev ssh hgwdev cd /cluster/data/mm8/bed/regPotential7X ln -s /cluster/data/mm8/bed/regPotential7X/regPotential7X.wib \ /gbdb/mm8/wib/regPotential7X.wib # using the tmpDir is faster since it is on local disk and it will # clean up any temporary .tab file it creates there time hgLoadWiggle -tmpDir=/scratch/tmp \ mm8 regPotential7X regPotential7X.wig # real 0m28.683s # create a histogram ssh kolossus cd /cluster/data/mm8/bed/regPotential7X time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \ -hMinVal=0.0 -db=mm8 regPotential7X > histogram.data 2>&1 # real 18m29.167s # create download gzip files from 
the bz2 files: ssh kkstore04 cd /cluster/data/mm8/bed/regPotential7X for F in chr*.scores.truncated.bz2 do C=`echo $F | awk -F'.' '{print $1}'` echo -n "${C}.regPotential7X.mm8.gz working ... " bzcat ${F} | gzip > ${C}.regPotential7X.mm8.gz echo done ############################################################################# # SIB Transcriptome (DONE Aug 29, 2007 - JK) # Create working directory and download data from where Christian Iseli # (Christian.Iseli@licr.org) put it, and unpack. The download takes about # ten minutes (161M file). cd /cluster/data/mm8/bed mkdir sibTranscriptome cd sibTranscriptome wget ftp://ftp.licr.org/pub/databases/trome/mouse/MTR.gtf.gz wget ftp://ftp.licr.org/pub/databases/trome/mouse/txg.tar.gz tar -zxvf txg.tar.gz # Load up sibGene table zcat MTR.gtf.gz | ldHgGene mm8 sibGene stdin # Do a little data cleanup and transformation and load splice graphs into database. sed 's/altGraphX/sibTxGraph/' ~/src/hg/lib/altGraphX.sql > sibTxGraph.sql cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql mm8 sibTxGraph stdin # Create sibAltEvents track for analysed alt-splices. cat txg/*.txg | txgAnalyze stdin /cluster/data/mm8/mm8.2bit sibAltEvents.bed awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed hgLoadBed mm8 sibAltEvents foo.bed ######################################################################### # MAP CONTIGS TRACK (DONE - 2005-10-04 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm8/bed/ctgPos cd /cluster/data/mm8/bed/ctgPos # hgCtgPos uses the lift files... but mouse lift files are for the # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs # from the assembly. (In the future, we should go with the NT's!) # So... just for this release, go straight from the seq_contig.md # to the table def'n: contig, size, chrom, chromStart, chromEnd # This script is an improvement from before, this is now doing the # randoms properly. 
cat << '_EOF_' > seqContigToCtgPos.pl
#!/usr/bin/env perl
#
# Convert seq_contig.md rows (stdin or file arguments) into ctgPos rows:
#   contig, size, chrom, chromStart, chromEnd
# Fields used from each whitespace separated input row:
#   [1] chrom (written as "chrom|contig" for contigs placed on a random),
#   [2] start (1-based), [3] end, [5] contig name

use warnings;
use strict;

my $prevRandom="";
my $randomPosition=0;

while(my $line=<>)
{
chomp($line);
my @a = split('\s+',$line);
if ($a[1] =~ m/\|/) {
    # contig belongs on chrN_random: lay the contigs out one after the
    # other, restarting at 0 whenever the chrom name changes
    my @b = split('\|',$a[1]);
    if ($b[0] ne $prevRandom) {
        $randomPosition=0;
        $prevRandom=$b[0];
    }
    my $size = $a[3]-$a[2]+1;
    my $start = $randomPosition;
    my $end = $randomPosition + $size;
    printf "%s\t%d\tchr%s_random\t%d\t%d\n", $a[5],$size,$b[0],$start,$end;
    # the same 50000 base gap follows every contig, chrUn included
    $randomPosition += $size + 50000;
} elsif ($a[5] =~ m/^N[TC]_\d+$/) {
    # placed NT_/NC_ contig: convert the 1-based start to 0-based
    my $start = $a[2]-1;
    my $end = $a[3];
    my $size = $end-$start;
    printf "%s\t%d\tchr%s\t%d\t%d\n", $a[5],$size,$a[1],$start,$end;
}
}
'_EOF_'
# << emacs happy
chmod +x seqContigToCtgPos.pl
egrep "ref_strain|C57BL" ../../seq_contig.md \
    | ./seqContigToCtgPos.pl > ctgPos.tab
# NOTE(review): the next command overwrites the ctgPos.tab written by the
#   egrep-filtered run above, so only the unfiltered result is loaded.
#   Confirm which of the two runs was intended.
cat ../../seq_contig.md | ./seqContigToCtgPos.pl > ctgPos.tab
hgsql mm8 -e "drop table ctgPos;"
hgsql mm8 < ~/kent/src/hg/lib/ctgPos.sql
hgsql mm8 -e 'load data local infile "ctgPos.tab" into table ctgPos;'
featureBits -countGaps mm8 ctgPos
# 2573322222 bases of 2664455088 (96.580%) in intersection
featureBits -countGaps mm7 ctgPos
# 2608810329 bases of 2847717329 (91.611%) in intersection
featureBits -countGaps mm6 ctgPos
# 2638893452 bases of 3079633452 (85.689%) in intersection
featureBits -countGaps mm5 ctgPos
# 2557081173 bases of 3164952073 (80.794%) in intersection

#####################################################################
#### LOAD ENSEMBL GENES (DONE - 2006-06-21 - Hiram)
# ADDED PEPTIDE TABLE, ENSPEP (DONE, 2006-07-11, hartera)
# ADDED STABLE URL TO TRACKDB BLOCK (V39, JUN 2006) (2008-01-10, rhead)
mkdir /cluster/data/mm8/bed/ensGene
# work in the mm8 directory just created (later steps use
#   /cluster/data/mm8/bed/ensGene and its testPeptides subdirectory)
cd /cluster/data/mm8/bed/ensGene
# Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
# Choose Ensembl 39 and Mus musculus, click next
# It displays status in a window on the right, indicating how
#   many entries are here, currently: 27,967
# The next page is the "filter" step, we do not want any filters,
#   nothing is changed on this page, click next
# Now we are on the "output" tab, the filter in the window on the right
#   indicates that 27,967 passed the filter.  (there is no filter)
# Now, on this output page, change the pull-down menu item from its
#   default of "features" to read "structures"
# All the check-boxes now change.
# Mark the check box GTF under output format
# Under Gene Ensemble Attributes,
#   Unselect Biotype
#   Select Ensembl Gene ID
#          Ensembl Transcript ID
#          External Gene ID
#   gzip compression and give it a filename: ensGeneMm8
#   it will add the .gff.gz suffix
#   press "export"

# The random coordinates are given in contig
#   coordinates, need to lift them to chroms, create a lift file:
# For every *_random chrom in chrom.sizes the script prints one lift line
#   per ctgPos row: chromStart, contig, size, chrom, chrom size.
# (cat, not echo: echo ignores stdin and would write an empty script)
cat << '_EOF_' > mkRandomNTLift.sh
#!/bin/sh
grep random /cluster/data/mm8/chrom.sizes | while read R
do
    chr=`echo $R | awk '{print $1}'`
    size=`echo $R | awk '{print $2}'`
    hgsql -N -e "select * from ctgPos where chrom=\"$chr\";" mm8 | \
    awk '
BEGIN {size="'$size'"}
{
printf "%s\t%s\t%s\t%s\t%s\n", $4, $1, $2, $3, size
}
'
done
'_EOF_'
# << happy emacs
chmod +x ./mkRandomNTLift.sh
./mkRandomNTLift.sh > randomNT.lft
# Add "chr" to front of each line (that is a normal chrom number)
#   in the gene data gtf file to make
#   it compatible with ldHgGene and convert the chrMT name, and lift
#   the random coordinates
zcat ensGeneMm8.gff.gz \
    | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \
    | liftUp ensGene.gtf randomNT.lft carry stdin
ldHgGene mm8 ensGene ensGene.gtf
# Read 34831 transcripts in 597575 lines in 1 files
# 34831 groups 34 seqs 1 sources 4 feature types
# 34831 gene predictions
featureBits mm8 ensGene
# 56159487 bases of 2567283971 (2.188%) in intersection
featureBits mm7 ensGene
# 57484684 bases of 2583394090 (2.225%) in intersection
featureBits mm6 ensGene
# 54791625 bases of 2597150411 (2.110%) in intersection

# Load ensGtp table.
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper.
# Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
# Choose Ensembl 39 and Mus musculus, click next
# Follow this sequence through the pages:
# 1) No filters in the filter section, click next go to Output
# 2) Select "Structures".
# 3) select Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# 4) select "Text, tab separated" gzip and name the output file as "ensGtp"
# 5) download the output file "ensGtp.tsv.gz"
#   the tsv.gz is added automatically to the ensGtp
# Something is unusual in this download.  The lines are duplicated
#   about 8 times more than necessary
zcat ensGtp.tsv.gz | wc -l
# 284554
zcat ensGtp.tsv.gz | sort -u | wc -l
# 34832
# NOTE(review): the original notes ran "gunzip ensGtp.tsv.gz" at this
#   point, but the load below reads the compressed file with zcat, so the
#   file is left compressed here.
hgsql mm8 < ~/kent/src/hg/lib/ensGtp.sql
# The 'tail -n +2' skips the first line which is just column
#   heading labels.  The sort -u will eliminate the duplicate lines:
zcat ensGtp.tsv.gz | tail -n +2 | sort -u \
    | hgsql mm8 -e \
        'load data local infile "/dev/stdin" into table ensGtp;'
hgsql -e "select count(*) from ensGtp;" mm8
# 34831
# properly, one less than the count above
# clean up
gzip ensGene.gtf
rm genePred.tab

# Now, an experiment to determine if the Ensembl peptide sequences
#   are the same thing we get here upon translation of the CDS coding
#   sequence from the genome
mkdir /cluster/data/mm8/bed/ensGene/testPeptides
cd /cluster/data/mm8/bed/ensGene/testPeptides
getRnaPred -cdsOnly mm8 ensGene all stdout | gzip > all.cdsOnly.gz
# Obtaining protein sequence from EnsMart
#   Select "sequences" from the pull-down on the output page
#   check Peptide in the "Sequences" selection area
#   and "Ensembl Transcript ID (versioned) in the Transcript
#   Attributes area
#   Text,Fasta output, gzip, file name: ensPepMm8
#   becomes ensPepMm8.fasta.gz
# A special faToTab.pl script to allow an exclude list, first need
#   to obtain the exclude list from the ensembl set:
#   (first pass, nothing excluded: /dev/null is the exclude list)
zcat ensPepMm8.fasta.gz \
    | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
    | sed -e "/^$/d; s/\*$//" | sort > ensPepMm8.fa.tab
# extract the exclude list from that
grep "Sequence unavailable" ensPepMm8.fa.tab \
    | awk '{print $1}' > excludeList.txt
# now filter via that exclude list, remove the final '*' character
#   from their protein sequence and sort by name
#   (this rewrites ensPepMm8.fa.tab from the first pass)
zcat ensPepMm8.fasta.gz \
    | ~/kent/src/utils/faToTab/faToTab.pl excludeList.txt /dev/stdin \
    | sed -e "/^$/d; s/\*$//" | sort > ensPepMm8.fa.tab
# and then our peptides, same filter, remove the final 'Z' character
#   from this protein sequence (the stop codon):
zcat all.cdsOnly.gz | faTrans stdin stdout \
    | ~/kent/src/utils/faToTab/faToTab.pl excludeList.txt /dev/stdin \
    | sed -e "/^$/d; s/Z$//" | sort > all.fa.tab
# do we have the same lists:
awk '{print $1}' ensPepMm8.fa.tab > ensList
awk '{print $1}' all.fa.tab > ucscList
diff ensList ucscList
# no differences in the name list, numbering:
wc -l ensList ucscList
# 31302 ensList
# 31302 ucscList
# How many proteins different:
diff ensPepMm8.fa.tab all.fa.tab | grep "^>" | awk '{print $2}' | wc -l
# 37
# Taking a look at that difference, it is difficult to see the
#   individual differences, some are single amino acid
#   differences, others are more radically different:
diff ensPepMm8.fa.tab all.fa.tab | less
# Conclusion, the 37 differences out of 31,302 are not worth the
#   trouble to load up the entire Ensembl peptide table

# Add Ensembl peptide table - requested by a user (hartera, 2006-07-11)
ssh hgwdev
cd /cluster/data/mm8/bed/ensGene
cat << EOF > ensPep.sql
CREATE TABLE ensPep (
    name varchar(255) not null, # Name of gene - same as in genePred
    seq longblob not null,      # Peptide sequence
              #Indices
    PRIMARY KEY(name(64))
);
EOF
# NOTE(review): ensPepMm8.fa.tab is written uncompressed by the steps
#   above; presumably it was gzipped by hand before this copy - confirm.
cp ./testPeptides/ensPepMm8.fa.tab.gz .
gunzip ensPepMm8.fa.tab.gz
hgLoadSqlTab mm8 ensPep ensPep.sql ensPepMm8.fa.tab -warn

###########################################################################
## MAKE SUPERFAMILY TRACK (DONE, 6/22/06, Fan)
# If mm8.superfamily already exists, drop it.
cd /cluster/data/mm8/bed mkdir /cluster/data/mm8/bed/sf.20060622 ln -s sf.20060622 sf cd sf hgSuperfam mm8 superfam060619 > sf.log # It is normal that many proteins do not have corresponding Superfamily entries. # If mm8.sfDescription exists, drop it. hgsql mm8 < ~/src/hg/lib/sfDescription.sql hgsql mm8 -e 'LOAD DATA local INFILE "sfDescription.tab" into table mm8.sfDescription;' # Finally, load the superfamily table. hgLoadBed mm8 superfamily superfamily.tab -tab # Create knownToEnsembl table hgMapToGene mm8 ensGene knownGene knownToEnsembl # Create knownToSuperfamily table # Note hs is changed into ht for this Superfamily release. cat /cluster/data/superfamily/060619/ass_18-Jun-2006.tab \ | hgKnownToSuper mm8 mm stdin # 26547 records output ########################################################################### # dbSNP BUILD 126 (Heather, August 2006) # Set up directory structure ssh kkstore02 cd /cluster/data/dbSNP/126/mouse mkdir mm8 cd mm8 mkdir data mkdir schema mkdir rs_fasta # Get data from NCBI (anonymous FTP) cd /cluster/data/dbSNP/126/mouse/mm8/data ftp ftp.ncbi.nih.gov cd snp/organisms/mouse_10090/database/organism_data # ContigLoc table has coords, orientation, loc_type, and refNCBI allele get b126_SNPContigLoc_36_1.bcp.gz # ContigLocusId has function get b126_SNPContigLocusId_36_1.bcp.gz get b126_ContigInfo_36_1.bcp.gz # MapInfo has alignment weights get b126_SNPMapInfo_36_1.bcp.gz # SNP has univar_id, validation status and heterozygosity get SNP.bcp.gz # Get schema from NCBI cd /cluster/data/dbSNP/126/mouse/mm8/schema ftp ftp.ncbi.nih.gov cd snp/organisms/mouse_10090/database/organism_schema get mouse_10090_table.sql.gz # Get fasta files from NCBI # using headers of fasta files for molType cd /cluster/data/dbSNP/126/mouse/rs_fasta ftp ftp.ncbi.nih.gov cd snp/organisms/mouse_10090/rs_fasta prompt mget *.gz # add rs_fasta to seq/extFile # 2 edits first: strip header to just rsId, and remove duplicates # work on /cluster/store12 (kkstore05) which 
# has more disk space
cp rs_ch*.fas.gz /cluster/store12/snp/126/mouse/rs_fasta
ssh kkstore05
cd /cluster/store12/snp/126/mouse/rs_fasta
# concat into rsAll.fas
# NOTE(review): the files copied above are named rs_ch*.fas.gz while the
#   loop below globs rs_ch*.fas and reads them with zcat, and nothing in
#   these notes runs concat.csh - confirm how rsAll.fas was produced.
cat << '_EOF_' > concat.csh
#!/bin/csh -ef
rm -f rsAll.fas
foreach file (rs_ch*.fas)
    echo $file
    zcat $file >> rsAll.fas
end
'_EOF_'
# snpCleanSeq strips the header and skips duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa
rm rsAll.fas

# load on hgwdev
ssh hgwdev
mkdir /gbdb/mm8/snp
ln -s /cluster/store12/snp/126/mouse/rs_fasta/snp.fa /gbdb/mm8/snp/snp.fa
cd /cluster/store12/snp/126/mouse/rs_fasta
hgLoadSeq mm8 /gbdb/mm8/snp/snp.fa

# look up id in extFile
# move into separate table
#   (9642470 is the extFile id used for snp.fa in this load)
hgsql mm8 < snpSeq.sql
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 9642470' mm8
hgsql -e 'delete from seq where extFile = 9642470' mm8
hgsql -e 'alter table snpSeq add index acc (acc)' mm8

# clean up after hgLoadSeq
rm seq.tab

# Simplify names of data files
cd /cluster/data/dbSNP/126/mouse/mm8/data
mv b126_ContigInfo_36_1.bcp.gz ContigInfo.gz
mv b126_SNPContigLoc_36_1.bcp.gz ContigLoc.gz
mv b126_SNPContigLocusId_36_1.bcp.gz ContigLocusId.gz
mv b126_SNPMapInfo_36_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
ls -1 *.gz > filelist

# edit table descriptions
cd /cluster/data/dbSNP/126/mouse/mm8/schema
# get CREATE statements from mouse_10090_table.sql for our 5 tables
# store in table.tmp
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp

# Get updated UniVariation table
#   (the cd/get lines after "ftp" are typed at the ftp prompt)
cd /cluster/data/dbSNP/126/shared
ftp ftp.ncbi.nih.gov
cd snp/database/shared_data
get UniVariation.bcp.gz
cd ../shared_schema
get dbSNP_main_table.sql.gz
# get UniVariation CREATE statement from dbSNP_main_table.sql
# use mssqlToMysql.sed to convert

# get header lines from rs_fasta
cd /cluster/data/dbSNP/126/mouse/mm8/rs_fasta
/bin/csh gnl.csh

# load on kkr5u00
ssh kkr5u00
# statement follows -e, database name comes last, as in the other
#   hgsql -e calls in these notes
hgsql -e 'create database mm8snp126' mysql
cd /cluster/data/dbSNP/126/mouse/mm8/schema
hgsql mm8snp126 < table.sql
cd ../data
/bin/csh load.csh
# note rowcount
# ContigLoc 23811983
# SNP 10837184
# MapInfo 23570302
# ContigLocusId 10317095
cd /cluster/data/dbSNP/126/shared
hgsql mm8snp126 < UniVariation.sql
zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' mm8snp126

# create working /scratch dir
cd /scratch/snp/126
mkdir mouse
cd mouse

# get mm8 ctgPos, load into mm8snp126, compare contig list between ctgPos and ContigInfo
# No issues in non-random
# No PAR issues

# get gnl files
cp /cluster/data/dbSNP/126/mouse/mm8/rs_fasta/*.gnl .

# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_strain"

# filter ContigLoc into ContigLocFilter
# this lifts from contig coords to chrom coords
# phys_pos_from is used to check coords for non-random chroms
# errors reported to stdout
# this gets rid of alternate assemblies (using ContigInfo)
# this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo)
# assumes all contigs are positively oriented; will abort if not true
# Note for mouse we also screen on assembly = "C57BL/6J" in MapInfo

mysql> desc ContigLocFilter;
# +---------------+-------------+------+-----+---------+-------+
# | Field         | Type        | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id        | int(11)     | NO   |     |         |       |
# | ctg_id        | int(11)     | NO   |     |         |       |
# | chromName     | varchar(32) | NO   |     |         |       |
# | loc_type      | tinyint(4)  | NO   |     |         |       |
# | start         | int(11)     | NO   |     |         |       |
# | end           | int(11)     | YES  |     | NULL    |       |
# | orientation   | tinyint(4)  | NO   |     |         |       |
# | allele        | blob        | YES  |     | NULL    |       |
# +---------------+-------------+------+-----+---------+-------+

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter mm8snp126 ref_strain C57BL/6J
# note rowcount
# ContigLocFilter 7923033
# how many are positive strand?
hopefully 90% mysql> select count(*) from ContigLocFilter where orientation = 0; # 7779413 # note count by loc_type mysql> select count(*), loc_type from ContigLocFilter group by loc_type; # +----------+----------+ # | count(*) | loc_type | # +----------+----------+ # | 2144 | 1 | # | 7903966 | 2 | # | 13105 | 3 | # | 1052 | 4 | # | 523 | 5 | # | 2243 | 6 | # +----------+----------+ # filter ContigLocusId into ContigLocusIdFilter /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter mm8snp126 ref_strain # note rowcount # ContigLocusIdFilter 3484757 # condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions) # assumes SNPs are in numerical order; will errAbort if not true /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense mm8snp126 # note rowcount; expect about 50% (ascertainment bias for SNPs within genes) # ContigLocusIdCondense 2789998 # could delete ContigLocusIdFilter table here # create chrN_snpFasta tables from *.gnl files # we are just using molType, but also storing class and observed # need chromInfo for this /cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta mm8snp126 # (could start using pipeline.csh here) # (pipeline.csh takes about 35 minutes to run) # split ContigLocFilter by chrom # create the first chrN_snpTmp # we will reuse this table name, adding/changing columns as we go # at this point chrN_snpTmp will have the same description as ContigLocFilter # this opens a file handle for every chrom, so will not scale to scaffold-based assemblies /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom mm8snp126 ref_strain # adjust coords using loc_type # possible errors logged to snpLocType.error: # Unknown locType # Between with end != start + 1 # Between with allele != '-' # Exact with end != start # Range with end < start # possible exceptions logged to snpLocType.exceptions: # RefAlleleWrongSize # This run no errors, no exceptions # morph chrN_snpTmp mysql> desc 
chr1_snpTmp; # +---------------+-------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +---------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # +---------------+-------------+------+-----+---------+-------+ /cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype mm8snp126 ref_strain # expand allele as necessary # report syntax errors to snpExpandAllele.errors # possible exceptions logged to snpExpandAllele.exceptions: # RefAlleleWrongSize # This run no errors, no exceptions # 200? alleles expanded /cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele mm8snp126 ref_strain # the next few steps prepare for working in UCSC space # sort by position /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort mm8snp126 ref_strain # rename MT --> M (pipeline.csh takes care of this) hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" mm8snp126 # get mm8 nib files # get mm8 chromInfo, load into mm8snp126 with editted path # lookup reference allele in nibs # keep reverse complement to use in error checking (snpCheckAlleles) # check here for SNPs larger than 1024 # errAbort if detected # check for coords that are too large, log to snpRefUCSC.error and skip # This run no errors /cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC mm8snp126 # morph chrN_snpTmp mysql> desc chr1_snpTmp; # +--------------------+-------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +--------------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | orientation | 
tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # | refUCSC | blob | YES | | NULL | | # | refUCSCReverseComp | blob | YES | | NULL | | # +--------------------+-------------+------+-----+---------+-------+ # compare allele from dbSNP to refUCSC # locType between is excluded from this check # log exceptions to snpCheckAllele.exceptions # if SNP is positive strand, expect allele == refUCSC # log RefAlleleMismatch if not # if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp # If allele == refUCSCRevComp, log RefAlleleNotRevComp # If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch # This run we got: # 0 RefAlleleMismatch # 9621 RefAlleleNotRevComp /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles mm8snp126 # add class and observed using univar_id from SNP table # to get class (subsnp_class) and observed (var_str) from UniVariation # log errors to snpClassAndObserved.errors # errors detected: # class = 0 in UniVariation # class > 8 in UniVariation # univar_id = 0 in SNP # no row in SNP for snp_id in chrN_snpTmp # This run we got: # 3 class = 0 in UniVariation # 0 class > 8 in UniVariation # 2890606 univar_id = 0 in SNP (strange, but okay) # 0 no row in SNP for snp_id in chrN_snpTmp # dbSNP has class = 'in-del' # we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved mm8snp126 # morph chrN_snpTmp # +--------------------+---------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +--------------------+---------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | class | varchar(255) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # | refUCSC | blob | YES | | NULL 
| | # | refUCSCReverseComp | blob | YES | | NULL | | # | observed | blob | YES | | NULL | | # +--------------------+---------------+------+-----+---------+-------+ # generate exceptions for class and observed # SingleClassBetweenLocType # SingleClassRangeLocType # NamedClassWrongLocType # ObservedWrongFormat # ObservedWrongSize # ObservedMismatch # RangeSubstitutionLocTypeExactMatch # SingleClassTriAllelic # SingleClassQuadAllelic # This will also detect IUPAC symbols in allele /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved mm8snp126 # add function /cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction mm8snp126 # add validation status and heterozygosity # log error if validation status > 31 or missing # no errors this run /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP mm8snp126 # add molType # errors detected: missing or duplicate molType # 57709 duplicates /cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype mm8snp126 # generate chrN_snp126 and snp126Exceptions tables cp snpCheckAlleles.exceptions snpCheckAlleles.tab cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab cp snpExpandAllele.exceptions snpExpandAllele.tab cp snpLocType.exceptions snpLocType.tab /cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable mm8snp126 126 # concat into snp126.tab # cat chr*_snp126.tab >> snp126.tab /bin/sh concat.sh # check for multiple alignments /cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple mm8snp126 mysql> load data local infile 'snpMultiple.tab' into table snp126Exceptions; # load on hgwdev cp snp126.tab /cluster/home/heather/transfer/snp hgsql mm8snp126 -e 'select * from snp126Exceptions' > /cluster/home/heather/transfer/snp/snp126Exceptions.tab ssh hgwdev mysql> load data local infile 'snp126.tab' into table snp126; mysql> load data local infile 'snp126Exceptions.tab' into table snp126Exceptions; # create indexes mysql> alter table snp126 add index name (name); mysql> alter table snp126 add index chrom 
(chrom, bin); mysql> alter table snp126Exceptions add index name(name); # create snp126ExceptionDesc table cd /cluster/data/dbSNP hgsql mm8 < snp126ExceptionDesc.sql # add counts to exception.mouse.126, can start with exception.template hgsql -e 'select count(*), exception from snp126Exceptions group by exception' mm8 mysql> load data local infile 'exception.mouse.126' into table snp126ExceptionDesc; mysql> select count(*), exception from snp126Exceptions group by exception; +----------+---------------------------+ | count(*) | exception | +----------+---------------------------+ | 97271 | MultipleAlignments | | 1600 | ObservedMismatch | | 27 | ObservedWrongFormat | | 272 | ObservedWrongSize | | 9621 | RefAlleleNotRevComp | | 11169 | SingleClassBetweenLocType | | 346 | SingleClassQuadAllelic | | 5023 | SingleClassRangeLocType | | 3905 | SingleClassTriAllelic | +----------+---------------------------+ #################################################################### ## redoing STS markers track to get them more correct ## (DONE - 2006-09-15 - Hiram) # Went into the updateBed.pl script, reworked it, made it safer, # debugged a lot of things and placed it into the source tree. ssh hgwdev mkdir /cluster/data/mm8/bed/STSmarkers.2006-08-29 cd /cluster/data/mm8/bed/STSmarkers.2006-08-29 # with that fixed script, create a new stsInfoMouse.bed file: # Update the m m 7 directory name here to m m 8 # for the next build of m m 9, ...etc...
and so forth time ~/kent/src/hg/stsMarkers/updateBed.pl \ /cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed \ ../STSmarkers/downloads/MRK_Dump2.rpt \ ../STSmarkers/downloads/PRB_PrimerSeq.rpt \ ../STSmarkers/downloads/MRK_Sequence.rpt \ ../STSmarkers/downloads/UniSTS_mouse.alias \ ../STSmarkers/downloads/UniSTS_mouse.sts \ -g ../STSmarkers/downloads/10090.WI-Genetic.txt \ -r ../STSmarkers/downloads/10090.WI_MRC_RH.txt \ -verbose 2> dbg.updateBed | sed -e "s/\t*$//" > newbedfile ~/kent/src/hg/stsMarkers/cleanInfo.pl -mouse newbedfile \ | sed -e "s/\t*$//" > mm8.stsInfoMouse.bed # copy the stsInfoMouse.bed file from working dir to the marker # info storage fold. added 2 new steps by Yontao # be wary of the archive name here, check the directory and get # the name right here. mv /cluster/store5/mouseMarker/stsInfoMouse.bed \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.firstTime cp -p mm8.stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed # comparing to previous, numbers increase slightly each time wc -l /cluster/store5/mouseMarker/stsInfoMouse.bed \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.firstTime \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5 # 60631 /cluster/store5/mouseMarker/stsInfoMouse.bed # 60440 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.firstTime # 59843 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 # 58980 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 # 58493 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5 # and from that, create new primer fa, epcr, etc: time ~/kent/src/hg/stsMarkers/luConvertPrimerToFa \ mm8.stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info # the mouseC.fa file will be empty, should be more than last time wc -l mouse?.* ../STSmarkers/mouse?.* # 0 mouseC.fa # 308384 mouseP.fa # 34666 mouseP.info # 0 ../STSmarkers/mouseC.fa # 305991 ../STSmarkers/mouseP.fa # 34475 ../STSmarkers/mouseP.info # the 
equivalent Mm7 files: # 0 0 0 mouseC.fa # 300968 300914 6798466 mouseP.fa # 33838 169275 2153113 mouseP.info # 334806 470189 8951579 total # the equivalent Mm6 files: # 0 0 0 mouseC.fa # 293305 293251 6624638 mouseP.fa # 32890 164528 2087271 mouseP.info # 326195 457779 8711909 total # the equivalent Mm5 files: # 0 0 0 mouseC.fa # 286740 286686 6474893 mouseP.fa # 32232 161234 2044810 mouseP.info # 318972 447920 8519703 total # copy the primers over to some filesystem close to the klusters # and split them up to have a small number of sequences in one file mkdir /cluster/bluearc/mm8/stsMarkers.2006-08-29 cp -p mouseP.fa /cluster/bluearc/mm8/stsMarkers.2006-08-29 cd /cluster/bluearc/mm8/stsMarkers.2006-08-29 cp -p /cluster/data/mm8/11.ooc . mkdir split # 356 files for 34,666 sequences, == about 97 sequences per file faSplit sequence mouseP.fa 400 split/mm_ # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. # This process could convert to a modern version of blat with the # filters as described, for example, in the STS markers build in Hg18 # CLUSTER RUN FOR THE STS PRIMERS ssh kk cd /cluster/data/mm8/bed/STSmarkers.2006-08-29 mkdir primer mkdir ePCR cd primer mkdir out # interestingly, this blat2.2 binary did not function correctly # when given nib files. It has only about 1/4th of the number of # alignments as it gets when it used fa files for the target # sequence. 
ls -1S /cluster/bluearc/mm8/stsMarkers.2006-08-29/split > primers.list # re-using chrom sequences from first time ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list cat << '_EOF_' > runBlat2.csh #!/bin/csh -fe set primer = /cluster/bluearc/mm8/stsMarkers.2006-08-29/split/$1 set fa = /cluster/bluearc/mm8/stsMarkers/chroms/$2 set ooc = /cluster/bluearc/mm8/stsMarkers.2006-08-29/11.ooc set root2 = $2:r mkdir -p out/${root2} set out = $3 /cluster/bin/i386/blat.2 ${fa} ${primer} -ooc=${ooc} \ -minMatch=1 -minScore=0 -minIdentity=80 -oneOff ${out} '_EOF_' # << happy emacs chmod +x runBlat2.csh cat << '_EOF_' > template #LOOP ./runBlat2.csh $(path1) $(path2) {check out line+ out/$(root2)/$(root1).psl} #ENDLOOP '_EOF_' # << happy emacs gensub2 primers.list chr.list template jobList para create jobList para try ... check ... push ... etc ... # Completed: 12104 of 12104 jobs # CPU time in finished jobs: 1078733s 17978.89m 299.65h 12.49d 0.034 y # IO & Wait Time: 13537140s 225618.99m 3760.32h 156.68d 0.429 y # Average job time: 1208s 20.13m 0.34h 0.01d # Longest finished job: 11831s 197.18m 3.29h 0.14d # Submission to last job: 20458s 340.97m 5.68h 0.24d # on the file server ssh kkstore04 cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer time pslSort dirs primers.raw.psl temp out/chr* # real 3m30.758s # -rw-rw-r-- 1 588001891 Sep 15 10:02 primers.raw.psl # filter alignments for (qEnd-qStart) vs. (tEnd-tStart) # should not be more than 100 bases different. 
# This filters out about 948,260 alignments, or # %17.4 = 100.0 * 948260 / 5462936 time pslSort dirs stdout temp out/chr* | awk -F"\t" ' { if (((($13 - $12) - ($17 - $16)) > -100) && ((($13 - $12) - ($17 - $16)) < 100)) {print} } ' > primers.psl.100 rmdir temp wc -l *.100 *.psl # 5462936 primers.raw.psl # 4514676 primers.psl.100 # 948260 difference # a rough comparison with previous results: wc -l /cluster/data/mm8/bed/STSmarkers/primer/primers.psl.100 # 4500528 /cluster/data/mm8/bed/STSmarkers/primer/primers.psl.100 # another kluster run for the ePCR ssh pk cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list # pick up e-PCR source from # ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/ # version 2.3.1 11 Feb 2005 # Had to add the following to both re-PCR_main.cpp and # e-PCR_main.cpp to get them to compile on kolossus: // max and min Copied from /usr/include/mysql/my_global.h #define max(a, b) ((a) >? (b)) #define min(a, b) ((a) <? (b)) cat << '_EOF_' > runPCR #!/bin/csh -fe /cluster/bin/x86_64/e-PCR \ /cluster/data/mm8/bed/STSmarkers.2006-08-29/mouseP.info \ /cluster/bluearc/mm8/stsMarkers/chroms/$1 N=1 M=50 W=5 > $2 '_EOF_' # << happy emacs chmod +x runPCR cat << '_EOF_' > template #LOOP ./runPCR $(path1) {check out line+ out/$(num1).epcr} #ENDLOOP '_EOF_' # << the mouseP.info was created above gensub2 chr.list single template jobList para create jobList para try para check para push ... etc ... 
# There is a single job that produces no output: ./runPCR chrX_random.fa out/30.epcr # WARNING: 96 STSs have primer shorter than W # WARNING: 21 STSs have ambiguities within W of 3' end # Not sure what's up with that # Completed: 33 of 34 jobs # Crashed: 1 jobs # CPU time in finished jobs: 64904s 1081.73m 18.03h 0.75d 0.002 y # IO & Wait Time: 1860s 31.00m 0.52h 0.02d 0.000 y # Average job time: 2023s 33.72m 0.56h 0.02d # Longest finished job: 4861s 81.02m 1.35h 0.06d # Submission to last job: 4862s 81.03m 1.35h 0.06d ssh kkstore04 cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR # all those results become all.epcr cat out/*.epcr > all.epcr # comparing to previous results: wc -l all.epcr # 58162 all.epcr wc -l /cluster/data/mm8/bed/STSmarkers/ePCR/all.epcr # 58088 all.epcr cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer ~/kent/src/hg/stsMarkers/filterSTSPrimers \ -mouse ../mm8.stsInfoMouse.bed primers.psl.100 \ ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat # The output should show an increasing count: # Reading name info # Reading primer info # Processing file # 100000 # 200000 # 300000 # ... # 4500000 # Determining ePCR not found from ePCR results # Out of 25749 ePCR alignments examined, not found: 520 # wc -l primers.psl.filter.blat # 34043 primers.psl.filter.blat wc -l /cluster/data/mm8/bed/STSmarkers/primer/primers.psl.filter.blat # 34026 primers.psl.filter.blat # create file accession_info.rdb touch empty_sequence.inf ~/kent/src/hg/stsMarkers/compileAccInfo -mouse \ /cluster/data/mm8 empty_sequence.inf # 20502 processed mv accession_info.rdb accession_info.rdb.tmp ~/kent/src/hg/stsMarkers/sorttbl -x Chr Ord Start \ < accession_info.rdb.tmp > accession_info.rdb # The -x prints the debug statement: # sort arg: -t" " +0 -1 +1 -2g +2 -3g rm accession_info.rdb.tmp # comparing results to previous # Continuing the trend that began with Mm7, the numbers in # accession_info.rdb continue to decrease. 
Even Mm8 has much less # fragments than did mm7: # e.g.: [hiram@kkstore04 /cluster/data] wc -l mm8/*/chr*.agp | tail -1 # 21910 total [hiram@kkstore04 /cluster/data] wc -l mm7/*/chr*.agp | tail -1 # 70125 total [hiram@kkstore04 /cluster/data] wc -l mm6/*/chr*.agp | tail -1 # 170812 total wc -l accession_info.rdb # 20385 accession_info.rdb wc -l ../../STSmarkers/primer/accession_info.rdb # 20385 ../../STSmarkers/primer/accession_info.rdb # creates epcr.not.found.nomatch and epcr.not.found.psl ~/kent/src/hg/stsMarkers/epcrToPsl -mouse \ epcr.not.found ../mouseP.info \ accession_info.rdb /cluster/data/mm8 2> dbg.epcrToPsl # the dbg.epcrToPsl has a number of lines complaining about bad # primers in ../mouseP.info - and indeed they are bad primers, # they do not have a second primer. # Comparing results to previous: wc -l epcr* # 520 epcr.not.found # 0 epcr.not.found.nomatch # 520 epcr.not.found.psl wc -l ../../STSmarkers/primer/epcr* # 501 ../../STSmarkers/primer/epcr.not.found # 0 ../../STSmarkers/primer/epcr.not.found.nomatch # 501 ../../STSmarkers/primer/epcr.not.found.psl # Mm7 wc epcr* wc -l /cluster/data/mm7/bed/STSmarkers/primer/epcr* # 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found # 0 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.nomatch # 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.psl # 158 /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl # 1106 total # Mm6 wc epcr* wc -l /cluster/data/mm6/bed/STSmarkers/primer/epcr* # 472 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found # 63 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.nomatch # 404 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.psl # 158 /cluster/data/mm6/bed/STSmarkers/primer/epcrToPsl # 1097 total cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter wc -l primers.psl.filter # 34563 primers.psl.filter wc -l ../../STSmarkers/primer/primers.psl.filter # 34527 primers.psl.filter wc -l 
/cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter # 34460 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter # 33532 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter wc -l /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted # 33691 /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted # create primers.psl.filter.lifted.initial # The PATH setting allows extractPslInfo to find other programs that it # is going to use. PATH=~/kent/src/hg/stsMarkers:$PATH \ ~/kent/src/hg/stsMarkers/extractPslInfo primers.psl.filter wc -l *.initial # 34545 primers.psl.filter.initial wc -l ../../STSmarkers/primer/*.initial # 34513 ../../STSmarkers/primer/primers.psl.filter.initial wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial # 34443 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial # 33514 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial wc -l \ /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial # 33689 # create primers.psl.filter.lifted.initial.acc PATH=~/kent/src/hg/stsMarkers:$PATH \ ~/kent/src/hg/stsMarkers/findAccession -agp \ -mouse primers.psl.filter.initial /cluster/data/mm8 wc -l primers.psl.filter.initial.acc # 34545 primers.psl.filter.initial.acc wc -l ../../STSmarkers/primer/primers.psl.filter.initial.acc # 34513 primers.psl.filter.initial.acc # this needs to be -rat as that specifies how to scan the # stsInfoMouse.bed file and it does not work if you use -mouse # it is not clear what -mouse would mean to this script, some other file # format perhaps from the stsInfoMouse.bed format. 
~/kent/src/hg/stsMarkers/getStsId -rat \ ../mm8.stsInfoMouse.bed primers.psl.filter.initial.acc \ | sort -k4,4n > primers.final wc -l primers.final # 34545 primers.final wc -l ../STSmarkers/primer/primers.final # 34513 primers.final cd /cluster/data/mm8/bed/STSmarkers.2006-08-29 # stsMarkers.final is empty for mouse touch stsMarkers.final dummy PATH=~/kent/src/hg/stsMarkers:$PATH \ ~/kent/src/hg/stsMarkers/combineSeqPrimerPos \ stsMarkers.final primer/primers.final > stsMarkers_pos.rdb wc -l stsMarkers_pos.rdb # 33048 stsMarkers_pos.rdb wc -l ../STSmarkers/stsMarkers_pos.rdb # 33075 stsMarkers_pos.rdb PATH=~/kent/src/hg/stsMarkers:$PATH \ ~/kent/src/hg/stsMarkers/createStsBed \ mm8.stsInfoMouse.bed stsMarkers_pos.rdb 500 \ | sort -k1,1 -k2,2n | sed -e "s/ //g" > stsMapMouse.bed # The sed removes unneeded blanks # verify score profile remains similar awk -F'\t' '{print $5}' stsMapMouse.bed | sort -n | uniq -c # 546 500 # 1650 750 # 27705 1000 awk -F'\t' '{print $5}' ../STSmarkers/stsMapMouse.bed | sort -n | uniq -c # 546 500 # 1648 750 # 27692 1000 wc -l stsMapMouse.bed # 29901 stsMapMouse.bed wc -l ../STSmarkers/stsMapMouse.bed # 29888 stsMapMouse.bed # loading STS markers tables ssh hgwdev cd /cluster/data/mm8/bed/STSmarkers.2006-08-29 ~/kent/src/hg/stsMarkers/ucscAlias.pl \ mm8.stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings # this does leave messages in ucscStsAlias.warnings but they seem # to be very similar to Mm6 with just a few new ones wc -l ucscStsAlias.tab # 146767 ucscStsAlias.tab wc -l ../STSmarkers/ucscStsAlias.tab # 146064 ucscStsAlias.tab # After extensive comparison with the currently existing STS markers, it # appears that this new set only has a couple of new ones, and a couple # of ones have been dropped. It seems that the primary correction has # been to the marker positions. 
ssh hgwdev cd /cluster/data/mm8/bed/STSmarkers.2006-08-29 # Saving the existing tables for archival purposes hgsql -e "alter table stsInfoMouseNew rename as stsInfoMouseNewFeb2006;" mm8 hgsql -e "alter table stsAlias rename as stsAliasFeb2006;" mm8 hgsql -e "alter table all_sts_primer rename as all_sts_primerFeb2006;" mm8 hgsql -e "alter table stsMapMouseNew rename as stsMapMouseNewFeb2006;" mm8 hgsql mm8 < ~/kent/src/hg/lib/stsAlias.sql hgsql -e \ 'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm8 hgsql mm8 < ~/kent/src/hg/lib/stsMapMouseNew.sql hgsql -e \ 'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm8 hgsql mm8 < ~/kent/src/hg/lib/stsInfoMouseNew.sql hgsql -e \ 'load data local infile "mm8.stsInfoMouse.bed" into table stsInfoMouseNew;' mm8 hgLoadPsl -nobin -table=all_sts_primer mm8 primer/primers.psl.filter # load of all_sts_primer did not go as planned: 34563 record(s), 0 # row(s) skipped, 1 warning(s) loading primer/primers.psl.filter # After warnings, checkTableCoords to find problems: checkTableCoords -verboseBlocks mm8 all_sts_primer # mm8.all_sts_primer item 61999 chr10:62418012-62418048: blocks 0 and 1 overlap. # mm8.all_sts_primer has 1 records with overlapping blocks. 
# Strip the offending item from the load: # Verify the grep takes out only one item: wc -l primer/primers.psl.filter # 34563 primer/primers.psl.filter grep -P "\t61999\t" primer/primers.psl.filter | wc -l # 1 # and thus leaves the rest grep -v -P "\t61999\t" primer/primers.psl.filter | wc -l # 34562 grep -v -P "\t61999\t" primer/primers.psl.filter > fixed.primers.psl.filter hgLoadPsl -nobin -table=all_sts_primer mm8 fixed.primers.psl.filter # load primer sequences rm /gbdb/mm8/stsMarker/mouseP.fa ln -s /cluster/data/mm8/bed/STSmarkers.2006-08-29/mouseP.fa \ /gbdb/mm8/stsMarker/mouseP.fa # PLEASE NOTE: If you are going to reload this business, use the # -replace option on this hgLoadSeq # hgLoadSeq -replace mm8 /gbdb/mm8/stsMarker/mouseP.fa # otherwise there will be a problem that the seq and extFile tables # will be out of sync. hgLoadSeq -replace mm8 /gbdb/mm8/stsMarker/mouseP.fa # Adding /gbdb/mm8/stsMarker/mouseP.fa # 34666 sequences # Warning: load of seq did not go as planned: 34666 record(s), # 0 row(s) skipped, 1 warning(s) loading ./seq.tab featureBits mm8 all_sts_primer # 3700897 bases of 2567283971 (0.144%) in intersection featureBits mm8 all_sts_primerFeb2006 # 3746196 bases of 2567283971 (0.146%) in intersection featureBits mm7 all_sts_primer # 3757119 bases of 2583394090 (0.145%) in intersection featureBits mm6 all_sts_primer # 3677372 bases of 2597150411 (0.142%) in intersection featureBits mm8 stsMapMouseNew # 4812616 bases of 2567283971 (0.187%) in intersection featureBits mm8 stsMapMouseNewFeb2006 # 4801964 bases of 2567283971 (0.187%) in intersection featureBits mm7 stsMapMouseNew # 4805958 bases of 2583394090 (0.186%) in intersection featureBits mm6 stsMapMouseNew # 4638338 bases of 2597150411 (0.179%) in intersection hgsql -N mm8 -e "select count(*) from stsAlias;" # 146767 hgsql -N mm8 -e "select count(*) from stsAliasFeb2006;" # 141981 hgsql -N mm7 -e "select count(*) from stsAlias;" # 140649 hgsql -N mm6 -e "select count(*) from 
stsAlias;" # 137738 hgsql -N mm5 -e "select count(*) from stsAlias;" # 122944 hgsql -N mm8 -e "select count(*) from stsInfoMouseNew;" # 60440 hgsql -N mm7 -e "select count(*) from stsInfoMouseNew;" # 59843 hgsql -N mm6 -e "select count(*) from stsInfoMouseNew;" # 58980 hgsql -N mm5 -e "select count(*) from stsInfoMouseNew;" # 58493 # compare old and new name lists, not much difference: awk '{print $4}' stsMapMouse.bed | sort -u > mm8.nameList # in common with previous version comm -12 ../STSmarkers/mm8.nameList mm8.nameList | wc -l # 28687 # unique to previous version comm -23 ../STSmarkers/mm8.nameList mm8.nameList | wc -l # 11 # unique to this new set comm -13 ../STSmarkers/mm8.nameList mm8.nameList | wc -l # 20 ########################################################################## # N-SCAN gene predictions (nscanGene) - (2006-08-30 markd) cd /cluster/data/mm8/bed/nscan/ # obtained NSCAN predictions from michael brent's group # at WUSTL mv ardor.wustl.edu/jeltje/mm8/chr_ptx . rm -rf ardor.wustl.edu rm chr_*/index.html* gzip chr_*/* chmod a-w chr_*/*.gz # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. 
ldHgGene -bin -gtf -genePredExt mm8 nscanGene chr_gtf/chr*.gtf.gz # load protein, add .1 suffix to match transcript id hgPepPred -suffix=.1 mm8 generic nscanPep chr_ptx/chr*.fa.gz rm *.tab # update trackDb; need a mm8-specific page to describe informants mouse/mm8/nscanGene.html (copy from hg18 and edit) mouse/mm8/trackDb.ra # changed search regex to termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9] ##################################################################### # SEGMENTAL DUPLICATIONS (DONE 9/18/06 angie) # File emailed from Ginger Cheng mkdir /cluster/data/mm8/bed/genomicSuperDups cd /cluster/data/mm8/bed/genomicSuperDups awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' mm8_WGAC.tab \ | hgLoadBed mm8 genomicSuperDups stdin \ -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql # 8/29/07 Gak! Kayla found that the strand values were "+" and "_" -- fix: hgsql mm8 -e 'update genomicSuperDups set strand = "-" where strand = "_";' ##################################################################### # CELERA COVERAGE (WSSD -- DEPTH OF COVERAGE) (DONE 10/16/06 angie) # File emailed from Ginger Cheng mkdir /cluster/data/mm8/bed/wssd cd /cluster/data/mm8/bed/wssd tail +2 mm8_WSSD_DOC.tab \ | hgLoadBed mm8 wssdCoverage stdin ##################################################################### ## NIA Mouse Gene Index - (DONE, Fan, 10/6/06) # requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov ssh hgwdev mkdir -p /cluster/data/mm8/bed/NIAGene061003 cd /cluster/data/mm8/bed ln -s NIAGene061003 NIAGene cd NIAGene wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex/mm8/download/T-fasta.ff.gz wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex/mm8/download/T-psl.txt.gz gzip -d *.gz cut -f 1-21 T-psl.txt >NIAGene.tab hgLoadPsl mm8 NIAGene.tab mkdir /gbdb/mm8/NIAGene ln -s /cluster/data/mm8/bed/NIAGene/T-fasta.fa /gbdb/mm8/NIAGene/T-fasta.fa hgLoadSeq mm8 /gbdb/mm8/NIAGene/T-fasta.fa # Create/edit/check in NIAGene.html and trackDb.ra under 
kent/src/hg/makeDb/trackDb/mouse/mm8 ##################################################################### # LOAD GENEID GENES (DONE - 2006-10-09 - Fan) ssh hgwdev mkdir -p /cluster/data/mm8/bed/geneid/download cd /cluster/data/mm8/bed/geneid/download bash awk '{print $1}' ../../../chrom.sizes | while read C do echo $C wget --timestamping \ http://genome.imim.es/genepredictions/M.musculus/mmMar2006/geneid_v1.2/$C.gtf wget --timestamping \ http://genome.imim.es/genepredictions/M.musculus/mmMar2006/geneid_v1.2/$C.prot done exit # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. ldHgGene -genePredExt -gtf mm8 geneid download/*.gtf #Read 35954 transcripts in 284585 lines in 34 files # 35954 groups 34 seqs 1 sources 3 feature types # 35954 gene predictions hgPepPred mm8 generic geneidPep download/*-fixed.prot featureBits mm8 -enrichment refGene geneid # refGene 1.842%, geneid 1.592%, both 0.883%, cover 47.95%, enrich 30.13x featureBits mm7 -enrichment refGene geneid # refGene 1.835%, geneid 1.579%, both 0.866%, cover 47.18%, enrich 29.88x ##################################################################### # RN4 RECIPROCAL BEST CHAINS/NETS (DONE - 2006-10-10 - Angie) doRecipBest.pl mm8 rn4 \ >& /cluster/data/mm8/bed/blastz.rn4/axtChain/recipBest.log & tail -f /cluster/data/mm8/bed/blastz.rn4/axtChain/recipBest.log ############################################################################## ############################################################################ # Load CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2006-10-10 markd) cd /cluster/data/genbank/data/ccds/ ftp ftp-private.ncbi.nih.gov (user ccds, needs password) get CCDS.20061010.tar.gz mkdir /scratch/tmp/ccds cd /scratch/tmp/ccds tar -zxf /cluster/data/genbank/data/ccds/CCDS.20061010.tar.gz # import ccds database tables /cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt # create and load ccdsGene and ccdsInfo tables from imported database 
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds mm8 ccdsInfo ccdsGene /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=mm8 -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords mm8 -verbose=2 ccdsGene joinerCheck -database=mm8 -identifier=ccdsGeneId ~/compbio/kent/src/hg/makeDb/schema/all.joiner rm -rf /scratch/tmp/ccds # build initial version of ccdsMgcMap table ./x86_64/mkCcdsGeneMap -loadDb -db=mm8 -loadDb ccdsGene mgcGenes ccdsMgcMap # load trackDb cd kent/src/hg/makeDb/trackDb make alpha # request push of ccdsGene ccdsInfo ccdsKgMap # << emacs ############################################################################ # JAX TRACKS (DONE 10/20/06 angie - UPDATED 7/18/07, 9/27/07) # Table jaxQTL renamed to jaxQtl on 1/7/10 (see NOTE FOR NEXT TIME below) ssh kkstore04 mkdir /cluster/data/mm8/bed/jax/2007_09 cd /cluster/data/mm8/bed/jax/2007_09 wget ftp://ftp.informatics.jax.org/pub/gbrowse/\* wget ftp://ftp.informatics.jax.org/pub/reports/MGI_PhenotypicAllele.rpt # Jax Rep Transcript track # SEQ_RepTransGenomic_rpt.gff --> jaxRepTranscript{,Alias} # -- names like AK016604_4933401J01Rik, NM_001011874_AY534250 # -- aliases ~ MGI:\d+ # Use simple perl script to uniquify transcript names and make alias.tab. # Inspired by the mm6 version, but format has changed. ../2007_07/parseRepTranscript.pl SEQ_RepTransGenomic_rpt.gff \ > jaxRepTranscript.gff # Jax Allele track # AL_*.gff --> jaxAllele{,Info} # -- bed12Source -- add type from filename # -- names like NM_011283_Rp1h, XM_129721_Slc9a2 # -- Info: name, mgiID, source {"Gene trapped", ...} cp ../2007_07/parseAllele.pl . # Edit to accommodate latest format tweaks. 
rm -f jaxAllele.bed jaxAlleleInfo.tab fixJaxAllele.sql foreach f (AL*.gff) set type = `echo $f:t:r \ | sed -e 's/AL_//; s/GTRAP/GeneTrapped/; s/IND/Induced/; \ s/OTHER/Other/; s/SPON/Spontaneous/; s/TARG/Targeted/; \ s/TRANS/Transgenic/;'` parseAllele.pl $f \ | ldHgGene mm8 placeholder stdin -nobin -out=stdout \ | /cluster/bin/scripts/genePredToBed \ | sed -e 's/$/'"\t$type"'/' \ >> jaxAllele.bed end # This round's formatting inconsistencies: #source not given for NM_015770_a #source not given for NM_029931_Mllt3 #source not given for NM_009521_Wnt3 #source not given for NM_011640_Trp53 #source not given for NM_001081049_Mll1 #Missing > for mRNA name NM_001081193_Lemd3 jaxPhenotype{,Alias} # -- bed12Source -- add type from filename # -- names like NM_001001488_Atp8b1 rm -f jaxPhenotype.bed jaxPhenotypeAlias.tab fixJaxPhenotype.sql foreach f (MP_*.gff) set type = `echo $f:t:r \ | perl -wpe 's/MP_[0-9]*_//; s/[_-](\w)/\u$1/g; s/^(\w)/\u$1/; \ s@AdiposeTissue@Adipose@; \ s@BehaviorNeurological@Behavior@; \ s@CardiovascularSystem@Cardiovascular@; \ s@DigestiveAlimentary@Digestive@; \ s@EndocrineExocrineGland@Gland@; \ s@GrowthSize@Growth Size@; \ s@HearingEar@Hearing/Ear@; \ s@HematopoieticSystem@Hematopoietic@; \ s@HomeostasisMetabolism@Homeostasis@; \ s@ImmuneSystem@Immune@; \ s@LethalityEmbryonicPerinatal@Embryonic Lethal@; \ s@LethalityPostnatal@Postnatal Lethal@; \ s@LifeSpanPostWeaningAging@Life Span@; \ s@LimbsDigitsTail@Limbs and Tail@; \ s@LiverBiliarySystem@Liver and Bile@; \ s@NervousSystem@Nervous System@; \ s@RenalUrinarySystem@Renal/Urinary@; \ s@ReproductiveSystem@Reproductive@; \ s@RespiratorySystem@Respiratory@; \ s@SkinCoatNails@Skin/Coat/Nails@; \ s@TasteOlfaction@Taste/Smell@; \ s@TouchVibrissae@Touch@; \ s@Tumorigenesis@Tumorigenesis@; \ s@VisionEye@Vision/Eye@;'` echo $type ../2006_10/parsePhenotype.pl $f \ | ldHgGene mm8 placeholder stdin -nobin -out=stdout \ | /cluster/bin/scripts/genePredToBed \ | sed -e 's@$@'"\t$type"'@' \ >> jaxPhenotype.bed 
end sort -u jaxPhenotypeAlias.tab > tmp mv tmp jaxPhenotypeAlias.tab # Jax QTL track # QTL*.gff --> jaxQtl2 (or 3?)... but we're missing MIT SSLP marker # and CM distance for 2, or those plus flanking markers for 3... perl -wpe 'chomp; s/\s*$//; \ ($chr, undef, undef, $start, $end, undef, $strand, undef, $info) = \ split("\t"); \ if ($info =~ /QTL (\w+); Dbxref "(MGI:\d+)"; Alias .*; Note "([^"]+)"/) { \ ($name, $mgiID, $desc) = ($1, $2, $3); \ } else { die "parse\n$info"; } \ $start--; \ s/^.*$/$chr\t$start\t$end\t$name\t1000\t$strand\t\t$mgiID\t$desc\t0.0\n/;' \ QTL_build36_03_alias.gff > jaxQtl.bed # Extract phenotype-allele relationships: # Make a file for the one code not already in a filename: cp /dev/null MP_0003012_no_phenotypic_analysis # Wrote a script to extract the phenotype-allele relationships -- # it uses the filenames to map MP:* codes to our phenotype names. ../2007_07/parsePhenotypicAllele.pl MGI_PhenotypicAllele.rpt \ > jaxAllelePheno.tab # The file "err" has messages about missing data (no gene name in # PhenotypicAllele.rpt, or gene/mgiId not found in jaxAlleleInfo). 
# Load tables ssh hgwdev cd /cluster/data/mm8/bed/jax/2007_09 # jaxRepTranscript ldHgGene mm8 jaxRepTranscript jaxRepTranscript.gff hgsql mm8 < fixJaxRepTranscript.sql sed -e 's/genericAlias/jaxRepTranscriptAlias/g' \ ~/kent/src/hg/lib/genericAlias.sql > jaxRepTranscriptAlias.sql hgLoadSqlTab mm8 jaxRepTranscriptAlias \ jaxRepTranscriptAlias.sql jaxRepTranscriptAlias.tab # jaxAllele sed -e 's/bed12Source/jaxAllele/g' \ $HOME/kent/src/hg/lib/bed12Source.sql > jaxAllele.sql hgLoadBed -sqlTable=jaxAllele.sql mm8 jaxAllele jaxAllele.bed hgsql mm8 < fixJaxAllele.sql hgLoadSqlTab mm8 jaxAlleleInfo \ ~/kent/src/hg/lib/jaxAlleleInfo.sql jaxAlleleInfo.tab # jaxPhenotype sed -e 's/bed12Source/jaxPhenotype/g' \ $HOME/kent/src/hg/lib/bed12Source.sql > jaxPhenotype.sql hgLoadBed -tab -sqlTable=jaxPhenotype.sql mm8 jaxPhenotype jaxPhenotype.bed hgsql mm8 < fixJaxPhenotype.sql sed -e 's/genericAlias/jaxPhenotypeAlias/' \ ~/kent/src/hg/lib/genericAlias.sql > jaxPhenotypeAlias.sql hgLoadSqlTab mm8 jaxPhenotypeAlias \ jaxPhenotypeAlias.sql jaxPhenotypeAlias.tab ### NOTE FOR NEXT TIME ### ### Call the table jaxQtl instead of jaxQTL -- QA doesn't like jaxQTL. ### (brooke) In fact, QA renamed the table to jaxQtl on 1/7/10 on hgwdev and ### mysqlbeta with this command: mysql> alter table jaxQTL rename to jaxQtl; ### (to make trackDb load with a single trackDb.ra entry for mm8 and mm9) ### Use -sqlTable=$HOME/kent/src/hg/lib/jaxQtl.sql . 
# jaxQTL hgLoadBed -tab -notItemRgb -noBin \ -sqlTable=$HOME/kent/src/hg/lib/jaxQTL.sql \ mm8 jaxQTL jaxQtl.bed checkTableCoords -verbose=2 mm8 jaxQTL #mm8.jaxQTL item Scpro11 chr18:131504376-131504512: chromEnd > chromSize 90736837 #mm8.jaxQTL item Tswt chr18:134822025-134822132: chromEnd > chromSize 90736837 #mm8.jaxQTL item Ath13 chr14:164794113-164794369: chromEnd > chromSize 123978870 #mm8.jaxQTL item Dob7 chr11:131434708-131434798: chromEnd > chromSize 121798632 # Fix coords > chromSize: perl -wpe 's/^(\w+)\t(\d+)$/ \ delete from jaxQTL where chrom="$1" and chromStart >= $2; \ update jaxQTL set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \ ../../../chrom.sizes \ | hgsql mm8 checkTableCoords -verbose=2 mm8 jaxQTL # phenotype-allele relationships hgLoadSqlTab mm8 jaxAllelePheno \ ~/kent/src/hg/lib/jaxAllelePheno.sql jaxAllelePheno.tab # Check joiner: runJoiner.csh mm8 jaxAllele runJoiner.csh mm8 jaxPhenotype ########################################################################## # SWAP/CHAIN/NET GASACU1 (DONE 10/23/06 angie) ssh kkstore04 mkdir /cluster/data/mm8/bed/blastz.gasAcu1.swap cd /cluster/data/mm8/bed/blastz.gasAcu1.swap doBlastzChainNet.pl -swap /cluster/data/gasAcu1/bed/blastz.mm8/DEF \ -chainMinScore=2000 -chainLinearGap=loose >& do.log & tail -f do.log ln -s blastz.gasAcu1.swap /cluster/data/mm8/bed/blastz.gasAcu1 nice featureBits mm8 chainGasAcu1Link #52781141 bases of 2567283971 (2.056%) in intersection ######################################################################### # BLASTZ/CHAIN/NET FELCAT3 (Done Nov 15 2006 heather) # working in /cluster/data/felCat3 because /cluster/data/mm8 is 94% full mkdir /cluster/data/felCat3/bed/blastz.mm8.2006-11-14 ln -s /cluster/data/felCat3/bed/blastz.mm8.2006-11-14 /cluster/data/mm8/bed/blastz.felCat3 cd /cluster/data/felCat3/bed/blastz.mm8.2006-11-14 cat << '_EOF_' > DEF BLASTZ_M=50 # TARGET: Mouse mm8 SEQ1_DIR=/scratch/hg/mm8/nib SEQ1_LEN=/scratch/hg/mm8/chrom.sizes SEQ1_CHUNK=50000000 
SEQ1_LAP=10000 # QUERY: Cat felCat3 SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit SEQ2_LEN=/san/sanvol1/scratch/felCat3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/felCat3/bed/blastz.mm8.2006-11-14 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk -chainMinScore=3000 -chainLinearGap=medium -blastzOutRoot /cluster/bluearc/felCat3/blastz.mm8 >& do.log & tail -f do.log nice featureBits -chrom=chr1 mm8 chainFelCat3Link 36333124 bases of 191450312 (18.978%) in intersection ######################################################################### # BLASTZ/CHAIN/NET BOSTAU3 (Done March 2007 heather) mkdir /cluster/data/mm8/bed/blastz.bosTau3.2007-03-14 ln -s /cluster/data/mm8/bed/blastz.bosTau3.2007-03-14 /cluster/data/mm8/bed/blastz.bosTau3 cd /cluster/data/mm8/bed/blastz.bosTau3 cat << '_EOF_' > DEF BLASTZ_M=50 # TARGET: Mouse mm8 SEQ1_DIR=/scratch/hg/mm8/nib SEQ1_LEN=/scratch/hg/mm8/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau3 SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit SEQ2_LEN=/san/sanvol1/scratch/bosTau3/chrom.sizes SEQ2_LIMIT=500 SEQ2_CHUNK=50000000 SEQ2_LAP=0 BASE=/cluster/data/mm8/bed/blastz.bosTau3.2007-03-14 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/bosTau3/blastz.mm8 >& do.log & tail -f do.log nice featureBits -chrom=chr1 mm8 chainBosTau3Link # 49896121 bases of 191450312 (26.062%) in intersection ############################################################################# # REBUILD miRNA TRACK (DONE - 2006-12-01 - Fan) # updated data from: Michel.Weber@ibcg.biotoul.fr # notify them when done. 
ssh hgwdev cd /cluster/data/mm8/bed mkdir miRNA-2006-12-01 cd miRNA-2006-12-01 # save the mmu8_miRNA.txt file from email # add the following line in mmu8_miRNA.txt per email from Michel. chrM 16114 16209 mmu-mir-805 480 - hgLoadBed -strict mm8 miRNA mmu8_miRNA.txt # check previous release track before update featureBits mm8 miRNA # 33033 bases of 2567283971 (0.001%) in intersection featureBits mm7 miRNA # 20620 bases of 2583394090 (0.001%) in intersection ############################################################################# # Create Allen Brain Atlas mapping. (Done 2007-02-08 Galt) # We are creating several things: a psl probe-track for the RR on mouse, # a link out from kg to the probe to the ABA website, # and a set of gene/probe info which visiGene will use. # (This needs to be done after have created sequences in # ncbiXm and tigrMgiTc as above.) # metadata.log and SRGEsequence.log was provided by # Susan Sunkin # this is an update to the visiGene with 6000 new images. # See mm6.txt for steps not needing to be repeated. # copy in the data files (directory already exists from previous build) ssh hgwdev cd /cluster/data/mm8/bed/allenBrain mkdir old mv * old/ cp /cluster/data/mm6/bed/allenBrain/allen20061204.tab . cp /cluster/data/mm6/bed/allenBrain/probeSeq.20061204.fasta . cp /cluster/data/mm6/bed/allenBrain/allProbes.fa . cp /cluster/data/mm6/bed/allenBrain/allProbes.tab . cp /cluster/data/mm6/bed/allenBrain/allenBrainUrl.tab . # Set up a blat run to align the probes. 
ssh pk cd /cluster/data/mm8/bed/allenBrain mkdir split faSplit sequence allProbes.fa 200 split/rp mkdir run cd run ls -1 ../split/*.fa > mrna.lst ls -1 /scratch/hg/mm8/nib/*.nib > genome.lst mkdir psl cat << '_EOF_' > gsub #LOOP blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << happy emacs gensub2 genome.lst mrna.lst gsub spec para create spec # Then do the usual para try/push/time/check until the run is finished #Completed: 6596 of 6596 jobs #CPU time in finished jobs: 27258s 454.30m 7.57h 0.32d 0.001 y #IO & Wait Time: 19700s 328.33m 5.47h 0.23d 0.001 y #Average job time: 7s 0.12m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 39s 0.65m 0.01h 0.00d #Submission to last job: 549s 9.15m 0.15h 0.01d # Then do sorting and near-best-in-genome step on file server ssh kkstore cd /cluster/data/mm8/bed/allenBrain/run pslSort dirs raw.psl tmp psl pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl # Clean up big files no longer needed rm raw.psl rm -r psl rm -r ../split # Load up database ssh hgwdev cd /cluster/data/mm8/bed/allenBrain # Make a new table that contains the URLs for the allen brain genes # Make this one first since all.joiner considers it the master table. hgsql mm8 -e 'drop table allenBrainUrl' hgsql mm8 < ~/kent/src/hg/lib/allenBrainUrl.sql hgsql mm8 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl' # Make probe alignment table, and load sequence.
hgLoadPsl mm8 allenBrainAli.psl rm /gbdb/mm8/allenBrain/allProbes.fa ln -s /cluster/data/mm8/bed/allenBrain/allProbes.fa /gbdb/mm8/allenBrain/allProbes.fa hgLoadSeq -replace mm8 /gbdb/mm8/allenBrain/allProbes.fa # Make mapping between known genes and allenBrain hgMapToGene mm8 allenBrainAli -type=psl knownGene knownToAllenBrain ########################################################################## # xxBlastTab - Help filter out unwanted paralogs (Galt 2007-01-11) # # We are starting with xxBlastTab tables already built in the usual way with # blastall/blastp, probably with doHgNearBlastp.pl script. # # we want to update mm8 for human and rat, # so check ./hgGeneData/Mouse/mm8/otherOrgs.ra for current settings ssh hgwdev synBlastp.csh mm8 hg18 #mm8.hgBlastTab #new number of unique query values: #25178 #new number of unique target values #15328 #old number of unique query values: #28286 #old number of unique target values #15901 synBlastp.csh mm8 rn4 #mm8.rnBlastTab: #new number of unique query values: #11163 #new number of unique target values #6573 #old number of unique query values: #23183 #old number of unique target values #6890 ########################################################################## # GenBank gbMiscDiff table (markd 2007-01-10) # Supports `NCBI Clone Validation' section of mgcGenes details page # genbank release 157.0 now contains misc_diff fields for MGC clones # reloading mRNAs results in gbMiscDiff table being created. 
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna mm8 ########################################################################## ## WindowMasker (DONE - 2007-01-30 - Hiram) ssh kolossus mkdir /cluster/data/mm8/bed/WindowMasker.2007-01-29 cd /cluster/data/mm8/bed/WindowMasker.2007-01-29 # copy *.csh scripts from # /cluster/data/danRer4/bed/WindowMasker.2006-12-04 # and fixup the db name and work directory in those scripts, then: time nice -n +19 ./doCount.csh > doCount.out 2>&1 # real 67m32.178s time nice -n +19 ./doSdust.csh >doSdust.out 2>&1 # real 477m24.667s ssh kkstore04 cd /cluster/data/mm8/bed/WindowMasker.2007-01-29 gzip windowmasker.sdust.bed time nice -n +19 ./applyMask.csh > applyMask.out 2>&1 time nice -n +19 ./addTrf.csh > addTrf.out 2>&1 twoBitToFa mm8.sdTrf.2bit stdout | faSize stdin # 2664455088 bases (97171400 N's 2567283688 real 1644888505 upper # 922395183 lower) in 34 sequences in 1 files ssh hgwdev cd /cluster/data/mm8/bed/WindowMasker.2007-01-29 ########################################################################## ## AUGUSTUS ab initio predictions (DONE, 2007-01-30 - Mario) ssh hgwdev mkdir /cluster/data/mm8/bed/augustus cd /cluster/data/mm8/bed/augustus # get the program AUGUSTUS, e.g. 
from the web wget http://augustus.gobics.de/binaries/augustus.2.0.1.src.tar.gz # unpack tar xzf augustus.2.0.1.src.tar.gz # compile the binary if necessary cd augustus/src make augustus # create output directory cd /cluster/data/mm8/bed/augustus mkdir out err # create file with sequences and their sizes by modifying chrom.sizes cat ../../chrom.sizes | perl -e 'while(<>){s/chr([0-9a-zA-Z]+)(_random|)/\/cluster\/data\/mm8\/$1\/chr$1$2.fa.masked/; print;}' > seq.lst # create the job list augustus/scripts/createAugustusJoblist.pl --sequences seq.lst --chunksize 5300000 --overlap 300000 --command "/cluster/data/panTro2/bed/augustus/augustus/src/augustus --AUGUSTUS_CONFIG_PATH=/cluster/data/panTro2/bed/augustus/augustus/config --species=human --sample=100 --/augustus/verbosity=0" --outputdir /cluster/data/mm8/bed/augustus/out/ --errordir /cluster/data/mm8/bed/augustus/err/ --joblist job.lst para try para check para push # CPU time in finished jobs: 2984823s 49747.06m 829.12h 34.55d 0.095 y # IO & Wait Time: 19258s 320.96m 5.35h 0.22d 0.001 y # Average job time: 5403s 90.05m 1.50h 0.06d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 7896s 131.60m 2.19h 0.09d # Submission to last job: 15716s 261.93m 4.37h 0.18d # check the error files, should be no errors cat err/*.err cat out/*.gff | augustus/scripts/join_aug_pred.pl > augustus.pep.gff augustus/scripts/getAnnoFasta.pl augustus.pep.gff cat augustus.pep.gff | egrep "CDS|codon"> augustus.gff # load into database ssh hgwdev cd /cluster/data/panTro2/bed/augustus/ ldHgGene -bin mm8 augustus augustus.gff # 32377 gene predictions hgPepPred panTro2 generic augustusPep augustus.pep.aa featureBits mm8 augustus # 35380585 bases of 2567283971 (1.378%) in intersection ######################################################################### ## BLASTZ ANOCAR1 - Lizard - (DONE - 2007-02-19 - 2007-02-20 - Hiram) ssh kkstore04 mkdir /cluster/data/mm8/bed/blastz.anoCar1.2007-02-19 cd 
/cluster/data/mm8/bed/blastz.anoCar1.2007-02-19 cat << '_EOF_' > DEF # Mouse vs lizard BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse Mm8 SEQ1_DIR=/san/sanvol1/scratch/mm8/mm8.sdTrf.2bit SEQ1_LEN=/scratch/hg/mm8/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/mm8/bed/blastz.anoCar1.2007-02-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/mm8AnoCar1 > do.log 2>&1 & # real 544m52.722s # appears to have successfully finished ssh hgwdev cd /cluster/data/mm8/bed/blastz.anoCar1.2007-02-19 time nice -n +19 featureBits mm8 chainAnoCar1Link \ > fb.mm8.chainAnoCar1Link.txt 2>&1 # real 1m37.380s # 96286498 bases of 2567283971 (3.751%) in intersection # running the swap to anoCar1 - instructions in anoCar1.txt cd /cluster/data/anoCar1/bed/blastz.mm8.swap time nice -n +19 featureBits anoCar1 chainMm8Link \ > fb.anoCar1.chainMm8Link.txt 2>&1 # real 2m1.527s # 82784787 bases of 1741478929 (4.754%) in intersection ############################################################################# # UPDATED mm8.knownToVisiGene (DONE galt 2007-02-15) ######################################################################### # BLASTZ ORNANA1 (PLATYPUS) - (DONE 2007-03-02 angie) ssh kkstore04 mkdir /cluster/data/mm8/bed/blastz.ornAna1.2007-02-27 cd /cluster/data/mm8/bed/blastz.ornAna1.2007-02-27 cat << '_EOF_' > DEF # mouse vs. 
platypus # Use same params as used for hg18-danRer4 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: mm8 SEQ1_DIR=/scratch/hg/mm8/nib SEQ1_LEN=/scratch/hg/mm8/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: ornAna1 SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit SEQ2_LEN=/iscratch/i/ornAna1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/cluster/data/mm8/bed/blastz.ornAna1.2007-02-27 TMPDIR=/scratch/tmp '_EOF_' # << emacs doBlastzChainNet.pl DEF \ -workhorse kkr6u00 \ -blastzOutRoot /cluster/bluearc/mm8.ornAna1 \ >& do.log & tail -f do.log ############################################################################ # Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-03-02 markd) # see hg17.txt for build temporary ccds database for CCDS.20070228 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds mm8 ccdsInfo ccdsGene /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=mm8 -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords mm8 -verbose=2 ccdsGene # update all.joiner to include mm8 in ccdsDb joinerCheck -database=mm8 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # build initial version of ccdsMgcMap table, updated by nightly genbank update /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=mm8 ccdsGene mgcGenes ccdsMgcMap # load trackDb cd kent/src/hg/makeDb/trackDb make alpha # check in browser # request push of ccdsGene ccdsInfo ccdsKgMap ccdsMgcMap # << emacs ############################################################################ # CGAP SAGE (DONE Andy 2007-03-01) ssh hgwdev cd san/andy/mouseSage/ wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm.libraries.gz wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm_long.frequencies.gz wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_mm_long_forward_v36.1.tar.gz wget
ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_mm_long_reverse_v36.1.tar.gz tar xvfz SAGE_mm_long_forward_v36.1.tar.gz tar xvfz SAGE_mm_long_reverse_v36.1.tar.gz rm *.tar.gz chmod a+r -R mm_* chmod +x mm_* cd mm_forward/ cat * | awk 'BEGIN{OFS="\t"}{print $1, $3, $4, $2, 1000, "+"}' > ../unlifted.bed cd ../mm_reverse/ cat * | awk 'BEGIN{OFS="\t"}{print $1, $4, $3, $2, 1000, "-"}' >> ../unlifted.bed ctgPosToLft mm8 mm8.lft liftUp lifted.bed mm8.lft warn unlifted.bed awk 'BEGIN{OFS="\t"}{strand = $6; start = $2; end = $3; if (strand == "-") { thickStart = end; } else { start = start - 1; thickStart = start - 4; } thickEnd = thickStart + 4; print $1, start, end, $4, $5, strand, thickStart, thickEnd; }' lifted.bed > mapping.bed gunzip *.gz rm -rf mm_forward/ mm_reverse/ unlifted.bed lifted.bed mm8.lft awk 'BEGIN{FS="\t"}{sex = $13; for (i=1; i<=12; i++) { printf("%s\t", $i); } if (sex == "unknown") { sex = ""; } else if (sex == "male and fe male") { sex = "male,female,"} else if (sex == "male") { sex = "male,"} else {sex = "female,"}; printf("%s\t", sex); for (i=14; i<=20; i++) { printf("%s\t", $i); } print $21}' Mm.libraries | tail +2 > massaged.Mm.libraries cgapSageBedAddFreqs -noEmpty mapping.bed Mm_long.frequencies massaged.Mm.libraries cgapSage.bed ln -s ~/hg/lib/cgapSage/cgapSageLib.sql ln -s ~/hg/lib/cgapSage/cgapSage.sql hgLoadBed -sqlTable=cgapSage.sql mm8 cgapSage cgapSage.bed hgLoadSqlTab mm8 cgapSageLib cgapSageLib.sql massaged.Mm.libraries ############################ # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-04-02) ssh kkstore04 bash mkdir /cluster/data/mm8/blastDb cd /cluster/data/mm8 ls noMask/*.fa | grep -v random > temp.lst ls randomContigs/*.fa >> temp.lst cat `cat temp.lst` > temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /san/sanvol1/scratch/mm8/blastDb cd /cluster/data/mm8/blastDb for i in nhr nin nsq; do echo $i cp *.$i 
/san/sanvol1/scratch/mm8/blastDb done mkdir -p /cluster/data/mm8/bed/tblastn.hg18KG cd /cluster/data/mm8/bed/tblastn.hg18KG echo /san/sanvol1/scratch/mm8/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 2733 query.lst # we want around 150000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(150000/2733) = 669.165940 mkdir -p /cluster/bluearc/mm8/bed/tblastn.hg18KG/kgfa split -l 670 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/mm8/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/mm8/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/mm8/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/mm8/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/mm8/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/mm8/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit # back to bash ssh pk cd /cluster/data/mm8/bed/tblastn.hg18KG para create 
blastSpec para time # Completed: 150315 of 150315 jobs # CPU time in finished jobs: 24349624s 405827.07m 6763.78h 281.82d 0.772 y # IO & Wait Time: 1825515s 30425.24m 507.09h 21.13d 0.058 y # Average job time: 174s 2.90m 0.05h 0.00d # Longest finished job: 673s 11.22m 0.19h 0.01d # Submission to last job: 79743s 1329.05m 22.15h 0.92d ssh kkstore04 cd /cluster/data/mm8/bed/tblastn.hg18KG for i in blastOut/* do echo "cd $i; cat *.psl | pslSortAcc nohead chrom /tmp/ stdin ; cd ../.." done > sort.jobs sh -x sort.jobs tcsh mkdir chainRun cd chainRun cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne /cluster/home/braney/bin/x86_64/simpleChain -prot -outPsl -maxGap=150000 $1 `dirname $1`/c.`basename $1`.psl '_EOF_' chmod +x chainOne ls ../blastOut/*/chrom/*.psl > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh pk cd /cluster/data/mm8/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. #two batches # Completed: 2574 of 2574 jobs # CPU time in finished jobs: 3338223s 55637.04m 927.28h 38.64d 0.106 y # IO & Wait Time: 21934s 365.57m 6.09h 0.25d 0.001 y # Average job time: 1305s 21.76m 0.36h 0.02d # Longest finished job: 88204s 1470.07m 24.50h 1.02d # Submission to last job: 92614s 1543.57m 25.73h 1.07d # Completed: 2871 of 2871 jobs # CPU time in finished jobs: 2495054s 41584.24m 693.07h 28.88d 0.079 y # IO & Wait Time: 47207s 786.78m 13.11h 0.55d 0.001 y # Average job time: 885s 14.76m 0.25h 0.01d # Longest finished job: 59971s 999.52m 16.66h 0.69d # Submission to last job: 78852s 1314.20m 21.90h 0.91d ssh kkstore04 cd /cluster/data/mm8/bed/tblastn.hg18KG/blastOut bash for i in kg?? 
do cat $i/chrom/c.*.psl|awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/mm8/bed/tblastn.hg18KG/preLift.psl cd /cluster/data/mm8/bed/tblastn.hg18KG liftUp -type=.psl -nohead stdout ../../jkStuff/liftAll.lft carry preLift.psl | sort -k 14,14 -k 16,16n -k 17,17n > blastHg18KG.psl pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/mm8/bed/tblastn.hg18KG hgLoadPsl mm8 blastHg18KG.psl # check coverage nice featureBits mm8 blastHg18KG # 40445290 bases of 2567283971 (1.575%) in intersection # In comparison to cat and dog: nice featureBits felCat3 blastHg18KG # 15218612 bases of 1642698377 (0.926%) in intersection nice featureBits canFam2 blastHg18KG # 32565727 bases of 2384996543 (1.365%) in intersection featureBits mm8 refGene:cds blastHg18KG -enrichment # refGene:cds 1.157%, blastHg18KG 1.575%, both 0.927%, cover 80.15%, enrich # 50.88x ssh kkstore04 rm -rf /cluster/data/mm8/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/mm8/bed/tblastn.hg18KG/blastOut #end tblastn # EXONIPHY MM8, lifted from hg18 (DONE acs 2007-04-08) ssh hgwdev cd /cluster/data/mm8/bed mkdir exoniphy cd exoniphy hgLoadGenePred -genePredExt mm8 exoniphy exoniphyMm8.gp # exoniphyMm8.gp was prepared at Cornell as follows hgsql hg18 -e "select * from exoniphy" --skip-column-names > exoniphyHg18.gp liftOver -genePred exoniphyHg18.gp /usr/data/hg18/dbDerived/netSynteny/hg18.mm8.syn.chain exoniphyMm8.gp unmapped (where hg18.mm8.syn.chain represents the human/mouse syntenic net) ######################################################################### # BLASTZ/CHAIN/NET HORSE (DONE 2/21/07 Fan) ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.mm8.2007-02-17 cd /cluster/data/equCab1/bed/blastz.mm8.2007-02-17 cat << '_EOF_' > DEF # Horse vs.
Mouse BLASTZ_M=50 # TARGET: Horse equCab1 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=500 SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Mouse mm8 SEQ2_DIR=/scratch/hg/mm8/mm8.2bit SEQ2_LEN=/cluster/data/mm8/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.mm8.2007-02-17 TMPDIR=/scratch/tmp '_EOF_' # Fix script coloring _EOF_ # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab1/blastz.mm8 >& do.log & tail -f do.log ssh hgwdev cd /cluster/data/equCab1/bed/blastz.mm8.2007-02-17 ln -s blastz.mm8.2007-02-17 /cluster/data/equCab1/bed/blastz.mm8 nice featureBits equCab1 -chrom=chr1 chainMm8Link # 70800969 bases of 177498097 (39.888%) in intersection bash time nice -n 19 featureBits equCab1 chainMm8Link \ > fb.equCab1.chainMm8Link.txt 2>&1 # 903993981 bases of 2421923695 (37.325%) in intersection ssh kkstore05 mkdir /cluster/data/mm8/bed/blastz.equCab1.swap cd /cluster/data/mm8/bed/blastz.equCab1.swap bash time doBlastzChainNet.pl \ /cluster/data/equCab1/bed/blastz.mm8.2007-02-17/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 & tail -f swap.log # real 76m34.873s ssh hgwdev cd /cluster/data/mm8/bed/blastz.equCab1.swap bash time nice -n 19 featureBits mm8 chainEquCab1Link \ > fb.mm8.chainEquCab1Link.txt 2>&1 # 906568751 bases of 2567283971 (35.312%) in intersection ######################################################################### # CGAP SAGE (Done 2007-05-04) ssh hgwdev cd /san/sanVol1/scratch/andy mkdir cgapSage.mm8 cd cgapSage.mm8 wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm.libraries.gz wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm_long.frequencies.gz hgsql -e 'select * from snp126 where class="single" and locType="exact"' mm8 \ | tail +2 
| cut -f2- > snps.txt hgsql -e 'select name from snp126Exceptions where exception="ObservedWrongSize" or exception="SingleClassBetweenLocType" or exception="SingleClassRangeLocType" or exception="MultipleAlignment"' mm8 \ | tail +2 > exceptions tabGrep -v exceptions 4 snps.txt > tmp mv tmp snps.txt rm exceptions hgsql -e 'select chrom,chromStart,chromEnd,name from simpleRepeat' mm8 | tail +2 > trf.bed cut -f1-4 snps.txt > snps.bed overlapSelect -nonOverlapping trf.bed snps.bed /dev/stdout | cut -f4 > goodSnps.txt tabGrep goodSnps.txt 4 snps.txt > tmp mv tmp snps.txt rm trf.bed goodSnps.txt snps.bed ln -s /cluster/data/mm8/mm8.2bit ln -s /cluster/data/mm8/chrom.sizes ln -s ~/kent/src/hg/lib/cgapSage/cgapSageLib.sql tail +2 Mm.libraries | awk -f cleanLibs.awk > libs.txt hgLoadSqlTab mm8 cgapSageLib cgapSageLib.sql libs.txt partitionSequence.pl -lstDir small 5000000 30 mm8.2bit chrom.sizes 0 > sequence.lst grep -v small sequence.lst > seq.lst cat small/* >> seq.lst mv seq.lst sequence.lst rm -rf small/ for part in `cat sequence.lst`; do ./doJobList.sh $part >> jobList; done ssh pk cd /san/sanVol1/scratch/andy/cgapSage.mm8 para create jobList para try para push # takes like 5-10 min exit # back to hgwdev find output/ -name '*.bed' -exec cat '{}' >> output.bed \; cgapSageDupeRemove output.bed tmp.bed cgapSageDupeRemove -unique tmp.bed final.bed ln -s ~/kent/src/hg/lib/cgapSage/cgapSage.sql hgLoadBed -sqlTable=cgapSage.sql -tab mm8 cgapSage final.bed ############################################################################# # REBUILD miRNA TRACK (DONE - 2007-05-31 - Fan) # updated data from: Michel.Weber@ibcg.biotoul.fr # notify them when done. 
ssh hgwdev cd /cluster/data/mm8/bed mkdir miRNA-2007-05-31 cd miRNA-2007-05-31 # save the mouse_miRNA_track_may2007.txt file from email cat mouse_miRNA_track_may2007.txt|sed -e 's/ /\t/g' > miRNA.tab hgLoadBed mm8 miRNA miRNA.tab # check previous release track before update featureBits mm8 miRNA #33398 bases of 2567283971 (0.001%) in intersection featureBits mm7 miRNA # 20620 bases of 2583394090 (0.001%) in intersection ############################################################################# # LIFTOVER TO MM9 (DONE 7/25/07 angie) ssh kkstore04 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug mm8 mm9 \ -ooc /san/sanvol1/scratch/mm8/11.ooc # Real run: cd /cluster/data/mm8/bed/blat.mm9.2007-07-24 doSameSpeciesLiftOver.pl mm8 mm9 \ -ooc /san/sanvol1/scratch/mm8/11.ooc \ >& do.log & tail -f do.log ############################################################################# # CONTRAST GENES (2007-10-02 markd) # received predictions from Sam Gross cd /cluster/data/mm8/bed/contrastGene/ wget http://www.stanford.edu/~ssgross/contrast.mm8.bed # this is a custom track, not a pure BED tail +2 contrast.mm8.bed | hgLoadBed -tab mm8 contrastGene stdin # verify # load track db (ra and contrastGene.html are global) # request push of contrastGene ########################################################################### # loading affy mouse Exon probes and transcripts (DONE - 2007-10-04 - Hiram) # data was supplied from Venu Valmeekam Venu_Valmeekam@affymetrix.com # dropped via FTP to genome-test ssh hgwdev mkdir /cluster/data/mm8/bed/affyMoEx1 cd /cluster/data/mm8/bed/affyMoEx1 # the files received: # -rw-r--r-- 1 8909954 Oct 3 10:48 transcript_cluster_mm.bed.gz # -rw-r--r-- 1 48178714 Oct 4 13:35 probe_mm_score.bed.gz # loading: hgLoadBed -tmpDir=/scratch/tmp mm8 affyMoEx1Probe probe_mm_score.bed.gz # Loaded 4549897 elements of size 6 hgLoadBed -tmpDir=/scratch/tmp mm8 affyMoEx1Transcript \ transcript_cluster_mm.bed.gz # Loaded 270140
elements of size 12 # working on description pages for these with Venu. # I manually set the scores in the affyMoEx1Transcript track to # 1000 so it would work OK (not color) with the useScore 1 so that # the affyMoEx1Probe would color itself on the score ########################################################################### # LIFT RM ALIGN FILES, MAKE PER-CHROM DOWNLOADS (DONE 12/7/07 angie) # Lifting of .align files is now automated by doRepeatMasker.pl, but we # got a user request for .align files from this pre-automation db. ssh kkstore04 cd /cluster/data/mm8 mkdir downloads/RMalign foreach c (?{,?}) echo linking/lifting to contigs of $c:t foreach ctgdir ($c/chr$c{,_random}_?{,?}) set ctg = $ctgdir:t if (! -f $ctgdir/$ctg.fa.align) then pushd $ctgdir liftRMAlign.pl $ctg.lft > $ctg.fa.align popd endif ln -s $ctg/$ctg.fa.align $c/ end set chr = chr$c:t if (-e $c/lift/ordered.lft && ! -z $c/lift/ordered.lft) then echo lifting contigs to chr$c liftRMAlign.pl $c/lift/ordered.lft \ | gzip -c > downloads/RMalign/$chr.fa.align.gz endif if (-e $c/lift/random.lft && ! -z $c/lift/random.lft) then echo lifting contigs to chr${c}_random liftRMAlign.pl $c/lift/random.lft \ | gzip -c > downloads/RMalign/${chr}_random.fa.align.gz endif end # Got some messages like these for chunks that fall entirely # within gaps (e.g. centromere, huge unbridged...) #FYI Couldn't open chr1_1_00.fa.align: No such file or directory #... #FYI Couldn't open chr1_1_05.fa.align: No such file or directory #FYI Couldn't open chr1_17_02.fa.align: No such file or directory #... 
md5sum downloads/RMalign/*.gz > downloads/RMalign/md5sum.txt ssh hgwdev ln -s /cluster/data/mm8/downloads/RMalign \ /usr/local/apache/htdocs/goldenPath/mm8/ ############################################################################ # Reload CCDS (2007-12-12 markd) # import ccds database as described in ccds.txt set db=mm8 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene # ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap # build initial version of ccdsMgcMap table, updated by nightly genbank update /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap checkTableCoords ${db} -verbose=2 ccdsGene # update all.joiner to include ${db} in ccdsDb joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # request push of ccdsGene ccdsInfo ccdsKgMap ccdsMgcMap # << emacs ############################################################################ # Reload CCDS (2008-02-01 markd) # import ccds database as described in ccds.txt set db=mm8 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene # ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords ${db} -verbose=2 ccdsGene # update all.joiner to include ${db} in ccdsDb joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # request push of ccdsGene ccdsInfo ccdsKgMap # << emacs ############################################################################ # Broad whole-genome ChIP-Seq in stem and progenitor cells # Mikkelsen et al., Nature Aug.
2, 2007 # Requested by David Haussler # 21 data sets, ~4M sequences/dataset # 7 antibodies (histone meth & pol2), # 4 cell sources (ES, NP, MEF, ES+) # alignments/ sequences and mappings for 27bp reads # format: chrom, start, end, strand, read_id, mismatches, sequence # densities/ indication of #reads near the base, 25bp fixed window, -1 if unalignable base # Allele-specific fragment counts # format: chr start allele1 allele2 # # # Enriched intervals by HMM # BED3 # Enriched intervals by fixed-size windows # BED3 # Also, gene expression data # Track organization: # Broad ChIP ES supertrack, with tracks: # - Broad Stem ChIP Seq (read alignments) # - Broad Stem ChIP Sig (density in 25bp windows) # - Broad Stem ChIP Sites (regions from HMM, windowing) # Each track has subtracks for different cell types and antibodies # Also, a track for the expression data: Broad ES # ssh kkstore04 cd /cluster/data/mm8/bed mkdir -p broadStemChip cd broadStemChip/ wget -r ftp://ftp.broad.mit.edu/pub/papers/chipseq/ mv pub/papers/chipseq . rm -fr pub # original data ln -s chipseq lab cd lab ############### # Sites track # HMM Sites -- BED3 mkdir -p hmmSites cd hmmSites tar xvfz ../HMMIntervals.tar.gz ssh hgwdev cd /cluster/data/mm8/bed/broadStemChip cat > hmmSites.csh << 'EOF' foreach f (/hmmSites/HMM_ES_*.txt) set b = $f:t set ab = `echo $b | perl -wpe 's/HMM_ES_(.+).txt/H3$1me3/'` echo $ab tail +2 $f | sed 's/^/chr/' | \ hgLoadBed mm8 broadStemChipHmmSites${ab}Es stdin end 'EOF' # Fix script coloring EOF csh hmmSites.csh >&! hmmSites.log # Loaded 1788 - 19523 elements in 5 tracks # H3K{20,27,36,4,9}me3 mkdir -p WindowSites cd WindowSites tar xvfz ../WindowIntervals.tar.gz cd ..
awk '{print $4}' *K*.txt | sort -n | head -1 # Sites from Window algorithm -- BED3 plus float score # min: 2.75, max: 275.50 # distribution of data values: awk '{print $4}' *K*.txt | sort | textHistogram -binSize=10 maxBinCount=30 -real stdin 0.000000 ************************************************************ 38346 10.000000 ************************** 16385 20.000000 ************** 9186 30.000000 ********* 5705 40.000000 ******* 4607 50.000000 ****** 3686 60.000000 **** 2243 70.000000 ** 1094 80.000000 * 382 90.000000 112 100.000000 31 110.000000 10 120.000000 3 130.000000 2 140.000000 0 150.000000 0 160.000000 0 170.000000 0 180.000000 0 190.000000 1 200.000000 0 210.000000 0 220.000000 2 230.000000 0 240.000000 0 250.000000 0 260.000000 0 270.000000 1 # To range score display from 300 to 1000, use: # (x * 2) + 300 mkdir windowSites cat > windowSites.csh << 'EOF' foreach f (chipseq/windowSites/*.K*.txt) set b = $f:t set ab = `echo $b | perl -wpe 's/\w+.(\w)(\w+).txt/H3\u$1\L$2me3/'` set cell = `echo $b | perl -wpe 's/(\w)(\w+).*/\u$1\L$2/'` tail +2 $f | awk '{printf "%s\t%d\t%d\t \t%d\t%s\n", $1, $2, $3, ($4 * 2) + 300, $4}' > windowSites/$cell.$ab.tab # using kate's version, testing -renameSqlTable option /cluster/home/kate/bin/x86_64/hgLoadBed mm8 -tab -noNameIx -renameSqlTable \ -sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \ broadStemChipWinSites${ab}${cell} windowSites/$cell.$ab.tab end 'EOF' # Fix script coloring EOF csh windowSites.csh >&! windowSites.log ############### # Signal track # indication of #reads near the base, 25bp fixed window, -1 if unalignable base ssh kkstore04 cd /cluster/data/mm8/bed/broadStemChip/lab/densities mkdir -p alignable cd alignable tar xvfz ../alignable.tar.gz cd ../.. 
# Get a list of the datasets mkdir -p signal tar tfz chipseq/densities/chr1.tar.gz | \ perl -wpe 's/chr\w.(\w+.\w+).txt/$1/' > signal/datasets.txt # ignore control (whole-cell extract) grep -v WCE signal/datasets.txt > signal/subtracks.txt wc -l signal/subtracks.txt # 18 # Extract datasets from by-chrom packaging # Weed out missing data which are represented as -1 values # Convert to wiggle cat > makeWig.csh << 'EOF' foreach s (`cat signal/subtracks.txt`) set ab = `echo $s | perl -wpe 's/\w+.(\w)(\w+)/H3\u$1\L$2/'` set cell = `echo $s | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'` set table = broadStemChipSignal${ab}${cell} echo $table rm -f signal/$s.wigVar foreach f (chipseq/densities/chr*.tar.gz) set c = $f:t:r:r (echo "fixedStep chrom=$c start=1 step=25 span=25"; \ tar xfzO $f $c.$s.txt) | \ nice fixStepToBedGraph.pl | \ nice grep -v '\-1$' | \ nice wigBedToStep stdin stdout >> signal/$s.wigVar end nice wigEncode signal/$s.wigVar signal/$s.wig signal/$s.wib end 'EOF' # Fix script coloring EOF # NEWER cat > makeWig.csh << 'EOF' foreach s (`cat signal/subtracks.txt`) set ab = `echo $s | perl -wpe 's/\w+.(\w)(\w+)/\u$1\L$2/'` set cell = `echo $s | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'` set table = broadStemChipSignal${ab}${cell} echo $table rm -f signal/$s.wigVar foreach f (chipseq/densities/chr*.tar.gz) set c = $f:t:r:r echo "variableStep chrom=$c span=25" >> signal/$table.wigVar tar xfzO $f $c.$s.txt | \ awk 'BEGIN {pos = 1} {print pos, $1; pos += 25}' | \ grep -v '\-1$' >> signal/$table.wigVar end cd signal nice wigEncode $table.wigVar $table.wig $table.wib cd .. end 'EOF' # Fix script coloring EOF csh makeWig.csh >&! makeWig.log & # check output and cleanup cd signal gzip *.wigVar csh makeWig.csh >&! makeWig.log & # check output and cleanup cd signal gzip *.wigVar ######## Load wiggles? 
ssh hgwdev mkdir /gbdb/mm8/broadStemChip cd /cluster/data/mm8/bed/broadStemChip cat > loadWig.csh << \_EOF_ #!/bin/csh -fe cd /cluster/data/mm8/bed/broadStemChip/signal foreach f (*.wib) set wi = $f:t:r set wig = $wi.wig echo Start: $wig echo "ln -s `pwd`/$f /gbdb/mm8/broadStemChip/$wi.wib" time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 $wi $wig echo Finished: $wig end _EOF_ chmod +x loadWig.csh time nice -n +19 ./loadWig.csh >> loadWig.log 2>&1 & # Try it by hand. time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 broadStemChipSignalH3Es broadStemChipSignalH3Es.wig # Now Try it again. cat > loadWig.csh << \_EOF_ #!/bin/csh -fe cd /cluster/data/mm8/bed/broadStemChip/signal foreach f (*.wib) set wi = $f:t:r set wig = $wi.wig time hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 $wi $wig echo Finished: $wig end _EOF_ # Try it again. time nice -n +19 ./loadWig.csh >> loadWig.log 2>&1 & # | broadStemChipSignalH3Es | # | broadStemChipSignalK20Es | # | broadStemChipSignalK27Es | # | broadStemChipSignalK27Mef | # | broadStemChipSignalK27Np | # | broadStemChipSignalK36Es | # | broadStemChipSignalK36Eshyb | # | broadStemChipSignalK36Mef | # | broadStemChipSignalK36Np | # | broadStemChipSignalK4Es | # | broadStemChipSignalK4Eshyb | # | broadStemChipSignalK4Mef | # | broadStemChipSignalK4Np | # | broadStemChipSignalK9Es | # | broadStemChipSignalK9Eshyb | # | broadStemChipSignalK9Mef | # | broadStemChipSignalK9Np | # | broadStemChipSignalRpolEs | # Noticed tables badly named, renamed them and corresponding files hgsql mm8 rename table broadStemChipSignalK4Es to broadStemChipSignalH3K4Es ; rename table broadStemChipSignalK4Eshyb to broadStemChipSignalH3K4Eshyb ; rename table broadStemChipSignalK4Mef to broadStemChipSignalH3K4Mef ; rename table broadStemChipSignalK4Np to broadStemChipSignalH3K4Np ; rename table broadStemChipSignalK9Es to broadStemChipSignalH3K9Es ; rename table broadStemChipSignalK9Eshyb to 
broadStemChipSignalH3K9Eshyb ; rename table broadStemChipSignalK9Mef to broadStemChipSignalH3K9Mef ; rename table broadStemChipSignalK9Np to broadStemChipSignalH3K9Np ; rename table broadStemChipSignalK20Es to broadStemChipSignalH4K20Es ; rename table broadStemChipSignalK27Es to broadStemChipSignalH3K27Es ; rename table broadStemChipSignalK27Mef to broadStemChipSignalH3K27Mef ; rename table broadStemChipSignalK27Np to broadStemChipSignalH3K27Np ; rename table broadStemChipSignalK36Es to broadStemChipSignalH3K36Es ; rename table broadStemChipSignalK36Eshyb to broadStemChipSignalH3K36Eshyb; rename table broadStemChipSignalK36Mef to broadStemChipSignalH3K36Mef ; rename table broadStemChipSignalK36Np to broadStemChipSignalH3K36Np ; # | broadStemChipSignalH3K4Es | # | broadStemChipSignalH3K4Eshyb | # | broadStemChipSignalH3K4Mef | # | broadStemChipSignalH3K4Np | # | broadStemChipSignalH3K9Es | # | broadStemChipSignalH3K9Eshyb | # | broadStemChipSignalH3K9Mef | # | broadStemChipSignalH3K9Np | # | broadStemChipSignalH4K20Es | # | broadStemChipSignalH3K27Es | # | broadStemChipSignalH3K27Mef | # | broadStemChipSignalH3K27Np | # | broadStemChipSignalH3K36Es | # | broadStemChipSignalH3K36Eshyb | # | broadStemChipSignalH3K36Mef | # | broadStemChipSignalH3K36Np | # | broadStemChipSignalH3Es | # | broadStemChipSignalRpolEs | ### ### ### Finished Signals 2008-05-08 ######### Alignments ### Sample from ES.H3.txt.gz # chr10 63848447 63848474 - 3084.4.1 0 GAGAGCCAATGGCTAGGCAGGGCATCA ### Convert to #chr10 63848447 63848474 3084.4.1 0 - 63848447 63848474 0,255,0 0 GAGAGCCAATGGCTAGGCAGGGCATCA # convert to bed-9+ color at 9, mismatch at 10 and seq at 11; grabbed some example code from encodeHg17.txt PET ssh hgwdev cd /cluster/data/mm8/bed/broadStemChip/lab/alignments mkdir bed cd bed cat << \_EOF_ > makeBed9PlusFromAlignments.csh #!/usr/bin/perl # replace "reserved" field of BED >=9 fields with RGB value from 8-scale # black->red palette, based on score value. 
use warnings; use strict; while (<>) { next if (/^track/ || /^\s*\#/); chomp; my @words = split("\t"); if (scalar(@words) < 7) { @words = split(/\s+/); die "Expecting at least 7 tab-sep fields but got fewer, line $.\n" if (scalar(@words) < 7); } my @newWordOrder = ("","","","","","","","","","",""); $newWordOrder[0] = $words[0]; # chr $newWordOrder[1] = $words[1]; # beg $newWordOrder[2] = $words[2]; # end $newWordOrder[3] = $words[4]; # name #$newWordOrder[4] = "0"; # score $newWordOrder[4] = 1000 - ($words[5] * 100); # score 0=1000 1=900 2=800 $newWordOrder[5] = $words[3]; # strand $newWordOrder[6] = $words[1]; # beg $newWordOrder[7] = $words[2]; # end $newWordOrder[8] = "0,0,0"; # color to be set later $newWordOrder[9] = $words[5]; # mismatch $newWordOrder[10] = $words[6]; # seq print join("\t", @newWordOrder) . "\n"; } _EOF_ cat << \_EOF_ > makeColoredBedOnStrand.csh #!/usr/bin/perl # replace "reserved" field of BED >=9 fields with RGB value from 8-scale # black->red palette, based on score value. use warnings; use strict; # palette consistes of red, green blue my @blues = ("0,0,255","0,0,204","0,0,170"); my @greens = ("0,255,0","0,187,0","0,136,0"); while (<>) { next if (/^track/ || /^\s*\#/); chomp; my @words = split("\t"); if (scalar(@words) < 9) { @words = split(/\s+/); die "Expecting at least 9 tab-sep fields but got fewer, line $.\n" if (scalar(@words) < 9); } die "More than 9 mismatches found line $.\n" if ($words[9] > 9); my $strand = $words[5]; if ($strand eq '+') { if( $words[9] > 2 ) { $words[8] = $blues[2]; # green } else { $words[8] = $blues[$words[9]]; # green } } else { if( scalar($words[9]) > 2 ) { $words[8] = $greens[2]; # blue } else { $words[8] = $greens[$words[9]]; # blue } } print join("\t", @words) . 
"\n"; } _EOF_ cat << \_EOF_ > convertToBed.csh #!/bin/csh -fe cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed foreach f (../*.txt.gz) set root = `echo $f:t:r:r` zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz echo $root.bed.gz done end _EOF_ chmod +x makeBed9PlusFromAlignments.csh chmod +x makeColoredBedOnStrand.csh chmod +x convertToBed.csh zcat ../ES.H3.txt.gz | head | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh ## How to make bash work ??? #for f in ../*.txt.gz; do # root=${f##*/} # root=${root%.*} # root=${root%.*} # zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz # echo $root.bed.gz done #done ssh kkstore04 cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed time nice -n +19 ./convertToBed.csh > convert.log 2>&1 & # failed because mismatches exceeded 2, so used following to determin max mismatches: 6 in ES.H3 zcat ../ES.H3.txt.gz | head -100 | awk '{print $6}' | sort -n | uniq -c | wc -l # real 55m8.275s # Two were not gzipped! 
# Same conversion as convertToBed.csh, for the ES.*.txt inputs that were
# delivered without gzip compression.
cat << \_EOF_ > convertTxtToBed.csh
#!/bin/csh -fe
cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
foreach f (../ES.*.txt)
    set root = `echo $f:t:r`
    ./makeBed9PlusFromAlignments.csh < $f | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
    echo $root.bed.gz done
end
_EOF_
chmod +x convertTxtToBed.csh
time nice -n +19 ./convertTxtToBed.csh >> convert.log 2>&1 &

# Add comments:
# Prepends one "# ..." line, built from matching lines of ../readme.txt,
# to each bed file and writes the result as new.<root>.bed.gz.
# NOTE(review): the loop below globs only ES.*.bed.gz, yet the renames
# further down expect new.MEF.*, new.NP.* and new.ESHyb.* files as well --
# presumably the script was re-run with other globs; confirm.
cat << \_EOF_ > commentBedFiles.csh
#!/bin/csh -fe
cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
set descr1 = `grep Primary ../readme.txt | tr -d "\r"`
set descr2 = `grep pluripotent ../readme.txt | tr -d "\r"`
foreach f (ES.*.bed.gz)
    set root = `echo $f:t:r:r`
    set comment = `grep $root ../readme.txt | tr -d "\r"`
    echo "# $comment - ${descr1} ${descr2}" > new.${root}.bed
    zcat $f >> new.${root}.bed
    gzip new.${root}.bed
end
_EOF_
# The script is invoked as ./commentBedFiles.csh, so it must be executable.
chmod +x commentBedFiles.csh

ssh kkstore04
cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
time nice -n +19 ./commentBedFiles.csh > comment.log 2>&1 &

# Rename to match other identifiers?
# | broadStemChipHmmSitesH3K20me3Es |
# | broadStemChipHmmSitesH3K27me3Es |
# | broadStemChipHmmSitesH3K36me3Es |
# | broadStemChipHmmSitesH3K4me3Es |
# | broadStemChipHmmSitesH3K9me3Es |
# | broadStemChipWinSitesH3K27me3Es |
# | broadStemChipWinSitesH3K27me3Mef |
# | broadStemChipWinSitesH3K27me3Np |
# | broadStemChipWinSitesH3K4me3Es |
# | broadStemChipWinSitesH3K4me3Mef |
# | broadStemChipWinSitesH3K4me3Np |
# | broadStemChipWinSitesH3K9me3Es |
# | broadStemChipWinSitesH3K9me3Mef |
# | broadStemChipWinSitesH3K9me3Np |

# Peek at words of the new comment line to work out the target names.
zcat new.ES.K9.bed.gz | head -1 | awk '{ print $5 }'
head -1 new.*.bed | awk '{ print $5 }'
for f in new.ES.K*.gz; do zcat $f | head -1 | awk '{ print $2,$5 "Es"}'; done
for f in new.ES.WCE.*.gz; do zcat $f | head -1 | awk '{ print $2,"WceEs"}'; done
for f in new.ES.H3.*.gz; do zcat $f | head -1 | awk '{ print $2,"H3panEs"}'; done
for f in new.ES.R*.gz; do zcat $f | head -1 | awk '{ print $2,"RPolEs"}'; done
for f in new.ESHyb.*.gz; do zcat $f | head -1 | awk '{ print $2,"ES" $6 "EsHyb"}'; done
for f in new.MEF.K*.gz; do zcat $f | head -1 | awk '{ print $2,$4 "Mef"}'; done
for f in new.MEF.WCE.*.gz; do zcat $f | head -1 | awk '{ print $2,"WceMef"}'; done
for f in new.NP.K*.gz; do zcat $f | head -1 | awk '{ print $2,$5 "Np"}'; done
for f in new.NP.WCE.*.gz; do zcat $f | head -1 | awk '{ print $2,"WceNp"}'; done

mv new.ES.K20.bed.gz H4K20Me3Es.bed.gz
mv new.ES.K27.bed.gz H3K27Me3Es.bed.gz
mv new.ES.K36.bed.gz H3K36Me3Es.bed.gz
mv new.ES.K4.bed.gz H3K4Me3Es.bed.gz
mv new.ES.K9.bed.gz H3K9Me3Es.bed.gz
mv new.ES.WCE.bed.gz WceEs.bed.gz
mv new.ES.H3.bed.gz H3panEs.bed.gz
mv new.ES.RPol.bed.gz RPolEs.bed.gz
mv new.ESHyb.K36.bed.gz ESH3K36Me3EsHyb.bed.gz
mv new.ESHyb.K4.bed.gz ESH3K4Me3EsHyb.bed.gz
mv new.ESHyb.K9.bed.gz ESH3K9Me3EsHyb.bed.gz
mv new.MEF.K27.bed.gz H3K27Me3Mef.bed.gz
mv new.MEF.K36.bed.gz H3K36Me3Mef.bed.gz
mv new.MEF.K4.bed.gz H3K4Me3Mef.bed.gz
mv new.MEF.K9.bed.gz H3K9Me3Mef.bed.gz
mv new.MEF.WCE.bed.gz WceMef.bed.gz
mv new.NP.K27.bed.gz H3K27Me3Np.bed.gz
mv new.NP.K36.bed.gz H3K36Me3Np.bed.gz
mv new.NP.K4.bed.gz H3K4Me3Np.bed.gz
mv new.NP.K9.bed.gz H3K9Me3Np.bed.gz
mv new.NP.WCE.bed.gz WceNp.bed.gz

#hgLoadBed mm8 broadStemChipAlign${root} ${f}
time nice -n +19 hgLoadBed mm8 broadStemChipAlignmentsWceEs WceEs.bed.gz &
### Failed! All that work to put a nice comment in the bed file, and hgLoadBed does not handle it!
### Fixed this in hgLoadBed.c

# Table definition handed to hgLoadBed via -sqlTable: the BED columns
# followed by mismatchCount and seq.
cat << \_EOF_ > myBedTbl.sql
CREATE TABLE myBedTbl (
    bin smallint unsigned not null,
    chrom varchar(255) not null,
    chromStart int unsigned not null,
    chromEnd int unsigned not null,
    name varchar(255) not null,
    score int unsigned not null,
    strand char(1) not null,
    thickStart int unsigned not null,
    thickEnd int unsigned not null,
    reserved int unsigned not null,
    mismatchCount int unsigned not null,
    seq varchar(255) not null,
    #Indices
    INDEX(name(16)),
    INDEX(chrom(5),bin)
)
_EOF_

# Loads every *.bed.gz in the directory into broadStemChipAlignments<root>.
cat << \_EOF_ > loadBedFiles.csh
#!/bin/csh -fe
cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
foreach f (*.bed.gz)
    set root = `echo $f:t:r:r`
    ~/bin/x86_64/hgLoadBed -sqlTable=myBedTbl.sql -renameSqlTable mm8 broadStemChipAlignments${root} ${f}
    echo broadStemChipAlignments${root} ${f} done
end
_EOF_
chmod +x loadBedFiles.csh
time nice -n +19 ./loadBedFiles.csh &
# real 62m46.504s

# Noticed 3 tables badly named, renamed them and corresponding files
hgsql mm8 -e 'rename table broadStemChipAlignmentsESH3K36Me3EsHyb to broadStemChipAlignmentsH3K36Me3EsHyb'
hgsql mm8 -e 'rename table broadStemChipAlignmentsESH3K4Me3EsHyb to broadStemChipAlignmentsH3K4Me3EsHyb'
hgsql mm8 -e 'rename table broadStemChipAlignmentsESH3K9Me3EsHyb to broadStemChipAlignmentsH3K9Me3EsHyb'

# edited trackDb.broadStem.ra
#   broadStemChipAlignmentsH3K4Me3Es
#   broadStemChipAlignmentsH3K4Me3Mef
#   broadStemChipAlignmentsH3K4Me3Np
#   broadStemChipAlignmentsH3K9Me3Es
#   broadStemChipAlignmentsH3K9Me3Mef
#   broadStemChipAlignmentsH3K9Me3Np
#   broadStemChipAlignmentsH4K20Me3Es
#   broadStemChipAlignmentsH3K27Me3Es
#   broadStemChipAlignmentsH3K27Me3Mef
#   broadStemChipAlignmentsH3K27Me3Np
#   broadStemChipAlignmentsH3K36Me3Es
#   broadStemChipAlignmentsH3K36Me3Mef
#   broadStemChipAlignmentsH3K36Me3Np
#   broadStemChipAlignmentsH3K9Me3EsHyb
#   broadStemChipAlignmentsH3K36Me3EsHyb
#   broadStemChipAlignmentsH3K4Me3EsHyb
#   broadStemChipAlignmentsWceEs
#   broadStemChipAlignmentsWceMef
#   broadStemChipAlignmentsWceNp
#   broadStemChipAlignmentsRPolEs
#   broadStemChipAlignmentsH3panEs
### ### ###
Finished Alignments 2008-04-29 ### ### ### Edited mouse/mmm8/trackDb.broadStem.ra to include new broadChromatinChIPSeq ### ### ### track with 53 subtracts covering sites (HMM, Windowing), siganl & alignments ### ### ### for ES, MAF, NP, ES_hybrid cell lines ### ### ### and H3K4me3 H3K9me3 H4K20me3 H3K27me3 H3K36me3 antibodies ### ### ### and WCE, RPOL-II and pan-H3 controls ############################################################################ # Adding more tracks from Broad (Meissner2008) # (Start 2008-7-14 Tim Done: 2008-07-18) ssh kkstore04 cd /cluster/data/mm8/bed/broadStemChip/chipseq mkdir -p Meissner2008 cd Meissner2008/ wget -r ftp://ftp.broad.mit.edu/pub/papers/chipseq/Meissner2008/ mv pub/papers/chipseq/Meissner2008 . rm -fr pub # original data ln -s chipseq lab cd lab ############### # Sites track mkdir windowSites/Meissner2008 cd windowSites/Meissner2008 tar xvfz ../../Meissner2008/WindowIntervals.tar.gz awk '{print $4}' *.sites | sort -n | head -1 # Sites from Window algorithm -- BED3 plus float score # min: 2.50, max: 275.50 # distribution of data values: awk '{print $4}' *.sites | sort | textHistogram -binSize=10 maxBinCount=30 -real stdin # 0.000000 ************************************************************ 155307 # 10.000000 **************** 42020 # 20.000000 ****** 14576 # 30.000000 **** 10408 # 40.000000 ** 5717 # 50.000000 * 2299 # 60.000000 718 # 70.000000 232 # 80.000000 60 # 90.000000 15 # 100.000000 3 # 110.000000 6 # 120.000000 1 # 130.000000 1 # 140.000000 1 mv Brain.H3K27me3.sites ../Brain.K27me3.sites mv Brain.H3K4me2.sites ../Brain.K4me2.sites mv Brain.H3K4me3.sites ../Brain.K4me3.sites mv ES.H3K4me1.sites ../ES.K4me1.sites mv ES.H3K4me2.sites ../ES.K4me2.sites mv NP.H3K4me1.sites ../NP.K4me1.sites mv NP.H3K4me2.sites ../NP.K4me2.sites mv readme.txt ../readme.Meissner2008.txt cd .. 
rmdir Meissner2008/ # Continue to distinguish by .sites # Brain.K27me3.sites ES.K27.txt ES.K4me2.sites MEF.K4.txt NP.K4.txt NP.K9.txt # Brain.K4me2.sites ES.K4.txt ES.K9.txt MEF.K9.txt NP.K4me1.sites readme.Meissner2008.txt # Brain.K4me3.sites ES.K4me1.sites MEF.K27.txt NP.K27.txt NP.K4me2.sites readme.txt # To range score display from 300 to 1000, use THE SAME CONVERSION AS for the whole group: # (x * 2) + 300 cd /cluster/data/mm8/bed/broadStemChip mkdir windowSites cat > windowSites.Meissner2008.csh << \_EOF_ foreach f (chipseq/windowSites/*.sites) set b = $f:t set ab = `echo $b | perl -wpe 's/\w+.(\w)(\w+).sites/H3$1\L$2/'` set cell = `echo $b | perl -wpe 's/(\w)(\w+).*/\u$1\L$2/'` echo $cell $ab $b tail +2 $f | awk '{printf "%s\t%d\t%d\t \t%d\t%s\n", $1, $2, $3, ($4 * 2) + 300, $4}' > windowSites/$cell.$ab.tab # using kate's version, testing -renameSqlTable option /cluster/home/kate/bin/x86_64/hgLoadBed mm8 -tab -noNameIx -renameSqlTable \ -sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \ broadStemChipWinSites${ab}${cell} windowSites/$cell.$ab.tab end _EOF_ # Fix script coloring EOF chmod +x windowSites.Meissner2008.csh csh windowSites.Meissner2008.csh > windowSites.Meissner2008.log 2>&1 ############### # Signal track # indication of #reads near the base, 25bp fixed window, -1 if unalignable base ssh kkstore04 cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/densities mkdir -p alignable cd alignable foreach f (../*.tar.gz) tar xvfz $f end cd ../.. 
# Build the dataset list from the member names of the chr1 tarball:
# the perl substitution keeps the two middle name parts of each
# chr<N>.<part>.<part>.txt entry.
#mkdir -p signal
tar tfz chipseq/Meissner2008/densities/chr1.tar.gz | \
    perl -wpe 's/chr\w.(\w+.\w+).txt/$1/' > signal/datasetsMeissner2008.txt
# Controls (whole-cell extract, WCE) do not become subtracks.
grep -v WCE signal/datasetsMeissner2008.txt > signal/subtracksMeissner2008.txt
wc -l signal/subtracksMeissner2008.txt
# 7

# Per dataset: pull that dataset's file out of every per-chromosome
# tarball, pair each value with a position that starts at 1 and advances
# by 25, drop the lines ending in -1 (missing data), append everything to
# one variableStep file named after the table, then wigEncode it.
cat > makeWigMeissner2008.csh << \_EOF_
foreach dataset (`cat signal/subtracksMeissner2008.txt`)
    set ab = `echo ${dataset} | perl -wpe 's/\w+.(\w)(\w+)/\u$1\u$2/'`
    set cell = `echo ${dataset} | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
    set table = broadStemChipSignal${ab}${cell}
    echo ${table} ${dataset}
    rm -f signal/${table}.wigVar
    foreach tarball (chipseq/Meissner2008/densities/chr*.tar.gz)
        set chrom = $tarball:t:r:r
        echo "variableStep chrom=${chrom} span=25" >> signal/${table}.wigVar
        tar xfzO ${tarball} ${chrom}.${dataset}.txt | \
            awk 'BEGIN {pos = 1} {print pos, $1; pos += 25}' | \
            grep -v '\-1$' >> signal/${table}.wigVar
    end
    cd signal
    nice wigEncode ${table}.wigVar ${table}.wig ${table}.wib
    cd ..
end
_EOF_
# Fix script coloring EOF
chmod +x makeWigMeissner2008.csh
csh makeWigMeissner2008.csh > makeWigMeissner2008.log 2>&1 &
# check output and cleanup
cd signal
gzip *.wigVar

######## Load wiggles?
ssh hgwdev #mkdir /gbdb/mm8/broadStemChip cd /cluster/data/mm8/bed/broadStemChip cat > loadWigMeissner2008.csh << \_EOF_ #!/bin/csh -fe cd /cluster/data/mm8/bed/broadStemChip/signal foreach f (*H3K*me*.wib) set wi = $f:t:r set wig = $wi.wig echo Start: $wig echo "ln -s `pwd`/$f /gbdb/mm8/broadStemChip/$wi.wib" hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 $wi $wig echo Finished: $wig end _EOF_ chmod +x loadWigMeissner2008.csh ./loadWigMeissner2008.csh time nice -n +19 ./loadWigMeissner2008.csh >> loadWigMeissner2008.log 2>&1 & # Noticed tables badly named, renamed them and corresponding files # hgsql mm8 # rename table broadStemChipSignalH3Es to broadStemChipSignalH3panEs # rename table broadStemChipSignalH3K27Es to broadStemChipSignalH3K27me3Es # rename table broadStemChipSignalH3K27Mef to broadStemChipSignalH3K27me3Mef # rename table broadStemChipSignalH3K27Np to broadStemChipSignalH3K27me3Np # rename table broadStemChipSignalH3K36Es to broadStemChipSignalH3K36me3Es # rename table broadStemChipSignalH3K36EsHyb to broadStemChipSignalH3K36Esme3Hyb # rename table broadStemChipSignalH3K36Mef to broadStemChipSignalH3K36me3Mef # rename table broadStemChipSignalH3K36Np to broadStemChipSignalH3K36me3Np # rename table broadStemChipSignalH3K4Es to broadStemChipSignalH3K4me3Es # rename table broadStemChipSignalH3K4EsHyb to broadStemChipSignalH3K4Esme3Hyb # rename table broadStemChipSignalH3K4Mef to broadStemChipSignalH3K4me3Mef # rename table broadStemChipSignalH3K4Np to broadStemChipSignalH3K4me3Np # rename table broadStemChipSignalH3K9Es to broadStemChipSignalH3K9me3Es # rename table broadStemChipSignalH3K9EsHyb to broadStemChipSignalH3K9Esme3Hyb # rename table broadStemChipSignalH3K9Mef to broadStemChipSignalH3K9me3Mef # rename table broadStemChipSignalH3K9Np to broadStemChipSignalH3K9me3Np # rename table broadStemChipSignalH4K20Es to broadStemChipSignalH4K20me3Es ######### Alignments ### Sample from Brain.H3K27me3.aligned.gz #chr10 63848447 63848474 - 3084.4.1 0 
GAGAGCCAATGGCTAGGCAGGGCATCA ### Convert to #chr10 63848447 63848474 3084.4.1 0 - 63848447 63848474 0,255,0 0 GAGAGCCAATGGCTAGGCAGGGCATCA # convert to bed-9+ color at 9, mismatch at 10 and seq at 11; grabbed some example code from encodeHg17.txt PET ssh hgwdev cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments mkdir bed cd bed cp lab/alignments/bed/make* lab/Meissner2008/alignments/bed # cat << \_EOF_ > makeBed9PlusFromAlignments.csh # #!/usr/bin/perl # # replace "reserved" field of BED >=9 fields with RGB value from 8-scale # # black->red palette, based on score value. # # use warnings; # use strict; # # while (<>) { # next if (/^track/ || /^\s*\#/); # chomp; # my @words = split("\t"); # if (scalar(@words) < 7) { # @words = split(/\s+/); # die "Expecting at least 7 tab-sep fields but got fewer, line $.\n" # if (scalar(@words) < 7); # } # my @newWordOrder = ("","","","","","","","","","",""); # $newWordOrder[0] = $words[0]; # chr # $newWordOrder[1] = $words[1]; # beg # $newWordOrder[2] = $words[2]; # end # $newWordOrder[3] = $words[4]; # name # #$newWordOrder[4] = "0"; # score # $newWordOrder[4] = 1000 - ($words[5] * 100); # score 0=1000 1=900 2=800 # $newWordOrder[5] = $words[3]; # strand # $newWordOrder[6] = $words[1]; # beg # $newWordOrder[7] = $words[2]; # end # $newWordOrder[8] = "0,0,0"; # color to be set later # $newWordOrder[9] = $words[5]; # mismatch # $newWordOrder[10] = $words[6]; # seq # # print join("\t", @newWordOrder) . "\n"; # } # _EOF_ # # cat << \_EOF_ > makeColoredBedOnStrand.csh # #!/usr/bin/perl # # replace "reserved" field of BED >=9 fields with RGB value from 8-scale # # black->red palette, based on score value. 
# # use warnings; # use strict; # # # palette consistes of red, green blue # my @blues = ("0,0,255","0,0,204","0,0,170"); # my @greens = ("0,255,0","0,187,0","0,136,0"); # # while (<>) { # next if (/^track/ || /^\s*\#/); # chomp; # my @words = split("\t"); # if (scalar(@words) < 9) { # @words = split(/\s+/); # die "Expecting at least 9 tab-sep fields but got fewer, line $.\n" # if (scalar(@words) < 9); # } # die "More than 9 mismatches found line $.\n" # if ($words[9] > 9); # my $strand = $words[5]; # if ($strand eq '+') { # if( $words[9] > 2 ) { # $words[8] = $blues[2]; # green # } else { # $words[8] = $blues[$words[9]]; # green # } # } else { # if( scalar($words[9]) > 2 ) { # $words[8] = $greens[2]; # blue # } else { # $words[8] = $greens[$words[9]]; # blue # } # } # print join("\t", @words) . "\n"; # } # _EOF_ # chmod +x makeBed9PlusFromAlignments.csh # chmod +x makeColoredBedOnStrand.csh cat << \_EOF_ > convertToBed.csh #!/bin/csh -fe cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments/bed foreach f (../*.aligned.gz) set root = `echo $f:t:r:r` zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz echo $root.bed.gz done end _EOF_ chmod +x convertToBed.csh zcat ../Brain.H3K27me3.aligned.gz | head | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh # chr14 12537326 12537362 205CY.7.1 1000 - 12537326 125373620,255,0 0 GGGATATGGACTGAAATAATTAGGAAAGAAATAACT ## How to make bash work ??? 
#for f in ../*.txt.gz; do # root=${f##*/} # root=${root%.*} # root=${root%.*} # zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz # echo $root.bed.gz done #done ssh kkstore04 cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments/bed time nice -n +19 ./convertToBed.csh > convert.log 2>&1 & # real 25m22.762s # Brain.H3K27me3.bed.gz done # Brain.H3K4me2.bed.gz done # Brain.H3K4me3.bed.gz done # ES.H3K4me1.bed.gz done # ES.H3K4me2.bed.gz done # NP.H3K4me1.bed.gz done # NP.H3K4me2.bed.gz done # zcat Brain.H3K27me3.bed.gz | head -2 # chr14 12537326 12537362 205CY.7.1 1000 - 1253732612537362 0,255,0 0 GGGATATGGACTGAAATAATTAGGAAAGAAATAACT # chr2 70236933 70236969 205CY.7.2 900 + 7023693370236969 0,0,204 1 GAATCCTTGAACATATTTATAATCATTCTTTTTAAT # Compared to: zcat ../../../alignments/bed/ES.K20.bed.gz | head -2 # chr8 77978889 77978916 3080.2.1 1000 + 7797888977978916 0,0,255 0 GAAGGAAATCAGTCTTTGTTGAGCAGT # chr12 38598403 38598430 3080.2.2 1000 + 3859840338598430 0,0,255 0 GATATTTCATTCCTTGGAGAAGGGTAA cp ../../../alignments/bed/myBedTbl.sql . 
# cat << \_EOF_ > myBedTbl.sql # CREATE TABLE myBedTbl ( # bin smallint unsigned not null, # chrom varchar(255) not null, # chromStart int unsigned not null, # chromEnd int unsigned not null, # name varchar(255) not null, # score int unsigned not null, # strand char(1) not null, # thickStart int unsigned not null, # thickEnd int unsigned not null, # reserved int unsigned not null, # mismatchCount int unsigned not null, # seq varchar(255) not null, # #Indices # INDEX(name(16)), # INDEX(chrom(5),bin) # ) # _EOF_ cat << \_EOF_ > loadBedFiles.csh #!/bin/csh -fe cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments/bed foreach f (*.bed.gz) set root = `echo $f:t:r:r` set ab = `echo $root | perl -wpe 's/\w+.(\w)(\w+)/\u$1\u$2/'` set cell = `echo $root | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'` set table = broadStemChipAlignments${ab}${cell} ~/bin/x86_64/hgLoadBed -sqlTable=myBedTbl.sql -renameSqlTable mm8 ${table} ${f} echo ${table} ${f} done end _EOF_ chmod +x loadBedFiles.csh time nice -n +19 ./loadBedFiles.csh > load.log 2>&1 & real 28m9.939s # broadStemChipAlignmentsH3K27me3Brain # broadStemChipAlignmentsH3K4me1Es # broadStemChipAlignmentsH3K4me1Np # broadStemChipAlignmentsH3K4me2Brain # broadStemChipAlignmentsH3K4me2Es # broadStemChipAlignmentsH3K4me2Np # broadStemChipAlignmentsH3K4me3Brain # # broadStemChipWinSitesH3K27me3Brain # broadStemChipWinSitesH3K4me1Es # broadStemChipWinSitesH3K4me1Np # broadStemChipWinSitesH3K4me2Brain # broadStemChipWinSitesH3K4me2Es # broadStemChipWinSitesH3K4me2Np # broadStemChipWinSitesH3K4me3Brain # # broadStemChipSignalH3K27me3Brain # broadStemChipSignalH3K4me1Es # broadStemChipSignalH3K4me1Np # broadStemChipSignalH3K4me2Brain # broadStemChipSignalH3K4me2Es # broadStemChipSignalH3K4me2Np # broadStemChipSignalH3K4me3Brain # edited trackDb.broadStem.ra ############################################################################ # mm8 - Mouse - Ensembl Genes (DONE - 2008-03-06 - hiram) ssh kkstore04 cd /cluster/data/mm8 cat << 
'_EOF_' > mm8.ensGene.ra # required db variable db mm8 # optional liftRandoms yes/no or absent liftRandoms yes # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to make sure. nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" # optionally update the knownToEnsembl table after ensGene updated knownToEnsembl yes '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=46 mm8.ensGene.ra ssh hgwdev cd /cluster/data/mm8/bed/ensGene.46 featureBits mm8 ensGene # 56654064 bases of 2567283971 (2.207%) in intersection ############################################################################ # Reload CCDS from CCDS.20080502 dump (2008-05-03 markd) # import ccds database as described in ccds.txt set db=mm8 set ncbiBld=36.1 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ${ncbiBld} ccdsInfo ccdsGene # ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords ${db} -verbose=2 ccdsGene # update all.jointer to include ${db} in ccdsDb joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # request push of ccdsGene ccdsInfo ccdsKgMap # << emacs ############################################################################ # AGILENT CGH PROBES (Done 2008-05-13, Andy) # (see hg18.txt) ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. 
############################################################################ ############################################################################# # MOUSE TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08) # (to build the affyExonTissues track, see the steps outlined in hg18.txt) ############################################################################# ######################################################################## ## AFFY ALL EXON PROBESETS (MM8) (DONE 2009-01-29, Andy) ssh hgwdev mkdir /hive/data/genomes/mm8/bed/affyAllExonProbes cd /hive/data/genomes/mm8/bed/affyAllExonProbes ln -s MoEx-1_0-st-v1.r2.dt1.mm8.csv mm8.csv wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na20/exon/MoEx-1_0-st-v1.r2.dt1.mm8.zip sed '1,12d' mm8.csv | tr ',' '\t' | cut -f 1,5-8,12 \ | sed 's/\"//g' | grep -v "\-\-\-" \ | awk 'BEGIN{FS="\t";OFS="\t";}{if ($6 == "core") score = 1000; else if ($6 == "extended") score = 700; else if ($6 == "full") score = 300; else score = 100; name = $1"|"$6; print $2, $4-1, $5, name, score, $3}' \ | bedSort stdin mm8.bed hgLoadBed mm8 affyAllExonProbes mm8.bed rm MoEx-1_0-st-v1.r2.dt1.mm8.{cor,ext,full,zip}* bed.tab affycookies.txt mm8.csv gzip MoEx-1_0-st-v1.r2.dt1.mm8.csv mm8.bed ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: mm8.upstreamGeneTbl = refGene mm8.upstreamMaf = multiz17way /hive/data/genomes/mm8/bed/multiz17way/species.lst ############################################################################# # MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08) ssh hgwdev mkdir /cluster/data/mm8/bed/mrnaPcr cd /cluster/data/mm8/bed/mrnaPcr hgsql mm8 -NBe 'select * from knownGene' > knownGene.gp genePredToBed knownGene.gp > ucscGenes.bed hgsql mm8 -NBe 'select kgId,geneSymbol from kgXref' \ | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \ > idSub.txt subColumn 4 ucscGenes.bed idSub.txt 
ucscGenesIdSubbed.bed sequenceForBed -keepName -db=mm8 -bedIn=ucscGenesIdSubbed.bed \ -fastaOut=stdout \ | faToTwoBit -ignoreDups stdin kgTargetSeq.2bit cut -f 1-10 knownGene.gp \ | genePredToFakePsl mm8 stdin kgTargetAli.psl /dev/null # Load up the UCSC Genes target PSL table and put 2bit in /gbdb:: cd /cluster/data/mm8/bed/mrnaPcr hgLoadPsl mm8 kgTargetAli.psl mkdir /gbdb/mm8/targetDb ln -s /cluster/data/mm8/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/mm8/targetDb/ # Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on # /gbdb/mm8/targetDb/kgTargetSeq.2bit . ssh hgwdev # Add records to hgcentraltest blatServers and targetDb: hgsql hgcentraltest -e \ 'INSERT into blatServers values ("mm8Kg", "blat13", 17803, 0, 1);' hgsql hgcentraltest -e \ 'INSERT into targetDb values("mm8Kg", "UCSC Genes", \ "mm8", "kgTargetAli", "", "", \ "/gbdb/mm8/targetDb/kgTargetSeq.2bit", 1, now(), "");' ############################################################################# ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. 
############################################################################
# UPDATE KEGG TABLES (DONE, Fan, 6/18/10)

mkdir -p /hive/data/genomes/mm8/bed/pathways/kegg
cd /hive/data/genomes/mm8/bed/pathways/kegg

wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab
# Prefix the first column with "mmu": insert a "mmu" column after it,
# reorder so that "mmu" comes first, then remove the tab that separates
# "mmu" from the original first column.
sed -e 's/\t/\tmmu\t/' map_title.tab > j.tmp
cut -f 2 j.tmp >j.mmu
cut -f 1,3 j.tmp >j.1
paste j.mmu j.1 |sed -e 's/\t//' > keggMapDesc.tab
rm j.mmu j.1
rm j.tmp

# "if exists" keeps the reload working when the table is not there yet.
hgsql mm8 -e 'drop table if exists keggMapDesc'
hgsql mm8 < ~/kent/src/hg/lib/keggMapDesc.sql
hgsql mm8 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'

wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/mmu/mmu_pathway.list

# Strip the first "path:" and turn the first remaining ":" into a tab.
sed -e 's/path://' -e 's/:/\t/' mmu_pathway.list > j.tmp
hgsql mm8 -e 'drop table if exists keggPathway'
hgsql mm8 < ~/kent/src/hg/lib/keggPathway.sql
hgsql mm8 -e 'load data local infile "j.tmp" into table keggPathway'

# Join the freshly loaded rows against knownToLocusLink, then replace the
# table contents with the joined rows.
hgsql mm8 -N -e \
    'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \
    >keggPathway.tab
hgsql mm8 -e 'delete from keggPathway'
hgsql mm8 -e 'load data local infile "keggPathway.tab" into table keggPathway'
rm j.tmp

############################################################################
# Add KEGG column to mm8 Gene Sorter (Done, Fan, 6/18/2010)

mkdir -p /hive/data/genomes/mm8/bed/geneSorter
cd /hive/data/genomes/mm8/bed/geneSorter

# The literal "+" column is fused with its neighbours by the sed, giving
# kgId, mapID and "mapID+locusID" on each output row.
hgsql mm8 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' \
    |sort -u|sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab

hgsql mm8 -e 'drop table if exists knownToKeggEntrez'
hgsql mm8 < ~/kent/src/hg/lib/knownToKeggEntrez.sql
hgsql mm8 -e 'load data local infile "knownToKeggEntrez.tab" into table knownToKeggEntrez'

#############################################################################
# LIFTOVER TO MM9 (RE-DONE 2010-07-24 galt)

mkdir -p /hive/data/genomes/mm8/bed/blat.mm9.2010-07-24
cd /hive/data/genomes/mm8/bed/blat.mm9.2010-07-24

# -debug run to create run dir, preview scripts...
# verifies files can be found doSameSpeciesLiftOver.pl -debug mm8 mm9 # Real run: screen nice doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ mm8 mm9 >& do.log A question from jdidion on the mailing list reported that the liftover in the mm9->mm8 direction was working better than in the other direction. We found that fastMap cannot be used with queries of size greater than 5000. This information was known to Kate back in 2004, and was forgotten by Oct 2006 when doSameSpeciesLiftOver.pl was being used with query sizes of 10k chunks. I contacted Jim, and he told me to make blat errAbort if the query size exceeds this limit when fastMap is used. This will prevent problems in the future. I tested that this actually happens, so we know the check is working. CONFIRMED: New blat 34x9 does errAbort with chunk size 10k and -fastMap. Then I re-ran it with the smaller allowed chunk size of 5k, which should take care of the problem. We will probably also need to re-run other same-species liftovers. CONFIRMED: the lifting problem reported by the user jdidion is now fixed. On hgwdev, liftOver from mm8 chr2:22766881-22766905 to mm9 chr2:22770754-22770778 now works perfectly. ############################################################################# # LIFTOVER TO MM9 (DONE 2010-08-05 galt) # using the new doSameSpeciesLiftOver.pl # which uses chunks of max size 5k (to prevent blat -fastMap issues) # and an overlap of 500bp, which we did not have before. # Although we will use the new process in future, # we are not changing old ones. This particular liftover is an # exception since it does fix a tiny problem found by user jdidion. mkdir /hive/data/genomes/mm8/bed/blat.mm9.2010-08-05 cd /hive/data/genomes/mm8/bed/blat.mm9.2010-08-05 # -debug run to create run dir, preview scripts...
# verifies files can be found doSameSpeciesLiftOver.pl -debug mm8 mm9 # Real run: screen nice doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ mm8 mm9 >& do.log ############################################################################# # construct liftOver to mm10 (DONE - 2012-05-01 - Hiram) screen -S mm8 # manage this longish running job in a screen mkdir /hive/data/genomes/mm8/bed/blat.mm10.2012-05-01 cd /hive/data/genomes/mm8/bed/blat.mm10.2012-05-01 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/mm8/11.ooc \ -debug -dbHost=hgwdev -workhorse=hgwdev mm8 mm10 # if that is OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/mm8/11.ooc \ -dbHost=hgwdev -workhorse=hgwdev mm8 mm10 > do.log 2>&1 # real 86m46.700s # verify this file exists: ls -og -L /gbdb/mm8/liftOver/mm8ToMm10.over.chain.gz # -rw-rw-r-- 1 279647 May 1 10:41 /gbdb/mm8/liftOver/mm8ToMm10.over.chain.gz # and try out the conversion on genome-test from mm8 to mm10 ############################################################################