# for emacs: -*- mode: sh; -*-

# This file describes browser build for the mouse
# genome, March 2005, ncbi mouse_34 - Mm6
#
# "$Id: mm6.txt,v 1.8 2008/01/14 23:06:15 rhead Exp $"
#

# DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2005-03-09 - Hiram)
#
#   Examine disk space issues, summarize mm5 usage:
    ssh kksilo
    cd /cluster/store6
    du -hsc mm5
    #   476G    mm5
    #   That is a lot of disk space for an assembly, I wonder if it needs
    #   some cleaning.  Our last vestige of large amounts of space is on
    #   store10, thus:
    ssh kksilo
    mkdir /cluster/store10/mm6
    ln -s /cluster/store10/mm6 /cluster/data/mm6
    mkdir /cluster/data/mm6/ncbi
    cd /cluster/data/mm6/ncbi
    #   set the login name and password in a .wgetrc file in this
    #   directory, permissions 600, its format:
    #   login = name
    #   passwd = xxxx
    WGETRC=`pwd`/.wgetrc
    export WGETRC
    wget --timestamping --force-directories --directory-prefix=. \
        --dont-remove-listing --recursive --level=4 --no-parent \
        --no-host-directories --cut-dirs=1 \
        ftp://ftp-private.ncbi.nih.gov/mouse_34
    #   Downloaded: 1,586,892,564 bytes in 57 files
    #
    #   NOTE: chrY - radical change from mm5 - most of it was moved into
    #   chrY_random

# Notes from Deanna Church:
##############################################################################
# There are some general stats on chromosome lengths and N50s (for this build
# and compared to Build 33) as well as a comparison to the non-sequence based
# maps.
# Before there is too much alarm I want to point out that there is a fair
# amount of N50 variation from chromosome to chromosome, as well as a few
# chromosomes that have significantly different N50 than in Build 33.  The most
# severe case is Mmu2 (36 Mb in Build 34 vs 90 Mb in Build 33).  Remember Mmu2
# was not automatically assembled and this drop is due to gaps being
# purposefully introduced.  A couple of auto-assembled chromosomes also saw
# drops in N50 (Mmu5 down ~13Mb and Mmu13 down about 6 Mb)- however, this is
# also likely due to clones having been introduced to the TPF.  In these cases,
# many of the clones have no sequence- and there is likely no WGS to hold
# things together either.  So, I don't think this by itself is cause for
# concern- the N50 for these two chromosomes is still >19 Mb, so they are very
# contiguous.
# On the plus side, Mmu14 has an N50 increase of 41 Mb, Mmu12 has an increase
# of 24 Mb and Mmu8 has an increase of 13 Mb.
# The genome average N50 increased from 23.2 Mb to 26.2 Mb.
##############################################################################
# No chrM or chrMT was delivered.
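    #   (optional sanity check -- a sketch, not part of the original procedure)
    #   Compare what landed on disk against the totals wget reported above
    #   (57 files, 1,586,892,564 bytes).  The -printf option assumes GNU find,
    #   and the file count may come out slightly higher than 57 because of the
    #   .wgetrc and any .listing files kept by --dont-remove-listing.
    cd /cluster/data/mm6/ncbi
    find . -type f | wc -l
    #   expect approximately 57
    find . -type f -printf '%s\n' | awk '{sum += $1} END {print sum}'
    #   expect approximately 1586892564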
# Pick up NC_005089.1 fasta file for chrM sequence from NCBI and place
# in this ncbi download directory as chrM in both chrfasta and contigfasta
# with a fixed up header line to be like all the others:
# >lcl|chrM.fa gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion, complete genome

# Fixup the agp and contig.idmap files to add chrM
    cd /cluster/data/mm6
    zcat ncbi/allrefcontig.chr.agp.gz > allrefcontig.chr.agp
    echo -e "chrM\t1\t16299\t1\tF\tAY172335.1\t1\t16299\t+" >> \
        allrefcontig.chr.agp
    gzip allrefcontig.chr.agp
    zcat ncbi/allcontig.agp.gz > allcontig.agp
    echo -e "NC_005089\t1\t16299\t1\tF\tAY172335\t\t1\t16299\t+" >> \
        allcontig.agp
    gzip allcontig.agp
    zcat ncbi/seq_contig.md.gz | sed -e "6991i\
10090\tM\t0\t0\t+\tstart\t-1\tCONTIG\tC57BL/6J\t10\n\
10090\tM\t1\t16299\t+\tNC_005089\tGI:34538597\tCONTIG\tC57BL/6J\tna\n\
10090\tM\t16299\t16299\t+\tend\t-2\tCONTIG\tC57BL/6J\t10" > seq_contig.md
    #   The line number 6991 was found by checking the contents of
    #   ncbi/seq_contig.md.gz and it was the line starting with:
    #   10090^IUn|NT_039766^I1^I4412^
    #   Wanted this chrM information before that line.  I tried to get
    #   this to work with a match and insert, but for some unknown reason
    #   it would not function:
    #   sed -e "#NT_039769#i\
    #   .... this is supposed to work, I don't know why it does not
    #   And even more curiously, this command cut and paste did NOT work
    #   on hgwdev in my login.  Mysteries of environment.  Only worked
    #   on kksilo.
    gzip seq_contig.md

    #   summarize sequence counts
    mkdir faCounts
    time faCount ncbi/chrfasta/chr*.fa.gz > faCounts/chrfasta.faCount 2>&1 &
    time faCount ncbi/contigfasta/chr*.fa.gz > \
        faCounts/contigfasta.faCount 2>&1 &
    #   about 3 minutes each for the above two faCounts
    time zcat ncbi/chrfasta/chr*.fa.gz | grep "^>" > \
        faCounts/chrfasta.headers 2>&1 &
    time zcat ncbi/contigfasta/chr*.fa.gz | grep "^>" > \
        faCounts/contigfasta.headers 2>&1 &
    #   about 2 minutes each for the above two zcat/greps

#############################################################################
# BREAK UP SEQUENCE INTO 5 MB CHUNKS at NON-BRIDGED CONTIGS
#   (DONE - 2005-03-09 - Hiram)
    ssh kksilo
    cd /cluster/data/mm6
    for F in ncbi/chrfasta/chr*.fa.gz
    do
        CHR=`basename ${F} | sed -e "s/.fa.gz//; s/chr//"`
        echo ${CHR} ${F}
        mkdir -p "${CHR}"
        zcat allrefcontig.chr.agp.gz | \
            perl -we "while(<>){if (/^chr${CHR}\t/) {print;}}" > \
            ${CHR}/chr${CHR}.agp
        zcat ncbi/chrfasta/chr${CHR}.fa.gz | \
            perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' | \
            splitFaIntoContigs ${CHR}/chr${CHR}.agp \
                stdin /cluster/data/mm6 -nSize=5000000
    done
    #   The above loop takes about 5 minutes

#############################################################################
# CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2005-03-09 - Hiram)
    ssh kksilo
    mkdir /cluster/data/mm6/jkStuff
    cd /cluster/data/mm6
    mkdir Un tmp
    cp -p /cluster/data/mm5/jkStuff/ncbiFixAgp ./jkStuff
    zcat allrefcontig.chr.agp.gz | ./jkStuff/ncbiFixAgp /dev/stdin | gzip > \
        allrefcontig.chr.ordered.agp.gz
    #   Set the appropriate release number here, this one is 34
    sed -e "s/buildNum = 33/buildNum = 34/" \
        /cluster/data/mm5/jkStuff/ncbiToRandomAgps > \
        jkStuff/ncbiToRandomAgps
    chmod +x jkStuff/ncbiToRandomAgps
    gunzip seq_contig.md.gz allrefcontig.chr.ordered.agp.gz
    ./jkStuff/ncbiToRandomAgps seq_contig.md allrefcontig.chr.ordered.agp \
        ncbi/contig.idmap .
    #   The chrUn_random.agp created by this is too large with the 5000 gaps;
it will work with 1000 gaps, so fixup the chrUn_random # agp: ./jkStuff/ncbiToRandomAgps -gapLen 1000 -chrom Un \ seq_contig.md allrefcontig.chr.ordered.agp ncbi/contig.idmap . for C in ? ?? do if [ -s ${C}/chr${C}_random.ctg.agp ]; then echo "building ${C}/chr${C}_random.fa" rm -f ./tmp.fa zcat ncbi/contigfasta/chr${C}.fa.gz | \ perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' > ./tmp.fa $HOME/bin/i386/agpToFa -verbose=2 -simpleMulti \ ${C}/chr${C}_random.ctg.agp chr${C}_random \ ${C}/chr${C}_random.fa ./tmp.fa rm -f ./tmp.fa fi done > tmp/agpToFa.out 2>&1 # the above loop takes about 6 minutes, examine the tmp/agpToFa.out # record for any errors # Clean these up to avoid confusion later... they're easily rebuilt # with the ncbiToRandomAgps script above rm ?/*_random.ctg.agp ??/*_random.ctg.agp gzip seq_contig.md allrefcontig.chr.ordered.agp ############################################################################# # BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS # (DONE 2005-03-09 - Hiram) ssh kksilo cd /cluster/data/mm6 for C in ? ?? do if [ -s ${C}/chr${C}_random.fa ]; then splitFaIntoContigs -nSize=5000000 ${C}/chr${C}_random.agp \ ${C}/chr${C}_random.fa . mkdir -p ${C}/lift rm -f ${C}/lift/rOut.lst ${C}/lift/random.lft ${C}/lift/random.lst mv ${C}_random/lift/oOut.lst ${C}/lift/rOut.lst mv ${C}_random/lift/ordered.lft ${C}/lift/random.lft mv ${C}_random/lift/ordered.lst ${C}/lift/random.lst rmdir ${C}_random/lift rm ${C}_random/chr${C}_random.agp ${C}_random/chr${C}_random.fa rm -rf ${C}/chr${C}_random_* mv ${C}_random/chr${C}_random_* ${C} rmdir ${C}_random fi done > tmp/split.out 2>&1 # the above loop takes less than a minute # scan the tmp/split.out file for possible errors ############################################################################# # MAKE LIFTALL.LFT (DONE - 2005-03-10 - Hiram) ssh kksilo cd /cluster/data/mm6 cat ?/lift/*.lft ??/lift/*.lft > jkStuff/liftAll.lft ############################################################################# # CREATING DATABASE (DONE - 2005-03-10 - Hiram) ssh kksilo cd /cluster/data/mm6 faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \ mm6.2bit twoBitInfo mm6.2bit stdout | sort -rn +1 > chrom.sizes grep -v random chrom.sizes | cut -f1 | sed -e "s/chr//" > chrom.lst twoBitInfo mm6.2bit stdout | awk '{printf "%s\t%s\t/gbdb/mm6/mm6.2bit\n", $1,$2}' > chromInfo.tab ssh hgwdev cd /cluster/data/mm6 hgsql -e "create database mm6;" mysql # Make sure we have enough room (eventually ~ 70Gb) for mysql tables: df -h | grep mysql # /dev/sda1 472G 227G 222G 51% /var/lib/mysql2 # /dev/sdc1 1.8T 728G 933G 44% /var/lib/mysql # CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2005-03-10 - Hiram) # Use any of the newest databases to ensure that the organization # of the grp table is up to date ssh hgwdev hgsql mm6 -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" hgsql mm6 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql mm6 -e 'load data local infile "chromInfo.tab" into table chromInfo;' # Enter mm6 into dbDb and defaultDb so test browser knows about it: hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ VALUES("mm6", "March 2005", "/gbdb/mm6", "Mouse", \ "chr6:28912411-28925620", 1, 24, "Mouse", \ "Mus musculus", "/gbdb/mm6/html/description.html", 0, 0, \ "NCBI Build 34");' -h localhost hgcentraltest # do this defaultDb entry later after there is something to see # on this browser. 
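    #   (optional check -- a sketch, not part of the original procedure)
    #   confirm the dbDb row just inserted is visible on hgcentraltest before
    #   moving on; the columns selected here are the ones used in the INSERT above.
    hgsql -h localhost hgcentraltest \
        -e 'select name, description, organism, sourceName from dbDb where name="mm6";'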
hgsql -e 'INSERT INTO defaultDb (name, genome) VALUES("mm6", "Mouse")' \ -h localhost hgcentraltest # start a new entry in the trackDb hierarchy cd $HOME/kent/src/hg/makeDb/trackDb/mouse mkdir mm6 cvs add mm6 cd mm6 cp ../mm5/description.html . vi description.html - fixup text for this assembly cvs add description.html cvs commit cd ../.. vi trackDb.ra - add mm6 to the list mkdir /cluster/data/mm6/html mkdir /gbdb/mm6 ln -s /cluster/data/mm6/html /gbdb/mm6/html ln -s /cluster/data/mm6/mm6.2bit /gbdb/mm6/mm6.2bit cp -p mouse/mm6/description.html /gbdb/mm6/html make DBS=mm6 ZOO_DBS="" ############################################################################# # GOLD GAP tracks (DONE - 2005-03-10 - Hiram) ssh hgwdev cd /cluster/data/mm6 # make sure these tmp contig agp files are gone, easily generated # as above with jkStuff/ncbiToRandomAgps rm -f */chr*.ctg.agp mkdir ffa zcat ncbi/sequence.inf.gz > ffa/sequence.inf hgGoldGapGl -chromLst=chrom.lst mm6 /cluster/data/mm6 . featureBits mm6 gold # 2597150411 bases of 2597150411 (100.000%) in intersection featureBits mm5 gold # 2615483787 bases of 2615483787 (100.000%) in intersection featureBits mm4 gold # 2627444668 bases of 2627444668 (100.000%) in intersection featureBits mm6 gap # 482483041 bases of 2597150411 (18.577%) in intersection featureBits mm5 gap # 549468286 bases of 2615483787 (21.008%) in intersection featureBits mm4 gap # 325167539 bases of 2627444668 (12.376%) in intersection ############################################################################# # DISTRIBUTE SEQUENCE TO INTERMEDIATE SERVERS FOR KLUSTER RUNS ssh kksilo mkdir /cluster/bluearc/mm6 cd /cluster/data/mm6 mkdir /cluster/bluearc/mm6/chrom cp -p */*.fa /cluster/bluearc/mm6/chrom # (these were removed later) # break up into 500,000 sized chunks for repeat masker runs TOP=`pwd` export TOP for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \ ??/chr??_random_[0-9]* do ctg=`basename ${CTG_DIR}` cd ${CTG_DIR} faSplit size ${ctg}.fa 500000 ${ctg}_ -lift=${ctg}.lft -maxN=500000 cd ${TOP} done > tmp/ctg_split.out 2>&1 # about 3 minutes, check the tmp/ctg_split.out for anything unusual # make a list of the contigs TOP=`pwd` export TOP for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \ ??/chr??_random_[0-9]* do ctg=`basename ${CTG_DIR}` cd ${CTG_DIR} ls ${ctg}_* | while read F do echo ${CTG_DIR}/${F} done cd ${TOP} done > contig500K.lst # count 'em wc contig500K.lst # 6678 6678 176765 contig500K.lst mkdir -p /panasas/store/mm6/contigs rsync -a --progress --files-from=contig500K.lst . \ /panasas/store/mm6/contigs/ ssh kkr1u00 mkdir /iscratch/i/mm6 cd /iscratch/i/mm6 cp -p /cluster/bluearc/mm6/chrom/* . /cluster/bin/iSync # verify the contig copy above functioned OK find /panasas/store/mm6/contigs -type f | wc # 6678 6678 443885 ############################################################################# # SIMPLE REPEAT TRACK (DONE - 2005-03-10 Hiram) # TRF can be run in parallel with RepeatMasker on the file server # since it doesn't require masked input sequence. # Run this on the rack 9 cluster ssh kk9 mkdir /cluster/data/mm6/bed/simpleRepeat cd /cluster/data/mm6/bed/simpleRepeat mkdir trf cat << '_EOF_' > runTrf #!/bin/csh -fe # set path1 = $1 set inputFN = $1:t set outpath = $2 set outputFN = $2:t mkdir -p /tmp/$outputFN cp $path1 /tmp/$outputFN pushd . 
cd /tmp/$outputFN /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp popd rm -f $outpath cp -p /tmp/$outputFN/$outputFN $outpath rm -fr /tmp/$outputFN/* rmdir --ignore-fail-on-non-empty /tmp/$outputFN '_EOF_' # << keep emacs coloring happy chmod +x runTrf cat << '_EOF_' > gsub #LOOP ./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed} #ENDLOOP '_EOF_' # << keep emacs coloring happy ls -1S /iscratch/i/mm6/chrom/chr*.fa > genome.lst gensub2 genome.lst single gsub jobList para create jobList # be gentle on the start up of these things since each starting # job is a copy of the .fa file, a 'para try' starts 10 jobs # there are only 40 total jobs para try sleep 30 para check para try sleep 30 para check para try sleep 30 para check para try para check ... all 40 are running at this point, some are already done para time Completed: 40 of 40 jobs CPU time in finished jobs: 20946s 349.11m 5.82h 0.24d 0.001 y IO & Wait Time: 5543s 92.38m 1.54h 0.06d 0.000 y Average job time: 662s 11.04m 0.18h 0.01d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 1934s 32.23m 0.54h 0.02d Submission to last job: 1934s 32.23m 0.54h 0.02d # Load into the database ssh hgwdev cd /cluster/data/mm6/bed/simpleRepeat cat trf/chr*.bed > simpleRepeat.bed hgLoadBed mm6 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # Loaded 1152810 elements of size 16 featureBits mm6 simpleRepeat # 83220723 bases of 2597150411 (3.204%) in intersection featureBits mm5 simpleRepeat # 81414259 bases of 2615483787 (3.113%) in intersection featureBits mm4 simpleRepeat # 82600648 bases of 2627444668 (3.144%) in intersection featureBits mm3 simpleRepeat # 75457193 bases of 2505900260 (3.011%) in intersection ############################################################################# # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2005-03-14 - DONE) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kksilo cd /cluster/data/mm6/bed/simpleRepeat mkdir trfMask for F in trf/chr*.bed do echo "${F} -> ${F/trf\//}" awk '{if ($5 <= 12) print;}' ${F} > trfMask/${F/trf\//} done ############################################################################# # REPEATMASKER RUN (after contigs have been distributed to panasas FS) # (DONE - 2005-03-10 - 2005-03-14 - Hiram) # RM Version: RepBase Update 9.11, RM database version 20050112 # /cluster/bluearc/RepeatMasker050112 ssh kk #- Make the run directory and job list: cd /cluster/data/mm6 cat << '_EOF_' > jkStuff/RMMouse #!/bin/csh -fe cd /cluster/data/mm6/$1 pushd . 
/bin/mkdir -p /tmp/mm6/$2 /bin/cp /panasas/store/mm6/contigs/$1/$2 /tmp/mm6/$2 cd /tmp/mm6/$2 /cluster/bluearc/RepeatMasker050112/RepeatMasker -ali -s -species mus $2 popd /bin/cp /tmp/mm6/$2/$2.out ./ if (-e /tmp/mm6/$2/$2.align) /bin/cp /tmp/mm6/$2/$2.align ./ if (-e /tmp/mm6/$2/$2.tbl) /bin/cp /tmp/mm6/$2/$2.tbl ./ if (-e /tmp/mm6/$2/$2.cat) /bin/cp /tmp/mm6/$2/$2.cat ./ /bin/rm -fr /tmp/mm6/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/mm6/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/mm6 '_EOF_' chmod +x jkStuff/RMMouse mkdir -p RMRun rm -f RMRun/RMJobs cat contig500K.lst | while read C do D=`dirname ${C}` F=`basename ${C}` echo /cluster/data/mm6/jkStuff/RMMouse ${D} ${F} \ '{'check out line+ /cluster/data/mm6/${D}/${F}.out'}' done >> RMRun/RMJobs #- Do the run ssh kk cd /cluster/data/mm6/RMRun para create RMJobs para try, para check, para check, para push, para check,... # Completed: 6678 of 6678 jobs # CPU time in finished jobs: 45303442s 755057.37m 12584.29h 524.35d 1.437 y # IO & Wait Time: 100211s 1670.18m 27.84h 1.16d 0.003 y # Average job time: 6799s 113.32m 1.89h 0.08d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 10760s 179.33m 2.99h 0.12d # Submission to last job: 121602s 2026.70m 33.78h 1.41d # had cluster contention with other jobs #- Lift up the split-contig .out's to contig-level .out's ssh kksilo cd /cluster/data/mm6 for D in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \ ??/chr??_random_[0-9]* do CONTIG=`basename ${D}` liftUp ${D}/${CONTIG}.fa.out ${D}/${CONTIG}.lft warn \ ${D}/${CONTIG}_[0-9]*.fa.out done > tmp/RM.lift.outs 2>&1 cat << '_EOF_' > jkStuff/liftRM_out_to_chr.sh #!/bin/sh for C in ? ?? do echo "lifting ${C}" cd ${C} if [ -s lift/ordered.lft ]; then liftUp chr${C}.fa.out lift/ordered.lft warn `cat lift/oOut.lst` else echo "WARNING: Can not find ${C}/lift/ordered.lft" fi if [ -s lift/random.lft ]; then liftUp chr${C}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` fi cd .. done '_EOF_' # << keep emacs coloring happy chmod +x jkStuff/liftRM_out_to_chr.sh ./jkStuff/liftRM_out_to_chr.sh > tmp/liftRM_out_to_chr.out 2>&1 # scan the results tmp/liftRM_out_to_chr.out # there is a single: WARNING: Can not find Un/lift/ordered.lft # which is OK # List the final .out files, nothing should be size 0: ls -og */*.fa.out #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/mm6 hgLoadOut mm6 ?/chr?.fa.out ??/chr??.fa.out ?/chr?_random.fa.out \ ??/chr??_random.fa.out # about 7 minutes, there are always a few of these errors: Strange perc. field -0.1 line 179923 of 1/chr1.fa.out Strange perc. field -0.1 line 190937 of 1/chr1.fa.out Strange perc. field -0.1 line 83366 of 5/chr5.fa.out Strange perc. field -4.5 line 57734 of 7/chr7.fa.out Strange perc. field -3.1 line 110634 of 10/chr10.fa.out Strange perc. field -9.2 line 110634 of 10/chr10.fa.out Strange perc. field -0.7 line 44931 of 14/chr14.fa.out Strange perc. 
    field -0.1 line 952 of 9/chr9_random.fa.out
    Loading up table chrUn_random_rmsk
    note: 394 records dropped due to repStart > repEnd
        run with -verbose=2 for details

    #   verify everything seems normal compared with previous builds
    featureBits mm6 rmsk
    #   1110222842 bases of 2597150411 (42.748%) in intersection
    featureBits mm5 rmsk
    #   1137310280 bases of 2615483787 (43.484%) in intersection
    featureBits mm4 rmsk
    #   1130883581 bases of 2627444668 (43.041%) in intersection
    featureBits mm3 rmsk
    #   1080265553 bases of 2505900260 (43.109%) in intersection
    featureBits -countGaps mm6 rmsk
    #   1110222842 bases of 3079633452 (36.050%) in intersection
    featureBits -countGaps mm5 rmsk
    #   1137310280 bases of 3164952073 (35.935%) in intersection
    featureBits -countGaps mm4 rmsk
    #   1130883581 bases of 2952612207 (38.301%) in intersection
    featureBits -countGaps mm3 rmsk
    #   1080265553 bases of 2708220133 (39.888%) in intersection

#############################################################################
# GC5BASE (DONE - 2005-03-10 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/mm6/bed/gc5Base
    cd /cluster/data/mm6/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 mm6 \
        /cluster/data/mm6 | wigEncode stdin gc5Base.wig gc5Base.wib
    #   Calculating gcPercent with window size 5
    #   Using twoBit: /cluster/data/mm6/mm6.2bit
    #   File stdout created
    #   Converted stdin, upper limit 100.00, lower limit 0.00
    #   runs for about 50 minutes
    mkdir /gbdb/mm6/wib
    ln -s `pwd`/gc5Base.wib /gbdb/mm6/wib
    hgLoadWiggle -pathPrefix=/gbdb/mm6/wib mm6 gc5Base gc5Base.wig
    #   verify index is correct:
    hgsql mm6 -e "show index from gc5Base;"
    #   should see good numbers in the Cardinality column; if it shows NULL, run:
    hgsql mm6 -e "analyze table gc5Base;"

#############################################################################
# PROCESS REPEAT MASKER AND SIMPLE REPEATS INTO MASKED SEQUENCE
#   (DONE - 2005-03-14 - Hiram)
    ssh kksilo
    cd /cluster/data/mm6
    for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
        FA=${CHR#*\/}
        C=${FA%.fa}
        echo -n "repeat masking ${C} ... "
        /cluster/bin/i386/maskOutFa -soft ${CHR} ${CHR}.out ${CHR}
        echo -n "adding simpleRepeats ... "
        /cluster/bin/i386/maskOutFa -softAdd ${CHR} \
            bed/simpleRepeat/trfMask/${C}.bed ${CHR}
        echo "done - ${CHR}"
    done > tmp/addRM_and_Simple.out 2>&1
    #   you will note the usual warnings about troublesome coordinates
    #   in the repeat masker outputs - even more than when they were lifted.
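    #   (optional spot check -- a sketch, not part of the original procedure)
    #   faSize reports upper- and lower-case base counts, so the "lower" fraction
    #   of a soft-masked chromosome should roughly agree with the rmsk plus
    #   simpleRepeat coverage measured above (the exact percentage varies by
    #   chromosome).
    faSize 1/chr1.fa
    #   the "lower" count should be a large fraction of the "real" (non-N) count
    #   for chr1; a near-zero "lower" count would mean the soft-masking did not take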
# and make the hard masked sequences from these soft masked sequences for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa do echo "maskOutFa ${CHR} hard ${CHR}.masked" /cluster/bin/i386/maskOutFa ${CHR} hard ${CHR}.masked done > /tmp/hardMask.out 2>&1 # rebuild the nib file faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \ mm6Soft.2bit # verify the sequence is still the same size as before: twoBitInfo mm6.2bit stdout | sort -rn +1 | sum -r # 62443 1 sum -r chrom.sizes # 62443 1 # replace the former unmasked 2bit file with this new one: rm mm6.2bit mv mm6Soft.2bit mm6.2bit # check the browser, make sure it is functioning OK # Copy to panasas unit for cluster runs cp -p mm6.2bit /panasas/store/mm6/mm6.2bit mkdir /panasas/store/mm6/fasta time cp -p */*.fa */*.fa.masked /panasas/store/mm6/fasta ############################################################################# # PREPARE "bigZips" files for public release # (DONE through mrna.fa - 2005-03-15 - Hiram) ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/mm6/bigZips cd /usr/local/apache/htdocs/goldenPath/mm6/bigZips cp -p /usr/local/apache/htdocs/goldenPath/mm5/bigZips/README.txt . # edit README.txt to indicate proper version of sequence and # RepeatMasker cd /cluster/data/mm6 tar cvzf /usr/local/apache/htdocs/goldenPath/mm6/bigZips/chromAgp.tar.gz \ ?/chr*.agp ??/chr*.agp tar cvzf /usr/local/apache/htdocs/goldenPath/mm6/bigZips/chromFa.tar.gz \ ?/chr*.fa ??/chr*.fa tar cvzf /usr/local/apache/htdocs/goldenPath/mm6/bigZips/chromFaMasked.tar.gz \ ?/chr*.fa.masked ??/chr*.fa.masked tar cvzf /usr/local/apache/htdocs/goldenPath/mm6/bigZips/chromOut.tar.gz \ ?/chr*.fa.out ??/chr*.fa.out cd /cluster/data/mm6/bed/simpleRepeat tar cvzf /usr/local/apache/htdocs/goldenPath/mm6/bigZips/chromTrf.tar.gz \ ./trfMask # get GenBank native mRNAs cd /cluster/data/genbank ./bin/i386/gbGetSeqs -db=mm6 -native GenBank mrna \ /usr/local/apache/htdocs/goldenPath/mm6/bigZips/mrna.fa cd /usr/local/apache/htdocs/goldenPath/mm6/bigZips gzip mrna.fa # add upstreams file (Heather, Sept. 2005) cd /usr/local/apache/htdocs/goldenPath/mm6/bigZips nice featureBits mm6 refGene:upstream:1000 -fa=upstream1000.fa nice gzip upstream1000.fa nice featureBits mm6 refGene:upstream:2000 -fa=upstream2000.fa nice gzip upstream2000.fa nice featureBits mm6 refGene:upstream:5000 -fa=upstream5000.fa nice gzip upstream5000.fa md5sum *.gz > md5sum.txt ############################################################################# # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2005-03-14 - Hiram) ssh kksilo mkdir /panasas/store/mm6/rmsk cd /cluster/data/mm6 cp -p */chr*.fa.out /panasas/store/mm6/rmsk mkdir /panasas/store/mm6/rmsk.spec cd /panasas/store/mm6/rmsk.spec ln -s ../rmsk/*.out . 
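    #   (clarifying note, not in the original doc)  In the two loops below, the
    #   numeric argument given to extractRepeats appears to follow the order of
    #   the -comp species handed to DateRepeats (human, rat, dog, cow), which is
    #   why field 1 goes to linSpecRep.notInHuman, 2 to notInRat, 3 to notInDog
    #   and 4 to notInCow.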
for FN in chr*.fa.out do echo ${FN} /cluster/bluearc/RepeatMasker050112/DateRepeats \ ${FN} -query mouse -comp human -comp rat -comp dog -comp cow done # takes about 30 minutes cd /panasas/store/mm6 mkdir linSpecRep.notInHuman mkdir linSpecRep.notInRat mkdir linSpecRep.notInDog mkdir linSpecRep.notInCow for F in rmsk.spec/chr*.out_homo-sapiens* do B=${F/rmsk.spec\/} B=${B/.fa.out*/} echo $B /cluster/bin/scripts/extractRepeats 1 ${F} > \ linSpecRep.notInHuman/${B}.out.spec /cluster/bin/scripts/extractRepeats 2 ${F} > \ linSpecRep.notInRat/${B}.out.spec /cluster/bin/scripts/extractRepeats 3 ${F} > \ linSpecRep.notInDog/${B}.out.spec /cluster/bin/scripts/extractRepeats 4 ${F} > \ linSpecRep.notInCow/${B}.out.spec done # the notInHuman, notInDog, and notInCow ended up being identical ############################################################################# # NIBS for BLASTZ (DONE - 2005-03-15 - Hiram) # turns out not all the details are worked out with the blastz # script to allow it to use a 2bit file for target. So, we will # need nib files until then. Eventually this requirement should # be eliminated. ssh kksilo cd /cluster/data/mm6 mkdir nib for C in ?/chr?*.fa ??/chr??*.fa do B=${C/*\/} B=${B/.fa/} echo faToNib -softMask ${C} nib/${B}.nib rm -f nib/${B}.nib faToNib -softMask ${C} nib/${B}.nib done mkdir /panasas/store/mm6/nib cp -p nib/* /panasas/store/mm6/nib ######################################################################### # CONTIG SPLIT UP - (DONE - 2005-03-24 - Hiram) # A first attempt was made to allow the genbank alignments to work # from just the 2bit file, but that leads to some large job # situations and things are not split up as best as they could be. # A survey was taken of the gaps and longest stretch of unbroken # sequence (see /cluster/data/mm6/gapAnalysis/ ) # and there are only 30 stretches of sequence longer than 5 Mb # Going to try an faSplit in a 10 Mb basis (this used to be 5 # Mb in the past, split on contigs) and see how that goes. ############################################################################# # BREAK UP SEQUENCE INTO 10 MB CHUNKS AT GAPS OF AT LEAST 100 ssh kksilo cd /cluster/data/mm6 mkdir ctgs10Mb mkdir ctgs10Mb/lift for C in ? ?? do mkdir ctgs10Mb/${C} if [ -s ${C}/chr${C}.fa ]; then echo -n "working: chr${C} ... " $HOME/bin/i386/faSplit -minGapSize=100 \ -lift=ctgs10Mb/lift/chr${C}.lft gap \ ${C}/chr${C}.fa 10000000 ctgs10Mb/${C}/chr${C}_ fi if [ -s ${C}/chr${C}_random.fa ]; then echo -n "working: chr${C}_random ... 
" $HOME/bin/i386/faSplit -minGapSize=100 \ -lift=ctgs10Mb/lift/chr${C}_random.lft gap \ ${C}/chr${C}_random.fa 10000000 \ ctgs10Mb/${C}/chr${C}_random_ fi done cat ctgs10Mb/lift/*.lft > jkStuff/liftAll.lft cp -p jkStuff/liftAll.lft /panasas/store/mm6 ######################################################################### # GENBANK auto update started (DONE - 2005-03-15 - 2005-03-29 - Hiram) ssh eieio cd /cluster/data/genbank # edit etc/genbank.conf, add the following section: # mm6 mm6.genome = /panasas/store/mm6/mm6.2bit mm6.lift = /panasas/store/mm6/liftAll.lft mm6.downloadDir = mm6 mm6.genbank.est.xeno.load = yes mm6.mgcTables.default = full mm6.mgcTables.mgc = all # Do the refseq's first, they are the quick ones nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial mm6 # var/build/logs/2005.03.25-12:18:01.mm6.initalign.log # real 109m23.547s # user 4m8.057s # sys 1m18.459s # Completed: 5190 of 5190 jobs # CPU time in finished jobs: 44385s 739.76m 12.33h 0.51d 0.001 y # IO & Wait Time: 20723s 345.38m 5.76h 0.24d 0.001 y # Average job time: 13s 0.21m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 278s 4.63m 0.08h 0.00d # Submission to last job: 6017s 100.28m 1.67h 0.07d # Load the results from the above ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad mm6 # var/dbload/hgwdev/logs/2005.03.25-15:31:22.dbload.log # real 2m28.355s # user 0m19.830s # sys 0m10.180s # check the RefSeq Genes track shows up in the browser # table browser query on RefSeq Genes whole genome, summary stats # indicates 18,397 items covering 713,077,002 (%27.46) bases # To get the genbank started, the above results need to be # moved out of the way. These things can be removed if there are # no problems to debug ssh eieio cd /cluster/data/genbank/work mv initial.mm6 initial.mm6.refseq.mrna # or: rm -fr initial.mm6 cd /cluster/data/genbank nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial mm6 # logFile: var/build/logs/2005.03.25-17:13:13.mm6.initalign.log # RUNNING - 2005-03-25 17:30 # There was one incredibly long job that occupied most of the time # real 741m1.285s # user 88m31.751s # sys 25m6.943s # Completed: 27680 of 27680 jobs # CPU time in finished jobs: 5495665s 91594.42m 1526.57h 63.61d 0.174 y # IO & Wait Time: 114623s 1910.38m 31.84h 1.33d 0.004 y # Average job time: 203s 3.38m 0.06h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 13544s 225.73m 3.76h 0.16d # Submission to last job: 37663s 627.72m 10.46h 0.44d # Load the results from the above ssh hgwdev cd /cluster/data/genbank # There is a lock file present, I believe from the previous load: [hiram@hgwdev /cluster/data/genbank/var/dbload/hgwdev/run] ls -l # -rw-r--r-- 1 hiram protein 18 Mar 15 10:08 dbload.lock # checking that it is actually owned by yourself, it is safe to remove # it. The next load will not proceed with this lock in place. rm var/dbload/hgwdev/run/dbload.lock time nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad mm6 # var/dbload/hgwdev/logs/2005.03.26-08:47:42.dbload.log # 22 minute load time # And finally, the big est run - expect several days on this one ssh eieio cd /cluster/data/genbank/work mv initial.mm6 initial.mm6.genbank.mrna # since it is going to run several days, create a screen for it to # run in. 
detach and reattach as necessary to view the progress # of the job cd /cluster/data/genbank screen nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial mm6 # var/build/logs/2005.03.26-09:00:22.mm6.initalign.log # STARTED 2005-03-26 09:13 # FINISHED 2005-03-28 11:53 # about 50 hours run time # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh eieio screen -d -r # Completed: 159852 of 159852 jobs # CPU time in finished jobs: 110196174s 1836602.90m 30610.05h 1275.42d 3.494 y # IO & Wait Time: 1230416s 20506.93m 341.78h 14.24d 0.039 y # Average job time: 697s 11.62m 0.19h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 41516s 691.93m 11.53h 0.48d # Submission to last job: 137193s 2286.55m 38.11h 1.59d ssh hgwdev cd /cluster/data/genbank time nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad mm6 # many tables are loaded by this load: # refFlat, refGene, refLink, refSeqAli, refSeqStatus, refSeqSummary # mgcGenes mgcFullMrna mgcFullStatus intronEst all_est chr*est # estOrientInfo xenoEst # var/dbload/hgwdev/logs/2005.03.28-13:38:13.dbload.log # LOADING - 2005-03-28 - 13:35 # FINISHED - 2005-03-29 - 00:43 # real 665m1.596s # user 105m19.790s # sys 33m10.390s # Note, that is an 11 hour load time. # Measurements: (these are interesting in the table browser too) # RefSeq Genes featureBits mm6 refGene # 41752877 bases of 2597150411 (1.608%) in intersection # MGC Genes featureBits mm6 refFlat # 41752877 bases of 2597150411 (1.608%) in intersection featureBits mm6 refSeqAli # 41738603 bases of 2597150411 (1.607%) in intersection # additionally created tables by the genbank process that can not # be measured with featureBits # refLink, refSeqStatus, refSeqSummary featureBits mm6 mgcGenes # 27174785 bases of 2597150411 (1.046%) in intersection # Mouse mRNAs featureBits mm6 all_mrna # 112068807 bases of 2597150411 (4.315%) in intersection # Spliced ESTs featureBits mm6 intronEst # 52812872 bases of 2597150411 (2.033%) in intersection # Mouse ESTs featureBits mm6 est # 236687034 bases of 2597150411 (9.113%) in intersection # Non-Mouse mRNAs featureBits mm6 xenoMrna # 52119099 bases of 2597150411 (2.007%) in intersection # Non-Mouse ESTs featureBits mm6 xenoEst ######################################################################### # PRODUCING GENSCAN PREDICTIONS (DONE 2005-03-14 - 2005-03-17 Hiram) ssh hgwdev mkdir /cluster/data/mm6/bed/genscan cd /cluster/data/mm6/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/mm6/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) ls -1S /panasas/store/mm6/fasta/*.masked > genome.list # XXX There is an error in the following template, note the extra # space between the - and par= # It turns out the default for the -par argument is this same # matrix so the extra space had no effect on the end result. # Create template file, gsub, for gensub2. 
For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << keep emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try, check, push, check, ... # Had two jobs crash: # Completed: 38 of 40 jobs # Crashed: 2 jobs # CPU time in finished jobs: 343416s 5723.60m 95.39h 3.97d 0.011 y # IO & Wait Time: 3443s 57.38m 0.96h 0.04d 0.000 y # Average job time: 9128s 152.13m 2.54h 0.11d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 27423s 457.05m 7.62h 0.32d # Submission to last job: 34524s 575.40m 9.59h 0.40d # If there are crashes, diagnose with "para problems". para problems > problems.0 # Two of them needed to be rerun, adjust window down to 2000000 to # get them to complete. Lower that number if the error persists. ssh kolossus cd /cluster/data/mm6/bed/genscan # XXX There is an error in the following commands, note the extra # space between the - and par= # It turns out the default for the -par argument is this same # matrix so the extra space had no effect on the end result. /cluster/bin/x86_64/gsBig /panasas/store/mm6/fasta/chr2.fa.masked gtf/chr2.fa.gtf -trans=pep/chr2.fa.pep -subopt=subopt/chr2.fa.bed -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000 /cluster/bin/x86_64/gsBig /panasas/store/mm6/fasta/chr14.fa.masked gtf/chr14.fa.gtf -trans=pep/chr14.fa.pep -subopt=subopt/chr14.fa.bed -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000 # cat the results into single files ssh kksilo cd /cluster/data/mm6/bed/genscan cat gtf/chr?.fa.gtf gtf/chr??.fa.gtf gtf/chr?_random.fa.gtf \ gtf/chr??_random.fa.gtf > genscan.gtf cat subopt/chr?.fa.bed subopt/chr??.fa.bed subopt/chr?_random.fa.bed \ subopt/chr??_random.fa.bed > genscanSubopt.bed cat pep/chr?.fa.pep pep/chr??.fa.pep pep/chr?_random.fa.pep \ pep/chr??_random.fa.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/mm6/bed/genscan ldHgGene mm6 -gtf genscan genscan.gtf hgPepPred mm6 generic genscanPep genscan.pep hgLoadBed mm6 genscanSubopt genscanSubopt.bed # check the numbers featureBits mm6 genscan # 54894283 bases of 2597150411 (2.114%) in intersection featureBits mm5 genscan # 55024722 bases of 2615483787 (2.104%) in intersection featureBits mm4 genscan # 56164126 bases of 2627444668 (2.138%) in intersection featureBits mm3 genscan # 51697165 bases of 2505900260 (2.063%) in intersection featureBits mm6 genscanSubopt # 57856316 bases of 2597150411 (2.228%) in intersection featureBits mm5 genscanSubopt # 58474899 bases of 2615483787 (2.236%) in intersection featureBits mm4 genscanSubopt # 59601009 bases of 2627444668 (2.268%) in intersection featureBits mm3 genscanSubopt # 56085184 bases of 2505900260 (2.238%) in intersection ######################################################################### # BLASTZ NOTE: with the advent of Angie's script to run the # blastz process through to chains and nets loaded into the # database and download files prepared, it is now a juggling act # to see which klusters are available. The particular options to # the script to make it go to one kluster or another are to be # determined at run-time. 
The typical run-times listed here will # be a factor in your choice of kluster to operate on. ######################################################################### # BLASTZ HUMAN Hg17 (DONE - 2005-03-14 - 2005-03-18 - Hiram) ssh kk mkdir /cluster/data/mm6/bed/blastzHg17.2005_03_14 cd /cluster/data/mm6/bed/blastzHg17.2005_03_14 cat << '_EOF_' > DEF # mouse vs. human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse SEQ1_DIR=/panasas/store/mm6/nib # not used SEQ1_RMSK=/panasas/store/mm6/rmsk # not used SEQ1_FLAG=-rodent SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInHuman SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Human SEQ2_DIR=/scratch/hg/hg17/bothMaskedNibs # RMSK not currently used SEQ2_RMSK= # FLAG not currently used SEQ2_FLAG= SEQ2_SMSK=/scratch/hg/hg17/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzHg17.2005_03_14 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp /cluster/data/mm6/chrom.sizes ./S1.len sort -rn +1 /cluster/data/hg17/chrom.sizes > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \ blast.run.out 2>&1 & # real 993m28.547s # user 0m0.198s # sys 0m0.171s # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r # STARTED - 2005-03-17 21:25 # FINISHED - 2005-03-18 14:00 # Completed: 45347 of 45347 jobs # CPU time in finished jobs: 16921981s 282033.02m 4700.55h 195.86d 0.537 y # IO & Wait Time: 2381711s 39695.18m 661.59h 27.57d 0.076 y # Average job time: 426s 7.09m 0.12h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 9568s 159.47m 2.66h 0.11d # Submission to last job: 58695s 978.25m 16.30h 0.68d # Completed: 331 of 331 jobs # CPU time in finished jobs: 272s 4.54m 0.08h 0.00d 0.000 y # IO & Wait Time: 1145s 19.08m 0.32h 0.01d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest job: 24s 0.40m 0.01h 0.00d # Submission to last job: 265s 4.42m 0.07h 0.00d # The kki batch doChainRun.csh appears to have failed # due to underlying changes in the location of hg17 items # fixup the symlinks which are in a state of flux today, then, # to recover: ssh kki cd /cluster/data/mm6/bed/blastzHg17.2005_03_14/axtChain/run rm -fr chain time ./doChainRun.csh # real 22m47.917s # user 0m0.380s # sys 0m0.630s # Completed: 40 of 40 jobs # CPU time in finished jobs: 6373s 106.22m 1.77h 0.07d 0.000 y # IO & Wait Time: 552s 9.20m 0.15h 0.01d 0.000 y # Average job time: 173s 2.89m 0.05h 0.00d # Longest job: 662s 11.03m 0.18h 0.01d # Submission to last job: 1200s 20.00m 0.33h 0.01d # That was the last part of the chainRun step, can now continue: ssh kksilo cd /cluster/data/mm6/bed/blastzHg17.2005_03_14 time /cluster/bin/scripts/doBlastzChainNet.pl -continue chainMerge `pwd`/DEF > chainMerge.run.out 2>&1 & # STARTED - 2005-03-18 15:00 # FINISHED 2005-03-18 16:33 # checking the numbers for sanity: ssh hgwdev # expect ~ 2m30 seconds for chain measurement time featureBits mm6 chainHg17 # 2596946329 bases of 2597150411 (99.992%) in intersection time featureBits mm5 chainHg17 # 2507720521 bases of 2615483787 (95.880%) in intersection # expect ~ 2m30s seconds for net measurement time featureBits mm6 netHg17 # 2579747741 bases of 2597150411 (99.330%) in intersection 
    time featureBits mm5 netHg17
    #   2504056038 bases of 2615483787 (95.740%) in intersection

    ssh kolossus
    #   expect ~ 20-22 minutes for the chainLink measurement
    HGDB_CONF=~/.hg.conf.read-only /usr/bin/time --portability \
        featureBits mm6 chainHg17Link
    #   966916309 bases of 2597150411 (37.230%) in intersection
    HGDB_CONF=~/.hg.conf.read-only /usr/bin/time --portability \
        featureBits mm5 chainHg17Link
    #   1025750185 bases of 2615483787 (39.218%) in intersection

    #   swap results to place mm6 alignments onto Hg17
    time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \
        swap.run.out 2>&1 &
    #   STARTED - 2005-03-29 - 15:58
    #   FINI - 2005-03-29 - 18:48
    #   real    171m26.172s
    #   user    0m2.270s
    #   sys     0m0.870s
    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainMm6Link
    #   969459954 bases of 2866216770 (33.824%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainMm5Link
    #   1020106336 bases of 2866216770 (35.591%) in intersection

    #   A measurement script to do all featureBits combinations:
    cd /cluster/data/mm6/jkStuff
    cat << '_EOF_' > netChainCheck.sh
#!/bin/sh

usage() {
    echo "usage: netChainCheck.sh <db0> <db1> <targetDb>"
    echo "    does: featureBits <db0> net<TargetDb>"
    echo "          featureBits <db1> net<TargetDb>"
    echo "    as well as the chain and chainLink tables,"
    echo "    and on the targetDb:"
    echo "          featureBits <targetDb> net<Db0>"
    echo "          featureBits <targetDb> net<Db1>"
    echo "    and the chain and chainLink tables."
    echo -e "\texample: netChainCheck.sh mm6 mm5 fr1"
}

doOne() {
    db=$1
    tbl=$2
    echo "    featureBits $db $tbl"
    echo -en "    #\t"
    time featureBits $db $tbl
}

ucFirstLetter() {
    ucString="$1"
    fc=`echo "${ucString}" | sed -e "s/\(.\).*/\1/"`
    rest=`echo "${ucString}" | sed -e "s/.\(.*\)/\1/"`
    FC=`echo "${fc}" | tr '[a-z]' '[A-Z]'`
    echo "${FC}${rest}"
}

if [ "$#" -ne 3 ]; then
    usage
    exit 255
fi

db0=$1
db1=$2
targetDb=$3
targetDB=`ucFirstLetter "${targetDb}"`
DB0=`ucFirstLetter "${db0}"`
DB1=`ucFirstLetter "${db1}"`
export db0 db1 targetDb targetDB DB0 DB1
# echo "${db0} ${db1} ${targetDb} ${targetDB} ${DB0} ${DB1}"

doOne "${db0}" net${targetDB}
doOne "${db1}" net${targetDB}
doOne "${db0}" chain${targetDB}
doOne "${db1}" chain${targetDB}
doOne "${db0}" chain${targetDB}Link
doOne "${db1}" chain${targetDB}Link
doOne ${targetDb} net${DB0}
doOne ${targetDb} net${DB1}
doOne ${targetDb} chain${DB0}
doOne ${targetDb} chain${DB1}
doOne ${targetDb} chain${DB0}Link
doOne ${targetDb} chain${DB1}Link
'_EOF_'
    # << keep emacs coloring happy

#########################################################################
# BLASTZ RAT Rn3 (FIRST TRY - 2005-03-15 - 2005-03-17 - Hiram)
#   THESE ARE THE CORRECT PARAMETERS - the second try was not used,
#   it was too restrictive and cuts out too many alignments
    ssh kksilo
    mkdir /cluster/data/mm6/bed/blastzRn3.2005_03_15
    cd /cluster/data/mm6/bed/blastzRn3.2005_03_15
    cat << '_EOF_' > DEF
# mouse vs.
human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Mouse SEQ1_DIR=/panasas/store/mm6/nib # not used SEQ1_RMSK=/panasas/store/mm6/rmsk # not used SEQ1_FLAG=-rodent SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInRat SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Rat SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs # RMSK not currently used SEQ2_RMSK= # FLAG not currently used SEQ2_FLAG= SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzRn3.2005_03_15 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp /cluster/data/mm6/chrom.sizes ./S1.len sort -rn +1 /cluster/data/rn3/chrom.sizes > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \ blast.run.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r # Completed: 40713 of 40713 jobs # CPU time in finished jobs: 18170174s 302836.24m 5047.27h 210.30d 0.576 y # IO & Wait Time: 1770530s 29508.83m 491.81h 20.49d 0.056 y # Average job time: 490s 8.16m 0.14h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 28252s 470.87m 7.85h 0.33d # Submission to last job: 69864s 1164.40m 19.41h 0.81d # Completed: 331 of 331 jobs # CPU time in finished jobs: 1168s 19.46m 0.32h 0.01d 0.000 y # IO & Wait Time: 3047s 50.79m 0.85h 0.04d 0.000 y # Average job time: 13s 0.21m 0.00h 0.00d # Longest job: 119s 1.98m 0.03h 0.00d # Submission to last job: 359s 5.98m 0.10h 0.00d # Completed: 40 of 40 jobs # CPU time in finished jobs: 12274s 204.56m 3.41h 0.14d 0.000 y # IO & Wait Time: 1719s 28.66m 0.48h 0.02d 0.000 y # Average job time: 350s 5.83m 0.10h 0.00d # Longest job: 1016s 16.93m 0.28h 0.01d # Submission to last job: 1482s 24.70m 0.41h 0.02d # After this was complete, realized that it needs a minScore # filter on the chaining step. Also, we need some pslChrom files # for Gill's work: ssh kksilo /cluster/data/mm6/bed/blastzRn3.2005_03_15 cat << '_EOF_' > mkPslChrom.sh #!/bin/sh if [ -d pslChrom ]; then mv pslChrom pslChrom.0 rm -fr pslChrom.0 & fi mkdir pslChrom ls pslParts | sed -e "s/.nib.*//" | sort -u | while read C do echo -n "working: ${C} ... 
" zcat `ls pslParts/${C}.nib* | sort --field-separator=':' -k1,1 -k3,3n` \ > pslChrom/${C}.psl echo "done" done '_EOF_' # << keep emacs coloring happy chmod +x mkPslChrom.sh ./mkPslChrom.sh # After the experiment of 2005-03-22 # RELOADING these chains and nets ssh hgwdev hgsql mm6 -e "drop table netRn3;" hgsql mm6 -e "show tables;" | grep chainRn3 | while read T do hgsql mm6 -e "drop table ${T};" echo ${T} done # kksilo currently off-limits to logins due to hardware difficulties ssh kolossus cd /cluster/data/mm6/bed/blastzRn3.2005_03_15/axtChain chainSplit chain mm6.rn3.all.chain.gz ssh hgwdev cat << '_EOF_' > reLoad.csh #!/bin/csh -ef # Load chains: cd /cluster/data/mm6/bed/blastzRn3.2005_03_15/axtChain/chain foreach f (*.chain) set c = $f:r echo "hgLoadChain mm6 ${c}_chainRn3 $f" hgLoadChain mm6 ${c}_chainRn3 $f end cd /cluster/data/mm6/bed/blastzRn3.2005_03_15/axtChain # Load nets: netFilter -minGap=10 mm6.rn3.net.gz | hgLoadNet -verbose=0 mm6 netRn3 stdin '_EOF_' # << keep emacs coloring happy chmod +x reLoad.csh time ./reLoad.csh # ~ 188m == 3 hours 8 min # Measurements: time featureBits mm6 netRn3 # expect ~ 2m 12s # 2720144602 bases of 2597150411 (104.736%) in intersection time featureBits mm5 netRn3 # 2638255333 bases of 2615483787 (100.871%) in intersection time featureBits mm6 chainRn3 # expect ~ 10m 30s to 13m 25s # 2768422449 bases of 2597150411 (106.595%) in intersection time featureBits mm5 chainRn3 # 2646682349 bases of 2615483787 (101.193%) in intersection ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainRn3Link # 1802980225 bases of 2597150411 (69.421%) in intersection # real 94m48.021s time HGDB_CONF=~/.hg.conf.read-only featureBits mm5 chainRn3Link # 1798705001 bases of 2615483787 (68.771%) in intersection # real 76m44.580s # Prepare for a re-run of the swap, move the 2005_03_22 swap # results out of the way ssh hgwdev cd /cluster/data/rn3/bed mv blastz.mm6.swap blastz.mm6.swap.2005_03_22 cd /cluster/data/mm6/bed/blastzRn3.2005_03_15 time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \ swap.run_1.out 2>&1 & # STARTED - 2005-03-29 15:55 # FINI - 2005-03-30 05:21 # real 807m3.833s # user 0m2.200s # sys 0m1.150s ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits rn3 chainMm6Link # 1812992492 bases of 2571104688 (70.514%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits rn3 chainMm5Link # 1673171206 bases of 2571104688 (65.076%) in intersection ######################################################################### # BLASTZ RAT REDONE 2005-03-22 # THIS WAS AN EXPERIMENT - THESE RESULTS WERE DROPPED FROM THE DB # more stringent BLASTZ parameters and chain filtering # COMPLETE 2005-03-23 - swap to place chainMm6 and netMm6 on rn3 browser ssh kksilo mkdir /cluster/data/mm6/bed/blastzRn3.2005_03_22 cd /cluster/data/mm6/bed/blastzRn3.2005_03_22 ### XXXX - 2005-03-31 - THERE IS AN ERROR IN THIS DEF FILE SPECIFYING ### the SEQ2_LEN but it didn't seem to matter. The blastz run appears ### to ahve used SEQ2 correctly despite this incorrect specification. cat << '_EOF_' > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/bin/scripts:/cluster/home/angie/schwartzbin/ # mouse vs. 
rat # more stringent parameters to reduce output BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=50000 BLASTZ_T=2 # scoring matrix BLASTZ_Q=/cluster/data/blastz/mus_rat.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse (mm6) SEQ1_DIR=/panasas/store/mm6/nib SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInRat SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat (rn3) SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 SEQ2_LEN=/iscratch/i/bosTau1/chrom.sizes # XXXXXXXXXXXXXXX ^^^^^^^ wrong file ! XXXXXXXXXXXXXX BASE=/cluster/data/mm6/bed/blastzRn3.2005_03_22 SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp /cluster/data/mm6/chrom.sizes ./S1.len sort -rn +1 /cluster/data/rn3/chrom.sizes > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore 5000 \ `pwd`/DEF > blast.run.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r # These more strict blastz parameters make this run much faster; # Completed: 40713 of 40713 jobs # CPU time in finished jobs: 4813023s 80217.06m 1336.95h 55.71d 0.153 y # IO & Wait Time: 1788355s 29805.91m 496.77h 20.70d 0.057 y # Average job time: 162s 2.70m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2543s 42.38m 0.71h 0.03d # Submission to last job: 10945s 182.42m 3.04h 0.13d # Completed: 331 of 331 jobs # CPU time in finished jobs: 146s 2.43m 0.04h 0.00d 0.000 y # IO & Wait Time: 840s 14.00m 0.23h 0.01d 0.000 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest job: 7s 0.12m 0.00h 0.00d # Submission to last job: 66s 1.10m 0.02h 0.00d # Completed: 40 of 40 jobs # CPU time in finished jobs: 3870s 64.50m 1.08h 0.04d 0.000 y # IO & Wait Time: 364s 6.06m 0.10h 0.00d 0.000 y # Average job time: 106s 1.76m 0.03h 0.00d # Longest job: 224s 3.73m 0.06h 0.00d # Submission to last job: 406s 6.77m 0.11h 0.00d # FINISHED - 2005-03-22 15:44 # it is helpful to time these commands to make sure everything # is sane. The times should also be similar, as are the numbers. # check the numbers time featureBits mm6 chainRn3 2705309999 bases of 2597150411 (104.165%) in intersection time featureBits mm5 chainRn3 2646682349 bases of 2615483787 (101.193%) in intersection # These chainRn3 tables on mm6 are much smaller than mm5 # There was an attempt during mm5 to run the blastz on rn3 with # the stringent parameters used here, but that ran into # difficulties as there were other tracks depending upon the # older bulky alignments and it was necessary to bring the bulky # alignments back. 
There is a vast difference in the number of # chains: mm5.chr1_chainRn3: 1,865,181 rows, mm6.chr1_chainRn3: 16466 # mm5.chr1_chainRn3Link: 18,252,937 rows, mm6.chr1_chainRn3Link: 2,340,447 # trying to do the chainLink's requires kolossus, big memory ssh kolossus # specify a .hg.conf file with read-only passwords: # these take about 15 and 90 minutes (the mm5's are much larger) HGDB_CONF=~/.hg.conf.read-only /usr/bin/time --portability \ featureBits mm6 chainRn3Link # 1652692239 bases of 2597150411 (63.635%) in intersection # real 864.72 # user 211.05 # sys 66.95 # 1802980225 bases of 2597150411 (69.421%) in intersection HGDB_CONF=~/.hg.conf.read-only /usr/bin/time --portability \ featureBits mm5 chainRn3Link # 1798705001 bases of 2615483787 (68.771%) in intersection # the netRn3 measurements take about 2m30s time featureBits mm6 netRn3 # 2705309999 bases of 2597150411 (104.165%) in intersection time featureBits mm5 netRn3 # 2638255333 bases of 2615483787 (100.871%) in intersection # And then the swap of that: time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \ swap.run.out 2>&1 & # STARTED - 2005-03-22 16:15 # FINISHED - 2005-03-22 17:41 # check the numbers featureBits rn3 chainMm6 # 2819351420 bases of 2571104688 (109.655%) in intersection featureBits rn3 chainMm5 # 2786666162 bases of 2571104688 (108.384%) in intersection featureBits rn3 netMm6 # 2808675438 bases of 2571104688 (109.240%) in intersection featureBits rn3 netMm5 # 2778454647 bases of 2571104688 (108.065%) in intersection ######################################################################### # BLASTZ Zebrafish danRer2 (DONE - 2005-03-17 - 2005-03-18 - Hiram) ssh kksilo mkdir /cluster/data/mm6/bed/blastzDanRer2.2005_03_17 cd /cluster/data/mm6/bed/blastzDanRer2.2005_03_17 cat << '_EOF_' > DEF # mouse (mm6) vs zebrafish (danRer2) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from hg16-fr1, danRer-hg17 and mm6-danRer1 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Mouse SEQ1_DIR=/panasas/store/mm6/nib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK= SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer2) SEQ2_DIR=/iscratch/i/danRer2/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK= SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzDanRer2.2005_03_17 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp /cluster/data/mm6/chrom.sizes ./S1.len sort -rn +1 /cluster/data/danRer2/chrom.sizes > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \ blast.run.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r # STARTED - 2005-03-17 10:00 # FINISHED - 2005-03-18 01:01 # real 494m43.717s # user 0m0.322s # sys 0m0.184s # Completed: 57263 of 57263 jobs # CPU time in finished jobs: 14680881s 244681.36m 4078.02h 169.92d 0.466 y # IO & Wait Time: 320049s 5334.14m 88.90h 3.70d 0.010 y # Average job time: 262s 4.37m 0.07h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1308s 21.80m 0.36h 0.02d # Submission to last job: 51993s 866.55m 14.44h 0.60d # Completed: 331 of 331 jobs # CPU time in finished jobs: 87s 1.46m 0.02h 0.00d 0.000 y # IO & Wait Time: 869s 14.48m 0.24h 0.01d 0.000 y # 
Average job time: 3s 0.05m 0.00h 0.00d # Longest job: 8s 0.13m 0.00h 0.00d # Submission to last job: 161s 2.68m 0.04h 0.00d # Completed: 40 of 40 jobs # CPU time in finished jobs: 2496s 41.60m 0.69h 0.03d 0.000 y # IO & Wait Time: 295s 4.92m 0.08h 0.00d 0.000 y # Average job time: 70s 1.16m 0.02h 0.00d # Longest job: 139s 2.32m 0.04h 0.00d # Submission to last job: 470s 7.83m 0.13h 0.01d # swap results to place mm6 alignments onto danRer2 ssh hgwdev cd /cluster/data/mm6/bed/blastzDanRer2.2005_03_17 time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \ -swap `pwd`/DEF > swap.run.out 2>&1 & # ~ 27 minutes cat << '_EOF_' > measurements.sh #!/bin/sh doOne() { db=$1 tbl=$2 echo " featureBits $db $tbl" echo -en " #\t" time featureBits $db $tbl } doOne mm6 netDanRer2 doOne mm5 netDanRer2 doOne mm6 chainDanRer2 doOne mm5 chainDanRer2 doOne mm6 chainDanRer2Link doOne mm5 chainDanRer2Link doOne danRer2 netMm6 doOne danRer2 netMm5 doOne danRer2 chainMm6 doOne danRer2 chainMm5 doOne danRer2 chainMm6Link doOne danRer2 chainMm5Link '_EOF_' # << keep emacs happy chmod +x measurements.sh time ./measurements.sh > measures.out 2>&1 & featureBits mm6 netDanRer2 # 686375730 bases of 2597150411 (26.428%) in intersection featureBits mm5 netDanRer2 # 553450442 bases of 2615483787 (21.161%) in intersection featureBits mm6 chainDanRer2 # 782392894 bases of 2597150411 (30.125%) in intersection featureBits mm5 chainDanRer2 # 598864029 bases of 2615483787 (22.897%) in intersection featureBits mm6 chainDanRer2Link # 162226493 bases of 2597150411 (6.246%) in intersection featureBits mm5 chainDanRer2Link # 59978861 bases of 2615483787 (2.293%) in intersection featureBits danRer2 netMm6 # 576283947 bases of 1560497282 (36.930%) in intersection featureBits danRer2 netMm5 # 476966014 bases of 1560497282 (30.565%) in intersection featureBits danRer2 chainMm6 # 641696461 bases of 1560497282 (41.121%) in intersection featureBits danRer2 chainMm5 # 505097651 bases of 1560497282 (32.368%) in intersection featureBits danRer2 chainMm6Link # 176391894 bases of 1560497282 (11.304%) in intersection featureBits danRer2 chainMm5Link # 68003819 bases of 1560497282 (4.358%) in intersection ######################################################################### # BLASTZ FUGU fr1 (DONE - 2005-03-17 - 2005-03-19 - Hiram) ssh kksilo mkdir /cluster/data/mm6/bed/blastzFr1.2005_04_01 cd /cluster/data/mm6/bed/blastzFr1.2005_04_01 cat << '_EOF_' > DEF # mouse vs. 
fugu export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from human-chicken, except L=6000 (more relaxed) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Mouse SEQ1_DIR=/panasas/store/mm6/nib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Fugu SEQ2_DIR=/iscratch/i/fr1/nib SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzFr1.2005_04_01 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy cp /cluster/data/mm6/chrom.sizes ./S1.len sort -rn +1 /cluster/data/fr1/chrom.sizes > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -bigClusterHub kki \ -fileServer eieio -chainMinScore 5000 `pwd`/DEF > blast.run.out 2>&1 & # STARTED - 2005-04-01 16:30 time /cluster/bin/scripts/doBlastzChainNet.pl -bigClusterHub kk9 \ -continue cat -fileServer eieio -chainMinScore 5000 \ `pwd`/DEF > cat.run.out 2>&1 & # STARTED - 2005-04-01 16:30 # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r # STARTED - 2005-03-17 11:00 # FINISHED - 2005-03-19 00:14 time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \ -chainMinScore 5000 -swap `pwd`/DEF > swap.run.out 2>&1 & # measurements featureBits mm6 netFr1 # 618129802 bases of 2597150411 (23.800%) in intersection featureBits mm6 chainFr1 # 666835089 bases of 2597150411 (25.676%) in intersection featureBits mm6 chainFr1Link # 55355465 bases of 2597150411 (2.131%) in intersection featureBits fr1 netMm6 # 146828640 bases of 315518167 (46.536%) in intersection featureBits fr1 chainMm6 # 160874127 bases of 315518167 (50.987%) in intersection featureBits fr1 chainMm6Link # 46266090 bases of 315518167 (14.664%) in intersection ######################################################################### # BLASTZ TETRAODON tetNig1 (TBD - 2005-03-17 - Hiram) ssh kksilo mkdir /cluster/data/mm6/bed/blastzTetNig1.2005_03_17 cd /cluster/data/mm6/bed/blastzTetNig1.2005_03_17 # use same parameters as for danRer1-mm5 # NOTE: The BLASTZ_Q score matrix should have been the Tuned.gap # one which is recreated below during the re-score cat << '_EOF_' > DEF # mouse (mm6) vs Tetraodon (tetNig1) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from hg16-fr1 and danRer1-hg17. 
BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Mouse SEQ1_DIR=/panasas/store/mm6/nib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK= SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Tetraodon (tetNig1) SEQ2_DIR=/iscratch/i/tetNig1/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK= SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzTetNig1.2005_03_17 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy cp /cluster/data/mm6/chrom.sizes ./S1.len sort -rn +1 /cluster/data/tetNig1/chrom.sizes > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -bigClusterHub kk9 \ `pwd`/DEF > blast.run.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r # STARTED 2005-03-17 21:30 # FINISHED 2005-03-18 05:10 # real 461m56.901s # user 0m0.426s # sys 0m0.310s # Completed: 18867 of 18867 jobs # CPU time in finished jobs: 2396227s 39937.11m 665.62h 27.73d 0.076 y # IO & Wait Time: 53160s 886.00m 14.77h 0.62d 0.002 y # Average job time: 130s 2.16m 0.04h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 591s 9.85m 0.16h 0.01d # Submission to last job: 26573s 442.88m 7.38h 0.31d # Completed: 331 of 331 jobs # CPU time in finished jobs: 27s 0.45m 0.01h 0.00d 0.000 y # IO & Wait Time: 798s 13.30m 0.22h 0.01d 0.000 y # Average job time: 2s 0.04m 0.00h 0.00d # Longest job: 6s 0.10m 0.00h 0.00d # Submission to last job: 200s 3.33m 0.06h 0.00d # Completed: 40 of 40 jobs # CPU time in finished jobs: 688s 11.47m 0.19h 0.01d 0.000 y # IO & Wait Time: 169s 2.82m 0.05h 0.00d 0.000 y # Average job time: 21s 0.36m 0.01h 0.00d # Longest job: 55s 0.92m 0.02h 0.00d # Submission to last job: 212s 3.53m 0.06h 0.00d # Re-do the chains with different scoring matrix ssh kki cd /cluster/data/mm6/bed/blastzTetNig1.2005_03_17/axtChain/run # Reuse gap penalties from chicken run. # It turns out this is unnecessary. This scoring matrix is # actually the default scoring matrix used in axtChain # The processing below does not use this file. 
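    # (For reference: the file written out below is a linear gap cost
    # table in the form axtChain uses -- the "position" row lists
    # gap-length breakpoints, and the qGap/tGap/bothGap rows give the
    # penalties for query-only, target-only and double-sided gaps at
    # each breakpoint.  As noted above these values duplicate the
    # axtChain defaults, so the re-run below does not actually change
    # the scoring.)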
cat << '_EOF_' | sed 's/ */\t/g' > ../../Tuned.gap tablesize 11 smallSize 111 position 1 2 3 11 111 2111 12111 32111 72111 152111 252111 qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000 '_EOF_' # << this line keeps emacs coloring happy rm batch para make jobList para check para time # Completed: 40 of 40 jobs # CPU time in finished jobs: 692s 11.54m 0.19h 0.01d 0.000 y # IO & Wait Time: 295s 4.91m 0.08h 0.00d 0.000 y # Average job time: 25s 0.41m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 60s 1.00m 0.02h 0.00d # Submission to last job: 87s 1.45m 0.02h 0.00d ssh kolossus cd /cluster/data/mm6/bed/blastzTetNig1.2005_03_17 mv mafNet mafNet.0 mv axtNet axtNet.0 cd /cluster/data/mm6/bed/blastzTetNig1.2005_03_17/axtChain mv mm6.tetNig1.all.chain.gz mm6.tetNig1.all.chain.gz.0 mv mm6.tetNig1.over.chain.gz mm6.tetNig1.over.chain.gz.0 mv mm6.tetNig1.net.gz mm6.tetNig1.net.gz.0 chainMergeSort run/chain/*.chain | nice gzip -c > mm6.tetNig1.all.chain.gz chainSplit chain mm6.tetNig1.all.chain.gz time ./netChains.csh # ~ 3m17s ssh hgwdev cd /cluster/data/mm6/bed/blastzTetNig1.2005_03_17/axtChain time ./loadUp.csh # ~ 7m40s gzip mm6.tetNig1.net cd /usr/local/apache/htdocs/goldenPath/mm6/vsTetNig1 md5sum *.gz axtNet/*.gz > md5sum.txt # swap results to place mm6 alignments onto TetNig1 ssh hgwdev cd /cluster/data/mm6/bed/blastzTetNig1.2005_03_17 rm -fr /usr/local/apache/htdocs/goldenPath/tetNig1/vsMm6 time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \ -swap `pwd`/DEF > swap.run.out 2>&1 & # Measurements: cat << '_EOF_' > measurements.sh #!/bin/sh doOne() { db=$1 tbl=$2 echo " featureBits $db $tbl" echo -en " #\t" time featureBits $db $tbl } doOne mm6 netTetNig1 doOne mm5 netTetNig1 doOne mm6 chainTetNig1 doOne mm5 chainTetNig1 doOne mm6 chainTetNig1Link doOne mm5 chainTetNig1Link doOne tetNig1 netMm6 doOne tetNig1 netMm5 doOne tetNig1 chainMm6 doOne tetNig1 chainMm5 doOne tetNig1 chainMm6Link doOne tetNig1 chainMm5Link '_EOF_' # << keep emacs happy chmod +x measurements.sh time ./measurements.sh featureBits mm6 netTetNig1 # 720943295 bases of 2597150411 (27.759%) in intersection featureBits mm5 netTetNig1 # 618111072 bases of 2615483787 (23.633%) in intersection featureBits mm6 chainTetNig1 # 771732145 bases of 2597150411 (29.715%) in intersection featureBits mm5 chainTetNig1 # 652622662 bases of 2615483787 (24.952%) in intersection featureBits mm6 chainTetNig1Link # 62346107 bases of 2597150411 (2.401%) in intersection featureBits mm5 chainTetNig1Link # 43905129 bases of 2615483787 (1.679%) in intersection featureBits tetNig1 netMm6 # 176451958 bases of 342403326 (51.533%) in intersection featureBits tetNig1 netMm5 # 152232538 bases of 342403326 (44.460%) in intersection featureBits tetNig1 chainMm6 # 197657323 bases of 342403326 (57.726%) in intersection featureBits tetNig1 chainMm5 # 163683179 bases of 342403326 (47.804%) in intersection featureBits tetNig1 chainMm6Link # 55282376 bases of 342403326 (16.145%) in intersection featureBits tetNig1 chainMm5Link # 41736750 bases of 342403326 (12.189%) in intersection ######################################################################### # CPGISLANDS (DONE - 2005-03-17 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm6/bed/cpgIsland cd /cluster/data/mm6/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands 
make # gcc readseq.c cpg_lh.c -o cpglh.exe mv cpglh.exe ../.. # cpglh.exe requires hard-masked (N) .fa's. # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. ssh kksilo cd /cluster/data/mm6/bed/cpgIsland for F in ../../*/chr*.fa.masked do FA=${F/*\/} C=${FA/.fa.masked/} echo "./cpglh.exe ${FA} > ${C}.cpg" ./cpglh.exe ${F} > ${C}.cpg done > cpglh.out 2>&1 & # three warnings: # Bad char 0x52 = 'R' at line 164245, base 8212187, sequence chr14 # Bad char 0x53 = 'S' at line 167424, base 8371114, sequence chr14 # Bad char 0x53 = 'S' at line 167426, base 8371198, sequence chr14 # Several chroms have 0 results: # -rw-rw-r-- 1 0 Mar 17 12:13 chr10_random.cpg # -rw-rw-r-- 1 0 Mar 17 12:18 chr9_random.cpg # -rw-rw-r-- 1 0 Mar 17 12:18 chrM.cpg # -rw-rw-r-- 1 0 Mar 17 12:18 chrY.cpg # -rw-rw-r-- 1 0 Mar 17 12:18 chrY_random.cpg # XXX - this is interesting that chrY, either one, have nothing. # the previous mm5 release did have some on chrY # Evidently the new chrY is too short - this chrY is being # reconstructed and only a small part of it is known in this # assembly. The bulk of chrY from previous assemblies is now in # chrY_random # Transform cpglh output to bed + cat << '_EOF_' > filter.awk { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk chr*.cpg > cpgIsland.bed ssh hgwdev cd /cluster/data/mm6/bed/cpgIsland hgLoadBed mm6 cpgIslandExt -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Loaded 16100 elements of size 10 # Sorted # Saving bed.tab # Loading mm6 featureBits mm6 cpgIslandExt # 10432360 bases of 2597150411 (0.402%) in intersection featureBits mm5 cpgIslandExt # 10422989 bases of 2615483787 (0.399%) in intersection featureBits mm4 cpgIsland # 11109692 bases of 2627444668 (0.423%) in intersection featureBits mm3 cpgIsland # 10102968 bases of 2505900260 (0.403%) in intersection ######################################################################### # BLASTZ Dog canFam1 (DONE - 2005-03-18 - 2005-04-03 - Hiram) ssh kksilo mkdir /cluster/data/mm6/bed/blastzCanFam1.2005_03_18 cd /cluster/data/mm6/bed/blastzCanFam1.2005_03_18 cat << '_EOF_' > DEF # mouse vs. 
dog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Default BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse (mm6) SEQ1_DIR=/panasas/store/mm6/nib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInDog SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog (canFam1) SEQ2_DIR=/scratch/hg/canFam1/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzCanFam1.2005_03_18 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp /cluster/data/mm6/chrom.sizes ./S1.len sort -rn +1 /cluster/data/canFam1/chrom.sizes > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -bigClusterHub kk9 \ -fileServer eieio -chainMinScore 5000 `pwd`/DEF > blast.run.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r # STARTED - 2005-03-18 09:57 # BROKEN - 2005-03-20 11:00 - due to rearrangements on /scratch/ # need to regenerate the linSpec not in mouse for canFam1 # RESTARTED - 2005-03-30 16:25 # COMPLETELY RESTARTED 2005-04-01 11:40 time /cluster/bin/scripts/doBlastzChainNet.pl -bigClusterHub kk9 \ -continue cat -fileServer eieio -chainMinScore 5000 \ `pwd`/DEF > cat.run.out 2>&1 & # Check measurements at this point: featureBits mm6 netCanFam1 # 2544343230 bases of 2597150411 (97.967%) in intersection featureBits mm5 netCanFam1 # 2456773441 bases of 2615483787 (93.932%) in intersection featureBits mm6 chainCanFam1 # 2562947250 bases of 2597150411 (98.683%) in intersection featureBits mm5 chainCanFam1 # 2464497454 bases of 2615483787 (94.227%) in intersection featureBits mm6 chainCanFam1Link # 798637320 bases of 2597150411 (30.751%) in intersection featureBits mm5 chainCanFam1Link # 859275338 bases of 2615483787 (32.853%) in intersection # Those are looking pretty good, so now do the swap: ssh eieio cd /cluster/data/mm6/bed/blastzCanFam1.2005_03_18 time /cluster/bin/scripts/doBlastzChainNet.pl -bigClusterHub kk9 \ -swap -fileServer eieio -chainMinScore 5000 \ `pwd`/DEF > swap.run.out 2>&1 & # 125 minutes featureBits canFam1 netMm6 # 2305458923 bases of 2359845093 (97.695%) in intersection featureBits canFam1 netMm5 # 2255138517 bases of 2359845093 (95.563%) in intersection featureBits canFam1 chainMm6 # 2310615069 bases of 2359845093 (97.914%) in intersection featureBits canFam1 chainMm5 # 2257403477 bases of 2359845093 (95.659%) in intersection featureBits canFam1 chainMm6Link # 783631188 bases of 2359845093 (33.207%) in intersection featureBits canFam1 chainMm5Link # 837236252 bases of 2359845093 (35.478%) in intersection ######################################################################### # BLASTZ Cow bosTau1 (DONE - 2005-03-18 - 2005-04-08 - Hiram) ssh kksilo mkdir /cluster/data/mm6/bed/blastzBosTau1.2005_03_18 cd /cluster/data/mm6/bed/blastzBosTau1.2005_03_18 cat << '_EOF_' > DEF # mouse vs. 
cow # TARGET: Mouse (mm6) SEQ1_DIR=/panasas/store/mm6/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/mm6/chrom.sizes # QUERY: Cow (bosTau1) SEQ2_DIR=/iscratch/i/bosTau1/nib/bosTau1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=0 SEQ2_LEN=/iscratch/i/bosTau1/chrom.sizes BASE=/cluster/data/mm6/bed/blastzBosTau1.2005_03_18 '_EOF_' # << keep emacs coloring happy # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl \ `pwd`/DEF > blast.run.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r # STARTED - 2005-03-18 13:20 # BROKEN - 2005-03-20 - 22:03 - power failure to all machines # RESTARTED - 2005-03-30 14:35 # After several reruns of the batch, believe it may be finished. # establish check point marker in the run.time file: para time > run.time # Now to the rest of the story: ssh eieio cd /cluster/data/mm6/bed/blastzBosTau1.2005_03_18 time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \ -continue cat `pwd`/DEF > cat.run.out 2>&1 & # Completed: 40 of 40 jobs # CPU time in finished jobs: 834s 13.90m 0.23h 0.01d 0.000 y # IO & Wait Time: 2421s 40.35m 0.67h 0.03d 0.000 y # Average job time: 81s 1.36m 0.02h 0.00d # Longest job: 334s 5.57m 0.09h 0.00d # Submission to last job: 365s 6.08m 0.10h 0.00d # measurements: ssh hgwdev cd /cluster/data/mm6/bed/blastzBosTau1.2005_03_18 time ../../jkStuff/netChainCheck.sh mm6 mm5 bosTau1 >measurements.out 2>&1 & featureBits mm6 netBosTau1 # 1483158691 bases of 2597150411 (57.107%) in intersection featureBits mm5 netBosTau1 # 1491250043 bases of 2615483787 (57.016%) in intersection featureBits mm6 chainBosTau1 # 1551920940 bases of 2597150411 (59.755%) in intersection featureBits mm5 chainBosTau1 # 1557897465 bases of 2615483787 (59.564%) in intersection featureBits mm6 chainBosTau1Link # 603091864 bases of 2597150411 (23.221%) in intersection featureBits mm5 chainBosTau1Link # 606973993 bases of 2615483787 (23.207%) in intersection # Looking OK, so do the swap ssh eieio cd /cluster/data/mm6/bed/blastzBosTau1.2005_03_18 time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \ -swap `pwd`/DEF > swap.run.out 2>&1 & # 308 m = 5h 8m # failed on kolossus due to NFS problems ssh kolossus cd /cluster/data/bosTau1/bed/blastz.mm6.swap/axtChain # extract the unfinished portion of netChains.csh into # finiChains.csh and run it: time ./finiChains.csh # STARTED - 2005-04-06 # 13h 50m # continuing ssh eieio cd /cluster/data/mm6/bed/blastzBosTau1.2005_03_18 time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \ -swap -continue load `pwd`/DEF > load.run.out 2>&1 & # 5h 6min load time # checking measurements: featureBits bosTau1 netMm6 # 1317934269 bases of 2261116798 (58.287%) in intersection featureBits bosTau1 netMm5 # 1317539731 bases of 2261116798 (58.269%) in intersection featureBits bosTau1 chainMm6 # 1325743373 bases of 2261116798 (58.632%) in intersection featureBits bosTau1 chainMm5 # 1325445280 bases of 2261116798 (58.619%) in intersection featureBits bosTau1 chainMm6Link # 589779558 bases of 2261116798 (26.084%) in intersection featureBits bosTau1 chainMm5Link # 588460684 bases of 2261116798 (26.025%) in intersection # looks good, done. 
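    # The jkStuff/netChainCheck.sh helper used above is not reproduced in
    # this doc.  A minimal sketch of what it presumably does, following the
    # same featureBits pattern as the measurements.sh scripts used for
    # danRer2 and tetNig1 above (hypothetical reconstruction -- the real
    # script may differ, and the chain*Link tables of close species may
    # still need kolossus for memory, as noted for chainRn3Link earlier):
    cat << '_EOF_' > netChainCheck.sketch.sh
#!/bin/sh
# usage: netChainCheck.sketch.sh <newDb> <oldDb> <otherDb>
new=$1
old=$2
other=$3
# capitalize the first letter of the other assembly to form table names
Other=`echo $other | perl -wpe 's/^(.)/\u$1/'`
for tbl in net${Other} chain${Other} chain${Other}Link
do
    for db in $new $old
    do
	echo "featureBits $db $tbl"
	featureBits $db $tbl
    done
done
_EOF_
    # << keep emacs coloring happy
    chmod +x netChainCheck.sketch.sh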
############################################################################# # BLASTZ SELF (DONE - 2005-03-31 - 2005-04-08 - Hiram) # The procedure for lineage spec business with self is to simply # use the actual repeat masker output for this mouse assembly as # the lineage specific repeats for itself. Thus, merely make # symlinks to the repeat masker out files and name them as expected # for blastz. In this case they are called notInMouse but they # really mean InMouse. Yes, it is confusing, but that's just the # nature of the game in this case. ssh eieio mkdir /panasas/store/mm6/linSpecRep.notInMouse cd /panasas/store/mm6/linSpecRep.notInMouse foreach f (../rmsk/*.fa.out) set base = $f:t:r:r echo $base.out.spec ln -s $f $base.out.spec end mkdir /cluster/data/mm6/bed/blastzSelf cd /cluster/data/mm6/bed/blastzSelf cat << '_EOF_' > DEF # mouse vs. mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm6 SEQ1_DIR=/panasas/store/mm6/nib # RMSK not currently used SEQ1_RMSK=/panasas/store/mm6/rmsk # FLAG not currently used SEQ1_FLAG=-rodent SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInMouse SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Mouse Mm6 SEQ2_DIR=/panasas/store/mm6/nib # RMSK not currently used SEQ2_RMSK=/panasas/store/mm6/rmsk # FLAG not currently used SEQ2_FLAG=-rodent SEQ2_SMSK=/panasas/store/mm6/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzSelf DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp /cluster/data/mm6/chrom.sizes ./S1.len cp /cluster/data/mm6/chrom.sizes ./S2.len # establish a screen to control this job screen # kksilo was off-limits to logins as this started, use eieio time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \ `pwd`/DEF > blast.run.out 2>&1 & # STARTED - 2005-03-31 - 13:53 # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh eieio screen -d -r # The job had difficulties due to failing nodes on the KK # recover the job by going to kk machine and # directory /cluster/data/mm6/bed/blastzSelf # to get it to complete with para recover etc... # One job seemed to take forever, it appears the section: # chrUn_random:50000000-60010000 with the same piece, 40 hours. 
# Completed: 8861 of 8861 jobs # CPU time in finished jobs: 3519718s 58661.97m 977.70h 40.74d 0.112 y # IO & Wait Time: 460422s 7673.70m 127.89h 5.33d 0.015 y # Average job time: 449s 7.49m 0.12h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 145760s 2429.33m 40.49h 1.69d # Submission to last job: 263627s 4393.78m 73.23h 3.05d # After it was complete, create the run.time file: ssh kk cd /cluster/data/mm6/bed/blastzSelf/run.blastz para time > run.time time /cluster/bin/scripts/doBlastzChainNet.pl \ -continue cat -fileServer eieio `pwd`/DEF > cat.run.out 2>&1 & # STARTED - 2005-04-04 11:40 # three jobs failed, go to kolossus and try them there: ssh kolossus cd /cluster/data/mm6/bed/blastzSelf/axtChain/run ./chain.csh chrUn_random.nib:chrUn_random: \ chain/chrUn_random.nib:chrUn_random:.chain # chrUn failed too on kolossus ./chain.csh chrY_random.nib:chrY_random: \ chain/chrY_random.nib:chrY_random:.chain # chrY worked in 28 minutes ./chain.csh chrX.nib:chrX: chain/chrX.nib:chrX:.chain # chrX worked in 44 minutes # The chrUn business, in fact any of the chr*_random's shouldn't # be chained anyway since they aren't contiguous sequence anyhow. # So, simply leave chrUn out of the chain and net stuff. # to continue: ssh kk cd /cluster/data/mm6/bed/blastzSelf/axtChain/run para time > run.time ssh eieio cd /cluster/data/mm6/bed/blastzSelf time /cluster/bin/scripts/doBlastzChainNet.pl \ -continue chainMerge -fileServer eieio `pwd`/DEF > merge.run.out 2>&1 & # 385 min = 3h 25m # that finished OK, checking the measurements: # this self alignment only appears to be present on mm3 as mouseChain time featureBits mm6 netSelf # 2336281173 bases of 2597150411 (89.956%) in intersection time featureBits mm6 chainSelf # 2579948751 bases of 2597150411 (99.338%) in intersection time featureBits mm3 mouseChain # 889252994 bases of 2505900260 (35.486%) in intersection # the chainLink measurements need kolossus: ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainSelfLink # 417927047 bases of 2597150411 (16.092%) in intersection # 244 minutes time HGDB_CONF=~/.hg.conf.read-only featureBits mm3 mouseChainLink # 383345536 bases of 2505900260 (15.298%) in intersection # Gill likes to see the blastzSelf track: ssh eieio cd /cluster/data/mm6/bed/blastzSelf # cat the pslParts together, per-chrom, and in chromStart order: ls pslParts | sed -e "s/.nib.*//" | sort -u | while read C do echo -n "working: ${C} ... " zcat `ls pslParts/${C}.nib* | sort --field-separator=':' -k1,1 -k3,3n` \ | gzip > pslChrom/${C}_blastzSelf.psl.gz echo "done" done # Load blastzSelf ssh hgwdev cd /cluster/data/mm6/bed/blastzSelf/pslChrom for I in *.psl.gz do $HOME/bin/i386/hgLoadPsl -noTNameIx mm6 ${I} echo "done: ${I}" done # STARTED - 2005-04-06 15:24 # 4h 24m load time - chrUn_random failed to load ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 blastzSelf # 8h 34m job # 471978757 bases of 2597150411 (18.173%) in intersection ############################################################################# # BLASTZ CHICKEN - (DONE - 2005-03-21 - 2005-04-08 - Hiram) # MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN # In an email 2/13/04, Arian said we could treat all human repeats as # lineage-specific for human-chicken blastz. Do the same for mouse. 
# Scripts expect *.out.spec filenames, so set that up: ssh eieio mkdir /panasas/store/mm6/linSpecRep.notInChicken cd /panasas/store/mm6/linSpecRep.notInChicken foreach f (../rmsk/*.fa.out) set base = $f:t:r:r echo $base.out.spec ln -s $f $base.out.spec end mkdir /cluster/data/mm6/bed/blastzGalGal2.2005_03_31 cd /cluster/data/mm6/bed/blastzGalGal2.2005_03_31 cat << '_EOF_' > DEF # mouse vs. chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm6 SEQ1_DIR=/panasas/store/mm6/nib # RMSK not currently used SEQ1_RMSK=/panasas/store/mm6/rmsk # FLAG not currently used SEQ1_FLAG=-rodent SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInChicken SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken galGal2 SEQ2_DIR=/iscratch/i/galGal2/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzGalGal2.2005_03_31 DEF=$BASE/DEF RAW=$BASE/raw SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy ssh eieio cd /cluster/data/mm6/bed/blastzGalGal2.2005_03_31 cp /cluster/data/mm6/chrom.sizes ./S1.len sort -rn +1 /cluster/data/galGal2/chrom.sizes > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \ -fileServer eieio `pwd`/DEF > blast.run.out 2>&1 & # STARTED 2005-04-01 11:30 # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r # CRASHED due to kksilo problems, finished batch manually, then # continuing: time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \ -continue cat -fileServer eieio `pwd`/DEF > cat.run.out 2>&1 & # measurements are looking good: featureBits mm6 netGalGal2 # 1937053597 bases of 2597150411 (74.584%) in intersection featureBits mm5 netGalGal2 # 1958796258 bases of 2615483787 (74.892%) in intersection featureBits mm6 chainGalGal2 # 1969505681 bases of 2597150411 (75.833%) in intersection featureBits mm5 chainGalGal2 # 1990102297 bases of 2615483787 (76.089%) in intersection featureBits mm6 chainGalGal2Link # 82018349 bases of 2597150411 (3.158%) in intersection featureBits mm5 chainGalGal2Link # 78951466 bases of 2615483787 (3.019%) in intersection # Since those are OK, now do the swap: ssh eieio time /cluster/bin/scripts/doBlastzChainNet.pl -swap -chainMinScore=5000 \ -fileServer eieio `pwd`/DEF > swap.run.out 2>&1 & # and measure: featureBits galGal2 netMm6 # 832583709 bases of 1054197620 (78.978%) in intersection featureBits galGal2 netMm5 # 835277984 bases of 1054197620 (79.234%) in intersection featureBits galGal2 chainMm6 # 843746491 bases of 1054197620 (80.037%) in intersection featureBits galGal2 chainMm5 # 846905330 bases of 1054197620 (80.336%) in intersection featureBits galGal2 chainMm6Link # 72687426 bases of 1054197620 (6.895%) in intersection featureBits galGal2 chainMm5Link # 70542788 bases of 1054197620 (6.692%) in intersection ############################################################################# # BLASTZ OPOSSUM (DONE - 2005-04-01 - 2005-04-08 - Hiram) ssh eieio mkdir /cluster/data/mm6/bed/blastzMonDom1.2005_04_01 cd /cluster/data/mm6/bed/blastzMonDom1.2005_04_01 cat << '_EOF_' > DEF # mouse vs. 
opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Using parameters from monDom1<->mm5 alignments, see notes there. BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Mouse Mm6 SEQ1_DIR=/panasas/store/mm6/nib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum MonDom1 SEQ2_DIR=/iscratch/i/monDom1/monDom1.2bit SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzMonDom1.2005_04_01 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy ssh eieio cd /cluster/data/mm6/bed/blastzMonDom1.2005_04_01 cp /cluster/data/mm6/chrom.sizes ./S1.len sort -rn +1 /cluster/data/monDom1/chrom.sizes > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \ -fileServer eieio `pwd`/DEF > blast.run.out 2>&1 & # STARTED 2005-04-01 11:30 # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r # CRASHED due to kksilo problems, finished batch manually and # created the run.time file # continuing time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \ -continue cat -fileServer eieio `pwd`/DEF > cat.run.out 2>&1 & # STARTED 2005-04-03 11:45 # during the load of the tables, this command failed, perhaps due # to kksilo crashes: netClass -verbose=0 -noAr noClass.net mm6 monDom1 mm6.monDom1.net # So, trying to recover: 2005-04-04 15:15 ssh hgwdev cd /cluster/data/mm6/bed/blastzMonDom1.2005_04_01/axtChain time netClass -verbose=0 -noAr noClass.net mm6 monDom1 mm6.monDom1.net # Fails with the same error: Can't start query: select genoName,genoStart,genoEnd,repName,repClass,repFamily from rmsk order by genoName,genoStart mySQL error 3: Error writing file '/tmp/MYUAGF4h' (Errcode: 28) # I wonder if it is due to tmp space: # Filesystem Size Used Avail Use% Mounted on # /dev/sdb3 267G 244G 9.1G 97% / # Yes, it was that, I removed some garbage from /scratch to make # more space: # Filesystem Size Used Avail Use% Mounted on # /dev/sdb3 267G 242G 12G 96% / # And the command finished. I don't know where it was keeping its # files as I was watching for something large to show up in /tmp # during this 65 minute command, but I never saw a large file # there but I did see available space get down to only a couple Gb # free. 
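    # (In hindsight, a quick check of free space on the database server
    #  before re-running netClass would have caught this sooner.  A small
    #  sanity check -- assuming, as the '/tmp/MY...' error above suggests,
    #  that the sort temp files land wherever the MySQL server's tmpdir
    #  variable points:)
    df -h /tmp
    hgsql -N mm6 -e 'show variables like "tmpdir";'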
Now, to finish the load of the nets: netFilter -minGap=10 mm6.monDom1.net | hgLoadNet \ -verbose=0 mm6 netMonDom1 stdin # 5 minute load time # continuing 2005-04-05 16:00 ssh eieio cd /cluster/data/mm6/bed/blastzMonDom1.2005_04_01 time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \ -continue download -fileServer eieio `pwd`/DEF > download.run.out 2>&1 & # And measurments: ssh hgwdev cd /cluster/data/mm6/bed/blastzMonDom1.2005_04_01 ../../jkStuff/netChainCheck.sh mm6 mm5 monDom1 > measurements.out 2>&1 & featureBits mm6 netMonDom1 # 2082064216 bases of 2597150411 (80.167%) in intersection featureBits mm5 netMonDom1 # 2094316044 bases of 2615483787 (80.074%) in intersection featureBits mm6 chainMonDom1 # 2109438148 bases of 2597150411 (81.221%) in intersection featureBits mm5 chainMonDom1 # 2121448151 bases of 2615483787 (81.111%) in intersection featureBits mm6 chainMonDom1Link # 249576105 bases of 2597150411 (9.610%) in intersection featureBits mm5 chainMonDom1Link # 248180346 bases of 2615483787 (9.489%) in intersection # looks OK, so to the swap: ssh eieio cd /cluster/data/mm6/bed/blastzMonDom1.2005_04_01 time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \ -swap -fileServer eieio `pwd`/DEF > swap.run.out 2>&1 & # # measurements: featureBits monDom1 netMm6 # 2884735370 bases of 3492108230 (82.607%) in intersection featureBits monDom1 netMm5 # 2889580530 bases of 3492108230 (82.746%) in intersection featureBits monDom1 chainMm6 # 2908045004 bases of 3492108230 (83.275%) in intersection featureBits monDom1 chainMm5 # 2913812625 bases of 3492108230 (83.440%) in intersection featureBits monDom1 chainMm6Link # 253105698 bases of 3492108230 (7.248%) in intersection featureBits monDom1 chainMm5Link # 249594220 bases of 3492108230 (7.147%) in intersection # looks OK, done ############################################################################## # BLASTZ FROG Xenopus tropicalis (DONE - 2005-04-05 - 2005-04-08 - Hiram) ssh eieio mkdir /cluster/data/mm6/bed/blastzXenTro1.2005_04_05 cd /cluster/data/mm6/bed/blastzXenTro1.2005_04_05 cat << '_EOF_' > DEF # mouse vs. frog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Set up blastz parameters using parameters between chicken and fish, # but not abridging repeats since can't do that with scaffolds, and # it's not very relevant at this evolutionary distance. BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: mouse mm6 SEQ1_DIR=/panasas/store/mm6/nib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Frog xenTro1 SEQ2_DIR=/iscratch/i/xenTro1/xenTro1.2bit SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzXenTro1.2005_04_05 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp /cluster/data/mm6/chrom.sizes ./S1.len sort -rn +1 /cluster/data/xenTro1/chrom.sizes > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \ -fileServer eieio `pwd`/DEF > blast.run.out 2>&1 & # STARTED 2005-04-05 10:30 - new machine serving this filesystem today # 635 min = 10h 35m # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r # had a failure on many of the jobs here. 
Clean up: ssh kk9 cd /cluster/data/mm6/bed/blastzXenTro1.2005_04_05/run.blastz para time > time.0 para problems > probs.1 para recover jobList recoverJobList ssh kk cd /cluster/data/mm6/bed/blastzXenTro1.2005_04_05/run.blastz para create recoverJobList para try para push ... check ... etc ... # with that successfully completed: para time > run.time ssh eieio cd /cluster/data/mm6/bed/blastzXenTro1.2005_04_05 time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \ -continue cat -fileServer eieio `pwd`/DEF > cat.run.out 2>&1 & # measurements ssh hgwdev cd /cluster/data/mm6/bed/blastzXenTro1.2005_04_05 time ../../jkStuff/netChainCheck.sh mm6 mm5 xenTro1 > measures.out 2>&1 & featureBits mm6 netXenTro1 # 1033071781 bases of 2597150411 (39.777%) in intersection featureBits mm5 netXenTro1 # 1042210258 bases of 2615483787 (39.848%) in intersection featureBits mm6 chainXenTro1 # 1063392793 bases of 2597150411 (40.945%) in intersection featureBits mm5 chainXenTro1 # 1078618413 bases of 2615483787 (41.240%) in intersection featureBits mm6 chainXenTro1Link # 67119684 bases of 2597150411 (2.584%) in intersection featureBits mm5 chainXenTro1Link # 73115446 bases of 2615483787 (2.795%) in intersection # Those are looking good, now to the swap: ssh eieio cd /cluster/data/mm6/bed/blastzXenTro1.2005_04_05 time /cluster/bin/scripts/doBlastzChainNet.pl -chainMinScore=5000 \ -swap -fileServer eieio `pwd`/DEF > swap.run.out 2>&1 & # 70 minutes # Measurements: featureBits xenTro1 netMm6 # 683225633 bases of 1381238994 (49.465%) in intersection featureBits xenTro1 netMm5 # 697384254 bases of 1381238994 (50.490%) in intersection featureBits xenTro1 chainMm6 # 700638086 bases of 1381238994 (50.725%) in intersection featureBits xenTro1 chainMm5 # 721494705 bases of 1381238994 (52.235%) in intersection featureBits xenTro1 chainMm6Link # 64584213 bases of 1381238994 (4.676%) in intersectio featureBits xenTro1 chainMm5Link # 76415718 bases of 1381238994 (5.532%) in intersection ############################################################################# # BLASTZ CHIMP PanTro1 (DONE - 2005-04-05 - 2005-04-15 - Hiram) ssh eieio mkdir /cluster/data/mm6/bed/blastzPanTro1.2005_04_08 cd /cluster/data/mm6/bed/blastzPanTro1.2005_04_08 # same parameters as Human alignment, except for the use of the # SMSK linSpecRepeats - in this case, using none. Should be an # interesting comparison if the lineage specific repeats make much # difference in the result. cat << '_EOF_' > DEF # mouse vs. 
human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Mouse mm6 SEQ1_DIR=/panasas/store/mm6/nib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chimp panTro1 SEQ2_DIR=/scratch/chimp/panTro1/nib SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzPanTro1.2005_04_08 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp /cluster/data/mm6/chrom.sizes ./S1.len sort -rn +1 /cluster/data/panTro1/chrom.sizes > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl \ -fileServer eieio `pwd`/DEF > blast.run.out 2>&1 & # STARTED 2005-04-06 10:40 # 489 minutes = 8h 09m # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r Completed: 155570 of 155570 jobs CPU time in finished jobs: 14707939s 245132.32m 4085.54h 170.23d 0.466 y IO & Wait Time: 609798s 10163.29m 169.39h 7.06d 0.019 y Average job time: 98s 1.64m 0.03h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 5146s 85.77m 1.43h 0.06d Submission to last job: 20972s 349.53m 5.83h 0.24d Completed: 331 of 331 jobs CPU time in finished jobs: 260s 4.33m 0.07h 0.00d 0.000 y IO & Wait Time: 1135s 18.92m 0.32h 0.01d 0.000 y Average job time: 4s 0.07m 0.00h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 16s 0.27m 0.00h 0.00d Submission to last job: 234s 3.90m 0.07h 0.00d Completed: 40 of 40 jobs CPU time in finished jobs: 7229s 120.48m 2.01h 0.08d 0.000 y IO & Wait Time: 207s 3.46m 0.06h 0.00d 0.000 y Average job time: 186s 3.10m 0.05h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 597s 9.95m 0.17h 0.01d Submission to last job: 1287s 21.45m 0.36h 0.01d ssh hgwdev cd /cluster/data/mm6/bed/blastzPanTro1.2005_04_08 featureBits mm6 netPanTro1 # 2569701404 bases of 2597150411 (98.943%) in intersection time featureBits mm6 netHg17 # 2579747741 bases of 2597150411 (99.330%) in intersection featureBits mm6 chainPanTro1 # 2585896564 bases of 2597150411 (99.567%) in intersection time featureBits mm6 chainHg17 # 2596946329 bases of 2597150411 (99.992%) in intersection featureBits mm6 chainPanTro1Link # 924893452 bases of 2597150411 (35.612%) in intersection featureBits mm6 chainHg17Link (on kolossus) # 966916309 bases of 2597150411 (37.230%) in intersection # Looks about correct, now for the swap ssh eieio cd /cluster/data/mm6/bed/blastzPanTro1.2005_04_08 time /cluster/bin/scripts/doBlastzChainNet.pl \ -swap -fileServer eieio `pwd`/DEF > swap.run.out 2>&1 & # 107 minutes featureBits panTro1 netMm6 # 3306360710 bases of 2733948177 (120.937%) in intersection featureBits panTro1 chainMm6 # 3363239156 bases of 2733948177 (123.018%) in intersection featureBits panTro1 chainMm6Link # 922583825 bases of 2733948177 (33.745%) in intersection featureBits -countGaps panTro1 netMm6 # 3306360710 bases of 4420375440 (74.798%) in intersection featureBits -countGaps panTro1 netHg16 # 4015411490 bases of 4420375440 (90.839%) in intersection featureBits -countGaps panTro1 chainMm6 # 3363239156 bases of 4420375440 (76.085%) in intersection featureBits -countGaps panTro1 chainHg16 # 4056193816 bases of 4420375440 (91.761%) in intersection # on kolossus: HGDB_CONF=~/.hg.conf.read-only featureBits -countGaps panTro1 chainHg16Link # 2611490291 bases of 4420375440 
(59.078%) in intersection HGDB_CONF=~/.hg.conf.read-only featureBits -countGaps panTro1 chainMm6Link # 922583825 bases of 4420375440 (20.871%) in intersection # Appears to be reasonable, check the genome-test browser on both # the Mm6 assembly and the PanTro1 assembly to see if the net and # chain tracks appear and are in the proper order. ############################################################################# # STS MARKERS DATA DOWNLOAD (DONE - 2005-04-06 - 2005-04-18 - Hiram) # Applied a filter to primers.psl - 2005-10-20 - Hiram # Removed 404 rows from all_sts_primers where qName had bad format - 2005-11-02 - Jen ssh eieio mkdir -p /cluster/data/mm6/bed/STSmarkers/downloads cd /cluster/data/mm6/bed/STSmarkers/downloads # these files appear to be new almost every day # AND, they were incorrect when I fetched them on April 6th, # they were corrected on April 8th # HOWEVER, they still appear to be incorrect. There are IDs in # the UniSTS_mouse.sts file that do not appear in the aliases file # 2005-04-14 Further information on this says that some IDs do not # have aliases, thus need no entries in the alises file. wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases # these map files used to be static for some years, now they appear # to be new wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/* # These files used to be unchanging. This time they seem to be # new files: # 396858 Jan 28 19:15 10090.MGI.txt # 173344 Mar 16 19:45 10090.WI-Genetic.txt # 240688 Mar 16 19:45 10090.WI-YAC.txt # 390139 Mar 16 20:16 10090.WI_MRC_RH.txt # Will have to watch below to see how these figure into the construction. # these reports from jax.org appear to be changing daily wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt # back to our work area, update the bed file # to do this we need a new UniSTS_mouse.alias file # it is created by a combination of information from several # of the above files ! AND ! the previous stsInfoMouse.bed file cp -p /cluster/data/mm5/bed/STSmarkers/downloads/*.sh . cp -p /cluster/data/mm5/bed/STSmarkers/downloads/*.pl . # There is a line in the fetchAllAliases.sh script that needs to # be updated, it must point to the previous bed file: # BEDFile=/cluster/data/mm5/bed/STSmarkers/stsInfoMouse.bed # Next time, this should read: # BEDFile=/cluster/data/mm6/bed/STSmarkers/stsInfoMouse.bed # The perl scripts were reworked, updated, cleaned up, and fixed # to handle a new type of format found in the UniSTS.aliases file. # *!*! ACTUALLY there was an error in the UniSTS.aliases file # format, it was not correct. Upon submitting a query to Deanna # Church, I got the following answer: # Dear Hiram, # Thanks for reporting formatting problems in UniSTS.aliases file. # It was fixed in the program and correct file was put to ftp site. 
# -Wonhee Jang # --------------------------------------------------- # Wonhee Jang, Ph.D # National Center for Biotechnology Information/NIH # Building 45, Room 5AS43D-49, Bethesda, MD 20894 # jang@ncbi.nlm.nih.gov phone)301-402-9307 # fax) 301-480-2484 # --------------------------------------------------- # This process has been captured in the script: # /cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh # which uses a couple of perl scripts in that same directory. # briefly it is: # ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0 # grep MGI: UniSTS.aliases > MGI.aliases # ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \ # stsInfoAliases.txt # ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases # cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \ # | sort -n > UniSTS_mouse.alias time ./fetchAllAliases.sh # with that, we can create a new stsInfoMouse.bed file: cd /cluster/data/mm6/bed/STSmarkers /cluster/store5/mouseMarker/code/updateBed.pl \ /cluster/data/mm5/bed/STSmarkers/stsInfoMouse.bed \ downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \ downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \ downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile # Yontao updated /cluster/store5/mouseMarker/code/cleanInfo.pl 8/10/04 /cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed # copy the stsInfoMouse.bed file from working dir to the marker info storage fold. # added 2 new steps by Yontao mv /cluster/store5/mouseMarker/stsInfoMouse.bed \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5 cp -p stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed # comparing to Mm5 # /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5 # /cluster/store5/mouseMarker/stsInfoMouse.bed 58493 778055 6524821 stsInfoMouse.bed_mm5 58980 784786 6690105 stsInfoMouse.bed # and from that, create new primer fa, epcr, etc: /cluster/store5/mouseMarker/code/luConvertPrimerToFa \ stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info # the mouseC.fa file will be empty wc mouse?.* # 0 0 0 mouseC.fa # 293305 293251 6624638 mouseP.fa # 32890 164528 2087271 mouseP.info # 326195 457779 8711909 total # the equivalent Mm5 files: # 0 0 0 mouseC.fa # 286740 286686 6474893 mouseP.fa # 32232 161234 2044810 mouseP.info # 318972 447920 8519703 total # copy the primers over to the panasas for the kluster run mkdir /panasas/store/mm6/STSmarkers cp -p mouseP.fa /panasas/store/mm6/STSmarkers cp -p mouseP.info /panasas/store/mm6/STSmarkers # CLUSTER RUN FOR THE STS PRIMERS ssh kk9 mkdir -p /cluster/data/mm6/bed/STSmarkers/primer mkdir -p /cluster/data/mm6/bed/STSmarkers/ePCR cd /cluster/data/mm6/bed/STSmarkers/primer # the mouseP.fa comes from above # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. 
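    # (The primer alignment below is the usual gensub2/para cluster-run
    #  pattern: each chromosome fasta in contig.lst is substituted for
    #  $(path1) and $(root1) in the template, yielding one blat.2 job per
    #  chromosome.  For illustration only, the generated jobList line for
    #  chr1 would look roughly like:
    #  /cluster/bin/i386/blat.2 /panasas/store/mm6/fasta/chr1.fa \
    #    /panasas/store/mm6/STSmarkers/mouseP.fa -ooc=/scratch/hg/h/mouse11.ooc \
    #    -minMatch=1 -minScore=0 -minIdentity=80 -oneOff \
    #    {check out line+ primers.out/chr1.psl} )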
cat << '_EOF_' > template #LOOP /cluster/bin/i386/blat.2 $(path1) /panasas/store/mm6/STSmarkers/mouseP.fa -ooc=/scratch/hg/h/mouse11.ooc -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl} #ENDLOOP '_EOF_' mkdir primers.out ls -1S /panasas/store/mm6/fasta/chr*.fa > contig.lst gensub2 contig.lst single template jobList para create jobList para try para check para push # STARTED - 2005-04-14 15:40 # Completed: 40 of 40 jobs # CPU time in finished jobs: 445070s 7417.83m 123.63h 5.15d 0.014 y # IO & Wait Time: 463s 7.72m 0.13h 0.01d 0.000 y # Average job time: 11138s 185.64m 3.09h 0.13d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 53053s 884.22m 14.74h 0.61d # Submission to last job: 53053s 884.22m 14.74h 0.61d # on the file server ssh kksilo cd /cluster/data/mm6/bed/STSmarkers/primer /cluster/bin/i386/pslSort dirs primers.psl temp primers.out # filter alignments for (qEnd-qStart) vs. (tEnd-tStart) 2005-10-20 # should not be more than 100 bases different. # This filters out about 973,365 alignments, or # %17.0 = 100.0 * 973365 / 5724127 pslSort dirs stdout temp primers.out | awk -F"\t" ' { if (((($13 - $12) - ($17 - $16)) > -100) && ((($13 - $12) - ($17 - $16)) < 100)) {print} } ' > primers.psl.100 rmdir temp # a rough comparison with previous results: wc primers.psl.100 (after applying filter to primers.psl) # 4750762 99765920 495766873 primers.psl.100 wc primers.psl (before applying filter to primers.psl) # 5724127 120206606 615248041 primers.psl wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl # 5719969 120119288 590806241 wc /cluster/data/mm4/bed/STSmarkers/primer/primers.psl # 5745617 120657896 592135728 # another kluster run for the ePCR ssh kk9 cd /cluster/data/mm6/bed/STSmarkers/ePCR ls -1S /panasas/store/mm6/fasta/chr*.fa > contig.lst mkdir epcr.out cat << '_EOF_' > runPCR.csh #!/bin/csh -fe /cluster/bin/i386/e-PCR $1 $2 N=1 M=50 W=5 > $3 '_EOF_' # emacs happy ? chmod +x runPCR.csh cat << '_EOF_' > template #LOOP ./runPCR.csh /panasas/store/mm6/STSmarkers/mouseP.info $(path1) {check out line+ epcr.out/$(num1).epcr} #ENDLOOP '_EOF_' # the mouseP.info was created above gensub2 contig.lst single template jobList para create jobList para try para check para push ... etc ... # Completed: 40 of 40 jobs # CPU time in finished jobs: 77676s 1294.60m 21.58h 0.90d 0.002 y # IO & Wait Time: 370s 6.17m 0.10h 0.00d 0.000 y # Average job time: 1951s 32.52m 0.54h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 6173s 102.88m 1.71h 0.07d # Submission to last job: 6173s 102.88m 1.71h 0.07d ssh hgwdev cd /cluster/data/mm6/bed/STSmarkers/ePCR # all those results become all.epcr cat epcr.out/*.epcr > all.epcr # comparing to previous results: wc all.epcr # 55871 223484 3086148 all.epcr wc /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr # 55677 222708 2945623 /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr wc /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr # 74705 298820 3971712 /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr # Mm4 seems to be out of whack cd /cluster/data/mm6/bed/STSmarkers/primer /cluster/bin/scripts/filterSTSPrimers \ -mouse ../stsInfoMouse.bed primers.psl.100 \ ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat # The output should show an increasing count: # Reading name info # Reading primer info # Processing file # 100000 # 200000 # 300000 # ... 
# 5700000 # Determining ePCR not found # wc primers.psl.filter.blat (after applying filter to primers.psl above) # 33128 695688 3542978 primers.psl.filter.blat wc primers.psl.filter.blat (before applying filter to primers.psl above) # 33662 706902 3605847 primers.psl.filter.blat wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.blat # 33476 702996 3442402 wc /cluster/data/mm4/bed/STSmarkers/primer/primers.psl.filter.blat # 32729 687309 3331894 # It appears Mm4 became sane after the filter # create accession_info.rdb touch empty_sequence.inf /cluster/bin/scripts/compileAccInfo -mouse \ /cluster/data/mm6 empty_sequence.inf # works with errors on missing randoms, etc...: # cat: /cluster/data/mm5/11/chr11_random.agp: No such file or directory # cat: /cluster/data/mm5/M/chrM_random.agp: No such file or directory mv accession_info.rdb accession_info.rdb.tmp /cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > \ accession_info.rdb rm accession_info.rdb.tmp # comparing results to previous wc accession_info.rdb # 93052 1023576 6824900 accession_info.rdb wc /cluster/data/mm5/bed/STSmarkers/primer/accession_info.rdb # 131845 1450299 9681940 wc /cluster/data/mm4/bed/STSmarkers/primer/accession_info.rdb # 86935 956289 6374930 # creates epcr.not.found.nomatch and epcr.not.found.psl # /cluster/bin/scripts/epcrToPsl # Fixed this script to make it not look for contigs in the usual # manner, we don't have those for this assembly ./epcrToPsl -mouse \ epcr.not.found ../mouseP.info \ accession_info.rdb /cluster/data/mm6 # Comparing results to previous: wc epcr* # 467 1868 17135 epcr.not.found # 63 756 6041 epcr.not.found.nomatch # 404 8484 40254 epcr.not.found.psl # 158 535 4308 epcrToPsl # 1092 11643 67738 total # Mm5 wc epcr* wc /cluster/data/mm5/bed/STSmarkers/primer/epcr* # 463 1852 17080 epcr.not.found # 61 732 5845 epcr.not.found.nomatch # 398 8358 38591 epcr.not.found.psl # 402 8442 39011 epcr.not.found.psl.orig # 1324 19384 100527 total # Mm4 wc epcr* wc /cluster/data/mm4/bed/STSmarkers/primer/epcr* # 328 1312 12011 epcr.not.found # 57 684 5474 epcr.not.found.nomatch # 266 5586 25711 epcr.not.found.psl # 163 552 4370 epcrToPsl # 814 8134 47566 total cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter wc primers.psl.filter (after filter applied above to primers.psl) # 33532 704172 3583232 primers.psl.filter wc primers.psl.filter (before filter applied above to primers.psl) # 34066 715386 3646101 primers.psl.filter wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted # 33691 707511 3601164 primers.psl.filter.lifted # create primers.psl.filter.lifted.initial PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/extractPslInfo \ primers.psl.filter wc primers.psl.filter.initial (after filter applied above to primers.psl) # 33514 201084 1786769 primers.psl.filter.initial wc primers.psl.filter.initial (before filter applied above to primers.psl) # 34048 204288 1815222 primers.psl.filter.initial wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial # 33689 202134 1799016 primers.psl.filter.lifted.initial # create primers.psl.filter.lifted.initial.acc /cluster/bin/scripts/findAccession -agp \ -mouse primers.psl.filter.initial /cluster/data/mm6 # it complains about missing _random items, it is OK wc primers.psl.filter.initial.acc (after filter applied above to primers.psl) # 33514 234598 2120939 primers.psl.filter.initial.acc wc primers.psl.filter.initial.acc (before filter applied above to primers.psl) # 34048 238336 2154798 
primers.psl.filter.initial.acc wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial.acc # 33689 235823 2158029 primers.psl.filter.lifted.initial.acc # this needs to be -rat as that specifies how to scan the # stsInfoMouse.bed file and it does not work if you use -mouse /cluster/bin/scripts/getStsId -rat \ ../stsInfoMouse.bed primers.psl.filter.initial.acc \ > primers.initial.acc.trans # No id for 61645_RH126840 # No id for 4187_D10MIT171.2 # No id for 63449_RH125771 # No id for 67188_PMC99911P4 # No id for 8839_D6MIT360.1 # No id for 62732_RH126829 # No id for 63746_RH127126 wc primers.initial.acc.trans (after filter applied to primers.psl above) # 33507 234549 1800766 primers.initial.acc.trans wc primers.initial.acc.trans (before filter applied to primers.psl above) # 34041 238287 1829724 primers.initial.acc.trans sort -k 4n primers.initial.acc.trans > primers.final wc primers.final (after filter applied to primers.psl above) # 33507 234549 1800766 primers.final wc primers.final (before filter applied to primers.psl above) # 34041 238287 1829724 primers.final wc /cluster/data/mm5/bed/STSmarkers/primer/primers.final # 33689 235823 1834889 /cluster/data/mm5/bed/STSmarkers/primer/primers.final rm primers.psl.filter.lifted.initial.acc primers.initial.acc.trans cd /cluster/data/mm6/bed/STSmarkers # stsMarkers.final is empty for mouse touch stsMarkers.final dummy PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/combineSeqPrimerPos \ stsMarkers.final primer/primers.final > stsMarkers_pos.rdb wc stsMarkers_pos.rdb (after filter applied to primers.psl above) # 31889 223223 1881886 stsMarkers_pos.rdb wc stsMarkers_pos.rdb (before filter applied to primers.psl above) # 32350 226450 1909506 stsMarkers_pos.rdb wc /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb # 32085 224595 1862816 /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb wc /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb # 31270 218890 1869417 /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb /projects/cc/hg/ytlu/bin/script/perl/createStsBed \ stsInfoMouse.bed stsMarkers_pos.rdb 500 > stsMapMouse.bed wc stsMapMouse.bed (after filter applied to primers.psl above) # 28713 298319 2072647 stsMapMouse.bed wc stsMapMouse.bed (before filter applied to primers.psl above) # 29079 301678 2097544 stsMapMouse.bed wc /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed # 29069 301535 2123622 /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed # loading STS markers tables ssh hgwdev cd /cluster/data/mm6/bed/STSmarkers cp -p /cluster/store6/mm5/bed/STSmarkers/ucscAlias.pl . 
    ./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
    # this does leave messages in ucscStsAlias.warnings but they seem
    # to be the same as they were in Mm5
    wc ucscStsAlias.tab
    # 141585 424725 3284106 ucscStsAlias.tab
    wc /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab
    # 126624 379859 3037850 /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab
    hgsql -e "drop table stsAlias;" mm6
    hgsql mm6 < ~/kent/src/hg/lib/stsAlias.sql
    hgsql -e \
	'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm6
    hgsql -e "drop table stsMapMouseNew;" mm6
    hgsql mm6 < ~/kent/src/hg/lib/stsMapMouseNew.sql
    hgsql -e \
	'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm6
    hgsql -e "drop table stsInfoMouseNew;" mm6
    hgsql mm6 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
    hgsql -e \
	'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm6
    hgLoadPsl -nobin -table=all_sts_primer mm6 primer/primers.psl.filter
    # load of all_sts_primer did not go as planned: 33532 record(s),
    # 0 row(s) skipped, 14 warning(s) loading primer/primers.psl.filter
    # load of all_sts_primer did not go as planned: 34066 record(s),
    # 0 row(s) skipped, 14 warning(s) loading primer/primers.psl.filter
    # load primer sequences
    mkdir /gbdb/mm6/stsMarker
    ln -s /cluster/data/mm6/bed/STSmarkers/mouseP.fa \
	/gbdb/mm6/stsMarker/mouseP.fa
    # PLEASE NOTE: If you are going to reload this business, use the
    # -replace option on this hgLoadSeq
    # hgLoadSeq -replace mm6 /gbdb/mm6/stsMarker/mouseP.fa
    # otherwise there will be a problem that the seq and extFile tables
    # will be out of sync.
    hgLoadSeq mm6 /gbdb/mm6/stsMarker/mouseP.fa
    # Adding /gbdb/mm6/stsMarker/mouseP.fa
    # 32890 sequences
    # After applying filter to primers.psl above
    featureBits mm6 all_sts_primer
    # 3706406 bases of 2597150411 (0.143%) in intersection
    featureBits mm6 stsMapMouseNew
    # 4638338 bases of 2597150411 (0.179%) in intersection
    featureBits mm7 stsMapMouseNew
    # 4805958 bases of 2583394090 (0.186%) in intersection
    # Before applying filter to primers.psl above
    featureBits mm6 all_sts_primer
    # 3735649 bases of 2597150411 (0.144%) in intersection
    featureBits mm5 all_sts_primer
    # 3727268 bases of 2615483787 (0.143%) in intersection
    featureBits mm6 stsMapMouseNew
    # 4736039 bases of 2597150411 (0.182%) in intersection
    featureBits mm5 stsMapMouseNew
    # 4719679 bases of 2615483787 (0.180%) in intersection
    hgsql -N mm6 -e "select count(*) from stsAlias;"
    # 137738
    hgsql -N mm5 -e "select count(*) from stsAlias;"
    # 122944
    hgsql -N mm6 -e "select count(*) from stsInfoMouseNew;"
    # 58980
    hgsql -N mm5 -e "select count(*) from stsInfoMouseNew;"
    # 58493
    # compare old and new name lists:
    awk '{print $4}' /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed | \
	sort -u > mm5.nameList
    awk '{print $4}' stsMapMouse.bed | sort -u > mm6.nameList
    # After applying filter to primers.psl above
    comm -12 mm?.nameList | wc
    #	27109   27109  264019	<- 27,109 names in common
    comm -23 mm5.nameList mm6.nameList | wc
    #	527     527    4617	<- 527 unique to mm5 list
    comm -13 mm5.nameList mm6.nameList | wc
    #	399     399    3646	<- 399 unique to mm6 list
    # Before applying filter to primers.psl above
    comm -12 mm?.nameList | wc
    #	27454   27454  266951	<- 27,454 names in common
    comm -23 mm5.nameList mm6.nameList | wc
    #	182     182    1685	<- 182 unique to mm5 list
    comm -13 mm5.nameList mm6.nameList | wc
    #	1625    1625   15090	<- 1,625 unique to mm6 list

#############################################################################
# HGCENTRAL DEFAULTDB UPDATE (DONE - 2005-04-08 - Hiram)
# May as well make
this assembly be the default on hgwdev ssh hgwdev hgsql hgcentraltest \ -e 'update defaultDb set name="mm6" where genome="Mouse";' ############################################################################# # 10-WAY VAR_MULTIZ - SECOND TIME, PERHAPS CORRECTLY (WORKING 2005-11-30) # - Hiram ssh kkstore01 mkdir /cluster/data/mm6/bed/multiz10way.2005-11-30 cd /cluster/data/mm6/bed/multiz10way.2005-11-30 mkdir mafLinks mkdir mafLinks/rn3 mkdir mafLinks/hg17 mkdir mafLinks/canFam2 mkdir mafLinks/bosTau1 mkdir mafLinks/monDom1 mkdir mafLinks/galGal2 mkdir mafLinks/xenTro1 mkdir mafLinks/danRer3 mkdir mafLinks/tetNig1 export H=/cluster/data/mm6/bed ln -s ${H}/blastzRn3.2005-11-30/mafNet/*.maf.gz ./mafLinks/rn3 ln -s ${H}/blastzHg17.2005-11-30/mafNet/*.maf.gz ./mafLinks/hg17 ln -s ${H}/blastzCanFam2.2005-12-02/mafNet/*.maf.gz ./mafLinks/canFam2 ln -s ${H}/blastzBosTau1.2005_03_18/mafNet/*.maf.gz ./mafLinks/bosTau1 ln -s ${H}/blastzMonDom1.2005_04_01/mafNet/*.maf.gz ./mafLinks/monDom1 ln -s ${H}/blastzGalGal2.2005-11-30/mafNet/*.maf.gz ./mafLinks/galGal2 ln -s ${H}/blastzXenTro1.2005_04_05/mafNet/*.maf.gz ./mafLinks/xenTro1 ln -s ${H}/blastzDanRer3.2005-08-05/mafNet/*.maf.gz ./mafLinks/danRer3 ln -s ${H}/blastzTetNig1.2005_03_17/mafNet/*.maf.gz ./mafLinks/tetNig1 # Copy MAFs to san for pk kluster run mkdir /san/sanvol1/scratch/mm6/multiz10way.2005-11-30 cd /san/sanvol1/scratch/mm6/multiz10way.2005-11-30 mkdir mafs rsync -a --copy-links --progress \ /cluster/data/mm6/bed/multiz10way.2005-11-30/mafLinks/ ./mafs/ # 3.3 Gb of data to copy, about 10 minutes or so mkdir penn cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/maf_project penn # And for the kluster run ssh pk mkdir /cluster/data/mm6/bed/multiz10way.2005-11-30 cd /cluster/data/mm6/bed/multiz10way.2005-11-30 mkdir -p maf run cd run # create scripts to run var_multiz on cluster cat > oneMultiz.csh << 'EOF' #!/bin/csh -fe set c = $1 set multi = /scratch/mm6/multiz10way.$c set pairs = /san/sanvol1/scratch/mm6/multiz10way.2005-11-30/mafs # special mode -- # with 1 arg, cleanup if ($#argv == 1) then rm -fr $multi exit endif # special mode -- # with 3 args, saves an alignment file if ($#argv == 3) then cp $multi/$2/$c.maf $3 exit endif set s1 = $2 set s2 = $3 set flag = $4 # locate input files -- in pairwise dir, or multiple dir set d1 = $multi set d2 = $multi if (-d $pairs/$s1) then set d1 = $pairs set f1 = $d1/$s1/$c.maf.gz set t1 = /tmp/$s1.$c.maf zcat $f1 > $t1 else set f1 = $d1/$s1/$c.maf set t1 = /tmp/$s1.$c.maf cp -p $f1 $t1 endif if (-d $pairs/$s2) then set d2 = $pairs set f2 = $d2/$s2/$c.maf.gz set t2 = /tmp/$s2.$c.maf zcat $f2 > $t2 else set f2 = $d2/$s2/$c.maf set t2 = /tmp/$s2.$c.maf cp -p $f2 $t2 endif # write to output dir set out = $multi/${s1}${s2} mkdir -p $out # check for empty input file if (-s $t1 && -s $t2) then echo "Aligning $f1 $f2 $flag" /san/sanvol1/scratch/mm6/multiz10way.2005-11-30/penn/multiz \ $t1 $t2 $flag $out/$c.unused1.maf \ $out/$c.unused2.maf > $out/$c.full.maf cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \ $out/$c.tmp.maf echo "Ordering $c.maf" /san/sanvol1/scratch/mm6/multiz10way.2005-11-30/penn/maf_project \ $out/$c.tmp.maf mm6.$c > $out/$c.maf rm -f $t1 $t2 else if (-s $t1) then cp -p $t1 $out/$c.maf rm -f $t1 else if (-s $t2) then cp -p $t2 $out/$c.maf rm -f $t2 endif 'EOF' # happy emacs chmod +x oneMultiz.csh cp -p oneMultiz.csh \ /san/sanvol1/scratch/mm6/multiz10way.2005-11-30/penn/oneMultiz.csh cat > allMultiz.csh << 'EOF' 
#!/bin/csh -fe # multiple alignment steps: set c = $1 set s = "/san/sanvol1/scratch/mm6/multiz10way.2005-11-30/penn/oneMultiz.csh" $s $c hg17 rn3 1 $s $c bosTau1 canFam2 0 $s $c hg17rn3 bosTau1canFam2 1 $s $c hg17rn3bosTau1canFam2 monDom1 1 $s $c hg17rn3bosTau1canFam2monDom1 galGal2 1 $s $c hg17rn3bosTau1canFam2monDom1galGal2 xenTro1 1 $s $c danRer3 tetNig1 0 $s $c hg17rn3bosTau1canFam2monDom1galGal2xenTro1 danRer3tetNig1 1 # get final alignment file $s $c hg17rn3bosTau1canFam2monDom1galGal2xenTro1danRer3tetNig1 \ /cluster/data/mm6/bed/multiz10way.2005-11-30/maf/$c.maf #cleanup $s $c 'EOF' # happy emacs chmod +x allMultiz.csh cat << 'EOF' > template #LOOP ./allMultiz.csh $(root1) {check out line+ /cluster/data/mm6/bed/multiz10way.2005-11-30/maf/$(root1).maf} #ENDLOOP 'EOF' awk '{print $1}' ../../../chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList para try; para check para push XXXX - running 2005-12-05 16:30 # Completed: 40 of 40 jobs # CPU time in finished jobs: 124610s 2076.83m 34.61h 1.44d 0.004 y # IO & Wait Time: 1331s 22.18m 0.37h 0.02d 0.000 y # Average job time: 3149s 52.48m 0.87h 0.04d # Longest finished job: 12711s 211.85m 3.53h 0.15d # Submission to last job: 12711s 211.85m 3.53h 0.15d ssh kkstore01 cd /cluster/data/mm6/bed/multiz10way.2005-11-30 catDir maf > multiz10wayU1.maf # ~ 3.5 minutes ssh hgwdev cd /cluster/data/mm6/bed/multiz10way.2005-11-30 mkdir /gbdb/mm6/multiz10wayU1 ln -s /cluster/data/mm6/bed/multiz10way.2005-11-30/multiz10wayU1.maf \ /gbdb/mm6/multiz10wayU1/multiz10wayU1.maf time hgLoadMaf mm6 multiz10wayU1 # real 18m22.810s time hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 mm6 \ multiz10wayU1Summary multiz10wayU1.maf # real 20m45.326s ############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS Second time with new multiz10way # (WORKING - 2005-12-06 - Hiram) # Estimate phastCons parameters ssh kkstore01 mkdir /cluster/data/mm6/bed/multiz10way.2005-11-30/cons cd /cluster/data/mm6/bed/multiz10way.2005-11-30/cons # Create a starting-tree.mod based on chr2 (the biggest maf) /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr2.maf \ --refseq ../../../2/chr2.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # ~4 minutes /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "((((((hg17,(mm6,rn3)),(canFam2,bosTau1)),monDom1),galGal2),xenTro1),(tetNig1,danRer3))" \ --out-root starting-tree # about 45 minutes rm s1.*.ss # add up the C and G: grep BACKGROUND phyloFit.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.408 # This 0.408 is used in the --gc argument below # Create big bad bloated SS files on san filesystem (takes ~ 1h 05m) ssh kkstore01 mkdir -p /san/sanvol1/scratch/mm6/cons/ss cd /san/sanvol1/scratch/mm6/cons/ss for C in `awk '{print $1}' /cluster/data/mm6/chrom.sizes` do if [ -s /cluster/data/mm6/bed/multiz10way.2005-11-30/maf/${C}.maf ]; then mkdir ${C} echo msa_split $C chrN=${C/chr/} chrN=${chrN/_random/} /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/mm6/bed/multiz10way.2005-11-30/maf/${C}.maf \ --refseq /cluster/data/mm6/${chrN}/${C}.fa \ --in-format MAF --windows 1000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done # real 63m41.485s # Create a random list of 50 1 mb regions (do not use the _randoms) ls -1l chr*/chr*.ss | grep -v random | \ awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list # Set up parasol directory to calculate trees on 
these 50 regions ssh pk mkdir /san/sanvol1/scratch/mm6/cons/treeRun1 cd /san/sanvol1/scratch/mm6/cons/treeRun1 mkdir tree log # Tuning this loop should come back to here to recalculate # Create little script that calls phastCons with right arguments # --target-coverage of 0.20 is about right for mouse, will be # tuned exactly below cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/mm6/bed/multiz10way.2005-11-30/cons/phyloFit.mod \ --gc 0.408 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-lengths 12 --target-coverage 0.17 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # emacs happy chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) #ENDLOOP '_EOF_' # happy emacs # Make cluster job and run it gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 50 of 50 jobs # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kkstore01 cd /san/sanvol1/scratch/mm6/cons/treeRun1 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ../ave.cons.mod > cons_summary.txt ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ../ave.noncons.mod > noncons_summary.txt cd .. cp -p ave.*.mod /cluster/data/mm6/bed/multiz10way.2005-11-30/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 \ ave.cons.mod ave.noncons.mod ### !!! *** This one with .17 and 12 is the one that was finally used #Transition parameters:gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068 #Relative entropy: H=1.461641 bits/site #Expected min. length: L_min=6.838719 sites #Expected max. length: L_max=5.059638 sites #Phylogenetic information threshold: PIT=L_min*H=9.995752 bits # We are aiming for PIT to be near 10 (aka total entropy) # This is good enough. Tuning wasn't necessary this time because # the tuning that was done the first time evidently was still good # for this one. 
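# (tuning sketch, not from the original run) The consEntropy checks above were
# done one pair of parameters at a time.  If this ever needs re-tuning, a small
# loop over candidate target-coverage / expected-length pairs can pull out just
# the PIT line from each run.  The parameter pairs below are examples only,
# not the values used for this build.
cd /san/sanvol1/scratch/mm6/cons
for params in "0.15 12" "0.17 12" "0.20 12" "0.17 15"
do
set -- $params
echo "== target-coverage $1 expected-length $2"
/cluster/bin/phast/$MACHTYPE/consEntropy $1 $2 \
ave.cons.mod ave.noncons.mod | grep -i "PIT"
done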
ssh pk # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/mm6/cons/consRun1 cd /san/sanvol1/scratch/mm6/cons/consRun1 mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast.csh << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 12 --target-coverage 0.17 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.*cons.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # emacs happy chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << '_EOF_' #LOOP doPhast.csh $(root1) $(file1) #ENDLOOP '_EOF_' # happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. # Completed: 3098 of 3098 jobs # CPU time in finished jobs: 11377s 189.61m 3.16h 0.13d 0.000 y # IO & Wait Time: 88563s 1476.06m 24.60h 1.03d 0.003 y # Average job time: 32s 0.54m 0.01h 0.00d # Longest finished job: 292s 4.87m 0.08h 0.00d # Submission to last job: 627s 10.45m 0.17h 0.01d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/mm6/cons/consRun1 # The sed's and the sort get the file names in chrom,start order find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1m 12s # Figure out how much is actually covered by the bed files as so: # The 2597150151 comes from the non-n genome size, # from faSize on all chroms: ssh kkstore01 cd /cluster/data/mm6 faSize ?/chr*.fa ??/chr*.fa # 3079633452 bases (482483301 N's 2597150151 real 1486015217 # upper 1111134934 cd /san/sanvol1/scratch/mm6/cons/consRun1 awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/2597150151\n",100.0*sum/2597150151,sum}' \ mostConserved.bed -target-coverage 0.17: % 5.29 = 100.0*137323490/2597150151 length 12 cp -p mostConserved.bed /cluster/data/mm6/bed/multiz10way.2005-11-30 # Load most conserved track into database ssh hgwdev cd /cluster/data/mm6/bed/multiz10way.2005-11-30 hgLoadBed -strict mm6 phastConsElementsU1 mostConserved.bed # Loaded 2356669 elements of size 5 # ~5 minute load time featureBits mm6 -enrichment refGene:cds phastConsElementsU1 # -target-coverage 0.17 and expected lengths 12: # refGene:cds 1.013%, phastConsElementsU1 5.287%, both 0.694%, cover 68.54%, # enrich 12.96x # Create merged posterier probability file and wiggle track data files # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode ssh kkstore01 cd /san/sanvol1/scratch/mm6/cons/consRun1 find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode stdin phastCons10U1.wig 
phastCons10U1.wib # about 22 minutes for above # -rw-rw-r-- 1 1975849149 Dec 6 14:50 phastCons10U1.wib # -rw-rw-r-- 1 253234710 Dec 6 14:50 phastCons10U1.wig cp -p phastCons10U1.wig phastCons10U1.wib \ /cluster/data/mm6/bed/multiz10way.2005-11-30 # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/mm6/bed/multiz10way.2005-11-30 ln -s `pwd`/phastCons10U1.wib /gbdb/mm6/wib/phastCons10U1.wib hgLoadWiggle mm6 phastCons10U1 phastCons10U1.wig # ~ 3 minute load # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/mm6/bed/multiz10way.2005-11-30 time hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm6 phastCons10U1 > histogram.data 2>&1 # about 23 minutes to scan all data # prepare compressed copy of ascii data values for downloads ssh pk cd /san/sanvol1/scratch/mm6/cons/consRun1 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons10Scores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons10Scores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # happy emacs chmod +x gzipAscii.sh time ./gzipAscii.sh # takes about 37 minutes, makes 2.9 Gb of data # copy them for downloads ssh kkstore01 mkdir /cluster/data/mm6/bed/multiz10way.2005-11-30/phastCons10Scores cd /cluster/data/mm6/bed/multiz10way.2005-11-30/phastCons10Scores rsync -a --progress \ pk:/san/sanvol1/scratch/mm6/cons/consRun1/phastCons10Scores/ . # ~5 minute copy ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/mm6/phastCons10Scores cd /usr/local/apache/htdocs/goldenPath/mm6/phastCons10Scores ln -s /cluster/data/mm6/bed/multiz10way.2005-11-30/phastCons10Scores/*.gz . # prepare maf downloads ssh kkstore01 cd /cluster/data/mm6/bed/multiz10way.2005-11-30 mkdir mafDownloads for M in maf/chr*.maf do B=`basename $M` cp -p ${M} mafDownloads/${B} gzip mafDownloads/${B} echo ${B} done done # Creates 2.7 gb of data ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/mm6/multiz10way cd /usr/local/apache/htdocs/goldenPath/mm6/multiz10way ln -s /cluster/data/mm6/bed/multiz10way.2005-11-30/mafDownloads/*.maf.gz . 
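# (download housekeeping sketch) goldenPath download directories usually also
# carry a checksum manifest so users can verify their transfers.  This step was
# not recorded above; the md5sum.txt name is just the usual convention, an
# assumption rather than something from this build.
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/mm6/multiz10way
md5sum *.maf.gz > md5sum.txt
cd /usr/local/apache/htdocs/goldenPath/mm6/phastCons10Scores
md5sum *.gz > md5sum.txt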
############################################################################# # 10-WAY VAR_MULTIZ - ALIGNMENTS (DONE - 2005-04-08 - 2005-04-18 - Hiram) ssh eieio mkdir /cluster/data/mm6/bed/multiz10way cd /cluster/data/mm6/bed/multiz10way mkdir mafLinks mkdir mafLinks/rn3 mkdir mafLinks/hg17 mkdir mafLinks/canFam1 mkdir mafLinks/bosTau1 mkdir mafLinks/monDom1 mkdir mafLinks/galGal2 mkdir mafLinks/xenTro1 mkdir mafLinks/danRer2 mkdir mafLinks/tetNig1 export H=/cluster/data/mm6/bed ln -s ${H}/blastzRn3.2005_03_22/mafNet/*.maf.gz ./mafLinks/rn3 ln -s ${H}/blastzHg17.2005_03_14/mafNet/*.maf.gz ./mafLinks/hg17 ln -s ${H}/blastzCanFam1.2005_03_18/mafNet/*.maf.gz ./mafLinks/canFam1 ln -s ${H}/blastzBosTau1.2005_03_18/mafNet/*.maf.gz ./mafLinks/bosTau1 ln -s ${H}/blastzMonDom1.2005_04_01/mafNet/*.maf.gz ./mafLinks/monDom1 ln -s ${H}/blastzGalGal2.2005_03_31/mafNet/*.maf.gz ./mafLinks/galGal2 ln -s ${H}/blastzXenTro1.2005_04_05/mafNet/*.maf.gz ./mafLinks/xenTro1 ln -s ${H}/blastzDanRer2.2005_03_17/mafNet/*.maf.gz ./mafLinks/danRer2 ln -s ${H}/blastzTetNig1.2005_03_17/mafNet/*.maf.gz ./mafLinks/tetNig1 # Copy MAFs to Iservers for kluster run ssh kkr1u00 mkdir /iscratch/i/mm6/multiz10way cd /iscratch/i/mm6/multiz10way rsync -a --copy-links --progress \ /cluster/data/mm6/bed/multiz10way/mafLinks/ . # We have about 3.2 Gb of data here, takes ~ 15-20 minutes to copy over # At least it does today, something is fishy with the connection. mkdir penn cp -p /cluster/bin/penn/psuCVS/multiz-tba/multiz penn cp -p /cluster/bin/penn/maf_project penn /cluster/bin/iSync # Progressive alignment up the tree w/o stager, # using multiz.v10 (var_multiz) # Method: align internal subtrees (using 0 flag to var_multiz) # Then, align these to human (using 1 flag to var_multiz) # NOTE: must use maf_project after each multiz run, in order # to order output. Single-cov guaranteed by use of net MAF's, # so it is not necessary to run single_cov2. 
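# (pre-flight sketch, not part of the original steps) Before the kluster run it
# is cheap to verify that every species directory actually has a net maf for
# each chromosome; a missing input otherwise only shows up later as odd multiz
# output.  Some random/chrM chroms may legitimately be absent for the more
# distant species, so treat anything listed here as a prompt to double-check,
# not necessarily an error.
cd /cluster/data/mm6/bed/multiz10way
awk '{print $1}' ../../chrom.sizes | sort > /tmp/chroms.lst
for S in mafLinks/*
do
ls ${S} | sed -e "s/.maf.gz//" | sort > /tmp/have.lst
echo "== ${S} missing:"
comm -23 /tmp/chroms.lst /tmp/have.lst
done
rm -f /tmp/chroms.lst /tmp/have.lst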
ssh eieio cd /cluster/data/mm6/bed/multiz # make output dir and run dir ssh kki cd /cluster/data/mm6/bed/multiz10way mkdir -p maf mkdir -p run cd run # create scripts to run var_multiz on cluster cat > oneMultiz.csh << 'EOF' #!/bin/csh -fe set c = $1 set multi = /scratch/mm6/multiz10way.$c set pairs = /iscratch/i/mm6/multiz10way # special mode -- # with 1 arg, cleanup if ($#argv == 1) then rm -fr $multi exit endif # special mode -- # with 3 args, saves an alignment file if ($#argv == 3) then cp $multi/$2/$c.maf $3 exit endif set s1 = $2 set s2 = $3 set flag = $4 # locate input files -- in pairwise dir, or multiple dir set d1 = $multi set d2 = $multi if (-d $pairs/$s1) then set d1 = $pairs set f1 = $d1/$s1/$c.maf.gz set t1 = /tmp/$s1.$c.maf zcat $f1 > $t1 else set f1 = $d1/$s1/$c.maf set t1 = /tmp/$s1.$c.maf cp -p $f1 $t1 endif if (-d $pairs/$s2) then set d2 = $pairs set f2 = $d2/$s2/$c.maf.gz set t2 = /tmp/$s2.$c.maf zcat $f2 > $t2 else set f2 = $d2/$s2/$c.maf set t2 = /tmp/$s2.$c.maf cp -p $f2 $t2 endif # write to output dir set out = $multi/${s1}${s2} mkdir -p $out # check for empty input file if (-s $t1 && -s $t2) then echo "Aligning $f1 $f2 $flag" /iscratch/i/mm6/multiz10way/penn/multiz $t1 $t2 $flag $out/$c.unused1.maf $out/$c.unused2.maf > $out/$c.full.maf cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \ $out/$c.tmp.maf echo "Ordering $c.maf" /iscratch/i/mm6/multiz10way/penn/maf_project $out/$c.tmp.maf mm6.$c > $out/$c.maf rm -f $t1 $t2 else if (-s $t1) then cp -p $t1 $out/$c.maf rm -f $t1 else if (-s $t2) then cp -p $t2 $out/$c.maf rm -f $t2 endif 'EOF' # << keep emacs coloring happy chmod +x oneMultiz.csh # Copy this script to iscratch ssh kkr1u00 cd /iscratch/i/mm6/multiz10way/penn cp -p /cluster/data/mm6/bed/multiz10way/run/oneMultiz.csh . /cluster/bin/iSync # back to run the job ssh kki cd /cluster/data/mm6/bed/multiz10way/run # This tree.nh was used in the distant past for early versions # of phastCons. Now, this is merely a convenient reference to the # tree under construction. This is also used to draw a graphic # tree as species.nh, see below. 
cat << '_EOF_' > tree.nh ((((((hg17,(mm6,rn3)),(canFam1,bosTau1)),monDom1),galGal2),xenTro1),(tetNig1,danRer2)) '_EOF_' # << this line keeps emacs coloring happy cat > allMultiz.csh << 'EOF' #!/bin/csh -fe # multiple alignment steps: set c = $1 /iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c hg17 rn3 1 /iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c bosTau1 canFam1 0 /iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c bosTau1canFam1 hg17rn3 1 /iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c monDom1 bosTau1canFam1hg17rn3 1 /iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c galGal2 monDom1bosTau1canFam1hg17rn3 1 /iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c xenTro1 galGal2monDom1bosTau1canFam1hg17rn3 1 /iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c danRer2 tetNig1 0 /iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c danRer2tetNig1 xenTro1galGal2monDom1bosTau1canFam1hg17rn3 1 # get final alignment file /iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c danRer2tetNig1xenTro1galGal2monDom1bosTau1canFam1hg17rn3 /cluster/data/mm6/bed/multiz10way/maf/$c.maf #cleanup /iscratch/i/mm6/multiz10way/penn/oneMultiz.csh $c 'EOF' # << keep emacs coloring happy chmod +x allMultiz.csh cat << 'EOF' > template #LOOP ./allMultiz.csh $(root1) {check out line+ /cluster/data/mm6/bed/multiz10way/maf/$(root1).maf} #ENDLOOP 'EOF' cd /cluster/data/mm6/bed/multiz10way/run awk '{print $1}' ../../../chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList para try; para check para push # Completed: 40 of 40 jobs # CPU time in finished jobs: 151565s 2526.08m 42.10h 1.75d 0.005 y # IO & Wait Time: 25097s 418.29m 6.97h 0.29d 0.001 y # Average job time: 4417s 73.61m 1.23h 0.05d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 18222s 303.70m 5.06h 0.21d # Submission to last job: 18592s 309.87m 5.16h 0.22d # combine results into a single file for loading # it is too large for kksilo, use kolossus ssh kolossus cd /cluster/data/mm6/bed/multiz10way catDir maf | mafFilter stdin -minScore=500 > multiz10way.maf # rejected 1548566 blocks # 7m 22s # makes an 8 Gb file: # -rw-rw-r-- 1 8443473465 Apr 18 09:57 multiz10way.maf # Create per-chrom individual maf files for downloads # 2005-08-02 - Hiram ssh kkstore01 cd /cluster/data/mm6/bed/multiz10way mkdir mafDownloads for M in maf/chr*.maf do B=`basename $M` echo "cat ${M} | mafFilter stdin -minScore=500 > mafDownloads/${B}" cat ${M} | mafFilter stdin -minScore=500 > mafDownloads/${B} done cd mafDownloads gzip chr*.maf ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/mm6/multiz10way ln -s /cluster/data/mm6/bed/multiz10way/mafDownloads/chr*.maf.gz . 
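# (accounting sketch, not from the original run) To see where the 1,548,566
# rejected blocks came from, the per-chromosome block counts before and after
# the mafFilter -minScore=500 step can be compared; MAF alignment blocks start
# with "a " lines, which is all this relies on.
cd /cluster/data/mm6/bed/multiz10way
for F in maf/chr*.maf
do
C=`basename ${F} .maf`
before=`grep -c "^a " ${F}`
after=`zcat mafDownloads/${C}.maf.gz | grep -c "^a "`
echo "${C}: ${before} blocks before filter, ${after} after minScore=500"
done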
# Load into database (DONE - 2005-01-04 - Hiram) ssh hgwdev cd /cluster/data/mm6/bed/multiz10way mkdir /gbdb/mm6/multiz10way ln -s /cluster/data/mm6/bed/multiz10way/multiz10way.maf \ /gbdb/mm6/multiz10way hgLoadMaf mm6 multiz10way # Loaded 6284892 mafs in 1 files from /gbdb/mm6/multiz10way # 14 minutes to load hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 mm6 \ multiz10waySummary multiz10way.maf # Processed 18067226 components in 6284892 mafs from multiz10way.maf # 19m 15s # Dropped unused indexes (2006-05-09 kate) # NOTE: this is not required in the future, as the loader # has been fixed to not generate these indexes hgsql mm6 -e "alter table multiz10waySummary drop index chrom_2" hgsql mm6 -e "alter table multiz10waySummary drop index chrom_3" # create tree image: cat << '_EOF_' > species.nh ((((((human,(mouse,rat)),(dog,cow)),opossum),chicken),frog),(tetraodon,zebrafish)) '_EOF_' /cluster/bin/phast/draw_tree -b -s species.nh > species10.ps # photoshop to enhance, reduce the amount of whitespace to make it # smaller, then save as jpg cp species10.jpg /usr/local/apache/htdocs/images/phylo/Mm6_10way.jpg ############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS # (DONE - 2005-04-18 - 2004-04-20- Hiram) # Estimate phastCons parameters ssh kksilo mkdir /cluster/data/mm6/bed/multiz10way/cons cd /cluster/data/mm6/bed/multiz10way/cons # Create a starting-tree.mod based on chr1 (the largest one) /cluster/bin/phast/msa_split ../maf/chr1.maf \ --refseq ../../../1/chr1.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # 5 minutes /cluster/bin/phast/phyloFit -i SS s1.*.ss \ --tree "((((((hg17,(mm6,rn3)),(canFam1,bosTau1)),monDom1),galGal2),xenTro1),(tetNig1,danRer2))" \ --out-root starting-tree # more than 1h 30m rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.403 # This 0.403 is used in the --gc argument below # Create big bad bloated SS files in bluearc (takes ~45 minutes) ssh kksilo mkdir -p /cluster/bluearc/mm6/cons/ss cd /cluster/bluearc/mm6/cons/ss # this is making over 3000 files in one directory, might be better # to put them into per-chrom hierarchies for C in `awk '{print $1}' /cluster/data/mm6/chrom.sizes` do if [ -s /cluster/data/mm6/bed/multiz10way/maf/${C}.maf ]; then echo msa_split $C chrN=${C/chr/} chrN=${chrN/_random/} /cluster/bin/phast/msa_split \ /cluster/data/mm6/bed/multiz10way/maf/${C}.maf \ --refseq /cluster/data/mm6/${chrN}/${C}.fa \ --in-format MAF --windows 1000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C} fi done # Create a random list of 50 1 mb regions (do not use the _randoms) ls -l | grep -v random | awk '$5 > 4000000 {print $9;}' | \ randomLines stdin 50 ../randomSs # Set up parasol directory to calculate trees on these 50 regions ssh kk9 mkdir /cluster/bluearc/mm6/cons/treeRun1 cd /cluster/bluearc/mm6/cons/treeRun1 mkdir tree log # Tuning this loop should come back to here to recalculate # Create little script that calls phastCons with right arguments # --target-coverage of 0.20 is about right for mouse, will be # tuned exactly below cat > makeTree << '_EOF_' /cluster/bin/phast/phastCons ../ss/$1.ss \ /cluster/data/mm6/bed/multiz10way/cons/starting-tree.mod \ --gc 0.403 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-lengths 12 --target-coverage 0.17 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # emacs happy chmod a+x makeTree # Create gensub 
file cat > template << '_EOF_' #LOOP makeTree $(root1) #ENDLOOP '_EOF_' # happy emacs # Make cluster job and run it gensub2 ../randomSs single template jobList para create jobList para try/push/check/etc # Completed: 50 of 50 jobs # CPU time in finished jobs: 83332s 1388.87m 23.15h 0.96d 0.003 y # IO & Wait Time: 429s 7.15m 0.12h 0.00d 0.000 y # Average job time: 1675s 27.92m 0.47h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3533s 58.88m 0.98h 0.04d # Submission to last job: 3543s 59.05m 0.98h 0.04d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ls tree/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ../ave.cons.mod > cons_summary.txt ls tree/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ../ave.noncons.mod > noncons_summary.txt cd .. cp -p ave.*.mod /cluster/data/mm6/bed/multiz10way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument /cluster/bin/phast/consEntropy .10 12 \ ave.cons.mod ave.noncons.mod #Transition parameters:gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259 # Relative entropy: H=1.454874 bits/site # Required length: N=7.596943 sites # Total entropy: NH=11.052595 bits # consEntropy .20 12 ave.cons.mod.1 ave.noncons.mod.1 # Transition params: gamma=0.200000, omega=12.000000, mu=0.083333, nu=0.020833 # Relative entropy: H=1.454874 bits/site # Required length: N=6.629337 sites # Total entropy: NH=9.644850 bits # consEntropy .10 12 ave.cons.mod.2 ave.noncons.mod.2 # Transition params: gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259 # Relative entropy: H=1.527815 bits/site # Required length: N=7.205526 sites # Total entropy: NH=11.008713 bits # consEntropy .20 8 ave.cons.mod.3 ave.noncons.mod.3 # Transition params: gamma=0.200000, omega=8.000000, mu=0.125000, nu=0.031250 # Relative entropy: H=1.654878 bits/site # Required length: N=5.146793 sites # Total entropy: NH=8.517313 bits ### !!! *** This one with .17 and 12 is the one that was finally used # consEntropy .17 12 ave.cons.mod.4 ave.noncons.mod.4 # Transition params: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068 # Relative entropy: H=1.478838 bits/site # Required length: N=6.753382 sites # Total entropy: NH=9.987159 bits ssh kk9 # Create cluster dir to do main phastCons run mkdir /cluster/bluearc/mm6/cons/consRun4 cd /cluster/bluearc/mm6/cons/consRun4 mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files. To make this # cluster safe, it would be better to do this work somewhere over # in /tmp/... and copy the final result back. kk9 can do this # run, but kk cannot. cat > doPhast << '_EOF_' mkdir -p ppRaw/$2 /cluster/bin/phast/phastCons ../ss/$1.ss ../ave.cons.mod,../ave.noncons.mod \ --expected-lengths 12 --target-coverage 0.17 --quiet --seqname $2 \ --idpref $2 --viterbi bed/$1.bed --score --require-informative 0 > \ ppRaw/$2/$1.pp '_EOF_' # emacs happy chmod a+x doPhast # Create gsub file cat > template << '_EOF_' #LOOP doPhast $(file1) $(root1) #ENDLOOP '_EOF_' # happy emacs # Create parasol batch and run it ls -1 ../ss | sed 's/.ss//' > in.lst gensub2 in.lst single template jobList para create jobList para try/check/push/etc. 
# Completed: 3098 of 3098 jobs
# CPU time in finished jobs: 28179s 469.65m 7.83h 0.33d 0.001 y
# IO & Wait Time: 204688s 3411.47m 56.86h 2.37d 0.006 y
# Average job time: 75s 1.25m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 150s 2.50m 0.04h 0.00d
# Submission to last job: 2569s 42.82m 0.71h 0.03d
# combine predictions and transform scores to be in 0-1000 interval
# it uses a lot of memory, so on kolossus:
ssh kolossus
cd /cluster/bluearc/mm6/cons/consRun4
catDir bed | awk ' {printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > \
/cluster/data/mm6/bed/multiz10way/mostConserved.bed
# ~ 1 minute
# Figure out how much is actually covered by the bed files as so:
ssh kkstore01
cd /cluster/data/mm6/bed/multiz10way
awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/2597150151\n",100.0*sum/2597150151,sum}' \
mostConserved.bed
-target-coverage 0.17: % 5.40 = 100.0*140350815/2597150151 length 12
-target-coverage 0.10: % 5.26 = 100.0*136494994/2597150151
-target-coverage 0.15: % 7.34 = 100.0*190616745/2597150151
-target-coverage 0.20: % 7.86 = 100.0*204262705/2597150151
# I was mistakenly reading the results below as, for example %2.14
# when in reality it was %21.4 - way way much too high.
-target-coverage 0.50: 554319262/2597150151 = 0.214 - no complaints
-target-coverage 0.52: 619851159/2597150151 = 0.239 - no complaints
-target-coverage 0.53: 655016636/2597150151 = 0.252 - seven complaints
-target-coverage 0.55: 729540911/2597150151 = 0.281 - many negative scores
-target-coverage 0.60: 928959674/2597150151 = 0.358 - many negative scores
# the non-n genome size, from faSize on all chroms: 2597150151
# Given the above measurements, we are using the 0.52 target
# coverage run.
# If the results of this divided by the non-n genome size (~2.6 Gb) aren't
# around 4%, then do it again, adjusting the target-coverage phastCons
# parameter. Beware of negative scores when too high. The lodToBedScore
# will output an error on any negative scores.
# -target-coverage 0.17 and expected lengths 12:
featureBits mm6 -enrichment refGene:cds mostConserved.bed
refGene:cds 0.980%, mostConserved.bed 5.404%, both 0.679%, cover 69.24%, enrich 12.81x
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/mm6/bed/multiz10way
hgLoadBed mm6 phastConsElements mostConserved.bed
# Loaded 2291164 elements of size 5
# 5 minute load time
featureBits mm6 -enrichment refGene:cds phastConsElements
# refGene:cds 0.980%, phastConsElements 5.256%, both 0.688%, cover 70.18%,
# enrich 13.35x
# Create merged posterior probability file and wiggle track data files
ssh kksilo
cd /cluster/bluearc/mm6/cons/consRun4
# interesting sort here on the chr name and position.
# first convert all . and - characters to special strings x/ and x_/
# to get a consistent delimiter of / for all fields to be sorted.
# Then do the sort on the chrom name and the start position, after
# the sort convert the special strings x_/ and x/ back to - and .
# respectively. This gets everything in order by chrom name and
# chrom start.
find ./ppRaw -type f | sed -e "s#\.#x/#g; s#-#x_/#g" | \
sort -t"/" -k4,4 -k6,6n | sed -e "s#x_/#-#g; s#x/#.#g" | xargs cat | \
wigEncode stdin phastCons10.wig phastCons10.wib
# about 45 minutes for above
ssh kkstore01
cd /cluster/bluearc/mm6/cons/consRun4
cp -p phastCons10.wi? /cluster/data/mm6/bed/multiz10way/cons
# 2m 30s copy on kkstore01
# prepare compressed copy of ascii data values for downloads
cd /cluster/bluearc/mm6/cons/consRun4
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons10Scores
ls ppRaw | while read D
do
out=${TOP}/phastCons10Scores/${D}.gz
echo -n "$out ... "
cd ${TOP}/ppRaw/${D}
gzip -c `ls *.pp | sed -e "s#-#.x-x.#g;" | \
sort -t"." -k1,1 -k2,2n | sed -e "s#.x-x.#-#g;"` > ${out}
echo "done"
done
'_EOF_'
# happy emacs
chmod +x gzipAscii.sh
time ./gzipAscii.sh
# takes about 40 minutes, makes 2.8 Gb of data
# copy them for downloads
ssh kkstore01
mkdir /cluster/data/mm6/bed/multiz10way/phastCons10Scores
cd /cluster/data/mm6/bed/multiz10way/phastCons10Scores
rsync -a --progress /cluster/bluearc/mm6/cons/consRun4/phastCons10Scores/ .
# 3 minute copy
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/mm6/phastCons10Scores
cd /usr/local/apache/htdocs/goldenPath/mm6/phastCons10Scores
ln -s /cluster/data/mm6/bed/multiz10way/phastCons10Scores/*.gz .
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/mm6/bed/multiz10way/cons
ln -s `pwd`/phastCons10.wib /gbdb/mm6/wib/phastCons10.wib
hgLoadWiggle mm6 phastCons10 phastCons10.wig
# ~ 3 minute load
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/mm6/bed/multiz10way/cons
time hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=mm6 phastCons10 > histogram.data 2>&1
# about 23 minutes to scan all data
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm6 Histogram phastCons10 track"
set xlabel " phastCons10 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# happy emacs
display histo.png &
############################################################################
# BUILD KNOWN GENES TABLES (Started 3/19/05, done 4/13/05. Fan)
# First build protein databases, sp050315 and proteins050315
# See makeProteins050315.doc for details.
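# (pre-flight sketch) Quick check that the protein databases named above are in
# place and populated before starting; the table names are the ones queried in
# the steps below, nothing beyond that is assumed.
hgsql -N sp050315 -e 'select count(*) from protein'
hgsql -N proteins050315 -e 'select count(*) from spXref3'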
# Create working subdirectories and temporary databases
ssh hgwdev
cd /cluster/store10/kg
mkdir kgMm6A
ln -s /cluster/store10/kg/kgMm6A /cluster/store6/kgDB/bed/kgMm6A
ln -s /cluster/store10/kg/kgMm6A /cluster/data/mm6/bed/kgMm6A
hgsql mm6 -e "create database kgMm6ATemp"
mkdir /cluster/bluearc/kgDB/kgMm6A
mkdir /cluster/bluearc/kgDB/kgMm6A/protBlat
ln -s /cluster/bluearc/kgDB/kgMm6A/protBlat /cluster/store10/kg/kgMm6A/protBlat
cd /cluster/store10/kg/kgMm6A/protBlat
# Get all mouse protein sequences
hgsql -N sp050315 -e \
'select proteins050315.spXref3.accession,protein.val from proteins050315.spXref3,protein where division="10090" and acc=accession' \
|awk '{print ">" $1;print $2}' >mouseProt.fa
# Prepare and perform cluster run for protein/genome alignment
ssh kk
cd /cluster/data/mm6/bed/kgMm6A/protBlat
mkdir prot
faSplit sequence mouseProt.fa 1000 prot/prot
ls /cluster/bluearc/kgDB/kgMm6A/protBlat/prot/* > prot.lis
hgsql mm6 -N -e 'select chrom from chromInfo' > chrom.lis
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -noHead -t=dnax -q=prot /panasas/store/mm6/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm6A/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
mkdir result
gensub2 chrom.lis prot.lis gsub jobList
para create jobList
para check
para push
para check
...
# This cluster run takes about two days. Crashed jobs are due to empty
# BLAT results; that is OK.
Completed: 31081 of 39600 jobs
Crashed: 8519 jobs
CPU time in finished jobs: 28671747s 477862.45m 7964.37h 331.85d 0.909 y
IO & Wait Time: 1469964s 24499.40m 408.32h 17.01d 0.047 y
Average job time: 970s 16.16m 0.27h 0.01d
Longest running job: 0s 0.00m 0.00h 0.00d
Longest finished job: 39632s 660.53m 11.01h 0.46d
Submission to last job: 124276s 2071.27m 34.52h 1.44d
# collect BLAT results
ssh hgwdev
cd /cluster/data/mm6/bed/kgMm6A/protBlat
mkdir result2
mkdir result3
cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall
cat << '_EOF_' > do1.1
echo processing $1
cat result/$1_prot*.psl >result2/$1.psl
'_EOF_'
cat << '_EOF_' > do1.2
echo processing $1
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out
'_EOF_'
chmod +x do*
cp do1.1 do1
doall
cp do1.2 do1
doall
cat result3/*.psl >protBlat.psl
hgLoadPsl mm6 protBlat.psl
# Remember to remove result2 and result3 when KG is built and validated.
cd /cluster/data/mm6/bed/kgMm6A
# create all_mrna.psl and tight_mrna.psl
hgsql mm6 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl
pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \
all_mrna.psl tight_mrna.psl /dev/null
# Use overlapSelect to get protein and mRNA alignment overlaps
overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \
-selectFmt=psl -inFmt=psl tight_mrna.psl protBlat/protBlat.psl protMrna.stat
overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
-inFmt=psl tight_mrna.psl protBlat/protBlat.psl protMrna.out
# Create protein/mRNA pair and protein lists
cut -f 10,31 protMrna.out|sort -u >spMrna.tab
cut -f 10 protMrna.out|sort -u >protein.lis
# Load spMrna.tab into spMrna table in temp DB.
hgsql kgMm6ATemp < ~/src/hg/lib/spMrna.sql
hgsql kgMm6ATemp -e 'load data local infile "spMrna.tab" into table spMrna'
hgsql kgMm6ATemp -e 'create index mrnaID on spMrna(mrnaID)'
# Prepare and perform cluster run of protein/mRNA alignment
# Get mRNA fa file.
/cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm6 \
-gbRoot=/cluster/data/genbank genbank mrna mrna.fa
# Create mrnaSeq table in kgMm6ATemp DB.
hgFaToTab mrna.fa mrnaSeq.tab hgsql kgMm6ATemp <~/src/hg/lib/mrnaSeq.sql hgsql kgMm6ATemp -e "load data local infile "mrnaSeq.tab" into table mrnaSeq" # Prepare files for cluster run ~/src/hg/protein/KG2.sh kgMm6A mm6 050315 # Perform cluster run of protein/mRNA alignment ~/src/hg/protein/KG3.sh kgMm6A mm6 050315 # Collect cluster run results cd kgBestMrna ls out | sed -e 's/prot/do1 prot/g' >doall # create do1 with the following 2 lines: cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protMrnaRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis wc protMrna.lis # Load BLAT results into temp DB. hgsql kgMm6ATemp < ~/src/hg/lib/protMrnaBlat.sql hgsql kgMm6ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat' hgsql kgMm6ATemp -e 'create index tName on protMrnaBlat(tName)' # Create CDS files from protein/mRNA alignment results. hgsql kgMm6ATemp -N -e \ 'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\ |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds # Create protMrna.psl with proteinID_mrnaID as query ID. cut -f 22-30 ../protMrna.out > j1.tmp cut -f 32-42 ../protMrna.out > j2.tmp cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp paste j1.tmp j3.tmp j2.tmp >protMrna.psl rm j1.tmp j2.tmp j3.tmp # Run mrnaToGene to create protMrna.gp bash mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log exit # Prepare refGene and all_mrna gp files. cd .. hgsql mm6 -N -e 'select * from refGene' >ref.gp hgsql mm6 -N -e \ 'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \ |sort -u > all_mrna.cds bash mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log exit # Align proteins to RefSeq. overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp ref.stat overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp protRef.gp overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.90 -inFmt=psl\ -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out cut -f 10,22 protRef.out | sort -u >spRef.tab cut -f 10 protRef.out | sort -u >protRef.lis hgsql kgMm6ATemp <~/src/hg/lib/spRef.sql hgsql kgMm6ATemp -e 'load data local infile "spRef.tab" into table spRef' # Prepare and perform cluster runs for protein/RefSeq alignments ~/src/hg/protein/KGRef2.sh kgMm6A mm6 050315 ~/src/hg/protein/KGRef3.sh kgMm6A mm6 050315 cd kgBestRef ls out | sed -e 's/prot/do1 prot/g' >doall cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protRefRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments. 
pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis wc protRef.lis hgsql kgMm6ATemp < ~/src/hg/lib/protRefBlat.sql hgsql kgMm6ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat' hgsql kgMm6ATemp -e 'create index tName on protRefBlat(tName)' # Run gene-check to filter out invalid gp entries cat ref.gp protMrna.gp all_mrna.gp >kgCandidate0.gp gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/store10/mm6/nib kgCandidate0.gp kgCandidate0.check hgsql kgMm6ATemp < ~/src/hg/lib/kgCandidate0.sql hgsql kgMm6ATemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0' hgsql kgMm6ATemp < ~/src/hg/lib/geneCheck.sql hgsql kgMm6ATemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines' # Run kgKeep to filter out invalid gene candidates kgCheck kgMm6ATemp mm6 kgCandidate.tab hgsql kgMm6ATemp -e 'drop table kgCandidate' hgsql kgMm6ATemp < ~/src/hg/lib/kgCandidate.sql hgsql kgMm6ATemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate' # Update and clean up kgResultBestMrna2.c and then check it in. # Score protein/mRna and protein/RefSeq alignments kgResultBestMrna2 050201 kgMm6ATemp mm6|sort -u >protMrnaBlatScore.tab kgResultBestRef2 050315 kgMm6ATemp mm6|sort -u >protRefScore.tab # Combine scoring results and load them into temp DB. cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab hgsql kgMm6ATemp < ~/src/hg/lib/protMrnaScore.sql hgsql kgMm6ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore' hgsql kgMm6ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)' # Run kgSelect to select highest scoring mRNA or RefSeq for each protein. kgSelect kgMm6ATemp kgCandidate2.gp hgsql kgMm6ATemp -e 'drop table kgCandidate2' hgsql kgMm6ATemp < ~/src/hg/lib/kgCandidate2.sql hgsql kgMm6ATemp -e 'load data local infile "kgCandidate2.gp" into table kgCandidate2' # Create sorted file to get entries with identical CDS regions group together. hgsql kgMm6ATemp -N -e \ 'select name,chrom,cdsStart,cdsEnd,score,proteinID from kgCandidate2,protMrnaScore where proteinID=protAcc and name=mrnaAcc order by name,cdsStart,cdsEnd,score desc,proteinID' \ >kgSorted.tab # Run kgUniq to pick the top mRNA/RefSeq with hightest score for each CDS structure. 
kgUniq kgMm6ATemp sp050315 kgSorted.tab knownGene.gp dupSpMrna.tab hgsql mm6 -e 'drop table dupSpMrna' hgsql mm6 <~/src/hg/lib/dupSpMrna.sql hgsql mm6 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna' # Build mrnaRefseq table first before loading knownGene table cd /cluster/store10/entrez mkdir 050401 ln -s /cluster/store10/entrez/050401 /cluster/data/entrez/050401 cd /cluster/data/entrez/050401 wget -- timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz wget -- timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz wget -- timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz gzip -d *.gz cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab hgsql entrez -e 'drop table entrezRefseq' hgsql entrez -e 'drop table entrezMrna' hgsql entrez -e 'drop table entrezRefProt' hgsql entrez < ~/src/hg/lib/entrezRefseq.sql hgsql entrez < ~/src/hg/lib/entrezMrna.sql hgsql entrez < ~/src/hg/lib/entrezRefProt.sql hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq' hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna' hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt' hgsql entrez -N -e \ 'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \ >mrnaRefseq.tab hgsql mm6 < ~/src/hg/lib/mrnaRefseq.sql hgsql mm6 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq' # Sort knownGene table ~/kent/src/hg/protein/sortKg.pl knownGene.gp > sortedKnownGene.gp # Load knownGene table cd /cluster/data/kgDB/bed/kgMm6A hgsql mm6 -e 'drop table knownGene' hgsql mm6 <~/src/hg/lib/knownGene.sql hgsql mm6 -e 'load data local infile "sortedKnownGene.gp" into table knownGene' # Build kgXref table kgXref2 mm6 proteins050315 mm6 hgsql mm6 -e 'drop table kgXref' hgsql mm6 <~/src/hg/lib/kgXref.sql hgsql mm6 -e 'load data local infile "kgXref.tab" into table kgXref' # Build spMrna table hgsql mm6 -N -e 'select name, proteinID from knownGene' >kgSpMrna.tab hgsql mm6 -e 'drop table spMrna' hgsql mm6 <~/src/hg/lib/spMrna.sql hgsql mm6 -e 'load data local infile "kgSpMrna.tab" into table spMrna' # Build knownGenePep table hgsql mm6 -N -e \ 'select name, protein.val from knownGene, sp050315.displayId, sp050315.protein where proteinID=displayId.val and displayId.acc=protein.acc' \ >knownGenePep.tab hgsql mm6 -e 'drop table knownGenePep' hgsql mm6 <~/src/hg/lib/knownGenePep.sql hgsql mm6 -e 'load data local infile "knownGenePep.tab" into table knownGenePep' # Build knownGeneMrna table /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm6 \ -gbRoot=/cluster/data/genbank refseq mrna stdout \ | faToTab stdin refseqSeq.tab hgsql kgMm6ATemp -e "drop table refseqSeq" hgsql kgMm6ATemp <~/src/hg/lib/refseqSeq.sql hgsql kgMm6ATemp -e 'load data local infile "refseqSeq.tab" into table refseqSeq' hgsql kgMm6ATemp -N -e \ 'select knownGene.name, seq from refseqSeq, mm6.knownGene where knownGene.name=refseqSeq.name'\ >j1.tmp hgsql kgMm6ATemp -N -e \ 'select knownGene.name, seq from mrnaSeq, mm6.knownGene where knownGene.name=mrnaSeq.name' \ >j2.tmp cat j1.tmp j2.tmp >knownGeneMrna.tab rm j1.tmp j2.tmp hgsql mm6 -e "drop table mm6.knownGeneMrna" hgsql mm6 <~/src/hg/lib/knownGeneMrna.sql hgsql mm6 -e 'load data local 
infile "knownGeneMrna.tab" into table knownGeneMrna' # Build KEGG pathway tables ~/src/hg/protein/KGpath.sh kgMm6A mm6 050315 hgsql kgMm6ATemp -e "drop table keggList" hgsql kgMm6ATemp <~/src/hg/lib/keggList.sql hgsql kgMm6ATemp -e 'load data local infile "keggList.tab" into table keggList' hgsql mm6 -e "drop table keggMapDesc" hgsql mm6 -e "drop table keggPathway" hgsql mm6 <~/src/hg/lib/keggMapDesc.sql hgsql mm6 <~/src/hg/lib/keggPathway.sql hgsql mm6 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc' hgsql mm6 -e 'load data local infile "keggPathway.tab" into table keggPathway' # Build CGAP pathway tables ~/src/hg/protein/KGcgap.sh kgMm6A mm6 050315 hgsql sp050315 -N -e \ 'select name, gene.val from mm6.knownGene, gene, displayId where proteinID=displayId.val and gene.acc=displayId.acc' \ | sort -u >kgAliasP.tab # Build alias tables # kgAliasM reads from proteins050315.hugo.symbol, proteins050315.hugo.aliases # proteins050315.hugo.withdraws, mm6.kgXref.kgID # to create kgAliasM.tab and geneAlias.tab # by picking out those kgID items from kgXref where # kgXref.geneSymbol == hugo.symbol kgAliasM mm6 proteins050315 # kgAliasKgXref reads from mm6.knownGene.proteinID, # mm6.knownGene.name, mm6.kgXref.geneSymbol # to create kgAliasKgXref.tab kgAliasKgXref mm6 # kgAliasRefseq reads from mm6.knownGene.name, # mm6.knownGene.proteinID, mm6.kgXref.refseq # to create kgAliasRefseq.tab kgAliasRefseq mm6 hgsql sp050315 -N -e \ 'select name, gene.val from mm6.knownGene, gene, displayId where proteinID=displayId.val and gene.acc=displayId.acc' \ | sort -u >kgAliasP.tab cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab | \ sort |uniq > kgAlias.tab hgsql -e "drop table kgAlias;" mm6 hgsql mm6 < ~/kent/src/hg/lib/kgAlias.sql hgsql mm6 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' # kgProtAlias reads from mm6.knownGene.name, # mm6.knownGene.proteinID, mm6.knownGene.alignID, # proteins050315.spXref3.accession, proteins050315.spSecondaryID, proteins050315.pdbSP.pdb # to create kgProtAlias.tab # kgProtAlias mm6 050315 hgsql mm6 -N -e \ 'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\ | sort -u >kgProtAliasNCBI.tab cat kgProtAliasNCBI.tab kgProtAlias.tab | sort | uniq > kgProtAliasBoth.tab rm kgProtAliasNCBI.tab kgProtAlias.tab echo "`date` creating table kgProtAlias" hgsql mm6 -e "drop table kgProtAlias;" hgsql mm6 <~/src/hg/lib/kgProtAlias.sql; hgsql mm6 -e 'LOAD DATA local INFILE "kgProtAliasBoth.tab" into table kgProtAlias;' # MAKING FOLDUTR TABLES (DONE 2005-04-21, Fan) # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev mkdir -p /cluster/data/mm6/bed/rnaStruct cd /cluster/data/mm6/bed/rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa mm6 knownGene utr3 utr3/utr.fa utrFa mm6 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. 
ssh kk cd /cluster/data/mm6/bed/rnaStruct faSplit sequence utr3/utr.fa 50000 utr3/split/s faSplit sequence utr5/utr.fa 50000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < genome.lst echo /panasas/store/mm6/nib/*.nib | wordLine stdin > genome.lst ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.lst affy.lst gsub jobList para create jobList para try # do usual para check/para push etc. until the job is done. # Completed: 120 of 120 jobs # CPU time in finished jobs: 7197s 119.94m 2.00h 0.08d 0.000 y # IO & Wait Time: 1047s 17.46m 0.29h 0.01d 0.000 y # Average job time: 69s 1.15m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 208s 3.47m 0.06h 0.00d # Submission to last job: 751s 12.52m 0.21h 0.01d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU74.psl. ssh kk cd /cluster/data/mm6/bed/affyU74.2005-04-14/run pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least minAli = 0.95. # minAli = 0.97 too high. low minCover as a lot of n's in these sequences #pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl ../all_affyU74.psl /dev/null # Sort by chromosome and load into database. ssh hgwdev cd /cluster/data/mm6/bed/affyU74.2005-04-14 pslSortAcc nohead chrom temp all_affyU74.psl cat chrom/*.psl > affyU74.psl # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table hgLoadPsl mm6 affyU74.psl # rm -fr chrom temp run ## MAKE THE affyGnfU74 TRACKs (DONE - 2005-04-14 - Fan) # Make bed files and load consensus sequences for Affy U74 chip set. # Fix broken symlinks to microarray data after directory structure changed # (DONE, 2005-05-03, hartera) ---------------------------------- #This needs to be done after affyU74 is already made. ssh hgwdev mkdir -p /cluster/data/mm6/bed/affyGnf.2005-04-14 cd /cluster/data/mm6/bed/affyGnf.2005-04-14 # may need to build this command in src/hg/affyGnf affyPslAndAtlasToBed ../affyU74.2005-04-14/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \ affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2 affyPslAndAtlasToBed ../affyU74.2005-04-14/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \ affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2 affyPslAndAtlasToBed ../affyU74.2005-04-14/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \ affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2 # edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" mkdir sav cp *.bed sav -p cat sav/affyGnfU74A.bed|sed -e "s/U74Av2://" >affyGnfU74A.bed cat sav/affyGnfU74B.bed|sed -e "s/U74Bv2://" >affyGnfU74B.bed cat sav/affyGnfU74C.bed|sed -e "s/U74Cv2://" >affyGnfU74C.bed # and reload data into table hgLoadBed mm6 affyGnfU74A affyGnfU74A.bed hgLoadBed mm6 affyGnfU74B affyGnfU74B.bed hgLoadBed mm6 affyGnfU74C affyGnfU74C.bed # Add in sequence data for U74 tracks. # Copy consensus sequence to /gbdb if it isn't already # [THE SYM LINKS WERE ALREADY DONE.] 
# mkdir -p /gbdb/hgFixed/affyProbes cd /gbdb/hgFixed/affyProbes # fix broken symlinks after directory structure changed # /projects/compbiodata ----> /projects/compbio/data rm U74* # make correct symlinks (hartera, 2005-05-03) ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa . # used perl -pi.bak -e 's/;/ /' to remove ";" after probe name # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4. # reload sequences with prefix removed so acc matches name used in # other dependent tables hgLoadSeq -abbr=U74Av2: mm6 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa hgLoadSeq -abbr=U74Bv2: mm6 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa hgLoadSeq -abbr=U74Cv2: mm6 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa ### GNF ATLAS 2 [DONE Fan 2005-04-14] # Align probes from GNF1M chip. ssh kk cd /cluster/data/mm6/bed mkdir -p geneAtlas2/run/psl cd geneAtlas2/run #mkdir -p /cluster/bluearc/geneAtlas2 #cp /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /cluster/bluearc/geneAtlas2 #ls -1 /scratch/mus/mm6/maskedContigs/ > genome.lst echo /panasas/store/mm6/nib/*.nib | wordLine stdin > genome.lst ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub gensub2 genome.lst mrna.lst gsub spec para create spec para try para check para push para time # Completed: 40 of 40 jobs # CPU time in finished jobs: 56570s 942.84m 15.71h 0.65d 0.002 y # IO & Wait Time: 392s 6.53m 0.11h 0.00d 0.000 y # Average job time: 1424s 23.73m 0.40h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3979s 66.32m 1.11h 0.05d # Submission to last job: 3993s 66.55m 1.11h 0.05d # Do sort, best in genome filter, and convert to chromosome coordinates # to create gnf1h.psl. pslSort dirs raw.psl tmp psl pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1m.psl /dev/null #rm -r contig.psl raw.psl psl # Load probes and alignments from GNF1H into database. ssh hgwdev cd /cluster/data/mm6/bed/geneAtlas2 # ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes hgLoadPsl mm6 affyGnf1m.psl hgLoadSeq mm6 /gbdb/hgFixed/affyProbes/gnf1m.fa # Load up track hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \ affyGnf1m.psl # Note that the unmapped 5000 records are from all-N sequences. hgLoadBed mm6 gnfAtlas2 gnfAtlas2.bed # MOUSE AFFYMETRIX MOE430 TRACK (DONE, 2005-04-14, Fan) # mkdir -p /projects/compbio/data/microarray/affyMouse # Download MOE430A and MOE430B consensus sequences from Affymetrix web site # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430 # unzip MOE430*_consensus.zip # check for duplicate probes: there are none, all have unique names # check for duplicate probes: 100 from 136745_at to 1367551_a_at # remove "consensus:" and ";" from FASTA headers to shorten probeset # names for database # sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa # sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa # cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ # /cluster/bluearc/affy/ # THE ABOVE WAS ALREADY DONE BY RACHEL 4/16/04. 
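# (sketch) The duplicate-probe checks noted above can be repeated on the
# combined fasta with a one-liner; this assumes the /cluster/bluearc/affy copy
# of MOE430_all.fa mentioned above is the file in use.
grep "^>" /cluster/bluearc/affy/MOE430_all.fa | awk '{print $1}' | \
sort | uniq -d | wc -l
# list a few of the duplicated names, if any:
grep "^>" /cluster/bluearc/affy/MOE430_all.fa | awk '{print $1}' | \
sort | uniq -d | head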
# Set up cluster job to align MOE430 consensus sequences to mm6
ssh kkr1u00
cd /cluster/data/mm6/bed
mkdir -p affyMOE430
cd affyMOE430
# mkdir -p /iscratch/i/affy
# cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy
# iSync
ssh kk
cd /cluster/data/mm6/bed/affyMOE430
ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst
echo /panasas/store/mm6/nib/*.nib | wordLine stdin > genome.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 genome.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
# Actually do the job with usual para try/check/push/time etc.
# Completed: 40 of 40 jobs
# CPU time in finished jobs: 9414s 156.90m 2.61h 0.11d 0.000 y
# IO & Wait Time: 281s 4.69m 0.08h 0.00d 0.000 y
# Average job time: 242s 4.04m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 597s 9.95m 0.17h 0.01d
# Submission to last job: 657s 10.95m 0.18h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyMOE430.psl
pslSort dirs raw.psl tmp psl
# only use alignments that cover 30% of sequence and have at least
# 95% identity in aligned region.
# low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl affyMOE430.psl /dev/null
# Load alignments and sequences into database
ssh hgwdev
cd /cluster/data/mm6/bed/affyMOE430
# shorten names in psl file
sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
mv affyMOE430.psl.bak affyMOE430.psl
# load track into database
hgLoadPsl mm6 affyMOE430.psl
# Add consensus sequences for MOE430
# Copy sequences to gbdb if they are not there already
# mkdir -p /gbdb/hgFixed/affyProbes
# ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
#   /gbdb/hgFixed/affyProbes
hgLoadSeq -abbr=MOE430 mm6 /gbdb/hgFixed/affyProbes/MOE430_all.fa
# Clean up
# rm batch.bak contig.psl raw.psl
# BELOW TWO THINGS WERE DONE BY RACHEL ALREADY FOR MM4
# add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/
# add affyMOE430.html file and then do make alpha to add to trackDb table

######## MAKING GENE SORTER TABLES ####### (STARTED - 2005-04-15, DONE 4/18/05 - Fan)
# These are instructions for building the
# Gene Sorter.  Don't start these until
# there is a knownGene track and the affy tracks
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
cd /tmp
hgClusterGenes mm6 knownGene knownIsoforms knownCanonical
# You may need to build this binary in src/hg/near/hgClusterGenes
# Got 24603 clusters, from 41208 genes in 43 chromosomes
# featureBits mm6 knownCanonical
# 686054706 bases of 2597150411 (26.416%) in intersection
# featureBits mm5 knownCanonical
# 853516995 bases of 2615483787 (32.633%) in intersection
# featureBits mm4 knownCanonical
# 840021165 bases of 2627444668 (31.971%) in intersection
# featureBits mm3 knownCanonical
# 825943052 bases of 2505900260 (32.960%) in intersection
# ! ! ! Can not do featureBits on knownIsoforms
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
ssh hgwdev mkdir -p /cluster/data/mm6/bed/geneSorter/blastp cd /cluster/data/mm6/bed/geneSorter/blastp pepPredToFa mm6 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /cluster/bluearc/blast229/formatdb -i known.faa -t known -n known # Copy over database to bluearc scratch mkdir /cluster/panasas/home/store/mm6/blastp cp -p /cluster/data/mm6/bed/geneSorter/blastp/known.* \ /cluster/panasas/home/store/mm6/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm6/bed/geneSorter/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/mm6/bed/geneSorter/blastp/self cd /cluster/data/mm6/bed/geneSorter/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/panasas/home/store/mm6/blastp/known \ -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 31525s 525.42m 8.76h 0.36d 0.001 y # IO & Wait Time: 34031s 567.18m 9.45h 0.39d 0.001 y # Average job time: 8s 0.14m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 61s 1.02m 0.02h 0.00d # Submission to last job: 142s 2.37m 0.04h 0.00d # Load into database. This takes about an hour. ssh hgwdev cd /cluster/data/mm6/bed/geneSorter/blastp/self/run/out hgLoadBlastTab mm6 knownBlastTab *.tab Scanning through 7715 files Loading database with 1972005 rows # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes an hour.) # DONE (05-04-15 Fan) hgMapToGene mm6 affyGnf1m knownGene knownToGnf1m hgExpDistance mm6 hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m # Create table that maps between known genes and RefSeq hgMapToGene mm6 refGene knownGene knownToRefSeq # may need to build this command in src/hg/near/hgMapToGene # Create a table that maps between known genes and # the nice affy expression data. hgMapToGene mm6 affyU74 knownGene knownToU74 hgMapToGene mm6 affyMOE430 knownGene knownToMOE430 hgMapToGene mm6 affyMOE430 -prefix=A: knownGene knownToMOE430A # Format and load Rinn et al sex expression data mkdir /cluster/data/mm6/bed/rinnSex cd !$ hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \ ../affyMOE430/affyMOE430.psl hgLoadBed mm6 rinnSex rinnSex.bed # Format and load the GNF data mkdir /cluster/data/mm6/bed/affyGnf95 cd /cluster/data/mm6/bed/affyGnf95 affyPslAndAtlasToBed -newType ../affyU95.psl \ /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceeding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. # Everything below this seems to create tables OK. # hgsql mm6 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. 
These commands take about 15 minutes each # The affyGnfU74?Exps arguments appear to be unused in # hgExpDistance hgExpDistance mm6 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74 # Got 7720 unique elements in affyGnfU74A hgExpDistance mm6 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74 # Got 4619 unique elements in affyGnfU74B hgExpDistance mm6 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74 # Got 1406 unique elements in affyGnfU74C # C.ELEGANS BLASTP FOR GENE SORTER (DONE 4/15/05 Fan) # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to iscratch/i # if it doesn't exist already: ssh eieio mkdir /cluster/data/ce2/bed/blastp cd /cluster/data/ce2/bed/blastp # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/ # to find out the latest version. Then use that in place of 142 below. wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142 formatdb -i wormPep142.faa -t wormPep142 -n wormPep142 ssh kkr1u00 if (-e /iscratch/i/ce2/blastp) then rm -r /iscratch/i/ce2/blastp endif mkdir -p /iscratch/i/ce2/blastp cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm6/bed/blastp/ce2/run/out cd /cluster/data/mm6/bed/blastp/ce2/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 29337s 488.96m 8.15h 0.34d 0.001 y # IO & Wait Time: 24651s 410.84m 6.85h 0.29d 0.001 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 40s 0.67m 0.01h 0.00d # Submission to last job: 206s 3.43m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm6/bed/blastp/ce2/run/out hgLoadBlastTab mm6 ceBlastTab -maxPer=1 *.tab # HUMAN BLASTP FOR GENE SORTER (DONE 4/18/05 Fan) # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/hg17/bed/blastp cd /cluster/data/hg17/bed/blastp pepPredToFa hg17 knownGenePep known.faa formatdb -i known.faa -t known -n known # PLEASE NOTE, hg17B IS USED INSTEAD OF hg17 for /iscratch/i, # TO GO AROUND A SUBDIRECTORY ACCESS RIGHT PROBLEM. ssh kkr1u00 if (-e /iscratch/i/hg17B/blastp) then rm -r /iscratch/i/hg17B/blastp endif mkdir -p /iscratch/i/hg17B/blastp cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17B/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm6/bed/blastp/hg17/run/out cd /cluster/data/mm6/bed/blastp/hg17/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 67090s 1118.17m 18.64h 0.78d 0.002 y # IO & Wait Time: 22543s 375.72m 6.26h 0.26d 0.001 y # Average job time: 12s 0.19m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 83s 1.38m 0.02h 0.00d # Submission to last job: 213s 3.55m 0.06h 0.00d # Load into database. 
ssh hgwdev cd /cluster/data/mm6/bed/blastp/hg17/run/out hgLoadBlastTab mm6 hgBlastTab -maxPer=1 *.tab # ZEBRAFISH BLASTP FOR GENE SORTER (DONE 4/15/05 Fan) # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. # First make protein database and copy it to iscratch/i # if it doesn't exist already: ssh kkstore mkdir /cluster/data/danRer1/bed/blastp cd /cluster/data/danRer1/bed/blastp wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.apr.pep.fa.gz zcat Dan*.pep.fa.gz > ensembl.faa formatdb -i ensembl.faa -t ensembl -n ensembl ssh kkr1u00 if (-e /iscratch/i/danRer1/blastp) then rm -r /iscratch/i/danRer1/blastp endif mkdir -p /iscratch/i/danRer1/blastp cp /cluster/data/danRer1/bed/blastp/ensembl.p?? /iscratch/i/danRer1/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm6/bed/blastp/danRer1/run/out cd /cluster/data/mm6/bed/blastp/danRer1/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 53430s 890.51m 14.84h 0.62d 0.002 y # IO & Wait Time: 24688s 411.46m 6.86h 0.29d 0.001 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 76s 1.27m 0.02h 0.00d # Submission to last job: 202s 3.37m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm6/bed/blastp/danRer1/run/out hgLoadBlastTab mm6 drBlastTab -maxPer=1 *.tab # YEAST BLASTP FOR GENE SORTER (DONE 4/15/05 Fan) # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on # RefSeq. First make protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/sacCer1/bed/blastp cd /cluster/data/sacCer1/bed/blastp wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz zcat orf_trans.fasta.gz > sgdPep.faa formatdb -i sgdPep.faa -t sgdPep -n sgdPep ssh kkr1u00 # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, # fortunately we won't be looking for homologs there. :) if (-e /iscratch/i/sacCer1/blastp) then rm -r /iscratch/i/sacCer1/blastp endif mkdir -p /iscratch/i/sacCer1/blastp cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm6/bed/blastp/sacCer1/run/out cd /cluster/data/mm6/bed/blastp/sacCer1/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 8741s 145.68m 2.43h 0.10d 0.000 y # IO & Wait Time: 20376s 339.60m 5.66h 0.24d 0.001 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 17s 0.28m 0.00h 0.00d # Submission to last job: 199s 3.32m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm6/bed/blastp/sacCer1/run/out hgLoadBlastTab mm6 scBlastTab -maxPer=1 *.tab # DM1 BLASTP FOR GENE SORTER (DONE 4/18/05, Fan) # Make Drosophila melanagaster ortholog column using blastp on FlyBase. 
# First make protein database and copy it to iscratch/i # if it doesn't exist already: # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # ssh kkr1u00 # if (-e /iscratch/i/dm1/blastp) then # rm -r /iscratch/i/dm1/blastp # endif # mkdir -p /iscratch/i/dm1/blastp # cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp # iSync # THE ABOVE IS ALREADY DONE BY ANGIE # Make parasol run directory ssh kk mkdir -p /cluster/data/mm6/bed/blastp/dm1/run/out cd /cluster/data/mm6/bed/blastp/dm1/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 33260s 554.33m 9.24h 0.38d 0.001 y # IO & Wait Time: 24452s 407.54m 6.79h 0.28d 0.001 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 45s 0.75m 0.01h 0.00d # Submission to last job: 121s 2.02m 0.03h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm6/bed/blastp/dm1/run/out hgLoadBlastTab mm6 dmBlastTab -maxPer=1 *.tab # Create table that maps between known genes and LocusLink (DONE 4/18/05 Fan) hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm6 \ > refToLl.txt hgMapToGene mm6 refGene knownGene knownToLocusLink -lookup=refToLl.txt # row count is 17480 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt mm6 knownGene name proteinID Pfam knownToPfam # row count is 17132 # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene mm6 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' # Create table that maps between known genes and genePix database knownToGenePix mm6 # ENABLE GENE SORTER FOR mm6 IN HGCENTRALTEST (DONE 7/20/04 Fan) echo "update dbDb set hgNearOk = 1 where name = 'mm6';" \ | hgsql -h genome-testdb hgcentraltest # RAT BLASTP FOR GENE SORTER (DONE 4/20/05 Fan) # Make RAT ortholog column using blastp on RAT known genes. # First make RAT protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/rn3/bed/blastp cd /cluster/data/rn3/bed/blastp pepPredToFa rn3 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/rn3/blastp) then rm -r /iscratch/i/rn3/blastp endif mkdir -p /iscratch/i/rn3/blastp cp /cluster/data/rn3/bed/blastp/known.p?? /iscratch/i/rn3/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm6/bed/blastp/rn3/run/out cd /cluster/data/mm6/bed/blastp/rn3/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 12896s 214.93m 3.58h 0.15d 0.000 y # IO & Wait Time: 21725s 362.08m 6.03h 0.25d 0.001 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 22s 0.37m 0.01h 0.00d # Submission to last job: 246s 4.10m 0.07h 0.00d # Load into database. 
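# For the ortholog runs above (ce2, hg17, danRer1, sacCer1, dm1, rn3) the
# blastSome script and gsub template follow the same pattern as the
# geneSorter self-blastp run; a minimal sketch for the C. elegans case,
# assuming the wormPep142 database staged on /iscratch/i and the same
# blastall parameters as the self run (swap the -d path for the other
# organisms):
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
    -p blastp -d /iscratch/i/ce2/blastp/wormPep142 \
    -i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# gsub is the usual two-line template:
#   blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
# and split.lst lists ../../../geneSorter/blastp/split/kg*.fa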
ssh hgwdev
cd /cluster/data/mm6/bed/blastp/rn3/run/out
hgLoadBlastTab mm6 rnBlastTab -maxPer=1 *.tab
# END OF GENE SORTER STUFF

#############################################################################
### MM6 PROTEOME BROWSER TABLES BUILD #### (DONE - 2005-04-20 - Fan)
# These are instructions for building tables
# needed for the Proteome Browser to be used with mm6.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.
# This build is based on proteins DBs dated 050315.
# Create the working directory
ssh hgwdev
mkdir /cluster/data/mm6/bed/pb.2005-04-20
cd /cluster/data/mm6/bed
ln -s /cluster/data/mm6/bed/pb.2005-04-20 pb
cd pb
# Define pep* tables in mm6 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql mm6 < pepAll.sql
# Build the pepMwAa table
hgsql proteins050315 -e "select info.acc, molWeight, aaSize from sp050315.info, sp050315.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql mm6 -e 'load data local infile "pepMwAa.tab" into table mm6.pepMwAa ignore 1 lines;'
# Build the pepPi table
hgsql proteins050315 -e "select info.acc from sp050315.info, sp050315.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis
pbCalPi protAcc.lis sp050315 pepPi.tab
hgsql mm6 -e 'load data local infile "pepPi.tab" into table mm6.pepPi;'
# Calculate and load pep distributions
pbCalDist sp050315 proteins050315 10090 mm6 >pbCalDist.out
cat pbCalDist.out
wc pbCalDist.out
hgsql mm6
load data local infile "pepExonCntDist.tab" into table mm6.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table mm6.pepCCntDist;
load data local infile "pepHydroDist.tab" into table mm6.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table mm6.pepMolWtDist;
load data local infile "pepResDist.tab" into table mm6.pepResDist;
load data local infile "pepIPCntDist.tab" into table mm6.pepIPCntDist;
load data local infile "pepPiDist.tab" into table mm6.pepPiDist;
quit
# Calculate frequency distributions
pbCalResStd 050315 10090 mm6
# Create pbAnomLimit and pbResAvgStd tables
hgsql mm6 < ~/src/hg/lib/pbAnomLimit.sql
hgsql mm6 < ~/src/hg/lib/pbResAvgStd.sql
hgsql mm6 -e 'load data local infile "pbResAvgStd.tab" into table mm6.pbResAvgStd;'
hgsql mm6 -e 'load data local infile "pbAnomLimit.tab" into table mm6.pbAnomLimit;'
# UPDATE kgSpAlias TABLE TO BE USED BY PB (Done 4/20/05)
cd /cluster/data/mm6/bed/pb
hgsql mm6 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql mm6 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >mm6.kgSpAlias.tab
rm j.tmp
hgsql mm6 -e 'drop table kgSpAlias';
hgsql mm6 < ~/src/hg/lib/kgSpAlias.sql
hgsql mm6 -e 'load data local infile "mm6.kgSpAlias.tab" into table kgSpAlias'
gzip mm6.kgSpAlias.tab
# Create pbStamp table for PB
hgsql mm6 < ~/src/hg/lib/pbStamp.sql
hgsql mm5 -e 'select * from pbStamp' > pbStamp.tab
hgsql mm6 -e 'delete from pbStamp'
hgsql mm6 -e 'load data local infile "pbStamp.tab" into table mm6.pbStamp ignore 1 lines;'
# ENABLE PROTEOME BROWSER FOR mm6 IN HGCENTRALTEST
echo "update dbDb set hgPbOk = 1 where name = 'mm6';" \
    | hgsql -h genome-testdb hgcentraltest
# Connect to genome-testdb and use hgcentraltest DB.
# Update the entry in gdbPdb table from mySql prompt: delete from gdbPdb where genomeDb='mm6'; insert into gdbPdb values('mm6', 'proteins050415'); # Adjust drawing parameters for Proteome Browser stamps Now invoke Proteome Browser and adjust various drawing parameters (mostly the ymax of each stamp) if necessary, by updating the pbStamp.tab file and then delete and reload the pbStamp table. # Perform preliminary review of Proteome Browser for mm6, then notify QA for formal review. ##################################################################### # MAP CONTIGS TRACK (DONE - 2005-04-21 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm6/bed/ctgPos cd /cluster/data/mm6/bed/ctgPos # hgCtgPos uses the lift files... but mouse lift files are for the # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs # from the assembly. (In the future, we should go with the NT's!) # So... just for this release, go straight from the seq_contig.md # to the table def'n: contig, size, chrom, chromStart, chromEnd # This script is an improvement from before, this is now doing the # randoms properly. cat << '_EOF_' > seqContigToCtgPos.pl #!/usr/bin/env perl use warnings; use strict; my $prevRandom=""; my $randomPosition=0; while(my $line=<>) { chomp($line); my @a = split('\s+',$line); if ($a[1] =~ m/\|/) { my @b = split('\|',$a[1]); if ($b[0] ne $prevRandom) { $randomPosition=0; $prevRandom=$b[0]; } my $size = $a[3]-$a[2]+1; my $start = $randomPosition; my $end = $randomPosition + $size; printf "%s\t%d\tchr%s_random\t%d\t%d\n", $a[5],$size,$b[0],$start,$end; if ($b[0] ne "Un") { $randomPosition += 50000; } else { $randomPosition += 1000; } $randomPosition += $size; } elsif ($a[5] =~ m/^N[TC]_\d+$/) { my $start = $a[2]-1; my $end = $a[3]; my $size = $end-$start; printf "%s\t%d\tchr%s\t%d\t%d\n", $a[5],$size,$a[1],$start,$end; } } '_EOF_' # emacs happy chmod +x seqContigToCtgPos.pl # /cluster/data/mm6/ncbi/seq_contig.md contains more than just C57BL/6J. # Filter those out with the grep. zcat ../../seq_contig.md.gz | grep C57BL | \ ./seqContigToCtgPos.pl > ctgPos.tab hgsql mm6 < ~/kent/src/hg/lib/ctgPos.sql hgsql mm6 -e 'load data local infile "ctgPos.tab" into table ctgPos;' featureBits -countGaps mm6 ctgPos # 2638893452 bases of 3079633452 (85.689%) in intersection featureBits -countGaps mm5 ctgPos # 2557081173 bases of 3164952073 (80.794%) in intersection ######################################################################### # BLASTZ HUMAN Hg16 (DONE - 2005-04-27 - 2005-04-29 - Hiram) # to replace the Mm4 chains and links on Hg16 since Mm4 is being # retired with this Mm6 release ssh eieio mkdir /cluster/data/mm6/bed/blastzHg16.2005_04_27 cd /cluster/data/mm6/bed ln -s blastzHg16.2005_04_27 blastz.hg16 cd blastzHg16.2005_04_27 cat << '_EOF_' > DEF # mouse vs. 
human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm6 SEQ1_DIR=/panasas/store/mm6/nib SEQ1_FLAG=-rodent SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInHuman SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Human Hg16 SEQ2_DIR=/scratch/hg/hg16/bothMaskedNibs SEQ2_SMSK=/scratch/hg/hg16/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzHg16.2005_04_27 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp /cluster/data/mm6/chrom.sizes ./S1.len sort -rn +1 /cluster/data/hg16/chrom.sizes > S2.len # establish a screen to control this job screen cd /cluster/data/mm6/bed/blastzHg16.2005_04_27 time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \ blast.run.out 2>&1 & # STARTED - 2005-04-27 12:20 # FINISHED - 2005-04-28 00:11 # real 712m59.223s # user 0m0.669s # sys 0m0.442s # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh eieio screen -d -r # STARTED - 2005-03-17 21:25 # FINISHED - 2005-03-18 14:00 # Completed: 44354 of 44354 jobs # CPU time in finished jobs: 16945019s 282416.99m 4706.95h 196.12d 0.537 y # IO & Wait Time: 2624756s 43745.93m 729.10h 30.38d 0.083 y # Average job time: 441s 7.35m 0.12h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 9471s 157.85m 2.63h 0.11d # Submission to last job: 35934s 598.90m 9.98h 0.42d # Completed: 331 of 331 jobs # CPU time in finished jobs: 274s 4.56m 0.08h 0.00d 0.000 y # IO & Wait Time: 1104s 18.40m 0.31h 0.01d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 15s 0.25m 0.00h 0.00d # Submission to last job: 101s 1.68m 0.03h 0.00d # Completed: 40 of 40 jobs # CPU time in finished jobs: 6328s 105.47m 1.76h 0.07d 0.000 y # IO & Wait Time: 551s 9.18m 0.15h 0.01d 0.000 y # Average job time: 172s 2.87m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 620s 10.33m 0.17h 0.01d # Submission to last job: 830s 13.83m 0.23h 0.01d # swap results to place mm6 alignments onto Hg16 ssh eieio cd /cluster/data/mm6/bed/blastzHg16.2005_04_27 time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \ swap.run.out 2>&1 & featureBits mm6 netHg16 # 2580637164 bases of 2597150411 (99.364%) in intersection featureBits mm6 netHg17 # 2579747741 bases of 2597150411 (99.330%) in intersection featureBits mm6 chainHg16 # 2597476551 bases of 2597150411 (100.013%) in intersection featureBits mm6 chainHg17 # 2596946329 bases of 2597150411 (99.992%) in intersection featureBits hg16 netMm6 # 2890452713 bases of 2865248791 (100.880%) in intersection featureBits hg16 chainMm6 # 2913361200 bases of 2865248791 (101.679%) in intersection HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainHg16Link # 966699669 bases of 2597150411 (37.222%) in intersection HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainHg17Link # 966916309 bases of 2597150411 (37.230%) in intersection HGDB_CONF=~/.hg.conf.read-only featureBits hg16 chainMm6Link # 969979195 bases of 2865248791 (33.853%) in intersection HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainMm5Link # 1020106336 bases of 2866216770 (35.591%) in intersection #### Blat knownGene proteins to determine exons (braney 2005-05-20 DONE) ssh hgwdev cd /cluster/data/mm6/bed mkdir 
blat.mm6KG.2005-05-02 rm blat.mm6KG ln -s blat.mm6KG.2005-05-02 blat.mm6KG cd blat.mm6KG pepPredToFa mm6 knownGenePep known.fa hgPepPred mm6 generic blastKGPep03 known.fa grep ">" known.fa | sed "s/>//" > kgName.lst ssh kk cd /cluster/data/mm6/bed/blat.mm6KG cat << '_EOF_' > blatSome #!/bin/csh -fe /cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3 '_EOF_' # << keep emacs happy chmod +x blatSome ls -1S /panasas/store/mm6/nib/*.nib > mouse.lst mkdir kgfa cd kgfa faSplit sequence ../known.fa 3000 kg cd .. ls -1S kgfa/*.fa > kg.lst cat << '_EOF_' > blatGsub #LOOP blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy gensub2 mouse.lst kg.lst blatGsub blatSpec mkdir psl cd psl foreach i (`cat ../mouse.lst`) mkdir `basename $i .nib` end cd .. para create blatSpec para push # Completed: 115720 of 115720 jobs # CPU time in finished jobs: 14938417s 248973.62m 4149.56h 172.90d 0.474 y # IO & Wait Time: 2116275s 35271.25m 587.85h 24.49d 0.067 y # Average job time: 147s 2.46m 0.04h 0.00d # Longest finished job: 9235s 153.92m 2.57h 0.11d # Submission to last job: 25264s 421.07m 7.02h 0.29d ssh eieio cd /cluster/data/mm6/bed/blat.mm6KG pslSort dirs raw.psl /tmp psl/* pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null pslUniq cooked.psl mm6KG.psl pslxToFa mm6KG.psl mm6KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft ssh hgwdev kgName mm6 mm6KG.psl blastKGRef03 hgsql mm6 < ~/kent/src/hg/lib/blastRef.sql echo "rename table blastRef to blastKGRef03" | hgsql mm6 echo "load data local infile 'blastKGRef03' into table blastKGRef03" | hgsql mm6 # LOAD GENEID GENES (DONE 5/16/05 angie) mkdir -p /cluster/data/mm6/bed/geneid/download cd /cluster/data/mm6/bed/geneid/download foreach chr (`awk '{print $1;}' ../../../chrom.sizes`) echo $chr wget \ http://genome.imim.es/genepredictions/M.musculus/mmMar2005/geneid_v1.2/$chr.gtf wget \ http://genome.imim.es/genepredictions/M.musculus/mmMar2005/geneid_v1.2/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. 
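# A quick consistency check (a sketch, assuming the fixed .prot files sit in
# download/ as above): the number of protein records should match the number
# of distinct transcript_id's in the corresponding gtf now that the ".1"
# suffix has been added.
grep -c "^>" download/chr1-fixed.prot
sed -n 's/.*transcript_id "\([^"]*\)".*/\1/p' download/chr1.gtf | sort -u | wc -l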
ldHgGene -genePredExt -gtf mm6 geneid download/*.gtf
hgPepPred mm6 generic geneidPep download/*-fixed.prot
featureBits mm6 -enrichment refGene geneid
#refGene 1.623%, geneid 1.561%, both 0.794%, cover 48.91%, enrich 31.34x

##############################################################################
# CLONE ENDS - BACEND TRACK (started - 2005-05-20 - Heather)
ssh kkstore01
cd /cluster/data/mm6
# check disk space: 120 Gigs free
cd bed
mkdir cloneend
cd cloneend
mkdir ncbi
cd ncbi
ftp ftp.ncbi.nih.gov
# anonymous login
cd genomes/CLONEEND/mus_musculus
binary
prompt
mget *
gunzip *
# 650 megs
# seems like the *.mfa files were split just for convenience
# concatenate
foreach f (*.mfa)
  cat $f >> all.mfa
end
# Convert the title line of the all.mfa file
# Location of perl different on kkstore01
cat << '_EOF_' > convert.pl
#!/usr/bin/perl -w
use strict;
while (my $line = <>) {
  if (substr($line,0,1) ne ">") {
    print $line;
  } else {
    my @fields = split(/\|/, $line);
    my $printed = 0;
    for (my $i = 0; $i < $#fields; $i++) {
      # the accession follows the "gb" or "dbj" token in the header
      if ($fields[$i] eq "gb" || $fields[$i] eq "dbj") {
        (my $name, my $vers) = split(/\./,$fields[$i+1]);
        print ">$name\n";
        $i = $#fields;
        $printed = 1;
      }
    }
    if (!$printed) {
      die("Failed for $line\n");
    }
  }
}
'_EOF_'
# << for emacs
chmod +x convert.pl
./convert.pl < all.mfa > cloneEnds.fa
# check that files still have the same number of lines
# expecting 8290734
wc -l all.mfa
wc -l cloneEnds.fa
# concatenate the text files, too
foreach f (*.txt)
  cat $f >> all.txt
end
# generate cloneEndPairs.txt and cloneEndSingles.txt
./convertTxt.pl all.txt
# Reading in end info
# Writing out pair info
# Writing out singleton info
# 354485 pairs and 78424 singles
# a bit of cleanup
mkdir archive
mv 10090* archive
# split
mkdir splitdir
faSplit sequence cloneEnds.fa 100 cloneEnds
mkdir /cluster/bluearc/scratch/mus/mm6
mkdir /cluster/bluearc/scratch/mus/mm6/cloneEnds
mv cloneEnds???.fa /cluster/bluearc/scratch/mus/mm6/cloneEnds
cp -p cloneEnds.fa /cluster/bluearc/scratch/mus/mm6/cloneEnds
# request updateLocal to make available on /scratch/mus/mm6/cloneEnds on the cluster
# load sequences
ssh hgwdev
cd /gbdb/mm6
mkdir cloneend
cd cloneend
ln -s /cluster/data/mm6/bed/cloneend/ncbi/cloneEnds.fa .
cd /tmp
hgLoadSeq mm6 /gbdb/mm6/cloneend/cloneEnds.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/mm6/cloneend/cloneEnds.fa
# 789467 sequences
# Updating seq table
# Advisory lock has been released
# All done

############################################################################
# BACEND SEQUENCE ALIGNMENTS (DONE - 2005-06-02 - Hiram)
ssh kkstore01
mkdir /cluster/data/mm6/noMask
cd /cluster/data/mm6/
# Need an unmasked sequence for this work
for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
do
  C=`basename ${CHR}`
  echo -n "working ${C} ...
" head -1 ${CHR} > noMask/${C} tail +2 ${CHR} | tr [:lower:] [:upper:] >> noMask/${C} echo "done" done mkdir ooc ls noMask/chr*.fa > fa.list blat -makeOoc=ooc/11.ooc -repMatch=1024 fa.list fa.list output.psl # Wrote 25952 overused 11-mers to ooc/11.ooc ssh kkr1u00 mkdir /iscratch/i/mm6/ooc cp -p /cluster/data/mm6/ooc/11.ooc /iscratch/i/mm6/ooc mkdir /iscratch/i/mm6/noMask cp -p /cluster/data/mm6/noMask/chr*.fa /iscratch/i/mm6/noMask for U in 2 3 4 5 6 7 8 do rsync -a --progress /iscratch/i/mm6/ooc/ kkr${U}u00:/iscratch/i/mm6/ooc rsync -a --progress /iscratch/i/mm6/noMask/ \ kkr${U}u00:/iscratch/i/mm6/noMask echo "done kkr${U}u00" done # allow blat to run politely in /tmp while it writes output, then # copy results to results file: ssh kk mkdir /cluster/data/mm6/bed/bacends cd /cluster/data/mm6/bed/bacends cat << '_EOF_' > runBlat.sh #!/bin/sh path1=$1 path2=$2 root1=$3 root2=$4 result=$5 rm -fr /tmp/${root1}_${root2} mkdir /tmp/${root1}_${root2} pushd /tmp/${root1}_${root2} /cluster/bin/i386/blat ${path1} ${path2} -ooc=/iscratch/i/mm6/ooc/11.ooc \ ${root1}.${root2}.psl popd rm -f ${result} mv /tmp/${root1}_${root2}/${root1}.${root2}.psl ${result} rm -fr /tmp/${root1}_${root2} '_EOF_' # << emacs happy chmod +x runBlat.sh cat << '_EOF_' > template #LOOP ./runBlat.sh {check in exists $(path1)} {check in exists $(path2)} $(root1) $(root2) {check out line+ bacEnds.out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # << emacs happy ls -1S /scratch/mus/mm6/cloneEnds/cloneEnds???.fa > bacEnds.lst mkdir bacEnds.out # create results directories for each to avoid the all result files in # one directory problem foreach f (`cat bacEnds.lst`) set b = $f:t:r echo $b mkdir bacEnds.out/$b end ls -1S /iscratch/i/mm6/noMask/chr*.fa > contig.lst gensub2 contig.lst bacEnds.lst template jobList para create jobList # 7850 jobs written to batch para try, check, push, etc ... # Completed: 3920 of 3920 jobs # CPU time in finished jobs: 2681337s 44688.95m 744.82h 31.03d 0.085 y # IO & Wait Time: 110523s 1842.05m 30.70h 1.28d 0.004 y # Average job time: 712s 11.87m 0.20h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 5203s 86.72m 1.45h 0.06d # Submission to last job: 6402s 106.70m 1.78h 0.07d ssh kkstore01 cd /cluster/data/mm6/bed/bacends screen mkdir temp time pslSort dirs raw.psl temp bacEnds.out/* > pslSort.out 2>&1 & # real 27m20.352s # user 20m10.329s # sys 1m55.287s time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons \ raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 & # real 8m15.671s # user 7m18.229s # sys 0m20.554s cp -p ~booch/clusterJobs/bacends/split.pl . cp -p ~booch/clusterJobs/bacends/header . 
time ./split.pl header < bacEnds.psl cp -p bacEnds.psl bacEnds.psl.save time pslSort dirs bacEnds.psl temp split # ~ 3 minutes # Copy files to final destination and remove mkdir /cluster/data/mm6/bacends cp -p bacEnds.psl /cluster/data/mm6/bacends ############################################################################ # BACEND PAIRS TRACK (DONE 2005-06-02 - Hiram) ssh kolossus cd /cluster/data/mm6/bacends time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.psl \ ../bed/cloneend/ncbi/cloneEndPairs.txt all_bacends bacEnds # create header required by "rdb" tools echo -e \ "chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header cat header bacEnds.pairs | \ /cluster/bin/scripts/row score ge 300 | \ /cluster/bin/scripts/sorttbl chr start | \ /cluster/bin/scripts/headchg -del > bacEndPairs.bed cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \ /cluster/bin/scripts/sorttbl chr start | \ /cluster/bin/scripts/headchg -del > bacEndPairsBad.bed /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \ bacEndPairsBad.bed >j1.out cat j1.out| /cluster/bin/scripts/sorttbl tname tstart >j2.out cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl rm j1.out j2.out # load into database ssh hgwdev cd /cluster/data/mm6/bacends # CHECK bacEndPairs.bed ID's to make sure they have no blanks in them awk '{print $5}' bacEndPairs.bed | sort -u # result should be the scores, no extraneous strings: # 1000 # 300 # 375 # 500 # 750 # edit the file and fix it if it has a bad name. 
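# One way to automate that check (a sketch, hypothetical helper commands):
# print any row whose score column is not purely numeric; clone names with
# embedded blanks shift the columns and will show up here.
awk '$5 !~ /^[0-9]+$/ {print NR": "$0}' bacEndPairs.bed | head
# expect no output; otherwise edit those rows before loading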
hgLoadBed -notItemRgb mm6 bacEndPairs bacEndPairs.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 226279 elements of size 11 # note - this track isn't pushed to RR, just used for assembly QA hgLoadBed -notItemRgb mm6 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 104221 elements of size 11 # NOTE: truncates file to 0 if -nobin is used hgLoadPsl mm6 -table=all_bacends bacEnds.load.psl # load of all_bacends did not go as planned: 9046691 record(s), 0 row(s) # skipped, 37 warning(s) loading psl.tab # real 36m1.178s # user 5m55.630s # sys 0m41.380s # featureBits mm6 all_bacends # 336981828 bases of 2597150411 (12.975%) in intersection # featureBits mm5 all_bacends # 268502414 bases of 2615483787 (10.266%) in intersection # featureBits mm4 all_bacends # 243096171 bases of 2627444668 (9.252%) in intersection # featureBits mm6 bacEndPairs # 2570768812 bases of 2597150411 (98.984%) in intersection # featureBits mm5 bacEndPairs # 2567958504 bases of 2615483787 (98.183%) in intersection # featureBits mm4 bacEndPairs # 2549945356 bases of 2627444668 (97.050%) in intersection # featureBits mm6 bacEndPairsBad # 1006314997 bases of 2597150411 (38.747%) in intersection # featureBits mm5 bacEndPairsBad # 541027882 bases of 2615483787 (20.686%) in intersection # featureBits mm4 bacEndPairsBad # 1074505863 bases of 2627444668 (40.895%) in intersection # SGP GENES (DONE 5/25/05 angie) ssh hgwdev mkdir /cluster/data/mm6/bed/sgp cd /cluster/data/mm6/bed/sgp foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genome.imim.es/genepredictions/M.musculus/mmMar2005/SGP/humangp200405/$chr.gtf wget http://genome.imim.es/genepredictions/M.musculus/mmMar2005/SGP/humangp200405/$chr.prot end # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf cp /dev/null sgpPep.fa foreach f (chr*.prot) nice perl -wpe 's/^(>chr\S+)/$1.1/' $f >> sgpPep.fa end ldHgGene -gtf -genePredExt mm6 sgpGene chr*.gtf hgPepPred mm6 generic sgpPep sgpPep.fa featureBits mm6 -enrichment refGene:CDS sgpGene #refGene:CDS 0.983%, sgpGene 1.421%, both 0.841%, cover 85.57%, enrich 60.21x # SGP GENES (UPDATE 1/18/2006) sgpPep table dropped, replaced by hgc generated protein seq in browser ############################################################################ # RE-BUILD KNOWN GENES RELATED TABLES for mm6 (STARTED 5/26/05, DONE6/1/05. Fan) # First build protein databases, sp050415 and proteins050415 # See makeProteins050415.doc for details. # Please note that the protein and displayId tables in sp050415 have data of variant splice proteins. 
# Create working subdirectories and temporary databases ssh hgwdev cd /cluster/store10/kg mkdir kgMm6B ln -s /cluster/store10/kg/kgMm6B /cluster/store6/kgDB/bed/kgMm6B ln -s /cluster/store10/kg/kgMm6B /cluster/data/mm6/bed/kgMm6B hgsql mm6 -e "create database kgMm6B" hgsql mm6 -e "create database kgMm6BTemp" mkdir /cluster/bluearc/kgDB/kgMm6B mkdir /cluster/bluearc/kgDB/kgMm6B/protBlat ln -s /cluster/bluearc/kgDB/kgMm6B/protBlat /cluster/store10/kg/kgMm6B/protBlat cd /cluster/store10/kg/kgMm6B/protBlat # Get all human protein sequences hgsql -N sp050415 -e \ 'select proteins050415.spXref3.accession,protein.val from proteins050415.spXref3,protein where division="10090" and acc=accession' \ |awk '{print ">" $1;print $2}' >mm6Prot.fa # Prepare and perform cluster run for protein/genome alignment ssh kk cd /cluster/data/mm6/bed/kgMm6B/protBlat mkdir prot faSplit sequence mm6Prot.fa 1000 prot/prot ls /cluster/bluearc/kgDB/kgMm6B/protBlat/prot/* > prot.lis ssh hgwdev cd /cluster/data/mm6/bed/kgMm6B/protBlat hgsql mm6 -N -e 'select chrom from chromInfo' > chrom.lis exit cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -noHead -t=dnax -q=prot /cluster/data/mm6/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm6B/protBlat/result/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' mkdir result gensub2 chrom.lis prot.lis gsub jobList para create jobList para try para check para push para check ... # Completed: 31386 of 39600 jobs # Crashed: 8214 jobs # CPU time in finished jobs: 32377544s 539625.74m 8993.76h 374.74d 1.027 y # IO & Wait Time: 727341s 12122.34m 202.04h 8.42d 0.023 y # Average job time: 1055s 17.58m 0.29h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 34182s 569.70m 9.49h 0.40d # Submission to last job: 57659s 960.98m 16.02h 0.67d # Many output .psl files are empty, these warnings are OK. # Check to see if there is any other error type. para problems |grep empty|wc # 8214 24642 642357 # collect BLAT results ssh hgwdev cd /cluster/data/mm6/bed/kgMm6B/protBlat mkdir result2 mkdir result3 cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall cat << '_EOF_' > do1.1 echo processing $1 cat result/$1_prot*.psl >result2/$1.psl '_EOF_' cat << '_EOF_' > do1.2 echo processing $1 pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out '_EOF_' chmod +x do* cp do1.1 do1 doall cp do1.2 do1 doall cat result3/*.psl >protBlat.psl hgLoadPsl mm6 protBlat.psl # Processing protBlat.psl # load of protBlat did not go as planned: 82296 record(s), 0 row(s) skipped, 750 warning(s) loading psl.tab # Looked into the cause of the warnings before and found that it was due to that qBaseInsert # and tBaseInsert have negative values, probably due to that this is protein alignment. # Remember to remove result2 and result3 when KG is built and validated. 
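# The warnings mentioned above can be quantified before loading (a minimal
# sketch; protBlat.psl is headerless 21-column psl, so qBaseInsert is field 6
# and tBaseInsert is field 8):
awk '$6 < 0 || $8 < 0 {n++} END {print n+0, "alignments with negative inserts"}' protBlat.psl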
cd /cluster/data/mm6/bed/kgMm6B # create all_mrna.psl and tight_mrna.psl hgsql mm6 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 all_mrna.psl tight_mrna.psl /dev/null # Processed 194640 alignments # Use overlapSelect to get protein and mRNA alignment overlaps overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \ -selectFmt=psl -inFmt=psl tight_mrna.psl protBlat/protBlat.psl protMrna.stat overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \ -inFmt=psl tight_mrna.psl protBlat/protBlat.psl protMrna.out # Create protein/mRNA pair and protein lists cut -f 10,31 protMrna.out|sort -u >spMrna.tab cut -f 10 protMrna.out|sort -u >protein.lis # Load spMrna.tab into spMrna table in temp DB. hgsql kgMm6BTemp < ~/src/hg/lib/spMrna.sql hgsql kgMm6BTemp -e 'load data local infile "spMrna.tab" into table spMrna' hgsql kgMm6BTemp -e 'create index mrnaID on spMrna(mrnaID)' # Prepare and perform cluster run of protein/mRNA alignment # Get mRNA fa file. cd /cluster/data/mm6/bed/kgMm6B /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm6 \ -gbRoot=/cluster/data/genbank genbank mrna mrna.fa # Create mrnaSeq table in kgMm6BTemp DB. hgFaToTab mrna.fa mrnaSeq.tab hgsql kgMm6BTemp -e 'drop table mrnaSeq' hgsql kgMm6BTemp <~/src/hg/lib/mrnaSeq.sql hgsql kgMm6BTemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq' rm mrnaSeq.tab # Prepare files for cluster run ~/src/hg/protein/KG2.sh kgMm6B mm6 050415 # Perform cluster run of protein/mRNA alignment ~/src/hg/protein/KG4.sh kgMm6B mm6 050415 # Collect cluster run results cd kgBestMrna ls out | sed -e 's/prot/do1 prot/g' >doall # create do1 with the following 2 lines: cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protMrnaRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis wc protMrna.lis # Load BLAT results into temp DB. hgsql kgMm6BTemp < ~/src/hg/lib/protMrnaBlat.sql hgsql kgMm6BTemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat' hgsql kgMm6BTemp -e 'create index tName on protMrnaBlat(tName)' # Create CDS files from protein/mRNA alignment results. hgsql kgMm6BTemp -N -e \ 'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\ |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds # Create protMrna.psl with proteinID_mrnaID as query ID. cut -f 22-30 ../protMrna.out > j1.tmp cut -f 32-42 ../protMrna.out > j2.tmp cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp paste j1.tmp j3.tmp j2.tmp >protMrna.psl rm j1.tmp j2.tmp j3.tmp # Run mrnaToGene to create protMrna.gp bash mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log exit # Prepare refGene and all_mrna gp files. cd .. hgsql mm6 -N -e 'select * from refGene' >ref.gp hgsql mm6 -N -e \ 'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \ |sort -u > all_mrna.cds bash mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log exit # Align proteins to RefSeq. 
overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp ref.stat overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp protRef.gp overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\ -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out cut -f 10,22 protRef.out | sort -u >spRef.tab cut -f 10 protRef.out | sort -u >protRef.lis hgsql kgMm6BTemp -e 'drop table spRef' hgsql kgMm6BTemp <~/src/hg/lib/spRef.sql hgsql kgMm6BTemp -e 'load data local infile "spRef.tab" into table spRef' # Prepare and perform cluster runs for protein/RefSeq alignments ~/src/hg/protein/KGRef2.sh kgMm6B mm6 050415 ~/src/hg/protein/KGRef3.sh kgMm6B mm6 050415 cd kgBestRef ls out | sed -e 's/prot/do1 prot/g' >doall cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protRefRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments. pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis wc protRef.lis hgsql kgMm6BTemp -e 'drop table protRefBlat' hgsql kgMm6BTemp < ~/src/hg/lib/protRefBlat.sql hgsql kgMm6BTemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat' hgsql kgMm6BTemp -e 'create index tName on protRefBlat(tName)' # Run gene-check to filter out invalid gp entries cd /cluster/data/mm6/bed/kgMm6B cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir \ /cluster/data/mm6/nib kgCandidate0.gp kgCandidate0.check hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidate0.sql hgsql kgMm6BTemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0' hgsql kgMm6BTemp < ~/src/hg/lib/geneCheck.sql hgsql kgMm6BTemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines' # Run kgCheck to get all KG candidates that pass the KG gene check criteria kgCheck kgMm6BTemp mm6 kgCandidate0 geneCheck kgCandidate.tab hgsql kgMm6BTemp -e 'drop table kgCandidate' hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidate.sql hgsql kgMm6BTemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate' hgsql kgMm6BTemp -e 'create index alignID on kgCandidate(alignID)' # Construct the kgCandidateX table that has alignID in the name field. cut -f 2-10 kgCandidate.tab >j2.tmp cut -f 11 kgCandidate.tab >j1.tmp paste j1.tmp j2.tmp >kgCandidateX.tab hgsql kgMm6BTemp -e 'drop table kgCandidateX' hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidateX.sql hgsql kgMm6BTemp -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX' # Score protein/mRna and protein/RefSeq alignments kgResultBestMrna2 050415 kgMm6BTemp mm6|sort -u >protMrnaBlatScore.tab kgResultBestRef2 050415 kgMm6BTemp mm6|sort -u >protRefScore.tab # Combine scoring results and load them into temp DB. 
cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab hgsql kgMm6BTemp -e 'drop table protMrnaScore' hgsql kgMm6BTemp < ~/src/hg/lib/protMrnaScore.sql hgsql kgMm6BTemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore' hgsql kgMm6BTemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)' # Run kgGetCds to get CDS structure of each gene kgGetCds kgMm6BTemp kgCandidateX jY.tmp cat jY.tmp |sort -u >kgCandidateY.tab rm jY.tmp hgsql kgMm6BTemp -e 'drop table kgCandidateY' hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidateY.sql hgsql kgMm6BTemp -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY' # Run kgPickPrep to replace long cds structure string with cdsId. kgPickPrep kgMm6BTemp kgCandidateZ.tab hgsql kgMm6BTemp -e 'drop table kgCandidateZ' hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidateZ.sql hgsql kgMm6BTemp -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ' hgsql kgMm6BTemp -e 'create index cdsId on kgCandidateZ(cdsId)' # Run kgPick to pick the representative a mrna/protein pair for each unique CDS structure. kgPick kgMm6BTemp mm6 proteins050415 kg4.tmp dupSpMrna.tmp sort -u dupSpMrna.tmp >dupSpMrna.tab hgsql mm6 -e 'drop table dupSpMrna' hgsql mm6 < ~/src/hg/lib/dupSpMrna.sql hgsql mm6 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna' # Sort KG genes to make the kg4.gp table file. ~/kent/src/hg/protein/sortKg.pl kg4.tmp >kg4.gp hgsql kgMm6BTemp -e 'drop table knownGene' hgsql kgMm6BTemp < ~/src/hg/lib/knownGene.sql hgsql kgMm6BTemp -e 'load data local infile "kg4.gp" into table knownGene' hgsql mm6 -e 'drop table kg4' hgsql mm6 < ~/src/hg/lib/kg4.sql hgsql mm6 -e 'load data local infile "kg4.gp" into table kg4' # Perform analysis before loading kg4 table data to mm6.knownGene table. # Load data into mm6 knownGene table. hgsql mm6 -e 'drop table knownGene' hgsql mm6 < ~/src/hg/lib/knownGene.sql hgsql mm6 -e 'load data local infile "kg4.gp" into table knownGene' # Build knownGeneMrna and knownGenePep tables. 
kgPepMrna kgMm6BTemp mm6 050415
hgsql mm6 -e 'drop table knownGeneMrna'
hgsql mm6 < ~/src/hg/lib/knownGeneMrna.sql
hgsql mm6 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
hgsql mm6 -e 'drop table knownGenePep'
hgsql mm6 < ~/src/hg/lib/knownGenePep.sql
hgsql mm6 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'
# Build kgXref table
kgXref2 kgMm6BTemp 050415 mm6
hgsql mm6 -e 'drop table kgXref'
hgsql mm6 < ~/src/hg/lib/kgXref.sql
hgsql mm6 -e 'load data local infile "kgXref.tab" into table kgXref'
# Build spMrna table
hgsql mm6 -N -e 'select name, proteinID from knownGene' >kgSpMrna.tab
hgsql mm6 -e 'drop table spMrna'
hgsql mm6 <~/src/hg/lib/spMrna.sql
hgsql mm6 -e 'load data local infile "kgSpMrna.tab" into table spMrna'
# Build mrnaRefseq table
cd /cluster/store10/entrez
mkdir 050601
rm /cluster/data/entrez
ln -s /cluster/store10/entrez/050601 /cluster/data/entrez
cd /cluster/data/entrez
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
gzip -d *.gz
cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab
hgsql entrez -e 'drop table entrezRefseq'
hgsql entrez -e 'drop table entrezMrna'
hgsql entrez -e 'drop table entrezRefProt'
hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
hgsql entrez < ~/src/hg/lib/entrezMrna.sql
hgsql entrez < ~/src/hg/lib/entrezRefProt.sql
hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'
hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \
>mrnaRefseq.tab
hgsql mm6 -e 'drop table mrnaRefseq'
hgsql mm6 < ~/src/hg/lib/mrnaRefseq.sql
hgsql mm6 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'
# Build kgProtMap table
~/src/hg/protein/kgProtMap2.sh kgMm6B mm6 050415
# Update and clean up kgResultBestMrna2.c and then check it in.
# Build alias tables.
# kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases # proteins050415.hugo.withdraws, mm6.kgXref.kgID # to create kgAliasM.tab and geneAlias.tab # by picking out those kgID items from kgXref where # kgXref.geneSymbol == hugo.symbol cd /cluster/store10/kg/kgMm6B mkdir alias cd alias kgAliasM mm6 proteins050415 # kgAliasKgXref reads from mm6.knownGene.proteinID, # mm6.knownGene.name, mm6.kgXref.geneSymbol # to create kgAliasKgXref.tab kgAliasKgXref mm6 # kgAliasRefseq reads from mm6.knownGene.name, # mm6.knownGene.proteinID, mm6.kgXref.refseq # to create kgAliasRefseq.tab kgAliasRefseq mm6 hgsql sp050415 -N -e 'select name,gene.val from mm6.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \ | sort -u > kgAliasP.tab hgsql mm6 -N -e 'select name, name from knownGene' >kgAliasDup.tab hgsql mm6 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \ sort |uniq > kgAlias.tab hgsql -e "drop table kgAlias;" mm6 hgsql mm6 < ~/kent/src/hg/lib/kgAlias.sql hgsql mm6 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' # kgProtAlias reads from mm6.knownGene.name, # mm6.knownGene.proteinID, mm6.knownGene.alignID, # proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb # to create kgProtAlias.tab kgProtAlias mm6 050415 hgsql mm6 -N -e \ 'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\ | sort -u >kgProtAliasNCBI.tab # include variant splice protein IDs hgsql mm6 -N -e \ 'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\ |sort -u >kgProtAliasDup.tab # include duplicate protein IDs from dupSpMrna table hgsql mm6 -N -e \ 'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\ |sort -u >>kgProtAliasDup.tab # catch parent acc from dupProteinID too hgsql mm6 -N -e\ 'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\ |sort -u >>kgProtAliasDup.tab cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab echo "`date` creating table kgProtAlias" hgsql mm6 -e "drop table kgProtAlias;" hgsql mm6 <~/src/hg/lib/kgProtAlias.sql; hgsql mm6 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;' # Build kgSpAlias table hgsql mm6 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm6 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >mm6.kgSpAlias.tab rm j.tmp hgsql mm6 -e 'drop table kgSpAlias'; hgsql mm6 < ~/src/hg/lib/kgSpAlias.sql hgsql mm6 -e 'load data local infile "mm6.kgSpAlias.tab" into table kgSpAlias' # MAKE FOLDUTR TABLES (DONE 2005-05-31 Fan) # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev cd /cluster/data/mm6/bed mkdir rnaStruct.2005-05-31 rm rnaStruct ln -s rnaStruct.2005-05-31 rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa mm6 knownGene utr3 utr3/utr.fa utrFa mm6 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. 
ssh kk cd /cluster/data/mm6/bed/rnaStruct faSplit sequence utr3/utr.fa 50000 utr3/split/s faSplit sequence utr5/utr.fa 50000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/panasas/home/store/mm6/blastp/known \ -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 58630s 977.16m 16.29h 0.68d 0.002 y # IO & Wait Time: 39839s 663.99m 11.07h 0.46d 0.001 y # Average job time: 13s 0.21m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 116s 1.93m 0.03h 0.00d # Submission to last job: 188s 3.13m 0.05h 0.00d # Load into database. This takes about an hour. ssh hgwdev cd /cluster/data/mm6/bed/geneSorter/blastp/self/run/out hgLoadBlastTab mm6 knownBlastTab *.tab # Scanning through 7729 files # Loading database with 3391069 rows # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes an hour.) # DONE (05-04-15 Fan) hgMapToGene mm6 affyGnf1m knownGene knownToGnf1m hgExpDistance mm6 hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m # Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio # Got 20114 unique elements in hgFixed.gnfMouseAtlas2MedianRatio # Create table that maps between known genes and RefSeq hgMapToGene mm6 refGene knownGene knownToRefSeq # may need to build this command in src/hg/near/hgMapToGene # Create a table that maps between known genes and # the nice affy expression data. hgMapToGene mm6 affyU74 knownGene knownToU74 hgMapToGene mm6 affyMOE430 knownGene knownToMOE430 hgMapToGene mm6 affyMOE430 -prefix=A: knownGene knownToMOE430A # Format and load Rinn et al sex expression data mkdir /cluster/data/mm6/bed/rinnSex cd !$ hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \ ../affyMOE430/affyMOE430.psl hgLoadBed mm6 rinnSex rinnSex.bed # Format and load the GNF data mkdir /cluster/data/mm6/bed/affyGnf95 cd /cluster/data/mm6/bed/affyGnf95 affyPslAndAtlasToBed -newType ../affyU95.psl \ /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceeding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. # Everything below this seems to create tables OK. # hgsql mm6 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. 
These commands take about 15 minutes each # The affyGnfU74?Exps arguments appear to be unused in # hgExpDistance cd /cluster/data/mm6/bed/geneSorter hgExpDistance mm6 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74 # Got 10157 unique elements in affyGnfU74A hgExpDistance mm6 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74 # Got 6076 unique elements in affyGnfU74B hgExpDistance mm6 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74 # Got 1793 unique elements in affyGnfU74C # C.ELEGANS BLASTP FOR GENE SORTER # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to iscratch/i # if it doesn't exist already: ssh eieio mkdir /cluster/data/ce2/bed/blastp cd /cluster/data/ce2/bed/blastp # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/ # to find out the latest version. Then use that in place of 142 below. wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142 formatdb -i wormPep142.faa -t wormPep142 -n wormPep142 ssh kkr1u00 if (-e /iscratch/i/ce2/blastp) then rm -r /iscratch/i/ce2/blastp endif mkdir -p /iscratch/i/ce2/blastp cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm6/bed/blastp/ce2/run/out cd /cluster/data/mm6/bed/blastp/ce2/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 40061s 667.69m 11.13h 0.46d 0.001 y # IO & Wait Time: 21049s 350.81m 5.85h 0.24d 0.001 y # Average job time: 8s 0.13m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 33s 0.55m 0.01h 0.00d # Submission to last job: 134s 2.23m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm6/bed/blastp/ce2/run/out hgLoadBlastTab mm6 ceBlastTab -maxPer=1 *.tab # HUMAN BLASTP FOR GENE SORTER (DONE 4/18/05 Fan) # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/hg17/bed/blastp cd /cluster/data/hg17/bed/blastp pepPredToFa hg17 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/hg17/blastp) then rm -r /iscratch/i/hg17/blastp endif mkdir -p /iscratch/i/hg17/blastp cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm6/bed/blastp/hg17/run/out cd /cluster/data/mm6/bed/blastp/hg17/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 81526s 1358.76m 22.65h 0.94d 0.003 y # IO & Wait Time: 23670s 394.51m 6.58h 0.27d 0.001 y # Average job time: 14s 0.23m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 84s 1.40m 0.02h 0.00d # Submission to last job: 185s 3.08m 0.05h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm6/bed/blastp/hg17/run/out hgLoadBlastTab mm6 hgBlastTab -maxPer=1 *.tab # ZEBRAFISH BLASTP FOR GENE SORTER # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. # First make protein database and copy it to iscratch/I # The below is done by hg17, that section from makeHg17.doc is copied here. 
ssh kkstore mkdir /cluster/data/danRer2/bed/blastp cd /cluster/data/danRer2/bed/blastp wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.apr.pep.fa.gz zcat Dan*.pep.fa.gz > ensembl.faa formatdb -i ensembl.faa -t ensembl -n ensembl ssh kkr1u00 if (-e /iscratch/i/danRer2/blastp) then rm -r /iscratch/i/danRer2/blastp endif mkdir -p /iscratch/i/danRer2/blastp cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp iSync # The above is copied from makeHg17.doc. # Make parasol run directory ssh kk mkdir -p /cluster/data/mm6/bed/blastp/danRer2/run/out cd /cluster/data/mm6/bed/blastp/danRer2/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 72894s 1214.89m 20.25h 0.84d 0.002 y # IO & Wait Time: 21284s 354.74m 5.91h 0.25d 0.001 y # Average job time: 12s 0.20m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 73s 1.22m 0.02h 0.00d # Submission to last job: 176s 2.93m 0.05h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm6/bed/blastp/danRer2/run/out hgLoadBlastTab mm6 drBlastTab -maxPer=1 *.tab # YEAST BLASTP FOR GENE SORTER # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on # RefSeq. First make protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/sacCer1/bed/blastp cd /cluster/data/sacCer1/bed/blastp wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz zcat orf_trans.fasta.gz > sgdPep.faa formatdb -i sgdPep.faa -t sgdPep -n sgdPep ssh kkr1u00 # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, # fortunately we won't be looking for homologs there. :) if (-e /iscratch/i/sacCer1/blastp) then rm -r /iscratch/i/sacCer1/blastp endif mkdir -p /iscratch/i/sacCer1/blastp cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm6/bed/blastp/sacCer1/run/out cd /cluster/data/mm6/bed/blastp/sacCer1/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 11663s 194.38m 3.24h 0.13d 0.000 y # IO & Wait Time: 20479s 341.32m 5.69h 0.24d 0.001 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 11s 0.18m 0.00h 0.00d # Submission to last job: 143s 2.38m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm6/bed/blastp/sacCer1/run/out hgLoadBlastTab mm6 scBlastTab -maxPer=1 *.tab # DM1 BLASTP FOR GENE SORTER (DONE 5/30/05, Fan) # Make Drosophila melanagaster ortholog column using blastp on FlyBase. # First make protein database and copy it to iscratch/i # if it doesn't exist already: # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # ssh kkr1u00 # if (-e /iscratch/i/dm1/blastp) then # rm -r /iscratch/i/dm1/blastp # endif # mkdir -p /iscratch/i/dm1/blastp # cp /cluster/data/dm1/bed/blastp/bdgp.p?? 
/iscratch/i/dm1/blastp # iSync # THE ABOVE IS ALREADY DONE BY ANGIE # Make parasol run directory ssh kk mkdir -p /cluster/data/mm6/bed/blastp/dm1/run/out cd /cluster/data/mm6/bed/blastp/dm1/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 45146s 752.44m 12.54h 0.52d 0.001 y # IO & Wait Time: 21289s 354.81m 5.91h 0.25d 0.001 y # Average job time: 9s 0.14m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 43s 0.72m 0.01h 0.00d # Submission to last job: 139s 2.32m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm6/bed/blastp/dm1/run/out hgLoadBlastTab mm6 dmBlastTab -maxPer=1 *.tab # Create table that maps between known genes and LocusLink cd /cluster/data/mm6/bed/geneSorter hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm6 > refToLl.txt hgMapToGene mm6 refGene knownGene knownToLocusLink -lookup=refToLl.txt # row count is 23074 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt mm6 knownGene name proteinID Pfam knownToPfam # row count is 22525 # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene mm6 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' # Create table that maps between known genes and genePix database knownToGenePix mm6 # ENABLE GENE SORTER FOR mm6 IN HGCENTRALTEST (already done during first mm6 KG build) echo "update dbDb set hgNearOk = 1 where name = 'mm6';" \ | hgsql -h genome-testdb hgcentraltest # RAT BLASTP FOR GENE SORTER # Make RAT ortholog column using blastp on RAT known genes. # First make RAT protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/rn3/bed/blastp cd /cluster/data/rn3/bed/blastp pepPredToFa rn3 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/rn3/blastp) then rm -r /iscratch/i/rn3/blastp endif mkdir -p /iscratch/i/rn3/blastp cp /cluster/data/rn3/bed/blastp/known.p?? /iscratch/i/rn3/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm6/bed/blastp/rn3/run/out cd /cluster/data/mm6/bed/blastp/rn3/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 17126s 285.44m 4.76h 0.20d 0.001 y # IO & Wait Time: 20493s 341.54m 5.69h 0.24d 0.001 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 24s 0.40m 0.01h 0.00d # Submission to last job: 131s 2.18m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm6/bed/blastp/rn3/run/out hgLoadBlastTab mm6 rnBlastTab -maxPer=1 *.tab # END OF GENE SORTER STUFF ############################################################################# ### MM6 PROTEOME BROWSER TABLES RE-BUILD #### (DONE - 2005-06-01 - Fan) # These are instructions for re-building tables # needed for the Proteome Browser to be used with mm6. # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table # ARE REBUILT. # This build is based on proteins DBs dated 050415. 
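    # Before starting, a quick check that the prerequisite tables from the
    # Known Genes rebuild are actually in place (row counts are only a
    # sanity check, not exact targets):
    hgsql mm6 -N -e 'select count(*) from knownGene'
    hgsql mm6 -N -e 'select count(*) from kgXref'
    hgsql mm6 -N -e 'select count(*) from kgProtMap'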
# Create the working directory ssh hgwdev mkdir /cluster/data/mm6/bed/pb.2005-06-01 cd /cluster/data/mm6/bed rm pb ln -s /cluster/data/mm6/bed/pb.2005-06-01 pb cd pb # Define pep* tables in mm6 DB cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql # delete from the following tables (previously built): hgsql mm6 delete from pepCCntDist ; delete from pepExonCntDist ; delete from pepHydroDist ; delete from pepIPCntDist ; delete from pepMolWtDist ; delete from pepMwAa ; delete from pepPi ; delete from pepPiDist ; delete from pepPred ; delete from pepResDist ; delete from pbAnomLimit; delete from pbResAvgStd; delete from pbStamp; quit; # Build the pepMwAa table hgsql proteins050415 -e \ "select info.acc, molWeight, aaSize from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab hgsql mm6 -e 'load data local infile "pepMwAa.tab" into table mm6.pepMwAa ignore 1 lines;' o Build the pepPi table hgsql proteins050415 -e "select info.acc from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis pbCalPi protAcc.lis sp050415 pepPi.tab hgsql mm6 -e 'load data local infile "pepPi.tab" into table mm6.pepPi;' # Calculate and load pep distributions pbCalDist sp050415 proteins050415 10090 mm6 >pbCalDist.out cat pbCalDist.out wc pbCalDist.out hgsql mm6 load data local infile "pepExonCntDist.tab" into table mm6.pepExonCntDist; load data local infile "pepCCntDist.tab" into table mm6.pepCCntDist; load data local infile "pepHydroDist.tab" into table mm6.pepHydroDist; load data local infile "pepMolWtDist.tab" into table mm6.pepMolWtDist; load data local infile "pepResDist.tab" into table mm6.pepResDist; load data local infile "pepIPCntDist.tab" into table mm6.pepIPCntDist; load data local infile "pepPiDist.tab" into table mm6.pepPiDist; quit # Calculate frequency distributions pbCalResStd sp050415 10090 mm6 # Create pbAnomLimit and pbResAvgStd tables # hgsql mm6 < ~/src/hg/lib/pbAnomLimit.sql # hgsql mm6 < ~/src/hg/lib/pbResAvgStd.sql hgsql mm6 -e 'load data local infile "pbResAvgStd.tab" into table mm6.pbResAvgStd;' hgsql mm6 -e 'load data local infile "pbAnomLimit.tab" into table mm6.pbAnomLimit;' # UPDATE kgSpAlias TABLE TO BE USED BY PB cd /cluster/data/mm6/bed/pb hgsql mm6 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm6 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >mm6.kgSpAlias.tab rm j.tmp hgsql mm6 -e 'drop table kgSpAlias'; hgsql mm6 < ~/src/hg/lib/kgSpAlias.sql hgsql mm6 -e 'load data local infile "mm6.kgSpAlias.tab" into table kgSpAlias' gzip mm6.kgSpAlias.tab # Create pbStamp table for PB hgsql mm6 < ~/src/hg/lib/pbStamp.sql hgsql mm5 -N -e 'select * from pbStamp' > pbStamp.tab hgsql mm6 -e 'delete from pbStamp' hgsql mm6 -e 'load data local infile "pbStamp.tab" into table mm6.pbStamp' # ENABLE PROTEOME BROWSER FOR mm6 IN HGCENTRALTEST (already done previously) echo "update dbDb set hgPbOk = 1 where name = 'mm6';" \ | hgsql -h genome-testdb hgcentraltest # Adjust drawing parameters for Proteome Browser stamps Now invoke Proteome Browser and adjust various drawing parameters (mostly the ymax of each stamp) if necessary, by updating the pbStamp.tab file and then delete and reload the pbStamp table. # Perform preliminary review of Proteome Browser for mm6, then notify QA for formal review. 
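    # As part of the preliminary review, a quick spot-check that the
    # per-protein tables track the accession list they were built from
    # (pepMwAa.tab was written with a header line, hence the "ignore 1
    # lines" above, so counts will differ by one):
    wc -l protAcc.lis pepMwAa.tab pepPi.tab
    hgsql mm6 -N -e 'select count(*) from pepMwAa'
    hgsql mm6 -N -e 'select count(*) from pepPi'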
# Update default Browser position # bring up mySQL on genome-testdb and use hgcentraltest DB (done previously): update dbDb set defaultPos="chrX:87947304-87959012" where name="mm6"; # Create QA Push Queue entry with the following tables: ceBlastTab cgapAlias cgapBiocDesc cgapBiocPathway dmBlastTab drBlastTab dupSpMrna foldUtr3 foldUtr5 gnfAtlas2Distance hgBlastTab keggMapDesc keggPathway kgAlias kgProtAlias kgProtMap kgXref knownBlastTab knownCanonical knownGene knownGeneMrna knownGenePep knownIsoforms knownToGenePix knownToGnf1m knownToGnfAtlas2 knownToLocusLink knownToMOE430 knownToMOE430A knownToPfam knownToRefSeq knownToU74 knownToXmBest rinnSex rnBlastTab scBlastTab spMrna # END OF mm6 KG/GS/PB RE-BUILD. 6/1/05 Fan. ##################################################################### #################################################################################### # RE-BUILD KNOWN GENES TABLES, 3RD TRIAL WITH CORRECTED kgCheck and kgGetCds (DONE 6/8/05 Fan) ssh hgwdev cd /cluster/store10/kg/kgMm6B mkdir try2 mv * try2 hgsql mm6 -e 'create database kgMm6BTempTry2' hgsql kgMm6BTempTry2 -e 'drop table kgCandidate0' hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidate0.sql hgsql kgMm6BTempTry2 -e 'load data local infile "try2/kgCandidate0.gp" into table kgCandidate0' hgsql kgMm6BTempTry2 -e 'drop table geneCheck' hgsql kgMm6BTempTry2 < ~/src/hg/lib/geneCheck.sql hgsql kgMm6BTempTry2 -e 'load data local infile "try2/kgCandidate0.check" into table geneCheck ignore 2 lines' # Run kgCheck to get all KG candidates that pass the KG gene check criteria kgCheck kgMm6BTempTry2 mm6 kgCandidate0 geneCheck kgCandidate.tab hgsql kgMm6BTempTry2 -e 'drop table kgCandidate' hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidate.sql hgsql kgMm6BTempTry2 -e 'load data local infile "kgCandidate.tab" into table kgCandidate' hgsql kgMm6BTempTry2 -e 'create index alignID on kgCandidate(alignID)' # Construct the kgCandidateX table that has alignID in the name field. cut -f 2-10 kgCandidate.tab >j2.tmp cut -f 11 kgCandidate.tab >j1.tmp paste j1.tmp j2.tmp >kgCandidateX.tab rm j1.tmp j2.tmp hgsql kgMm6BTempTry2 -e 'drop table kgCandidateX' hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidateX.sql hgsql kgMm6BTempTry2 -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX' # Score protein/mRna and protein/RefSeq alignments # kgResultBestMrna2 050415 kgMm6BTempTry2 mm6|sort -u >protMrnaBlatScore.tab # kgResultBestRef2 050415 kgMm6BTempTry2 mm6|sort -u >protRefScore.tab # Combine scoring results and load them into temp DB. # cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab hgsql kgMm6BTempTry2 -e 'drop table protMrnaScore' hgsql kgMm6BTempTry2 < ~/src/hg/lib/protMrnaScore.sql hgsql kgMm6BTempTry2 -e 'load data local infile "try2/protMrnaScore.tab" into table protMrnaScore' hgsql kgMm6BTempTry2 -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)' # Run kgGetCds to get CDS structure of each gene kgGetCds kgMm6BTempTry2 kgCandidateX jY.tmp cat jY.tmp |sort -u >kgCandidateY.tab # rm jY.tmp hgsql kgMm6BTempTry2 -e 'drop table kgCandidateY' hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidateY.sql hgsql kgMm6BTempTry2 -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY' # Run kgPickPrep to replace long cds structure string with cdsId. 
kgPickPrep kgMm6BTempTry2 kgCandidateZ.tab hgsql kgMm6BTempTry2 -e 'drop table kgCandidateZ' hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidateZ.sql hgsql kgMm6BTempTry2 -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ' hgsql kgMm6BTempTry2 -e 'create index cdsId on kgCandidateZ(cdsId)' # Run kgPick to pick the representative a mrna/protein pair for each unique CDS structure. kgPick kgMm6BTempTry2 mm6 proteins050415 kgTry2.tmp dupSpMrna.tmp cat kgTry2.tmp | grep NM_ > jNM cat kgTry2.tmp | grep -v NM_ >jnoNM cut -f 1 jnoNM | sed -e "s/_/_\n/" |grep -v _ >jnoNM1 cut -f 2-12 jnoNM >jnoNM2 paste jnoNM1 jnoNM2 > kgTry2B.tmp cat jNM >> kgTry2B.tmp sort -u dupSpMrna.tmp >dupSpMrna.tab hgsql mm6 -e 'drop table dupSpMrna' hgsql mm6 < ~/src/hg/lib/dupSpMrna.sql hgsql mm6 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna' # Add entries in the put back list # Obtain the mouse put back list from Mark and save it as kgPutBack.tab hgsql kgMm6BTempTry2 -e 'drop table kgPutBack' hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgPutBack.sql hgsql kgMm6BTempTry2 -e 'load data local infile "kgPutBack.tab" into table kgPutBack' kgPutBack kgMm6BTempTry2 mm6 proteins050415 kgPutBack kgPutBack.gp # Sort KG genes to make the kgTry2.gp table file. cat kgTry2B.tmp kgPutBack.gp >kgTry2C.tmp ~/kent/src/hg/protein/sortKg.pl kgTry2C.tmp >kgTry2.gp # Manually edit to correct one line problem of O75438_BC009691 hgsql kgMm6BTempTry2 -e 'drop table knownGene' hgsql kgMm6BTempTry2 < ~/src/hg/lib/knownGene.sql hgsql kgMm6BTempTry2 -e 'load data local infile "kgTry2.gp" into table knownGene' # Load data into mm6 knownGene table. hgsql mm6 -e 'drop table knownGene' hgsql mm6 < ~/src/hg/lib/knownGene.sql hgsql mm6 -e 'load data local infile "kgTry2.gp" into table knownGene' # Build knownGeneMrna and knownGenePep tables. hgsql kgMm6BTempTry2 -e 'drop table mrnaSeq' hgsql kgMm6BTempTry2 < ~/src/hg/lib/mrnaSeq.sql # hgsql kgMm6BTempTry2 -e 'load data local infile "try2/mrnaSeq.tab" into table mrnaSeq' hgsql kgMm6BTempTry2 -e 'load data local infile "/cluster/store10/kg/kgMm6A/mrnaSeq.tab" into table mrnaSeq' kgPepMrna kgMm6BTempTry2 mm6 050415 hgsql mm6 -e 'drop table knownGeneMrna' hgsql mm6 < ~/src/hg/lib/knownGeneMrna.sql hgsql mm6 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna' hgsql mm6 -e 'drop table knownGenePep' hgsql mm6 < ~/src/hg/lib/knownGenePep.sql hgsql mm6 -e 'load data local infile "knownGenePep.tab" into table knownGenePep' # Build kgXref table kgXref2 kgMm6BTempTry2 050415 mm6 hgsql mm6 -e 'drop table kgXref' hgsql mm6 < ~/src/hg/lib/kgXref.sql hgsql mm6 -e 'load data local infile "kgXref.tab" into table kgXref' # Build kgProtMap table ~/src/hg/protein/kgProtMap2.sh kgMm6B mm6 050415 # Update and clean up kgResultBestMrna2.c and then check it in. 
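    # A few quick consistency checks on the rebuilt gene set before moving
    # on (knownGene, knownGenePep and knownGeneMrna row counts should agree,
    # or very nearly, and every knownGene name should have a kgXref row):
    hgsql mm6 -N -e 'select count(*) from knownGene'
    hgsql mm6 -N -e 'select count(*) from knownGenePep'
    hgsql mm6 -N -e 'select count(*) from knownGeneMrna'
    # expect 0 from this one:
    hgsql mm6 -N -e 'select count(*) from knownGene left join kgXref on knownGene.name=kgXref.kgID where kgXref.kgID is null'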
# Build spMrna table
    hgsql mm6 -N -e 'select name, proteinID from knownGene' | sort -u > kgSpMrna.tab
    hgsql mm6 -e 'drop table spMrna'
    hgsql mm6 <~/src/hg/lib/spMrna.sql
    hgsql mm6 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build mrnaRefseq table
    cd /cluster/store10/entrez
    mkdir 050601
    rm /cluster/data/entrez
    ln -s /cluster/store10/entrez/050601 /cluster/data/entrez
    cd /cluster/data/entrez
    wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
    wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
    wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
    gzip -d *.gz
    cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
    cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
    cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab
    hgsql entrez -e 'drop table entrezRefseq'
    hgsql entrez -e 'drop table entrezMrna'
    hgsql entrez -e 'drop table entrezRefProt'
    hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
    hgsql entrez < ~/src/hg/lib/entrezMrna.sql
    hgsql entrez < ~/src/hg/lib/entrezRefProt.sql
    hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
    hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
    hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'
    hgsql entrez -N -e \
    'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \
    >mrnaRefseq.tab
    hgsql mm6 -e 'drop table mrnaRefseq'
    hgsql mm6 < ~/src/hg/lib/mrnaRefseq.sql
    hgsql mm6 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# Build alias tables.

# kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases
# proteins050415.hugo.withdraws, mm6.kgXref.kgID
# to create kgAliasM.tab and geneAlias.tab
# by picking out those kgID items from kgXref where
# kgXref.geneSymbol == hugo.symbol
    cd /cluster/store10/kg/kgMm6B
    mkdir alias
    cd alias
    kgAliasM mm6 proteins050415

# kgAliasKgXref reads from mm6.knownGene.proteinID,
# mm6.knownGene.name, mm6.kgXref.geneSymbol
# to create kgAliasKgXref.tab
    kgAliasKgXref mm6

# kgAliasRefseq reads from mm6.knownGene.name,
# mm6.knownGene.proteinID, mm6.kgXref.refseq
# to create kgAliasRefseq.tab
    kgAliasRefseq mm6

    hgsql sp050415 -N -e 'select name,gene.val from mm6.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
    | sort -u > kgAliasP.tab
    hgsql mm6 -N -e 'select name, name from knownGene' >kgAliasDup.tab
    hgsql mm6 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab

    cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab | \
    sort | uniq > kgAlias.tab

    hgsql -e "drop table kgAlias;" mm6
    hgsql mm6 < ~/kent/src/hg/lib/kgAlias.sql
    hgsql mm6 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias'

# kgProtAlias reads from mm6.knownGene.name,
# mm6.knownGene.proteinID, mm6.knownGene.alignID,
# proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb
# to create kgProtAlias.tab
    kgProtAlias mm6 050415

    hgsql mm6 -N -e \
    'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
    | sort -u >kgProtAliasNCBI.tab

# include variant splice protein IDs
    hgsql mm6 -N -e \
    'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\
    |sort -u >kgProtAliasDup.tab

# include duplicate protein IDs from dupSpMrna table
    hgsql mm6 -N -e \
    'select name,
knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\ |sort -u >>kgProtAliasDup.tab # catch parent acc from dupProteinID too hgsql mm6 -N -e\ 'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\ |sort -u >>kgProtAliasDup.tab cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab echo "`date` creating table kgProtAlias" hgsql mm6 -e "drop table kgProtAlias;" hgsql mm6 <~/src/hg/lib/kgProtAlias.sql; hgsql mm6 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;' # Build kgSpAlias table hgsql mm6 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm6 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >mm6.kgSpAlias.tab rm j.tmp hgsql mm6 -e 'drop table kgSpAlias'; hgsql mm6 < ~/src/hg/lib/kgSpAlias.sql hgsql mm6 -e 'load data local infile "mm6.kgSpAlias.tab" into table kgSpAlias' # MAKE FOLDUTR TABLES # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev cd /cluster/data/mm6/bed mkdir rnaStruct.2005-06-05 rm rnaStruct ln -s rnaStruct.2005-06-05 rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa mm6 knownGene utr3 utr3/utr.fa utrFa mm6 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. ssh kk cd /cluster/data/mm6/bed/rnaStruct faSplit sequence utr3/utr.fa 50000 utr3/split/s faSplit sequence utr5/utr.fa 50000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < cgapAliasSorted.tab hgsql mm6 -e 'load data local infile "cgapAliasSorted.tab" \ into table cgapAlias' hgsql mm6 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" \ into table cgapBiocDesc' hgsql mm6 -e 'load data local infile "cgapBIOCARTA.tab" \ into table cgapBiocPathway' ### MM6 PROTEOME BROWSER TABLES RE-BUILD #### (DONE - 2005-06-06 - Fan) # These are instructions for re-building tables # needed for the Proteome Browser to be used with mm6. # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table # ARE REBUILT. # This build is based on proteins DBs dated 050415. 
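    # (The rm/ln pair below can also be done in a single command with
    # "ln -sfn /cluster/data/mm6/bed/pb.2005-06-06 pb", which replaces the
    # old symlink in place; just a convenience, not required.)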
# Create the working directory ssh hgwdev mkdir /cluster/data/mm6/bed/pb.2005-06-06 cd /cluster/data/mm6/bed rm pb ln -s /cluster/data/mm6/bed/pb.2005-06-06 pb cd pb # Define pep* tables in mm6 DB # cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql # delete from the following tables (previously built): hgsql mm6 delete from pepCCntDist ; delete from pepExonCntDist ; delete from pepHydroDist ; delete from pepIPCntDist ; delete from pepMolWtDist ; delete from pepMwAa ; delete from pepPi ; delete from pepPiDist ; delete from pepPred ; delete from pepResDist ; delete from pbAnomLimit; delete from pbResAvgStd; delete from pbStamp; quit; # Build the pepMwAa table hgsql proteins050415 -e \ "select info.acc, molWeight, aaSize from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab hgsql mm6 -e 'load data local infile "pepMwAa.tab" into table mm6.pepMwAa ignore 1 lines;' o Build the pepPi table hgsql proteins050415 -e "select info.acc from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis pbCalPi protAcc.lis sp050415 pepPi.tab hgsql mm6 -e 'load data local infile "pepPi.tab" into table mm6.pepPi;' # Calculate and load pep distributions pbCalDist sp050415 proteins050415 10090 mm6 >pbCalDist.out cat pbCalDist.out wc pbCalDist.out hgsql mm6 load data local infile "pepExonCntDist.tab" into table mm6.pepExonCntDist; load data local infile "pepCCntDist.tab" into table mm6.pepCCntDist; load data local infile "pepHydroDist.tab" into table mm6.pepHydroDist; load data local infile "pepMolWtDist.tab" into table mm6.pepMolWtDist; load data local infile "pepResDist.tab" into table mm6.pepResDist; load data local infile "pepIPCntDist.tab" into table mm6.pepIPCntDist; load data local infile "pepPiDist.tab" into table mm6.pepPiDist; quit # Calculate frequency distributions pbCalResStd sp050415 10090 mm6 # Create pbAnomLimit and pbResAvgStd tables # hgsql mm6 < ~/src/hg/lib/pbAnomLimit.sql # hgsql mm6 < ~/src/hg/lib/pbResAvgStd.sql hgsql mm6 -e 'load data local infile "pbResAvgStd.tab" into table mm6.pbResAvgStd;' hgsql mm6 -e 'load data local infile "pbAnomLimit.tab" into table mm6.pbAnomLimit;' # UPDATE kgSpAlias TABLE TO BE USED BY PB cd /cluster/data/mm6/bed/pb hgsql mm6 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm6 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >mm6.kgSpAlias.tab rm j.tmp hgsql mm6 -e 'drop table kgSpAlias'; hgsql mm6 < ~/src/hg/lib/kgSpAlias.sql hgsql mm6 -e 'load data local infile "mm6.kgSpAlias.tab" into table kgSpAlias' gzip mm6.kgSpAlias.tab # Create pbStamp table for PB hgsql mm6 < ~/src/hg/lib/pbStamp.sql hgsql mm5 -N -e 'select * from pbStamp' > pbStamp.tab hgsql mm6 -e 'delete from pbStamp' hgsql mm6 -e 'load data local infile "pbStamp.tab" into table mm6.pbStamp' # ENABLE PROTEOME BROWSER FOR mm6 IN HGCENTRALTEST (already done previously) echo "update dbDb set hgPbOk = 1 where name = 'mm6';" \ | hgsql -h genome-testdb hgcentraltest # Adjust drawing parameters for Proteome Browser stamps Now invoke Proteome Browser and adjust various drawing parameters (mostly the ymax of each stamp) if necessary, by updating the pbStamp.tab file and then delete and reload the pbStamp table. # Perform preliminary review of Proteome Browser for mm6, then notify QA for formal review. 
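    # A concrete form of the delete-and-reload step mentioned above, for
    # each round of stamp tuning after hand-editing the ymax column in
    # pbStamp.tab (same commands as used for the initial load):
    hgsql mm6 -e 'delete from pbStamp'
    hgsql mm6 -e 'load data local infile "pbStamp.tab" into table mm6.pbStamp'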
# Update default Browser position # bring up mySQL on genome-testdb and use hgcentraltest DB (done previously): update dbDb set defaultPos="chrX:87947304-87959012" where name="mm6"; # Create QA Push Queue entry with the following tables: ceBlastTab cgapAlias cgapBiocDesc cgapBiocPathway dmBlastTab drBlastTab dupSpMrna foldUtr3 foldUtr5 gnfAtlas2Distance hgBlastTab keggMapDesc keggPathway kgAlias kgProtAlias kgProtMap kgXref knownBlastTab knownCanonical knownGene knownGeneMrna knownGenePep knownIsoforms knownToGenePix knownToGnf1m knownToGnfAtlas2 knownToLocusLink knownToMOE430 knownToMOE430A knownToPfam knownToRefSeq knownToU74 knownToXmBest rinnSex rnBlastTab scBlastTab spMrna # END OF mm6 KG/GS/PB RE-BUILD. 6/6/05 Fan. ##################################################################### ## NIA Mouse Gene Index - (DONE - 2005-06-21 Fan) # requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov ssh hgwdev mkdir -p /cluster/data/mm6/bed/NIAGene cd /cluster/data/mm6/bed/NIAGene wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-psl.txt.gz cut -f 1-21 T-psl.txt >NIAGene.tab hgLoadPsl mm6 NIAGene.tab wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-fasta.fa.gz gzip -d T-fasta.fa.gz mkdir /gbdb/mm6/NIAGene ln -s /cluster/data/mm6/bed/NIAGene/T-fasta.fa /gbdb/mm6/NIAGene/T-fasta.fa hgLoadSeq mm6 /gbdb/mm6/NIAGene/T-fasta.fa Create/edit/check in NIAGene.html and trackDb.ra under kent/src/hg/makeDb/trackDb/mouse/mm6 # Update mrnaRefseq table (DONE - Fan 6/22/05) # The old table contains non-mouse mrna/RefSeqs. # The new table contains only mouse mrna/RefSeq and RefSeq/RefSeq. # First build entrez DB tables, see the section on mrnaRefseq earlier # for details. ssh hgwdev cd /cluster/store10/kg/kgMm6B hgsql entrez -N -e \ 'select mrna, refseq from entrezRefseq, entrezMrna, mm6.all_mrna where entrezRefseq.geneID=entrezMrna.geneID and mrna=all_mrna.qName' \ >mrnaRefseq1.tab # Include RefSeq as valid mRNA too. hgsql mm6 -N -e 'select name, name from refGene' >mrnaRefseq2.tab cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab hgsql mm6 -e 'drop table mrnaRefseq' hgsql mm6 < ~/src/hg/lib/mrnaRefseq.sql hgsql mm6 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq' # BUILD KNOWN GENE LIST FOR GOOGLE. DONE 6/27/05 Fan. cd /cluster/data/mm6/bed rm -rf knownGeneList/mm6 # Run hgKnownGeneList to generate the tree of HTML pages # under ./knownGeneList/mm6 hgKnownGeneList mm6 # copy over to /usr/local/apache/htdocs rm -rf /usr/local/apache/htdocs/knownGeneList/mm6 mkdir -p /usr/local/apache/htdocs/knownGeneList/mm6 cp -Rfp knownGeneList/mm6/* /usr/local/apache/htdocs/knownGeneList/mm6 # Build kgReactome table for KG to Reactome xref. Done 6/28/05 Fan. # First, make sure the reactome DB is built. See makeHg17.doc for details. ssh hgwdev mkdir -p /cluster/data/mm6/bed/reactome cd /cluster/data/mm6/bed/reactome hgsql reactome -N -e 'select kgId, spID, DB_ID from ReferenceEntity, mm6.kgXref where identifier=spID' >kgReactome.tab; hgsql mm6 -e 'drop table kgReactome' hgsql mm6 < ~/src/hg/lib/kgReactome.sql hgsql mm6 -e 'load data local infile "kgReactome.tab" into table kgReactome' # miRNA track (DONE - 2005-06-29 - Fan) # data from: Michel.Weber@ibcg.biotoul.fr # notify them when done. 
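    # A non-interactive way to drop the header / field-description lines
    # from the emailed file, instead of the hand edit below, assuming every
    # real data line starts with a chromosome name:
    #   grep '^chr' miRNA_track_mm6.txt > miRNA.bed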
cd /cluster/data/mm6/bed mkdir miRNA cd miRNA save miRNA_track_mm6.txt file from email cp miRNA_track_mm6.txt miRNA.bed # edit miRNA.bed to get rid of the top field description lines hgLoadBed mm6 miRNA miRNA.bed # check previous release track before update nice featureBits mm5 miRNA # 17957 bases of 2615483787 (0.001%) in intersection nice featureBits mm6 #19126 bases of 2597150411 (0.001%) in intersection # ADDED THE EXONPRIMER TO QUICK LINKS SECTION OF KG DEAILS PAGE (05/07/11, Fan) # Added the following lines to links.ra under src/hg/hgGene/hgGeneData/Mouse/mm6 name exonPrimer shortLabel ExonPrimer tables kgXref idSql select kgID from kgXref where kgID = '%s' url http://ihg.gsf.de/cgi-bin/primer/ExonPrimerUCSC.pl?db=mm6&acc=%s priority 95 # REBUILT knownToPfam TABLE TO ALLOW KG REPRESENTED BY VARIANT SPLICE PROTEINS MAPPED TO PFAM (DONE 7/14/05, Fan) # hgMapViaSwissProt.c was updated to support this. # Create table that maps between known genes and Pfam domains ~/bin/i386/hgMapViaSwissProt mm6 knownGene name proteinID Pfam knownToPfam # row count is 24650 # SCDb CLONES (7/12/2005 Andy) cd /cluster/data/mm6/bed mkdir blat.SCDb-07-05-2005 cd blat.SCDb-07-05-2005/ ln -s `pwd` ~/scdb pushd /santest/scratch/andy wget http://stemcell.princeton.edu/download/scdb.fa.gz mkdir scdb faSplit sequence scdb.fa.gz 80 scdb/scdb_ popd find /santest/scratch/andy/scdb -type f > scdb.lst find /panasas/store/mm6/nib -type f > mm6.lst cat << "_EOF_" > blat.sh #!/bin/bash cdir=${3%/*} mkdir -p $cdir blat -q=dna -t=dna -noHead -ooc=/iscratch/i/mm6/ooc/11.ooc $1 $2 $3 _EOF_ cat << "_EOF_" > gsub #LOOP ./blat.sh {check in exists $(path2)} {check in line+ $(path1)} {check out line /cluster/bluearc/andy/scdb.psl/$(root2)/$(root2)_$(root1).psl} #ENDLOOP _EOF_ chmod +x blat.sh ssh kk cd /cluster/data/mm6/bed/blat.SCDb-07-05-2005 gensub2 scdb.lst mm6.lst gsub spec para create spec para try para push para time #Completed: 3200 of 3200 jobs #CPU time in finished jobs: 24158s 402.64m 6.71h 0.28d 0.001 y #IO & Wait Time: 14437s 240.61m 4.01h 0.17d 0.000 y #Average job time: 12s 0.20m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 244s 4.07m 0.07h 0.00d #Submission to last job: 727s 12.12m 0.20h 0.01d ssh hgwdev cd /cluster/data/mm6/bed/blat.SCDb-07-05-2005 # See if things check out. find /cluster/bluearc/andy/scdb.psl -type f -exec cat '{}' ';' > scdb.all.psl pslReps -singleHit scdb.all.psl scdb.best.psl info.psr # All the original names grep '>' scdb.fa | sed 's/^>//' | cut -f1 -d' ' | sort | uniq > names.scdb # All the names from ones that hit. cut -f10 scdb.all.psl | sort | uniq > all.names.scdb # All the ones with a "best" hit. cut -f10 scdb.best.psl | sort | uniq > best.names.scdb # Yeah a bunch of them (4,443/37,386) are missing. It seems many of the # clones aren't from mouse anyways. mkdir ../scdb cp scdb.best.psl ../scdb/scdb.psl cp scdb.fa ../scdb/ cp best.names.scdb ../scdb/ cd ../scdb/ faSomeRecords scdb.fa best.names.scdb scdb.best.fa rm scdb.fa mkdir /gbdb/mm6/scdb ln -s /cluster/data/mm6/bed/scdb/scdb.best.fa /gbdb/mm6/scdb/scdb.fa hgLoadSeq mm6 /gbdb/mm6/scdb/scdb.fa # clean up the names... basically take the middle part out. 
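    # For example, a sequence named "SC|AB012345|678" (accession made up
    # here for illustration) comes out of the sed below as just "AB012345":
    #   echo 'SC|AB012345|678' | sed 's/SC|\([^|]\+\)|[0-9]\+/\1/'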
sed 's/SC|\([^|]\+\)|[0-9]\+/\1/' scdb.best.fa > new.scdb.best.fa sed 's/SC|\([^|]\+\)|[0-9]\+/\1/' scdb.psl > new.scdb.psl mv scdb.psl old.scdb.psl mv new.scdb.psl scdb.psl mv scdb.best.fa old.scdb.best.fa mv new.scdb.best.fa scdb.best.fa hgLoadPsl -table=scdb mm6 scdb.psl hgLoadSeq mm6 /gbdb/mm6/scdb/scdb.fa #Warning: load of seq did not go as planned: 37381 record(s), 1 row(s) skipped, 0 warning(s) loading ./seq.tab # Oh well. # Update 7/26/2005: I'm going more restrictive on the pslReps. ssh hgwdev cd /cluster/data/mm6/bed/blat.SCDb-07-05-2005 pslReps -minCover=0.8 -singleHit scdb.all.psl tmp scdb.psr sed 's/SC|\([^|]\+\)|[0-9]\+/\1/' tmp > scdb.psl rm tmp hgLoadPsl mm6 scdb.psl ## REBUILD NIA Mouse Gene Index - (DONE - 2005-07-20 Fan) # requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov ssh hgwdev cd /cluster/data/mm6/bed mv NIAGene NIAGene_050621 mkdir NIAGene wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-psl.txt.gz cut -f 1-21 T-psl.txt >NIAGene.tab hgLoadPsl mm6 NIAGene.tab wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-fasta.fa.gz gzip -d T-fasta.fa.gz rm /gbdb/mm6/NIAGene/T-fasta.fa ln -s /cluster/data/mm6/bed/NIAGene/T-fasta.fa /gbdb/mm6/NIAGene/T-fasta.fa # Load the sequences. PLEASE NOTE THE "-replace" OPTION SHOULD BE USED!!! hgLoadSeq -replace mm6 /gbdb/mm6/NIAGene/T-fasta.fa # BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR # ZEBRAFISH (danRer3) (DONE, 2005-08-10, hartera) # REMAKE AXTNET AND COPY TO DOWNLOADS. REMAKE MAFNET (DONE, 2005-08-17, hartera) ssh kkr1u00 # Blastz uses lineage-specific repeats. There are none for mouse # and fish so use all repeats for each species as lineage-specific. mkdir -p /iscratch/i/mm6/linSpecRep.notInZebrafish foreach f (/panasas/store/mm6/rmsk/chr*.fa.out) cp -p $f /iscratch/i/mm6/linSpecRep.notInZebrafish/$f:t:r:r.out.spec end # get only lineage specific repeats for chr1-25 and chrM mkdir -p /iscratch/i/danRer3/linSpecRep.notInMouse foreach f (/iscratch/i/danRer3/rmsk/chr[0-9M]*.fa.out) cp -p $f /iscratch/i/danRer3/linSpecRep.notInMouse/$f:t:r:r.out.spec end # make a nib dir that is also just chr1-25 and chrM mkdir -p /iscratch/i/danRer3/chromNib cp /cluster/data/danRer3/nib/chr[0-9M]*.nib /iscratch/i/danRer3/chromNib /cluster/bin/iSync ssh kkstore mkdir /cluster/data/mm6/bed/blastz.danRer3.2005-08-05 cd /cluster/data/mm6/bed ln -s blastz.danRer3.2005-08-05 blastz.danRer3 cd /cluster/data/mm6/bed/blastz.danRer3 # use parameters as for mm5 - see makeMm5.doc cat << '_EOF_' > DEF # mouse (mm6) vs zebrafish (danRer3) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse (mm6) SEQ1_DIR=/panasas/store/mm6/nib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInZebrafish SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer3) # just chroms 1-25 and chrM SEQ2_DIR=/iscratch/i/danRer3/chromNib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/danRer3/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastz.danRer3 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF cp /cluster/data/mm6/chrom.sizes ./S1.len sort 
-rn +1 /cluster/data/danRer3/chrom.sizes > S2.len # make output directory mkdir -p /panasas/store/mm6vsdanRer3Out # do blastz and create chains for danRer3 chr1-25 and chrM # chickenHumanTuned.gap scoring matrix is now used by default # by axtChain. nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \ -blastzOutRoot /panasas/store/mm6vsdanRer3Out -chainMinScore=5000 \ -stop chainMerge >& do.log & #PID 31074 on kk, # Started Fri Aug 5 21:18:13 PDT 2005 # Finished Aug 6 06:31 # for chr1-25 and chrM: # blastz run: # para time # Completed: 44023 of 44023 jobs # CPU time in finished jobs: 12375882s 206264.70m 3437.75h 143.24d 0.392 y # IO & Wait Time: 979190s 16319.83m 272.00h 11.33d 0.031 y # Average job time: 303s 5.06m 0.08h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1778s 29.63m 0.49h 0.02d # Submission to last job: 32605s 543.42m 9.06h 0.38d # chain run: # para time # Completed: 40 of 40 jobs # CPU time in finished jobs: 1075s 17.92m 0.30h 0.01d 0.000 y # IO & Wait Time: 243s 4.04m 0.07h 0.00d 0.000 y # Average job time: 33s 0.55m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 51s 0.85m 0.01h 0.00d # Submission to last job: 107s 1.78m 0.03h 0.00d # then do a run with the zebrafish danRer3 NA and Un Scaffolds # that are in a 2bit file to do blastz and make chains. ssh kk mkdir -p /cluster/data/mm6/bed/blastz.danRer3/NAandUnScaffolds cd /cluster/data/mm6/bed/blastz.danRer3/NAandUnScaffolds # copy DEF file and edit for NA and Un scaffolds cat << '_EOF_' > DEF # mouse (mm6) vs zebrafish (danRer3) NA and Un scaffolds only export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Mouse (mm6) SEQ1_DIR=/panasas/store/mm6/nib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK= SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer3) # NA and Un Scaffolds in a 2bit file SEQ2_DIR=/iscratch/i/danRer3/NAandUnScafs/danRer3NAandUnScaf.2bit SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK= SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastz.danRer3/NAandUnScaffolds DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF # get lengths for nibs and scaffolds cp /cluster/data/mm6/chrom.sizes ./S1.len # for S2.len, need sizes of the sequences in the 2bit file sort -rn +1 /cluster/data/danRer3/NAandUnScafs.sizes > ./S2.len # make output directory mkdir -p /panasas/store/mm6vsdanRer3Out/NAandUnScafs # do blastz and create chains for danRer3 chrNA and chrUn scaffolds. 
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \ -blastzOutRoot /panasas/store/mm6vsdanRer3Out/NAandUnScafs \ -chainMinScore=5000 -stop chainMerge >& do.log & # Start Mon Aug 8 09:04 # Finish Aug 8 13:04 # for NA and Un Scaffolds: # blastz run: # para time # Completed: 15226 of 15226 jobs # CPU time in finished jobs: 6074532s 101242.21m 1687.37h 70.31d 0.193 y # IO & Wait Time: 289788s 4829.79m 80.50h 3.35d 0.009 y # Average job time: 418s 6.97m 0.12h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 7795s 129.92m 2.17h 0.09d # Submission to last job: 13851s 230.85m 3.85h 0.16d # chain run: # para time # Completed: 40 of 40 jobs # CPU time in finished jobs: 270s 4.50m 0.08h 0.00d 0.000 y # IO & Wait Time: 252s 4.20m 0.07h 0.00d 0.000 y # Average job time: 13s 0.22m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 29s 0.48m 0.01h 0.00d # Submission to last job: 54s 0.90m 0.01h 0.00d # now need to do a liftUp to get the chromosomes co-ordinates # then merge together and continue on with net step ssh kkstore01 cd /cluster/data/mm6/bed/blastz.danRer3/NAandUnScaffolds/axtChain mkdir liftedChain foreach f (chain/*.chain) set c=$f:t:r echo $c liftUp -chainQ liftedChain/${c}.lifted.chain \ /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \ warn $f end # now merge these lifted chain files with the existing chain files for # the chroms and then sort by score using chainSort cd /cluster/data/mm6/bed/blastz.danRer3/axtChain # gzipped file is only chains for chroms1-25 and chrM so rename mv mm6.danRer3.all.chain.gz mm6.danRer3.chroms.chain.gz mv chain chromChain mkdir chain chainUnSorted # get all chains to be merged in chainUnSorted dir cp ./chromChain/*.chain ./chainUnSorted/ # copy scaffolds chains, these are *.lifted.chain so they do not # write over the chrom chains. cp ../NAandUnScaffolds/axtChain/liftedChain/*.chain ./chainUnSorted/ # then merge and sort all these chains. they must be merged and all # sorted together so that all IDs are unique across all chroms. # IDs are reassigned by chainMergeSort so that IDs are unique. nice chainMergeSort chainUnSorted/*.chain | nice gzip -c \ > mm6.danRer3.all.chain.gz # use chainSplit to split this into chains again zcat mm6.danRer3.all.chain.gz | chainSplit chain stdin # then pick up the doBlastzChainNet.pl script with the net step ssh kk cd /cluster/data/mm6/bed/blastz.danRer3 cp DEF DEF.chroms # edit DEF so SEQ2_DIR=/iscratch/i/danRer3/nib as need all nib files now # make sure that :~/.ssh/config has only user write permission and not # group otherwise the ssh will fail. nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \ -blastzOutRoot /panasas/store/mm6vsdanRer3Out -chainMinScore=5000 \ -continue net >& doNet.log & # Start: Aug 10 13:34 # Finished: Aug 10 13:54 # it crashes at the cleanup step as it can not get to /panasas/ from # kkstore01 - should specifiy a different fileServer for this step. 
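    # An alternative, if this version of doBlastzChainNet.pl accepts a
    # fileServer override (option name assumed), would be to rerun just the
    # cleanup step from a host that can see /panasas, e.g.:
    #   nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
    #     -blastzOutRoot /panasas/store/mm6vsdanRer3Out -chainMinScore=5000 \
    #     -fileServer <host that mounts /panasas> -continue cleanup >& doCleanup.log &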
# run this step manually cd /cluster/data/mm6/bed/blastz.danRer3 cleanUp.csh & # All done now # check README.txt in downloads directory and also add html and # trackDb.ra entry for chain and net tracks for danRer3 # featureBits -chrom=chr1 mm6 refGene:cds chainDanRer3Link -enrichment # refGene:cds 0.808%, chainDanRer3Link 5.196%, both 0.522%, cover 64.64%, # enrich 12.44x # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2Link -enrichment # refGene:cds 0.818%, chainDanRer2Link 2.058%, both 0.546%, cover 66.75%, # enrich 32.43x # featureBits -chrom=chr1 mm6 refGene:cds chainDanRer2Link -enrichment # refGene:cds 0.808%, chainDanRer2Link 6.412%, both 0.542%, cover 67.04%, # enrich 10.46x # Remake axtNet and then remake mafNet from these (2005-08-17, harterA) # netToAxt was processing nets incorrectly so remake these with # new version of netToAxt # and transfer to downloads dir. ssh eieio cd /cluster/data/mm6/bed/blastz.danRer3 rm -r axtNet # Make axtNet for download: one .axt per mm6 seq. # remake noClass.net #Make nets("noClass", i.e. without rmsk/class stats which are added later): cd axtChain chainPreNet mm6.danRer3.all.chain.gz /cluster/data/mm6/bed/blastz.danRer3/S1.len /cluster/data/mm6/bed/blastz.danRer3/S2.len stdout \ | chainNet stdin -minSpace=1 /cluster/data/mm6/bed/blastz.danRer3/S1.len /cluster/data/mm6/bed/blastz.danRer3/S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # create net for each chrom again netSplit noClass.net net # also split up chains again mkdir chain zcat mm6.danRer3.all.chain.gz | chainSplit chain stdin cd .. # make axtNet again using new version of axtNet, the previous version was # not processing the nets correctly. mkdir axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /panasas/store/mm6/nib /iscratch/i/danRer3/nib stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.mm6.danRer3.net.axt.gz end # cleanup cd axtChain rm noClass.net rm -r net rm -r chain # remake mafNet from the new axtNet cd /cluster/data/mm6/bed/blastz.danRer3 rm -r mafNet # Make mafNet for multiz: one .maf per mm6 seq. mkdir mafNet foreach f (axtNet/*.mm6.danRer3.net.axt.gz) axtToMaf -tPrefix=mm6. -qPrefix=danRer3. $f \ /cluster/data/mm6/bed/blastz.danRer3/S1.len /cluster/data/mm6/bed/blastz.danRer3/S2.len \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end # remove old axtNet downloads and add links to new axtNet files ssh hgwdev rm -r /usr/local/apache/htdocs/goldenPath/mm6/vsDanRer3/axtNet cd /usr/local/apache/htdocs/goldenPath/mm6/vsDanRer3 mkdir -p /usr/local/apache/htdocs/goldenPath/mm6/vsDanRer3/axtNet ln -s /cluster/data/mm6/bed/blastz.danRer3/axtNet/*.axt.gz axtNet/ # make md5sum.txt again rm md5sum.txt md5sum *.gz */*.gz > md5sum.txt #### LOAD ENSEMBL GENES (DONE - 2005-08-10 Fan) # ADDDED STABLE URL TO TRACKDB BLOCK (V32, JUL 2005) (2008-01-11, rhead) # needed for Gene Sorter procedure below # Ensembl released Mouse build 34 the week of August 10th, 2005 mkdir -p /cluster/store11/mm6/bed/ensGene ln -s /cluster/store11/mm6/bed/ensGene /cluster/data/mm6/bed/ensGene cd /cluster/data/mm6/bed/ensGene Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview Choose Ensembl 32 and Mus musculus, click next Follow this sequence through the pages: 1) Select "Known genes" in the Gene seciont. Hit next. 2) Select "Structures". 3) Choose GTF as the output, choose gzip compression, name the output file ensGeneMm6.gtf.gz and then hit Export # Ensembl handles random chromosomes differently than us, so we # strip this data. 
Fortunately it just loses a couple of genes. zcat ensGeneMm6.gtf.gz | grep -v ^6_DR51 | grep -v NT_ > unrandom.gtf # Let's see how much it loses: # None. # Add "chr" to front of each line in the gene data gtf file to make # it compatible with ldHgGene sed -e "s/^/chr/" unrandom.gtf | sed -e "s/chrMT/chrM/" > ensGene.gtf ldHgGene mm6 ensGene ensGene.gtf # Read 38200 transcripts in 674378 lines in 1 files # 38200 groups 22 seqs 1 sources 4 feature types # 38200 gene predictions hgsql mm6 -N -e 'select * from ensGene' | sed -e 's/\./\t/' |\ cut -f 1,3-11 >ensGeneNew.tab # save space, gzip them: gzip unrandom.gtf gzip ensGene.gtf # Load Ensembl peptides: Get the ensembl protein data from BioMar Choose Mus musculus as the organism Follow this sequence through the pages: 1) Choose "Known genes". Hit next. 2) Choose "Sequences" and "Peptide" and "Ensembl Transcript ID", choose text/fasta and gzip compression, name the file ensPep and then hit export. zcat ensPep.fasta.gz|faToTab -type=protein stdin j1.tmp cat j1.tmp|grep -v "SEQXENCEXNAVAILAXLE" >j2.tmp cat j2.tmp |awk '{print ">" $1;print $2}' > ensPep.fa rm j1.tmp j2.tmp hgPepPred mm6 generic ensPep ensPep.fa # Load ensGtp table. # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. # Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview # Choose Ensembl 32 and Mus musculus, click next # Follow this sequence through the pages: # 1) Select "Known genes" in the Gene seciont. Hit next. # 2) Select "Structures". # 3) select Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # 4) select "Text, tab separated" and name the output file as "ensGtp" # 5) download the output file "ensGtp.tsv.gz" gunzip ensGtp.tsv.gz hgsql mm6 < ~/kent/src/hg/lib/ensGtp.sql hgsql mm6 -N -e 'load data local infile "ensGtp.tsv" into table ensGtp ignore 1 lines;' # Create knownToEnsembl column hgMapToGene mm6 ensGene knownGene knownToEnsembl # Compress everthing to save space gzip * #### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2005-08-16 - Fan) # PLEASE NOTE THAT THE ENSEMBLXREF3 TABLE IS BUILT USING ENSEMBL BIOMART DATA OF MOUSE BUILD 34. # THIS TABLE IS NEEDED TO SUPPORT SUPERFAMILY TRACK OF THE PROTEOME BROWSER. # SINCE ENSEMBL CHANGED THE DATA FORMAT AGAIN (AS USUAL :-( ), THERE IS NO VERSION NUMBER # IN THEIR IDs, A FAKE "0" IS GENERATED FOR EACH ID IN ensemblXref3 TABLE. # Get the ensembl gene/protein cross-reference data BioMart # Follow this sequence through the pages: # 1) Make sure that the Mus musculus choice is selected. Hit next. # 2) Choose the "Feature" box, select Ensembl gene, transcript, and peptid IDs, SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC # 3) Choose "Text, tab separated". choose gzip compression. hit export. 
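    # The load just below goes into a scratch table, ensemblXref3Temp, which
    # has to exist first; a minimal creation sketch, with column names taken
    # from the select further down and the types assumed:
    #   hgsql mm6 -e 'create table ensemblXref3Temp (gene varchar(40), transcript varchar(40), protein varchar(40), tremblAcc varchar(40), swissDisplayId varchar(40), swissAcc varchar(40))'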
# Save as ensXref load data local infile "ensXref.tsv" into table ensemblXref3Temp ignore 1 lines; hgsql mm6 -N -e 'select gene, "0", transcript, "0", protein, "0", tremblAcc, swissDisplayId, swissAcc from ensemblXref3Temp' \ > ensemblXref3.tab hgsql mm6 -e 'drop table ensemblXref3' hgsql mm6 <~/src/hg/lib/ensemblXref3.sql hgsql mm6 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3' # load the table into proteome DB also hgsql proteome -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3' #### BUILD SUPERFAMILY RELATED TABLES (DONE - 2005-08-17 - Fan) # Download Superfamily data files and build the Superfamily DB # from supfam.mrc-lmb.cam.ac.uk mkdir -p /cluster/store11/superfamily/050817 ln -s /cluster/store11/superfamily/050817 /cluster/data/superfamily/050817 cd /cluster/data/superfamily/050817 # add the following line to ~/.netrc machine supfam.mrc-lmb.cam.ac.uk login license password XXXXX # ftp over to supfam.mrc-lmb.cam.ac.uk and get the following two files: supfam_14-Aug-2005.sql.gz ass_14-Aug-2005.tab.gz gzip -d *.gz # Load the Superfamily database hgsql mm6 -e "create database superfam050817" zcat supfam_14-Aug-2005.sql.gz | hgsql superfam050817 # This may take about an hour. # Make sure to add an index on id of the des table of superfam050817. hgsql superfam050817 -e "create index id on des(id);" gzip -d ass_14-Aug-2005.tab.gz hgsql superfam050817 < ~/src/hg/lib/sfAssign.sql hgsql superfam050817 -e \ 'load data local infile "ass_14-Aug-2005.tab" into table superfam050817.sfAssign;' # Build or rebuild Superfamily track and create sf tables needed for PB hgsql mm6 < ~/src/hg/lib/sfAssign.sql cd /cluster/data/superfamily/050817 hgsql mm6 -e 'load data local infile "ass_14-Aug-2005.tab" into table mm6.sfAssign;' # If mm6.sfDes already exists, drop it. hgsql superfam050817 -e "select * from des" >sfDes.tab hgsql mm6 < ~/src/hg/lib/sfDes.sql hgsql mm6 -e 'load data local infile "sfDes.tab" into table mm6.sfDes ignore 1 lines;' # If mm6.superfamily already exists, drop it. cd /cluster/data/mm6/bed mkdir /cluster/data/mm6/sf.2005-0817 ln -s sf.2005-0817 sf hgSuperfam mm6 superfam050817 > sf.log # It is normal that many proteins does not have corresponding Superfamily entries. # If mm6.sfDescription exists, drop it. hgsql mm6 < ~/src/hg/lib/sfDescription.sql hgsql mm6 -e 'LOAD DATA local INFILE "sfDescription.tab" into table mm6.sfDescription;' # Finally, load the superfamily table. hgLoadBed mm6 superfamily superfamily.tab -tab # Create knownToSuperfamily table # Note hs is changed into ht for this Superfamily release. cat /cluster/data/superfamily/050817/ass_14-Aug-2005.tab | hgKnownToSuper mm6 mm stdin # 21185 records output # RE-EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 8/11/05 angie) # originally done 3/14/05 -- redone 8/11/05 just in case latest RM version # has any updates, before aligning to canFam2. ssh kolossus cd /panasas/store/mm6/rmsk # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. foreach outfl ( *.out ) echo "$outfl" /cluster/bluearc/RepeatMasker/DateRepeats \ ${outfl} -query mouse -comp dog end # Now extract dog (extra column 1): cd .. mkdir linSpecRep.notInDog foreach f (rmsk/*.out_canis-familiaris) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractRepeats 1 $f > \ linSpecRep.notInDog/$base.out.spec end # Clean up. 
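    # Before deleting the annotated .out_canis-familiaris files, a quick
    # check that one .out.spec was written per input (the two counts should
    # match):
    ls rmsk/*.out_canis-familiaris | wc -l
    ls linSpecRep.notInDog/*.out.spec | wc -l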
rm rmsk/*.out_canis* # BLASTZ/CHAIN/NET CANFAM2 (DONE 8/12/05 angie) ssh kkstore01 mkdir /cluster/data/mm6/bed/blastz.canFam2.2005-08-11 cd /cluster/data/mm6/bed/blastz.canFam2.2005-08-11 cat << '_EOF_' > DEF # mouse vs. dog # TARGET: Mouse SEQ1_DIR=/panasas/store/mm6/nib SEQ1_RMSK=/panasas/store/mm6/rmsk SEQ1_SMSK=/panasas/store/mm6/linSpecRep.notInDog SEQ1_LEN=/cluster/data/mm6/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_RMSK=/panasas/store/canFam2/rmsk SEQ2_SMSK=/panasas/store/canFam2/linSpecRep.notInMouse SEQ2_LEN=/cluster/data/canFam2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastz.canFam2.2005-08-11 '_EOF_' # << for emacs doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/blastzMm6CanFam2Out >& do.log & ln -s blastz.canFam2.2005-08-11 /cluster/data/mm6/bed/blastz.canFam2 # RE-RUN NETTOAXT, AXTTOMAF FOR CANFAM2 (DONE 10/28/05 angie) # Kate fixed netToAxt to avoid duplicated blocks, which is important # for input to multiz. Regenerate maf using commands from sub-script # netChains.csh generated by doBlastzChainNet.pl above. ssh kolossus cd /cluster/data/mm6/bed/blastz.canFam2.2005-08-11/axtChain netSplit mm6.canFam2.net.gz net chainSplit chain mm6.canFam2.all.chain.gz cd .. mv axtNet axtNet.orig mkdir axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /panasas/store/mm6/nib /iscratch/i/canFam2/nib stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.mm6.canFam2.net.axt.gz end rm -r mafNet mkdir mafNet foreach f (axtNet/*.mm6.canFam2.net.axt.gz) axtToMaf -tPrefix=mm6. -qPrefix=canFam2. $f \ /cluster/data/mm6/chrom.sizes /cluster/data/canFam2/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end rm -r axtChain/{chain,net}/ axtNet.orig # UPDATE miRNA track (DONE - 2005-08-24 - Fan) # data from: Michel.Weber@ibcg.biotoul.fr # notify them when done. cd /cluster/data/mm6/bed cd miRNA mkdir old cp -p * old rm * # save miRNA_track_mm6_aug2005.txt file from email cp miRNA_track_mm6_aug2005.txt miRNA.tab vi miRNA.tab # edit miRNA.bed to get rid of the top description lines # and a few blank lines hgLoadBed mm6 miRNA miRNA.tab # check previous release track before update nice featureBits mm5 miRNA # 17957 bases of 2615483787 (0.001%) in intersection nice featureBits mm6 miRNA # 20898 bases of 2597150411 (0.001%) in intersection # ADD LINK TO GENENETWORK (Done. 9/6/05 Fan). # Received the file, rat.RefSeqId, list of RefSeq IDs from GeneNetwork. # remove extra CR (or LF?) at end of the line. rmLf mouse.RefSeqId >mm6.geneNetworkId.tab hgsql mm6 -e 'drop table geneNetworkId' hgsql mm6 < ~/src/hg/lib/geneNetworkId.sql hgsql mm6 -e 'load data local infile "mm6.geneNetworkId.tab" into table geneNetworkId' # JACKSON LABS / MGI REPRESENTATIVE TRANSCRIPT (DONE 9/29/05 angie) # Genes reloaded 2/4/06 after Kayla found that exon starts were off-by-one. ssh kkstore01 mkdir /cluster/data/mm6/bed/jaxRepTranscript cd /cluster/data/mm6/bed/jaxRepTranscript wget ftp://gondor.informatics.jax.org/pub/gbrowse/MGIrep-trans_bld34b.gff cat > parseJaxGFF.pl <<'_EOF_' #!/usr/bin/perl -w # Parse this particular flavor of GTF into our preferred flavor (stdout) # plus an association file (alias.tab) and a fixit SQL file (fixit.sql). # Note: for the rep transcript files only, must add 1 to each start coord. use strict; # Keep track of transcript names; our GFF-parsing code requires unique # transcript names but non-unique ones are used here. Add uniquifying # suffix. 
Rely on the fact that a REP_transcript line always immediately # precedes the REP_exon lines. my %txNameIndx; my $tweakedName; open(OUT, ">alias.tab") || die "Can't open alias.tab for writing: $!\n"; open(SQL, ">fixit.sql") || die "Can't open fixit.sql for writing: $!\n"; while (<>) { chomp; my ($chr, undef, $type, $start, $end, undef, $strand, undef, $info) = split("\t"); if ($type eq "rep_transcript") { my ($name, $mgiID); if ($info =~ /^REP_transcript ([^;]+); Note "[\w_.]+"; Note "(MGI:\d+)"; Note "[\w. ()-]+"$/) { ($name, $mgiID) = ($1, $2); } else { die "parse, line $.:\n$info\n"; } if (defined $txNameIndx{$name}) { $tweakedName = $name . '_' . $txNameIndx{$name}; print SQL "update jaxRepTranscript set name = '$name' " . "where name = '$tweakedName';\n"; } else { undef $tweakedName; print OUT "$name\t$mgiID\n"; } $txNameIndx{$name}++; } elsif ($type eq "rep_exon") { $type = "exon"; my ($name, $mgiID); if ($info =~ /^REP_transcript ([^;]+); Note "[\w_.]+"; Note "(MGI:\d+)"; Note "[\w. ()-]+"$/) { ($name, $mgiID) = ($1, $2); } else { die "parse, line $.:\n$info\n"; } if (defined $tweakedName) { if ($tweakedName !~ /^${name}_\d+$/) { die "tweakedName $tweakedName does not start with name $name and " . " have a numeric suffix like expected"; } $name = $tweakedName; } $start++; print "$chr\tMGI\t$type\t$start\t$end\t.\t$strand\t.\t" . "gene_id \"$mgiID\"; transcript_id \"$name\";\n"; } else { die "unrecognized type $type, line $."; } } close(OUT); close(SQL); '_EOF_' # << for emacs chmod a+x parseJaxGFF.pl parseJaxGFF.pl MGIrep-trans_bld34b.gff > jaxRepTranscript.gtf # Load up the genes and aliases, and use script-generated fixit.sql # to remove our uniquifying suffixes from the duplicated transcripts. ssh hgwdev cd /cluster/data/mm6/bed/jaxRepTranscript ldHgGene mm6 jaxRepTranscript jaxRepTranscript.gtf hgsql mm6 < fixit.sql sed -e 's/genericAlias/jaxRepTranscriptAlias/' \ $HOME/kent/src/hg/lib/genericAlias.sql \ | hgsql mm6 hgsql mm6 -e \ 'load data local infile "alias.tab" into table jaxRepTranscriptAlias' # JACKSON LABS / MGI ALLELE TRANSCRIPTS (DONE 11/15/05 angie) ssh kkstore01 mkdir /cluster/data/mm6/bed/jaxAllele cd /cluster/data/mm6/bed/jaxAllele ftp gondor.informatics.jax.org # anonymous log in cd pub/gbrowse prompt mget allele_* bye # OK, need to ask Bob about how to parse those Note columns... # looks like we could split some of the long names into mult. fields. cat > parseJaxGFF.pl <<'_EOF_' #!/usr/bin/perl -w # Parse this particular flavor of GTF into our preferred flavor (stdout) # plus *append* to an association file (alias.tab) and write a fixit SQL file # ($ARGV[0].sql -- first arg must be table name). use strict; my $tableName = shift @ARGV; die "need an argument (table name)" if (! defined $tableName); # Keep track of transcript names; our GFF-parsing code requires unique # transcript names but non-unique ones are used here. Add uniquifying # suffix. Rely on the fact that a _transcript line always immediately # precedes the _exon lines. 
my %txNameIndx; my $tweakedName; open(OUT, ">>alias.tab") || die "Can't open alias.tab for appending: $!\n"; open(SQL, ">$tableName.sql") || die "Can't open $tableName.sql for writing: $!\n"; while (<>) { chomp; my ($chr, undef, $type, $start, $end, undef, $strand, undef, $info) = split("\t"); if ($type =~ /^\w+_transcript$/) { my ($name, $alName, $mgiID, $source); if ($info =~ /^\S+_transcript ([^<]+]*>?)_\w+; Note "(MGI:\d+)"; Note "([^"]+)";$/) { ($name, $mgiID, $source) = ($1, $2, $3); } else { die "parse, line $.:\n$info\n"; } if (defined $txNameIndx{$name}) { $tweakedName = $name . '_' . $txNameIndx{$name}; print SQL "update $tableName set name = '$name' " . "where name = '$tweakedName';\n"; } else { undef $tweakedName; print OUT "$name\t$mgiID\t$source\n"; } $txNameIndx{$name}++; } elsif ($type =~ /^\w+_exon$/) { $type = "exon"; my ($name, $mgiID); if ($info =~ /^\S+_transcript ([^<]+]*>?)_\w+; Note "(MGI:\d+)"; Note "[^"]+";$/) { ($name, $mgiID) = ($1, $2); } else { die "parse, line $.:\n$info\n"; } if (defined $tweakedName) { my $escName = $name; $escName =~ s/\(/\\(/g; $escName =~ s/\)/\\)/g; if ($tweakedName !~ /^${escName}_\d+$/) { die "tweakedName $tweakedName does not start with name $name and " . " have a numeric suffix like expected"; } $name = $tweakedName; } print "$chr\tMGI\t$type\t$start\t$end\t.\t$strand\t.\t" . "gene_id \"$mgiID\"; transcript_id \"$name\";\n"; } else { die "unrecognized type $type, line $."; } } close(OUT); close(SQL); '_EOF_' # << for emacs chmod a+x parseJaxGFF.pl cp /dev/null alias.tab foreach f (*.gff) set g = `echo $f | perl -wpe 's/allele_//; s/\.gff//; s/^(\w)/jaxAllele\u$1/;'` ./parseJaxGFF.pl $g $f > $g.gtf end # Load info table hgsql mm6 < $HOME/kent/src/hg/lib/jaxAlleleInfo.sql hgsql mm6 -e 'load data local infile "alias.tab" into table jaxAlleleInfo' # Make a single bed file with Allele type as extra column, for # single track / Gene Sorter. ssh hgwdev cd /cluster/data/mm6/bed/jaxAllele cp /dev/null jaxAllele.bed foreach f (jax*.gtf) set type = `echo $f | sed -e 's/jaxAllele//; s/\.gtf//;'` ldHgGene mm6 $f:t:r $f -out=stdout \ | /cluster/bin/scripts/genePredToBed \ | sed -e 's/$/'"\t$type"'/' \ >> jaxAllele.bed end sed -e 's/bed12Source/jaxAllele/g' \ $HOME/kent/src/hg/lib/bed12Source.sql > jaxAllele.sql hgLoadBed -sqlTable=jaxAllele.sql mm6 jaxAllele jaxAllele.bed # JACKSON LABS / MGI PHENOTYPE (DONE 11/15/05 angie) ssh kkstore01 mkdir /cluster/data/mm6/bed/jaxPhenotype cd /cluster/data/mm6/bed/jaxPhenotype ftp gondor.informatics.jax.org # anonymous log in cd pub/gbrowse prompt mget *_phenotype.gff mget lethality* life* tumor* bye cat > parseJaxGFF.pl <<'_EOF_' #!/usr/bin/perl -w # Parse this particular flavor of GTF into our preferred flavor (stdout) # plus *append* to an association file (alias.tab) and write a fixit SQL file # ($ARGV[0].sql -- first arg must be table name). use strict; my $tableName = shift @ARGV; die "need an argument (table name)" if (! defined $tableName); # Keep track of transcript names; our GFF-parsing code requires unique # transcript names but non-unique ones are used here. Add uniquifying # suffix. Rely on the fact that a _transcript line always immediately # precedes the _exon lines. 
my %txNameIndx; my $tweakedName; open(OUT, ">>alias.tab") || die "Can't open alias.tab for appending: $!\n"; open(SQL, ">$tableName.sql") || die "Can't open $tableName.sql for writing: $!\n"; while (<>) { chomp; my ($chr, undef, $type, $start, $end, undef, $strand, undef, $info) = split("\t"); if ($type =~ /^\w+_transcript$/) { my ($name, $mgiID); if ($info =~ /^MP_\d+_transcript ([^;]+)_MP_\d+; Note "(MGI:\d+)"; Note "MP_\d+";$/) { ($name, $mgiID) = ($1, $2); } else { die "parse, line $.:\n$info\n"; } if (defined $txNameIndx{$name}) { $tweakedName = $name . '_' . $txNameIndx{$name}; print SQL "update $tableName set name = '$name' " . "where name = '$tweakedName';\n"; } else { undef $tweakedName; print OUT "$name\t$mgiID\n"; } $txNameIndx{$name}++; } elsif ($type =~ /^\w+_exon$/) { $type = "exon"; my ($name, $mgiID); if ($info =~ /^MP_\d+_transcript ([^;]+)_MP_\d+; Note "(MGI:\d+)"; Note "MP_\d+";$/) { ($name, $mgiID) = ($1, $2); } else { die "parse, line $.:\n$info\n"; } if (defined $tweakedName) { if ($tweakedName !~ /^${name}_\d+$/) { die "tweakedName $tweakedName does not start with name $name and " . " have a numeric suffix like expected"; } $name = $tweakedName; } print "$chr\tMGI\t$type\t$start\t$end\t.\t$strand\t.\t" . "gene_id \"$mgiID\"; transcript_id \"$name\";\n"; } else { die "unrecognized type $type, line $."; } } close(OUT); close(SQL); '_EOF_' # << for emacs chmod a+x parseJaxGFF.pl cp /dev/null alias.tab foreach f (*.gff) set g = `echo $f | perl -wpe 's/(_phenotype)?\.gff//; s/[_-](\w)/\u$1/g; s/^(\w)/jaxPhenotype\u$1/;'` ./parseJaxGFF.pl $g $f > $g.gtf end sort -u alias.tab > alias.unique.tab # Load up the aliases: ssh hgwdev cd /cluster/data/mm6/bed/jaxPhenotype sed -e 's/genericAlias/jaxPhenotypeAlias/' \ $HOME/kent/src/hg/lib/genericAlias.sql \ | hgsql mm6 hgsql mm6 -e \ 'load data local infile "alias.unique.tab" into table jaxPhenotypeAlias' # Make a single bed file with phenotype as extra column, for single track # / Gene Sorter. Use Jim's abbreviations. ssh hgwdev cd /cluster/data/mm6/bed/jaxPhenotype cp /dev/null jaxPhenotype.bed foreach f (jax*.gtf) set type = `echo $f | sed -e 's@jaxPhenotype@@; s@\.gtf@@; \ s@AdiposeTissue@Adipose@; \ s@BehaviorNeurological@Behavior@; \ s@CardiovascularSystem@Cardiovascular@; \ s@DigestiveAlimentary@Digestive@; \ s@EndocrineExocrineGland@Gland@; \ s@GrowthSize@Growth Size@; \ s@HearingEar@Hearing/Ear@; \ s@HematopoieticSystem@Hematopoietic@; \ s@HomeostasisMetabolism@Homeostasis@; \ s@ImmuneSystem@Immune@; \ s@LethalityEmbryonicPerinatal@Embryonic Lethal@; \ s@LethalityPostnatal@Postnatal Lethal@; \ s@LifeSpanPostWeaningAging@Life Span@; \ s@LimbsDigitsTail@Limbs and Tail@; \ s@LiverBiliarySystem@Liver and Bile@; \ s@NervousSystem@Nervous System@; \ s@RenalUrinarySystem@Renal/Urinary@; \ s@ReproductiveSystem@Reproductive@; \ s@RespiratorySystem@Respiratory@; \ s@SkinCoatNails@Skin/Coat/Nails@; \ s@TasteOlfaction@Taste/Smell@; \ s@TouchVibrissae@Touch@; \ s@Tumorigenesis@Tumorigenesis@; \ s@VisionEye@Vision/Eye@;'` ldHgGene mm6 $f:t:r $f -out=stdout \ | /cluster/bin/scripts/genePredToBed \ | sed -e 's@$@'"\t$type"'@' \ >> jaxPhenotype.bed end sed -e 's/bed12Source/jaxPhenotype/g' \ $HOME/kent/src/hg/lib/bed12Source.sql > jaxPhenotype.sql hgLoadBed -tab -sqlTable=jaxPhenotype.sql mm6 jaxPhenotype \ jaxPhenotype.bed ########################################################################## # MGI ALLELE-PHENOTYPE MAPPING (DONE 7/18/07 angie) # Load in the mapping of alleles to phenotypes from an MGI report # file. 
# The file is independent of assembly version, but it uses
# some allele names that apparently have changed (or been added)
# since mm6 tables were loaded.  So the coverage is not as complete
# as for mm8.  Still, this info is very nice to have!
cd /cluster/data/mm6/bed/jaxAllele/
ln -s alias.tab jaxAlleleInfo.tab
/cluster/data/mm8/bed/jax/2007_07/parsePhenotypicAllele.pl \
    /cluster/data/mm8/bed/jax/2007_07/MGI_PhenotypicAllele.rpt \
    > jaxAllelePheno.tab
hgLoadSqlTab mm6 jaxAllelePheno \
    ~/kent/src/hg/lib/jaxAllelePheno.sql jaxAllelePheno.tab
runJoiner.csh mm6 jaxAllelePheno ~/kent/src/hg/makeDb/schema

##########################################################################
# NSCAN track - (2005-09-29 markd) loaded proteins 2005-10-13
cd /cluster/data/mm6/bed/nscan/
# obtained NSCAN-EST predictions from michael brent's group at WUSTL
wget http://genome.cse.wustl.edu/predictions/mouse/mm6_09_14_05/mm6Predictions.tar.gz
tar -zxf mm6Predictions.tar.gz
# change protein fasta file to have transcript id in header
foreach f (chr_ptx/*.ptx)
    awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
end
ldHgGene -gtf -genePredExt mm6 nscanGene chr_gtf/chr*.gtf
hgPepPred mm6 generic nscanPep chr_ptx/chr*.fix
rm -rf chr_* *.tab
# update trackDb; need a mm6-specific page to describe informants
    mouse/mm6/nscanGene.html
    mouse/mm6/trackDb.ra

# Create table that maps between known genes and visiGene database (DONE 2005-10-10 galt)
knownToVisiGene mm6
#Made hashes of image: geneImageHash 2117, locusLinkImageHash 780, refSeqImageHash 780,
#genbankImageHash 1301
#knownToLocusLink 23124, knownToRefSeq 23124, knownToGene 250882

## REBUILD NIA Mouse Gene Index - (DONE - 2005-10-17 Fan)
# requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
ssh hgwdev
cd /cluster/data/mm6/bed
mv NIAGene NIAGene_050720
mkdir NIAGene
cd NIAGene
wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-psl.txt.gz
gzip -d T-psl.txt.gz
cut -f 1-21 T-psl.txt >NIAGene.tab
hgLoadPsl mm6 NIAGene.tab
wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-fasta.fa.gz
gzip -d T-fasta.fa.gz
rm /gbdb/mm6/NIAGene/T-fasta.fa
ln -s /cluster/data/mm6/bed/NIAGene/T-fasta.fa /gbdb/mm6/NIAGene/T-fasta.fa
hgLoadSeq -replace mm6 /gbdb/mm6/NIAGene/T-fasta.fa

# UPDATE miRNA track (DONE - 2005-10-20 - Fan)
# data from: Michel.Weber@ibcg.biotoul.fr
# notify them when done.
cd /cluster/data/mm6/bed
mv miRNA miRNA_050824
mkdir miRNA
cd miRNA
# save miRNA_track_mm6_oct2005.txt file from email
# MANUALLY EDIT ONE LINE PER WEB's EMAIL OF 10/18/05.
cp miRNA_track_mm6_oct2005.txt miRNA.tab
vi miRNA.tab
# edit miRNA.tab to get rid of the top description lines
# and a few blank lines
# and replace blank with tab
hgLoadBed mm6 miRNA miRNA.tab
# check previous release track before update
nice featureBits mm5 miRNA
# 17957 bases of 2615483787 (0.001%) in intersection
nice featureBits mm6 miRNA
# 21167 bases of 2597150411 (0.001%) in intersection

#############################################################################
# Add TIGR MGI TC Alignments (In progress Oct 24, 2005 JK)
# These are clusters of ESTs and other sequences on the mouse genome.
# Create directory and download data into it.
cd /cluster/data/mm6/bed
mkdir tigrMgiTc
cd tigrMgiTc
wget ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/MGI.release_15.zip
unzip MGI.release_15.zip
# Extract only the clusters of ESTs from their big sequence file
# that also includes singleton ESTs.  This is about 40% the size
# of the full file, and all we need.
faFilter '-name=TC???????'
MGI.022505 tigrMgiTc.fa # Split sequence into pieces for cluster run mkdir split faSplit sequence tigrMgiTc.fa 500 split/tc # Set up cluster run ssh kk cd /cluster/data/mm6/bed/tigrMgiTc mkdir run cd run mkdir psl ls -1S /iscratch/i/mm6/chrom/*.fa > genome.lst ls ../split/*.fa > mrna.lst cat << '_EOF_' > gsub #LOOP blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 genome.lst mrna.lst gsub spec para create spec # Then do the usual para try/push/time/check until the run is finished # Then do sorting and near-best-in-genome step on file server ssh kkstore cd /cluster/data/mm6/bed/tigrMgiTc/run pslSort dirs raw.psl tmp psl pslReps raw.psl ../tigrMgiTc.psl -nohead -minCover=0.25 -minAli=0.96 -nearTop=0.001 /dev/null # Clean up big files no longer needed rm raw.psl rm -r psl rm -r ../split rm ../MGI.022505 rm ../MGI.GO.022505 rm ../MGI.TC_EST.022505 ############################################################################# # Add NCBI XM_ alignments - note this is just to create files for # the Allen Brain Atlas mapping. It does not produce a track. # Create directory and download XM_ sequence from NCBI ssh kk cd /cluster/data/mm6/bed mkdir ncbiXm cd ncbiXm wget ftp://ftp.ncbi.nih.gov/refseq/M_musculus/mRNA_Prot/mouse.rna.fna.gz # Unzip, simplify fa headers, and filter out non-XM_ sequence, and split mkdir split zcat mouse.rna.fna.gz | faNcbiToUcsc -wordBefore=ref stdin stdout | faFilter -name=XM_\* stdin ncbiXm.fa zcat mouse.rna.fna.gz | faNcbiToUcsc -wordBefore=ref stdin stdout | faFilter -name=NM_\* stdin ncbiNm.fa faSplit sequence ncbiXm.fa 150 split/xm # Set up cluster job mkdir run cd run mkdir psl ls -1S /iscratch/i/mm6/chrom/*.fa > genome.lst ls ../split/*.fa > mrna.lst cat << '_EOF_' > gsub #LOOP blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 genome.lst mrna.lst gsub spec para create spec # Then do the usual para try/push/time/check until the run is finished # Then do sorting and near-best-in-genome step on file server ssh kkstore cd /cluster/data/mm6/bed/ncbiXm/run pslSort dirs raw.psl tmp psl pslReps raw.psl ../ncbiXm.psl -nohead -minCover=0.50 -minAli=0.99 -nearTop=0.001 /dev/null ############################################################################# # Create Allen Brain Atlas mapping. (Done 28 Oct 2005 JK) # This needs to be done after have created sequences in # ncbiXm and tigrMgiTc as above. # Set up directory ssh kk cd /cluster/data/mm6/bed mkdir allenBrain cd allenBrain # Copy in allen20051021.tab file that was converted from # spreadsheet mailed by Susan Sunkin # Also copy in probeSeq.20051027.fasta, also from Susan. # Create a list of probe sequences filling ones missing from probeSeq.20050127.fa # with some NCBI and TIGR files, and some downloaded one at a time. allenCollectSeq allen20051021.tab probeSeq.20051027.fasta ../ncbiXm/ncbiNm.fa ../ncbiXm/ncbiXm.fa ../tigrMgiTc/tigrMgiTc.fa ~/kent/src/hg/makeDb/allenBrain/allenCollectSeq/extra.fa allProbes.fa allProbes.tab missing.tab allenBrainUrl.tab # Set up a blat run to align the probes. 
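# (A note on the cluster runs in this section: "the usual para
# try/push/time/check" mentioned above refers to the standard parasol
# job-control sequence.  A rough sketch, not a verbatim transcript of
# what was typed for any particular run:
#     para try      # run a handful of jobs as a sanity check
#     para check    # see whether those jobs finished cleanly
#     para push     # submit the rest of the batch
#     para check    # repeat until all jobs are done
#     para time     # summarize run times for the record
# )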
cd /cluster/data/mm6/bed/allenBrain mkdir split faSplit sequence allProbes.fa 200 split/rp mkdir run cd run ls -1 ../split/*.fa > mrna.lst ls -1 /iscratch/i/mm6/chrom/*.fa > genome.lst mkdir psl cat << '_EOF_' > gsub #LOOP blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 genome.lst mrna.lst gsub spec para create spec # Then do the usual para try/push/time/check until the run is finished #Completed: 7800 of 7800 jobs #CPU time in finished jobs: 105907s 1765.12m 29.42h 1.23d 0.003 y #IO & Wait Time: 447478s 7457.96m 124.30h 5.18d 0.014 y #Average job time: 71s 1.18m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 227s 3.78m 0.06h 0.00d #Submission to last job: 1292s 21.53m 0.36h 0.01d # Then do sorting and near-best-in-genome step on file server ssh kkstore cd /cluster/data/mm6/bed/allenBrain/run pslSort dirs raw.psl tmp psl pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl # Clean up big files no longer needed rm raw.psl rm -r psl rm -r ../split # Load up database ssh hgwdev cd /cluster/data/mm6/bed/allenBrain # Make a new table that contains the URLs for the allen brain genes # Make this one first since all.joiner considers it the master table. hgsql mm6 < ~/kent/src/hg/lib/allenBrainUrl.sql hgsql mm6 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl;' # Make probe alignment table, and load sequence. hgLoadPsl mm6 allenBrainAli.psl ln -s /cluster/data/mm6/bed/allenBrain/allProbes.fa /gbdb/mm6/allenBrain/allProbes.fa hgLoadSeq mm6 /gbdb/mm6/allenBrain/allProbes.fa # Make mapping between known genes and allenBrain hgMapToGene mm6 allenBrainAli -type=psl knownGene knownToAllenBrain ########################################################################### # RIKEN CAGE STUFF (DONE 11-16-2005 Andy) cd /cluster/data/mm6/bed mkdir rikenCageCtss cd rikenCageCtss/ hgsql mm5 -e 'select * from rikenCageTc' | cut -f2- | tail +2 > rikenCageTc.mm5.bed hgsql mm5 -e 'select chrom,chromStart,chromEnd,dataValue from rikenCageCtssMinus' \ | tail +2 > minus.mm5.bed hgsql mm5 -e 'select chrom,chromStart,chromEnd,dataValue from rikenCageCtssPlus' \ | tail +2 > plus.mm5.bed liftOver rikenCageTc.mm5.bed /gbdb/mm5/liftOver/mm5ToMm6.over.chain rikenCageTc.mm6.bed \ rikenCageTc.mm6.missed liftOver plus.mm5.bed /gbdb/mm5/liftOver/mm5ToMm6.over.chain plus.mm6.bed \ plus.mm6.missed liftOver minus.mm5.bed /gbdb/mm5/liftOver/mm5ToMm6.over.chain minus.mm6.bed \ minus.mm6.missed wc -l *.missed hgLoadBed mm6 rikenCageTc rikenCageTc.mm6.bed hgLoadBed -strict -bedGraph=4 mm6 rikenCageCtssMinus minus.mm6.bed hgLoadBed -strict -bedGraph=4 mm6 rikenCageCtssPlus plus.mm6.bed ########################################################################### # BLASTZ HUMAN Hg17 second time correctly # The initial run was done at a time when there was an error in # the processing scripts and the lineage specific repeats were not # handled correctly. This re-work produces better chains and nets # as the lineage specific repeats are handled properly, also set # the chain minScore and linearGap at better settings. 
# (DONE - 2005-11-30 - 2005-12-08 - Hiram) ssh pk mkdir /cluster/data/mm6/bed/blastzHg17.2005-11-30 cd /cluster/data/mm6/bed/blastzHg17.2005-11-30 cat << '_EOF_' > DEF # mouse vs human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm6 SEQ1_DIR=/san/sanvol1/scratch/mm6/nib SEQ1_SMSK=/san/sanvol1/scratch/mm6/linSpecRep.notInHuman SEQ1_LEN=/san/sanvol1/scratch/mm6/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Human Hg17 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/hg17/bothMaskedNibs SEQ2_SMSK=/scratch/hg/hg17/linSpecRep.notInMouse SEQ2_LEN=/cluster/bluearc/hg17/chrom.sizes SEQ2_CHUNK=3000000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzHg17.2005-11-30 TMPDIR=/scratch/tmp '_EOF_' # happy emacs /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -stop=net `pwd`/DEF > to-net.out 2>&1 # recover from broken blastz run, then continue /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -continue=cat -stop=net `pwd`/DEF > cat-to-net.out 2>&1 # Manually loading to put these in a different table name than the # existing track ssh hgwdev cd /cluster/data/mm6/bed/blastzHg17.2005-11-30/axtChain # Copy the loadUp.csh used in Mm7 and alter the script: cp /cluster/data/mm7/bed/blastzHg17.2005-11-14/axtChain/loadUp.csh . # set mm6 data base and table names to chainHg17u1 and netHg17u1 # and proper path names to here # then run the script time ./loadUp.csh > loadUp.out ssh kolossus cd /cluster/data/mm6/bed/blastzHg17.2005-11-30 time HGDB_CONF=~/.hg.conf.read-only \ featureBits mm6 chainHg17u1Link > fb.mm6.chainHg17u1Link 2>&1 & # 989964288 bases of 2597150411 (38.117%) in intersection # Previously with the broken lineage specific repeats, this was: # 966916309 bases of 2597150411 (37.230%) in intersection # Move the existing swap directory out of the way ssh pk cd /cluster/data/hg17/bed mv blastz.mm6.swap blastz.mm6.swap.2005-03-29 cd /cluster/data/mm6/bed/blastzHg17.2005-11-30 /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=net `pwd`/DEF > swap-to-net.out 2>&1 # Manually load to change tables loaded into ssh hgwdev cd /cluster/data/hg17/bed/blastz.mm6.swap/axtChain # Copy loadUp script used in mm7, change db to mm6 and table names # to be chainMm6u1 and netMm6u1 cp /cluster/data/hg17/bed/blastz.mm7.swap/axtChain/loadUp.csh . 
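# (The edits to the copied loadUp.csh were made by hand in an editor.
# A hypothetical sed sketch of roughly the substitutions involved,
# assuming the mm7 script refers to mm7 paths and chainMm7/netMm7
# table names -- adjust to what the copied script actually contains:
#     sed -e 's/chainMm7/chainMm6u1/g' -e 's/netMm7/netMm6u1/g' \
#         -e 's/mm7/mm6/g' loadUp.csh > loadUp.csh.new
# The same kind of hand edit applies to the other loadUp.csh copies
# elsewhere in this doc.)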
time ./loadUp.csh > loadUp.out # 79 minute load time time HGDB_CONF=~/.hg.conf.read-only \ featureBits hg17 chainMm6u1Link > fb.hg16.chainMm6u1Link 2>&1 & # 992497149 bases of 2866216770 (34.627%) in intersection # With broken lineage specific repeats, and different chain # minScore and linearGap settings, this measurement was: # 969459954 bases of 2866216770 (33.824%) in intersection # prepare new downloads and clean up ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/mm6 mv vsHg17 vsHg17.old # There is no file in # /usr/local/apache/htdocs/goldenPath/mm6/liftOver # this information was lost in cleaning over the summer cd /usr/local/apache/htdocs/goldenPath/hg17 mv vsMm6 vsMm6.old # There is no file in # /usr/local/apache/htdocs/goldenPath/hg17/liftOver # this information was lost in cleaning over the summer ssh pk cd /cluster/data/mm6/bed/blastzHg17.2005-11-30 /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -continue download `pwd`/DEF > download-clean.out 2>&1 # 2 minutes /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue download `pwd`/DEF > swap.download-clean.out 2>&1 # 2m 20s # Check the README information in # /usr/local/apache/htdocs/goldenPath/mm6/vsHg17 # /usr/local/apache/htdocs/goldenPath/hg17/vsMm6 # Use that information to place the matrix definition in the # trackDb chain html files # QA UPDATE: (2-14-2006 ASZ) # the tables from this run were originally named with a "u1" # e.g. chr1_chainMm6u1 (in the hg17 database) # I have deleted the old chain and net tables (from the original blastz run) # > DROP TABLE netMm6; # and renamed the new chain and net tables from this new blastz run # > RENAME TABLE netMm6u1 TO netMm6; ########################################################################### # BLASTZ CHICKEN GalGal2 second time correctly # The initial run was done at a time when there was an error in # the processing scripts and the lineage specific repeats were not # handled correctly. This re-work produces better chains and nets # as the lineage specific repeats are handled properly, also set # the chain minScore and linearGap at better settings. # (DONE - 2005-11-30 - 2005-12-09 - Hiram) ssh pk mkdir /cluster/data/mm6/bed/blastzGalGal2.2005-11-30 cd /cluster/data/mm6/bed/blastzGalGal2.2005-11-30 cat << '_EOF_' > DEF # mouse vs. 
# chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm6
SEQ1_DIR=/san/sanvol1/scratch/mm6/nib
SEQ1_SMSK=/san/sanvol1/scratch/mm6/linSpecRep.notInChicken
SEQ1_LEN=/san/sanvol1/scratch/mm6/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal2 - single chunk big enough for whole chroms at
# once
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes
SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/mm6/bed/blastzGalGal2.2005-11-30
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
    -chainMinScore=5000 -chainLinearGap=loose \
    -stop=net `pwd`/DEF > to-net.out 2>&1
# recover from network slowness in making the net file appear
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
    -chainMinScore=5000 -chainLinearGap=loose \
    -continue=net -stop=net `pwd`/DEF > net.out 2>&1
# Manually loading to put these in a different table name than the
# existing track
ssh hgwdev
cd /cluster/data/mm6/bed/blastzGalGal2.2005-11-30/axtChain
# Copy the loadUp.csh used in Mm7 and alter the script:
cp \
    /cluster/data/mm7/bed/blastzGalGal2.2005-11-14/axtChain/loadUp.csh .
# set mm6 data base and table names to chainGalGal2u1 and netGalGal2u1
# and proper path names to here
# then run the script
time ./loadUp.csh > loadUp.out
ssh kolossus
cd /cluster/data/mm6/bed/blastzGalGal2.2005-11-30
time HGDB_CONF=~/.hg.conf.read-only \
    featureBits mm6 chainGalGal2u1Link > fb.mm6.chainGalGal2u1Link 2>&1 &
# 77836209 bases of 2597150411 (2.997%) in intersection
# Previously with the broken lineage specific repeats and
# different linearGap matrix, this was:
# 82018349 bases of 2597150411 (3.158%) in intersection
# Move the existing swap directory out of the way
ssh pk
cd /cluster/data/galGal2/bed
mv blastz.mm6.swap blastz.mm6.swap.2005-04-04
cd /cluster/data/mm6/bed/blastzGalGal2.2005-11-30
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
    -chainMinScore=5000 -chainLinearGap=loose \
    -swap -stop=net `pwd`/DEF > swap-to-net.out 2>&1
# Manually load to change tables loaded into
ssh hgwdev
cd /cluster/data/galGal2/bed/blastz.mm6.swap/axtChain
# Copy loadUp script used in mm7, change db to mm6 and table names
# to be chainMm6u1 and netMm6u1
cp /cluster/data/galGal2/bed/blastz.mm7.swap/axtChain/loadUp.csh .
time ./loadUp.csh > loadUp.out 2>&1 ssh kolossus cd /cluster/data/mm6/bed/blastzGalGal2.2005-11-30 time HGDB_CONF=~/.hg.conf.read-only \ featureBits galGal2 chainMm6u1Link > fb.mm6.chainMm6u1Link 2>&1 & # 70147509 bases of 1054197620 (6.654%) in intersection # Previously, with broken lineage specific repeats and different # minScore and linearGap, this was: # 72687426 bases of 1054197620 (6.895%) in intersection ssh hgwdev # move the existing data out of the way cd /usr/local/apache/htdocs/goldenPath/galGal2 mv vsMm6 vsMm6.old cd /usr/local/apache/htdocs/goldenPath/mm6 mv vsGalGal2 vsGalGal2.old ssh pk cd /cluster/data/mm6/bed/blastzGalGal2.2005-11-30 /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose \ -continue=download `pwd`/DEF > download.out 2>&1 /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download `pwd`/DEF > swap.download.out 2>&1 # QA UPDATE: (3-1-2006 ASZ) # the tables from this run were originally named with a "u1" # e.g. chr1_chainMm6u1 (in the galGal2 database) # I have deleted the old chain and net tables (from the original blastz run) # > DROP TABLE netMm6; # and renamed the new chain and net tables from this new blastz run # > RENAME TABLE netMm6u1 TO netMm6; ########################################################################### # BLASTZ Rat Rn3 second time correctly # The initial run was done at a time when there was an error in # the processing scripts and the lineage specific repeats were not # handled correctly. This re-work produces better chains and nets # as the lineage specific repeats are handled properly, also set # the chain minScore and linearGap at better settings. # (DONE - 2005-11-30 - 2005-12-09 - Hiram) ssh pk mkdir /cluster/data/mm6/bed/blastzRn3.2005-11-30 cd /cluster/data/mm6/bed/blastzRn3.2005-11-30 cat << '_EOF_' > DEF # mouse vs rat export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm6 SEQ1_DIR=/san/sanvol1/scratch/mm6/nib SEQ1_SMSK=/san/sanvol1/scratch/mm6/linSpecRep.notInRat SEQ1_LEN=/san/sanvol1/scratch/mm6/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole # pieces SEQ2_DIR=/san/sanvol1/scratch/rn3/softNib SEQ2_SMSK=/san/sanvol1/scratch/rn3/linSpecRep.notInMouse SEQ2_LEN=/san/sanvol1/scratch/rn3/chrom.sizes SEQ2_CHUNK=300000000 SEQ2_LAP=0 BASE=/cluster/data/mm6/bed/blastzRn3.2005-11-30 TMPDIR=/scratch/tmp '_EOF_' # happy emacs /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -stop=net `pwd`/DEF > to-net.out 2>&1 # Manually loading to put these in a different table name than the # existing track ssh hgwdev cd /cluster/data/mm6/bed/blastzRn3.2005-11-30/axtChain # Copy the loadUp.csh used in Mm7 and alter the script: cp /cluster/data/mm7/bed/blastzRn3.2005-11-14/axtChain/loadUp.csh . 
# set mm6 data base and table names to chainRn3u1 and netRn3u1 # and proper path names to here # then run the script time ./loadUp.csh > loadUp.out ssh kolossus cd /cluster/data/mm6/bed/blastzRn3.2005-11-30 time HGDB_CONF=~/.hg.conf.read-only \ featureBits mm6 chainRn3u1Link > fb.mm6.chainRn3u1Link 2>&1 & # 1768516862 bases of 2597150411 (68.095%) in intersection # Previously, with the broken lineage specific repeats and # different minScore and linearGap, this was: # 1802980225 bases of 2597150411 (69.421%) in intersection # Move the existing swap directory out of the way ssh pk cd /cluster/data/rn3/bed mv blastz.mm6.swap blastz.mm6.swap.2005-03-29 cd /cluster/data/mm6/bed/blastzRn3.2005-11-30 /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=net `pwd`/DEF > swap-to-net.out 2>&1 # Manually load to change tables loaded into ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm6.swap/axtChain # Copy loadUp script used in mm7, change db to mm6 and table names # to be chainMm6u1 and netMm6u1 cp /cluster/data/rn3/bed/blastz.mm7.swap/axtChain/loadUp.csh . time ./loadUp.csh > loadUp.out 2>&1 ssh kolossus cd /cluster/data/mm6/bed/blastzRn3.2005-11-30 time HGDB_CONF=~/.hg.conf.read-only \ featureBits rn3 chainMm6u1Link > fb.rn3.chainMm6u1Link 2>&1 & # 1780302108 bases of 2571104688 (69.243%) in intersection # This was previously, with broken lineage specific repeats and # different chain minScore and linearGap: # 1812992492 bases of 2571104688 (70.514%) in intersection # move existing downloads out of the way: cd /usr/local/apache/htdocs/goldenPath/mm6 mv vsRn3 vsRn3.old cd /usr/local/apache/htdocs/goldenPath/rn3 mv vsMm6 vsMm6.old ssh pk cd /cluster/data/mm6/bed/blastzRn3.2005-11-30 /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -continue=download `pwd`/DEF > download.out 2>&1 /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download `pwd`/DEF > swap.download.out 2>&1 # QA UPDATE: (3-1-2006 ASZ) # the tables from this run were originally named with a "u1" # e.g. chr1_chainMm6u1 (in the rn3 database) # I have deleted the old chain and net tables (from the original blastz run) # > DROP TABLE netMm6; # and renamed the new chain and net tables from this new blastz run # > RENAME TABLE netMm6u1 TO netMm6; ########################################################################### # BLASTZ DOG canFam2 second time correctly # The initial run was done at a time when there was an error in # the processing scripts and the lineage specific repeats were not # handled correctly. This re-work produces better chains and nets # as the lineage specific repeats are handled properly, also set # the chain minScore and linearGap at better settings. 
# (DONE - 2005-12-02 - 2005-12-09 - Hiram)
ssh pk
mkdir /cluster/data/mm6/bed/blastzCanFam2.2005-12-02
cd /cluster/data/mm6/bed/blastzCanFam2.2005-12-02
cat << '_EOF_' > DEF
# mouse vs dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm6
SEQ1_DIR=/san/sanvol1/scratch/mm6/nib
SEQ1_SMSK=/san/sanvol1/scratch/mm6/linSpecRep.notInDog
SEQ1_LEN=/san/sanvol1/scratch/mm6/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2 - chunk big enough to do all chroms in single whole
# pieces
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse
SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/mm6/bed/blastzCanFam2.2005-12-02
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
    -chainMinScore=3000 -chainLinearGap=medium \
    -stop=net `pwd`/DEF > to-net.out 2>&1
# Manually loading to put these in a different table name than the
# existing track
ssh hgwdev
cd /cluster/data/mm6/bed/blastzCanFam2.2005-12-02/axtChain
cp /cluster/data/mm7/bed/blastzCanFam2.2005-11-14/axtChain/loadUp.csh .
# set mm6 data base and table names to chainCanFam2u1 and
# netCanFam2u1
# and proper path names to here
# then run the script
time ./loadUp.csh > loadUp.out
ssh kolossus
cd /cluster/data/mm6/bed/blastzCanFam2.2005-12-02
time HGDB_CONF=~/.hg.conf.read-only \
    featureBits mm6 chainCanFam2u1Link > fb.mm6.chainCanFam2u1Link 2>&1 &
# 829007305 bases of 2597150411 (31.920%) in intersection
# Previously on canFam1, broken lineage specific repeats,
# different minScore and linearGap, this was:
# 798637320 bases of 2597150411 (30.751%) in intersection
# Move the existing swap directory out of the way
ssh pk
cd /cluster/data/canFam2/bed
mv blastz.mm6.swap blastz.mm6.swap.2005-11-01
cd /cluster/data/mm6/bed/blastzCanFam2.2005-12-02
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
    -chainMinScore=3000 -chainLinearGap=medium \
    -swap -stop=net `pwd`/DEF > swap-to-net.out 2>&1
# Manually load to change tables loaded into
ssh hgwdev
cd /cluster/data/canFam2/bed/blastz.mm6.swap/axtChain
# Copy loadUp script used in mm7, change db to mm6 and table names
# to be chainMm6u1 and netMm6u1
cp /cluster/data/canFam2/bed/blastz.mm7.swap/axtChain/loadUp.csh .
time ./loadUp.csh > loadUp.out 2>&1 # 52 minute load time ssh kolossus time HGDB_CONF=~/.hg.conf.read-only \ featureBits canFam2 chainMm6u1Link > fb.canFam2.chainMm6u1Link 2>&1 & # 813032415 bases of 2384996543 (34.089%) in intersection # Angie's run of this had: # 780509502 bases of 2384996543 (32.726%) in intersection # Move existing chain and net download data of the way ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/canFam2 mv vsMm6 vsMm6.old # doesn't seem to be any canFam2 data yet in mm6 ssh pk cd /cluster/data/mm6/bed/blastzCanFam2.2005-12-02 /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -continue download `pwd`/DEF > download.out 2>&1 /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue download `pwd`/DEF > swap.download.out 2>&1 # UPDATED mm6.knownToVisiGene (2006-03-14 galt) ssh hgwdev knownToVisiGene mm6 # UPDATED mm6.knownToVisiGene (2006-04-05 galt) ssh hgwdev knownToVisiGene mm6 ####################################################################### ## LIFTOVER To Mm8 (DONE - 2006-05-15 - 2006-06-05 - Hiram) ssh kkr1u00 # do not need to run this command since /cluster/data/mm8/split10k # already exists from previous liftOver jobs (mm7 to mm8) # $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \ # mm8 /cluster/data/mm8/nib # as it says, DO THIS NEXT: ssh kk # if bin/scripts is not in your PATH, add it for this command: PATH=$PATH:/cluster/bin/scripts \ $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \ mm6 /cluster/data/mm6/nib mm8 /iscratch/i/mm8/split10k \ /cluster/data/mm8/11.ooc # as it says, DO THIS NEXT: cd /cluster/data/mm6/bed/blat.mm8.2006-05-15/run para try, check, push, check, ... # Completed: 1360 of 1360 jobs # CPU time in finished jobs: 3975252s 66254.20m 1104.24h 46.01d 0.126 y # IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y # Average job time: 2555s 42.58m 0.71h 0.03d # Longest finished job: 25347s 422.45m 7.04h 0.29d # Submission to last job: 1477498s 24624.97m 410.42h 17.10d # as it says, DO THIS NEXT: # this does the liftUp and makes the psl files # kkr1u00 is down at this time, fixup this script to work on kkr3u00 ssh kkr3u00 cd /cluster/data/mm6/bed ln -s blat.mm8.2006-05-15 blat.mm8 time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh mm6 mm8 # real 16m5.091s # as it says, DO THIS NEXT: # the prepares the batch to run for the chaining ssh kki time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \ mm6 /cluster/data/mm6/nib mm8 /cluster/data/mm8/nib # as it says, DO THIS NEXT: # running the chain batch cd /cluster/data/mm6/bed/blat.mm8.2006-05-15/chainRun para try, check, push, check, ... 
# Completed: 40 of 40 jobs
# Completed: 34 of 34 jobs
# CPU time in finished jobs: 6655s 110.92m 1.85h 0.08d 0.000 y
# IO & Wait Time: 1238s 20.63m 0.34h 0.01d 0.000 y
# Average job time: 232s 3.87m 0.06h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 759s 12.65m 0.21h 0.01d
# Submission to last job: 759s 12.65m 0.21h 0.01d
ssh kkstore01
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh mm6 mm8
# Created /cluster/data/mm6/bed/liftOver/mm6ToMm8.over.chain.gz
# as it says, DO THIS NEXT:
ssh hgwdev
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh mm6 mm8
# It says this:
# Now, add link for
# /usr/local/apache/htdocs/goldenPath/mm6/liftOver/mm6ToMm8.over.chain
# to hgLiftOver
# But I believe that link was already done:
cd /gbdb/mm6/liftOver
ls -og mm6ToMm8*
# lrwxrwxrwx 1 53 Jun 5 16:35 mm6ToMm8.over.chain.gz ->
# /cluster/data/mm6/bed/liftOver/mm6ToMm8.over.chain.gz

#########################################################################
### IGTC (Int'l GeneTrap Consortium) (DONE - 2006-06-07 - angie)
### fasta added 2006-06-21
### Doug Stryke in Tom Ferrin's lab
### NOTE -- as of 2007-03-01 the igtc track was automatically
### updated on hgwdev by the scripts monthlyUpdateIgtc.csh and
### updateIgtc.pl in kent/src/hg/utils/automation/ .
### 2007-09-01 was the last update for mm6 because IGTC moved on
### to {mm7, mm8, mm9}.
ssh hgwdev
mkdir /cluster/data/mm6/bed/igtc
cd /cluster/data/mm6/bed/igtc
wget http://www.genetrap.org/blattrack/genetrap_mm6.psl
grep -v ^track genetrap_mm6.psl \
    | hgLoadPsl mm6 -table=igtc stdin
# Probe fasta is shared by all assemblies:
wget http://www.genetrap.org/blattrack/genetrap.fasta
mkdir /gbdb/mm6/igtc
ln -s /cluster/data/mm6/bed/igtc/genetrap.fasta /gbdb/mm6/igtc/
hgLoadSeq -replace mm6 /gbdb/mm6/igtc/genetrap.fasta

#########################################################################
# SPLIT MM6 SEQUENCES FOR LIFTOVER FROM OTHER ASSEMBLIES
# (DONE, 2006-06-11, hartera)
ssh kkr1u00
cd /cluster/data/mm6
mkdir bed/liftOver
cd bed/liftOver
# split the mouse mm6 sequences first
makeLoChain-split mm6 /cluster/data/mm6/nib >&! split.log &
# also add these to the san for pk cluster runs
ssh pk
mkdir -p /san/sanvol1/scratch/mm6
rsync -a --progress \
    kkr1u00:/iscratch/i/mm6/split10k /san/sanvol1/scratch/mm6/

#############################################################################
# Create Allen Brain Atlas mapping. (Done 2007-02-08 Galt)
# We are creating several things: a psl probe-track for the RR on mouse,
# a link out from kg to the probe to the ABA website,
# and a set of gene/probe info which visiGene will use.
# (This needs to be done after having created sequences in
# ncbiXm and tigrMgiTc as above.)
# metadata.log and SRGEsequence.log were provided by
# Susan Sunkin
# this is an update to the visiGene with 6000 new images.
cd /san/sanvol1/visiGene/offline/allenBrain/imageDisk/May_06
# convert new metadata.log to be like previous allen.tab
cat metadata.log | tail +3 | sed 's/_.*jp2//' \
    | gawk -F ',' '{ print$1"\tUnknown Gene\t"$3"\t"$4"\thttp://www.brain-map.org/search.do?queryText="($3=="0" ? "genesym" : "egeneid")"="($3=="0" ?
$1 : $3) }' \ > allen20061204update.tab cat /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20051021.tab \ allen20061204update.tab > allen20061204combined.tab # new program allenCleanup # (located in ~kent/src/hg/makeDb/outside/allenBrain/allenCleanup) # make the output from allen20061204combined.tab go into # /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20061204.tab # can also use allenCleanup options to clean up duplicate and unused images, and to check the # full-image,thumb,and tile .jpgs are present. allenCleanup \ /san/sanvol1/visiGene/offline/allenBrain/imageDisk \ /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain \ /san/sanvol1/visiGene/offline/allenBrain/imageDisk/May_06/allen20061204combined.tab \ /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20061204.tab \ > log # convert new SRGEsequence.log to be new probeSeq.fasta cat SRGEsequence.log | tail +2 \ | gawk -F ',' '{ if (($6=="sagittal")&&($8!="")) print ">aibs|"$1"|sym|"$2"|entrez|"$3"|refseq|"$4"|probe|"$7"\n"$8 }' \ > probeSeq.20061204.fasta # update the files in probesAndData for use by mm6,mm7,mm8 cd /san/sanvol1/visiGene/offline/allenBrain/probesAndData/ cp /san/sanvol1/visiGene/offline/allenBrain/imageDisk/May_06/probeSeq.20061204.fasta . # copy in the data files (directory already exists from previous build) cd /cluster/data/mm6/bed/allenBrain mkdir old mv * old/ cp /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20061204.tab . cp /san/sanvol1/visiGene/offline/allenBrain/probesAndData/probeSeq.20061204.fasta . # updated kent/src/hg/makeDb/outside/allenBrain/allenCollectSeq # to relax handling of refseq ids between the .tab and the .fasta # i.e. sometimes chopping off trailing [.][0-9], sometimes adding [.][0-4]. # Last time it had found 23 missing. Now there are none missing. # Create a list of probe sequences filling ones missing from probeSeq.20061204.fa # with some NCBI and TIGR files, and some downloaded one at a time. allenCollectSeq allen20061204.tab \ probeSeq.20061204.fasta ../ncbiXm/ncbiNm.fa ../ncbiXm/ncbiXm.fa ../tigrMgiTc/tigrMgiTc.fa \ ~/kent/src/hg/makeDb/outside/allenBrain/allenCollectSeq/extra.fa \ allProbes.fa allProbes.tab missing.tab allenBrainUrl.tab #18463 sequences in ../ncbiXm/ncbiNm.fa #8386 sequences in ../ncbiXm/ncbiXm.fa #161499 sequences in ../tigrMgiTc/tigrMgiTc.fa #16 sequences in #/cluster/home/galt/kent/src/hg/makeDb/outside/allenBrain/allenCollectSeq/extra.fa #21075 sequences in probeSeq.20061204.fasta #17895 (99.9%) hitProbe #9 (0.1%) hitNm #1 (0.0%) hitXm #3 (0.0%) hitTc #5 (0.0%) hitExtra #0 (0.0%) hitNone # go run vgLoadAllen (see visiGene.txt) to make sure everything is ok # before proceeding. # ok, looks fine now after using allenCleanup # Set up a blat run to align the probes. 
ssh kk
cd /cluster/data/mm6/bed/allenBrain
mkdir split
faSplit sequence allProbes.fa 200 split/rp
mkdir run
cd run
ls -1 ../split/*.fa > mrna.lst
ls -1 /iscratch/i/mm6/chrom/*.fa > genome.lst
mkdir psl
cat << '_EOF_' > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 genome.lst mrna.lst gsub spec
para create spec
#Completed: 7760 of 7760 jobs
#CPU time in finished jobs: 99653s 1660.89m 27.68h 1.15d 0.003 y
#IO & Wait Time: 411590s 6859.83m 114.33h 4.76d 0.013 y
#Average job time: 66s 1.10m 0.02h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1279s 21.32m 0.36h 0.01d
#Submission to last job: 2352s 39.20m 0.65h 0.03d
# Then do sorting and near-best-in-genome step on file server
ssh kkstore
cd /cluster/data/mm6/bed/allenBrain/run
pslSort dirs raw.psl tmp psl
pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null
sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl
# Clean up big files no longer needed
rm raw.psl
rm -r psl
rm -r ../split
# Load up database
ssh hgwdev
cd /cluster/data/mm6/bed/allenBrain
# Make a new table that contains the URLs for the allen brain genes
# Make this one first since all.joiner considers it the master table.
hgsql mm6 -e 'drop table allenBrainUrl'
hgsql mm6 < ~/kent/src/hg/lib/allenBrainUrl.sql
hgsql mm6 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl'
# Make probe alignment table, and load sequence.
hgLoadPsl mm6 allenBrainAli.psl
rm /gbdb/mm6/allenBrain/allProbes.fa
ln -s /cluster/data/mm6/bed/allenBrain/allProbes.fa /gbdb/mm6/allenBrain/allProbes.fa
hgLoadSeq -replace mm6 /gbdb/mm6/allenBrain/allProbes.fa
# Make mapping between known genes and allenBrain
hgMapToGene mm6 allenBrainAli -type=psl knownGene knownToAllenBrain

##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna mm6

# UPDATED mm6.knownToVisiGene (DONE galt 2007-02-15)
ssh hgwdev
knownToVisiGene mm6

##########################################################################
# construct liftOver to mm10 (DONE - 2012-05-01 - Hiram)
screen -S mm6    # manage this longish running job in a screen
mkdir /hive/data/genomes/mm6/bed/blat.mm10.2012-05-01
cd /hive/data/genomes/mm6/bed/blat.mm10.2012-05-01
# check it with -debug first to see if it is going to work:
time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
    -ooc=/hive/data/genomes/mm6/ooc/11.ooc \
    -debug -dbHost=hgwdev -workhorse=hgwdev mm6 mm10
# if that is OK, then run it:
# This is interesting, the above debug correctly used
# /hive/data/genomes/mm6/mm6.2bit but this attempted run said
# that /scratch/data/mm6/mm6.2bit was missing.
# So, ran the scripts created above manually to get the blat done:
# Completed: 6027 of 6027 jobs
# CPU time in finished jobs: 961983s 16033.05m 267.22h 11.13d 0.031 y
# IO & Wait Time: 258451s 4307.51m 71.79h 2.99d 0.008 y
# Average job time: 202s 3.37m 0.06h 0.00d
# Longest finished job: 771s 12.85m 0.21h 0.01d
# Submission to last job: 2617s 43.62m 0.73h 0.03d
# then:
time doSameSpeciesLiftOver.pl -continue=chain -buildDir=`pwd` \
    -bigClusterHub=swarm \
    -ooc=/hive/data/genomes/mm6/ooc/11.ooc \
    -dbHost=hgwdev -workhorse=hgwdev mm6 mm10 > chain.log 2>&1
# real 63m12.463s
# verify this file exists:
ls -og -L /gbdb/mm6/liftOver/mm6ToMm10.over.chain.gz
# -rw-rw-r-- 1 3567460 May 1 12:06 /gbdb/mm6/liftOver/mm6ToMm10.over.chain.gz
# and try out the conversion on genome-test from mm6 to mm10
############################################################################
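# (To spot-check the new liftOver chain outside the browser, the
# over.chain file can be fed straight to the liftOver command line
# tool.  Illustrative only -- "mm6Regions.bed" is a placeholder for
# any BED file of mm6 coordinates:
#     liftOver mm6Regions.bed /gbdb/mm6/liftOver/mm6ToMm10.over.chain.gz \
#         mm6ToMm10.mapped.bed mm6ToMm10.unmapped.bed
# )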