# for emacs: -*- mode: sh; -*-

# This file describes browser build for the mouse
# genome, August 2005, ncbi mouse_35 - Mm7
#
#       "$Id: mm7.txt,v 1.12 2008/10/17 01:06:34 markd Exp $"
#
# NOTE: this is a record of commands that were run interactively on several
#       hosts (each "ssh host" line marks where the following commands were
#       typed); it is not meant to be executed top to bottom as one script.

#######################################################################
# DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2005-08-26 - Hiram)
#
#       Examine disk space issues, summarize mm6 usage:
    ssh kkstore01
    cd /cluster/store10
    du -hsc mm6
    #   188G    mm6

    #   There is an amount of disk space on kkstore02 that would be
    #   appropriate for this
    #   store10, thus:
    ssh kkstore02
    df -h | grep sd
    #   /dev/sdc1 1.5T 1.1T 330G 77% /export/cluster/store5

    #   So,
    mkdir /cluster/store5/mm7
    ln -s /cluster/store5/mm7 /cluster/data/mm7
    mkdir /cluster/data/mm7/ncbi
    cd /cluster/data/mm7/ncbi
    #   set the login name and password in a .wgetrc file in this
    #   directory, permissions 600, its format:
    #   login = name
    #   passwd = xxxx
    WGETRC=`pwd`/.wgetrc
    export WGETRC
    wget --timestamping --force-directories --directory-prefix=. \
        --dont-remove-listing --recursive --level=4 --no-parent \
        --no-host-directories --cut-dirs=1 \
        ftp://ftp-private.ncbi.nih.gov/mouse_35
    #Downloaded: 2,348,948,224 bytes in 55 files
    #real 48m1.328s
    #user 0m2.696s
    #sys 0m21.548s

    #   Fixup the agp and seq_contig.md files to add chrM
    #   No chrM or chrMT was delivered.  Copy from previous assembly
    ssh kkstore02
    cd /cluster/data/mm7/ncbi/chrfasta
    cp -p /cluster/data/mm6/ncbi/chrfasta/chrM.fa.gz .
    cd ../contigfasta
    cp -p /cluster/data/mm6/ncbi/contigfasta/chrM.fa.gz .
# with a fixed up header line to be like all the others:
    # >lcl|chrM.fa gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion, complete genome
    cd /cluster/data/mm7
    zcat ncbi/allrefcontig.chr.agp.gz > allrefcontig.chr.agp
    echo -e "chrM\t1\t16299\t1\tF\tAY172335.1\t1\t16299\t+" >> \
        allrefcontig.chr.agp
    gzip allrefcontig.chr.agp
    zcat ncbi/allcontig.agp.gz > allcontig.agp
    echo -e "NC_005089\t1\t16299\t1\tF\tAY172335\t\t1\t16299\t+" >> \
        allcontig.agp
    gzip allcontig.agp
    #   The continuation lines of the sed insert below start in column 0 on
    #   purpose: any leading blanks would become part of the inserted text.
    zcat ncbi/seq_contig.md.gz | sed -e "3863i\
10090\tM\t0\t0\t+\tstart\t-1\tCONTIG\tC57BL/6J\t10\n\
10090\tM\t1\t16299\t+\tNC_005089\tGI:34538597\tCONTIG\tC57BL/6J\tna\n\
10090\tM\t16299\t16299\t+\tend\t-2\tCONTIG\tC57BL/6J\t10" > seq_contig.md
    #   The line number 3863 was found by checking the contents of
    #   ncbi/seq_contig.md.gz and it was the line starting with:
    #   10090^IUn|NT_039862^I1^I3190
    #   Wanted this chrM information before that line.
    gzip seq_contig.md

    #   summarize sequence counts
    mkdir faCounts
    time faCount ncbi/chrfasta/chr*.fa.gz > faCounts/chrfasta.faCount 2>&1 &
    #   about 1.5 minutes
    time faCount ncbi/contigfasta/chr*.fa.gz > \
        faCounts/contigfasta.faCount 2>&1 &
    #   about 3 minutes
    time zcat ncbi/chrfasta/chr*.fa.gz | grep "^>" > \
        faCounts/chrfasta.headers 2>&1 &
    time zcat ncbi/contigfasta/chr*.fa.gz | grep "^>" > \
        faCounts/contigfasta.headers 2>&1 &
    #   about 2 minutes each for the above two zcat/greps

#############################################################################
# BREAK UP SEQUENCE INTO 5 MB CHUNKS at NON-BRIDGED CONTIGS
#       (DONE - 2005-08-26 - Hiram)
    ssh kkstore02
    cd /cluster/data/mm7
    for F in ncbi/chrfasta/chr*.fa.gz
    do
        CHR=`basename ${F} | sed -e "s/.fa.gz//; s/chr//"`
        echo ${CHR} ${F}
        mkdir -p "${CHR}"
        zcat allrefcontig.chr.agp.gz | \
            perl -we "while(<>){if (/^chr${CHR}\t/) {print;}}" > \
            ${CHR}/chr${CHR}.agp
        zcat ncbi/chrfasta/chr${CHR}.fa.gz | \
            perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' | \
            splitFaIntoContigs ${CHR}/chr${CHR}.agp \
            stdin /cluster/data/mm7 -nSize=5000000
    done
    #   The above loop takes about 5 minutes

#############################################################################
# CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2005-08-26 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/mm7/jkStuff
    cd /cluster/data/mm7
    mkdir Un tmp
    cp -p /cluster/data/mm6/jkStuff/ncbiFixAgp ./jkStuff
    zcat allrefcontig.chr.agp.gz | ./jkStuff/ncbiFixAgp /dev/stdin | gzip > \
        allrefcontig.chr.ordered.agp.gz
    #   Set the appropriate release number here, this one is 35
    #   Fetch the script from the previous assembly
    sed -e "s/buildNum = 34/buildNum = 35/" \
        /cluster/data/mm6/jkStuff/ncbiToRandomAgps > \
        jkStuff/ncbiToRandomAgps
    chmod +x jkStuff/ncbiToRandomAgps
    gunzip seq_contig.md.gz allrefcontig.chr.ordered.agp.gz
    #   NOTE ! * ! This mm7 contig.idmap now includes the celera assembly
    #   Filter that out for use here.
    grep ref_strain ncbi/contig.idmap \
        | ./jkStuff/ncbiToRandomAgps seq_contig.md \
            allrefcontig.chr.ordered.agp \
            /dev/stdin .
    #   The chrUn_random.agp created by this is too large with the 5000
    #   gaps.  it will work with 1000 gaps, so fixup the chrUn_random
    #   agp:
    grep ref_strain ncbi/contig.idmap \
        | ./jkStuff/ncbiToRandomAgps -gapLen 1000 -chrom Un \
            seq_contig.md allrefcontig.chr.ordered.agp /dev/stdin .

    #   NOTE(review): the log records two loop headers back to back here
    #   ("for C in ? ??" and "for C in Un"), which is not valid shell as
    #   written.  Presumably the loop was run over all chroms and then
    #   re-run for Un alone after the -gapLen fix above -- confirm.
    #   The all-chrom form (which also covers Un) is left active; the
    #   Un-only header is kept as a comment.
    for C in ? ??
    #   for C in Un
    do
        if [ -s ${C}/chr${C}_random.ctg.agp ]; then
            echo "building ${C}/chr${C}_random.fa"
            rm -f ./tmp.fa
            zcat ncbi/contigfasta/chr${C}.fa.gz | \
                perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' > ./tmp.fa
            agpToFa -verbose=2 -simpleMulti \
                ${C}/chr${C}_random.ctg.agp chr${C}_random \
                ${C}/chr${C}_random.fa ./tmp.fa
            rm -f ./tmp.fa
        fi
    done > tmp/agpToFa.out 2>&1
    #   the above loop takes about 6 minutes, examine the tmp/agpToFa.out
    #   record for any errors

    #   We need the lift information from these random.ctg.agp files
    cp -p /cluster/data/cb2/scripts/agpToLift.pl ./jkStuff
    for AGP in ?/*_random.ctg.agp ??/*_random.ctg.agp
    do
        CHR=`dirname ${AGP}`
        echo ${CHR}
        mkdir -p ${CHR}/lift
        ./jkStuff/agpToLift.pl ${AGP} > ${CHR}/lift/ctg_random.lft
    done

    #   Clean these up to avoid confusion later... they're easily rebuilt
    #   with the ncbiToRandomAgps script above
    rm ?/*_random.ctg.agp ??/*_random.ctg.agp
    gzip seq_contig.md allrefcontig.chr.ordered.agp

#############################################################################
# BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS
#       (DONE 2005-08-26 - Hiram)
    ssh kkstore02
    cd /cluster/data/mm7
    for C in ? ??
    do
        if [ -s ${C}/chr${C}_random.fa ]; then
            splitFaIntoContigs -nSize=5000000 ${C}/chr${C}_random.agp \
                ${C}/chr${C}_random.fa .
            mkdir -p ${C}/lift
            rm -f ${C}/lift/rOut.lst ${C}/lift/random.lft ${C}/lift/random.lst
            mv ${C}_random/lift/oOut.lst ${C}/lift/rOut.lst
            mv ${C}_random/lift/ordered.lft ${C}/lift/random.lft
            mv ${C}_random/lift/ordered.lst ${C}/lift/random.lst
            rmdir ${C}_random/lift
            rm ${C}_random/chr${C}_random.agp ${C}_random/chr${C}_random.fa
            rm -rf ${C}/chr${C}_random_*
            mv ${C}_random/chr${C}_random_* ${C}
            rmdir ${C}_random
        fi
    done > tmp/split.out 2>&1
    #   the above loop takes less than a minute
    #   scan the tmp/split.out file for possible errors

#############################################################################
# MAKE LIFTALL.LFT (DONE - 2005-08-26 - Hiram)
    ssh kkstore02
    cd /cluster/data/mm7
    cat ?/lift/*.lft ??/lift/*.lft > jkStuff/liftAll.lft

#############################################################################
# CREATING DATABASE (DONE - 2005-08-26 - Hiram)
    ssh kkstore02
    cd /cluster/data/mm7
    faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
        mm7.2bit
    twoBitInfo mm7.2bit stdout | sort -rn +1 > chrom.sizes
    grep -v random chrom.sizes | cut -f1 | sed -e "s/chr//" > chrom.lst
    twoBitInfo mm7.2bit stdout |
        awk '{printf "%s\t%s\t/gbdb/mm7/mm7.2bit\n", $1,$2}' > chromInfo.tab

    ssh hgwdev
    cd /cluster/data/mm7
    hgsql -e "create database mm7;" mysql
    #   Make sure we have enough room (eventually ~ 70Gb) for mysql tables:
    df -h | grep mysql
    #   /dev/sda1 472G 227G 222G 51% /var/lib/mysql2
    #   /dev/sdc1 1.8T 1010G 650G 61% /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2005-08-26 - Hiram)
    #   Use any of the newest databases to ensure that the organization
    #   of the grp table is up to date
    ssh hgwdev
    hgsql mm7 -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp"
    hgsql mm7 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql mm7 -e 'load data local infile "chromInfo.tab" into table chromInfo;'

    #   Enter mm7 into dbDb and defaultDb so test browser knows about it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("mm7", "Aug 2005", "/gbdb/mm7", "Mouse", \
        "chr6:28912411-28925620", 1, 23, "Mouse", \
        "Mus musculus", "/gbdb/mm7/html/description.html", 0, 0, \
        "NCBI Build 35");' -h localhost hgcentraltest
    #   It can be the default genome on genome-test
    hgsql hgcentraltest \
        -e 'update defaultDb set name="mm7" where genome="Mouse";'

    #   start a new entry in the trackDb hierarchy
    cd $HOME/kent/src/hg/makeDb/trackDb/mouse
    mkdir mm7
    cvs add mm7
    cd mm7
    cp ../mm6/description.html .
    vi description.html     # - fixup text for this assembly
    cvs add description.html
    cvs commit
    cd ../..
    vi trackDb.ra           # - add mm7 to the list
    mkdir /cluster/data/mm7/html
    mkdir /gbdb/mm7
    ln -s /cluster/data/mm7/html /gbdb/mm7/html
    ln -s /cluster/data/mm7/mm7.2bit /gbdb/mm7/mm7.2bit
    cp -p mouse/mm7/description.html /gbdb/mm7/html
    make DBS=mm7

#############################################################################
# GOLD GAP tracks (DONE - 2005-08-26 - Hiram)
    ssh hgwdev
    cd /cluster/data/mm7
    #   make sure these tmp contig agp files are gone, easily generated
    #   as above with jkStuff/ncbiToRandomAgps
    mkdir ffa
    zcat ncbi/sequence.inf.gz > ffa/sequence.inf
    hgGoldGapGl -chromLst=chrom.lst mm7 /cluster/data/mm7 .
featureBits mm7 gold
    #   2583394090 bases of 2583394090 (100.000%) in intersection
    featureBits mm6 gold
    #   2597150411 bases of 2597150411 (100.000%) in intersection
    featureBits mm5 gold
    #   2615483787 bases of 2615483787 (100.000%) in intersection
    featureBits mm4 gold
    #   2627444668 bases of 2627444668 (100.000%) in intersection

    featureBits mm7 gap
    #   264323239 bases of 2583394090 (10.232%) in intersection
    featureBits mm6 gap
    #   482483041 bases of 2597150411 (18.577%) in intersection
    featureBits mm5 gap
    #   549468286 bases of 2615483787 (21.008%) in intersection
    featureBits mm4 gap
    #   325167539 bases of 2627444668 (12.376%) in intersection

#############################################################################
# DISTRIBUTE SEQUENCE TO INTERMEDIATE SERVERS FOR KLUSTER RUNS
#       (DONE - 2005-08-26 - Hiram)
    ssh kkstore02
    mkdir /cluster/bluearc/mm7
    cd /cluster/data/mm7

    #   break up into 500,000 sized chunks for repeat masker runs
    TOP=`pwd`
    export TOP
    for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
        ??/chr??_random_[0-9]*
    do
        ctg=`basename ${CTG_DIR}`
        cd ${CTG_DIR}
        faSplit size ${ctg}.fa 500000 ${ctg}_ -lift=${ctg}.lft -maxN=500000
        cd ${TOP}
    done > tmp/ctg_split.out 2>&1
    #   about 3 minutes, check the tmp/ctg_split.out for anything unusual

    #   make a list of the contigs
    TOP=`pwd`
    export TOP
    for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
        ??/chr??_random_[0-9]*
    do
        ctg=`basename ${CTG_DIR}`
        cd ${CTG_DIR}
        ls ${ctg}_* | while read F
        do
            echo ${CTG_DIR}/${F}
        done
        cd ${TOP}
    done > contig500K.lst
    #   count 'em
    wc contig500K.lst
    #   6172 6172 157096 contig500K.lst

    mkdir /cluster/bluearc/mm7/contigs
    rsync -a --progress --files-from=contig500K.lst . \
        /cluster/bluearc/mm7/contigs/
    #   verify the contig copy above functioned OK
    find /cluster/bluearc/mm7/contigs -type f | wc
    #   6172 6172 336084

#############################################################################
# SIMPLE REPEAT TRACK (DONE - 2005-08-29 Hiram)
    #   TRF can be run in parallel with RepeatMasker
    #   since it doesn't require masked input sequence.
    ssh pk
    mkdir /cluster/data/mm7/bed/simpleRepeat
    cd /cluster/data/mm7/bed/simpleRepeat
    mkdir trf
    #   runTrf: copy one .fa to a private /tmp directory, run trfBig on it
    #   there, copy the resulting bed back to $2 and clean up /tmp.
    cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # happy emacs
    chmod +x runTrf

    cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << keep emacs coloring happy

    ls -1S /cluster/data/mm7/?/chr*.fa /cluster/data/mm7/??/chr*.fa > genome.lst
    gensub2 genome.lst single gsub jobList
    para create jobList
    #   be gentle on the start up of these things since each starting
    #   job is a copy of the .fa file, a 'para try' starts 10 jobs
    #   there are only 40 total jobs
    para try
    sleep 30
    para check
    para try
    sleep 30
    para check
    para try
    sleep 30
    para check
    para try
    para check
    #   ... all 40 are running at this point, some are already done
    para time
# Completed: 40 of 40 jobs
# CPU time in finished jobs: 15539s 258.98m 4.32h 0.18d 0.000 y
# IO & Wait Time: 532s 8.87m 0.15h 0.01d 0.000 y
# Average job time: 402s 6.70m 0.11h 0.00d
# Longest finished job: 1803s 30.05m 0.50h 0.02d
# Submission to last job: 1845s 30.75m 0.51h 0.02d

    #   Load into the database
    ssh hgwdev
    cd /cluster/data/mm7/bed/simpleRepeat
    cat trf/chr*.bed > simpleRepeat.bed
    #   NOTE(review): other steps in this log use $HOME/kent/src/hg/lib for
    #   .sql files; the path below is recorded as it appears -- confirm.
    hgLoadBed -strict mm7 simpleRepeat simpleRepeat.bed \
        -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    #   Loaded 1143549 elements of size 16
    featureBits mm7 simpleRepeat
    #   77021175 bases of 2583394090 (2.981%) in intersection
    featureBits mm6 simpleRepeat
    #   83220723 bases of 2597150411 (3.204%) in intersection
    featureBits mm5 simpleRepeat
    #   81414259 bases of 2615483787 (3.113%) in intersection
    featureBits mm4 simpleRepeat
    #   82600648 bases of 2627444668 (3.144%) in intersection
    featureBits mm3 simpleRepeat
    #   75457193 bases of 2505900260 (3.011%) in intersection

###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
    ssh hgwdev
    cd /cluster/data/mm7/bed
    mkdir microsat
    cd microsat
    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
    /cluster/bin/i386/hgLoadBed mm7 microsat microsat.bed

#############################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2005-08-29 - Hiram)
    #   After the simpleRepeats track has been built, make a filtered version
    #   of the trf output: keep trf's with period <= 12:
    ssh kkstore02
    cd /cluster/data/mm7/bed/simpleRepeat
    mkdir trfMask
    for F in trf/chr*.bed
    do
        echo "${F} -> ${F/trf\//}"
        awk '{if ($5 <= 12) print;}' ${F} > trfMask/${F/trf\//}
    done

#############################################################################
# REPEATMASKER RUN (after contigs have been distributed to
# bluearc FS)
#       (DONE - 2005-08-26 - 2005-08-29 - Hiram)
    #   Record RM version used:
    cat /cluster/bluearc/RepeatMasker050305/Libraries/version
    #   RepBase Update 9.11, RM database version 20050112
    #   /cluster/bluearc/RepeatMasker050305

    ssh pk
    #- Make the run directory and job list:
    cd /cluster/data/mm7
    #   RMMouse <contigDir> <chunk.fa>: copy one 500K chunk to a private
    #   /tmp/mm7 work directory, run RepeatMasker on it there, copy the
    #   .out (and .align/.tbl/.cat when present) back into
    #   /cluster/data/mm7/<contigDir>, then remove the work directory.
    cat << '_EOF_' > jkStuff/RMMouse
#!/bin/csh -fe

cd /cluster/data/mm7/$1
pushd .
/bin/mkdir -p /tmp/mm7/$2
/bin/cp /cluster/bluearc/mm7/contigs/$1/$2 /tmp/mm7/$2
cd /tmp/mm7/$2
/cluster/bluearc/RepeatMasker050305/RepeatMasker -ali -s -species mus $2
popd
/bin/cp /tmp/mm7/$2/$2.out ./
if (-e /tmp/mm7/$2/$2.align) /bin/cp /tmp/mm7/$2/$2.align ./
if (-e /tmp/mm7/$2/$2.tbl) /bin/cp /tmp/mm7/$2/$2.tbl ./
if (-e /tmp/mm7/$2/$2.cat) /bin/cp /tmp/mm7/$2/$2.cat ./
/bin/rm -fr /tmp/mm7/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/mm7/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/mm7
'_EOF_'
    # happy emacs
    chmod +x jkStuff/RMMouse

    mkdir -p RMRun
    rm -f RMRun/RMJobs
    #   mkRMJobs.pl: turn each path of contig500K.lst (read on stdin) into
    #   one parasol job line calling RMMouse with its directory and file.
    cat << '_EOF_' > jkStuff/mkRMJobs.pl
#!/usr/bin/env perl

use strict;
use warnings;
use File::Basename;

while (my $line=<>)
{
    chomp $line;
    my $basename = basename($line);
    my $dirname = dirname($line);
    print "/cluster/data/mm7/jkStuff/RMMouse $dirname $basename {check out line+ /cluster/data/mm7/$dirname/$basename.out}\n";
}
'_EOF_'
    # happy emacs
    chmod +x jkStuff/mkRMJobs.pl
    cat contig500K.lst | ./jkStuff/mkRMJobs.pl > RMRun/RMJobs
    wc RMRun/RMJobs
    #   6172 43204 770920 RMRun/RMJobs

    #- Do the run
    cd /cluster/data/mm7/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
# Completed: 6172 of 6172 jobs
# CPU time in finished jobs: 26381042s 439684.03m 7328.07h 305.34d 0.837 y
# IO & Wait Time: 46088s 768.13m 12.80h 0.53d 0.001 y
# Average job time: 4282s 71.36m 1.19h 0.05d
# Longest finished job: 6370s 106.17m 1.77h 0.07d
# Submission to last job: 127318s 2121.97m 35.37h 1.47d

    #- Lift up the split-contig .out's to contig-level .out's
    ssh kkstore02
    cd /cluster/data/mm7
    for D in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
        ??/chr??_random_[0-9]*
    do
        CONTIG=`basename ${D}`
        liftUp ${D}/${CONTIG}.fa.out ${D}/${CONTIG}.lft warn \
            ${D}/${CONTIG}_[0-9]*.fa.out
    done > tmp/RM.lift.outs 2>&1
    #   scan tmp/RM.lift.outs for unusual errors or difficulties

    #   liftRM_out_to_chr.sh: for every one- and two-character chrom
    #   directory, lift the contig-level .out files to chrom level, for
    #   the ordered sequence and (when random.lft exists) the _random one.
    cat << '_EOF_' > jkStuff/liftRM_out_to_chr.sh
#!/bin/sh
for C in ? ??
do
    echo "lifting ${C}"
    cd ${C}
    if [ -s lift/ordered.lft ]; then
        liftUp chr${C}.fa.out lift/ordered.lft warn `cat lift/oOut.lst`
    else
        echo "WARNING: Can not find ${C}/lift/ordered.lft"
    fi
    if [ -s lift/random.lft ]; then
        liftUp chr${C}_random.fa.out lift/random.lft warn `cat lift/rOut.lst`
    fi
    cd ..
done
'_EOF_'
    # happy emacs
    chmod +x jkStuff/liftRM_out_to_chr.sh
    ./jkStuff/liftRM_out_to_chr.sh > tmp/liftRM_out_to_chr.out 2>&1
    #   scan the results tmp/liftRM_out_to_chr.out
    #   there is a single: WARNING: Can not find Un/lift/ordered.lft
    #   which is OK
    #   List the final .out files, nothing should be size 0:
    ls -og */*.fa.out

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/mm7
    hgLoadOut mm7 ?/chr?.fa.out ??/chr??.fa.out ?/chr?_random.fa.out \
        ??/chr??_random.fa.out
    #   about 7 minutes, there are always a few of these errors:
    #   (the lines below are program output, kept as comments so that
    #   nothing here can be mistaken for a command)
    # Strange perc. field -5.6 line 254091 of 1/chr1.fa.out
    # Strange perc. field -3.8 line 77848 of 6/chr6.fa.out
    # Strange perc. field -6.2 line 77848 of 6/chr6.fa.out
    # Strange perc. field -12631.5 line 86222 of 14/chr14.fa.out
    # Strange perc. field -14047.1 line 86222 of 14/chr14.fa.out
    # Strange perc. field -2988.3 line 86222 of 14/chr14.fa.out
    # Strange perc. field -2.1 line 151263 of 14/chr14.fa.out
    # Strange perc. field -6327.2 line 29247 of Y/chrY_random.fa.out
    # Strange perc. field -1122.4 line 29247 of Y/chrY_random.fa.out
    # Strange perc. field -74.6 line 29247 of Y/chrY_random.fa.out
    #   And there are always some of these too:
    # note: 402 records dropped due to repStart > repEnd

    #   verify everything seems normal compared with previous builds
    featureBits mm7 rmsk
    #   1092611581 bases of 2583394090 (42.294%) in intersection
    featureBits mm6 rmsk
    #   1110222842 bases of 2597150411 (42.748%) in intersection
    featureBits mm5 rmsk
    #   1137310280 bases of 2615483787 (43.484%) in intersection
    featureBits mm4 rmsk
    #   1130883581 bases of 2627444668 (43.041%) in intersection
    featureBits mm3 rmsk
    #   1080265553 bases of 2505900260 (43.109%) in intersection
    featureBits -countGaps mm7 rmsk
    #   1092611581 bases of 2847717329 (38.368%) in intersection
    featureBits -countGaps mm6 rmsk
    #   1110222842 bases of 3079633452 (36.050%) in intersection
    featureBits -countGaps mm5 rmsk
    #   1137310280 bases of 3164952073 (35.935%) in intersection
    featureBits -countGaps mm4 rmsk
    #   1130883581 bases of 2952612207 (38.301%) in intersection
    featureBits -countGaps mm3 rmsk
    #   1080265553 bases of 2708220133 (39.888%) in intersection

#############################################################################
# GC5BASE (DONE - 2005-08-29 - Hiram)
    ssh kkstore02
    mkdir -p /cluster/data/mm7/bed/gc5Base
    cd /cluster/data/mm7/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 mm7 \
        /cluster/data/mm7 | wigEncode stdin gc5Base.wig gc5Base.wib
    #   Calculating gcPercent with window size 5
    #   Using twoBit: /cluster/data/mm7/mm7.2bit
    #   File stdout created
    #   Converted stdin, upper limit 100.00, lower limit 0.00
    #   runs for about 17 minutes

    #   load database
    ssh hgwdev
    cd /cluster/data/mm7/bed/gc5Base
    mkdir /gbdb/mm7/wib
    ln -s `pwd`/gc5Base.wib /gbdb/mm7/wib
    hgLoadWiggle -pathPrefix=/gbdb/mm7/wib mm7 gc5Base gc5Base.wig

    #   verify index is correct:
    hgsql mm7 -e "show index from gc5Base;"
    #
# should see good numbers in Cardinality column

#############################################################################
# PROCESS REPEAT MASKER AND SIMPLE REPEATS INTO MASKED SEQUENCE
#       (DONE - 2005-08-29 - Hiram)
    ssh kkstore02
    cd /cluster/data/mm7
    time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
        FA=${CHR#*\/}
        C=${FA%.fa}
        echo -n "repeat masking ${C} ... "
        /cluster/bin/i386/maskOutFa -soft ${CHR} ${CHR}.out ${CHR}
        echo -n "adding simpleRepeats ... "
        /cluster/bin/i386/maskOutFa -softAdd ${CHR} \
            bed/simpleRepeat/trfMask/${C}.bed ${CHR}
        echo "done - ${CHR}"
    done > tmp/addRM_and_Simple.out 2>&1
    #   about 4 minutes
    #   you will note the usual warnings about troublesome coordinates
    #   in the repeat masker outputs - even more than when they were lifted.

    #   and make the hard masked sequences from these soft masked sequences
    time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
        echo "maskOutFa ${CHR} hard ${CHR}.masked"
        /cluster/bin/i386/maskOutFa ${CHR} hard ${CHR}.masked
    done > /tmp/hardMask.out 2>&1

    #   rebuild the nib file
    time faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
        mm7Soft.2bit
    #   verify the sequence is still the same size as before:
    twoBitInfo mm7Soft.2bit stdout | sort -rn +1 | sum -r
    #   37784 1
    sum -r chrom.sizes
    #   37784 1
    #   replace the former unmasked 2bit file with this new one:
    rm mm7.2bit
    mv mm7Soft.2bit mm7.2bit

    #   check the browser, make sure it is functioning OK

    #   Generate fasta file for random contigs
    #   lft2BitToFa.pl reads each lift file given after the .2bit file,
    #   extracts the described range from the .2bit with twoBitToFa and
    #   renames the fasta record to the contig name from the lift line.
    cat << '_EOF_' > jkStuff/lft2BitToFa.pl
#!/usr/bin/env perl

use strict;
use warnings;
use File::Basename;

# print the usage message and exit with status 255
sub usage()
{
printf "usage: %s <file.2bit> <file.lft> [more_files.lft]\n", basename($0);
printf "\tfasta output is to stdout, therefore route stdout to result file\n";
exit 255;
}

my $argc = scalar(@ARGV);

# need at least the 2bit file and one lift file
usage if ($argc < 2);

my $twoBitFile = shift;

while (my $liftFile = shift)
{
open (FH,"<$liftFile") or die "Can not open $liftFile";
while (my $line=<FH>)
    {
    chomp $line;
    # the five fields of a lift line, in the order they are read here
    my ($start, $contig, $length, $chrom, $chrom_length) = split('\s',$line);
    my $cmd=sprintf("twoBitToFa $twoBitFile:%s:%d-%d stdout",
        $chrom, $start, $start+$length);
    # replace the fasta header twoBitToFa writes with the contig name
    print `$cmd | sed -e "s#^>.*#>$contig#"`;
    }
close (FH);
}
'_EOF_'
    # happy emacs
    chmod +x jkStuff/lft2BitToFa.pl
    mkdir randomContigs
    for L in ?/lift/ctg_random.lft ??/lift/ctg_random.lft
    do
        D=${L/\/lift*}
        echo $L $D
        ./jkStuff/lft2BitToFa.pl mm7.2bit ${L} \
            > randomContigs/chr${D}_random.ctg.fa
    done
    #
    #   Verify these *.ctg.fa files have the same bases as the ordinary
    #   chr*_random.fa files:
    faSize ?/chr?_random.fa ??/chr??_random.fa
    #   225162013 bases (155337148 N's 69824865 real 32381553 upper 37443312 lower) in 18 sequences in 18 files
    faSize randomContigs/*.ctg.fa
    #   69825013 bases (148 N's 69824865 real 32381553 upper 37443312 lower) in 4827 sequences in 18 files
    #   Note the number of real, upper and lower bases are the same

    #   Create a 2bit file with the full chrom sequences and these
    #   random contigs for use in blastz:
    faToTwoBit ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \
        mm7Chroms_RandomContigs.2bit
    #   Copy to bluearc unit for kluster runs
    cp -p mm7.2bit /cluster/bluearc/mm7
    cp -p mm7Chroms_RandomContigs.2bit /cluster/bluearc/mm7
    #   And the lift file to go with it
    cat ?/lift/ctg_random.lft ??/lift/ctg_random.lft \
        > jkStuff/Chroms_RandomContigs.lft
    cp -p jkStuff/Chroms_RandomContigs.lft /cluster/bluearc/mm7

    #   create full chrom nibs for blastz SEQ1 target
    mkdir nib
    for FA in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
        B=${FA/*\/}
        B=${B/.fa/}
        echo faToNib -softMask ${FA} nib/${B}.nib
        rm -f nib/${B}.nib
        faToNib -softMask ${FA} nib/${B}.nib
    done
    mkdir /cluster/bluearc/mm7/nib
    cp -p nib/*.nib /cluster/bluearc/mm7/nib

#############################################################################
# BLASTZ SELF experiment 3 (DONE - 2005-11-17 - 2005-11-22 - Hiram)
    #   re-scoring to chain min score of 10,000 to cut down on volume of data
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzSelf.2005-11-17
    cd /cluster/data/mm7/bed/blastzSelf.2005-11-17
    cat << '_EOF_' > DEF
# mouse vs mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=200

# TARGET: Mouse Mm7
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CTGDIR=/scratch/hg/mm7/mm7Chroms_RandomContigs.2bit
SEQ1_CTGLEN=/scratch/hg/mm7/mm7Chroms_RandomContigs.sizes
SEQ1_LIFT=/scratch/hg/mm7/Chroms_RandomContigs.lft
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Mouse Mm7
SEQ2_DIR=/scratch/hg/mm7/nib
SEQ2_LEN=/scratch/hg/mm7/chrom.sizes
SEQ2_CTGDIR=/scratch/hg/mm7/mm7Chroms_RandomContigs.2bit
SEQ2_CTGLEN=/scratch/hg/mm7/mm7Chroms_RandomContigs.sizes
SEQ2_LIFT=/scratch/hg/mm7/Chroms_RandomContigs.lft
SEQ2_SELF=1
SEQ2_LIMIT=30
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzSelf.2005-11-17
TMPDIR=/scratch/tmp
'_EOF_'
    # happy emacs

    cd /cluster/data/mm7/bed/blastzSelf.2005-11-17
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
        -stop=net \
        `pwd`/DEF > blastz.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
        -continue=cat -stop=net \
        `pwd`/DEF > cat-to-net.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
        -continue=chainMerge -stop=net \
        `pwd`/DEF > chainMerge-to-net.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
        -continue=load -stop=load \
        `pwd`/DEF > load.out 2>&1 &

    ssh kolossus
    cd /cluster/data/mm7/bed/blastzSelf.2005-11-17
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
        chainSelfLink >fb.mm7.chainSelfLink 2>&1
    cat fb.mm7.chainSelfLink
    #   409297278 bases of 2583394090 (15.843%) in
#   intersection

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
        -continue=download \
        `pwd`/DEF > download.out 2>&1 &

#############################################################################
# BLASTZ SELF experiment 1 (DONE - 2005-09-14 - 2005-09-17 - Hiram)
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzSelf
    cd /cluster/data/mm7/bed/blastzSelf
    cat << '_EOF_' > DEF
# mouse vs mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=50
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse Mm7
SEQ1_DIR=/cluster/bluearc/mm7/nib
SEQ1_CTGDIR=/cluster/bluearc/mm7/mm7.2bit
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=50

# QUERY: Mouse Mm7
SEQ2_DIR=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit
SEQ2_CTGDIR=/cluster/bluearc/mm7/mm7.2bit
SEQ2_LIFT=/cluster/bluearc/mm7/Chroms_RandomContigs.lft
SEQ2_SELF=1
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=3000000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzSelf

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
SEQ2_CTGLEN=$BASE/S2ctg.len
TMPDIR=/scratch/tmp
'_EOF_'
    # happy emacs

    cd /cluster/data/mm7/bed/blastzSelf
    nibSize /cluster/bluearc/mm7/nib/*.nib \
        | awk '{printf "%s\t%s\n", $2, $3}' | sort -k2,2nr > S1.len
    twoBitInfo /cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit stdout \
        | sort -k2,2nr > S2ctg.len
    twoBitInfo /cluster/bluearc/mm7/mm7.2bit stdout | sort -k2,2nr > S2.len

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=kk \
        `pwd`/DEF > blastz.out 2>&1 &

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk \
        -stop=blastz \
        `pwd`/DEF > blastz.out 2>&1 &
    #   real 1573m31.130s
    #   user 0m0.081s
    #   sys 0m0.071s

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk \
        -continue=cat -stop=cat \
        `pwd`/DEF > cat.out 2>&1 &

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk \
        -continue=chainRun -stop=chainRun \
        `pwd`/DEF > chainRun.out 2>&1 &

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk \
        -continue=chainMerge -stop=load \
        `pwd`/DEF > chainMergeToLoad.out 2>&1 &
    #   real 233m20.096s
    #   user 0m0.101s
    #   sys 0m0.088s

    #   The chainLink measurements need kolossus:
    ssh kolossus
    cd /cluster/data/mm7/bed/blastzSelf
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainSelfLink \
        > featBits.chainSelfLink.mm7 2>&1
    #   real 28m20.150s
    #   user 9m4.878s
    #   sys 2m53.059s
    #   434012375 bases of 2583394090 (16.800%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainSelfLink \
        > featBits.chainSelfLink.mm6 2>&1
    #   real 109m50.760s
    #   user 32m4.830s
    #   sys 11m42.215s
    #   417927047 bases of 2597150411 (16.092%) in intersection

    time featureBits mm7 netSelf
    #   2336281173 bases of 2597150411 (89.956%) in intersection
    time featureBits mm7 chainSelf
    #   2579948751 bases of 2597150411 (99.338%) in intersection
    time featureBits mm3 mouseChain
    #   889252994 bases of 2505900260 (35.486%) in intersection

    #   the chainLink measurements need kolossus:
    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainSelfLink
    #   417927047 bases of 2597150411 (16.092%) in intersection
    #   244 minutes
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm3 mouseChainLink
    #   383345536 bases of 2505900260 (15.298%) in intersection

    #   Gill likes to see the blastzSelf track:
    ssh eieio
    cd /cluster/data/mm7/bed/blastzSelf
    #   cat the pslParts together, per-chrom, and in chromStart order:
    #   the loop below redirects into pslChrom/, so make sure it exists
    mkdir -p pslChrom
    ls pslParts | sed -e "s/.nib.*//" | sort -u | while read C
    do
        echo -n "working: ${C} ... "
        zcat `ls pslParts/${C}.nib* | sort --field-separator=':' -k1,1 -k3,3n` \
            | gzip > pslChrom/${C}_blastzSelf.psl.gz
        echo "done"
    done

    #   Load blastzSelf
    ssh hgwdev
    cd /cluster/data/mm7/bed/blastzSelf/pslChrom
    for I in *.psl.gz
    do
        $HOME/bin/i386/hgLoadPsl -noTNameIx mm7 ${I}
        echo "done: ${I}"
    done
    #   STARTED - 2005-04-06 15:24
    #   4h 24m load time - chrUn_random failed to load

    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 blastzSelf
    #   8h 34m job
    #   471978757 bases of 2597150411 (18.173%) in intersection

#############################################################################
# PREPARE "bigZips" files for public release
#       (DONE 2005-09-20 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/mm7/bigZips
    cd /cluster/data/mm7/bigZips
    scp hgwdev:/usr/local/apache/htdocs/goldenPath/mm6/bigZips/README.txt .
    #   edit README.txt to indicate proper version of sequence and
    #   RepeatMasker

    cd /cluster/data/mm7
    tar cvzf bigZips/chromAgp.tar.gz ?/chr*.agp ??/chr*.agp
    tar cvzf bigZips/chromFa.tar.gz ?/chr*.fa ??/chr*.fa
    tar cvzf bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked ??/chr*.fa.masked
    tar cvzf bigZips/chromOut.tar.gz ?/chr*.fa.out ??/chr*.fa.out
    cd /cluster/data/mm7/bed/simpleRepeat
    tar cvzf ../../bigZips/chromTrf.tar.gz ./trfMask

    #   get GenBank native mRNAs
    ssh hgwdev
    cd /cluster/data/genbank
    ./bin/i386/gbGetSeqs -db=mm7 -native GenBank mrna \
        /cluster/data/mm7/bigZips/mrna.fa
    cd /cluster/data/mm7/bigZips
    gzip mrna.fa

    ssh kkstore02
    cd /cluster/data/mm7/bigZips
    md5sum *.gz > md5sum.txt

    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/mm7/bigZips
    cd /usr/local/apache/htdocs/goldenPath/mm7/bigZips
    ln -s /cluster/data/mm7/bigZips/* .

#########################################################################
# GENBANK auto update (DONE 2005-09-29 markd)
    #   align with revised genbank process. drop xeno ESTs.
cd ~kent/src/makeDb/genbank
    cvs update -d etc
    #   edit etc/genbank.conf to add mm7
    #   (the lines from "# mm7" through mm7.mgcTables.mgc are the text to
    #   add to that file, they are not commands)
# mm7
mm7.serverGenome = /cluster/data/mm7/mm7.2bit
mm7.clusterGenome = /cluster/bluearc/mm7/mm7.2bit
mm7.ooc = /cluster/bluearc/mm7/11.ooc
mm7.align.unplacedChroms = chrUn_random
mm7.lift = /cluster/data/mm7/jkStuff/liftAll.lft
mm7.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
mm7.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
mm7.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
mm7.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
mm7.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
mm7.downloadDir = mm7
mm7.refseq.mrna.xeno.load = yes
mm7.refseq.mrna.xeno.loadDesc = yes
mm7.mgcTables.default = full
mm7.mgcTables.mgc = all

    #   update /cluster/data/genbank/
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial mm7 &

    #   load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad mm7&

    #   enable daily alignment and update of hgwdev
    cd ~kent/src/makeDb/genbank
    cvs update -d etc
    #   add mm7 to:
    #       etc/align.dbs
    #       etc/hgwdev.dbs
    cvs commit
    make etc-update

#########################################################################
# PRODUCING GENSCAN PREDICTIONS (DONE - 2005-08-31 - 2005-09-06 - Hiram)
    ssh kkstore02
    #   create hard masked sequence
    mkdir /cluster/data/mm7/hardMasked
    cd /cluster/data/mm7/hardMasked
    twoBitToFa ../mm7Chroms_RandomContigs.2bit stdout \
        | maskOutFa stdin hard stdout \
        > Chroms_RandomContigs.hardMasked.fa
    faCount Chroms_RandomContigs.hardMasked.fa > faCount.hardMasked
    #   NOTE(review): the log had "tail -1 faCount.chunks" here, but no
    #   file of that name is created in this procedure; faCount.hardMasked
    #   (written just above) is presumably what was meant -- confirm.
    tail -1 faCount.hardMasked
    #   total 2692380329 437167178 307449158 307469169 437328614 1202966210 13463576
    #   These counts are going to be different than the soft masked
    #   sequence since a lot of stuff has been turned into N's
    #   In fact, a lot of these are all N's, filter the name list to allow
    #
only sequences with at least 18 nucleotides to prevent genscan # from going bonkers. egrep -v "^#|^total" faCount.hardMasked \ | awk '{if ($2-$7 > 17) {print $1}}' > over18.lst faSomeRecords Chroms_RandomContigs.hardMasked.fa over18.lst over18.fa # creating 10,000,000 sized chunks mkdir hardChunks time faSplit about over18.fa 10000000 hardChunks/c_ mkdir /cluster/bluearc/mm7/hardChunks cp -p hardChunks/c_*.fa /cluster/bluearc/mm7/hardChunks ssh hgwdev mkdir /cluster/data/mm7/bed/genscan cd /cluster/data/mm7/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/mm7/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) ls -1S /cluster/bluearc/mm7/hardChunks/c_*.fa > genome.list # XXX There is an error in the following template, note the extra # space between the - and par= # It turns out the default for the -par argument is this same # matrix so the extra space had no effect on the end result. # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # happy emacs gensub2 genome.list single gsub jobList para create jobList para try, check, push, check, ... 
# Had three jobs crash: # Completed: 25 of 28 jobs # Crashed: 3 jobs # CPU time in finished jobs: 269839s 4497.32m 74.96h 3.12d 0.009 y # IO & Wait Time: 14374s 239.57m 3.99h 0.17d 0.000 y # Average job time: 11369s 189.48m 3.16h 0.13d # Longest finished job: 29640s 494.00m 8.23h 0.34d # Submission to last job: 29687s 494.78m 8.25h 0.34d # If there are crashes, diagnose with "para problems". # Three of them needed to be rerun, adjust window down to 2000000 to # get them to complete. Lower that number if the error persists. # These jobs can take over 4 hours on kolossus. ssh kolossus cd /cluster/data/mm7/bed/genscan # XXX There is an error in the following commands, note the extra # space between the - and par= # It turns out the default for the -par argument is this same # matrix so the extra space had no effect on the end result. /cluster/bin/x86_64/gsBig /cluster/bluearc/mm7/hardChunks/c_03.fa gtf/c_03.gtf -trans=pep/c_03.pep -subopt=subopt/c_03.bed -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000 /cluster/bin/x86_64/gsBig /cluster/bluearc/mm7/hardChunks/c_12.fa gtf/c_12.gtf -trans=pep/c_12.pep -subopt=subopt/c_12.bed -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000 /cluster/bin/x86_64/gsBig /cluster/bluearc/mm7/hardChunks/c_01.fa gtf/c_01.gtf -trans=pep/c_01.pep -subopt=subopt/c_01.bed -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000 # The last one on c_01.fa timing is: # real 259m46.197s # user 256m27.702s # sys 2m35.809s # cat and lift the results into single files ssh kkstore02 cd /cluster/data/mm7/bed/genscan cat gtf/c_*.gtf | liftUp -type=.gtf genscan.gtf \ /cluster/data/mm7/jkStuff/Chroms_RandomContigs.lft carry stdin cat subopt/c_*.bed | liftUp -type=.bed genscanSubopt.bed \ /cluster/data/mm7/jkStuff/Chroms_RandomContigs.lft carry stdin cat pep/c_*.pep > genscan.pep # Load into the 
database as so: ssh hgwdev cd /cluster/data/mm7/bed/genscan ldHgGene mm7 -gtf genscan genscan.gtf # Read 46116 transcripts in 325432 lines in 1 files # 46116 groups 40 seqs 1 sources 1 feature types # 46116 gene predictions hgPepPred mm7 generic genscanPep genscan.pep hgLoadBed -strict mm7 genscanSubopt genscanSubopt.bed # Loaded 533197 elements of size 6 # check the numbers featureBits mm7 genscan # 54864694 bases of 2583394090 (2.124%) in intersection featureBits mm6 genscan # 54894283 bases of 2597150411 (2.114%) in intersection featureBits mm5 genscan # 55024722 bases of 2615483787 (2.104%) in intersection featureBits mm4 genscan # 56164126 bases of 2627444668 (2.138%) in intersection featureBits mm3 genscan # 51697165 bases of 2505900260 (2.063%) in intersection featureBits mm7 genscanSubopt # 57512333 bases of 2583394090 (2.226%) in intersection featureBits mm6 genscanSubopt # 57856316 bases of 2597150411 (2.228%) in intersection featureBits mm5 genscanSubopt # 58474899 bases of 2615483787 (2.236%) in intersection featureBits mm4 genscanSubopt # 59601009 bases of 2627444668 (2.268%) in intersection featureBits mm3 genscanSubopt # 56085184 bases of 2505900260 (2.238%) in intersection ######################################################################### # BLASTZ HUMAN Hg17 second time (DONE - 2005-11-14 - 2005-11-21 - Hiram) # After fixing a bug in the lineage specific repeat snip business # in blastz-run-ucsc script ssh pk mkdir /cluster/data/mm7/bed/blastzHg17.2005-11-14 cd /cluster/data/mm7/bed rm blastz.hg17 ln -s blastzHg17.2005-11-14 blastz.hg17 cd blastzHg17.2005-11-14 cat << '_EOF_' > DEF # mouse vs human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm7 SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_SMSK=/scratch/hg/mm7/linSpecRep/notInHumanDogCow SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/scratch/hg/mm7/chrom.sizes #
QUERY: Human Hg17 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs SEQ2_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInMouse SEQ2_LEN=/cluster/bluearc/hg17/chrom.sizes SEQ2_CHUNK=3000000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzHg17.2005-11-14 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -stop=net \ `pwd`/DEF > blastz-to-net.out 2>&1 & # Started 2005-11-14 11:15 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -continue=load -stop=load \ `pwd`/DEF > load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -swap -stop=net `pwd`/DEF > swap-to-net.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -swap -continue=load -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzHg17.2005-11-14 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzHg17.2005-11-14 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=load -stop=load \ `pwd`/DEF > rescoreChainSwapLoad.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > rescoreDownload.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ 
-swap -continue=download \ `pwd`/DEF > rescoreDownloadSwap.out 2>&1 & # Measurements: ssh kolossus cd /cluster/data/mm7/bed/blastzHg17.2005-11-14 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainHg17Link > fb.mm7.chainHg17Link.rescore 2>&1 cat fb.mm7.chainHg17Link.rescore # 996434728 bases of 2583394090 (38.571%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 \ chainMm7Link > fb.hg17.chainMm7Link.rescore 2>&1 cat fb.hg17.chainMm7Link.rescore # 994737081 bases of 2866216770 (34.706%) in intersection ######################################################################### # BLASTZ HUMAN Hg17 experiment (DONE - 2005-09-16 - 2005-10-04 - Hiram) ssh kkstore02 # Create the composite chrom/contigs from randoms 2bit file # for Hg17 ***!!! THIS IS NOT NEEDED !!!*** # These can't be used with lineage specific repeats # You can't use 2bit files with SMSK settings. mkdir /cluster/data/hg17/randomContigs cd /cluster/data/hg17 rm jkStuff/hg17Chroms_RandomContigs.lft for C in `cat chrom.lst` do L="${C}/lift/random.lft" if [ -f "${L}" ]; then ls -og "${L}" cat ${L} >> jkStuff/hg17Chroms_RandomContigs.lft /cluster/data/mm7/jkStuff/lft2BitToFa.pl hg17.2bit ${L} \ > randomContigs/chr${C}_random.ctg.fa fi done # Check that the sequence remains the same faSize randomContigs/*.ctg.fa # 14578373 bases (110698 N's 14467675 real 7182813 upper 7284862 # lower) in 86 sequences in 19 files faSize */chr*_random.fa # 17928373 bases (3460698 N's 14467675 real 7182813 upper 7284862 # lower) in 19 sequences in 19 files # Note, only the N's are different faToTwoBit ?/chr?.fa ??/chr??.fa 6_hla_hap?/chr*.fa \ randomContigs/chr*.ctg.fa hg17Chroms_RandomContigs.2bit # Verify sequence isn't broken: twoBitToFa hg17Chroms_RandomContigs.2bit stdout | faSize stdin # 3091666460 bases (225561672 N's 2866104788 real 1474041767 # upper 1392063021 lower) in 113 sequences in 1 files twoBitToFa hg17.2bit stdout | faSize stdin # 3095016460 bases (228911672 N's 2866104788 real
1474041767 # upper 1392063021 lower) in 46 sequences in 1 files # Only difference is the N count. 228911672 - 225561672 = 3350000 cp -p hg17Chroms_RandomContigs.2bit /cluster/bluearc/hg17 cp -p jkStuff/hg17Chroms_RandomContigs.lft /cluster/bluearc/hg17 mkdir /cluster/data/mm7/bed/blastzHg17.2005-09-16 cd /cluster/data/mm7/bed ln -s blastzHg17.2005-09-16 blastz.hg17 cd /cluster/data/mm7/bed/blastzHg17.2005-09-16 cat << '_EOF_' > DEF # mouse vs human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm7 SEQ1_DIR=/cluster/bluearc/mm7/nib SEQ1_SMSK=/cluster/bluearc/mm7/linSpecRep/notInHumanDogCow SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/bluearc/mm7/chrom.sizes # QUERY: Human Hg17 SEQ2_DIR=/cluster/bluearc/hg17/bothMaskedNibs SEQ2_SMSK=/cluster/bluearc/hg17/linSpecRep.notInMouse SEQ2_LEN=/cluster/bluearc/hg17/chrom.sizes SEQ2_CHUNK=3000000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzHg17.2005-09-16 TMPDIR=/scratch/tmp '_EOF_' # << keep emacs coloring happy cd /cluster/data/mm7/bed/blastzHg17.2005-09-16 # establish a screen to control this job screen time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -stop=blastz \ `pwd`/DEF > blastz.out 2>&1 & time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -continue=cat -stop=load \ `pwd`/DEF > catToLoad.out 2>&1 & # real 335m51.785s # user 0m0.080s # sys 0m0.051s # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kkstore02 screen -d -r time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -continue=download \ `pwd`/DEF > download-cleanup.out 2>&1 & # Special swap, the load scripts have been disabled so these can # be loaded manually to give them special names for comparison # experiments time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -swap 
-stop=load \ `pwd`/DEF > swapLoadReady.out 2>&1 & # Manually loaded tables chainHg17LSR, chainHg17LSRLink and # netHg17LSR # real 268m48.953s # user 21m46.640s # sys 5m0.510s featureBits mm7 chainHg17noLSRLink # 952987983 bases of 2583394090 (36.889%) in intersection time featureBits mm7 chainHg17Link # 961671385 bases of 2583394090 (37.225%) in intersection # And, their intersection: ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainHg17Link chainHg17noLSRLink # 950261805 bases of 2583394090 (36.783%) in intersection # real 33m16.210s # user 8m39.179s # sys 2m33.198s # Loading lineage specific repeats for measurement purposes: ssh kkstore02 cd /cluster/bluearc/mm7/linSpecRep/notInHumanDogCow time grep -h chr chr*.out.spec | awk '{print $5,$6,$7,$10,$1}' \ | sort -k1,1 -k2,2n \ > /cluster/data/mm7/bed/blastzHg17.2005-09-16/mm7LSRhg.bed cd /cluster/bluearc/hg17/linSpecRep.notInMouse time grep -h chr chr*.out.spec | awk '{print $5,$6,$7,$10,$1}' \ | sort -k1,1 -k2,2n \ > /cluster/data/mm7/bed/blastzHg17.2005-09-16/hg17LSRmm.bed ssh hgwdev cd /cluster/data/mm7/bed/blastzHg17.2005-09-16 hgLoadBed mm7 linSpecRepNotHg -strict mm7LSRhg.bed # Loaded 2816082 elements of size 5 hgLoadBed hg17 linSpecRepNotMm -strict hg17LSRmm.bed # had one broken line in the file, removed it # Expecting 5 words line 298322 of hg17LSRmm.bed got 4 # Loaded 1639435 elements of size 5 ssh kolossus cd /cluster/data/mm7/bed/blastzHg17.2005-09-16 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainHg17Link linSpecRepNotHg \ > fbMm7.chainHg17Link.linSpecRepNotHg 2>&1 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainHg17noLSRLink linSpecRepNotHg \ > fbMm7.chainHg17noLSRLink.linSpecRepNotHg 2>&1 time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 \ chainMm7LSRLink linSpecRepNotMm \ > fbHg17.chainMm7LSRLink.linSpecRepNotMm time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 \ chainMm7Link linSpecRepNotMm \ > fbHg17.chainMm7Link.linSpecRepNotMm 
##########################################################################
# A measurement script to do all featureBits combinations:
    cd /cluster/data/mm7/jkStuff
    cat << '_EOF_' > netChainCheck.sh
#!/bin/sh
#
# netChainCheck.sh - run featureBits on the net, chain and chainLink
# tables that relate two databases to a third, in both directions.
#
# usage: netChainCheck.sh <db0> <db1> <targetDb>
#
# Table names are formed as net/chain + the other database's name with
# its first letter upper-cased, e.g. targetDb fr1 -> netFr1, chainFr1,
# chainFr1Link.

# usage - print the command line synopsis to stdout
usage() {
    echo "usage: netChainCheck.sh <db0> <db1> <targetDb>"
    echo "    does: featureBits <db0> net<TargetDb>"
    echo "          featureBits <db1> net<TargetDb>"
    echo "    as well as the chain and chainLink tables,"
    echo "    and on the targetDb:"
    echo "          featureBits <targetDb> net<Db0>"
    echo "          featureBits <targetDb> net<Db1>"
    echo "    and the chain and chainLink tables."
    printf '\texample: netChainCheck.sh mm7 mm5 fr1\n'
}

# doOne <db> <table> - echo the featureBits command being run, then run
# it under 'time'.  The " #<tab>" prefix is printed without a newline so
# whatever featureBits prints next starts on that same line.
doOne() {
    db=$1
    tbl=$2
    echo " featureBits $db $tbl"
    # printf rather than 'echo -en': echo option handling is not
    # portable across /bin/sh implementations
    printf ' #\t'
    time featureBits "$db" "$tbl"
}

# ucFirstLetter <string> - print <string> with its first character
# upper-cased (mm7 -> Mm7); the remainder is passed through unchanged
ucFirstLetter() {
    ucString="$1"
    fc=$(echo "${ucString}" | sed -e "s/\(.\).*/\1/")
    rest=$(echo "${ucString}" | sed -e "s/.\(.*\)/\1/")
    FC=$(echo "${fc}" | tr '[a-z]' '[A-Z]')
    echo "${FC}${rest}"
}

# exactly three database names are required
if [ "$#" -ne 3 ]; then
    usage
    exit 255
fi

db0=$1
db1=$2
targetDb=$3
# capitalized forms are used as table name suffixes
targetDB=$(ucFirstLetter "${targetDb}")
DB0=$(ucFirstLetter "${db0}")
DB1=$(ucFirstLetter "${db1}")

export db0 db1 targetDb targetDB DB0 DB1

# echo "${db0} ${db1} ${targetDb} ${targetDB} ${DB0} ${DB1}"

# tables on db0 and db1 that point at targetDb
doOne "${db0}" "net${targetDB}"
doOne "${db1}" "net${targetDB}"
doOne "${db0}" "chain${targetDB}"
doOne "${db1}" "chain${targetDB}"
doOne "${db0}" "chain${targetDB}Link"
doOne "${db1}" "chain${targetDB}Link"
# tables on targetDb that point back at db0 and db1
doOne "${targetDb}" "net${DB0}"
doOne "${targetDb}" "net${DB1}"
doOne "${targetDb}" "chain${DB0}"
doOne "${targetDb}" "chain${DB1}"
doOne "${targetDb}" "chain${DB0}Link"
doOne "${targetDb}" "chain${DB1}Link"
'_EOF_'
    # << keep emacs coloring happy

#########################################################################
# BLASTZ RAT Rn3 second time (WORKING - 2005-11-14 - Hiram)
#	After fixing a bug in the lineage specific repeat snip business
#	in blastz-run-ucsc script
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzRn3.2005-11-14
    cd /cluster/data/mm7/bed
    rm blastz.rn3
    ln -s blastzRn3.2005-11-14 blastz.rn3
    cd blastzRn3.2005-11-14
    cat << '_EOF_' > DEF
# mouse vs rat
export
PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm7 SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_SMSK=/scratch/hg/mm7/linSpecRep/notInRat SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole pieces SEQ2_DIR=/scratch/rat/rn3/softNib SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse SEQ2_LEN=/cluster/bluearc/rat/rn3/chrom.sizes SEQ2_CHUNK=300000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzRn3.2005-11-14 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -stop=load \ `pwd`/DEF > to-load.out 2>&1 & # Started 2005-11-16 09:37 ssh kolossus cd /cluster/data/mm7/bed/blastzRn3.2005-11-14 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainRn3Link > \ fb.mm7.chainRn3Link # 1779807900 bases of 2583394090 (68.894%) in intersection # real 65m22.546s time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -swap -stop=load \ `pwd`/DEF > swap-to-load.out 2>&1 & # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzRn3.2005-09-16 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzRn3.2005-09-16 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > rescoreChainDownload.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl 
-verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > rescoreChainDownloadSwap.out 2>&1 & # Measurements: ssh kolossus cd /cluster/data/mm7/bed/blastzRn3.2005-11-14 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainRn3Link > fb.mm7.chainRn3Link.rescore 2>&1 cat fb.mm7.chainRn3Link.rescore # 1775924100 bases of 2583394090 (68.744%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits rn3 \ chainMm7Link > fb.rn3.chainMm7Link.rescore 2>&1 cat fb.rn3.chainMm7Link.rescore # 1781501323 bases of 2571104688 (69.289%) in intersection ######################################################################### # BLASTZ RAT Rn3 (DONE - 2005-09-16 - 2005-10-18 - Hiram) ssh kkstore02 mkdir /cluster/data/mm7/bed/blastzRn3.2005-09-16 cd /cluster/data/mm7/bed ln -s blastzRn3.2005-09-16 blastz.rn3 cd blastzRn3.2005-09-16 cat << '_EOF_' > DEF # mouse vs rat export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm7 SEQ1_DIR=/cluster/bluearc/mm7/nib SEQ1_SMSK=/cluster/bluearc/mm7/linSpecRep/notInRat SEQ1_LEN=/cluster/bluearc/mm7/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole pieces SEQ2_DIR=/cluster/bluearc/rat/rn3/softNib SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse SEQ2_LEN=/cluster/bluearc/rat/rn3/chrom.sizes SEQ2_CHUNK=300000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzRn3.2005-09-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ `pwd`/DEF > full.run.out 2>&1 & # various debugging situations as the script is reorganized time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=load -stop=load -bigClusterHub=pk \ `pwd`/DEF > load.out 2>&1 & # detach from screen 
session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kkstore02 screen -d -r # Completed: 40713 of 40713 jobs # CPU time in finished jobs: 18170174s 302836.24m 5047.27h 210.30d 0.576 y # IO & Wait Time: 1770530s 29508.83m 491.81h 20.49d 0.056 y # Average job time: 490s 8.16m 0.14h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 28252s 470.87m 7.85h 0.33d # Submission to last job: 69864s 1164.40m 19.41h 0.81d # Completed: 331 of 331 jobs # CPU time in finished jobs: 1168s 19.46m 0.32h 0.01d 0.000 y # IO & Wait Time: 3047s 50.79m 0.85h 0.04d 0.000 y # Average job time: 13s 0.21m 0.00h 0.00d # Longest job: 119s 1.98m 0.03h 0.00d # Submission to last job: 359s 5.98m 0.10h 0.00d # Completed: 40 of 40 jobs # CPU time in finished jobs: 12274s 204.56m 3.41h 0.14d 0.000 y # IO & Wait Time: 1719s 28.66m 0.48h 0.02d 0.000 y # Average job time: 350s 5.83m 0.10h 0.00d # Longest job: 1016s 16.93m 0.28h 0.01d # Submission to last job: 1482s 24.70m 0.41h 0.02d time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=download -bigClusterHub=pk -fileServer=kolossus \ `pwd`/DEF > download-clean.out 2>&1 & # Measurements: time featureBits mm7 netRn3 # 2644171540 bases of 2583394090 (102.353%) in intersection time featureBits mm6 netRn3 # expect ~ 2m 12s # 2720144602 bases of 2597150411 (104.736%) in intersection time featureBits mm5 netRn3 # 2638255333 bases of 2615483787 (100.871%) in intersection time featureBits mm7 chainRn3 # 2690727913 bases of 2583394090 (104.155%) in intersection time featureBits mm6 chainRn3 # expect ~ 10m 30s to 13m 25s # 2768422449 bases of 2597150411 (106.595%) in intersection time featureBits mm5 chainRn3 # 2646682349 bases of 2615483787 (101.193%) in intersection ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainRn3Link # 1798351560 bases of 2583394090 (69.612%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainRn3Link # 1802980225 bases of 2597150411 (69.421%) in 
intersection # real 94m48.021s time HGDB_CONF=~/.hg.conf.read-only featureBits mm5 chainRn3Link # 1798705001 bases of 2615483787 (68.771%) in intersection # real 76m44.580s ssh kkstore02 screen -r -d cd /cluster/data/mm7/bed/blastzRn3.2005-09-16 time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk \ `pwd`/DEF > swap.out 2>&1 & # STARTED - 2005-10-04 16:29 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -continue=download -bigClusterHub=pk -fileServer=kolossus \ `pwd`/DEF > swap-download-clean.out 2>&1 & ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits rn3 chainMm7Link # 1802674646 bases of 2571104688 (70.113%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits rn3 chainMm6Link # 1812992492 bases of 2571104688 (70.514%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits rn3 chainMm5Link # 1673171206 bases of 2571104688 (65.076%) in intersection ######################################################################### # BLASTZ FUGU fr1 (DONE - 2005-09-16 - 2005-11-21 - Hiram) ssh pk mkdir /cluster/data/mm7/bed/blastzFr1.2005-09-16 cd /cluster/data/mm7/bed ln -s blastzFr1.2005-09-16 blastz.fr1 cd blastzFr1.2005-09-16 cat << '_EOF_' > DEF # mouse vs. 
fugu export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Reuse parameters from human-chicken, except L=6000 (more relaxed) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse Mm7 - testing 100,000,000 sized chunk on pk kluster SEQ1_DIR=/cluster/bluearc/mm7/nib SEQ1_LEN=/cluster/bluearc/mm7/chrom.sizes SEQ1_CHUNK=100000000 SEQ1_LAP=10000 # QUERY: Fugu Fr1 - chunk big enough to run the whole chrom at once SEQ2_DIR=/san/sanvol1/scratch/fr1/nib SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ2_CHUNK=400000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzFr1.2005-09-16 '_EOF_' # << happy emacs # establish a screen to control this job ssh pk cd /cluster/data/mm7/bed/blastzFr1.2005-09-16 screen time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -stop=load \ `pwd`/DEF > thruLoad.out 2>&1 & # With 100,000,000 and 400,000,000 chunk sizes, makes only 56 jobs # for the blastz run # STARTED - 2005-09-16 16:21 # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh pk screen -d -r # real 708m20.607s # user 0m0.106s # sys 0m0.107s time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -continue=net -stop=load \ `pwd`/DEF > netThruLoad.out 2>&1 & # real 11m37.061s # user 0m0.054s # sys 0m0.052s time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -continue=download \ `pwd`/DEF > download.clean.out 2>&1 & time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -swap \ `pwd`/DEF > swap.out 2>&1 & # measurements featureBits mm7 chainFr1Link # 49240999 bases of 2583394090 (1.906%) in intersection featureBits mm6 chainFr1Link # 55355465 bases of 2597150411 (2.131%) in intersection featureBits mm7 netFr1 # 611679163 bases of 2583394090 (23.677%) 
in intersection featureBits mm6 netFr1 # 618129802 bases of 2597150411 (23.800%) in intersection featureBits mm7 chainFr1 # 659036967 bases of 2583394090 (25.511%) in intersection featureBits mm6 chainFr1 # 666835089 bases of 2597150411 (25.676%) in intersection featureBits fr1 chainMm7Link # 42546014 bases of 315518167 (13.484%) in intersection featureBits fr1 chainMm6Link # 46266090 bases of 315518167 (14.664%) in intersection featureBits fr1 netMm7 # 143256422 bases of 315518167 (45.404%) in intersection featureBits fr1 netMm6 # 146828640 bases of 315518167 (46.536%) in intersection featureBits fr1 chainMm7 # 155565064 bases of 315518167 (49.305%) in intersection featureBits fr1 chainMm6 # 160874127 bases of 315518167 (50.987%) in intersection # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzFr1.2005-09-16 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzFr1.2005-09-16 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & # real 403m27.362s time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > rescoreChainDownload.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > rescoreChainDownloadSwap.out 2>&1 & # Measurements: ssh kolossus cd /cluster/data/mm7/bed/blastzFr1.2005-09-16 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainFr1Link > fb.mm7.chainFr1Link.rescore 2>&1 cat fb.mm7.chainFr1Link.rescore # 49240999 bases of 2583394090 (1.906%) in intersection time HGDB_CONF=~/.hg.conf.read-only 
featureBits fr1 \ chainMm7Link > fb.fr1.chainMm7Link.rescore 2>&1 cat fb.fr1.chainMm7Link.rescore # 42546014 bases of 315518167 (13.484%) in intersection ######################################################################### # BLASTZ TETRAODON tetNig1 (DONE - 2005-09-20 - 2005-10-11 - Hiram) ssh pk mkdir /cluster/data/mm7/bed/blastzTetNig1.2005-09-20 cd /cluster/data/mm7/bed ln -s blastzTetNig1.2005-09-20 blastz.tetNig1 cd blastzTetNig1.2005-09-20 # use same parameters as for danRer1-mm5 # NOTE: The BLASTZ_Q score matrix should have been the Tuned.gap # one which is recreated below during the re-score cat << '_EOF_' > DEF # mouse (mm7) vs Tetraodon tetNig1 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # Reuse parameters from hg16-fr1 and danRer1-hg17. BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q # TARGET: Mouse (mm7) # small enough chunk to get reasonable running time SEQ1_DIR=/scratch/hg/mm7/mm7.2bit SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_CTGDIR=/scratch/hg/mm7/mm7Chroms_RandomContigs.2bit SEQ1_CTGLEN=/scratch/hg/mm7/mm7Chroms_RandomContigs.sizes SEQ1_LIFT=/scratch/hg/mm7/Chroms_RandomContigs.lft SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=5000 # QUERY: Tetraodon tetNig1 # large enough chunk to do all genome in one piece SEQ2_DIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=410000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzTetNig1.2005-09-20 TMPDIR=/scratch/tmp '_EOF_' # < happy emacs # establish a screen to control this job screen time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ `pwd`/DEF > 
blastz.out 2>&1 &
    # detach from screen session: Ctrl-a Ctrl-d
    # to reattach to this screen session:
    ssh pk
    screen -d -r
# Completed: 435 of 435 jobs
# CPU time in finished jobs: 3994116s 66568.60m 1109.48h 46.23d 0.127 y
# IO & Wait Time: 180402s 3006.70m 50.11h 2.09d 0.006 y
# Average job time: 9597s 159.94m 2.67h 0.11d
# Longest finished job: 20697s 344.95m 5.75h 0.24d
# Submission to last job: 108662s 1811.03m 30.18h 1.26d

# Completed: 435 of 435 jobs
# CPU time in finished jobs: 16s 0.27m 0.00h 0.00d 0.000 y
# IO & Wait Time: 1403s 23.38m 0.39h 0.02d 0.000 y
# Average job time: 3s 0.05m 0.00h 0.00d
# Longest finished job: 7s 0.12m 0.00h 0.00d
# Submission to last job: 92s 1.53m 0.03h 0.00d

# Completed: 183 of 183 jobs
# CPU time in finished jobs: 2603s 43.38m 0.72h 0.03d 0.000 y
# IO & Wait Time: 1175s 19.59m 0.33h 0.01d 0.000 y
# Average job time: 21s 0.34m 0.01h 0.00d
# Longest finished job: 321s 5.35m 0.09h 0.00d
# Submission to last job: 232082s 3868.03m 64.47h 2.69d

    # Broken chaining run on empty result files, finish that off
    # manually, then (mistakenly overwrite blastz.out here)
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue chainMerge -bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &

    # swap results to place mm7 alignments onto TetNig1
    ssh hgwdev
    cd /cluster/data/mm7/bed/blastzTetNig1.2005-09-20
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk -fileServer kolossus \
	`pwd`/DEF > swap.out 2>&1 &

    # Measurements:
    cat << '_EOF_' > measurements.sh
#!/bin/sh
#
# measurements.sh - run featureBits on the TetNig1 net, chain and
# chainLink tables of mm7, mm6 and mm5, and on the corresponding
# Mm7/Mm6/Mm5 tables of tetNig1, so the three mouse builds can be compared.

# doOne <db> <table> - echo the featureBits command being run, then run
# it under 'time'.  The " #<tab>" prefix is printed without a newline so
# whatever featureBits prints next starts on that same line.
doOne() {
    db=$1
    tbl=$2
    echo " featureBits $db $tbl"
    # printf rather than 'echo -en': echo option handling is not
    # portable across /bin/sh implementations
    printf ' #\t'
    time featureBits "$db" "$tbl"
}

# mouse side; the list covers every db/table pair that appears in the
# recorded results following this script
doOne mm7 netTetNig1
doOne mm6 netTetNig1
doOne mm5 netTetNig1
doOne mm7 chainTetNig1
doOne mm6 chainTetNig1
doOne mm5 chainTetNig1
doOne mm7 chainTetNig1Link
doOne mm6 chainTetNig1Link
doOne mm5 chainTetNig1Link
# tetraodon side
doOne tetNig1 netMm7
doOne tetNig1 netMm6
doOne tetNig1 netMm5
doOne tetNig1 chainMm7
doOne tetNig1 chainMm6
doOne tetNig1 chainMm5
doOne tetNig1 chainMm7Link
doOne tetNig1 chainMm6Link
doOne tetNig1 chainMm5Link
'_EOF_'
    # << keep emacs happy
    chmod +x measurements.sh
    time
./measurements.sh featureBits mm7 netTetNig1 # 705879667 bases of 2583394090 (27.324%) in intersection featureBits mm6 netTetNig1 # 720943295 bases of 2597150411 (27.759%) in intersection featureBits mm5 netTetNig1 # 618111072 bases of 2615483787 (23.633%) in intersection featureBits mm7 chainTetNig1 # 750370420 bases of 2583394090 (29.046%) in intersection featureBits mm6 chainTetNig1 # 771732145 bases of 2597150411 (29.715%) in intersection featureBits mm5 chainTetNig1 # 652622662 bases of 2615483787 (24.952%) in intersection featureBits mm7 chainTetNig1Link # 51363209 bases of 2583394090 (1.988%) in intersection featureBits mm6 chainTetNig1Link # 62346107 bases of 2597150411 (2.401%) in intersection featureBits mm5 chainTetNig1Link # 43905129 bases of 2615483787 (1.679%) in intersection featureBits tetNig1 netMm7 # 160849376 bases of 342403326 (46.977%) in intersection featureBits tetNig1 netMm6 # 176451958 bases of 342403326 (51.533%) in intersection featureBits tetNig1 netMm5 # 152232538 bases of 342403326 (44.460%) in intersection featureBits tetNig1 chainMm7 # 175481308 bases of 342403326 (51.250%) in intersection featureBits tetNig1 chainMm6 # 197657323 bases of 342403326 (57.726%) in intersection featureBits tetNig1 chainMm5 # 163683179 bases of 342403326 (47.804%) in intersection featureBits tetNig1 chainMm7Link # 47408068 bases of 342403326 (13.846%) in intersection featureBits tetNig1 chainMm6Link # 55282376 bases of 342403326 (16.145%) in intersection featureBits tetNig1 chainMm5Link # 41736750 bases of 342403326 (12.189%) in intersection # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzTetNig1.2005-09-20 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzTetNig1.2005-09-20 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & # real 155m15.610s time 
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > rescoreChainDownload.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > rescoreChainDownloadSwap.out 2>&1 & # Measurements: ssh kolossus cd /cluster/data/mm7/bed/blastzTetNig1.2005-09-20 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainTetNig1Link > fb.mm7.chainTetNig1Link.rescore 2>&1 cat fb.mm7.chainTetNig1Link.rescore # 50521413 bases of 2583394090 (1.956%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits tetNig1 \ chainMm7Link > fb.tetNig1.chainMm7Link.rescore 2>&1 cat fb.tetNig1.chainMm7Link.rescore # 46775251 bases of 342403326 (13.661%) in intersection ######################################################################### # CPGISLANDS (DONE - 2005-09-20 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm7/bed/cpgIsland cd /cluster/data/mm7/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make # gcc readseq.c cpg_lh.c -o cpglh.exe mv cpglh.exe ../.. # cpglh.exe requires hard-masked (N) .fa's. # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. 
ssh kkstore02 cd /cluster/data/mm7/bed/cpgIsland for F in ../../*/chr*.fa.masked do FA=${F/*\/} C=${FA/.fa.masked/} echo "./cpglh.exe ${FA} > ${C}.cpg" ./cpglh.exe ${F} > ${C}.cpg done > cpglh.out 2>&1 & # Bad char 0x52 = 'R' at line 164245, base 8212187, sequence chr14 # Bad char 0x53 = 'S' at line 167424, base 8371114, sequence chr14 # Bad char 0x53 = 'S' at line 167426, base 8371198, sequence chr14 # Several chroms have 0 results: # -rw-rw-r-- 1 0 Sep 20 11:52 chrM.cpg # -rw-rw-r-- 1 0 Sep 20 11:52 chrY.cpg # -rw-rw-r-- 1 0 Sep 20 11:52 chrY_random.cpg # XXX - this is interesting that chrY, either one, have nothing. # the previous mm5 release did have some on chrY # Evidently the new chrY is too short - this chrY is being # reconstructed and only a small part of it is known in this # assembly. The bulk of chrY from previous assemblies is now in # chrY_random # Transform cpglh output to bed + cat << '_EOF_' > filter.awk { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed ssh hgwdev cd /cluster/data/mm7/bed/cpgIsland hgLoadBed -strict mm7 cpgIslandExt -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Loaded 16030 elements of size 10 # Sorted # Saving bed.tab # Loading mm7 featureBits mm7 cpgIslandExt # 10439328 bases of 2583394090 (0.404%) in intersection featureBits mm6 cpgIslandExt # 10432360 bases of 2597150411 (0.402%) in intersection featureBits mm5 cpgIslandExt # 10422989 bases of 2615483787 (0.399%) in intersection featureBits mm4 cpgIsland # 11109692 bases of 2627444668 (0.423%) in intersection featureBits mm3 cpgIsland # 10102968 bases of 2505900260 (0.403%) in intersection ######################################################################### # ANDY LAW CPGISLANDS (DONE - 
2005-09-20 - Hiram) # See notes in makeGalGal2.doc and makeCanFam2.doc ssh kkstore02 mkdir /cluster/data/mm7/bed/cpgIslandGgfAndy cd /cluster/data/mm7/bed/cpgIslandGgfAndy # Build the preProcGgfAndy program in # kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE # Use masked sequence since this is a mammal... for F in ../../*/chr*.fa.masked do FA=${F/*\/} C=${FA/.fa.masked/} echo preproc and run on masked "${C} ${F}" 1>/dev/stderr ~/bin/$MACHTYPE/preProcGgfAndy ${F} \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g1,$oE) = split("\t"); $s--; $gc=$c+$g1; $pCpG=(100.0 * 2 * $cpg / $n); $pGc=(100.0 * $gc / $n); $_="'${C}'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . "$pCpG\t$pGc\t$oE\n";' done | sort -k1,1 -k2,2n > cpgIslandGgfAndyMasked.bed # load into database: ssh hgwdev cd /cluster/data/mm7/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql hgLoadBed mm7 cpgIslandGgfAndyMasked -tab -noBin \ -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed # Loaded 67616 elements of size 10 featureBits mm7 cpgIslandExt # 10439328 bases of 2583394090 (0.404%) in intersection featureBits mm7 cpgIslandGgfAndyMasked # 38774242 bases of 2583394090 (1.501%) in intersection wc -l ../cpgIsland/cpgIsland.bed *bed # 16030 ../cpgIsland/cpgIsland.bed # 67616 cpgIslandGgfAndyMasked.bed ######################################################################### # BLASTZ Cow bosTau2 (DONE - 2005-09-20 - 2005-11-22 - Hiram) ssh kkstore02 # There is no need to use the chrBin0 in this alignment # Create a bosTau2 2bit file without the chrBin0 sequence mkdir /cluster/data/bosTau2/noBin0 cd /cluster/data/bosTau2/noBin0 twoBitInfo ../bosTau2.2bit stdout | grep -v chrBin0 \ | awk '{print $1}' > chrList twoBitToFa ../bosTau2.2bit stdout -seqList=chrList \ | faToTwoBit stdin bosTau2.noBin0.2bit twoBitInfo bosTau2.noBin0.2bit noBin0.sizes rm chrList mkdir 
/cluster/data/mm7/bed/blastzBosTau2.2005-09-20 cd /cluster/data/mm7/bed ln -s blastzBosTau2.2005-09-20 blastz.bosTau2 cd blastzBosTau2.2005-09-20 cat << '_EOF_' > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.x86_64 BLASTZ_M=50 # mouse vs. cow # TARGET: Mouse (mm7) # small enough chunk to get reasonable running time SEQ1_DIR=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit SEQ1_CTGDIR=/cluster/bluearc/mm7/mm7.2bit SEQ1_LIFT=/cluster/bluearc/mm7/Chroms_RandomContigs.lft SEQ1_LEN=/cluster/bluearc/mm7/chrom.sizes SEQ1_CTGLEN=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.sizes SEQ1_LIMIT=30 SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=500000 SEQ1_LAP=5000 # QUERY: Cow bosTau2 # large enough chunk to do all the genome in one piece SEQ2_DIR=/san/sanvol1/scratch/bosTau2/bosTau2.noBin0.2bit SEQ2_LEN=/san/sanvol1/scratch/bosTau2/noBin0.sizes SEQ2_CHUNK=3200000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzBosTau2.2005-09-20 TMPDIR=/scratch/tmp '_EOF_' # << keep emacs coloring happy # establish a screen to control this job screen time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl \ -continue=cat -bigClusterHub=pk \ `pwd`/DEF > continue.cat.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl \ -continue=chainMerge -bigClusterHub=pk \ `pwd`/DEF > continue.chainMerge.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl \ -swap -bigClusterHub=pk \ `pwd`/DEF > swap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl \ `pwd`/DEF > blast.run.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kkstore02 screen -d -r # STARTED - 2005-03-18 13:20 # BROKEN - 2005-03-20 - 22:03 - power failure to all machines # RESTARTED - 2005-03-30 14:35 # After several reruns of the batch, believe it may be finished. 
# establish check point marker in the run.time file: para time > run.time # Now to the rest of the story: ssh eieio cd /cluster/data/mm7/bed/blastzBosTau1.2005_03_18 time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \ -continue cat `pwd`/DEF > cat.run.out 2>&1 & # Completed: 40 of 40 jobs # CPU time in finished jobs: 834s 13.90m 0.23h 0.01d 0.000 y # IO & Wait Time: 2421s 40.35m 0.67h 0.03d 0.000 y # Average job time: 81s 1.36m 0.02h 0.00d # Longest job: 334s 5.57m 0.09h 0.00d # Submission to last job: 365s 6.08m 0.10h 0.00d # measurements: ssh hgwdev cd /cluster/data/mm7/bed/blastzBosTau1.2005_03_18 time ../../jkStuff/netChainCheck.sh mm7 mm5 bosTau1 >measurements.out 2>&1 & featureBits mm7 netBosTau1 # 1483158691 bases of 2597150411 (57.107%) in intersection featureBits mm5 netBosTau1 # 1491250043 bases of 2615483787 (57.016%) in intersection featureBits mm7 chainBosTau1 # 1551920940 bases of 2597150411 (59.755%) in intersection featureBits mm5 chainBosTau1 # 1557897465 bases of 2615483787 (59.564%) in intersection featureBits mm7 chainBosTau1Link # 603091864 bases of 2597150411 (23.221%) in intersection featureBits mm5 chainBosTau1Link # 606973993 bases of 2615483787 (23.207%) in intersection # Looking OK, so do the swap ssh eieio cd /cluster/data/mm7/bed/blastzBosTau1.2005_03_18 time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \ -swap `pwd`/DEF > swap.run.out 2>&1 & # 308 m = 5h 8m # failed on kolossus due to NFS problems ssh kolossus cd /cluster/data/bosTau1/bed/blastz.mm7.swap/axtChain # extract the unfinished portion of netChains.csh into # finiChains.csh and run it: time ./finiChains.csh # STARTED - 2005-04-06 # 13h 50m # continuing ssh eieio cd /cluster/data/mm7/bed/blastzBosTau1.2005_03_18 time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \ -swap -continue load `pwd`/DEF > load.run.out 2>&1 & # 5h 6min load time # checking measurements: featureBits bosTau1 netMm6 # 1317934269 bases of 2261116798 (58.287%) in 
intersection featureBits bosTau1 netMm5 # 1317539731 bases of 2261116798 (58.269%) in intersection featureBits bosTau1 chainMm6 # 1325743373 bases of 2261116798 (58.632%) in intersection featureBits bosTau1 chainMm5 # 1325445280 bases of 2261116798 (58.619%) in intersection featureBits bosTau1 chainMm6Link # 589779558 bases of 2261116798 (26.084%) in intersection featureBits bosTau1 chainMm5Link # 588460684 bases of 2261116798 (26.025%) in intersection # looks good, done. # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzBosTau2.2005-09-20 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzBosTau2.2005-09-20 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & # Measurements: ssh kolossus cd /cluster/data/mm7/bed/blastzBosTau2.2005-11-14 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainBosTau2Link > fb.mm7.chainBosTau2Link.rescore 2>&1 cat fb.mm7.chainBosTau2Link.rescore # 692466702 bases of 2583394090 (26.805%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits bosTau2 \ chainMm7Link > fb.bosTau2.chainMm7Link.rescore 2>&1 cat fb.bosTau2.chainMm7Link.rescore # 678966583 bases of 2812203870 (24.144%) in intersection time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > rescoreChainDownload.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > rescoreChainSwapDownload.out 2>&1 & ############################################################################# # BLASTZ CHICKEN galGal2 - 
second time (DONE - 2005-11-14 - 2005-11-21 - Hiram) # After fixing a bug in the lineage specific repeat snip business # in blastz-run-ucsc script ssh kk mkdir /cluster/data/mm7/bed/blastzGalGal2.2005-11-14 cd /cluster/data/mm7/bed rm blastz.galGal2 ln -s blastzGalGal2.2005-11-14 blastz.galGal2 cd blastzGalGal2.2005-11-14 cat << '_EOF_' > DEF # mouse vs. chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin:/cluster/home/angie/schwartzbin BLASTZ=blastz.v7 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse (mm7) SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_SMSK=/scratch/hg/mm7/linSpecRep/notInChicken SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken galGal2 - single chunk big enough for whole chroms at once SEQ2_DIR=/scratch/hg/galGal2/nib SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzGalGal2.2005-11-14 TMPDIR=/scratch/tmp '_EOF_' # happy emacs # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk \ -stop=net \ `pwd`/DEF > blastz-to-net.out 2>&1 & # Started 2005-11-14 15:03 # Completed: 16524 of 16524 jobs # CPU time in finished jobs: 10883816s 181396.93m 3023.28h 125.97d 0.345 y # IO & Wait Time: 259868s 4331.13m 72.19h 3.01d 0.008 y # Average job time: 674s 11.24m 0.19h 0.01d # Longest finished job: 10058s 167.63m 2.79h 0.12d # Submission to last job: 43463s 724.38m 12.07h 0.50d # Completed: 306 of 306 jobs # CPU time in finished jobs: 33s 0.55m 0.01h 0.00d 0.000 y # IO & Wait Time: 1244s 20.73m 0.35h 0.01d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest finished job: 11s 0.18m 0.00h 0.00d # Submission to last job: 82s 1.37m 0.02h 
0.00d # Completed: 40 of 40 jobs # CPU time in finished jobs: 976s 16.27m 0.27h 0.01d 0.000 y # IO & Wait Time: 229s 3.81m 0.06h 0.00d 0.000 y # Average job time: 30s 0.50m 0.01h 0.00d # Longest finished job: 82s 1.37m 0.02h 0.00d # Submission to last job: 105s 1.75m 0.03h 0.00d time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk \ -continue=net -stop=load \ `pwd`/DEF > net-to-load.out 2>&1 & # measurements are looking good: featureBits mm7 chainGalGal2Link # 81103709 bases of 2583394090 (3.139%) in intersection time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk \ -swap -stop=load \ `pwd`/DEF > swap-to-load.out 2>&1 & # 72202388 bases of 1054197620 (6.849%) in intersection # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzGalGal2.2005-11-14 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzGalGal2.2005-11-14 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & # real 72m11.827s time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & # real 13m42.163s time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > rescoreDownload.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > rescoreDownloadSwap.out 2>&1 & # Measurements: ssh kolossus cd /cluster/data/mm7/bed/blastzGalGal2.2005-11-14 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainGalGal2Link > fb.mm7.chainGalGal2Link.rescore 2>&1 cat fb.mm7.chainGalGal2Link.rescore # 79325073 bases of 2583394090 (3.071%) in intersection time 
HGDB_CONF=~/.hg.conf.read-only featureBits galGal2 \ chainMm7Link > fb.galGal2.chainMm7Link.rescore 2>&1 cat fb.galGal2.chainMm7Link.rescore # 70793655 bases of 1054197620 (6.715%) in intersection ############################################################################# # BLASTZ CHICKEN - (DONE - 2005-10-03 - Hiram) # MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN # In an email 2/13/04, Arian said we could treat all human repeats as # lineage-specific for human-chicken blastz. Do the same for mouse. ssh pk mkdir /cluster/data/mm7/bed/blastzGalGal2.2005-09-29 cd /cluster/data/mm7/bed/ ln -s blastzGalGal2.2005-09-29 blastz.galGal2 cd blastzGalGal2.2005-09-29 cat << '_EOF_' > DEF # mouse vs. chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse (mm7) SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_SMSK=/scratch/hg/mm7/linSpecRep/notInChicken SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken galGal2 SEQ2_DIR=/scratch/hg/galGal2/nib SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzGalGal2.2005-09-29 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy # Probably should have done this with chainMinScore=5000 # But it doesn't seem to have made much difference according to the # featureBits output measurements. 
ssh pk screen cd /cluster/data/mm7/bed/blastzGalGal2.2005-09-29 time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh pk screen -d -r # Completed: 24480 of 24480 jobs # CPU time in finished jobs: 4362447s 72707.44m 1211.79h 50.49d 0.138 y # IO & Wait Time: 80948s 1349.14m 22.49h 0.94d 0.003 y # Average job time: 182s 3.03m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 645s 10.75m 0.18h 0.01d # Submission to last job: 13853s 230.88m 3.85h 0.16d # Completed: 306 of 306 jobs # CPU time in finished jobs: 34s 0.56m 0.01h 0.00d 0.000 y # IO & Wait Time: 1060s 17.67m 0.29h 0.01d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 7s 0.12m 0.00h 0.00d # Submission to last job: 72s 1.20m 0.02h 0.00d # Completed: 40 of 40 jobs # CPU time in finished jobs: 935s 15.58m 0.26h 0.01d 0.000 y # IO & Wait Time: 182s 3.04m 0.05h 0.00d 0.000 y # Average job time: 28s 0.47m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 82s 1.37m 0.02h 0.00d # Submission to last job: 104s 1.73m 0.03h 0.00d # measurements are looking good: featureBits mm7 chainGalGal2Link # 80992755 bases of 2583394090 (3.135%) in intersection featureBits mm6 chainGalGal2Link # 82018349 bases of 2597150411 (3.158%) in intersection featureBits mm5 chainGalGal2Link # 78951466 bases of 2615483787 (3.019%) in intersection featureBits mm7 netGalGal2 # 1946029450 bases of 2583394090 (75.328%) in intersection featureBits mm6 netGalGal2 # 1937053597 bases of 2597150411 (74.584%) in intersection featureBits mm5 netGalGal2 # 1958796258 bases of 2615483787 (74.892%) in intersection featureBits mm7 chainGalGal2 # 1975402545 bases of 2583394090 (76.465%) in intersection featureBits mm6 chainGalGal2 # 1969505681 bases of 2597150411 (75.833%) in intersection 
featureBits mm5 chainGalGal2 # 1990102297 bases of 2615483787 (76.089%) in intersection # Since those are OK, now do the swap: time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -swap -chainMinScore=5000 \ `pwd`/DEF > swap.out 2>&1 & # Had a failure during loading, finished it manually, then time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -continue=download -swap -chainMinScore=5000 \ `pwd`/DEF > swap.download-clean.out 2>&1 & # and measure: featureBits galGal2 netMm6 # 832583709 bases of 1054197620 (78.978%) in intersection featureBits galGal2 netMm5 # 835277984 bases of 1054197620 (79.234%) in intersection featureBits galGal2 chainMm6 # 843746491 bases of 1054197620 (80.037%) in intersection featureBits galGal2 chainMm5 # 846905330 bases of 1054197620 (80.336%) in intersection featureBits galGal2 chainMm7Link # 72046938 bases of 1054197620 (6.834%) in intersection featureBits galGal2 chainMm6Link # 72687426 bases of 1054197620 (6.895%) in intersection featureBits galGal2 chainMm5Link # 70542788 bases of 1054197620 (6.692%) in intersection ############################################################################# # BLASTZ OPOSSUM second time (WORKING - 2006-01-24 - Hiram) # With opossum sequence now properly repeat masked ssh pk mkdir /cluster/data/mm7/bed/blastzMonDom2.2006-01-24 cd /cluster/data/mm7/bed/blastzMonDom2.2006-01-24 cat << '_EOF_' > DEF # Mouse vs. 
opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Mouse (mm7) SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom2 SEQ2_DIR=/scratch/hg/monDom2/monDom2.2bit SEQ2_LEN=/scratch/hg/monDom2/chrom.sizes SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzMonDom2.2006-01-24 '_EOF_' # happy emacs cd /cluster/data/mm7/bed/blastzMonDom2.2006-01-24 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF >blastz.out 2>&1 # real 570m27.849s # ran into difficulty during install downloads because the # previous results exists. So, go to hgwdev ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/mm7 rm -fr vsMonDom2 rm liftOver/mm7ToMonDom2.over.chain.gz # then, repeat the downloads step cd /cluster/data/mm7/bed/blastzMonDom2.2006-01-24 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=download \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > download.out 2>&1 # And a featureBits measurement: ssh kolossus time featureBits mm7 chainMonDom2Link > fb.mm7.chainMonDom2Link 2>&1 & # real 61m59.651s # 215440797 bases of 2583394090 (8.339%) in intersection # before you can do the swap, you need to remove the existing swap ############################################################################# # BLASTZ OPOSSUM (DONE - 2005-10-03 - 2005-11-24 - Hiram) ssh pk mkdir /cluster/data/mm7/bed/blastzMonDom2.2005-10-03 cd /cluster/data/mm7/bed ln -s blastzMonDom2.2005-10-03 blastz.monDom2 cd blastzMonDom2.2005-10-03 cat << '_EOF_' > DEF # mouse vs. 
opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Mouse (mm7) SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom2 SEQ2_DIR=/scratch/hg/monDom2/monDom2.2bit SEQ2_LEN=/scratch/hg/monDom2/chrom.sizes SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzMonDom2.2005-10-03 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy # This should have been done with chainMinScore=5000 # After this was all finished, went back manually and reran the # chaining step to get a -minScore=5000 in there. # Probably should have done this with chainMinScore=5000 # But it doesn't seem to have made much difference according to the # featureBits output measurements. 
ssh pk screen cd /cluster/data/mm7/bed/blastzMonDom2.2005-10-03 time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & # STARTED 2005-10-03 14:30 # real 579m33.438s # user 0m0.064s # sys 0m0.043s # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainMonDom2Link ssh pk screen -d -r # Completed: 44370 of 44370 jobs # CPU time in finished jobs: 7966937s 132782.29m 2213.04h 92.21d 0.253 y # IO & Wait Time: 195416s 3256.93m 54.28h 2.26d 0.006 y # Average job time: 184s 3.07m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 6810s 113.50m 1.89h 0.08d # Submission to last job: 60543s 1009.05m 16.82h 0.70d time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -continue=cat \ `pwd`/DEF > continue.cat.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=5000 -bigClusterHub=pk \ -swap -fileServer=kolossus \ `pwd`/DEF > swap.out 2>&1 & # The load of the swapped chains can not complete, out of memory # So, try to split them up into 1000 lumps: ssh kkstore01 cd /cluster/data/monDom2/bed/blastz.mm7.swap/axtChain chainSplit -lump=1000 lump1000 monDom2.mm7.all.chain.gz ssh kolossus cd /cluster/data/mm7/bed/blastzMonDom1.2005-10-03 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainMonDom2Link featureBits mm6 netMonDom1 # 2082064216 bases of 2597150411 (80.167%) in intersection featureBits mm5 netMonDom1 # 2094316044 bases of 2615483787 (80.074%) in intersection featureBits mm7 chainMonDom1 # 2109438148 bases of 2597150411 (81.221%) in intersection featureBits mm5 chainMonDom1 # 2121448151 bases of 2615483787 (81.111%) in intersection featureBits mm7 chainMonDom1Link # 249576105 bases of 2597150411 (9.610%) in intersection featureBits mm5 chainMonDom1Link # 248180346 bases of 2615483787 (9.489%) in intersection # measurements: time 
HGDB_CONF=~/.hg.conf.read-only featureBits monDom2 chainMm7Link featureBits monDom1 netMm6 # 2884735370 bases of 3492108230 (82.607%) in intersection featureBits monDom1 netMm5 # 2889580530 bases of 3492108230 (82.746%) in intersection featureBits monDom1 chainMm6 # 2908045004 bases of 3492108230 (83.275%) in intersection featureBits monDom1 chainMm5 # 2913812625 bases of 3492108230 (83.440%) in intersection featureBits monDom1 chainMm6Link # 253105698 bases of 3492108230 (7.248%) in intersection featureBits monDom1 chainMm5Link # 249594220 bases of 3492108230 (7.147%) in intersection # looks OK, done # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzMonDom2.2005-10-03 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzMonDom2.2005-10-03 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=net -stop=load \ `pwd`/DEF > rescoreChainSwap.net-to-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > rescoreChainDownload.out 2>&1 & ############################################################################## # BLASTZ FROG Xenopus tropicalis (DONE - 2005-10-03 - 2005-11-21 - Hiram) ssh pk mkdir /cluster/data/mm7/bed/blastzXenTro1.2005-10-03 cd /cluster/data/mm7/bed ln -s blastzXenTro1.2005-10-03 blastz.xenTro1 cd blastzXenTro1.2005-10-03 cat << '_EOF_' > DEF # mouse vs. 
frog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Mouse (mm7) SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Frog xenTro1 SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzXenTro1.2005-10-03 TMPDIR=/scratch/tmp '_EOF_' # << keep emacs coloring happy screen time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 -chainMinScore=5000 \ -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & # STARTED 2005-10-03 14:30 # real 918m58.309s # user 0m0.063s # sys 0m0.056s # Had some trouble, finished the blastz manually, then continuing: time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 -chainMinScore=5000 \ -bigClusterHub=pk -continue=cat \ `pwd`/DEF > continue.cat.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kkstore02 screen -d -r # measurements ssh hgwdev cd /cluster/data/mm7/bed/blastzXenTro1.2005_04_05 time ../../jkStuff/netChainCheck.sh mm7 mm5 xenTro1 > measures.out 2>&1 & featureBits mm7 netXenTro1 # 1033982128 bases of 2583394090 (40.024%) in intersection featureBits mm6 netXenTro1 # 1033071781 bases of 2597150411 (39.777%) in intersection featureBits mm5 netXenTro1 # 1042210258 bases of 2615483787 (39.848%) in intersection featureBits mm7 chainXenTro1 # 1065396669 bases of 2583394090 (41.240%) in intersection featureBits mm6 chainXenTro1 # 1063392793 bases of 2597150411 (40.945%) in intersection featureBits mm5 chainXenTro1 # 1078618413 bases of 2615483787 (41.240%) in intersection featureBits mm7 chainXenTro1Link # 62465913 bases of 2583394090 (2.418%) in 
intersection featureBits mm6 chainXenTro1Link # 67119684 bases of 2597150411 (2.584%) in intersection featureBits mm5 chainXenTro1Link # 73115446 bases of 2615483787 (2.795%) in intersection # Those are looking good, now to the swap: ssh pk cd /cluster/data/mm7/bed/blastzXenTro1.2005-10-03 time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 -chainMinScore=5000 \ -swap -bigClusterHub=pk -continue=cat \ `pwd`/DEF > swap.out 2>&1 & # 70 minutes # Measurements: featureBits xenTro1 netMm7 # 681240580 bases of 1381238994 (49.321%) in intersection featureBits xenTro1 netMm6 # 683225633 bases of 1381238994 (49.465%) in intersection featureBits xenTro1 netMm5 # 697384254 bases of 1381238994 (50.490%) in intersection featureBits xenTro1 chainMm7 # 697097012 bases of 1381238994 (50.469%) in intersection featureBits xenTro1 chainMm6 # 700638086 bases of 1381238994 (50.725%) in intersection featureBits xenTro1 chainMm5 # 721494705 bases of 1381238994 (52.235%) in intersection HGDB_CONF=~/.hg.conf.read-only featureBits xenTro1 chainMm7Link # 59339777 bases of 1381238994 (4.296%) in intersection featureBits xenTro1 chainMm6Link # 64584213 bases of 1381238994 (4.676%) in intersection featureBits xenTro1 chainMm5Link # 76415718 bases of 1381238994 (5.532%) in intersection # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzXenTro1.2005-10-03 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzXenTro1.2005-10-03 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & # real 101m3.953s time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > 
rescoreChainDownload.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > rescoreChainDownloadSwap.out 2>&1 & # Measurements: ssh kolossus cd /cluster/data/mm7/bed/blastzXenTro1.2005-10-03 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainXenTro1Link > fb.mm7.chainXenTro1Link.rescore 2>&1 cat fb.mm7.chainXenTro1Link.rescore # 62465913 bases of 2583394090 (2.418%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits xenTro1 \ chainMm7Link > fb.xenTro1.chainMm7Link.rescore 2>&1 cat fb.xenTro1.chainMm7Link.rescore # 59339777 bases of 1381238994 (4.296%) in intersection ############################################################################# # BLASTZ CHIMP PanTro1 (DONE - 2005-10-04 - 2005-10-20 - Hiram) # This sequence didn't function correctly in the multiple # alignment. The blastz run was done a second time. ssh kkstore03 # Create the composite chrom/contigs from randoms 2bit file # for PanTro1 mkdir /cluster/data/panTro1/randomContigs cd /cluster/data/panTro1/randomContigs cp -p /cluster/data/cb2/scripts/agpToLift.pl ../jkStuff for AGP in ../pan_troglodytes_agp/ptr*_random.agp do CHR=`basename ${AGP}` CHR=${CHR/.agp} CHR=${CHR#ptr} echo ${CHR} sed -e "s/^ptr/chr/" ${AGP} | ../jkStuff/agpToLift.pl /dev/stdin \ > chr${CHR}.lft /cluster/data/mm7/jkStuff/lft2BitToFa.pl ../panTro1.2bit chr${CHR}.lft \ > chr${CHR}.ctg.fa done cat chr*.lft > ../jkStuff/panTro1Chroms_RandomContigs.lft # Check that the sequence remains the same cd /cluster/data/panTro1 faSize randomContigs/*.ctg.fa # 371582039 bases (50043664 N's 321538375 real 158541165 # upper 162997210 lower) in 31264 sequences in 26 files faSize */chr*_random.fa # 1336382039 bases (1014843664 N's 321538375 real 158541165 # upper 162997210 lower) in 26 sequences in 26 files # Note, only the N's are different faToTwoBit ?/chr?.fa ??/chr??.fa \ randomContigs/chr*.ctg.fa 
panTro1Chroms_RandomContigs.2bit # Verify sequence isn't broken: twoBitToFa panTro1Chroms_RandomContigs.2bit stdout | faSize stdin # 3455575440 bases (721627263 N's 2733948177 real 1461884170 # upper 1272064007 lower) in 31290 sequences in 1 files twoBitToFa panTro1.2bit stdout | faSize stdin # 4420375440 bases (1686427263 N's 2733948177 real 1461884170 # upper 1272064007 lower) in 52 sequences in 1 files # Only difference is the N count. 1686427263 - 721627263 = 964800000 XXX something is broken for the chrUn lift file: faSize chrUn_random.fa 240967748 bases (159721331 Ns 81246417 real 36386260 upper 44860157 lower) in 1 sequences in 1 files The agp file for chrUn has a contig gap as the last entry in the file: ptrUn_random 240945901 240957748 29919 W scaffold_9998 1 11848 + ptrUn_random 240957749 240967748 29920 N 10000 contig twoBitInfo panTro1Chroms_RandomContigs.2bit \ panTro1Chroms_RandomContigs.sizes ssh pk cd /cluster/data/panTro1 mkdir /san/sanvol1/scratch/panTro1/ cp -p panTro1.2bit /san/sanvol1/scratch/panTro1 cp -p chrom.sizes /san/sanvol1/scratch/panTro1 cp -p panTro1Chroms_RandomContigs.sizes /san/sanvol1/scratch/panTro1 cp -p panTro1Chroms_RandomContigs.2bit /san/sanvol1/scratch/panTro1 cp -p jkStuff/panTro1Chroms_RandomContigs.lft /san/sanvol1/scratch/panTro1 mkdir /cluster/data/mm7/bed/blastzPanTro1.2005-10-05 cd /cluster/data/mm7/bed ln -s blastzPanTro1.2005-10-05 blastz.panTro1 cd blastzPanTro1.2005-10-05 # same parameters as Human alignment, except for the use of the # SMSK linSpecRepeats - in this case, using none. Should be an # interesting comparison if the lineage specific repeats make much # difference in the result. 
cat << '_EOF_' > DEF # human vs mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin BLASTZ=blastz.x86_64 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: Mouse (mm7) small enough chunk to get reasonable run time SEQ1_DIR=/scratch/hg/mm7/mm7.2bit SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_CTGDIR=/scratch/hg/mm7/mm7Chroms_RandomContigs.2bit SEQ1_CTGLEN=/scratch/hg/mm7/mm7Chroms_RandomContigs.sizes SEQ1_LIFT=/scratch/hg/mm7/Chroms_RandomContigs.lft SEQ1_LIMIT=30 SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=500000 SEQ1_LAP=5000 # QUERY: Chimp panTro1 large enough chunk to go all at once SEQ2_DIR=/san/sanvol1/scratch/panTro1/panTro1.2bit SEQ2_LEN=/san/sanvol1/scratch/panTro1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/panTro1/panTro1Chroms_RandomContigs.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/panTro1/panTro1Chroms_RandomContigs.sizes SEQ2_LIFT=/san/sanvol1/scratch/panTro1/panTro1Chroms_RandomContigs.lft SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=4421000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzPanTro1.2005-10-05 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen time $HOME/bin/scripts/doBlastzChainNet.pl \ -bigClusterHub=pk \ `pwd`/DEF > blast.run.out 2>&1 & # STARTED 2005-10-07 12:48 time /cluster/bin/scripts/doBlastzChainNet.pl \ -continue=cat -bigClusterHub=pk \ `pwd`/DEF > continue.cat.out 2>&1 & # Had some problems due to chrom.sizes being incorrect because all # of the *_random chroms *end* with a gap ! Manually edited the # chrom.sizes file in /san/sanvol1/ to take off 50000 from the # length (or 10000 for the chrUn_random) to get the nets to # complete. 
time /cluster/bin/scripts/doBlastzChainNet.pl \ -continue=chainMerge -bigClusterHub=pk \ `pwd`/DEF > continue.chainMerge.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=net -bigClusterHub=pk \ `pwd`/DEF > continue.net.out 2>&1 & # check feature bits ssh kolossus HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainPanTro1Link # 917177798 bases of 2583394090 (35.503%) in intersection # Looks OK, now to swap time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -fileServer=kolossus \ `pwd`/DEF > swap.out 2>&1 & # Had some difficulties with the netting - after fixing, then load: time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=load -swap -bigClusterHub=kk \ `pwd`/DEF > load.swap.out 2>&1 & # 489 minutes = 8h 09m # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kkstore02 screen -d -r Completed: 155570 of 155570 jobs CPU time in finished jobs: 14707939s 245132.32m 4085.54h 170.23d 0.466 y IO & Wait Time: 609798s 10163.29m 169.39h 7.06d 0.019 y Average job time: 98s 1.64m 0.03h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 5146s 85.77m 1.43h 0.06d Submission to last job: 20972s 349.53m 5.83h 0.24d Completed: 331 of 331 jobs CPU time in finished jobs: 260s 4.33m 0.07h 0.00d 0.000 y IO & Wait Time: 1135s 18.92m 0.32h 0.01d 0.000 y Average job time: 4s 0.07m 0.00h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 16s 0.27m 0.00h 0.00d Submission to last job: 234s 3.90m 0.07h 0.00d Completed: 40 of 40 jobs CPU time in finished jobs: 7229s 120.48m 2.01h 0.08d 0.000 y IO & Wait Time: 207s 3.46m 0.06h 0.00d 0.000 y Average job time: 186s 3.10m 0.05h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 597s 9.95m 0.17h 0.01d Submission to last job: 1287s 21.45m 0.36h 0.01d ssh hgwdev cd /cluster/data/mm7/bed/blastzPanTro1.2005_04_08 featureBits mm7 netPanTro1 # 2569701404 bases of 2597150411 (98.943%) in 
intersection time featureBits mm7 netHg17 # 2579747741 bases of 2597150411 (99.330%) in intersection featureBits mm7 chainPanTro1 # 2585896564 bases of 2597150411 (99.567%) in intersection time featureBits mm7 chainHg17 # 2596946329 bases of 2597150411 (99.992%) in intersection featureBits mm6 chainPanTro1Link # 924893452 bases of 2597150411 (35.612%) in intersection featureBits mm7 chainHg17Link (on kolossus) # 966916309 bases of 2597150411 (37.230%) in intersection # Looks about correct, now for the swap ssh eieio cd /cluster/data/mm7/bed/blastzPanTro1.2005_04_08 time /cluster/bin/scripts/doBlastzChainNet.pl \ -swap -fileServer eieio `pwd`/DEF > swap.run.out 2>&1 & # 107 minutes featureBits panTro1 netMm6 # 3306360710 bases of 2733948177 (120.937%) in intersection featureBits panTro1 chainMm6 # 3363239156 bases of 2733948177 (123.018%) in intersection featureBits panTro1 chainMm6Link # 922583825 bases of 2733948177 (33.745%) in intersection featureBits -countGaps panTro1 netMm6 # 3306360710 bases of 4420375440 (74.798%) in intersection featureBits -countGaps panTro1 netHg16 # 4015411490 bases of 4420375440 (90.839%) in intersection featureBits -countGaps panTro1 chainMm6 # 3363239156 bases of 4420375440 (76.085%) in intersection featureBits -countGaps panTro1 chainHg16 # 4056193816 bases of 4420375440 (91.761%) in intersection # on kolossus: HGDB_CONF=~/.hg.conf.read-only featureBits -countGaps panTro1 chainHg16Link # 2611490291 bases of 4420375440 (59.078%) in intersection HGDB_CONF=~/.hg.conf.read-only featureBits -countGaps panTro1 chainMm6Link # 922583825 bases of 4420375440 (20.871%) in intersection # Appears to be reasonable, check the genome-test browser on both # the Mm6 assembly and the PanTro1 assembly to see if the net and # chain tracks appear and are in the proper order. 
# re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzPanTro1.2005-10-05 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzPanTro1.2005-10-05 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=load -stop=load \ `pwd`/DEF > rescoreChainSwapLoad.out 2>&1 & ############################################################################# # STS MARKERS DATA DOWNLOAD (DONE - 2005-09-29 - 2005-10-07 - Hiram) # Applied a filter to primers.psl - 2005-10-20 - Hiram ssh kkstore02 mkdir -p /cluster/data/mm7/bed/STSmarkers/downloads cd /cluster/data/mm7/bed/STSmarkers/downloads # these files appear to be new almost every day wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases # The new feature in the .aliases file this time are names with # spaces in them ! This changes our parsing business below, # hopefully the spaces in the names won't cause trouble elsewhere. 
wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/* # these reports from jax.org appear to be changing daily wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt ls -ogrt # -r--r--r-- 1 676 Mar 11 2004 README # -r--r--r-- 1 396858 Jan 28 2005 10090.MGI.txt # -r--r--r-- 1 240688 Mar 16 2005 10090.WI-YAC.txt # -r--r--r-- 1 390139 Mar 16 2005 10090.WI_MRC_RH.txt # -r--r--r-- 1 173344 Mar 16 2005 10090.WI-Genetic.txt # -rw-rw-r-- 1 4540721 Sep 29 02:18 MRK_Dump2.rpt # -rw-rw-r-- 1 2520760 Sep 29 02:19 PRB_PrimerSeq.rpt # -rw-rw-r-- 1 4345973 Sep 29 02:19 MRK_Sequence.rpt # -r--r--r-- 1 13832082 Sep 29 07:20 UniSTS.aliases # -r--r--r-- 1 4106299 Sep 29 07:21 UniSTS_mouse.sts # back to our work area, update the bed file # to do this we need a new UniSTS_mouse.alias file # it is created by a combination of information from several # of the above files ! AND ! the previous stsInfoMouse.bed file cd /cluster/data/mm7/bed/STSmarkers/downloads cp -p /cluster/data/mm6/bed/STSmarkers/downloads/*.sh . cp -p /cluster/data/mm6/bed/STSmarkers/downloads/*.pl . # There is a line in the fetchAllAliases.sh script that needs to # be updated, it must point to the previous bed file: # BEDFile=/cluster/data/mm6/bed/STSmarkers/stsInfoMouse.bed # Next time, this should read: # BEDFile=/cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed # This process has been captured in the script: # /cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh # which uses a couple of perl scripts in that same directory. 
# briefly it is: # ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0 # grep MGI: UniSTS.aliases > MGI.aliases # ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \ # stsInfoAliases.txt # ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases # cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \ # | sort -n > UniSTS_mouse.alias time ./fetchAllAliases.sh # Here is a normal set of errors: # processing UniSTS_mouse.sts to find aliases # # ERROR: KNOWN(==OK) duplicate ID: '108991' encountered at line # # 2388 # processing MGI.aliases # fetching existing aliases from previous stsInfoMouse.bed file # found 27615 potential errors in /cluster/data/mm6/bed/STSmarkers/stsInfoMouse.bed # to see the errors: grep ERROR stsInfoAliases.txt # verify those stsInfoMouse.bed aliases with UniSTS.aliases # with that, we can create a new stsInfoMouse.bed file: # Update the mm6 directory name here to mm7 cd /cluster/data/mm7/bed/STSmarkers /cluster/store5/mouseMarker/code/updateBed.pl \ /cluster/data/mm6/bed/STSmarkers/stsInfoMouse.bed \ downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \ downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \ downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile # Yontao updated /cluster/store5/mouseMarker/code/cleanInfo.pl 8/10/04 /cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed # copy the stsInfoMouse.bed file from working dir to the marker info storage fold. 
# added 2 new steps by Yontao mv /cluster/store5/mouseMarker/stsInfoMouse.bed \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 cp -p stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed # comparing to previous wc /cluster/store5/mouseMarker/stsInfoMouse.bed \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \ /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5 # 59843 794642 6802825 /cluster/store5/mouseMarker/stsInfoMouse.bed # 58980 784786 6690105 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 # 58493 778055 6524821 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5 # and from that, create new primer fa, epcr, etc: /cluster/store5/mouseMarker/code/luConvertPrimerToFa \ stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info # the mouseC.fa file will be empty wc mouse?.* # 0 0 0 mouseC.fa # 300968 300914 6798466 mouseP.fa # 33838 169275 2153113 mouseP.info # 334806 470189 8951579 total # the equivalent Mm6 files: # 0 0 0 mouseC.fa # 293305 293251 6624638 mouseP.fa # 32890 164528 2087271 mouseP.info # 326195 457779 8711909 total # the equivalent Mm5 files: # 0 0 0 mouseC.fa # 286740 286686 6474893 mouseP.fa # 32232 161234 2044810 mouseP.info # 318972 447920 8519703 total # copy the primers over to the san for the kluster run ssh pk mkdir -p /san/sanvol1/scratch/mm7/fasta cd /cluster/data/mm7 time cp --verbose -p ?/*.fa ??/*.fa /san/sanvol1/scratch/mm7/fasta cd /cluster/data/mm7/bed/STSmarkers mkdir -p /san/sanvol1/scratch/mm7/STSmarkers cp -p mouseP.fa /san/sanvol1/scratch/mm7/STSmarkers cp -p mouseP.info /san/sanvol1/scratch/mm7/STSmarkers # CLUSTER RUN FOR THE STS PRIMERS mkdir -p /cluster/data/mm7/bed/STSmarkers/primer mkdir -p /cluster/data/mm7/bed/STSmarkers/ePCR cd /cluster/data/mm7/bed/STSmarkers/primer # the mouseP.fa comes from above # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. 
cat << '_EOF_' > template #LOOP /cluster/bin/i386/blat.2 $(path1) /san/sanvol1/scratch/mm7/STSmarkers/mouseP.fa -ooc=/scratch/hg/h/mouse11.ooc -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl} #ENDLOOP '_EOF_' # << emacs mkdir primers.out ls -1S /san/sanvol1/scratch/mm7/fasta/chr*.fa > contig.lst gensub2 contig.lst single template jobList para create jobList para try para check para push # STARTED - 2005-09-20 14:26 Completed: 40 of 40 jobs CPU time in finished jobs: 367939s 6132.32m 102.21h 4.26d 0.012 y IO & Wait Time: 739s 12.31m 0.21h 0.01d 0.000 y Average job time: 9217s 153.62m 2.56h 0.11d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 42051s 700.85m 11.68h 0.49d Submission to last job: 42051s 700.85m 11.68h 0.49d # on the file server ssh kkstore02 cd /cluster/data/mm7/bed/STSmarkers/primer # filter alignments for (qEnd-qStart) vs. (tEnd-tStart) 2005-10-20 # should not be more than 100 bases different. # This filters out about 1,028,202 alignments, or # %17.4 = 100.0 * 1028202 / 5921712 pslSort dirs stdout temp primers.out | awk -F"\t" ' { if (((($13 - $12) - ($17 - $16)) > -100) && ((($13 - $12) - ($17 - $16)) < 100)) {print} } ' > primers.psl.100 rmdir temp # a rough comparison with previous results: wc primers.psl.100 # 4893510 102763628 510563575 primers.psl.100 wc primers.psl (unfiltered, Mm7) # 5921712 124355891 636898117 primers.psl wc /cluster/data/mm6/bed/STSmarkers/primer/primers.psl # 5724127 120206606 615248041 wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl # 5719969 120119288 590806241 wc /cluster/data/mm4/bed/STSmarkers/primer/primers.psl # 5745617 120657896 592135728 # another kluster run for the ePCR ssh pk cd /cluster/data/mm7/bed/STSmarkers/ePCR ls -1S /san/sanvol1/scratch/mm7/fasta/chr*.fa > contig.lst # pick up e-PCR source from # ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/ # version 2.3.1 11 Feb 2005 # Had to add the following to both re-PCR_main.cpp and # e-PCR_main.cpp 
to get them to compile on kolossus: // max and min Copied from /usr/include/mysql/my_global.h #define max(a, b) ((a) >? (b)) #define min(a, b) ((a) <? (b)) cat << '_EOF_' > runPCR.csh #!/bin/csh -fe /cluster/bin/x86_64/e-PCR $1 $2 N=1 M=50 W=5 > $3 '_EOF_' # << emacs chmod +x runPCR.csh cat << '_EOF_' > template #LOOP ./runPCR.csh /san/sanvol1/scratch/mm7//STSmarkers/mouseP.info $(path1) {check out line+ epcr.out/$(num1).epcr} #ENDLOOP '_EOF_' # << the mouseP.info was created above gensub2 contig.lst single template jobList para create jobList para try para check para push ... etc ... # STARTED 2005-09-30 11:45 # Completed: 40 of 40 jobs # CPU time in finished jobs: 62439s 1040.64m 17.34h 0.72d 0.002 y # IO & Wait Time: 180s 3.01m 0.05h 0.00d 0.000 y # Average job time: 1565s 26.09m 0.43h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 4582s 76.37m 1.27h 0.05d # Submission to last job: 4608s 76.80m 1.28h 0.05d ssh kkstore02 cd /cluster/data/mm7/bed/STSmarkers/ePCR # all those results become all.epcr cat epcr.out/*.epcr > all.epcr # comparing to previous results: wc all.epcr # 57709 230836 3185188 all.epcr wc /cluster/data/mm6/bed/STSmarkers/ePCR/all.epcr # 55871 223484 3086148 all.epcr wc /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr # 55677 222708 2945623 /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr wc /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr # 74705 298820 3971712 /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr # Mm4 seems to be out of whack cd /cluster/data/mm7/bed/STSmarkers/primer /cluster/bin/scripts/filterSTSPrimers \ -mouse ../stsInfoMouse.bed primers.psl.100 \ ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat # The output should show an increasing count: # Reading name info # Reading primer info # Processing file # 100000 # 200000 # 300000 # ... 
# 5700000 # Determining ePCR not found # wc primers.psl.filter.blat (after filter applied above to primers.psl) # 33986 713706 3637462 primers.psl.filter.blat wc primers.psl.filter.blat (before filter applied above to primers.psl) # 34674 728154 3718714 primers.psl.filter.blat wc /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.blat # 33662 706902 3605847 primers.psl.filter.blat wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.blat # 33476 702996 3442402 wc /cluster/data/mm4/bed/STSmarkers/primer/primers.psl.filter.blat # 32729 687309 3331894 # It appears Mm4 became sane after the filter # create accession_info.rdb touch empty_sequence.inf /cluster/bin/scripts/compileAccInfo -mouse \ /cluster/data/mm7 empty_sequence.inf # works with errors on missing randoms, etc...: # cat: /cluster/data/mm5/11/chr11_random.agp: No such file or directory # cat: /cluster/data/mm5/M/chrM_random.agp: No such file or directory mv accession_info.rdb accession_info.rdb.tmp /cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > \ accession_info.rdb rm accession_info.rdb.tmp # comparing results to previous # There is a dramatic decrease in Mm7 - this appears to be due to # the lower numbers of fragments in the agp files. 
# e.g.: [hiram@pk /cluster/data] wc mm6/*/chr*.agp | tail -1 # 170812 1459570 9240332 total [hiram@pk /cluster/data] wc mm7/*/chr*.agp | tail -1 # 70125 605050 3696903 total wc accession_info.rdb # 44046 484510 3112816 accession_info.rdb wc /cluster/data/mm6/bed/STSmarkers/primer/accession_info.rdb # 93052 1023576 6824900 accession_info.rdb wc /cluster/data/mm5/bed/STSmarkers/primer/accession_info.rdb # 131845 1450299 9681940 wc /cluster/data/mm4/bed/STSmarkers/primer/accession_info.rdb # 86935 956289 6374930 # creates epcr.not.found.nomatch and epcr.not.found.psl # /cluster/bin/scripts/epcrToPsl # Fixed this script (in mm6) to make it not look for contigs in the usual # manner, we don't have those for this assembly sed -e "s/mm6/mm7/g" /cluster/data/mm6/bed/STSmarkers/primer/epcrToPsl \ > ./epcrToPsl ./epcrToPsl -mouse \ epcr.not.found ../mouseP.info \ accession_info.rdb /cluster/data/mm7 # Comparing results to previous: wc epcr* # 474 1896 17400 epcr.not.found # 0 0 0 epcr.not.found.nomatch # 474 9954 47288 epcr.not.found.psl # 158 535 4308 epcrToPsl # 1106 12385 68996 total # Mm6 wc epcr* wc /cluster/data/mm6/bed/STSmarkers/primer/epcr* # 467 1868 17135 epcr.not.found # 63 756 6041 epcr.not.found.nomatch # 404 8484 40254 epcr.not.found.psl # 158 535 4308 epcrToPsl # 1092 11643 67738 total # Mm5 wc epcr* wc /cluster/data/mm5/bed/STSmarkers/primer/epcr* # 463 1852 17080 epcr.not.found # 61 732 5845 epcr.not.found.nomatch # 398 8358 38591 epcr.not.found.psl # 402 8442 39011 epcr.not.found.psl.orig # 1324 19384 100527 total # Mm4 wc epcr* wc /cluster/data/mm4/bed/STSmarkers/primer/epcr* # 328 1312 12011 epcr.not.found # 57 684 5474 epcr.not.found.nomatch # 266 5586 25711 epcr.not.found.psl # 163 552 4370 epcrToPsl # 814 8134 47566 total cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter wc primers.psl.filter (after filter applied above to primers.psl) # 34460 723660 3684750 primers.psl.filter wc primers.psl.filter (before filter applied above to 
primers.psl) # 35148 738108 3766002 primers.psl.filter wc /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter # 34066 715386 3646101 primers.psl.filter wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted # 33691 707511 3601164 primers.psl.filter.lifted # create primers.psl.filter.lifted.initial PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/extractPslInfo \ primers.psl.filter wc primers.psl.filter.initial (after filter applied above to primers.psl) # 34443 206658 1833793 primers.psl.filter.initial wc primers.psl.filter.initial (before filter applied above to primers.psl) # 35131 210786 1870800 primers.psl.filter.initial wc /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial # 34048 204288 1815222 primers.psl.filter.initial wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial # 33689 202134 1799016 primers.psl.filter.lifted.initial # create primers.psl.filter.lifted.initial.acc /cluster/bin/scripts/findAccession -agp \ -mouse primers.psl.filter.initial /cluster/data/mm7 # it complains about missing _random items, it is OK wc primers.psl.filter.initial.acc (after filter applied above to primers.psl) # 34443 241101 2160540 primers.psl.filter.initial.acc wc primers.psl.filter.initial.acc (before filter applied above to primers.psl) # 35131 245917 2204115 primers.psl.filter.initial.acc wc /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial.acc # 34048 238336 2154798 primers.psl.filter.initial.acc wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial.acc # 33689 235823 2158029 primers.psl.filter.lifted.initial.acc # this needs to be -rat as that specifies how to scan the # stsInfoMouse.bed file and it does not work if you use -mouse /cluster/bin/scripts/getStsId -rat \ ../stsInfoMouse.bed primers.psl.filter.initial.acc \ > primers.initial.acc.trans wc primers.initial.acc.trans (after filter applied above to primers.psl) # 34443 241101 1830431 
primers.initial.acc.trans wc primers.initial.acc.trans (before filter applied above to primers.psl) # 35131 245917 1867180 primers.initial.acc.trans wc /cluster/data/mm6/bed/STSmarkers/primer/primers.initial.acc.trans wc primers.initial.acc.trans # 34041 238287 1829724 primers.initial.acc.trans # I don't see any of these errors in the Mm7 build: # No id for 61645_RH126840 # No id for 4187_D10MIT171.2 # No id for 63449_RH125771 # No id for 67188_PMC99911P4 # No id for 8839_D6MIT360.1 # No id for 62732_RH126829 # No id for 63746_RH127126 sort -k 4n primers.initial.acc.trans > primers.final wc primers.final (after filter applied above to primers.psl) # 34443 241101 1830431 primers.final wc primers.final (before filter applied above to primers.psl) # 35131 245917 1867180 primers.final wc /cluster/data/mm6/bed/STSmarkers/primer/primers.final # 34041 238287 1829724 primers.final wc /cluster/data/mm5/bed/STSmarkers/primer/primers.final # 33689 235823 1834889 /cluster/data/mm5/bed/STSmarkers/primer/primers.final rm primers.initial.acc.trans cd /cluster/data/mm7/bed/STSmarkers # stsMarkers.final is empty for mouse touch stsMarkers.final dummy PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/combineSeqPrimerPos \ stsMarkers.final primer/primers.final > stsMarkers_pos.rdb wc stsMarkers_pos.rdb (after filter applied to primers.psl above) # 32869 230083 1868276 stsMarkers_pos.rdb wc stsMarkers_pos.rdb (before filter applied to primers.psl above) # 33421 233947 1903951 stsMarkers_pos.rdb wc /cluster/data/mm6/bed/STSmarkers/stsMarkers_pos.rdb # 32350 226450 1909506 stsMarkers_pos.rdb wc /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb # 32085 224595 1862816 /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb wc /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb # 31270 218890 1869417 /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb /projects/cc/hg/ytlu/bin/script/perl/createStsBed \ stsInfoMouse.bed stsMarkers_pos.rdb 500 > stsMapMouse.bed wc stsMapMouse.bed (after 
filter applied to primers.psl above) # 29842 308123 2135776 stsMapMouse.bed wc stsMapMouse.bed (before filter applied to primers.psl above) # 30296 312257 2166725 stsMapMouse.bed wc /cluster/data/mm6/bed/STSmarkers/stsMapMouse.bed # 29079 301678 2097544 stsMapMouse.bed wc /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed # 29069 301535 2123622 /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed # loading STS markers tables ssh hgwdev cd /cluster/data/mm7/bed/STSmarkers cp -p /cluster/data/mm6/bed/STSmarkers/ucscAlias.pl . ./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings # this does leave messages in ucscStsAlias.warnings but they seem # to be very similar to Mm6 with just a few new ones wc ucscStsAlias.tab (after applying filter to primers.psl above) # 144570 433667 3366815 ucscStsAlias.tab wc ucscStsAlias.tab (before applying filter to primers.psl above) # 144570 433667 3366815 ucscStsAlias.tab wc /cluster/data/mm6/bed/STSmarkers/ucscStsAlias.tab # 141585 424725 3284106 ucscStsAlias.tab wc /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab # 126624 379859 3037850 /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab hgsql -e "drop table stsAlias;" mm7 hgsql mm7 < ~/kent/src/hg/lib/stsAlias.sql hgsql -e \ 'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm7 hgsql -e "drop table stsMapMouseNew;" mm7 hgsql mm7 < ~/kent/src/hg/lib/stsMapMouseNew.sql hgsql -e \ 'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm7 hgsql -e "drop table stsInfoMouseNew;" mm7 hgsql mm7 < ~/kent/src/hg/lib/stsInfoMouseNew.sql hgsql -e \ 'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm7 hgLoadPsl -nobin -table=all_sts_primer mm7 primer/primers.psl.filter # load of all_sts_primer did not go as planned: 34460 record(s), # 0 row(s) skipped, 21 warning(s) loading primer/primers.psl.filter # load of all_sts_primer did not go as planned: 35148 record(s), 0 # row(s) skipped, 21 warning(s) loading 
primer/primers.psl.filter # load primer sequences mkdir /gbdb/mm7/stsMarker ln -s /cluster/data/mm7/bed/STSmarkers/mouseP.fa \ /gbdb/mm7/stsMarker/mouseP.fa # PLEASE NOTE THAT THE If you are going to reload this business, use the # -replace option on this hgLoadSeq # hgLoadSeq -replace mm7 /gbdb/mm7/stsMarker/mouseP.fa # otherwise there will be a problem that the seq and extFile tables # will be out of sync. hgLoadSeq mm7 /gbdb/mm7/stsMarker/mouseP.fa # Adding /gbdb/mm7/stsMarker/mouseP.fa # 33838 sequences # After applying the filter to primers.psl above (and in mm6 too) featureBits mm7 all_sts_primer # 3757119 bases of 2583394090 (0.145%) in intersection featureBits mm6 all_sts_primer # 3706406 bases of 2597150411 (0.143%) in intersection featureBits mm7 stsMapMouseNew # 4805958 bases of 2583394090 (0.186%) in intersection featureBits mm6 stsMapMouseNew # 4638338 bases of 2597150411 (0.179%) in intersection # Before applying the filter to primers.psl above featureBits mm7 all_sts_primer # 3794153 bases of 2583394090 (0.147%) in intersection featureBits mm6 all_sts_primer # 3735649 bases of 2597150411 (0.144%) in intersection featureBits mm5 all_sts_primer # 3727268 bases of 2615483787 (0.143%) in intersection featureBits mm7 stsMapMouseNew # 4917456 bases of 2583394090 (0.190%) in intersection featureBits mm6 stsMapMouseNew # 4736039 bases of 2597150411 (0.182%) in intersection featureBits mm5 stsMapMouseNew # 4719679 bases of 2615483787 (0.180%) in intersection hgsql -N mm7 -e "select count(*) from stsAlias;" # 140649 hgsql -N mm6 -e "select count(*) from stsAlias;" # 137738 hgsql -N mm5 -e "select count(*) from stsAlias;" # 122944 hgsql -N mm7 -e "select count(*) from stsInfoMouseNew;" # 59843 hgsql -N mm6 -e "select count(*) from stsInfoMouseNew;" # 58980 hgsql -N mm5 -e "select count(*) from stsInfoMouseNew;" # 58493 # compare old and new name lists: awk '{print $4}' /cluster/data/mm6/bed/STSmarkers/stsMapMouse.bed | \ sort -u > mm6.nameList awk '{print $4}' 
stsMapMouse.bed | sort -u > mm7.nameList # After applying the filter to primers.psl above (and in mm6 too) comm -12 mm?.nameList | wc # 27320 27320 265970 <- 27,320 names in common comm -23 mm6.nameList mm7.nameList | wc # 188 188 1695 <- 188 unique to mm6 list comm -13 mm6.nameList mm7.nameList | wc # 1107 1107 12626 <- 1,107 unique to mm7 list # Before applying the filter to primers.psl above comm -12 mm?.nameList | wc # 27658 27658 268827 <- 27,658 names in common comm -23 mm6.nameList mm7.nameList | wc # 197 197 1781 <- 197 unique to mm6 list comm -13 mm6.nameList mm7.nameList | wc # 1184 1184 13480 <- 1,184 unique to mm7 list # Mm6 vs Mm5: # 27454 27454 266951 <- 27,454 names in common # 182 182 1685 <- 182 unique to mm5 list # 1625 1625 15090 <- 1,625 unique to mm6 list ############################################################################# # 17-WAY VAR_MULTIZ - ALIGNMENTS (WORKING - 2005-10-25) ssh kkstore02 mkdir /cluster/data/mm7/bed/multiz17way cd /cluster/data/mm7/bed/multiz17way # create tree diagram to guide work below. # This tree was constructed from one that Adam is using for # ENCODE work and a 27-way alignment. Took that file and # removed some of the entries, adding together the appropriate # distances. 
cat << '_EOF_' > 17way.nh ((((((((( (human_hg17:0.006690,chimp_panTro1:0.007571):0.024272, macaque_rheMac1:0.0592):0.023960, ((rat_rn3:0.081728,mouse_mm7:0.077017):0.229273, rabbit_oryCun1:0.206767):0.1065):0.023026, (cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505, armadillo_dasNov1:0.149862):0.015994, (elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400, monodelphis_monDom2:0.371073):0.189124, chicken_galGal2:0.454691):0.123297, xenopus_xenTro1:0.782453):0.156067, ((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961, zebrafish_danRer3:0.782561):0.156067); '_EOF_' # happy emacs /cluster/bin/phast/draw_tree 17way.nh > 17way.ps /cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt grep -y mm7 17way.distances.txt | sort -k3,3n # Print out that file for reference, and use the calculated # distances in the table below to order the organisms and check # the button order on the browser. Zebrafish ends up before # tetraodon and fugu on the browser despite its distance. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. 
# # featureBits chainLink measures # chainMm7Link chain linearGap # distance on Mm7 on other minScore # 1 0.1587 - rat rn3 (% 68.744) (% 69.289) 3000 medium # 2 0.4677 - human hg17 (% 38.571) (% 34.706) 3000 medium # 3 0.4686 - chimp panTro1 (% 35.100) (% 32.910) 3000 medium # 4 0.4960 - macaque rheMac1 (% 33.898) (% 32.023) 3000 medium # 5 0.5131 - rabbit oryCun1 (% 19.319) (% 24.164) 3000 medium # 6 0.6142 - armadillo dasNov1 (% 16.838) (% 20.60x) 3000 medium # 7 0.6230 - dog canFam2 (% 32.266) (% 34.135) 3000 medium # 8 0.6256 - elephant loxAfr1 (% 18.381) (% 20.555) 3000 medium # 9 0.6344 - cow bosTau2 (% 26.805) (% 24.144) 3000 medium # 10 0.7805 - tenrec echTel1 (% 11.421) (% 14.36x) 5000 loose # 11 1.0698 - opossum monDom2 (% 8.501) (% 6.319) 5000 loose # 12 1.3425 - chicken galGal2 (% 3.071) (% 6.715) 5000 loose # 13 1.7936 - frog xenTro1 (% 2.418) (% 4.296) 5000 loose # 14 2.0157 - tetraodon tetNig1 (% 1.956) (% 13.661) 5000 loose # 15 2.0562 - fugu fr1 (% 1.906) (% 13.484) 5000 loose # 16 2.1059 - zebrafish danRer3 (% 2.694) (% 4.372) 5000 loose export H=/cluster/data/mm7/bed mkdir mafLinks for G in rn3 hg17 panTro1 rheMac1 oryCun1 dasNov1 canFam2 \ loxAfr1 bosTau2 echTel1 monDom2 galGal2 xenTro1 tetNig1 fr1 danRer3 do mkdir mafLinks/$G if [ ! -d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" exit 255 fi ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # Copy MAFs to SAN for kluster run ssh pk mkdir /san/sanvol1/scratch/mm7/multiz17way cd /san/sanvol1/scratch/mm7/multiz17way rsync -a --copy-links --progress \ /cluster/data/mm7/bed/multiz17way/mafLinks/ . 
#       We have about 5.9 Gb of data here, takes ~ 10 minutes to copy
# Keep private copies of the multiz binaries next to the data.
mkdir penn
cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/maf_project penn

# Progressive alignment up the tree w/o stager,
#       using multiz.v10.5 (var_multiz)
#       Method: align internal subtrees (using 0 flag to var_multiz)
#               Then, align these to mouse (using 1 flag to var_multiz)
#       NOTE: must use maf_project after each multiz run, in order
#       to order output.  Single-cov guaranteed by use of net MAF's,
#       so it is not necessary to run single_cov2.

# make output dir and run dir
ssh pk
cd /cluster/data/mm7/bed/multiz17way
mkdir -p maf
mkdir -p run
cd run

# create scripts to run var_multiz on cluster
# oneMultiz.csh has three calling modes:
#   oneMultiz.csh <chrom>                     remove the per-chrom work dir
#   oneMultiz.csh <chrom> <name> <dest>       copy <name>/<chrom>.maf to <dest>
#   oneMultiz.csh <chrom> <s1> <s2> <flag>    align s1 with s2 into dir <s1><s2>
cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set multi = /scratch/mm7/multiz17way.$c
set pairs = /san/sanvol1/scratch/mm7/multiz17way

# special mode --
# with 1 arg, cleanup
if ($#argv == 1) then
    rm -fr $multi
    exit
endif

# special mode --
# with 3 args, saves an alignment file
if ($#argv == 3) then
    cp $multi/$2/$c.maf $3
    exit
endif

set s1 = $2
set s2 = $3
set flag = $4

# locate input files -- in pairwise dir, or multiple dir
set d1 = $multi
set d2 = $multi
if (-d $pairs/$s1) then
    set d1 = $pairs
    set f1 = $d1/$s1/$c.maf.gz
    set t1 = /tmp/$s1.$c.maf
    zcat $f1 > $t1
else
    set f1 = $d1/$s1/$c.maf
    set t1 = /tmp/$s1.$c.maf
    cp -p $f1 $t1
endif
if (-d $pairs/$s2) then
    set d2 = $pairs
    set f2 = $d2/$s2/$c.maf.gz
    set t2 = /tmp/$s2.$c.maf
    zcat $f2 > $t2
else
    set f2 = $d2/$s2/$c.maf
    set t2 = /tmp/$s2.$c.maf
    cp -p $f2 $t2
endif
# write to output dir
set out = $multi/${s1}${s2}
mkdir -p $out

# check for empty input file
if (-s $t1 && -s $t2) then
    echo "Aligning $f1 $f2 $flag"
    /san/sanvol1/scratch/mm7/multiz17way/penn/multiz $t1 $t2 $flag \
        $out/$c.unused1.maf $out/$c.unused2.maf > $out/$c.full.maf
    cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
        $out/$c.tmp.maf
    echo "Ordering $c.maf"
    /san/sanvol1/scratch/mm7/multiz17way/penn/maf_project \
        $out/$c.tmp.maf mm7.$c > $out/$c.maf
else if (-s $t1) then
    cp -p $t1 $out/$c.maf
else if (-s $t2) then
    cp -p $t2 $out/$c.maf
endif
# Always remove both /tmp copies; previously an empty input's copy
# (or both copies, when both were empty) was left behind.
rm -f $t1 $t2
'EOF'
# << keep emacs coloring happy
chmod +x oneMultiz.csh
cp -p oneMultiz.csh /san/sanvol1/scratch/mm7/multiz17way/penn/oneMultiz.csh

# using your tree diagram printed above, arrange these alignments
# in order of the tree branches
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
# multiple alignment steps:
set c = $1
set s = "/san/sanvol1/scratch/mm7/multiz17way/penn/oneMultiz.csh"
$s $c hg17 panTro1 0
$s $c hg17panTro1 rheMac1 1
$s $c rn3 oryCun1 0
$s $c hg17panTro1rheMac1 rn3oryCun1 1
$s $c canFam2 bosTau2 0
$s $c hg17panTro1rheMac1rn3oryCun1 canFam2bosTau2 1
$s $c hg17panTro1rheMac1rn3oryCun1canFam2bosTau2 dasNov1 1
$s $c loxAfr1 echTel1 0
$s $c hg17panTro1rheMac1rn3oryCun1canFam2bosTau2dasNov1 loxAfr1echTel1 1
$s $c hg17panTro1rheMac1rn3oryCun1canFam2bosTau2dasNov1loxAfr1echTel1 monDom2 1
$s $c hg17panTro1rheMac1rn3oryCun1canFam2bosTau2dasNov1loxAfr1echTel1monDom2 \
        galGal2 1
$s $c \
hg17panTro1rheMac1rn3oryCun1canFam2bosTau2dasNov1loxAfr1echTel1monDom2galGal2 \
        xenTro1 1
$s $c tetNig1 fr1 0
$s $c tetNig1fr1 danRer3 1
$s $c \
hg17panTro1rheMac1rn3oryCun1canFam2bosTau2dasNov1loxAfr1echTel1monDom2galGal2xenTro1 \
        tetNig1fr1danRer3 1
# get final alignment file
$s $c \
hg17panTro1rheMac1rn3oryCun1canFam2bosTau2dasNov1loxAfr1echTel1monDom2galGal2xenTro1tetNig1fr1danRer3 \
        /cluster/data/mm7/bed/multiz17way/maf/$c.maf
#cleanup
$s $c
'EOF'
# << keep emacs coloring happy
chmod +x allMultiz.csh

# One job per chromosome name listed in chrom.lst
cat << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/mm7/bed/multiz17way/maf/$(root1).maf}
#ENDLOOP
'EOF'

cd /cluster/data/mm7/bed/multiz17way/run
awk '{print $1}' ../../../chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
para try; para check
para push
# Completed: 40 of 40 jobs
# CPU time in finished jobs:     460579s    7676.32m   127.94h    5.33d  0.015 y
# IO & Wait Time:                 13669s     227.81m     3.80h    0.16d  0.000 y
# Average job time:               11856s     197.60m     3.29h    0.14d
# Longest finished job:           48345s     805.75m    13.43h    0.56d
# Submission to last job:         48345s     805.75m    13.43h    0.56d

# combine results into a single file for loading
#       This step could be skipped and the catDir output sent directly
#       into the loader.
ssh kkstore02
cd /cluster/data/mm7/bed/multiz17way
#       There used to be a mafFilter here with a minScore of 500, but it
#       turns out that the scores in these maf files are pretty much
#       useless.  They range from very large negatives to very large
#       positives.
time catDir maf > multiz17way.maf
#       real    18m12.267s
#       makes a 15 Gb file:
#       -rw-rw-r--  1 15433191843 Nov 23 09:25 multiz17way.maf

# Create per-chrom individual maf files for downloads
#       (DONE - 2005-11-30)
ssh kkstore02
cd /cluster/data/mm7/bed/multiz17way
mkdir mafDownloads
for M in maf/chr*.maf
do
    B=`basename $M`
    cp -p ${M} mafDownloads/${B}
    gzip mafDownloads/${B}
    # progress message: "<file> done"
    echo ${B} done
done
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/mm7/multiz17way
cd /usr/local/apache/htdocs/goldenPath/mm7/multiz17way
ln -s /cluster/data/mm7/bed/multiz17way/mafDownloads/chr*.maf.gz .
# Load into database
# The combined MAF is linked into /gbdb, then the alignment table and its
# summary table are loaded.
ssh hgwdev
cd /cluster/data/mm7/bed/multiz17way
mkdir /gbdb/mm7/multiz17way
ln -s /cluster/data/mm7/bed/multiz17way/multiz17way.maf \
        /gbdb/mm7/multiz17way
time hgLoadMaf mm7 multiz17way
#       Loaded 11964967 mafs in 1 files from /gbdb/mm7/multiz17way
#       real    32m40.228s
time hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 mm7 \
        multiz17waySummary multiz17way.maf
#       Processed 66508536 components in 11964967 mafs from multiz17way.maf
#       real    37m56.218s

# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql mm7 -e "alter table multiz17waySummary drop index chrom_2"
hgsql mm7 -e "alter table multiz17waySummary drop index chrom_3"

# create tree image:
cat << '_EOF_' > species.nh
((((((human,(mouse,rat)),(dog,cow)),opossum),chicken),frog),(tetraodon,zebrafish))
'_EOF_'
/cluster/bin/phast/draw_tree -b -s species.nh > species10.ps
#       photoshop to enhance, reduce the amount of whitespace to make it
#       smaller, then save as jpg
cp species10.jpg /usr/local/apache/htdocs/images/phylo/Mm7_17way.jpg

############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS
#       (DONE 2005-11-23 - 2005-12-07 - Hiram)
# Estimate phastCons parameters
ssh kkstore02
mkdir /cluster/data/mm7/bed/multiz17way/cons
cd /cluster/data/mm7/bed/multiz17way/cons

# Create a starting-tree.mod based on chr2 (the largest one)
# Step 1: cut the chr2 alignment into SS-format windows named s1.*
/cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr2.maf \
        --refseq ../../../2/chr2.fa --in-format MAF \
        --windows 100000000,1000 --out-format SS \
        --between-blocks 5000 --out-root s1
#       10 minutes
# Step 2: fit the model on those windows using the 17-species topology
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
        --tree "((((((((((hg17,panTro1),rheMac1),((rn3,mm7),oryCun1)),(bosTau2,canFam2)),dasNov1),(loxAfr1,echTel1)),monDom2),galGal2),xenTro1),((tetNig1,fr1),danRer3))" \
        --out-root starting-tree
#       real    840m53.157s
#       That is 14 hours !
# The split windows are no longer needed once starting-tree.mod exists.
rm s1.*.ss
# add up the C and G:
# (fields 3 and 4 of the BACKGROUND line of the model file)
awk '/BACKGROUND/ {printf "%0.3f\n", $3 + $4;}' starting-tree.mod
#       0.407
#       This 0.407 is used in the --gc argument below

# Create big bad bloated SS files on san filesystem (takes ~ 2h 20m)
ssh kkstore02
mkdir -p /san/sanvol1/scratch/mm7/cons/ss
cd /san/sanvol1/scratch/mm7/cons/ss
for C in `awk '{print $1}' /cluster/data/mm7/chrom.sizes`
do
    # only chromosomes with a non-empty multiz MAF are split
    if [ -s /cluster/data/mm7/bed/multiz17way/maf/${C}.maf ]; then
        mkdir ${C}
        echo msa_split $C
        # directory name = chrom name without "chr" and without "_random"
        chrN=${C/chr/}
        chrN=${chrN/_random/}
        /cluster/bin/phast/$MACHTYPE/msa_split \
            /cluster/data/mm7/bed/multiz17way/maf/${C}.maf \
            --refseq /cluster/data/mm7/${chrN}/${C}.fa \
            --in-format MAF --windows 1000000,0 --between-blocks 5000 \
            --out-format SS --out-root ${C}/${C}
    fi
done
#       real    143m42.404s

# Create a random list of 50 1 mb regions (do not use the _randoms)
# ($5 of "ls -l" is the file size, $9 the path; keep files over 4000000 bytes)
cd /san/sanvol1/scratch/mm7/cons/ss
ls -1l chr*/chr*.ss | grep -v random | \
    awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list

# Set up parasol directory to calculate trees on these 50 regions
ssh pk
mkdir /san/sanvol1/scratch/mm7/cons/treeRun1
cd /san/sanvol1/scratch/mm7/cons/treeRun1
mkdir tree log

# Tuning this loop should come back to here to recalculate
# Create little script that calls phastCons with right arguments
#       --target-coverage of 0.20 is about right for mouse, will be
#       tuned exactly below
#       (the script as recorded here passes --target-coverage 0.17)
cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
/cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
    /cluster/data/mm7/bed/multiz17way/cons/starting-tree.mod \
    --gc 0.407 --nrates 1,1 --no-post-probs --ignore-missing \
    --expected-lengths 12 --target-coverage 0.17 \
    --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
# emacs happy
chmod a+x makeTree.csh

# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
# happy emacs

# Make cluster job and run it
gensub2 ../randomSs.list single template jobList
para create jobList
para try/push/check/etc
# XXXX - working 2005-11-28 10:45
# Completed: 50 of 50 jobs
# CPU time in finished jobs:     354644s    5910.74m    98.51h    4.10d  0.011 y
# IO & Wait Time:                   352s       5.86m     0.10h    0.00d  0.000 y
# Average job time:                7100s     118.33m     1.97h    0.08d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           29358s     489.30m     8.15h    0.34d
# Submission to last job:         29446s     490.77m     8.18h    0.34d

# Now combine parameter estimates.  We can average the .mod files
# using phyloBoot.  This must be done separately for the conserved
# and nonconserved models
ssh kkstore02
cd /san/sanvol1/scratch/mm7/cons/treeRun1
ls -1 tree/chr*/*.cons.mod > cons.list
# conserved-model average runs in the background
time /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
    --output-average ../ave.cons.mod > cons_summary.txt 2>&1 &
ls -1 tree/chr*/*.noncons.mod > noncons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
    --output-average ../ave.noncons.mod > noncons_summary.txt
cd ..
cp -p ave.*.mod /cluster/data/mm7/bed/multiz17way/cons

# measuring entropy
#       consEntropy
#           ave.cons.mod ave.noncons.mod --NH 9.78
#       never stops with the --NH argument
/cluster/bin/phast/$MACHTYPE/consEntropy .17 12 \
    ave.cons.mod ave.noncons.mod
# XXXX - does not work: 2005-11-28
# (session transcript kept as a comment so it is never executed)
# [hiram@kkstore02 /san/sanvol1/scratch/mm7/cons] /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 ave.cons.mod ave.noncons.mod
# ERROR: with no separate source alignment, ss_from_msas expects sequences of positive length and no SS object.
#Transition parameters:gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.454874 bits/site
# Required length: N=7.596943 sites
# Total entropy: NH=11.052595 bits

# consEntropy .20 12 ave.cons.mod.1 ave.noncons.mod.1
# Transition params: gamma=0.200000, omega=12.000000, mu=0.083333, nu=0.020833
# Relative entropy: H=1.454874 bits/site
# Required length: N=6.629337 sites
# Total entropy: NH=9.644850 bits

# consEntropy .10 12 ave.cons.mod.2 ave.noncons.mod.2
# Transition params: gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.527815 bits/site
# Required length: N=7.205526 sites
# Total entropy: NH=11.008713 bits

# consEntropy .20 8 ave.cons.mod.3 ave.noncons.mod.3
# Transition params: gamma=0.200000, omega=8.000000, mu=0.125000, nu=0.031250
# Relative entropy: H=1.654878 bits/site
# Required length: N=5.146793 sites
# Total entropy: NH=8.517313 bits

### !!! *** This one with .17 and 12 is the one that was finally used
# consEntropy .17 12 ave.cons.mod.4 ave.noncons.mod.4
# Transition params: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=1.478838 bits/site
# Required length: N=6.753382 sites
# Total entropy: NH=9.987159 bits

ssh pk
# Create cluster dir to do main phastCons run
mkdir /san/sanvol1/scratch/mm7/cons/consRun1
cd /san/sanvol1/scratch/mm7/cons/consRun1
mkdir ppRaw bed

# Create script to run phastCons with right parameters
#       This job is I/O intensive in its output files, thus it is all
#       working over in /scratch/tmp/
# doPhast.csh <chrom> <ssFileRoot>: copies the inputs to /scratch/tmp/<root>,
# runs phastCons there, moves the .pp and .bed results into ppRaw/<chrom>
# and bed/<chrom>, then removes the scratch copies.
cat > doPhast.csh << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../elliotsEncode.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss elliotsEncode.mod \
    --rho 0.3 --expected-length 11 --target-coverage 0.16 --quiet \
    --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/elliotsEncode.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
# emacs happy
chmod a+x doPhast.csh

#       root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast.csh $(root1) $(file1)
#ENDLOOP
'_EOF_'
# happy emacs

# Create parasol batch and run it
# (dot escaped so that only a literal ".ss" suffix is stripped)
ls -1 ../ss/chr*/chr*.ss | sed 's/\.ss$//' > in.list

gensub2 in.list single template jobList
para create jobList
para try/check/push/etc.
# These jobs are very fast and very I/O intensive, even on the san
#       they will hang it up as they work at full tilt.
# Completed: 2867 of 2867 jobs
# CPU time in finished jobs:      15359s     255.98m     4.27h    0.18d  0.000 y
# IO & Wait Time:                 34015s     566.92m     9.45h    0.39d  0.001 y
# Average job time:                  17s       0.29m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             259s       4.32m     0.07h    0.00d
# Submission to last job:          3236s      53.93m     0.90h    0.04d

# combine predictions and transform scores to be in 0-1000 interval
#       it uses a lot of memory, so on kolossus:
ssh kolossus
cd /san/sanvol1/scratch/mm7/cons/consRun1
#       The sed's and the sort get the file names in chrom,start order
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
    | sort -k7,7 -k9,9n \
    | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
    | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
#       ~ 1 minute
cp -p mostConserved.bed /cluster/data/mm7/bed/multiz17way

# Figure out how much is actually covered by the bed files as so:
#       The 2583393846 comes from the non-n genome size,
#       from faSize on all chroms:
ssh kkstore02
cd /cluster/data/mm7
faSize ?/chr*.fa ??/chr*.fa
#       2847717329 bases (264323483 N's 2583393846 real 1489414119
#       upper 1093979727

cd /san/sanvol1/scratch/mm7/cons/consRun1
awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/2583393846\n",100.0*sum/2583393846,sum}' \
    mostConserved.bed
# Recorded results (kept as comments so they are never executed):
#   -target-coverage 0.16: % 7.00 = 100.0*187599245/2583393846 length 11
#   -target-coverage 0.17: % 7.26 = 100.0*187599245/2583393846 length 12
#   -target-coverage 0.13: % 6.86 = 100.0*177139744/2583393846 length 12
#   -target-coverage 0.08: % 6.39 = 100.0*165132383/2583393846 length 12
#   -target-coverage 0.03: % 5.88 = 100.0*151876064/2583393846 length 12
#   -target-coverage 0.013: % 5.62 = 100.0*145240686/2583393846 length 12
#   -target-coverage 0.08: % 6.10 = 100.0*157463396/2583393846 length 10
#   -target-coverage 0.08: % 5.50 = 100.0*142084011/2583393846 length 7
#   -target-coverage 0.03: % 5.02 = 100.0*129774852/2583393846 length 7
#   -target-coverage 0.03: % 4.74 = 100.0*122386185/2583393846 length 6
# NOTE(review): the 0.16 row shows the same base count as the 0.17 row;
#       187599245/2583393846 is 7.26%, not 7.00%, so the count on the 0.16
#       row looks mis-copied -- confirm against the original run.

#       We used to aim for %4 in the above measurement, but as you can
#       see it won't go down that low.  Instead, aim for %70 coverage in
#       the following featureBits measurement on CDS:
# Beware of negative scores when too high.  The lodToBedScore
# will output an error on any negative scores.

HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
    -enrichment refGene:cds mostConserved.bed
#       ~ 1.5 minutes
#       -target-coverage 0.16 and expected lengths 11:
#       refGene:cds 1.022%, mostConserved.bed 6.999%, both 0.721%, cover 70.50%,
#       enrich 10.07x
#       -target-coverage 0.17 and expected lengths 12:
#       refGene:cds 1.022%, mostConserved.bed 7.262%, both 0.724%, cover 70.79%,
#       enrich 9.75x
#       -target-coverage 0.13 and expected lengths 12:
#       refGene:cds 1.022%, mostConserved.bed 6.857%, both 0.723%, cover 70.75%,
#       enrich 10.32x

# Load most conserved track into database
ssh hgwdev
cd /cluster/data/mm7/bed/multiz17way
hgLoadBed -strict mm7 phastConsElements mostConserved.bed
#       Loaded 4767525 elements of size 5
#       5 minute load time
#       should measure the same as above
featureBits mm7 -enrichment refGene:cds phastConsElements
#       -target-coverage 0.16 and expected lengths 11:
#       refGene:cds 1.022%, phastConsElements 6.999%, both 0.721%, cover 70.50%,
#       enrich 10.07x
#       -target-coverage 0.17 and expected lengths 12:
#       refGene:cds 1.022%, phastConsElements 7.262%, both 0.724%, cover 70.78%,
#       enrich 9.75x

# Create merged posterior probability file and wiggle track data files
#       pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/mm7/cons/consRun1
#       the sed business gets the names sorted by chromName, chromStart
#       so that everything goes in numerical order into wigEncode
find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
    | sort -k7,7 -k9,9n \
    | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
    | wigEncode stdin phastCons17.wig phastCons17.wib
#       about 23 minutes for above
time rsync -a --progress phastCons17.wi? \
    kkstore02:/cluster/data/mm7/bed/multiz17way/
#       3m 05s copy from pk to on kkstore02

# prepare compressed copy of ascii data values for downloads
ssh pk
cd /san/sanvol1/scratch/mm7/cons/consRun1
# gzipAscii.sh writes one phastCons17Scores/<chrom>.data.gz per ppRaw/<chrom>
# directory, concatenating that directory's files in sorted order.
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p phastCons17Scores

for D in ppRaw/chr*
do
    # POSIX prefix strip; the former ${D/ppRaw\/} form is bash-only and
    # this script declares /bin/sh
    C=${D#ppRaw/}
    out=phastCons17Scores/${C}.data.gz
    echo "========================== ${C} ${D}"
    find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
            gzip > ${out}
done
'_EOF_'
# happy emacs
chmod +x gzipAscii.sh
time ./gzipAscii.sh
#       takes about 40 minutes, makes 2.9 Gb of data

# copy them for downloads
ssh hgwdev
mkdir /cluster/data/mm7/bed/multiz17way/phastCons17Scores
cd /cluster/data/mm7/bed/multiz17way/phastCons17Scores
cp -p /san/sanvol1/scratch/mm7/cons/consRun1/phastCons17Scores/* .
#       ~12 minute copy
mkdir /usr/local/apache/htdocs/goldenPath/mm7/phastCons17Scores
cd /usr/local/apache/htdocs/goldenPath/mm7/phastCons17Scores
ln -s /cluster/data/mm7/bed/multiz17way/phastCons17Scores/*.gz .

# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/mm7/bed/multiz17way
# The .wib file is reached through a /gbdb symlink; the .wig file is loaded
# as table phastCons17.
ln -s `pwd`/phastCons17.wib /gbdb/mm7/wib/phastCons17.wib
hgLoadWiggle mm7 phastCons17 phastCons17.wig
#       ~ 3 minute load

# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/mm7/bed/multiz17way/cons
time hgWiggle -doHistogram \
    -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
    -db=mm7 phastCons17 > histogram.data 2>&1
#       about 27 minutes to scan all data

# create plot of histogram:
# (columns 2:5 are plotted as impulses, columns 2:7 as a line on the y2 axis)
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm7 Histogram phastCons17 track"
set xlabel " phastCons17 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# happy emacs

display histo.png &

############################################################################
# BUILD KNOWN GENES TABLES (Started 3/19/05, done 4/13/05. Fan)

# First build protein databases, sp050315 and proteins050315
# See makeProteins050315.doc for details.
# Create working subdirectories and temporary databases
# The build directory is /cluster/store10/kg/kgMm6A; the two symlinks give
# it the other path names used later in this section.
ssh hgwdev
cd /cluster/store10/kg
mkdir kgMm6A
ln -s /cluster/store10/kg/kgMm6A /cluster/store6/kgDB/bed/kgMm6A
ln -s /cluster/store10/kg/kgMm6A /cluster/data/mm7/bed/kgMm6A

hgsql mm7 -e "create database kgMm6ATemp"

mkdir /cluster/bluearc/kgDB/kgMm6A
mkdir /cluster/bluearc/kgDB/kgMm6A/protBlat
ln -s /cluster/bluearc/kgDB/kgMm6A/protBlat /cluster/store10/kg/kgMm6A/protBlat
cd /cluster/store10/kg/kgMm6A/protBlat

# Get all mouse protein sequences
# (accession and sequence for division 10090, written out as FASTA)
hgsql -N sp050315 -e \
'select proteins050315.spXref3.accession,protein.val from proteins050315.spXref3,protein where division="10090" and acc=accession' \
|awk '{print ">" $1;print $2}' >mouseProt.fa

# Prepare and perform cluster run for protein/genome alignment
ssh kk
cd /cluster/data/mm7/bed/kgMm6A/protBlat
mkdir prot
faSplit sequence mouseProt.fa 1000 prot/prot
ls /cluster/bluearc/kgDB/kgMm6A/protBlat/prot/* > prot.lis

hgsql mm7 -N -e 'select chrom from chromInfo' > chrom.lis

# One blat job per (chromosome, protein chunk) pair
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -noHead -t=dnax -q=prot /panasas/store/mm7/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm6A/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

mkdir result
gensub2 chrom.lis prot.lis gsub jobList

para create jobList
para check
para push
para check ...
# This cluster run takes about two days. Crashed jobs are due to empty BLAT result.  It is OK.
# Parasol report for the protein/genome BLAT run (kept as comments so it is
# never executed):
# Completed: 31081 of 39600 jobs
# Crashed: 8519 jobs
# CPU time in finished jobs:   28671747s  477862.45m  7964.37h  331.85d  0.909 y
# IO & Wait Time:               1469964s   24499.40m   408.32h   17.01d  0.047 y
# Average job time:                 970s      16.16m     0.27h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           39632s     660.53m    11.01h    0.46d
# Submission to last job:        124276s    2071.27m    34.52h    1.44d

# collect BLAT results
ssh hgwdev
cd /cluster/data/mm7/bed/kgMm6A/protBlat

mkdir result2
mkdir result3

# doall holds one "do1 <chrom>" line per chromosome; do1 is swapped between
# the two step scripts below, so doall is run once per step.
sed -e 's/chr/do1 chr/g' chrom.lis >doall

# step 1: concatenate each chromosome's per-chunk psl files into result2/
cat << '_EOF_' > do1.1
echo processing $1
cat result/$1_prot*.psl >result2/$1.psl
'_EOF_'

# step 2: filter result2/ into result3/ with pslReps.
# This script used to be written to do1.1 again (overwriting step 1 and
# leaving do1.2 missing) and sent its output to result4/, which is never
# created; result3/ is the directory made above and read below.
cat << '_EOF_' > do1.2
echo processing $1
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out
'_EOF_'

chmod +x do*

cp do1.1 do1
doall
cp do1.2 do1
doall

cat result3/*.psl >protBlat.psl
hgLoadPsl mm7 protBlat.psl

# Remember to remove result2 and result3 when KG is built and validated.

cd /cluster/data/mm7/bed/kgMm6A

# create all_mrna.psl and tight_mrna.psl
# (cut drops the first column of the all_mrna table dump)
hgsql mm7 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl

pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \
    all_mrna.psl tight_mrna.psl /dev/null

# Use overlapSelect to get protein and mRNA alignment overlaps
overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \
    -selectFmt=psl -inFmt=psl tight_mrna.psl protBlat/protBlat.psl protMrna.stat

overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
    -inFmt=psl tight_mrna.psl protBlat/protBlat.psl protMrna.out

# Create protein/mRNA pair and protein lists
cut -f 10,31 protMrna.out|sort -u >spMrna.tab
cut -f 10 protMrna.out|sort -u >protein.lis

# Load spMrna.tab into spMrna table in temp DB.
hgsql kgMm6ATemp < ~/src/hg/lib/spMrna.sql
hgsql kgMm6ATemp -e 'load data local infile "spMrna.tab" into table spMrna'
hgsql kgMm6ATemp -e 'create index mrnaID on spMrna(mrnaID)'

# Prepare and perform cluster run of protein/mRNA alignment

# Get mRNA fa file.
# Extract the native mm7 GenBank mRNA sequences into mrna.fa
/cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm7 \
    -gbRoot=/cluster/data/genbank genbank mrna mrna.fa

# Create mrnaSeq table in kgMm6ATemp DB.
hgFaToTab mrna.fa mrnaSeq.tab

hgsql kgMm6ATemp <~/src/hg/lib/mrnaSeq.sql
# Single quotes outside: with double quotes on both levels the shell removed
# the inner pair and the SQL statement received an unquoted file name.
hgsql kgMm6ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'

# Prepare files for cluster run
~/src/hg/protein/KG2.sh kgMm6A mm7 050315

# Perform cluster run of protein/mRNA alignment
~/src/hg/protein/KG3.sh kgMm6A mm7 050315

# Collect cluster run results
cd kgBestMrna

ls out | sed -e 's/prot/do1 prot/g' >doall

# create do1 with the following 2 lines:
cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'

chmod +x do*
doall

# Filter out low quality alignments
pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
wc protMrna.lis

# Load BLAT results into temp DB.
hgsql kgMm6ATemp < ~/src/hg/lib/protMrnaBlat.sql
hgsql kgMm6ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
hgsql kgMm6ATemp -e 'create index tName on protMrnaBlat(tName)'

# Create CDS files from protein/mRNA alignment results.
# (the sed calls turn each row into "<qName>_<tName><TAB><start>..<end>")
hgsql kgMm6ATemp -N -e \
'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
|sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds

# Create protMrna.psl with proteinID_mrnaID as query ID.
cut -f 22-30 ../protMrna.out > j1.tmp
cut -f 32-42 ../protMrna.out > j2.tmp
cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp
paste j1.tmp j3.tmp j2.tmp >protMrna.psl
rm j1.tmp j2.tmp j3.tmp

# Run mrnaToGene to create protMrna.gp
# (a bash subshell is started for this one command and left with "exit")
bash
mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
exit

# Prepare refGene and all_mrna gp files.
cd ..
# Dump refGene as ref.gp, and build all_mrna.gp from the all_mrna alignments
# plus the (accession, CDS) pairs listed in all_mrna.cds.
hgsql mm7 -N -e 'select * from refGene' >ref.gp

hgsql mm7 -N -e \
'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \
|sort -u > all_mrna.cds

# mrnaToGene is run from a bash subshell, which is then left with "exit"
bash
mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
exit

# Align proteins to RefSeq.
overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
    protBlat/protBlat.psl ref.gp ref.stat
overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
    protBlat/protBlat.psl ref.gp protRef.gp

overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.90 -inFmt=psl\
    -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out

# Protein/RefSeq pairs and the protein list come from protRef.out
cut -f 10,22 protRef.out | sort -u >spRef.tab
cut -f 10 protRef.out | sort -u >protRef.lis

hgsql kgMm6ATemp <~/src/hg/lib/spRef.sql
hgsql kgMm6ATemp -e 'load data local infile "spRef.tab" into table spRef'

# Prepare and perform cluster runs for protein/RefSeq alignments
~/src/hg/protein/KGRef2.sh kgMm6A mm7 050315
~/src/hg/protein/KGRef3.sh kgMm6A mm7 050315

# Gather the run output with the same doall/do1 pattern used for kgBestMrna
cd kgBestRef

ls out | sed -e 's/prot/do1 prot/g' >doall

cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'

chmod +x do*
doall

# Filter out low quality alignments.
pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
wc protRef.lis

# Load the filtered protein/RefSeq alignments into the temp DB and index them
hgsql kgMm6ATemp < ~/src/hg/lib/protRefBlat.sql
hgsql kgMm6ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
hgsql kgMm6ATemp -e 'create index tName on protRefBlat(tName)'

# Run gene-check to filter out invalid gp entries
# NOTE(review): the three .gp inputs are named without a directory although
#       the last cd above went into kgBestRef -- confirm the working directory.
cat ref.gp protMrna.gp all_mrna.gp >kgCandidate0.gp
gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/store10/mm7/nib kgCandidate0.gp kgCandidate0.check

hgsql kgMm6ATemp < ~/src/hg/lib/kgCandidate0.sql
hgsql kgMm6ATemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0'

hgsql kgMm6ATemp < ~/src/hg/lib/geneCheck.sql
hgsql kgMm6ATemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'

# Run kgCheck to filter out invalid gene candidates
kgCheck kgMm6ATemp mm7 kgCandidate.tab

hgsql kgMm6ATemp -e 'drop table kgCandidate'
hgsql kgMm6ATemp < ~/src/hg/lib/kgCandidate.sql
hgsql kgMm6ATemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate'

# Update and clean up kgResultBestMrna2.c and then check it in.

# Score protein/mRna and protein/RefSeq alignments
# NOTE(review): the first command passes 050201 while every other step in
#       this section uses 050315 -- confirm this is intended.
kgResultBestMrna2 050201 kgMm6ATemp mm7|sort -u >protMrnaBlatScore.tab
kgResultBestRef2 050315 kgMm6ATemp mm7|sort -u >protRefScore.tab

# Combine scoring results and load them into temp DB.
cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
hgsql kgMm6ATemp < ~/src/hg/lib/protMrnaScore.sql
hgsql kgMm6ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
hgsql kgMm6ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'

# Run kgSelect to select highest scoring mRNA or RefSeq for each protein.
# kgSelect writes kgCandidate2.gp, which is then loaded into the temp DB.
kgSelect kgMm6ATemp kgCandidate2.gp

hgsql kgMm6ATemp -e 'drop table kgCandidate2'
hgsql kgMm6ATemp < ~/src/hg/lib/kgCandidate2.sql
hgsql kgMm6ATemp -e 'load data local infile "kgCandidate2.gp" into table kgCandidate2'

# Create sorted file to get entries with identical CDS regions group together.
hgsql kgMm6ATemp -N -e \
'select name,chrom,cdsStart,cdsEnd,score,proteinID from kgCandidate2,protMrnaScore where proteinID=protAcc and name=mrnaAcc order by name,cdsStart,cdsEnd,score desc,proteinID' \
>kgSorted.tab

# Run kgUniq to pick the top mRNA/RefSeq with highest score for each CDS structure.
kgUniq kgMm6ATemp sp050315 kgSorted.tab knownGene.gp dupSpMrna.tab

hgsql mm7 -e 'drop table dupSpMrna'
hgsql mm7 <~/src/hg/lib/dupSpMrna.sql
hgsql mm7 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna'

# Build mrnaRefseq table first before loading knownGene table
cd /cluster/store10/entrez
mkdir 050401
ln -s /cluster/store10/entrez/050401 /cluster/data/entrez/050401
cd /cluster/data/entrez/050401

# "--timestamping" must be one word: the former "-- timestamp" ended option
# parsing and made wget treat "timestamp" as a URL.
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
gzip -d *.gz

# The sed calls split "accession.version" into two tab-separated fields.
cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab

hgsql entrez -e 'drop table entrezRefseq'
hgsql entrez -e 'drop table entrezMrna'
hgsql entrez -e 'drop table entrezRefProt'

hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
hgsql entrez < ~/src/hg/lib/entrezMrna.sql
hgsql entrez < ~/src/hg/lib/entrezRefProt.sql

hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'

hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \
>mrnaRefseq.tab

hgsql mm7 < ~/src/hg/lib/mrnaRefseq.sql
hgsql mm7 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# Sort knownGene table
# Go back to the KG build directory first: the sort used to run while still
# in the entrez directory, but sortedKnownGene.gp is loaded from here.
# (assumes kgUniq wrote knownGene.gp in this directory -- TODO confirm)
cd /cluster/data/kgDB/bed/kgMm6A
~/kent/src/hg/protein/sortKg.pl knownGene.gp > sortedKnownGene.gp

# Load knownGene table
hgsql mm7 -e 'drop table knownGene'
hgsql mm7 <~/src/hg/lib/knownGene.sql
hgsql mm7 -e 'load data local infile "sortedKnownGene.gp" into table knownGene'

# Build kgXref table
kgXref2 mm7 proteins050315 mm7

hgsql mm7 -e 'drop table kgXref'
hgsql mm7 <~/src/hg/lib/kgXref.sql
hgsql mm7 -e 'load data local infile "kgXref.tab" into table kgXref'

# Build spMrna table
hgsql mm7 -N -e 'select proteinID, name from knownGene' >kgSpMrna.tab

hgsql mm7 -e 'drop table spMrna'
hgsql mm7 <~/src/hg/lib/spMrna.sql
hgsql mm7 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build knownGenePep table
hgsql mm7 -N -e \
'select name, protein.val from knownGene, sp050315.displayId, sp050315.protein where proteinID=displayId.val and displayId.acc=protein.acc' \
>knownGenePep.tab

hgsql mm7 -e 'drop table knownGenePep'
hgsql mm7 <~/src/hg/lib/knownGenePep.sql
hgsql mm7 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'

# Build knownGeneMrna table
/cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm7 \
    -gbRoot=/cluster/data/genbank refseq mrna stdout \
    | faToTab stdin refseqSeq.tab

hgsql kgMm6ATemp -e "drop table refseqSeq"
hgsql kgMm6ATemp <~/src/hg/lib/refseqSeq.sql
hgsql kgMm6ATemp -e 'load data local infile "refseqSeq.tab" into table refseqSeq'

# Sequences for knownGene entries come from refseqSeq and from mrnaSeq
hgsql kgMm6ATemp -N -e \
'select knownGene.name, seq from refseqSeq, mm7.knownGene where knownGene.name=refseqSeq.name'\
>j1.tmp

hgsql kgMm6ATemp -N -e \
'select knownGene.name, seq from mrnaSeq, mm7.knownGene where knownGene.name=mrnaSeq.name' \
>j2.tmp

cat j1.tmp j2.tmp >knownGeneMrna.tab
rm j1.tmp j2.tmp

hgsql mm7 -e "drop table mm7.knownGeneMrna"
hgsql mm7 <~/src/hg/lib/knownGeneMrna.sql
hgsql mm7 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'

# update mrnaRefseq table
hgsql mm7 -N -e 'select * from mrnaRefseq' >j.1
# Pattern is "NM" + literal underscore + anything.  The former "NM_" had no
# % wildcard and an unescaped _ (any single character), so it could only
# match three-character names.
hgsql mm7 -N -e 'select name, name from knownGene where name like "NM\_%"' >j.2
cat j.1 j.2 |sort -u >mrnaRefseq2.tab

hgsql mm7 -e 'delete from mrnaRefseq'
hgsql mm7 -e 'load data local infile "mrnaRefseq2.tab" into table mrnaRefseq'

# Create empty knownGeneLink table to make some old code happy.
hgsql mm7 <~/src/hg/lib/knownGeneLink.sql

# Build KEGG pathway tables
~/src/hg/protein/KGpath.sh kgMm6A mm7 050315

hgsql kgMm6ATemp -e "drop table keggList"
hgsql kgMm6ATemp <~/src/hg/lib/keggList.sql
hgsql kgMm6ATemp -e 'load data local infile "keggList.tab" into table keggList'

hgsql mm7 -e "drop table keggMapDesc"
hgsql mm7 -e "drop table keggPathway"
hgsql mm7 <~/src/hg/lib/keggMapDesc.sql
hgsql mm7 <~/src/hg/lib/keggPathway.sql
hgsql mm7 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
hgsql mm7 -e 'load data local infile "keggPathway.tab" into table keggPathway'

# Build CGAP pathway tables
~/src/hg/protein/KGcgap.sh kgMm6A mm7 050315

hgsql sp050315 -N -e \
'select name, gene.val from mm7.knownGene, gene, displayId where proteinID=displayId.val and gene.acc=displayId.acc' \
| sort -u >kgAliasP.tab

# Build alias tables
#       kgAliasM reads from proteins050315.hugo.symbol, proteins050315.hugo.aliases
#       proteins050315.hugo.withdraws, mm7.kgXref.kgID
#       to create kgAliasM.tab and geneAlias.tab
#       by picking out those kgID items from kgXref where
#       kgXref.geneSymbol == hugo.symbol
kgAliasM mm7 proteins050315

#       kgAliasKgXref reads from mm7.knownGene.proteinID,
#       mm7.knownGene.name, mm7.kgXref.geneSymbol
#       to create kgAliasKgXref.tab
kgAliasKgXref mm7

#       kgAliasRefseq reads from mm7.knownGene.name,
#       mm7.knownGene.proteinID, mm7.kgXref.refseq
#       to create kgAliasRefseq.tab
kgAliasRefseq mm7
hgsql sp050315 -N -e \ 'select name, gene.val from mm7.knownGene, gene, displayId where proteinID=displayId.val and gene.acc=displayId.acc' \ | sort -u >kgAliasP.tab cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab | \ sort |uniq > kgAlias.tab hgsql -e "drop table kgAlias;" mm7 hgsql mm7 < ~/kent/src/hg/lib/kgAlias.sql hgsql mm7 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' # kgProtAlias reads from mm7.knownGene.name, # mm7.knownGene.proteinID, mm7.knownGene.alignID, # proteins050315.spXref3.accession, proteins050315.spSecondaryID, proteins050315.pdbSP.pdb # to create kgProtAlias.tab # kgProtAlias mm7 050315 hgsql mm7 -N -e \ 'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\ | sort -u >kgProtAliasNCBI.tab cat kgProtAliasNCBI.tab kgProtAlias.tab | sort | uniq > kgProtAliasBoth.tab rm kgProtAliasNCBI.tab kgProtAlias.tab echo "`date` creating table kgProtAlias" hgsql mm7 -e "drop table kgProtAlias;" hgsql mm7 <~/src/hg/lib/kgProtAlias.sql; hgsql mm7 -e 'LOAD DATA local INFILE "kgProtAliasBoth.tab" into table kgProtAlias;' # MAKING FOLDUTR TABLES (TBD) # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev mkdir -p /cluster/data/mm7/bed/rnaStruct cd /cluster/data/mm7/bed/rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa mm7 knownGene utr3 utr3/utr.fa utrFa mm7 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. 
ssh kk cd /cluster/data/mm7/bed/rnaStruct faSplit sequence utr3/utr.fa 50000 utr3/split/s faSplit sequence utr5/utr.fa 50000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < genome.lst echo /panasas/store/mm7/nib/*.nib | wordLine stdin > genome.lst ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.lst affy.lst gsub jobList para create jobList para try # do usual para check/para push etc. until the job is done. # Completed: 120 of 120 jobs # CPU time in finished jobs: 7197s 119.94m 2.00h 0.08d 0.000 y # IO & Wait Time: 1047s 17.46m 0.29h 0.01d 0.000 y # Average job time: 69s 1.15m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 208s 3.47m 0.06h 0.00d # Submission to last job: 751s 12.52m 0.21h 0.01d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU74.psl. ssh kk cd /cluster/data/mm7/bed/affyU74.2005-04-14/run pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least minAli = 0.95. # minAli = 0.97 too high. low minCover as a lot of n's in these sequences #pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl ../all_affyU74.psl /dev/null # Sort by chromosome and load into database. 
ssh hgwdev cd /cluster/data/mm7/bed/affyU74.2005-04-14 pslSortAcc nohead chrom temp all_affyU74.psl cat chrom/*.psl > affyU74.psl # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table hgLoadPsl mm7 affyU74.psl # rm -fr chrom temp run ## MAKE THE affyGnfU74 TRACKs (TBD) # Make bed files and load consensus sequences for Affy U74 chip set. # Fix broken symlinks to microarray data after directory structure changed # (TBD) ---------------------------------- #This needs to be done after affyU74 is already made. ssh hgwdev mkdir -p /cluster/data/mm7/bed/affyGnf.2005-04-14 cd /cluster/data/mm7/bed/affyGnf.2005-04-14 # may need to build this command in src/hg/affyGnf affyPslAndAtlasToBed ../affyU74.2005-04-14/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \ affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2 affyPslAndAtlasToBed ../affyU74.2005-04-14/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \ affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2 affyPslAndAtlasToBed ../affyU74.2005-04-14/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \ affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2 # edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" mkdir sav cp *.bed sav -p cat sav/affyGnfU74A.bed|sed -e "s/U74Av2://" >affyGnfU74A.bed cat sav/affyGnfU74B.bed|sed -e "s/U74Bv2://" >affyGnfU74B.bed cat sav/affyGnfU74C.bed|sed -e "s/U74Cv2://" >affyGnfU74C.bed # and reload data into table hgLoadBed -strict mm7 affyGnfU74A affyGnfU74A.bed hgLoadBed -strict mm7 affyGnfU74B affyGnfU74B.bed hgLoadBed -strict mm7 affyGnfU74C affyGnfU74C.bed # Add in sequence data for U74 tracks. # Copy consensus sequence to /gbdb if it isn't already # [THE SYM LINKS WERE ALREADY DONE.] 
# mkdir -p /gbdb/hgFixed/affyProbes cd /gbdb/hgFixed/affyProbes # fix broken symlinks after directory structure changed # /projects/compbiodata ----> /projects/compbio/data rm U74* # make correct symlinks (hartera, 2005-05-03) ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa . # used perl -pi.bak -e 's/;/ /' to remove ";" after probe name # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4. # reload sequences with prefix removed so acc matches name used in # other dependent tables hgLoadSeq -abbr=U74Av2: mm7 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa hgLoadSeq -abbr=U74Bv2: mm7 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa hgLoadSeq -abbr=U74Cv2: mm7 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa ### GNF ATLAS 2 (TBD) # Align probes from GNF1M chip. ssh kk cd /cluster/data/mm7/bed mkdir -p geneAtlas2/run/psl cd geneAtlas2/run #mkdir -p /cluster/bluearc/geneAtlas2 #cp /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /cluster/bluearc/geneAtlas2 #ls -1 /scratch/mus/mm7/maskedContigs/ > genome.lst echo /panasas/store/mm7/nib/*.nib | wordLine stdin > genome.lst ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub gensub2 genome.lst mrna.lst gsub spec para create spec para try para check para push para time # Completed: 40 of 40 jobs # CPU time in finished jobs: 56570s 942.84m 15.71h 0.65d 0.002 y # IO & Wait Time: 392s 6.53m 0.11h 0.00d 0.000 y # Average job time: 1424s 23.73m 0.40h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3979s 66.32m 1.11h 0.05d # Submission to last job: 3993s 66.55m 1.11h 0.05d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyGnf1m.psl.
pslSort dirs raw.psl tmp psl pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1m.psl /dev/null #rm -r contig.psl raw.psl psl # Load probes and alignments from GNF1M into database. ssh hgwdev cd /cluster/data/mm7/bed/geneAtlas2 # ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes hgLoadPsl mm7 affyGnf1m.psl hgLoadSeq mm7 /gbdb/hgFixed/affyProbes/gnf1m.fa # Load up track hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \ affyGnf1m.psl # Note that the unmapped 5000 records are from all-N sequences. hgLoadBed -strict mm7 gnfAtlas2 gnfAtlas2.bed # MOUSE AFFYMETRIX MOE430 TRACK (TBD) # mkdir -p /projects/compbio/data/microarray/affyMouse # Download MOE430A and MOE430B consensus sequences from Affymetrix web site # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430 # unzip MOE430*_consensus.zip # check for duplicate probes: there are none, all have unique names # check for duplicate probes: 100 from 136745_at to 1367551_a_at # remove "consensus:" and ";" from FASTA headers to shorten probeset # names for database # sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa # sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa # cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ # /cluster/bluearc/affy/ # THE ABOVE WAS ALREADY TBD) # Set up cluster job to align MOE430 consensus sequences to mm7 ssh kkr1u00 cd /cluster/data/mm7/bed mkdir -p affyMOE430 cd affyMOE430 # mkdir -p /iscratch/i/affy # cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy # iSync ssh kk cd /cluster/data/mm7/bed/affyMOE430 ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst echo /panasas/store/mm7/nib/*.nib | wordLine stdin > genome.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 genome.lst affy.lst
template.sub para.spec mkdir psl para create para.spec # Actually do the job with usual para try/check/push/time etc. # Completed: 40 of 40 jobs # CPU time in finished jobs: 9414s 156.90m 2.61h 0.11d 0.000 y # IO & Wait Time: 281s 4.69m 0.08h 0.00d 0.000 y # Average job time: 242s 4.04m 0.07h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 597s 9.95m 0.17h 0.01d # Submission to last job: 657s 10.95m 0.18h 0.01d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyMOE430.psl pslSort dirs raw.psl tmp psl # only use alignments that cover 30% of sequence and have at least # 95% identity in aligned region. # low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl affyMOE430.psl /dev/null # Load alignments and sequences into database ssh hgwdev cd /cluster/data/mm7/bed/affyMOE430 # shorten names in psl file sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak mv affyMOE430.psl.bak affyMOE430.psl # load track into database hgLoadPsl mm7 affyMOE430.psl # Add consensus sequences for MOE430 # Copy sequences to gbdb if they are not there already # mkdir -p /gbdb/hgFixed/affyProbes # ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ # /gbdb/hgFixed/affyProbes hgLoadSeq -abbr=MOE430 mm7 /gbdb/hgFixed/affyProbes/MOE430_all.fa # Clean up # rm batch.bak contig.psl raw.psl # BELOW TWO THINGS WERE DONE BY RACHEL ALREADY FOR MM4 # add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/ # add affyMOE430.html file and then do make alpha to add to trackDb table ######## MAKING GENE SORTER TABLES ####### (TBD) # These are instructions for building the # Gene Sorter. Don't start these until # there is a knownGene track and the affy tracks # Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables ssh hgwdev cd /tmp hgClusterGenes mm7 knownGene knownIsoforms knownCanonical # You may need to build this binary in src/hg/near/hgClusterGenes # Got 24603 clusters, from 41208 genes in 43 chromosomes # featureBits mm7 knownCanonical # 686054706 bases of 2597150411 (26.416%) in intersection # featureBits mm5 knownCanonical # 853516995 bases of 2615483787 (32.633%) in intersection # featureBits mm4 knownCanonical # 840021165 bases of 2627444668 (31.971%) in intersection # featureBits mm3 knownCanonical # 825943052 bases of 2505900260 (32.960%) in intersection # ! ! ! Can not do featureBits on knownIsoforms # Extract peptides from knownGenes into fasta file # and create a blast database out of them. ssh hgwdev mkdir -p /cluster/data/mm7/bed/geneSorter/blastp cd /cluster/data/mm7/bed/geneSorter/blastp pepPredToFa mm7 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /cluster/bluearc/blast229/formatdb -i known.faa -t known -n known # Copy over database to bluearc scratch mkdir /cluster/panasas/home/store/mm7/blastp cp -p /cluster/data/mm7/bed/geneSorter/blastp/known.* \ /cluster/panasas/home/store/mm7/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm7/bed/geneSorter/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/mm7/bed/geneSorter/blastp/self cd /cluster/data/mm7/bed/geneSorter/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/panasas/home/store/mm7/blastp/known \ -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls 
../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 31525s 525.42m 8.76h 0.36d 0.001 y # IO & Wait Time: 34031s 567.18m 9.45h 0.39d 0.001 y # Average job time: 8s 0.14m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 61s 1.02m 0.02h 0.00d # Submission to last job: 142s 2.37m 0.04h 0.00d # Load into database. This takes about an hour. ssh hgwdev cd /cluster/data/mm7/bed/geneSorter/blastp/self/run/out hgLoadBlastTab mm7 knownBlastTab *.tab Scanning through 7715 files Loading database with 1972005 rows # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes an hour.) # TBD) hgMapToGene mm7 affyGnf1m knownGene knownToGnf1m hgExpDistance mm7 hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m # Create table that maps between known genes and RefSeq hgMapToGene mm7 refGene knownGene knownToRefSeq # may need to build this command in src/hg/near/hgMapToGene # Create a table that maps between known genes and # the nice affy expression data. 
hgMapToGene mm7 affyU74 knownGene knownToU74 hgMapToGene mm7 affyMOE430 knownGene knownToMOE430 hgMapToGene mm7 affyMOE430 -prefix=A: knownGene knownToMOE430A # Format and load Rinn et al sex expression data mkdir /cluster/data/mm7/bed/rinnSex cd !$ hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \ ../affyMOE430/affyMOE430.psl hgLoadBed -strict mm7 rinnSex rinnSex.bed # Format and load the GNF data mkdir /cluster/data/mm7/bed/affyGnf95 cd /cluster/data/mm7/bed/affyGnf95 affyPslAndAtlasToBed -newType ../affyU95.psl \ /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. # Everything below this seems to create tables OK. # hgsql mm7 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. These commands take about 15 minutes each # The affyGnfU74?Exps arguments appear to be unused in # hgExpDistance hgExpDistance mm7 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74 # Got 7720 unique elements in affyGnfU74A hgExpDistance mm7 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74 # Got 4619 unique elements in affyGnfU74B hgExpDistance mm7 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74 # Got 1406 unique elements in affyGnfU74C # C.ELEGANS BLASTP FOR GENE SORTER (TBD) # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to iscratch/i # if it doesn't exist already: ssh eieio mkdir /cluster/data/ce2/bed/blastp cd /cluster/data/ce2/bed/blastp # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/ # to find out the latest version. Then use that in place of 142 below.
wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142 formatdb -i wormPep142.faa -t wormPep142 -n wormPep142 ssh kkr1u00 if (-e /iscratch/i/ce2/blastp) then rm -r /iscratch/i/ce2/blastp endif mkdir -p /iscratch/i/ce2/blastp cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/ce2/run/out cd /cluster/data/mm7/bed/blastp/ce2/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 29337s 488.96m 8.15h 0.34d 0.001 y # IO & Wait Time: 24651s 410.84m 6.85h 0.29d 0.001 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 40s 0.67m 0.01h 0.00d # Submission to last job: 206s 3.43m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/ce2/run/out hgLoadBlastTab mm7 ceBlastTab -maxPer=1 *.tab # HUMAN BLASTP FOR GENE SORTER (TBD) # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/hg17/bed/blastp cd /cluster/data/hg17/bed/blastp pepPredToFa hg17 knownGenePep known.faa formatdb -i known.faa -t known -n known # PLEASE NOTE, hg17B IS USED INSTEAD OF hg17 for /iscratch/i, # TO GO AROUND A SUBDIRECTORY ACCESS RIGHT PROBLEM. ssh kkr1u00 if (-e /iscratch/i/hg17B/blastp) then rm -r /iscratch/i/hg17B/blastp endif mkdir -p /iscratch/i/hg17B/blastp cp /cluster/data/hg17/bed/blastp/known.p?? 
/iscratch/i/hg17B/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/hg17/run/out cd /cluster/data/mm7/bed/blastp/hg17/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 67090s 1118.17m 18.64h 0.78d 0.002 y # IO & Wait Time: 22543s 375.72m 6.26h 0.26d 0.001 y # Average job time: 12s 0.19m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 83s 1.38m 0.02h 0.00d # Submission to last job: 213s 3.55m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/hg17/run/out hgLoadBlastTab mm7 hgBlastTab -maxPer=1 *.tab # ZEBRAFISH BLASTP FOR GENE SORTER (TBD) # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. # First make protein database and copy it to iscratch/i # if it doesn't exist already: ssh kkstore mkdir /cluster/data/danRer1/bed/blastp cd /cluster/data/danRer1/bed/blastp wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.apr.pep.fa.gz zcat Dan*.pep.fa.gz > ensembl.faa formatdb -i ensembl.faa -t ensembl -n ensembl ssh kkr1u00 if (-e /iscratch/i/danRer1/blastp) then rm -r /iscratch/i/danRer1/blastp endif mkdir -p /iscratch/i/danRer1/blastp cp /cluster/data/danRer1/bed/blastp/ensembl.p?? /iscratch/i/danRer1/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/danRer1/run/out cd /cluster/data/mm7/bed/blastp/danRer1/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... 
# Completed: 7715 of 7715 jobs # CPU time in finished jobs: 53430s 890.51m 14.84h 0.62d 0.002 y # IO & Wait Time: 24688s 411.46m 6.86h 0.29d 0.001 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 76s 1.27m 0.02h 0.00d # Submission to last job: 202s 3.37m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/danRer1/run/out hgLoadBlastTab mm7 drBlastTab -maxPer=1 *.tab # YEAST BLASTP FOR GENE SORTER (TBD) # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on # RefSeq. First make protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/sacCer1/bed/blastp cd /cluster/data/sacCer1/bed/blastp wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz zcat orf_trans.fasta.gz > sgdPep.faa formatdb -i sgdPep.faa -t sgdPep -n sgdPep ssh kkr1u00 # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, # fortunately we won't be looking for homologs there. :) if (-e /iscratch/i/sacCer1/blastp) then rm -r /iscratch/i/sacCer1/blastp endif mkdir -p /iscratch/i/sacCer1/blastp cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/sacCer1/run/out cd /cluster/data/mm7/bed/blastp/sacCer1/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 8741s 145.68m 2.43h 0.10d 0.000 y # IO & Wait Time: 20376s 339.60m 5.66h 0.24d 0.001 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 17s 0.28m 0.00h 0.00d # Submission to last job: 199s 3.32m 0.06h 0.00d # Load into database. 
ssh hgwdev cd /cluster/data/mm7/bed/blastp/sacCer1/run/out hgLoadBlastTab mm7 scBlastTab -maxPer=1 *.tab # DM1 BLASTP FOR GENE SORTER (TBD) # Make Drosophila melanogaster ortholog column using blastp on FlyBase. # First make protein database and copy it to iscratch/i # if it doesn't exist already: # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # ssh kkr1u00 # if (-e /iscratch/i/dm1/blastp) then # rm -r /iscratch/i/dm1/blastp # endif # mkdir -p /iscratch/i/dm1/blastp # cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp # iSync # THE ABOVE IS ALREADY DONE BY ANGIE # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/dm1/run/out cd /cluster/data/mm7/bed/blastp/dm1/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 33260s 554.33m 9.24h 0.38d 0.001 y # IO & Wait Time: 24452s 407.54m 6.79h 0.28d 0.001 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 45s 0.75m 0.01h 0.00d # Submission to last job: 121s 2.02m 0.03h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/dm1/run/out hgLoadBlastTab mm7 dmBlastTab -maxPer=1 *.tab # Create table that maps between known genes and LocusLink (TBD) hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm7 \ > refToLl.txt hgMapToGene mm7 refGene knownGene knownToLocusLink -lookup=refToLl.txt # row count is 17480 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt mm7 knownGene name proteinID Pfam knownToPfam # row count is 17132 # Create table to map between known genes and GNF Atlas2 # expression data.
hgMapToGene mm7 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' # Create table that maps between known genes and visiGene database knownToVisiGene mm7 # ENABLE GENE SORTER FOR mm7 IN HGCENTRALTEST (TBD) echo "update dbDb set hgNearOk = 1 where name = 'mm7';" \ | hgsql -h genome-testdb hgcentraltest # RAT BLASTP FOR GENE SORTER (TBD) # Make RAT ortholog column using blastp on RAT known genes. # First make RAT protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/rn3/bed/blastp cd /cluster/data/rn3/bed/blastp pepPredToFa rn3 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/rn3/blastp) then rm -r /iscratch/i/rn3/blastp endif mkdir -p /iscratch/i/rn3/blastp cp /cluster/data/rn3/bed/blastp/known.p?? /iscratch/i/rn3/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/rn3/run/out cd /cluster/data/mm7/bed/blastp/rn3/run # Make blast script cat > blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7715 of 7715 jobs # CPU time in finished jobs: 12896s 214.93m 3.58h 0.15d 0.000 y # IO & Wait Time: 21725s 362.08m 6.03h 0.25d 0.001 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 22s 0.37m 0.01h 0.00d # Submission to last job: 246s 4.10m 0.07h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/rn3/run/out hgLoadBlastTab mm7 rnBlastTab -maxPer=1 *.tab # END OF GENE SORTER STUFF ############################################################################# ### MM7 PROTEOME BROWSER TABLES BUILD #### (TBD) # These are instructions for building tables # needed for the Proteome Browser to be used with mm7. # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table # ARE REBUILT.
# This build is based on proteins DBs dated 050315. # Create the working directory ssh hgwdev mkdir /cluster/data/mm7/bed/pb.2005-04-20 cd /cluster/data/mm7/bed ln -s /cluster/data/mm7/bed/pb.2005-04-20 pb cd pb # Define pep* tables in mm7 DB cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql # First edit out pepPred table definition, then hgsql mm7 < pepAll.sql # Build the pepMwAa table hgsql proteins050315 -e "select info.acc, molWeight, aaSize from sp050315.info, sp050315.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab hgsql mm7 -e 'load data local infile "pepMwAa.tab" into table mm7.pepMwAa ignore 1 lines;' # Build the pepPi table hgsql proteins050315 -e "select info.acc from sp050315.info, sp050315.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis pbCalPi protAcc.lis sp050315 pepPi.tab hgsql mm7 -e 'load data local infile "pepPi.tab" into table mm7.pepPi;' # Calculate and load pep distributions pbCalDist sp050315 proteins050315 10090 mm7 >pbCalDist.out cat pbCalDist.out wc pbCalDist.out hgsql mm7 load data local infile "pepExonCntDist.tab" into table mm7.pepExonCntDist; load data local infile "pepCCntDist.tab" into table mm7.pepCCntDist; load data local infile "pepHydroDist.tab" into table mm7.pepHydroDist; load data local infile "pepMolWtDist.tab" into table mm7.pepMolWtDist; load data local infile "pepResDist.tab" into table mm7.pepResDist; load data local infile "pepIPCntDist.tab" into table mm7.pepIPCntDist; load data local infile "pepPiDist.tab" into table mm7.pepPiDist; quit # Calculate frequency distributions pbCalResStd 050315 10090 mm7 # Create pbAnomLimit and pbResAvgStd tables hgsql mm7 < ~/src/hg/lib/pbAnomLimit.sql hgsql mm7 < ~/src/hg/lib/pbResAvgStd.sql hgsql mm7 -e 'load data local infile "pbResAvgStd.tab" into table mm7.pbResAvgStd;' hgsql mm7 -e 'load data local infile "pbAnomLimit.tab" into table mm7.pbAnomLimit;' # UPDATE kgSpAlias TABLE TO BE USED BY PB (Done 4/20/05) cd
/cluster/data/mm7/bed/pb hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >mm7.kgSpAlias.tab rm j.tmp hgsql mm7 -e 'drop table kgSpAlias'; hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias' gzip mm7.kgSpAlias.tab # Create pbStamp table for PB hgsql mm7 < ~/src/hg/lib/pbStamp.sql hgsql mm5 -e 'select * from pbStamp' > pbStamp.tab hgsql mm7 -e 'delete from pbStamp' hgsql mm7 -e 'load data local infile "pbStamp.tab" into table mm7.pbStamp ignore 1 lines;' # ENABLE PROTEOME BROWSER FOR mm7 IN HGCENTRALTEST echo "update dbDb set hgPbOk = 1 where name = 'mm7';" \ | hgsql -h genome-testdb hgcentraltest # Connect to genome-testdb and use hgcentraltest DB. # Update the entry in gdbPdb table from mySql prompt: delete from gdbPdb where genomeDb='mm7'; insert into gdbPdb values('mm7', 'proteins050415'); # Adjust drawing parameters for Proteome Browser stamps Now invoke Proteome Browser and adjust various drawing parameters (mostly the ymax of each stamp) if necessary, by updating the pbStamp.tab file and then delete and reload the pbStamp table. # Perform preliminary review of Proteome Browser for mm7, then notify QA for formal review. ##################################################################### # MAP CONTIGS TRACK (DONE - 2005-10-04 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm7/bed/ctgPos cd /cluster/data/mm7/bed/ctgPos # hgCtgPos uses the lift files... but mouse lift files are for the # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs # from the assembly. (In the future, we should go with the NT's!) # So... 
# just for this release, go straight from the seq_contig.md
# to the table def'n:  contig, size, chrom, chromStart, chromEnd
# This script is an improvement from before, this is now doing the
# randoms properly.
#
# seqContigToCtgPos.pl reads whitespace-separated seq_contig.md rows
# (stdin or files named on the command line) and prints tab-separated
# ctgPos rows: contig, size, chrom, chromStart, chromEnd.
#   - column 2 of the form "<chr>|<contig>": the contig is placed on
#     chr<chr>_random at a running offset.  The offset restarts at 0
#     whenever <chr> changes and, after each contig, advances by the
#     contig size plus 50000 (plus 1000 when <chr> is "Un").
#   - any other row whose column 6 looks like NT_<digits> or NC_<digits>
#     is placed on chr<column 2> with start = column 3 - 1 and
#     end = column 4.
#   - rows with fewer than six columns (blank or truncated lines) are
#     skipped instead of triggering "uninitialized value" warnings.
#
# NOTE: the here-document is closed by the bare word _EOF_.  sh/bash
# strip the quotes from the delimiter word, so a terminator line written
# as '_EOF_' (quotes included) is never recognized by them and the
# here-document would swallow the rest of the input.
cat << '_EOF_' > seqContigToCtgPos.pl
#!/usr/bin/env perl

use warnings;
use strict;

# chromosome name of the most recent "<chr>|<contig>" row
my $prevRandom = "";
# next free position on the current chr<N>_random
my $randomPosition = 0;

while (my $line = <>) {
    chomp($line);
    my @a = split('\s+', $line);
    # columns 2..6 are read below; skip blank or truncated rows
    next if (scalar(@a) < 6);
    if ($a[1] =~ m/\|/) {
        # unplaced contig: chromosome is the part before the '|'
        my @b = split('\|', $a[1]);
        if ($b[0] ne $prevRandom) {
            $randomPosition = 0;
            $prevRandom = $b[0];
        }
        my $size = $a[3] - $a[2] + 1;
        my $start = $randomPosition;
        my $end = $randomPosition + $size;
        printf "%s\t%d\tchr%s_random\t%d\t%d\n",
            $a[5], $size, $b[0], $start, $end;
        # gap inserted after each contig: 50000, or 1000 on chrUn_random
        if ($b[0] ne "Un") {
            $randomPosition += 50000;
        } else {
            $randomPosition += 1000;
        }
        $randomPosition += $size;
    } elsif ($a[5] =~ m/^N[TC]_\d+$/) {
        # placed contig: convert the 1-based start to 0-based
        my $start = $a[2] - 1;
        my $end = $a[3];
        my $size = $end - $start;
        printf "%s\t%d\tchr%s\t%d\t%d\n",
            $a[5], $size, $a[1], $start, $end;
    }
}
_EOF_
# emacs happy
chmod +x seqContigToCtgPos.pl
# /cluster/data/mm7/ncbi/seq_contig.md contains more than just C57BL/6J.
# Filter those out with the grep.
cat ../../seq_contig.md | grep C57BL | \ ./seqContigToCtgPos.pl > ctgPos.tab hgsql mm7 < ~/kent/src/hg/lib/ctgPos.sql hgsql mm7 -e 'load data local infile "ctgPos.tab" into table ctgPos;' featureBits -countGaps mm7 ctgPos # 2608810329 bases of 2847717329 (91.611%) in intersection featureBits -countGaps mm6 ctgPos # 2638893452 bases of 3079633452 (85.689%) in intersection featureBits -countGaps mm5 ctgPos # 2557081173 bases of 3164952073 (80.794%) in intersection ######################################################################### #### Blat knownGene proteins to determine exons (TBD) ssh hgwdev cd /cluster/data/mm7/bed mkdir blat.mm7KG.2005-05-02 rm blat.mm7KG ln -s blat.mm7KG.2005-05-02 blat.mm7KG cd blat.mm7KG pepPredToFa mm7 knownGenePep known.fa hgPepPred mm7 generic blastKGPep03 known.fa grep ">" known.fa | sed "s/>//" > kgName.lst ssh kk cd /cluster/data/mm7/bed/blat.mm7KG cat << '_EOF_' > blatSome #!/bin/csh -fe /cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3 '_EOF_' # << keep emacs happy chmod +x blatSome ls -1S /panasas/store/mm7/nib/*.nib > mouse.lst mkdir kgfa cd kgfa faSplit sequence ../known.fa 3000 kg cd .. ls -1S kgfa/*.fa > kg.lst cat << '_EOF_' > blatGsub #LOOP blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy gensub2 mouse.lst kg.lst blatGsub blatSpec mkdir psl cd psl foreach i (`cat ../mouse.lst`) mkdir `basename $i .nib` end cd .. 
para create blatSpec para push # Completed: 115720 of 115720 jobs # CPU time in finished jobs: 14938417s 248973.62m 4149.56h 172.90d 0.474 y # IO & Wait Time: 2116275s 35271.25m 587.85h 24.49d 0.067 y # Average job time: 147s 2.46m 0.04h 0.00d # Longest finished job: 9235s 153.92m 2.57h 0.11d # Submission to last job: 25264s 421.07m 7.02h 0.29d ssh eieio cd /cluster/data/mm7/bed/blat.mm7KG pslSort dirs raw.psl /tmp psl/* pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null pslUniq cooked.psl mm7KG.psl pslxToFa mm7KG.psl mm7KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft ssh hgwdev kgName mm7 mm7KG.psl blastKGRef03 hgsql mm7 < ~/kent/src/hg/lib/blastRef.sql echo "rename table blastRef to blastKGRef03" | hgsql mm7 echo "load data local infile 'blastKGRef03' into table blastKGRef03" | hgsql mm7 ############################################################################## # LOAD GENEID GENES (DONE - 2005-12-20 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm7/bed/geneid/download cd /cluster/data/mm7/bed/geneid/download awk '{print $1}' ../../../chrom.sizes | while read C do echo $C wget --timestamping \ http://genome.imim.es/genepredictions/M.musculus/mmDec2005/geneid_v1.2/$C.gtf wget --timestamping \ http://genome.imim.es/genepredictions/M.musculus/mmDec2005/geneid_v1.2/$C.prot done # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. 
ldHgGene -genePredExt -gtf mm7 geneid download/*.gtf # Read 34992 transcripts in 295837 lines in 40 files # 34992 groups 40 seqs 1 sources 3 feature types # 34992 gene predictions hgPepPred mm7 generic geneidPep download/*-fixed.prot featureBits mm7 -enrichment refGene geneid # refGene 1.707%, geneid 1.579%, both 0.831%, cover 48.68%, enrich 30.83x featureBits mm6 -enrichment refGene geneid # refGene 1.623%, geneid 1.561%, both 0.794%, cover 48.91%, enrich 31.34x ############################################################################## # CLONE ENDS - BACEND TRACK (DONE - 2005-09-29 - Hiram) ssh kkstore02 cd /cluster/data/mm7 # check disk space: 73Gb free df -h . # Filesystem Size Used Avail Use% Mounted on # /export/cluster/store5 # 1.5T 1.3T 73G 95% /cluster/store5 mkdir -p bed/cloneend/ncbi cd bed/cloneend/ncbi wget --timestamping \ ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/mus_musculus/* cd /cluster/data/mm7/bed/cloneend # seems like the *.mfa files were split just for convenience # concatenate for F in ncbi/*.mfa.gz do zcat ${F} done | gzip > all.mfa.gz # Convert the title line of the all.mfa file cat << '_EOF_' > convert.pl #!/usr/bin/env perl use strict; use warnings; while (my $line = <>) { if ($line !~ m/^>/) { print $line } else { my @fields = split('\|', $line); my $fieldCount = scalar(@fields); my $printed = 0; for (my $i = 0; $i < $fieldCount; $i++) { if ($fields[$i] eq "gb" || $fields[$i] eq "dbj") { (my $name, my $vers) = split(/\./,$fields[$i+1]); print ">$name\n"; $i= $fieldCount; $printed = 1; } } if (!$printed) { die("Failed for $line\n"); } } } '_EOF_' # < happy emacs chmod +x convert.pl zcat all.mfa | ./convert.pl | gzip > cloneEnds.fa.gz # make sure nothing got broken: faSize all.mfa.gz # 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214 lower) in 789466 sequences in 1 files faSize cloneEnds.fa.gz # 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214 lower) in 789466 sequences in 1 files # identical 
numbers # concatenate the text files, too for F in ncbi/*.txt.gz do zcat ${F} done | gzip > all.txt.gz # generate cloneEndPairs.txt and cloneEndSingles.txt cp -p /cluster/data/mm6/bed/cloneend/ncbi/convertTxt.pl . zcat all.txt.gz | ./convertTxt.pl stdin # Reading in end info # Writing out pair info # Writing out singleton info # 354485 pairs and 78423 singles # faSplit does not function correctly if given a .gz source file # AND, we need the unzipped file for sequence loading below gunzip cloneEnds.fa.gz # split mkdir splitDir cd splitDir faSplit sequence ../cloneEnds.fa 100 cloneEnds # Check to ensure no breakage: cat *.fa | faSize stdin # 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214 lower) in 789466 sequences in 1 files # same numbers as before # Copy to san for cluster runs ssh pk cd /cluster/data/mm7/bed/cloneend/splitDir mkdir /san/sanvol1/scratch/mm7/cloneEnds cp -p *.fa /san/sanvol1/scratch/mm7/cloneEnds rm * cd .. rmdir splitDir # load sequences ssh hgwdev mkdir /gbdb/mm7/cloneend cd /gbdb/mm7/cloneend ln -s /cluster/data/mm7/bed/cloneend/cloneEnds.fa . cd /tmp hgLoadSeq mm7 /gbdb/mm7/cloneend/cloneEnds.fa # Advisory lock created # Creating .tab file # Adding /gbdb/mm7/cloneend/cloneEnds.fa # 789466 sequences # Updating seq table # Advisory lock has been released # All done ############################################################################ # BACEND SEQUENCE ALIGNMENTS (DONE - 2005-09-29 - 2005-10-03 - Hiram) ssh kkstore02 mkdir /cluster/data/mm7/noMask cd /cluster/data/mm7/ # Need an unmasked sequence for this work for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa do C=`basename ${CHR}` echo -n "working ${C} ... 
" head -1 ${CHR} > noMask/${C} tail +2 ${CHR} | tr [:lower:] [:upper:] >> noMask/${C} echo "done" done mkdir ooc ls noMask/chr*.fa > fa.list # ooc file was created earlier into /cluster/bluearc/mm7/11.ooc # and /san/sanvol1/scratch/mm7/11.ooc ssh pk mkdir /san/sanvol1/scratch/mm7/noMask cd /san/sanvol1/scratch/mm7/noMask time cp --verbose -p /cluster/data/mm7/noMask/chr*.fa . # allow blat to run politely in /tmp while it writes output, then # copy results to results file: mkdir /cluster/data/mm7/bed/bacends cd /cluster/data/mm7/bed/bacends cat << '_EOF_' > runBlat.sh #!/bin/sh path1=$1 path2=$2 root1=$3 root2=$4 result=$5 rm -fr /tmp/${root1}_${root2} mkdir /tmp/${root1}_${root2} pushd /tmp/${root1}_${root2} /cluster/bin/x86_64/blat ${path1} ${path2} \ -ooc=/san/sanvol1/scratch/mm7/11.ooc ${root1}.${root2}.psl popd rm -f ${result} mv /tmp/${root1}_${root2}/${root1}.${root2}.psl ${result} rm -fr /tmp/${root1}_${root2} '_EOF_' # << emacs happy chmod +x runBlat.sh cat << '_EOF_' > template #LOOP ./runBlat.sh {check in exists $(path1)} {check in exists $(path2)} $(root1) $(root2) {check out line+ bacEnds.out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # << emacs happy ls -1S /san/sanvol1/scratch/mm7/cloneEnds/cloneEnds???.fa > bacEnds.lst mkdir bacEnds.out # create results directories for each to avoid the all result files in # one directory problem foreach f (`cat bacEnds.lst`) set b = $f:t:r echo $b mkdir bacEnds.out/$b end ls -1S /san/sanvol1/scratch/mm7/noMask/chr*.fa > contig.lst gensub2 contig.lst bacEnds.lst template jobList para create jobList # 3920 jobs written to batch para try, check, push, etc ... 
XXX - STARTED - 2005-10-02 - Hiram # Completed: 3920 of 3920 jobs # CPU time in finished jobs: 683325s 11388.75m 189.81h 7.91d 0.022 y # IO & Wait Time: 51220s 853.67m 14.23h 0.59d 0.002 y # Average job time: 187s 3.12m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1388s 23.13m 0.39h 0.02d # Submission to last job: 65899s 1098.32m 18.31h 0.76d ssh kkstore02 cd /cluster/data/mm7/bed/bacends screen mkdir temp time pslSort dirs raw.psl temp bacEnds.out/* > pslSort.out 2>&1 & # real 27m20.352s # user 20m10.329s # sys 1m55.287s time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons \ raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 & # real 8m15.671s # user 7m18.229s # sys 0m20.554s cp -p ~booch/clusterJobs/bacends/split.pl . cp -p ~booch/clusterJobs/bacends/header . time ./split.pl header < bacEnds.psl cp -p bacEnds.psl bacEnds.psl.save time pslSort dirs bacEnds.psl temp split # ~ 3 minutes # Copy files to final destination and remove mkdir /cluster/data/mm7/bacends cp -p bacEnds.psl /cluster/data/mm7/bacends ############################################################################ # BACEND PAIRS TRACK (DONE - 2005-10-03 - 2005-10-04 - Hiram) ssh kolossus cd /cluster/data/mm7/bacends time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.psl \ ../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds # create header required by "rdb" tools echo -e \ "chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header cat header bacEnds.pairs | \ /cluster/bin/scripts/row score ge 300 | \ /cluster/bin/scripts/sorttbl chr start | \ /cluster/bin/scripts/headchg -del > bacEndPairs.bed cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \ 
/cluster/bin/scripts/sorttbl chr start | \ /cluster/bin/scripts/headchg -del > bacEndPairsBad.bed /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \ bacEndPairsBad.bed >j1.out cat j1.out| /cluster/bin/scripts/sorttbl tname tstart >j2.out cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl rm j1.out j2.out # CHECK bacEndPairs.bed ID's to make sure they have no blanks in them awk '{print $5}' bacEndPairs.bed | sort -u # result should be the scores, no extraneous strings: # 1000 # 300 # 375 # 500 # 750 # edit the file and fix it if it has a bad name. # load into database ssh hgwdev cd /cluster/data/mm7/bacends hgLoadBed -strict -notItemRgb mm7 bacEndPairs bacEndPairs.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 231570 elements of size 11 # note - this track isn't pushed to RR, just used for assembly QA hgLoadBed -strict -notItemRgb mm7 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 95099 elements of size 11 # NOTE: truncates file to 0 if -nobin is used hgLoadPsl mm7 -table=all_bacends bacEnds.load.psl # load of all_bacends did not go as planned: 8397416 record(s), 0 row(s) # skipped, 1 warning(s) loading psl.tab # This takes about 40 minutes featureBits mm7 all_bacends # 334161740 bases of 2583394090 (12.935%) in intersection featureBits mm6 all_bacends # 336981828 bases of 2597150411 (12.975%) in intersection featureBits mm5 all_bacends # 268502414 bases of 2615483787 (10.266%) in intersection featureBits mm4 all_bacends # 243096171 bases of 2627444668 (9.252%) in intersection featureBits mm7 bacEndPairs # 2578837424 bases of 2583394090 (99.824%) in intersection featureBits mm6 bacEndPairs # 2570768812 bases of 2597150411 (98.984%) in intersection featureBits mm5 bacEndPairs # 2567958504 bases of 2615483787 (98.183%) in intersection featureBits mm4 bacEndPairs # 2549945356 bases of 2627444668 (97.050%) in intersection featureBits mm7 bacEndPairsBad # 954662115 bases 
of 2583394090 (36.954%) in intersection featureBits mm6 bacEndPairsBad # 1006314997 bases of 2597150411 (38.747%) in intersection featureBits mm5 bacEndPairsBad # 541027882 bases of 2615483787 (20.686%) in intersection featureBits mm4 bacEndPairsBad # 1074505863 bases of 2627444668 (40.895%) in intersection ############################################################################ # SGP GENES (DONE - 2006-01-18 - Hiram) ssh kkstore02 mkdir /cluster/data/mm7/bed/sgp cd /cluster/data/mm7/bed/sgp # They don't do chrM for CHR in `awk '{print $1}' ../../chrom.sizes | grep -v chrM` do wget --timestamping \ "http://genome.imim.es/genepredictions/M.musculus/mmDec2005/SGP/humangp200405/${CHR}.gtf" \ -O "${CHR}.gtf" wget --timestamping \ "http://genome.imim.es/genepredictions/M.musculus/mmDec2005/SGP/humangp200405/${CHR}.prot" \ -O "${CHR}.prot" done # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf for P in chr*.prot do sed -e "s/^>\(.*\)/>\1.1/" ${P} done > sgpPep.fa ssh hgwdev cd /cluster/data/mm7/bed/sgp ldHgGene -gtf -genePredExt mm7 sgpGene chr*.gtf hgPepPred mm7 generic sgpPep sgpPep.fa featureBits mm7 -enrichment refGene:CDS sgpGene # refGene:CDS 1.033%, sgpGene 1.441%, both 0.888%, cover 85.94%, enrich 59.63 ############################################################################ # RE-BUILD KNOWN GENES RELATED TABLES for mm7 (TBD) # First build protein databases, sp050415 and proteins050415 # See makeProteins050415.doc for details. # Please note that the protein and displayId tables in sp050415 have data of variant splice proteins. 
# Create working subdirectories and temporary databases ssh hgwdev cd /cluster/store10/kg mkdir kgMm6B ln -s /cluster/store10/kg/kgMm6B /cluster/store6/kgDB/bed/kgMm6B ln -s /cluster/store10/kg/kgMm6B /cluster/data/mm7/bed/kgMm6B hgsql mm7 -e "create database kgMm6B" hgsql mm7 -e "create database kgMm6BTemp" mkdir /cluster/bluearc/kgDB/kgMm6B mkdir /cluster/bluearc/kgDB/kgMm6B/protBlat ln -s /cluster/bluearc/kgDB/kgMm6B/protBlat /cluster/store10/kg/kgMm6B/protBlat cd /cluster/store10/kg/kgMm6B/protBlat # Get all mouse protein sequences hgsql -N sp050415 -e \ 'select proteins050415.spXref3.accession,protein.val from proteins050415.spXref3,protein where division="10090" and acc=accession' \ |awk '{print ">" $1;print $2}' >mm7Prot.fa # Prepare and perform cluster run for protein/genome alignment ssh kk cd /cluster/data/mm7/bed/kgMm6B/protBlat mkdir prot faSplit sequence mm7Prot.fa 1000 prot/prot ls /cluster/bluearc/kgDB/kgMm6B/protBlat/prot/* > prot.lis ssh hgwdev cd /cluster/data/mm7/bed/kgMm6B/protBlat hgsql mm7 -N -e 'select chrom from chromInfo' > chrom.lis exit cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -noHead -t=dnax -q=prot /cluster/data/mm7/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm6B/protBlat/result/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' mkdir result gensub2 chrom.lis prot.lis gsub jobList para create jobList para try para check para push para check ... # Completed: 31386 of 39600 jobs # Crashed: 8214 jobs # CPU time in finished jobs: 32377544s 539625.74m 8993.76h 374.74d 1.027 y # IO & Wait Time: 727341s 12122.34m 202.04h 8.42d 0.023 y # Average job time: 1055s 17.58m 0.29h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 34182s 569.70m 9.49h 0.40d # Submission to last job: 57659s 960.98m 16.02h 0.67d # Many output .psl files are empty, these warnings are OK. # Check to see if there is any other error type. 
para problems |grep empty|wc # 8214 24642 642357 # collect BLAT results ssh hgwdev cd /cluster/data/mm7/bed/kgMm6B/protBlat mkdir result2 mkdir result3 cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall cat << '_EOF_' > do1.1 echo processing $1 cat result/$1_prot*.psl >result2/$1.psl '_EOF_' cat << '_EOF_' > do1.2 echo processing $1 pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out '_EOF_' chmod +x do* cp do1.1 do1 doall cp do1.2 do1 doall cat result3/*.psl >protBlat.psl hgLoadPsl mm7 protBlat.psl # Processing protBlat.psl # load of protBlat did not go as planned: 82296 record(s), 0 row(s) skipped, 750 warning(s) loading psl.tab # Looked into the cause of the warnings before and found that it was due to that qBaseInsert # and tBaseInsert have negative values, probably due to that this is protein alignment. # Remember to remove result2 and result3 when KG is built and validated. cd /cluster/data/mm7/bed/kgMm6B # create all_mrna.psl and tight_mrna.psl hgsql mm7 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 all_mrna.psl tight_mrna.psl /dev/null # Processed 194640 alignments # Use overlapSelect to get protein and mRNA alignment overlaps overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \ -selectFmt=psl -inFmt=psl tight_mrna.psl protBlat/protBlat.psl protMrna.stat overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \ -inFmt=psl tight_mrna.psl protBlat/protBlat.psl protMrna.out # Create protein/mRNA pair and protein lists cut -f 10,31 protMrna.out|sort -u >spMrna.tab cut -f 10 protMrna.out|sort -u >protein.lis # Load spMrna.tab into spMrna table in temp DB. 
hgsql kgMm6BTemp < ~/src/hg/lib/spMrna.sql hgsql kgMm6BTemp -e 'load data local infile "spMrna.tab" into table spMrna' hgsql kgMm6BTemp -e 'create index mrnaID on spMrna(mrnaID)' # Prepare and perform cluster run of protein/mRNA alignment # Get mRNA fa file. cd /cluster/data/mm7/bed/kgMm6B /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm7 \ -gbRoot=/cluster/data/genbank genbank mrna mrna.fa # Create mrnaSeq table in kgMm6BTemp DB. hgFaToTab mrna.fa mrnaSeq.tab hgsql kgMm6BTemp -e 'drop table mrnaSeq' hgsql kgMm6BTemp <~/src/hg/lib/mrnaSeq.sql hgsql kgMm6BTemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq' rm mrnaSeq.tab # Prepare files for cluster run ~/src/hg/protein/KG2.sh kgMm6B mm7 050415 # Perform cluster run of protein/mRNA alignment ~/src/hg/protein/KG4.sh kgMm6B mm7 050415 # Collect cluster run results cd kgBestMrna ls out | sed -e 's/prot/do1 prot/g' >doall # create do1 with the following 2 lines: cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protMrnaRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis wc protMrna.lis # Load BLAT results into temp DB. hgsql kgMm6BTemp < ~/src/hg/lib/protMrnaBlat.sql hgsql kgMm6BTemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat' hgsql kgMm6BTemp -e 'create index tName on protMrnaBlat(tName)' # Create CDS files from protein/mRNA alignment results. hgsql kgMm6BTemp -N -e \ 'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\ |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds # Create protMrna.psl with proteinID_mrnaID as query ID. 
cut -f 22-30 ../protMrna.out > j1.tmp cut -f 32-42 ../protMrna.out > j2.tmp cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp paste j1.tmp j3.tmp j2.tmp >protMrna.psl rm j1.tmp j2.tmp j3.tmp # Run mrnaToGene to create protMrna.gp bash mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log exit # Prepare refGene and all_mrna gp files. cd .. hgsql mm7 -N -e 'select * from refGene' >ref.gp hgsql mm7 -N -e \ 'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \ |sort -u > all_mrna.cds bash mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log exit # Align proteins to RefSeq. overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp ref.stat overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp protRef.gp overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\ -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out cut -f 10,22 protRef.out | sort -u >spRef.tab cut -f 10 protRef.out | sort -u >protRef.lis hgsql kgMm6BTemp -e 'drop table spRef' hgsql kgMm6BTemp <~/src/hg/lib/spRef.sql hgsql kgMm6BTemp -e 'load data local infile "spRef.tab" into table spRef' # Prepare and perform cluster runs for protein/RefSeq alignments ~/src/hg/protein/KGRef2.sh kgMm6B mm7 050415 ~/src/hg/protein/KGRef3.sh kgMm6B mm7 050415 cd kgBestRef ls out | sed -e 's/prot/do1 prot/g' >doall cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protRefRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments. 
pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis wc protRef.lis hgsql kgMm6BTemp -e 'drop table protRefBlat' hgsql kgMm6BTemp < ~/src/hg/lib/protRefBlat.sql hgsql kgMm6BTemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat' hgsql kgMm6BTemp -e 'create index tName on protRefBlat(tName)' # Run gene-check to filter out invalid gp entries cd /cluster/data/mm7/bed/kgMm6B cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir \ /cluster/data/mm7/nib kgCandidate0.gp kgCandidate0.check hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidate0.sql hgsql kgMm6BTemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0' hgsql kgMm6BTemp < ~/src/hg/lib/geneCheck.sql hgsql kgMm6BTemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines' # Run kgCheck to get all KG candidates that pass the KG gene check criteria kgCheck kgMm6BTemp mm7 kgCandidate0 geneCheck kgCandidate.tab hgsql kgMm6BTemp -e 'drop table kgCandidate' hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidate.sql hgsql kgMm6BTemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate' hgsql kgMm6BTemp -e 'create index alignID on kgCandidate(alignID)' # Construct the kgCandidateX table that has alignID in the name field. cut -f 2-10 kgCandidate.tab >j2.tmp cut -f 11 kgCandidate.tab >j1.tmp paste j1.tmp j2.tmp >kgCandidateX.tab hgsql kgMm6BTemp -e 'drop table kgCandidateX' hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidateX.sql hgsql kgMm6BTemp -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX' # Score protein/mRna and protein/RefSeq alignments kgResultBestMrna2 050415 kgMm6BTemp mm7|sort -u >protMrnaBlatScore.tab kgResultBestRef2 050415 kgMm6BTemp mm7|sort -u >protRefScore.tab # Combine scoring results and load them into temp DB. 
cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab hgsql kgMm6BTemp -e 'drop table protMrnaScore' hgsql kgMm6BTemp < ~/src/hg/lib/protMrnaScore.sql hgsql kgMm6BTemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore' hgsql kgMm6BTemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)' # Run kgGetCds to get CDS structure of each gene kgGetCds kgMm6BTemp kgCandidateX jY.tmp cat jY.tmp |sort -u >kgCandidateY.tab rm jY.tmp hgsql kgMm6BTemp -e 'drop table kgCandidateY' hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidateY.sql hgsql kgMm6BTemp -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY' # Run kgPickPrep to replace long cds structure string with cdsId. kgPickPrep kgMm6BTemp kgCandidateZ.tab hgsql kgMm6BTemp -e 'drop table kgCandidateZ' hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidateZ.sql hgsql kgMm6BTemp -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ' hgsql kgMm6BTemp -e 'create index cdsId on kgCandidateZ(cdsId)' # Run kgPick to pick the representative a mrna/protein pair for each unique CDS structure. kgPick kgMm6BTemp mm7 proteins050415 kg4.tmp dupSpMrna.tmp sort -u dupSpMrna.tmp >dupSpMrna.tab hgsql mm7 -e 'drop table dupSpMrna' hgsql mm7 < ~/src/hg/lib/dupSpMrna.sql hgsql mm7 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna' # Sort KG genes to make the kg4.gp table file. ~/kent/src/hg/protein/sortKg.pl kg4.tmp >kg4.gp hgsql kgMm6BTemp -e 'drop table knownGene' hgsql kgMm6BTemp < ~/src/hg/lib/knownGene.sql hgsql kgMm6BTemp -e 'load data local infile "kg4.gp" into table knownGene' hgsql mm7 -e 'drop table kg4' hgsql mm7 < ~/src/hg/lib/kg4.sql hgsql mm7 -e 'load data local infile "kg4.gp" into table kg4' # Perform analysis before loading kg4 table data to mm7.knownGene table. # Load data into mm7 knownGene table. 
hgsql mm7 -e 'drop table knownGene' hgsql mm7 < ~/src/hg/lib/knownGene.sql hgsql mm7 -e 'load data local infile "kg4.gp" into table knownGene' # Build knownGeneMrna and knownGenePep tables. kgPepMrna kgMm6BTemp mm7 050415 hgsql mm7 -e 'drop table knownGeneMrna' hgsql mm7 < ~/src/hg/lib/knownGeneMrna.sql hgsql mm7 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna' hgsql mm7 -e 'drop table knownGenePep' hgsql mm7 < ~/src/hg/lib/knownGenePep.sql hgsql mm7 -e 'load data local infile "knownGenePep.tab" into table knownGenePep' # Build kgXref table kgXref2 kgMm6BTemp 050415 mm7 hgsql mm7 -e 'drop table kgXref' hgsql mm7 < ~/src/hg/lib/kgXref.sql hgsql mm7 -e 'load data local infile "kgXref.tab" into table kgXref' # Build spMrna table hgsql mm7 -N -e 'select name, proteinID from knownGene' >kgSpMrna.tab hgsql mm7 -e 'drop table spMrna' hgsql mm7 <~/src/hg/lib/spMrna.sql hgsql mm7 -e 'load data local infile "kgSpMrna.tab" into table spMrna' # Build mrnaRefseq table cd /cluster/store10/entrez mkdir 050601 rm /cluster/data/entrez ln -s /cluster/store10/entrez/050601 /cluster/data/entrez cd /cluster/data/entrez wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz gzip -d *.gz cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab hgsql entrez -e 'drop table entrezRefseq' hgsql entrez -e 'drop table entrezMrna' hgsql entrez -e 'drop table entrezRefProt' hgsql entrez < ~/src/hg/lib/entrezRefseq.sql hgsql entrez < ~/src/hg/lib/entrezMrna.sql hgsql entrez < ~/src/hg/lib/entrezRefProt.sql hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table 
entrezRefseq' hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna' hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt' hgsql entrez -N -e \ 'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \ >mrnaRefseq.tab hgsql mm7 -e 'drop table mrnaRefseq' hgsql mm7 < ~/src/hg/lib/mrnaRefseq.sql hgsql mm7 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq' # Build kgProtMap table ~/src/hg/protein/kgProtMap2.sh kgMm6B mm7 050415 # Update and clean up kgResultBestMrna2.c and then check it in. # Build alias tables. # kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases # proteins050415.hugo.withdraws, mm7.kgXref.kgID # to create kgAliasM.tab and geneAlias.tab # by picking out those kgID items from kgXref where # kgXref.geneSymbol == hugo.symbol cd /cluster/store10/kg/kgMm6B mkdir alias cd alias kgAliasM mm7 proteins050415 # kgAliasKgXref reads from mm7.knownGene.proteinID, # mm7.knownGene.name, mm7.kgXref.geneSymbol # to create kgAliasKgXref.tab kgAliasKgXref mm7 # kgAliasRefseq reads from mm7.knownGene.name, # mm7.knownGene.proteinID, mm7.kgXref.refseq # to create kgAliasRefseq.tab kgAliasRefseq mm7 hgsql sp050415 -N -e 'select name,gene.val from mm7.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \ | sort -u > kgAliasP.tab hgsql mm7 -N -e 'select name, name from knownGene' >kgAliasDup.tab hgsql mm7 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \ sort |uniq > kgAlias.tab hgsql -e "drop table kgAlias;" mm7 hgsql mm7 < ~/kent/src/hg/lib/kgAlias.sql hgsql mm7 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' # kgProtAlias reads from mm7.knownGene.name, # mm7.knownGene.proteinID, mm7.knownGene.alignID, # proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb # to 
create kgProtAlias.tab kgProtAlias mm7 050415 hgsql mm7 -N -e \ 'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\ | sort -u >kgProtAliasNCBI.tab # include variant splice protein IDs hgsql mm7 -N -e \ 'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\ |sort -u >kgProtAliasDup.tab # include duplicate protein IDs from dupSpMrna table hgsql mm7 -N -e \ 'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\ |sort -u >>kgProtAliasDup.tab # catch parent acc from dupProteinID too hgsql mm7 -N -e\ 'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\ |sort -u >>kgProtAliasDup.tab cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab echo "`date` creating table kgProtAlias" hgsql mm7 -e "drop table kgProtAlias;" hgsql mm7 <~/src/hg/lib/kgProtAlias.sql; hgsql mm7 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;' # Build kgSpAlias table hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >mm7.kgSpAlias.tab rm j.tmp hgsql mm7 -e 'drop table kgSpAlias'; hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias' # MAKE FOLDUTR TABLES (TBD) # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev cd /cluster/data/mm7/bed mkdir rnaStruct.2005-05-31 rm rnaStruct ln -s rnaStruct.2005-05-31 rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa mm7 knownGene utr3 utr3/utr.fa utrFa mm7 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. 
ssh kk cd /cluster/data/mm7/bed/rnaStruct faSplit sequence utr3/utr.fa 50000 utr3/split/s faSplit sequence utr5/utr.fa 50000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/panasas/home/store/mm7/blastp/known \ -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 58630s 977.16m 16.29h 0.68d 0.002 y # IO & Wait Time: 39839s 663.99m 11.07h 0.46d 0.001 y # Average job time: 13s 0.21m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 116s 1.93m 0.03h 0.00d # Submission to last job: 188s 3.13m 0.05h 0.00d # Load into database. This takes about an hour. ssh hgwdev cd /cluster/data/mm7/bed/geneSorter/blastp/self/run/out hgLoadBlastTab mm7 knownBlastTab *.tab # Scanning through 7729 files # Loading database with 3391069 rows # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes an hour.) 
# (TBD) hgMapToGene mm7 affyGnf1m knownGene knownToGnf1m hgExpDistance mm7 hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m # Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio # Got 20114 unique elements in hgFixed.gnfMouseAtlas2MedianRatio # Create table that maps between known genes and RefSeq hgMapToGene mm7 refGene knownGene knownToRefSeq # may need to build this command in src/hg/near/hgMapToGene # Create a table that maps between known genes and # the nice affy expression data. hgMapToGene mm7 affyU74 knownGene knownToU74 hgMapToGene mm7 affyMOE430 knownGene knownToMOE430 hgMapToGene mm7 affyMOE430 -prefix=A: knownGene knownToMOE430A # Format and load Rinn et al sex expression data mkdir /cluster/data/mm7/bed/rinnSex cd !$ hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \ ../affyMOE430/affyMOE430.psl hgLoadBed -strict mm7 rinnSex rinnSex.bed # Format and load the GNF data mkdir /cluster/data/mm7/bed/affyGnf95 cd /cluster/data/mm7/bed/affyGnf95 affyPslAndAtlasToBed -newType ../affyU95.psl \ /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. # Everything below this seems to create tables OK. # hgsql mm7 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. 
These commands take about 15 minutes each # The affyGnfU74?Exps arguments appear to be unused in # hgExpDistance cd /cluster/data/mm7/bed/geneSorter hgExpDistance mm7 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74 # Got 10157 unique elements in affyGnfU74A hgExpDistance mm7 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74 # Got 6076 unique elements in affyGnfU74B hgExpDistance mm7 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74 # Got 1793 unique elements in affyGnfU74C # C.ELEGANS BLASTP FOR GENE SORTER # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to iscratch/i # if it doesn't exist already: ssh eieio mkdir /cluster/data/ce2/bed/blastp cd /cluster/data/ce2/bed/blastp # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/ # to find out the latest version. Then use that in place of 142 below. wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142 formatdb -i wormPep142.faa -t wormPep142 -n wormPep142 ssh kkr1u00 if (-e /iscratch/i/ce2/blastp) then rm -r /iscratch/i/ce2/blastp endif mkdir -p /iscratch/i/ce2/blastp cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/ce2/run/out cd /cluster/data/mm7/bed/blastp/ce2/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 40061s 667.69m 11.13h 0.46d 0.001 y # IO & Wait Time: 21049s 350.81m 5.85h 0.24d 0.001 y # Average job time: 8s 0.13m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 33s 0.55m 0.01h 0.00d # Submission to last job: 134s 2.23m 0.04h 0.00d # Load into database. 
ssh hgwdev cd /cluster/data/mm7/bed/blastp/ce2/run/out hgLoadBlastTab mm7 ceBlastTab -maxPer=1 *.tab # HUMAN BLASTP FOR GENE SORTER (TBD) # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/hg17/bed/blastp cd /cluster/data/hg17/bed/blastp pepPredToFa hg17 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/hg17/blastp) then rm -r /iscratch/i/hg17/blastp endif mkdir -p /iscratch/i/hg17/blastp cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/hg17/run/out cd /cluster/data/mm7/bed/blastp/hg17/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 81526s 1358.76m 22.65h 0.94d 0.003 y # IO & Wait Time: 23670s 394.51m 6.58h 0.27d 0.001 y # Average job time: 14s 0.23m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 84s 1.40m 0.02h 0.00d # Submission to last job: 185s 3.08m 0.05h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/hg17/run/out hgLoadBlastTab mm7 hgBlastTab -maxPer=1 *.tab # ZEBRAFISH BLASTP FOR GENE SORTER # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. # First make protein database and copy it to iscratch/I # The below is done by hg17, that section from makeHg17.doc is copied here. 
ssh kkstore mkdir /cluster/data/danRer2/bed/blastp cd /cluster/data/danRer2/bed/blastp wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.apr.pep.fa.gz zcat Dan*.pep.fa.gz > ensembl.faa formatdb -i ensembl.faa -t ensembl -n ensembl ssh kkr1u00 if (-e /iscratch/i/danRer2/blastp) then rm -r /iscratch/i/danRer2/blastp endif mkdir -p /iscratch/i/danRer2/blastp cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp iSync # The above is copied from makeHg17.doc. # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/danRer2/run/out cd /cluster/data/mm7/bed/blastp/danRer2/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 72894s 1214.89m 20.25h 0.84d 0.002 y # IO & Wait Time: 21284s 354.74m 5.91h 0.25d 0.001 y # Average job time: 12s 0.20m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 73s 1.22m 0.02h 0.00d # Submission to last job: 176s 2.93m 0.05h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/danRer2/run/out hgLoadBlastTab mm7 drBlastTab -maxPer=1 *.tab # YEAST BLASTP FOR GENE SORTER # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on # RefSeq. First make protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/sacCer1/bed/blastp cd /cluster/data/sacCer1/bed/blastp wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz zcat orf_trans.fasta.gz > sgdPep.faa formatdb -i sgdPep.faa -t sgdPep -n sgdPep ssh kkr1u00 # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, # fortunately we won't be looking for homologs there. 
:) if (-e /iscratch/i/sacCer1/blastp) then rm -r /iscratch/i/sacCer1/blastp endif mkdir -p /iscratch/i/sacCer1/blastp cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/sacCer1/run/out cd /cluster/data/mm7/bed/blastp/sacCer1/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 11663s 194.38m 3.24h 0.13d 0.000 y # IO & Wait Time: 20479s 341.32m 5.69h 0.24d 0.001 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 11s 0.18m 0.00h 0.00d # Submission to last job: 143s 2.38m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/sacCer1/run/out hgLoadBlastTab mm7 scBlastTab -maxPer=1 *.tab # DM1 BLASTP FOR GENE SORTER (TBD) # Make Drosophila melanogaster ortholog column using blastp on FlyBase. # First make protein database and copy it to iscratch/i # if it doesn't exist already: # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # ssh kkr1u00 # if (-e /iscratch/i/dm1/blastp) then # rm -r /iscratch/i/dm1/blastp # endif # mkdir -p /iscratch/i/dm1/blastp # cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp # iSync # THE ABOVE IS ALREADY DONE BY ANGIE # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/dm1/run/out cd /cluster/data/mm7/bed/blastp/dm1/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... 
# Completed: 7729 of 7729 jobs # CPU time in finished jobs: 45146s 752.44m 12.54h 0.52d 0.001 y # IO & Wait Time: 21289s 354.81m 5.91h 0.25d 0.001 y # Average job time: 9s 0.14m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 43s 0.72m 0.01h 0.00d # Submission to last job: 139s 2.32m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/dm1/run/out hgLoadBlastTab mm7 dmBlastTab -maxPer=1 *.tab # Create table that maps between known genes and LocusLink cd /cluster/data/mm7/bed/geneSorter hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm7 > refToLl.txt hgMapToGene mm7 refGene knownGene knownToLocusLink -lookup=refToLl.txt # row count is 23074 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt mm7 knownGene name proteinID Pfam knownToPfam # row count is 22525 # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene mm7 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' # Create table that maps between known genes and visiGene database knownToVisiGene mm7 # ENABLE GENE SORTER FOR mm7 IN HGCENTRALTEST (already done during first mm7 KG build) echo "update dbDb set hgNearOk = 1 where name = 'mm7';" \ | hgsql -h genome-testdb hgcentraltest # RAT BLASTP FOR GENE SORTER # Make RAT ortholog column using blastp on RAT known genes. # First make RAT protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/rn3/bed/blastp cd /cluster/data/rn3/bed/blastp pepPredToFa rn3 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/rn3/blastp) then rm -r /iscratch/i/rn3/blastp endif mkdir -p /iscratch/i/rn3/blastp cp /cluster/data/rn3/bed/blastp/known.p?? 
/iscratch/i/rn3/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/rn3/run/out cd /cluster/data/mm7/bed/blastp/rn3/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7729 of 7729 jobs # CPU time in finished jobs: 17126s 285.44m 4.76h 0.20d 0.001 y # IO & Wait Time: 20493s 341.54m 5.69h 0.24d 0.001 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 24s 0.40m 0.01h 0.00d # Submission to last job: 131s 2.18m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/rn3/run/out hgLoadBlastTab mm7 rnBlastTab -maxPer=1 *.tab # END OF GENE SORTER STUFF ############################################################################# ### MM6 PROTEOME BROWSER TABLES RE-BUILD #### (TBD) # These are instructions for re-building tables # needed for the Proteome Browser to be used with mm7. # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table # ARE REBUILT. # This build is based on proteins DBs dated 050415. 
# Create the working directory ssh hgwdev mkdir /cluster/data/mm7/bed/pb.2005-06-01 cd /cluster/data/mm7/bed rm pb ln -s /cluster/data/mm7/bed/pb.2005-06-01 pb cd pb # Define pep* tables in mm7 DB cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql # delete from the following tables (previously built): hgsql mm7 delete from pepCCntDist ; delete from pepExonCntDist ; delete from pepHydroDist ; delete from pepIPCntDist ; delete from pepMolWtDist ; delete from pepMwAa ; delete from pepPi ; delete from pepPiDist ; delete from pepPred ; delete from pepResDist ; delete from pbAnomLimit; delete from pbResAvgStd; delete from pbStamp; quit; # Build the pepMwAa table hgsql proteins050415 -e \ "select info.acc, molWeight, aaSize from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab hgsql mm7 -e 'load data local infile "pepMwAa.tab" into table mm7.pepMwAa ignore 1 lines;' # Build the pepPi table hgsql proteins050415 -e "select info.acc from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis pbCalPi protAcc.lis sp050415 pepPi.tab hgsql mm7 -e 'load data local infile "pepPi.tab" into table mm7.pepPi;' # Calculate and load pep distributions pbCalDist sp050415 proteins050415 10090 mm7 >pbCalDist.out cat pbCalDist.out wc pbCalDist.out hgsql mm7 load data local infile "pepExonCntDist.tab" into table mm7.pepExonCntDist; load data local infile "pepCCntDist.tab" into table mm7.pepCCntDist; load data local infile "pepHydroDist.tab" into table mm7.pepHydroDist; load data local infile "pepMolWtDist.tab" into table mm7.pepMolWtDist; load data local infile "pepResDist.tab" into table mm7.pepResDist; load data local infile "pepIPCntDist.tab" into table mm7.pepIPCntDist; load data local infile "pepPiDist.tab" into table mm7.pepPiDist; quit # Calculate frequency distributions pbCalResStd sp050415 10090 mm7 # Create pbAnomLimit and pbResAvgStd tables # hgsql mm7 < 
~/src/hg/lib/pbAnomLimit.sql # hgsql mm7 < ~/src/hg/lib/pbResAvgStd.sql hgsql mm7 -e 'load data local infile "pbResAvgStd.tab" into table mm7.pbResAvgStd;' hgsql mm7 -e 'load data local infile "pbAnomLimit.tab" into table mm7.pbAnomLimit;' # UPDATE kgSpAlias TABLE TO BE USED BY PB cd /cluster/data/mm7/bed/pb hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >mm7.kgSpAlias.tab rm j.tmp hgsql mm7 -e 'drop table kgSpAlias'; hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias' gzip mm7.kgSpAlias.tab # Create pbStamp table for PB hgsql mm7 < ~/src/hg/lib/pbStamp.sql hgsql mm5 -N -e 'select * from pbStamp' > pbStamp.tab hgsql mm7 -e 'delete from pbStamp' hgsql mm7 -e 'load data local infile "pbStamp.tab" into table mm7.pbStamp' # ENABLE PROTEOME BROWSER FOR mm7 IN HGCENTRALTEST (already done previously) echo "update dbDb set hgPbOk = 1 where name = 'mm7';" \ | hgsql -h genome-testdb hgcentraltest # Adjust drawing parameters for Proteome Browser stamps Now invoke Proteome Browser and adjust various drawing parameters (mostly the ymax of each stamp) if necessary, by updating the pbStamp.tab file and then delete and reload the pbStamp table. # Perform preliminary review of Proteome Browser for mm7, then notify QA for formal review. 
# Update default Browser position # bring up mySQL on genome-testdb and use hgcentraltest DB (done previously): update dbDb set defaultPos="chrX:87947304-87959012" where name="mm7"; # Create QA Push Queue entry with the following tables: ceBlastTab cgapAlias cgapBiocDesc cgapBiocPathway dmBlastTab drBlastTab dupSpMrna foldUtr3 foldUtr5 gnfAtlas2Distance hgBlastTab keggMapDesc keggPathway kgAlias kgProtAlias kgProtMap kgXref knownBlastTab knownCanonical knownGene knownGeneMrna knownGenePep knownIsoforms knownToGenePix knownToGnf1m knownToGnfAtlas2 knownToLocusLink knownToMOE430 knownToMOE430A knownToPfam knownToRefSeq knownToU74 knownToXmBest rinnSex rnBlastTab scBlastTab spMrna # END OF mm7 KG/GS/PB RE-BUILD. 6/1/05 Fan. ##################################################################### #################################################################################### # RE-BUILD KNOWN GENES TABLES, 3RD TRIAL WITH CORRECTED kgCheck and kgGetCds (TBD) ssh hgwdev cd /cluster/store10/kg/kgMm6B mkdir try2 mv * try2 hgsql mm7 -e 'create database kgMm6BTempTry2' hgsql kgMm6BTempTry2 -e 'drop table kgCandidate0' hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidate0.sql hgsql kgMm6BTempTry2 -e 'load data local infile "try2/kgCandidate0.gp" into table kgCandidate0' hgsql kgMm6BTempTry2 -e 'drop table geneCheck' hgsql kgMm6BTempTry2 < ~/src/hg/lib/geneCheck.sql hgsql kgMm6BTempTry2 -e 'load data local infile "try2/kgCandidate0.check" into table geneCheck ignore 2 lines' # Run kgCheck to get all KG candidates that pass the KG gene check criteria kgCheck kgMm6BTempTry2 mm7 kgCandidate0 geneCheck kgCandidate.tab hgsql kgMm6BTempTry2 -e 'drop table kgCandidate' hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidate.sql hgsql kgMm6BTempTry2 -e 'load data local infile "kgCandidate.tab" into table kgCandidate' hgsql kgMm6BTempTry2 -e 'create index alignID on kgCandidate(alignID)' # Construct the kgCandidateX table that has alignID in the name field. 
cut -f 2-10 kgCandidate.tab >j2.tmp cut -f 11 kgCandidate.tab >j1.tmp paste j1.tmp j2.tmp >kgCandidateX.tab rm j1.tmp j2.tmp hgsql kgMm6BTempTry2 -e 'drop table kgCandidateX' hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidateX.sql hgsql kgMm6BTempTry2 -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX' # Score protein/mRna and protein/RefSeq alignments # kgResultBestMrna2 050415 kgMm6BTempTry2 mm7|sort -u >protMrnaBlatScore.tab # kgResultBestRef2 050415 kgMm6BTempTry2 mm7|sort -u >protRefScore.tab # Combine scoring results and load them into temp DB. # cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab hgsql kgMm6BTempTry2 -e 'drop table protMrnaScore' hgsql kgMm6BTempTry2 < ~/src/hg/lib/protMrnaScore.sql hgsql kgMm6BTempTry2 -e 'load data local infile "try2/protMrnaScore.tab" into table protMrnaScore' hgsql kgMm6BTempTry2 -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)' # Run kgGetCds to get CDS structure of each gene kgGetCds kgMm6BTempTry2 kgCandidateX jY.tmp cat jY.tmp |sort -u >kgCandidateY.tab # rm jY.tmp hgsql kgMm6BTempTry2 -e 'drop table kgCandidateY' hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidateY.sql hgsql kgMm6BTempTry2 -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY' # Run kgPickPrep to replace long cds structure string with cdsId. kgPickPrep kgMm6BTempTry2 kgCandidateZ.tab hgsql kgMm6BTempTry2 -e 'drop table kgCandidateZ' hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidateZ.sql hgsql kgMm6BTempTry2 -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ' hgsql kgMm6BTempTry2 -e 'create index cdsId on kgCandidateZ(cdsId)' # Run kgPick to pick the representative a mrna/protein pair for each unique CDS structure. 
kgPick kgMm6BTempTry2 mm7 proteins050415 kgTry2.tmp dupSpMrna.tmp cat kgTry2.tmp | grep NM_ > jNM cat kgTry2.tmp | grep -v NM_ >jnoNM cut -f 1 jnoNM | sed -e "s/_/_\n/" |grep -v _ >jnoNM1 cut -f 2-12 jnoNM >jnoNM2 paste jnoNM1 jnoNM2 > kgTry2B.tmp cat jNM >> kgTry2B.tmp sort -u dupSpMrna.tmp >dupSpMrna.tab hgsql mm7 -e 'drop table dupSpMrna' hgsql mm7 < ~/src/hg/lib/dupSpMrna.sql hgsql mm7 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna' # Add entries in the put back list # Obtain the mouse put back list from Mark and save it as kgPutBack.tab hgsql kgMm6BTempTry2 -e 'drop table kgPutBack' hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgPutBack.sql hgsql kgMm6BTempTry2 -e 'load data local infile "kgPutBack.tab" into table kgPutBack' kgPutBack kgMm6BTempTry2 mm7 proteins050415 kgPutBack kgPutBack.gp # Sort KG genes to make the kgTry2.gp table file. cat kgTry2B.tmp kgPutBack.gp >kgTry2C.tmp ~/kent/src/hg/protein/sortKg.pl kgTry2C.tmp >kgTry2.gp # Manually edit to correct one line problem of O75438_BC009691 hgsql kgMm6BTempTry2 -e 'drop table knownGene' hgsql kgMm6BTempTry2 < ~/src/hg/lib/knownGene.sql hgsql kgMm6BTempTry2 -e 'load data local infile "kgTry2.gp" into table knownGene' # Load data into mm7 knownGene table. hgsql mm7 -e 'drop table knownGene' hgsql mm7 < ~/src/hg/lib/knownGene.sql hgsql mm7 -e 'load data local infile "kgTry2.gp" into table knownGene' # Build knownGeneMrna and knownGenePep tables. 
hgsql kgMm6BTempTry2 -e 'drop table mrnaSeq' hgsql kgMm6BTempTry2 < ~/src/hg/lib/mrnaSeq.sql # hgsql kgMm6BTempTry2 -e 'load data local infile "try2/mrnaSeq.tab" into table mrnaSeq' hgsql kgMm6BTempTry2 -e 'load data local infile "/cluster/store10/kg/kgMm6A/mrnaSeq.tab" into table mrnaSeq' kgPepMrna kgMm6BTempTry2 mm7 050415 hgsql mm7 -e 'drop table knownGeneMrna' hgsql mm7 < ~/src/hg/lib/knownGeneMrna.sql hgsql mm7 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna' hgsql mm7 -e 'drop table knownGenePep' hgsql mm7 < ~/src/hg/lib/knownGenePep.sql hgsql mm7 -e 'load data local infile "knownGenePep.tab" into table knownGenePep' # Build kgXref table kgXref2 kgMm6BTempTry2 050415 mm7 hgsql mm7 -e 'drop table kgXref' hgsql mm7 < ~/src/hg/lib/kgXref.sql hgsql mm7 -e 'load data local infile "kgXref.tab" into table kgXref' # Build kgProtMap table ~/src/hg/protein/kgProtMap2.sh kgMm6B mm7 050415 # Update and clean up kgResultBestMrna2.c and then check it in. # Build spMrna table hgsql mm7 -N -e 'select name, proteinID from knownGene' |sort -u >kgSpMrna.tab hgsql mm7 -e 'drop table spMrna' hgsql mm7 <~/src/hg/lib/spMrna.sql hgsql mm7 -e 'load data local infile "kgSpMrna.tab" into table spMrna' # Build mrnaRefseq table cd /cluster/store10/entrez mkdir 050601 rm /cluster/data/entrez ln -s /cluster/store10/entrez/050601 /cluster/data/entrez cd /cluster/data/entrez wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz gzip -d *.gz cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab hgsql entrez -e 'drop table entrezRefseq' hgsql entrez -e 'drop table 
entrezMrna' hgsql entrez -e 'drop table entrezRefProt' hgsql entrez < ~/src/hg/lib/entrezRefseq.sql hgsql entrez < ~/src/hg/lib/entrezMrna.sql hgsql entrez < ~/src/hg/lib/entrezRefProt.sql hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq' hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna' hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt' hgsql entrez -N -e \ 'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \ >mrnaRefseq.tab hgsql mm7 -e 'drop table mrnaRefseq' hgsql mm7 < ~/src/hg/lib/mrnaRefseq.sql hgsql mm7 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq' # Build alias tables. # kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases # proteins050415.hugo.withdraws, mm7.kgXref.kgID # to create kgAliasM.tab and geneAlias.tab # by picking out those kgID items from kgXref where # kgXref.geneSymbol == hugo.symbol cd /cluster/store10/kg/kgMm6B mkdir alias cd alias kgAliasM mm7 proteins050415 # kgAliasKgXref reads from mm7.knownGene.proteinID, # mm7.knownGene.name, mm7.kgXref.geneSymbol # to create kgAliasKgXref.tab kgAliasKgXref mm7 # kgAliasRefseq reads from mm7.knownGene.name, # mm7.knownGene.proteinID, mm7.kgXref.refseq # to create kgAliasRefseq.tab kgAliasRefseq mm7 hgsql sp050415 -N -e 'select name,gene.val from mm7.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \ | sort -u > kgAliasP.tab hgsql mm7 -N -e 'select name, name from knownGene' >kgAliasDup.tab hgsql mm7 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \ sort |uniq > kgAlias.tab hgsql -e "drop table kgAlias;" mm7 hgsql mm7 < ~/kent/src/hg/lib/kgAlias.sql hgsql mm7 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' # kgProtAlias reads from mm7.knownGene.name, # 
mm7.knownGene.proteinID, mm7.knownGene.alignID, # proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb # to create kgProtAlias.tab kgProtAlias mm7 050415 hgsql mm7 -N -e \ 'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\ | sort -u >kgProtAliasNCBI.tab # include variant splice protein IDs hgsql mm7 -N -e \ 'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\ |sort -u >kgProtAliasDup.tab # include duplicate protein IDs from dupSpMrna table hgsql mm7 -N -e \ 'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\ |sort -u >>kgProtAliasDup.tab # catch parent acc from dupProteinID too hgsql mm7 -N -e\ 'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\ |sort -u >>kgProtAliasDup.tab cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab echo "`date` creating table kgProtAlias" hgsql mm7 -e "drop table kgProtAlias;" hgsql mm7 <~/src/hg/lib/kgProtAlias.sql; hgsql mm7 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;' # Build kgSpAlias table hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >mm7.kgSpAlias.tab rm j.tmp hgsql mm7 -e 'drop table kgSpAlias'; hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias' # MAKE FOLDUTR TABLES # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev cd /cluster/data/mm7/bed mkdir rnaStruct.2005-06-05 rm rnaStruct ln -s rnaStruct.2005-06-05 rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa mm7 knownGene utr3 utr3/utr.fa utrFa mm7 knownGene utr5 
utr5/utr.fa # Split up files and make files that define job. ssh kk cd /cluster/data/mm7/bed/rnaStruct faSplit sequence utr3/utr.fa 50000 utr3/split/s faSplit sequence utr5/utr.fa 50000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < pepAll.sql # delete from the following tables (previously built): hgsql mm7 delete from pepCCntDist ; delete from pepExonCntDist ; delete from pepHydroDist ; delete from pepIPCntDist ; delete from pepMolWtDist ; delete from pepMwAa ; delete from pepPi ; delete from pepPiDist ; delete from pepPred ; delete from pepResDist ; delete from pbAnomLimit; delete from pbResAvgStd; delete from pbStamp; quit; # Build the pepMwAa table hgsql proteins050415 -e \ "select info.acc, molWeight, aaSize from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab hgsql mm7 -e 'load data local infile "pepMwAa.tab" into table mm7.pepMwAa ignore 1 lines;' # Build the pepPi table hgsql proteins050415 -e "select info.acc from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis pbCalPi protAcc.lis sp050415 pepPi.tab hgsql mm7 -e 'load data local infile "pepPi.tab" into table mm7.pepPi;' # Calculate and load pep distributions pbCalDist sp050415 proteins050415 10090 mm7 >pbCalDist.out cat pbCalDist.out wc pbCalDist.out hgsql mm7 load data local infile "pepExonCntDist.tab" into table mm7.pepExonCntDist; load data local infile "pepCCntDist.tab" into table mm7.pepCCntDist; load data local infile "pepHydroDist.tab" into table mm7.pepHydroDist; load data local infile "pepMolWtDist.tab" into table mm7.pepMolWtDist; load data local infile "pepResDist.tab" into table mm7.pepResDist; load data local infile "pepIPCntDist.tab" into table mm7.pepIPCntDist; load data local infile "pepPiDist.tab" into table mm7.pepPiDist; quit # Calculate frequency distributions pbCalResStd sp050415 10090 mm7 # Create pbAnomLimit and 
pbResAvgStd tables # hgsql mm7 < ~/src/hg/lib/pbAnomLimit.sql # hgsql mm7 < ~/src/hg/lib/pbResAvgStd.sql hgsql mm7 -e 'load data local infile "pbResAvgStd.tab" into table mm7.pbResAvgStd;' hgsql mm7 -e 'load data local infile "pbAnomLimit.tab" into table mm7.pbAnomLimit;' # UPDATE kgSpAlias TABLE TO BE USED BY PB cd /cluster/data/mm7/bed/pb hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >mm7.kgSpAlias.tab rm j.tmp hgsql mm7 -e 'drop table kgSpAlias'; hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias' gzip mm7.kgSpAlias.tab # Create pbStamp table for PB hgsql mm7 < ~/src/hg/lib/pbStamp.sql hgsql mm5 -N -e 'select * from pbStamp' > pbStamp.tab hgsql mm7 -e 'delete from pbStamp' hgsql mm7 -e 'load data local infile "pbStamp.tab" into table mm7.pbStamp' # ENABLE PROTEOME BROWSER FOR mm7 IN HGCENTRALTEST (already done previously) echo "update dbDb set hgPbOk = 1 where name = 'mm7';" \ | hgsql -h genome-testdb hgcentraltest # Adjust drawing parameters for Proteome Browser stamps Now invoke Proteome Browser and adjust various drawing parameters (mostly the ymax of each stamp) if necessary, by updating the pbStamp.tab file and then delete and reload the pbStamp table. # Perform preliminary review of Proteome Browser for mm7, then notify QA for formal review. 
# Update default Browser position # bring up mySQL on genome-testdb and use hgcentraltest DB (done previously): update dbDb set defaultPos="chrX:87947304-87959012" where name="mm7"; # Create QA Push Queue entry with the following tables: ceBlastTab cgapAlias cgapBiocDesc cgapBiocPathway dmBlastTab drBlastTab dupSpMrna foldUtr3 foldUtr5 gnfAtlas2Distance hgBlastTab keggMapDesc keggPathway kgAlias kgProtAlias kgProtMap kgXref knownBlastTab knownCanonical knownGene knownGeneMrna knownGenePep knownIsoforms knownToGenePix knownToGnf1m knownToGnfAtlas2 knownToLocusLink knownToMOE430 knownToMOE430A knownToPfam knownToRefSeq knownToU74 knownToXmBest rinnSex rnBlastTab scBlastTab spMrna # END OF mm7 KG/GS/PB RE-BUILD. 6/6/05 Fan. ##################################################################### ## NIA Mouse Gene Index - (TBD) # requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov ssh hgwdev mkdir -p /cluster/data/mm7/bed/NIAGene cd /cluster/data/mm7/bed/NIAGene wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-psl.txt.gz cut -f 1-21 T-psl.txt >NIAGene.tab hgLoadPsl mm7 NIAGene.tab wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-fasta.fa.gz gzip -d T-fasta.fa.gz mkdir /gbdb/mm7/NIAGene ln -s /cluster/data/mm7/bed/NIAGene/T-fasta.fa /gbdb/mm7/NIAGene/T-fasta.fa hgLoadSeq mm7 /gbdb/mm7/NIAGene/T-fasta.fa Create/edit/check in NIAGene.html and trackDb.ra under kent/src/hg/makeDb/trackDb/mouse/mm7 # Update mrnaRefseq table (TBD) # The old table contains non-mouse mrna/RefSeqs. # The new table contains only mouse mrna/RefSeq and RefSeq/RefSeq. # First build entrez DB tables, see the section on mrnaRefseq earlier # for details. ssh hgwdev cd /cluster/store10/kg/kgMm6B hgsql entrez -N -e \ 'select mrna, refseq from entrezRefseq, entrezMrna, mm7.all_mrna where entrezRefseq.geneID=entrezMrna.geneID and mrna=all_mrna.qName' \ >mrnaRefseq1.tab # Include RefSeq as valid mRNA too. 
hgsql mm7 -N -e 'select name, name from refGene' >mrnaRefseq2.tab cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab hgsql mm7 -e 'drop table mrnaRefseq' hgsql mm7 < ~/src/hg/lib/mrnaRefseq.sql hgsql mm7 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq' # BUILD KNOWN GENE LIST FOR GOOGLE. TBD) cd /cluster/data/mm7/bed rm -rf knownGeneList/mm7 # Run hgKnownGeneList to generate the tree of HTML pages # under ./knownGeneList/mm7 hgKnownGeneList mm7 # copy over to /usr/local/apache/htdocs rm -rf /usr/local/apache/htdocs/knownGeneList/mm7 mkdir -p /usr/local/apache/htdocs/knownGeneList/mm7 cp -Rfp knownGeneList/mm7/* /usr/local/apache/htdocs/knownGeneList/mm7 # Build kgReactome table for KG to Reactome xref. Done 6/28/05 Fan. # First, make sure the reactome DB is built. See makeHg17.doc for details. ssh hgwdev mkdir -p /cluster/data/mm7/bed/reactome cd /cluster/data/mm7/bed/reactome hgsql reactome -N -e 'select kgId, spID, DB_ID from ReferenceEntity, mm7.kgXref where identifier=spID' >kgReactome.tab; hgsql mm7 -e 'drop table kgReactome' hgsql mm7 < ~/src/hg/lib/kgReactome.sql hgsql mm7 -e 'load data local infile "kgReactome.tab" into table kgReactome' ############################################################################# # miRNA track (DONE - 2005-10-27 - Hiram) # data from: Michel.Weber@ibcg.biotoul.fr # notify them when done. 
cd /cluster/data/mm7/bed mkdir miRNA cd miRNA save miRNA_track_mm7.txt file from email cp miRNA_track_mm7.txt miRNA.bed # edit miRNA.bed to get rid of the top field description lines hgLoadBed -strict mm7 miRNA miRNA.bed # check previous release track before update featureBits mm7 miRNA # 20620 bases of 2583394090 (0.001%) in intersection featureBits mm6 miRNA # 21167 bases of 2597150411 (0.001%) in intersection featureBits mm5 miRNA # 17957 bases of 2615483787 (0.001%) in intersection ############################################################################# # ADDED THE EXONPRIMER TO QUICK LINKS SECTION OF KG DETAILS PAGE (05/07/11, Fan) # Added the following lines to links.ra under src/hg/hgGene/hgGeneData/Mouse/mm7 name exonPrimer shortLabel ExonPrimer tables kgXref idSql select kgID from kgXref where kgID = '%s' url http://ihg.gsf.de/cgi-bin/primer/ExonPrimerUCSC.pl?db=mm7&acc=%s priority 95 # REBUILT knownToPfam TABLE TO ALLOW KG REPRESENTED BY VARIANT SPLICE PROTEINS MAPPED TO PFAM (TBD) # hgMapViaSwissProt.c was updated to support this. 
# Create table that maps between known genes and Pfam domains ~/bin/i386/hgMapViaSwissProt mm7 knownGene name proteinID Pfam knownToPfam # row count is 24650 # SCDb CLONES (7/12/2005 Andy) cd /cluster/data/mm7/bed mkdir blat.SCDb-07-05-2005 cd blat.SCDb-07-05-2005/ ln -s `pwd` ~/scdb pushd /santest/scratch/andy wget http://stemcell.princeton.edu/download/scdb.fa.gz mkdir scdb faSplit sequence scdb.fa.gz 80 scdb/scdb_ popd find /santest/scratch/andy/scdb -type f > scdb.lst find /panasas/store/mm7/nib -type f > mm7.lst cat << "_EOF_" > blat.sh #!/bin/bash cdir=${3%/*} mkdir -p $cdir blat -q=dna -t=dna -noHead -ooc=/iscratch/i/mm7/ooc/11.ooc $1 $2 $3 _EOF_ cat << "_EOF_" > gsub #LOOP ./blat.sh {check in exists $(path2)} {check in line+ $(path1)} {check out line /cluster/bluearc/andy/scdb.psl/$(root2)/$(root2)_$(root1).psl} #ENDLOOP _EOF_ chmod +x blat.sh ssh kk cd /cluster/data/mm7/bed/blat.SCDb-07-05-2005 gensub2 scdb.lst mm7.lst gsub spec para create spec para try para push para time #Completed: 3200 of 3200 jobs #CPU time in finished jobs: 24158s 402.64m 6.71h 0.28d 0.001 y #IO & Wait Time: 14437s 240.61m 4.01h 0.17d 0.000 y #Average job time: 12s 0.20m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 244s 4.07m 0.07h 0.00d #Submission to last job: 727s 12.12m 0.20h 0.01d ssh hgwdev cd /cluster/data/mm7/bed/blat.SCDb-07-05-2005 # See if things check out. find /cluster/bluearc/andy/scdb.psl -type f -exec cat '{}' ';' > scdb.all.psl pslReps -singleHit scdb.all.psl scdb.best.psl info.psr # All the original names grep '>' scdb.fa | sed 's/^>//' | cut -f1 -d' ' | sort | uniq > names.scdb # All the names from ones that hit. cut -f10 scdb.all.psl | sort | uniq > all.names.scdb # All the ones with a "best" hit. cut -f10 scdb.best.psl | sort | uniq > best.names.scdb # Yeah a bunch of them (4,443/37,386) are missing. It seems many of the # clones aren't from mouse anyways. 
mkdir ../scdb cp scdb.best.psl ../scdb/scdb.psl cp scdb.fa ../scdb/ cp best.names.scdb ../scdb/ cd ../scdb/ faSomeRecords scdb.fa best.names.scdb scdb.best.fa rm scdb.fa mkdir /gbdb/mm7/scdb ln -s /cluster/data/mm7/bed/scdb/scdb.best.fa /gbdb/mm7/scdb/scdb.fa hgLoadSeq mm7 /gbdb/mm7/scdb/scdb.fa # clean up the names... basically take the middle part out. sed 's/SC|\([^|]\+\)|[0-9]\+/\1/' scdb.best.fa > new.scdb.best.fa sed 's/SC|\([^|]\+\)|[0-9]\+/\1/' scdb.psl > new.scdb.psl mv scdb.psl old.scdb.psl mv new.scdb.psl scdb.psl mv scdb.best.fa old.scdb.best.fa mv new.scdb.best.fa scdb.best.fa hgLoadPsl -table=scdb mm7 scdb.psl hgLoadSeq mm7 /gbdb/mm7/scdb/scdb.fa #Warning: load of seq did not go as planned: 37381 record(s), 1 row(s) skipped, 0 warning(s) loading ./seq.tab # Oh well. # Update 7/26/2005: I'm going more restrictive on the pslReps. ssh hgwdev cd /cluster/data/mm7/bed/blat.SCDb-07-05-2005 pslReps -minCover=0.8 -singleHit scdb.all.psl tmp scdb.psr sed 's/SC|\([^|]\+\)|[0-9]\+/\1/' tmp > scdb.psl rm tmp hgLoadPsl mm7 scdb.psl ## REBUILD NIA Mouse Gene Index - (TBD) # requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov ssh hgwdev cd /cluster/data/mm7/bed mv NIAGene NIAGene_050621 mkdir NIAGene wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-psl.txt.gz cut -f 1-21 T-psl.txt >NIAGene.tab hgLoadPsl mm7 NIAGene.tab wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-fasta.fa.gz gzip -d T-fasta.fa.gz rm /gbdb/mm7/NIAGene/T-fasta.fa ln -s /cluster/data/mm7/bed/NIAGene/T-fasta.fa /gbdb/mm7/NIAGene/T-fasta.fa # Load the sequences. PLEASE NOTE THE "-replace" OPTION SHOULD BE USED!!! 
hgLoadSeq -replace mm7 /gbdb/mm7/NIAGene/T-fasta.fa ############################################################################ # BLASTZ ZEBRAFISH (danRer3) (DONE - 2005-09-19 - 2005-11-21 - Hiram) # Create a single 2bit file that has the full chroms and the # random contigs from chrUn and chrNA ssh kkstore02 cd /cluster/data/danRer3/ faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \ Un/scaffoldUn.fa NA/scaffoldNA.fa \ chrUnNA.2bit # Verify the sequence hasn't been damaged: twoBitToFa danRer3.2bit stdout | faSize stdin # 1644032962 bases (48201758 N's 1595831204 real 816464533 upper # 779366671 lower) in 28 sequences in 1 files twoBitToFa chrUnNA.2bit stdout | faSize stdin # 1636563462 bases (40732258 N's 1595831204 real 816464533 upper # 779366671 lower) in 14967 sequences in 1 files # Note, only the N's are different # 48201758 - 40732258 = 7469500 # 1644032962 - 1636563462 = 7469500 # Copy to bluearc: cp -p chrUnNA.2bit /cluster/bluearc/danRer3 twoBitInfo chrUnNA.2bit chrUnNA.sizes cp -p chrUnNA.sizes /cluster/bluearc/danRer3 cp -p liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \ /cluster/bluearc/danRer3 ssh pk mkdir /cluster/data/mm7/bed/blastzDanRer3.2005-09-19 cd /cluster/data/mm7/bed ln -s blastzDanRer3.2005-09-19 blastz.danRer3 cd blastzDanRer3.2005-09-19 # use parameters as for mm5 - see makeMm5.doc cat << '_EOF_' > DEF # mouse (mm7) vs zebrafish (danRer3) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.x86_64 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Mouse (mm7) # small enough chunk to get reasonable running time SEQ1_DIR=/scratch/hg/mm7/mm7.2bit SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_CTGDIR=/scratch/hg/mm7/mm7Chroms_RandomContigs.2bit SEQ1_CTGLEN=/scratch/hg/mm7/mm7Chroms_RandomContigs.sizes SEQ1_LIFT=/scratch/hg/mm7/Chroms_RandomContigs.lft 
SEQ1_LIMIT=30 SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=1000000 SEQ1_LAP=5000 # QUERY: Zebrafish (danRer3) # large enough chunk to do all genome in one piece SEQ2_DIR=/cluster/bluearc/danRer3/danRer3.2bit SEQ2_LEN=/cluster/bluearc/danRer3/chrom.sizes SEQ2_CTGDIR=/cluster/bluearc/danRer3/chrUnNA.2bit SEQ2_CTGLEN=/cluster/bluearc/danRer3/chrUnNA.sizes SEQ2_LIFT=/cluster/bluearc/danRer3/liftNAandUnScaffoldsToChrom.lft SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=1700000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzDanRer3.2005-09-19 TMPDIR=/scratch/tmp '_EOF_' # < happy emacs time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=5000 -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & XXX - Started - 2005-10-07 12:13 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=chainMerge -chainMinScore=5000 -bigClusterHub=pk \ -fileServer=kolossus \ `pwd`/DEF > chainMerge.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -chainMinScore=5000 -bigClusterHub=pk \ -fileServer=kolossus \ `pwd`/DEF > swap.out 2>&1 & # featureBits -chrom=chr1 mm7 refGene:cds chainDanRer3Link -enrichment # refGene:cds 0.810%, chainDanRer3Link 2.365%, both 0.536%, cover 66.19%, # enrich 27.99x # featureBits -chrom=chr1 mm6 refGene:cds chainDanRer3Link -enrichment # refGene:cds 0.808%, chainDanRer3Link 5.196%, both 0.522%, cover 64.64%, # enrich 12.44x # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2Link -enrichment # refGene:cds 0.818%, chainDanRer2Link 2.058%, both 0.546%, cover 66.75%, # enrich 32.43x # featureBits -chrom=chr1 mm6 refGene:cds chainDanRer2Link -enrichment # refGene:cds 0.808%, chainDanRer2Link 6.412%, both 0.542%, cover 67.04%, # enrich 10.46x time featureBits mm7 chainDanRer3Link # 69591785 bases of 2583394090 (2.694%) in intersection time featureBits mm6 chainDanRer3Link # 134615477 bases of 2597150411 (5.183%) in intersection time featureBits danRer3 chainMm7Link # 71277532 bases of 1630323462 (4.372%) in intersection time featureBits danRer2 
chainMm6Link # 176391894 bases of 1560497282 (11.304%) in intersection # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzDanRer3.2005-09-19 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzDanRer3.2005-09-19 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & # real 186m35.984s time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > rescoreChainDownload.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > rescoreChainDownloadSwap.out 2>&1 & # Measurements: ssh kolossus cd /cluster/data/mm7/bed/blastzDanRer3.2005-09-19 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainDanRer3Link > fb.mm7.chainDanRer3Link.rescore 2>&1 cat fb.mm7.chainDanRer3Link.rescore # 69591785 bases of 2583394090 (2.694%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits danRer3 \ chainMm7Link > fb.danRer3.chainMm7Link.rescore 2>&1 cat fb.danRer3.chainMm7Link.rescore # 71277532 bases of 1630323462 (4.372%) in intersection ########################################################################### #### BUILD SUPERFAMILY RELATED TABLES (TBD) # Download Superfamily data files and build the Superfamily DB # from supfam.mrc-lmb.cam.ac.uk mkdir -p /cluster/store11/superfamily/050817 ln -s /cluster/store11/superfamily/050817 /cluster/data/superfamily/050817 cd /cluster/data/superfamily/050817 # add the following line to ~/.netrc machine supfam.mrc-lmb.cam.ac.uk login license password XXXXX # ftp over to 
supfam.mrc-lmb.cam.ac.uk and get the following two files: supfam_14-Aug-2005.sql.gz ass_14-Aug-2005.tab.gz gzip -d *.gz # Load the Superfamily database hgsql mm7 -e "create database superfam050817" zcat supfam_14-Aug-2005.sql.gz | hgsql superfam050817 # This may take about an hour. # Make sure to add an index on id of the des table of superfam050817. hgsql superfam050817 -e "create index id on des(id);" gzip -d ass_14-Aug-2005.tab.gz hgsql superfam050817 < ~/src/hg/lib/sfAssign.sql hgsql superfam050817 -e \ 'load data local infile "ass_14-Aug-2005.tab" into table superfam050817.sfAssign;' # Build or rebuild Superfamily track and create sf tables needed for PB hgsql mm7 < ~/src/hg/lib/sfAssign.sql cd /cluster/data/superfamily/050817 hgsql mm7 -e 'load data local infile "ass_14-Aug-2005.tab" into table mm7.sfAssign;' # If mm7.sfDes already exists, drop it. hgsql superfam050817 -e "select * from des" >sfDes.tab hgsql mm7 < ~/src/hg/lib/sfDes.sql hgsql mm7 -e 'load data local infile "sfDes.tab" into table mm7.sfDes ignore 1 lines;' # If mm7.superfamily already exists, drop it. cd /cluster/data/mm7/bed mkdir /cluster/data/mm7/sf.2005-0817 ln -s sf.2005-0817 sf hgSuperfam mm7 superfam050817 > sf.log # It is normal that many proteins do not have corresponding Superfamily entries. # If mm7.sfDescription exists, drop it. hgsql mm7 < ~/src/hg/lib/sfDescription.sql hgsql mm7 -e 'LOAD DATA local INFILE "sfDescription.tab" into table mm7.sfDescription;' # Finally, load the superfamily table. hgLoadBed -strict mm7 superfamily superfamily.tab -tab # Create knownToSuperfamily table # Note hs is changed into ht for this Superfamily release. 
cat /cluster/data/superfamily/050817/ass_14-Aug-2005.tab | hgKnownToSuper mm7 mm stdin # 21185 records output ########################################################################### # BLASTZ canFam2 second time (WORKING - 2005-11-14 Hiram) # After fixing a bug in the lineage specific repeat snip business # in blastz-run-ucsc script ssh pk mkdir /cluster/data/mm7/bed/blastzCanFam2.2005-11-14 cd /cluster/data/mm7/bed rm blastz.canFam2 ln -s blastzCanFam2.2005-11-14 blastz.canFam2 cd blastzCanFam2.2005-11-14 cat << '_EOF_' > DEF # mouse vs dog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm7 SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_SMSK=/scratch/hg/mm7/linSpecRep/notInHumanDogCow SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog CanFam2 - chunk big enough to do all chroms in single whole pieces SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzCanFam2.2005-11-14 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -stop=net \ `pwd`/DEF > blastz-to-net.out 2>&1 & XXXX - Started 2005-11-14 14:50 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -continue=load -stop=load \ `pwd`/DEF > load.out 2>&1 & featureBits mm7 chainCanFam2Link # 842454846 bases of 2583394090 (32.610%) in intersection time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -swap -stop=load `pwd`/DEF > swap-to-load.out 2>&1 & # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzCanFam2.2005-11-14 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzCanFam2.2005-11-14 time 
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=load -stop=load \ `pwd`/DEF > rescoreChainSwapLoad.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > rescoreChainDownload.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > rescoreChainDownloadSwap.out 2>&1 & # Measurements: ssh kolossus cd /cluster/data/mm7/bed/blastzCanFam2.2005-11-14 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainCanFam2Link > fb.mm7.chainCanFam2Link.rescore 2>&1 cat fb.mm7.chainCanFam2Link.rescore # 833554390 bases of 2583394090 (32.266%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits canFam2 \ chainMm7Link > fb.canFam2.chainMm7Link.rescore 2>&1 cat fb.canFam2.chainMm7Link.rescore # 814108570 bases of 2384996543 (34.135%) in intersection ########################################################################### # BLASTZ/CHAIN/NET CANFAM2 (DONE - 2005-09-16 - 2005-10-12- Hiram) ssh kkstore02 mkdir /cluster/data/mm7/bed/blastz.canFam2.2005-09-16 cd /cluster/data/mm7/bed ln -s blastz.canFam2.2005-09-16 blastz.canFam2 cd blastz.canFam2.2005-09-16 cat << '_EOF_' > DEF # mouse vs dog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm7 
SEQ1_DIR=/cluster/bluearc/mm7/nib SEQ1_SMSK=/cluster/bluearc/mm7/linSpecRep/notInHumanDogCow SEQ1_LEN=/cluster/bluearc/mm7/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog CanFam2 - chunk big enough to do all chroms in single whole pieces SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastz.canFam2.2005-09-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs ssh pk # establish a screen to control this job screen cd /cluster/data/mm7/bed/blastz.canFam2.2005-09-16 time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -stop=load \ `pwd`/DEF > thruLoad.out 2>&1 & XXX - STARTED - 2005-09-16 15:50 real 749m31.156s user 0m0.116s sys 0m0.103s time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -continue=chainMerge -stop=load \ `pwd`/DEF > chainMergeThruLoad.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh pk screen -d -r time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -continue=download \ `pwd`/DEF > download-cleanup.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -swap -fileServer=kolossus \ `pwd`/DEF > swap.out 2>&1 & time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainCanFam2Link # 791921556 bases of 2583394090 (30.654%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits canFam2 chainMm7Link # 773143256 bases of 2384996543 (32.417%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits canFam2 chainMm6Link # 780509502 bases of 2384996543 (32.726%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainCanFam1Link # 798637320 bases of 2597150411 (30.751%) in intersection ############################################################################ # UPDATE miRNA track (TBD) # data from: 
Michel.Weber@ibcg.biotoul.fr # notify them when done. cd /cluster/data/mm7/bed cd miRNA mkdir old cp -p * old rm * # save miRNA_track_mm7_aug2005.txt file from email cp miRNA_track_mm7_aug2005.txt miRNA.tab vi miRNA.tab # edit miRNA.tab to get rid of the top description lines # and a few blank lines hgLoadBed -strict mm7 miRNA miRNA.tab # check previous release track before update nice featureBits mm5 miRNA # 17957 bases of 2615483787 (0.001%) in intersection nice featureBits mm7 miRNA # 20898 bases of 2597150411 (0.001%) in intersection ####################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2005-09-08 - Hiram) ssh kkstore02 cd /cluster/data/mm7 # Size of mouse non-gap genome: 2583394090 # Size of Hg17 non-gap genome: 2866216770 # Adjusting the 1024 number from typical human ooc generation: # 1024 * (2583394090 / 2866216770) = 923 time blat mm7.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=923 # Wrote 29368 overused 11-mers to 11.ooc # real 2m41.929 # Copy over to the bluearc cp -p 11.ooc /cluster/bluearc/mm7 ############################################################################# # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2005-09-15 - Hiram) ssh kkstore02 mkdir /cluster/data/mm7/rmsk cd /cluster/data/mm7 cp -p */chr*.fa.out rmsk cd rmsk for FN in chr*.fa.out do echo ${FN} /cluster/bluearc/RepeatMasker050305/DateRepeats \ ${FN} -query mouse -comp human -comp rat -comp dog -comp cow done # takes about 30 minutes cd /cluster/data/mm7 mkdir linSpecRep cd linSpecRep mkdir notInHuman mkdir notInRat mkdir notInDog mkdir notInCow cd /cluster/data/mm7 for F in rmsk/chr*.out_homo-sapiens* do B=${F/rmsk\/} B=${B/.fa.out*/} echo $B /cluster/bin/scripts/extractRepeats 1 ${F} > \ linSpecRep/notInHuman/${B}.out.spec /cluster/bin/scripts/extractRepeats 2 ${F} > \ linSpecRep/notInRat/${B}.out.spec /cluster/bin/scripts/extractRepeats 3 ${F} > \ linSpecRep/notInDog/${B}.out.spec 
/cluster/bin/scripts/extractRepeats 4 ${F} > \ linSpecRep/notInCow/${B}.out.spec done # the notInHuman, notInDog, and notInCow ended up being identical # To check identical cd linSpecRep find . -type f | \ while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \ | sort -u -k1,1n # Copy to bluearc for use in kluster runs mkdir /cluster/bluearc/mm7/linSpecRep cd /cluster/bluearc/mm7/linSpecRep mkdir notInOthers notInRat cp -p /cluster/data/mm7/linSpecRep/notInHuman/* ./notInOthers cp -p /cluster/data/mm7/linSpecRep/notInRat/* ./notInRat ############################################################################# # macaca mulatta vs. Mm7 (WORKING - 2005-10-19 - Hiram) ssh kk mkdir /cluster/data/mm7/bed/blastzRheMac1.2005-10-19 cd /cluster/data/mm7/bed ln -s blastzRheMac1.2005-10-19 blastz.rheMac1 cd blastzRheMac1.2005-10-19 cat << '_EOF_' > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7 # TARGET - Mm7 SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - Macaca mulatta rheMac1 SEQ2_DIR=/scratch/hg/rheMac1/rheMac1.2bit SEQ2_LEN=/scratch/hg/rheMac1/chrom.sizes SEQ2_LIMIT=300 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzRheMac1.2005-10-19 '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk \ `pwd`/DEF > blastz.out 2>&1 & ssh kolossus HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainRheMac1Link # 886364754 bases of 2583394090 (34.310%) in intersection time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=kk \ `pwd`/DEF > swap.out 2>&1 & # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzRheMac1.2005-10-19 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzRheMac1.2005-10-19 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ 
-continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=load -stop=load \ `pwd`/DEF > rescoreChainSwapLoad.out 2>&1 & # Measurements: ssh kolossus cd /cluster/data/mm7/bed/blastzRheMac1.2005-10-19 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainRheMac1Link > fb.mm7.chainRheMac1Link.rescore 2>&1 cat fb.mm7.chainRheMac1Link.rescore # 875709370 bases of 2583394090 (33.898%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits rheMac1 \ chainMm7Link > fb.rheMac1.chainMm7Link.rescore 2>&1 cat fb.rheMac1.chainMm7Link.rescore # 862032750 bases of 2691926638 (32.023%) in intersection time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > rescoreChainDownload.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > rescoreChainSwapDownload.out 2>&1 & ############################################################################# # Elephant vs. 
Mm7 (WORKING - 2005-10-20 - Hiram) ssh pk mkdir /cluster/data/mm7/bed/blastzLoxAfr1.2005-10-20 cd /cluster/data/mm7/bed ln -s blastzLoxAfr1.2005-10-20 blastz.loxAfr1 cd blastzLoxAfr1.2005-10-20 cat << '_EOF_' > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # TARGET - Mouse mm7 SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY - Elephant loxAfr1 SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit SEQ2_LEN=/scratch/hg/loxAfr1/chrom.sizes SEQ2_LIMIT=300 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzLoxAfr1.2005-10-20 '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=cat -bigClusterHub=kk \ `pwd`/DEF > continue.cat.out 2>&1 & ssh kolossus HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainLoxAfr1Link # 480074381 bases of 2583394090 (18.583%) in intersection time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=kk \ `pwd`/DEF > swap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=load -swap -bigClusterHub=kk \ `pwd`/DEF > load.swap.out 2>&1 & ssh hgwdev time featureBits loxAfr1 chainMm7Link # 477557610 bases of 2295548473 (20.804%) in intersection # real 4536m4.614s # user 22m50.620s # sys 9m38.420s # *!*! NOTE: That is 75.6 hours running time !!! 
# re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzLoxAfr1.2005-10-20 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzLoxAfr1.2005-10-20 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=net -stop=load \ `pwd`/DEF > rescoreChainSwap.net-to-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=load \ `pwd`/DEF > rescoreChainSwapLoad.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > rescoreChainDownload.out 2>&1 & ssh kolossus cd /cluster/data/mm7/bed/blastz.loxAfr1 time HGDB_CONF=~/.hg.conf.read-only featureBits loxAfr1 \ chainMm7Link > fb.loxAfr1.chainMm7Link.rescore 2>&1 # real 4067m1.699s # user 5m42.134s # sys 2m9.896s # This is over 67 hours ! cat fb.loxAfr1.chainMm7Link.rescore # 471844339 bases of 2295548473 (20.555%) in intersection ############################################################################# # Tenrec vs. 
Mm7 (WORKING - 2005-10-20 - Hiram) ssh pk mkdir /cluster/data/mm7/bed/blastzEchTel1.2005-10-20 cd /cluster/data/mm7/bed ln -s blastzEchTel1.2005-10-20 blastz.echTel1 cd blastzEchTel1.2005-10-20 cat << '_EOF_' > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # TARGET - Mouse mm7 SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY - Tenrec echTel1 SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzEchTel1.2005-10-20 '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=cat -bigClusterHub=pk \ `pwd`/DEF > continue.cat.out 2>&1 & ssh kolossus HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainEchTel1Link # 296238377 bases of 2583394090 (11.467%) in intersection time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk \ `pwd`/DEF > swap.out 2>&1 & ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits echTel1 \ chainMm7Link > fb.echTel1.chainMm7Link 2>&1 # real 3973m1.497s # user 3m19.311s # sys 1m24.524s # That is ~ 66 hours # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzEchTel1.2005-10-20 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzEchTel1.2005-10-20 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 
-chainLinearGap=loose \ -swap -continue=load \ `pwd`/DEF > rescoreChainSwapLoad.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > rescoreChainDownload.out 2>&1 & real 5472m11.402s ############################################################################# # Rabbit vs. Mm7 (WORKING - 2005-10-20 - Hiram) ssh pk mkdir /cluster/data/mm7/bed/blastzOryCun1.2005-10-20 cd /cluster/data/mm7/bed ln -s blastzOryCun1.2005-10-20 blastz.oryCun1 cd blastzOryCun1.2005-10-20 cat << '_EOF_' > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # TARGET - Mouse mm7 SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY - Rabbit oryCun1 SEQ2_DIR=/scratch/hg/oryCun1/oryCun1.2bit SEQ2_LEN=/scratch/hg/oryCun1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzOryCun1.2005-10-20 '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=cat -bigClusterHub=pk \ `pwd`/DEF > continue.cat.out 2>&1 & ssh kolossus HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainOryCun1Link # 503550390 bases of 2583394090 (19.492%) in intersection time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk \ `pwd`/DEF > swap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=load -swap -bigClusterHub=kk \ `pwd`/DEF > load.swap.out 2>&1 & ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits oryCun1 \ chainMm7Link >fb.oryCun1.chainMm7Link 2>&1 # real 7258m58.416s # user 3m38.961s # sys 1m41.555s # That is ~ 121 Hours ! 
# re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzOryCun1.2005-10-20 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzOryCun1.2005-10-20 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & # real 101m1.441s time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=load \ `pwd`/DEF > rescoreChainSwapLoad.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > rescoreChainDownload.out 2>&1 & ssh kolossus cd /cluster/data/mm7/bed/blastz.oryCun1 time HGDB_CONF=~/.hg.conf.read-only featureBits \ oryCun1 chainMm7Link > fb.oryCun1.chainMm7Link.rescore 2>&1 # real 3560m14.127s # user 3m23.400s # sys 1m23.287s # That is over 59 hours ! cat fb.oryCun1.chainMm7Link.rescore # 501663640 bases of 2076044328 (24.164%) in intersection ############################################################################# # Armadillo vs. 
Mm7 (WORKING - 2005-10-20 - Hiram) ssh pk mkdir /cluster/data/mm7/bed/blastzDasNov1.2005-10-20 cd /cluster/data/mm7/bed ln -s blastzDasNov1.2005-10-20 blastz.dasNov1 cd blastzDasNov1.2005-10-20 cat << '_EOF_' > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # TARGET - Mouse mm7 SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY - Armadillo dasNov1 SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit SEQ2_LEN=/scratch/hg/dasNov1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzDasNov1.2005-10-20 '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=cat -bigClusterHub=pk \ `pwd`/DEF > continue.cat.out 2>&1 & ssh kolossus HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainDasNov1Link # 438537191 bases of 2583394090 (16.975%) in intersection time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -fileServer=kolossus \ `pwd`/DEF > swap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=net -swap -bigClusterHub=pk \ `pwd`/DEF > swap.net.out 2>&1 & ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits dasNov1 \ chainMm7Link > fb.dasNov1.chainMm7Link 2>&1 # real 8229m16.489s # user 2m56.146s # sys 1m20.550s # That is ~ 137 hours # re-scoring the chains ssh kkstore02 cd /cluster/data/mm7/bed/blastzDasNov1.2005-10-20 rm -fr axtNet mafNet axtChain ssh pk cd /cluster/data/mm7/bed/blastzDasNov1.2005-10-20 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainRun -stop=load \ `pwd`/DEF > rescoreChain.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ 
-swap -stop=load \ `pwd`/DEF > rescoreChainSwap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=net -stop=load \ `pwd`/DEF > rescoreChainSwap.net-to-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > rescoreChainDownload.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=load \ `pwd`/DEF > rescoreChainSwapLoad.out 2>&1 & real 5472m48.563s ############################################################################# # Create Allen Brain Atlas mapping. (DONE Nov 2005 JK) # Set up directory ssh kk cd /cluster/data/mm7/bed mkdir allenBrain cd allenBrain # Copy in allen20051021.tab file that was converted from # spreadsheet mailed by Susan Sunkin # Also copy in probeSeq.20051027.fasta, also from Susan. # Create a list of probe sequences filling in ones missing from probeSeq.20051027.fasta # with some NCBI and TIGR files, and some downloaded one at a time. allenCollectSeq allen20051021.tab probeSeq.20051027.fasta /cluster/data/mm6/bed/ncbiXm/ncbiNm.fa /cluster/data/mm6/bed/ncbiXm/ncbiXm.fa /cluster/data/mm6/bed/tigrMgiTc/tigrMgiTc.fa ~/kent/src/hg/makeDb/allenBrain/allenCollectSeq/extra.fa allProbes.fa allProbes.tab missing.tab allenBrainUrl.tab # Set up a blat run to align the probes. 
mkdir split faSplit sequence allProbes.fa 200 split/rp mkdir run cd run ls -1 ../split/*.fa > mrna.lst ls -1 /scratch/hg/mm7/nib/*.nib > genome.lst mkdir psl cat << '_EOF_' > gsub #LOOP blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 genome.lst mrna.lst gsub spec para create spec # Then do the usual para try/push/time/check until the run is finished # Then do sorting and near-best-in-genome step on file server ssh kkstore02 cd /cluster/data/mm7/bed/allenBrain/run pslSort dirs raw.psl tmp psl pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl # Clean up big files no longer needed rm raw.psl rm -r psl rm -r ../split # Load up database ssh hgwdev cd /cluster/data/mm7/bed/allenBrain # Make a new table that contains the URLs for the allen brain genes # Make this one first since all.joiner considers it the master table. hgsql mm7 < ~/kent/src/hg/lib/allenBrainUrl.sql hgsql mm7 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl;' # Make probe alignment table, and load sequence. hgLoadPsl mm7 allenBrainAli.psl mkdir /gbdb/mm7/allenBrain ln -s /cluster/data/mm7/bed/allenBrain/allProbes.fa /gbdb/mm7/allenBrain/allProbes.fa hgLoadSeq mm7 /gbdb/mm7/allenBrain/allProbes.fa # Make mapping between known genes and allenBrain hgMapToGene mm7 allenBrainAli -type=psl knownGene knownToAllenBrain ############################################################################ # BUILD KNOWN GENES RELATED TABLES for mm7 (STARTED 10/23/05, DONE 11/17/05. Fan) # First build protein databases, sp051015 and proteins051015 # See makeProteins051015.doc for details. # Please note that the protein and displayId tables in sp051015 have data of variant splice proteins. 
# Create working subdirectories and temporary databases ssh hgwdev cd /cluster/store11/kg mkdir kgMm7A ln -s /cluster/store11/kg/kgMm7A /cluster/store6/kgDB/bed/kgMm7A ln -s /cluster/store11/kg/kgMm7A /cluster/data/mm7/bed/kgMm7A hgsql mm7 -e "create database kgMm7A" hgsql mm7 -e "create database kgMm7ATemp" mkdir /cluster/bluearc/kgDB/kgMm7A mkdir /cluster/bluearc/kgDB/kgMm7A/protBlat ln -s /cluster/bluearc/kgDB/kgMm7A/protBlat /cluster/store11/kg/kgMm7A/protBlat cd /cluster/store11/kg/kgMm7A/protBlat # Get all protein sequences for mouse hgsql -N sp051015 -e \ 'select proteins051015.spXref3.accession,protein.val from proteins051015.spXref3,protein where division="10090" and acc=accession' \ |awk '{print ">" $1;print $2}' >mm7Prot.fa # Prepare and perform cluster run for protein/genome alignment ssh kk cd /cluster/data/mm7/bed/kgMm7A/protBlat mkdir prot faSplit sequence mm7Prot.fa 1000 prot/prot ls /cluster/bluearc/kgDB/kgMm7A/protBlat/prot/* > prot.lis ssh hgwdev cd /cluster/data/mm7/bed/kgMm7A/protBlat hgsql mm7 -N -e 'select chrom from chromInfo' > chrom.lis exit cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -noHead -t=dnax -q=prot /cluster/data/mm7/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm7A/protBlat/result/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' mkdir result gensub2 chrom.lis prot.lis gsub jobList para create jobList para try para check para push para check ... #Completed: 32449 of 39600 jobs #Crashed: 7151 jobs #CPU time in finished jobs: 36332462s 605541.04m 10092.35h 420.51d 1.152 y #IO & Wait Time: 802269s 13371.14m 222.85h 9.29d 0.025 y #Average job time: 1144s 19.07m 0.32h 0.01d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 34735s 578.92m 9.65h 0.40d #Submission to last job: 113157s 1885.95m 31.43h 1.31d #Estimated complete: 0s 0.00m 0.00h 0.00d #39600 jobs in batch # Many output .psl files are empty, these warnings are OK. # Check to see if there is any other error type. 
para problems |grep empty|wc # 7151 21453 559219 # collect BLAT results ssh hgwdev cd /cluster/data/mm7/bed/kgMm7A/protBlat mkdir result2 mkdir result3 cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall cat << '_EOF_' > do1.1 echo processing $1 cat result/$1_prot*.psl >result2/$1.psl '_EOF_' cat << '_EOF_' > do1.2 echo processing $1 pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out '_EOF_' chmod +x do* cp do1.1 do1 doall cp do1.2 do1 doall cat result3/*.psl >protBlat.psl hgLoadPsl mm7 protBlat.psl # Processing protBlat.psl # load of protBlat did not go as planned: 101748 record(s), 0 row(s) skipped, 880 warning(s) loading psl.tab # Looked into the cause of the warnings before and found that it was due to that qBaseInsert # and tBaseInsert have negative values, probably due to that this is protein alignment. # Remember to remove result2 and result3 when KG is built and validated. # VVVVVVVVVVVVVVVVVVVVVVVVVVVVV BELOW IS AN EXCURSION TO BLAT VAR SPLICE PROTEINS VVVVVVVVVVVVV # (Fan, STARTED 10/28/05, DONE 10/29/05) # FORGOT TO FINISH VAR-SPLICE PROTEINS PROCESSING DURING MAKE PROTEINS BUILD. # NEXT TIME, SHOULD NOT DO THIS. 
mkdir /cluster/bluearc/kgDB/kgMm7A/protBlatVar ln -s /cluster/bluearc/kgDB/kgMm7A/protBlatVar /cluster/store11/kg/kgMm7A/protBlatVar cd /cluster/store11/kg/kgMm7A/protBlatVar # Get all protein sequences for mouse hgsql -N sp051015 -e \ 'select proteins051015.spXref3.accession,protein.val from proteins051015.spXref3,protein where division="10090" and acc=accession and acc like "%-%"' \ |awk '{print ">" $1;print $2}' >mm7ProtVar.fa # Prepare and perform cluster run for protein/genome alignment ssh kk cd /cluster/data/mm7/bed/kgMm7A/protBlatVar mkdir protVar faSplit sequence mm7ProtVar.fa 100 protVar/protVar ls /cluster/bluearc/kgDB/kgMm7A/protBlatVar/protVar/* > protVar.lis ssh hgwdev cd /cluster/data/mm7/bed/kgMm7A/protBlatVar hgsql mm7 -N -e 'select chrom from chromInfo' > chrom.lis exit cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -noHead -t=dnax -q=prot /cluster/data/mm7/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm7A/protBlatVar/result/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' mkdir result gensub2 chrom.lis protVar.lis gsub jobList para create jobList para try para check para push para check ... # Completed: 2498 of 4000 jobs # Crashed: 1502 jobs # CPU time in finished jobs: 3234506s 53908.44m 898.47h 37.44d 0.103 y # IO & Wait Time: 47651s 794.18m 13.24h 0.55d 0.002 y # Average job time: 1314s 21.90m 0.36h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 16756s 279.27m 4.65h 0.19d # Submission to last job: 18976s 316.27m 5.27h 0.22d # Many output .psl files are empty, these warnings are OK. # Check to see if there is any other error type. 
para problems |grep empty|wc # 1502 4506 125061 # collect BLAT results ssh hgwdev cd /cluster/data/mm7/bed/kgMm7A/protBlatVar mkdir result2 mkdir result3 cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall cat << '_EOF_' > do1.1 echo processing $1 cat result/$1_prot*.psl >result2/$1.psl '_EOF_' cat << '_EOF_' > do1.2 echo processing $1 pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out '_EOF_' chmod +x do* cp do1.1 do1 doall cp do1.2 do1 doall cat result3/*.psl >protBlatVar.psl cat ../protBlat/protBlat.psl protBlatVar.psl >protBlatAll.psl hgLoadPsl -table=protBlat mm7 protBlatAll.psl # Processing protBlatAll.psl # load of protBlat did not go as planned: 104970 record(s), 0 row(s) skipped, 887 warning(s) loading psl.tab mv ../protBlat/protBlat.psl ../protBlat/protBlat.psl.orig cp -p protBlatAll.psl ../protBlat/protBlat.psl # Looked into the cause of the warnings before and found that it was due to that qBaseInsert # and tBaseInsert have negative values, probably due to that this is protein alignment. # Remember to remove result2 and result3 when KG is built and validated. 
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ cd /cluster/data/mm7/bed/kgMm7A # create all_mrna.psl and tight_mrna.psl hgsql mm7 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 all_mrna.psl tight_mrna.psl /dev/null # Processed 214613 alignments # Use overlapSelect to get protein and mRNA alignment overlaps overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \ -selectFmt=psl -inFmt=psl tight_mrna.psl protBlat/protBlat.psl protMrna.stat overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \ -inFmt=psl tight_mrna.psl protBlat/protBlat.psl protMrna.out # Create protein/mRNA pair and protein lists cut -f 10,31 protMrna.out|sort -u >spMrna.tab cut -f 10 protMrna.out|sort -u >protein.lis # Load spMrna.tab into spMrna table in temp DB. hgsql kgMm7ATemp -e 'drop table spMrna' hgsql kgMm7ATemp < ~/src/hg/lib/spMrna.sql hgsql kgMm7ATemp -e 'load data local infile "spMrna.tab" into table spMrna' hgsql kgMm7ATemp -e 'create index mrnaID on spMrna(mrnaID)' # Prepare and perform cluster run of protein/mRNA alignment # Get mRNA fa file. cd /cluster/data/mm7/bed/kgMm7A /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm7 \ -gbRoot=/cluster/data/genbank genbank mrna mrna.fa # Create mrnaSeq table in kgMm7ATemp DB. faToTab mrna.fa mrnaSeq.tab hgsql kgMm7ATemp -e 'drop table mrnaSeq' hgsql kgMm7ATemp <~/src/hg/lib/mrnaSeq.sql hgsql kgMm7ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq' rm mrnaSeq.tab # Prepare files for cluster run ??
~/src/hg/protein/KG2.sh kgMm7A mm7 051015 # Perform cluster run of protein/mRNA alignment ~/src/hg/protein/KG3.sh kgMm7A mm7 051015 # Collect cluster run results cd kgBestMrna ls out | sed -e 's/prot/do1 prot/g' >doall # create do1 with the following 2 lines: cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protMrnaRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis wc protMrna.lis # 225764 451528 3587320 protMrna.lis # Load BLAT results into temp DB. hgsql kgMm7ATemp < ~/src/hg/lib/protMrnaBlat.sql hgsql kgMm7ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat' hgsql kgMm7ATemp -e 'create index tName on protMrnaBlat(tName)' # Create CDS files from protein/mRNA alignment results. hgsql kgMm7ATemp -N -e \ 'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\ |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds # Create protMrna.psl with proteinID_mrnaID as query ID. cut -f 22-30 ../protMrna.out > j1.tmp cut -f 32-42 ../protMrna.out > j2.tmp cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp paste j1.tmp j3.tmp j2.tmp >protMrna.psl rm j1.tmp j2.tmp j3.tmp # Run mrnaToGene to create protMrna.gp bash mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log exit ############################################ # Redo prot-mRNA alignment using BLAST instead of BLAT, per Mark's recommendation. 
~/src/hg/protein/kgProtMrna.sh kgMm7A mm7 051015 # During the following job: # # ./kgProtMrnaBlast.csh /iscratch/i/kgDB/kgMm7A/kgProtMrna/kgProtPep/kgProtPep4993.fa # # the blastall program had a segmentation fault on the following protein: # # Q8C9X7 # Removed this protein from /iscratch/i/kgDB/kgMm7A/kgProtMrna/kgProtPep/kgProtPep4993.fa # on kk manually and re-ran: ./kgProtMrnaBlast.csh /iscratch/i/kgDB/kgMm7A/kgProtMrna/kgProtPep/kgProtPep4993.fa # Collect alignment results and filter out low quality alignments find ./psl.tmp -name '*.psl.gz' | xargs zcat | \ pslReps -nohead -singleHit -minAli=0.9 stdin protMrnaBlast.psl /dev/null cut -f 10,14 kgProtMrna.psl |sort -u >protMrna.lis wc protMrna.lis # 210958 421916 3365702 protMrna.lis # Load BLAST results into temp DB. hgsql kgMm7ATemp < ~/src/hg/lib/protMrnaBlast.sql hgsql kgMm7ATemp -e 'load data local infile "protMrnaBlast.psl" into table protMrnaBlast' hgsql kgMm7ATemp -e 'create index tName on protMrnaBlast(tName)' # Create CDS files from protein/mRNA alignment results. hgsql kgMm7ATemp -N -e \ 'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlast order by qName,tName,tEnd-tStart desc'\ |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds # Create protMrna.psl with proteinID_mrnaID as query ID. cd /cluster/data/mm7/bed/kgMm7A cut -f 22-30 protMrna.out > j1.tmp cut -f 32-42 protMrna.out > j2.tmp cut -f 10,31 protMrna.out|sed -e 's/\t/_/g' >j3.tmp paste j1.tmp j3.tmp j2.tmp >protMrna.psl rm j1.tmp j2.tmp j3.tmp # Run mrnaToGene to create protMrna.gp bash mrnaToGene -cdsFile=kgProtMrna/protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log exit ############################################ # Prepare refGene and all_mrna gp files.
cd /cluster/data/mm7/bed/kgMm7A hgsql mm7 -N -e 'select * from refGene' >ref.gp hgsql mm7 -N -e \ 'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \ |sort -u > all_mrna.cds bash mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log exit # Align proteins to RefSeq. overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp ref.stat overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp protRef.gp overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\ -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out cut -f 10,22 protRef.out | sort -u >spRef.tab cut -f 10 protRef.out | sort -u >protRef.lis hgsql kgMm7ATemp -e 'drop table spRef' hgsql kgMm7ATemp <~/src/hg/lib/spRef.sql hgsql kgMm7ATemp -e 'load data local infile "spRef.tab" into table spRef' # Prepare and perform cluster runs for protein/RefSeq alignments ~/src/hg/protein/KGRef2.sh kgMm7A mm7 051015 ~/src/hg/protein/KGRef3.sh kgMm7A mm7 051015 cd kgBestRef ls out | sed -e 's/prot/do1 prot/g' >doall cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protRefRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments. 
pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis wc protRef.lis # 54489 108978 940797 protRef.lis hgsql kgMm7ATemp -e 'drop table protRefBlat' hgsql kgMm7ATemp < ~/src/hg/lib/protRefBlat.sql hgsql kgMm7ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat' hgsql kgMm7ATemp -e 'create index tName on protRefBlat(tName)' # Run gene-check to filter out invalid gp entries cd /cluster/data/mm7/bed/kgMm7A cat ref.gp protMrna.gp all_mrna.gp >kgCandidate0.gp gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -details-out kgCandidate0.check.detail \ -nib-dir /cluster/data/mm7/nib kgCandidate0.gp kgCandidate0.check hgsql kgMm7ATemp -e 'drop table kgCandidate0' hgsql kgMm7ATemp < ~/src/hg/lib/kgCandidate0.sql hgsql kgMm7ATemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0' hgsql kgMm7ATemp -e 'drop table geneCheck' hgsql kgMm7ATemp < ~/src/hg/lib/geneCheck.sql hgsql kgMm7ATemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines' # Run kgCheck to get all KG candidates that pass the KG gene check criteria kgCheck2 kgMm7ATemp mm7 kgCandidate0 geneCheck kgCandidate.tab hgsql kgMm7ATemp -e 'drop table kgCandidate' hgsql kgMm7ATemp < ~/src/hg/lib/kgCandidate.sql hgsql kgMm7ATemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate' hgsql kgMm7ATemp -e 'create index alignID on kgCandidate(alignID)' # Construct the kgCandidateX table that has alignID in the name field. 
cut -f 2-10 kgCandidate.tab >j2.tmp cut -f 11 kgCandidate.tab >j1.tmp paste j1.tmp j2.tmp >kgCandidateX.tab hgsql kgMm7ATemp -e 'drop table kgCandidateX' hgsql kgMm7ATemp < ~/src/hg/lib/kgCandidateX.sql hgsql kgMm7ATemp -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX' # Score protein/mRna and protein/RefSeq alignments cd /cluster/data/mm7/bed/kgMm7A kgResultBestMrna2 051015 kgMm7ATemp mm7 protMrnaBlast|sort -u >protMrnaBlastScore.tab kgResultBestRef2 051015 kgMm7ATemp mm7 protRefBlat|sort -u >protRefScore.tab # Combine scoring results and load them into temp DB. cat protMrnaBlastScore.tab protRefScore.tab >protMrnaScore.tab hgsql kgMm7ATemp -e 'drop table protMrnaScore' hgsql kgMm7ATemp < ~/src/hg/lib/protMrnaScore.sql hgsql kgMm7ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore' hgsql kgMm7ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)' # Run kgGetCds to get CDS structure of each gene kgGetCds kgMm7ATemp 051015 kgCandidateX jY.tmp cat jY.tmp |sort -u >kgCandidateY.tab rm jY.tmp hgsql kgMm7ATemp -e 'drop table kgCandidateY' hgsql kgMm7ATemp < ~/src/hg/lib/kgCandidateY.sql hgsql kgMm7ATemp -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY' # Run kgPickPrep to replace long cds structure string with cdsId. kgPickPrep kgMm7ATemp kgCandidateZ.tab hgsql kgMm7ATemp -e 'drop table kgCandidateZ' hgsql kgMm7ATemp < ~/src/hg/lib/kgCandidateZ.sql hgsql kgMm7ATemp -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ' hgsql kgMm7ATemp -e 'create index cdsId on kgCandidateZ(cdsId)' # Run kgPick to pick the representative a mrna/protein pair for each unique CDS structure. kgPick kgMm7ATemp mm7 proteins051015 kg4.tmp dupSpMrna.tmp sort -u dupSpMrna.tmp >dupSpMrna.tab # Create put back list # gbGetSeqsX, a modified version of gbGetSeqs output the RefSeq IDs at the beginning of each output line. 
gbGetSeqsX -gbRoot=/cluster/data/genbank db=mm7 -get=ra RefSeq mrna ref.ra hgsql mm7 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \ >kgPutBack2.tab hgsql mm7 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%ribosomal frameshift%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \ >>kgPutBack2.tab hgsql mm7 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \ >>kgPutBack2.tab hgsql mm7 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \ >>kgPutBack2.tab hgsql mm7 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \ >>kgPutBack2.tab # Add entries in the put back list # Obtain from Mark the put back list, kgPutBack.lis, for human RefSeq. hgsql kgMm7ATemp -e 'drop table kgPutBack2' hgsql kgMm7ATemp < ~/src/hg/lib/kgPutBack2.sql hgsql kgMm7ATemp -e 'load data local infile "kgPutBack2.tab" into table kgPutBack2' kgPutBack kgMm7ATemp mm7 proteins051015 kgPutBack2 kgPutBack2.gp # Sort KG genes to make the kg4.gp table file.
cat kgPutBack2.gp kg4.tmp > kg4B.tmp ~/kent/src/hg/protein/sortKg.pl kg4B.tmp >kg4.gp hgsql kgMm7ATemp -e 'drop table knownGene' hgsql kgMm7ATemp < ~/src/hg/lib/knownGene.sql hgsql kgMm7ATemp -e 'load data local infile "kg4.gp" into table knownGene' # Load dupSpMrna table after knownGene table is loaded so that joinerCheck does not complain. hgsql mm7 -e 'drop table dupSpMrna' hgsql mm7 < ~/src/hg/lib/dupSpMrna.sql hgsql mm7 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna' # Perform analysis before loading kg4 table data to mm7.knownGene table. # Load data into mm7 knownGene table. hgsql mm7 -e 'drop table knownGene' hgsql mm7 < ~/src/hg/lib/knownGene.sql hgsql mm7 -e 'load data local infile "kg4.gp" into table knownGene' # Build knownGeneMrna and knownGenePep tables. kgPepMrna kgMm7ATemp mm7 051015 hgsql mm7 -e 'drop table knownGeneMrna' hgsql mm7 < ~/src/hg/lib/knownGeneMrna.sql hgsql mm7 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna' hgsql mm7 -e 'drop table knownGenePep' hgsql mm7 < ~/src/hg/lib/knownGenePep.sql hgsql mm7 -e 'load data local infile "knownGenePep.tab" into table knownGenePep' # Build kgXref table kgXref2 kgMm7ATemp 051015 mm7 hgsql mm7 -e 'drop table kgXref' hgsql mm7 < ~/src/hg/lib/kgXref.sql hgsql mm7 -e 'load data local infile "kgXref.tab" into table kgXref' # Build spMrna table hgsql mm7 -N -e 'select name, proteinID from knownGene' >kgSpMrna.tab hgsql mm7 -e 'drop table spMrna' hgsql mm7 <~/src/hg/lib/spMrna.sql hgsql mm7 -e 'load data local infile "kgSpMrna.tab" into table spMrna' # Build mrnaRefseq table mkdir /cluster/store11/entrez cd /cluster/store11/entrez mkdir 051113 rm /cluster/data/entrez ln -s /cluster/store11/entrez/051113 /cluster/data/entrez cd /cluster/data/entrez wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz gzip -d *.gz
cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab hgsql entrez -e 'drop table entrezRefseq' hgsql entrez -e 'drop table entrezMrna' hgsql entrez -e 'drop table entrezRefProt' hgsql entrez < ~/src/hg/lib/entrezRefseq.sql hgsql entrez < ~/src/hg/lib/entrezMrna.sql hgsql entrez < ~/src/hg/lib/entrezRefProt.sql hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq' hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna' hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt' hgsql entrez -N -e \ 'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \ >mrnaRefseq.tab hgsql mm7 -e 'drop table mrnaRefseq' hgsql mm7 < ~/src/hg/lib/mrnaRefseq.sql hgsql mm7 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq' # Build kgProtMap table cd /cluster/data/mm7/bed/kgMm7A ~/src/hg/protein/kgProtMap2.sh kgMm7A mm7 051015 # Update and clean up kgResultBestMrna2.c and then check it in. # Build alias tables. 
# kgAliasM reads from proteins051015.hugo.symbol, proteins051015.hugo.aliases # proteins051015.hugo.withdraws, mm7.kgXref.kgID # to create kgAliasM.tab and geneAlias.tab # by picking out those kgID items from kgXref where # kgXref.geneSymbol == hugo.symbol cd /cluster/store11/kg/kgMm7A mkdir alias cd alias kgAliasM mm7 proteins051015 # kgAliasKgXref reads from mm7.knownGene.proteinID, # mm7.knownGene.name, mm7.kgXref.geneSymbol # to create kgAliasKgXref.tab kgAliasKgXref mm7 # kgAliasRefseq reads from mm7.knownGene.name, # mm7.knownGene.proteinID, mm7.kgXref.refseq # to create kgAliasRefseq.tab kgAliasRefseq mm7 hgsql sp051015 -N -e 'select name,gene.val from mm7.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \ | sort -u > kgAliasP.tab hgsql mm7 -N -e 'select name, name from knownGene' >kgAliasDup.tab hgsql mm7 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \ sort |uniq > kgAlias.tab hgsql -e "drop table kgAlias;" mm7 hgsql mm7 < ~/kent/src/hg/lib/kgAlias.sql hgsql mm7 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' # kgProtAlias reads from mm7.knownGene.name, # mm7.knownGene.proteinID, mm7.knownGene.alignID, # proteins051015.spXref3.accession, proteins051015.spSecondaryID, proteins051015.pdbSP.pdb # to create kgProtAlias.tab kgProtAlias mm7 051015 hgsql mm7 -N -e \ 'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\ | sort -u >kgProtAliasNCBI.tab # include variant splice protein IDs hgsql mm7 -N -e \ 'select name, proteinID, parAcc from knownGene,sp051015.varAcc where varAcc=proteinID'\ |sort -u >kgProtAliasDup.tab # include duplicate protein IDs from dupSpMrna table hgsql mm7 -N -e \ 'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\ |sort -u >>kgProtAliasDup.tab # catch parent acc from dupProteinID too hgsql mm7 -N -e\ 'select name, knownGene.proteinID, 
parAcc from knownGene,dupSpMrna,sp051015.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\ |sort -u >>kgProtAliasDup.tab cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab echo "`date` creating table kgProtAlias" hgsql mm7 -e "drop table kgProtAlias;" hgsql mm7 <~/src/hg/lib/kgProtAlias.sql; hgsql mm7 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;' # Build kgSpAlias table hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >mm7.kgSpAlias.tab rm j.tmp hgsql mm7 -e 'drop table kgSpAlias'; hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias' # MAKE FOLDUTR TABLES (DONE 2005-11-15 Fan) # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev cd /cluster/data/mm7/bed mkdir rnaStruct.2005-11-14 rm rnaStruct ln -s rnaStruct.2005-11-14 rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa mm7 knownGene utr3 utr3/utr.fa utrFa mm7 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. 
ssh kk cd /cluster/data/mm7/bed/rnaStruct faSplit sequence utr3/utr.fa 50000 utr3/split/s faSplit sequence utr5/utr.fa 50000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < genome.lst ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.lst affy.lst gsub jobList para create jobList para try para push # Completed: 120 of 120 jobs # CPU time in finished jobs: 6022s 100.37m 1.67h 0.07d 0.000 y # IO & Wait Time: 4113s 68.55m 1.14h 0.05d 0.000 y # Average job time: 84s 1.41m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 244s 4.07m 0.07h 0.00d # Submission to last job: 280s 4.67m 0.08h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU74.psl. ssh kk cd /cluster/data/mm7/bed/affyU74.2005-11-14/run pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least minAli = 0.95. # minAli = 0.97 too high. low minCover as a lot of n's in these sequences #pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl ../all_affyU74.psl /dev/null # Sort by chromosome and load into database. ssh hgwdev cd /cluster/data/mm7/bed/affyU74.2005-11-14 pslSortAcc nohead chrom temp all_affyU74.psl cat chrom/*.psl > affyU74.psl # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table hgLoadPsl mm7 affyU74.psl # rm -fr chrom temp run ## MAKE THE affyGnfU74 TRACKs (DONE - 2005-11-14 - Fan) # Make bed files and load consensus sequences for Affy U74 chip set. 
# Fix broken symlinks to microarray data after directory structure changed # (DONE, 2005-05-03, hartera) ---------------------------------- #This needs to be done after affyU74 is already made. ssh hgwdev mkdir -p /cluster/data/mm7/bed/affyGnf.2005-11-14 cd /cluster/data/mm7/bed/affyGnf.2005-11-14 # may need to build this command in src/hg/affyGnf ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2005-11-14/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \ affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2 ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2005-11-14/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \ affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2 ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2005-11-14/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \ affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2 # edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" mkdir sav cp *.bed sav -p cat sav/affyGnfU74A.bed|sed -e "s/U74Av2://" >affyGnfU74A.bed cat sav/affyGnfU74B.bed|sed -e "s/U74Bv2://" >affyGnfU74B.bed cat sav/affyGnfU74C.bed|sed -e "s/U74Cv2://" >affyGnfU74C.bed # and reload data into table hgLoadBed mm7 affyGnfU74A affyGnfU74A.bed hgLoadBed mm7 affyGnfU74B affyGnfU74B.bed hgLoadBed mm7 affyGnfU74C affyGnfU74C.bed # Add in sequence data for U74 tracks. # Copy consensus sequence to /gbdb if it isn't already # [THE SYM LINKS WERE ALREADY DONE.] # mkdir -p /gbdb/hgFixed/affyProbes cd /gbdb/hgFixed/affyProbes # fix broken symlinks after directory structure changed # /projects/compbiodata ----> /projects/compbio/data rm U74* # make correct symlinks (hartera, 2005-05-03) ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa . 
# used perl -pi.bak -e 's/;/ /' to remove ";" after probe name # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4. # reload sequences with prefix removed so acc matches name used in # other dependent tables hgLoadSeq -abbr=U74Av2: mm7 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa hgLoadSeq -abbr=U74Bv2: mm7 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa hgLoadSeq -abbr=U74Cv2: mm7 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa ### GNF ATLAS 2 [DONE Fan 2005-11-14] # Align probes from GNF1M chip. ssh kk cd /cluster/data/mm7/bed mkdir -p geneAtlas2/run/psl cd geneAtlas2/run #mkdir -p /cluster/bluearc/geneAtlas2 #cp /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /cluster/bluearc/geneAtlas2 #ls -1 /scratch/mus/mm7/maskedContigs/ > genome.lst echo /cluster/bluearc/mm7/nib/*.nib | wordLine stdin > genome.lst ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub gensub2 genome.lst mrna.lst gsub spec para create spec para try para check para push para time # CPU time in finished jobs: 53036s 883.93m 14.73h 0.61d 0.002 y # IO & Wait Time: 442s 7.37m 0.12h 0.01d 0.000 y # Average job time: 1337s 22.28m 0.37h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3862s 64.37m 1.07h 0.04d # Submission to last job: 3862s 64.37m 1.07h 0.04d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyGnf1m.psl. pslSort dirs raw.psl tmp psl pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1m.psl /dev/null #rm -r contig.psl raw.psl psl # Load probes and alignments from GNF1M into database.
ssh hgwdev cd /cluster/data/mm7/bed/geneAtlas2 # ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes hgLoadPsl mm7 affyGnf1m.psl hgLoadSeq mm7 /gbdb/hgFixed/affyProbes/gnf1m.fa # Load up track hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \ affyGnf1m.psl # Note that the unmapped 5000 records are from all-N sequences. hgLoadBed mm7 gnfAtlas2 gnfAtlas2.bed # MOUSE AFFYMETRIX MOE430 TRACK (DONE, 2005-11-14, Fan) # mkdir -p /projects/compbio/data/microarray/affyMouse # Download MOE430A and MOE430B consensus sequences from Affymetrix web site # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430 # unzip MOE430*_consensus.zip # check for duplicate probes: there are none, all have unique names # check for duplicate probes: 100 from 136745_at to 1367551_a_at # remove "consensus:" and ";" from FASTA headers to shorten probeset # names for database # sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa # sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa # cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ # /cluster/bluearc/affy/ # THE ABOVE WAS ALREADY DONE BY RACHEL 4/16/04. 
# Set up cluster job to align MOE430 consensus sequences to mm7 ssh kkr1u00 cd /cluster/data/mm7/bed mkdir -p affyMOE430 cd affyMOE430 # mkdir -p /iscratch/i/affy # cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy # iSync ssh kk cd /cluster/data/mm7/bed/affyMOE430 ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst echo /cluster/bluearc/mm7/nib/*.nib | wordLine stdin > genome.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 genome.lst affy.lst template.sub para.spec mkdir psl para create para.spec para try para check para push # Completed: 40 of 40 jobs # CPU time in finished jobs: 9533s 158.88m 2.65h 0.11d 0.000 y # IO & Wait Time: 924s 15.40m 0.26h 0.01d 0.000 y # Average job time: 261s 4.36m 0.07h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 855s 14.25m 0.24h 0.01d # Submission to last job: 862s 14.37m 0.24h 0.01d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyMOE430.psl pslSort dirs raw.psl tmp psl # only use alignments that cover 30% of sequence and have at least # 95% identity in aligned region. 
# low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl affyMOE430.psl /dev/null # Load alignments and sequences into database ssh hgwdev cd /cluster/data/mm7/bed/affyMOE430 # shorten names in psl file sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak mv affyMOE430.psl.bak affyMOE430.psl # load track into database hgLoadPsl mm7 affyMOE430.psl # Add consensus sequences for MOE430 # Copy sequences to gbdb if they are not there already # mkdir -p /gbdb/hgFixed/affyProbes # ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ # /gbdb/hgFixed/affyProbes hgLoadSeq -abbr=MOE430 mm7 /gbdb/hgFixed/affyProbes/MOE430_all.fa # Clean up # rm batch.bak contig.psl raw.psl # BELOW TWO THINGS WERE DONE BY RACHEL ALREADY FOR MM4 # add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/ # add affyMOE430.html file and then do make alpha to add to trackDb table ######## BUILD GENE SORTER TABLES ####### (DONE - 2005-11-15 - Fan) # These are instructions for building the # Gene Sorter. Don't start these until # there is a knownGene track and the affy tracks # Cluster together various alt-splicing isoforms. # Creates the knownIsoforms and knownCanonical tables ssh hgwdev cd /tmp hgClusterGenes mm7 knownGene knownIsoforms knownCanonical # Got 18908 clusters, from 31449 genes in 40 chromosomes # featureBits mm7 knownCanonical # 776952836 bases of 2583394090 (30.075%) in intersection # featureBits mm6 knownCanonical # 764263619 bases of 2597150411 (29.427%) in intersection # featureBits mm5 knownCanonical # 853516995 bases of 2615483787 (32.633%) in intersection # featureBits mm4 knownCanonical # 840021165 bases of 2627444668 (31.971%) in intersection # featureBits mm3 knownCanonical # 825943052 bases of 2505900260 (32.960%) in intersection # Extract peptides from knownGenes into fasta file # and create a blast database out of them. 
ssh hgwdev mkdir -p /cluster/data/mm7/bed/geneSorter/blastp cd /cluster/data/mm7/bed/geneSorter/blastp pepPredToFa mm7 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /cluster/bluearc/blast229/formatdb -i known.faa -t known -n known # Copy over database to bluearc scratch mkdir /cluster/bluearc/mm7/blastp cp -p /cluster/data/mm7/bed/geneSorter/blastp/known.* /cluster/bluearc/mm7/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm7/bed/geneSorter/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/mm7/bed/geneSorter/blastp/self cd /cluster/data/mm7/bed/geneSorter/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/mm7/blastp/known \ -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push # Completed: 7720 of 7720 jobs # CPU time in finished jobs: 72141s 1202.34m 20.04h 0.83d 0.002 y # IO & Wait Time: 600180s 10003.01m 166.72h 6.95d 0.019 y # Average job time: 87s 1.45m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 282s 4.70m 0.08h 0.00d # Submission to last job: 1745s 29.08m 0.48h 0.02d # Load into database. This takes about an hour. 
ssh hgwdev cd /cluster/data/mm7/bed/geneSorter/blastp/self/run/out hgLoadBlastTab mm7 knownBlastTab *.tab # Scanning through 7729 files # Loading database with 3391069 rows # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes an hour.) hgMapToGene mm7 affyGnf1m knownGene knownToGnf1m hgExpDistance mm7 hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m # Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio # Got 20114 unique elements in hgFixed.gnfMouseAtlas2MedianRatio # Create table that maps between known genes and RefSeq hgMapToGene mm7 refGene knownGene knownToRefSeq # may need to build this command in src/hg/near/hgMapToGene # Create a table that maps between known genes and # the nice affy expression data. hgMapToGene mm7 affyU74 knownGene knownToU74 hgMapToGene mm7 affyMOE430 knownGene knownToMOE430 hgMapToGene mm7 affyMOE430 -prefix=A: knownGene knownToMOE430A # Format and load Rinn et al sex expression data mkdir /cluster/data/mm7/bed/rinnSex cd !$ hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \ ../affyMOE430/affyMOE430.psl hgLoadBed mm7 rinnSex rinnSex.bed # Format and load the GNF data mkdir /cluster/data/mm7/bed/affyGnf95 cd /cluster/data/mm7/bed/affyGnf95 ~/src/hg/affyGnf/affyPslAndAtlasToBed -newType ../affyU95.psl \ /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceeding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. # Everything below this seems to create tables OK. # hgsql mm7 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. 
These commands take about 15 minutes each # The affyGnfU74?Exps arguments appear to be unused in # hgExpDistance cd /cluster/data/mm7/bed/geneSorter hgExpDistance mm7 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74 # Have 9621 elements in affyGnfU74A # Got 11805 unique elements in affyGnfU74A hgExpDistance mm7 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74 # Have 11002 elements in affyGnfU74B # Got 7173 unique elements in affyGnfU74B hgExpDistance mm7 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74 # Have 7502 elements in affyGnfU74C # Got 2130 unique elements in affyGnfU74C # C.ELEGANS BLASTP FOR GENE SORTER # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to iscratch/i # if it doesn't exist already: ssh eieio mkdir /cluster/data/ce2/bed/blastp cd /cluster/data/ce2/bed/blastp # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/ # to find out the latest version. Then use that in place of 142 below. wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142 formatdb -i wormPep142.faa -t wormPep142 -n wormPep142 ssh kkr1u00 if (-e /iscratch/i/ce2/blastp) then rm -r /iscratch/i/ce2/blastp endif mkdir -p /iscratch/i/ce2/blastp cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/ce2/run/out cd /cluster/data/mm7/bed/blastp/ce2/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... 
# Completed: 7720 of 7720 jobs # CPU time in finished jobs: 45124s 752.06m 12.53h 0.52d 0.001 y # IO & Wait Time: 274207s 4570.12m 76.17h 3.17d 0.009 y # Average job time: 41s 0.69m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 354s 5.90m 0.10h 0.00d # Submission to last job: 1859s 30.98m 0.52h 0.02d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/ce2/run/out hgLoadBlastTab mm7 ceBlastTab -maxPer=1 *.tab # HUMAN BLASTP FOR GENE SORTER # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to iscratch/i # if it doesn't exist already: # mkdir /cluster/data/hg17/bed/blastp # cd /cluster/data/hg17/bed/blastp # pepPredToFa hg17 knownGenePep known.faa # formatdb -i known.faa -t known -n known # ssh kkr1u00 # if (-e /iscratch/i/hg17/blastp) then # rm -r /iscratch/i/hg17/blastp # endif # mkdir -p /iscratch/i/hg17/blastp # cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp # iSync # ABOVE WAS ALREADY DONE DURING MM6 BUILD. # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/hg17/run/out cd /cluster/data/mm7/bed/blastp/hg17/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7720 of 7720 jobs # CPU time in finished jobs: 89614s 1493.56m 24.89h 1.04d 0.003 y # IO & Wait Time: 151484s 2524.74m 42.08h 1.75d 0.005 y # Average job time: 31s 0.52m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 88s 1.47m 0.02h 0.00d # Submission to last job: 1605s 26.75m 0.45h 0.02d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/hg17/run/out hgLoadBlastTab mm7 hgBlastTab -maxPer=1 *.tab # ZEBRAFISH BLASTP FOR GENE SORTER # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. 
# First make protein database and copy it to iscratch/I # The below is done by hg17, that section from makeHg17.doc is copied here. # ssh kkstore # mkdir /cluster/data/danRer2/bed/blastp # cd /cluster/data/danRer2/bed/blastp # wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.apr.pep.fa.gz # zcat Dan*.pep.fa.gz > ensembl.faa # formatdb -i ensembl.faa -t ensembl -n ensembl # ssh kkr1u00 # if (-e /iscratch/i/danRer2/blastp) then # rm -r /iscratch/i/danRer2/blastp # endif # mkdir -p /iscratch/i/danRer2/blastp # cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp # iSync # ABOVE WAS ALREADY DONE DURING MM6 BUILD. # The above is copied from makeHg17.doc. # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/danRer2/run/out cd /cluster/data/mm7/bed/blastp/danRer2/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7720 of 7720 jobs # CPU time in finished jobs: 81817s 1363.61m 22.73h 0.95d 0.003 y # IO & Wait Time: 148583s 2476.39m 41.27h 1.72d 0.005 y # Average job time: 30s 0.50m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 93s 1.55m 0.03h 0.00d # Submission to last job: 2638s 43.97m 0.73h 0.03d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/danRer2/run/out hgLoadBlastTab mm7 drBlastTab -maxPer=1 *.tab # YEAST BLASTP FOR GENE SORTER # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on # RefSeq. 
First make protein database and copy it to iscratch/i # if it doesn't exist already: # mkdir /cluster/data/sacCer1/bed/blastp # cd /cluster/data/sacCer1/bed/blastp # wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz # zcat orf_trans.fasta.gz > sgdPep.faa # formatdb -i sgdPep.faa -t sgdPep -n sgdPep # ssh kkr1u00 # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, # fortunately we won't be looking for homologs there. :) # if (-e /iscratch/i/sacCer1/blastp) then # rm -r /iscratch/i/sacCer1/blastp # endif # mkdir -p /iscratch/i/sacCer1/blastp # cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp # iSync # ABOVE WAS ALREADY DONE DURING MM6 BUILD. # Make parasol run directory ssh kk mkdir -p /cluster/data/mm7/bed/blastp/sacCer1/run/out cd /cluster/data/mm7/bed/blastp/sacCer1/run # Make blast script cat > blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7720 of 7720 jobs # CPU time in finished jobs: 13337s 222.29m 3.70h 0.15d 0.000 y # IO & Wait Time: 149594s 2493.23m 41.55h 1.73d 0.005 y # Average job time: 21s 0.35m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 121s 2.02m 0.03h 0.00d # Submission to last job: 15796s 263.27m 4.39h 0.18d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/sacCer1/run/out hgLoadBlastTab mm7 scBlastTab -maxPer=1 *.tab # DM1 BLASTP FOR GENE SORTER (DONE 5/30/05, Fan) # Make Drosophila melanogaster ortholog column using blastp on FlyBase. # First make protein database and copy it to iscratch/i # if it doesn't exist already: # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # ssh kkr1u00 # if (-e /iscratch/i/dm1/blastp) then # rm -r /iscratch/i/dm1/blastp # endif # mkdir -p /iscratch/i/dm1/blastp # cp /cluster/data/dm1/bed/blastp/bdgp.p?? 
/iscratch/i/dm1/blastp # iSync # THE ABOVE IS ALREADY DONE BY ANGIE # PLEASE NOTE THE SPLIT DATA WERE COPIED TO STORE3. SEE BLASTP RUN FOR RN3 FOR DETAILS # Make parasol run directory ssh kk # mkdir -p /cluster/data/mm7/bed/blastp/dm1/run/out mkdir -p /cluster/store3/mm7/bed/blastp/dm1/run/out cd /cluster/data/mm7/bed/blastp/dm1/run # Make blast script cat > blastSome < gsub <split.lst ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \ |sed -e 's=kg=/cluster/store3/mm7/bed/split/kg=g' >split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, .. # Completed: 7720 of 7720 jobs # CPU time in finished jobs: 50313s 838.55m 13.98h 0.58d 0.002 y # IO & Wait Time: 49291s 821.52m 13.69h 0.57d 0.002 y # Average job time: 13s 0.22m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 81s 1.35m 0.02h 0.00d # Submission to last job: 341s 5.68m 0.09h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm7/bed/blastp/dm1/run/out hgLoadBlastTab mm7 dmBlastTab -maxPer=1 *.tab # Create table that maps between known genes and LocusLink cd /cluster/data/mm7/bed/geneSorter hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm7 > refToLl.txt hgMapToGene mm7 refGene knownGene knownToLocusLink -lookup=refToLl.txt # row count is 23074 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt mm7 knownGene name proteinID Pfam knownToPfam # row count is 22525 # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene mm7 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' # Create table that maps between known genes and genePix database knownToGenePix mm7 # ENABLE GENE SORTER FOR mm7 IN HGCENTRALTEST (already done during first mm7 KG build) echo "update dbDb set hgNearOk = 1 where name = 'mm7';" \ | hgsql -h genome-testdb hgcentraltest # RAT BLASTP FOR GENE SORTER # Make RAT ortholog column using blastp on RAT known genes. 
# First make RAT protein database and copy it to iscratch/i # if it doesn't exist already: # mkdir /cluster/data/rn3/bed/blastp # cd /cluster/data/rn3/bed/blastp # pepPredToFa rn3 knownGenePep known.faa # formatdb -i known.faa -t known -n known # ssh kkr1u00 # if (-e /iscratch/i/rn3/blastp) then # rm -r /iscratch/i/rn3/blastp # endif # mkdir -p /iscratch/i/rn3/blastp # cp /cluster/data/rn3/bed/blastp/known.p?? /iscratch/i/rn3/blastp # iSync # ABOVE WAS ALREADY DONE DURING MM6 BUILD. # Make parasol run directory ssh kk # PLEASE NOTE THE SPLIT FILES ARE COPIED TO STORE3. THE NEW KKSTORE-2 FILE SYSTEM # WHERE STORE5 IS HAS AN OS VERSION COMPATIBILITY PROBLEM THAT CAUSED A LOT # I/O PAGE FAULTS. mkdir -p /cluster/store3/mm7/bed cp -Rp /cluster/store5/mm7/bed/geneSorter/blastp/split . cd /cluster/store5/mm7/bed/geneSorter/blastp mv split split.sav ln -s /cluster/store3/mm7/bed/split ./split mkdir -p /cluster/store3/mm7/bed/blastp/rn3/run/out mv /cluster/data/mm7/bed/blastp /cluster/data/mm7/bed/blastp.sav ln -s /cluster/store3/mm7/bed/blastp /cluster/data/mm7/bed/blastp cd /cluster/data/mm7/bed/blastp/rn3/run # Make blast script cat > blastSome < gsub <split.lst ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \ |sed -e 's=kg=/cluster/store3/mm7/bed/split/kg=g' >split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7720 of 7720 jobs # CPU time in finished jobs: 19182s 319.70m 5.33h 0.22d 0.001 y # IO & Wait Time: 20477s 341.28m 5.69h 0.24d 0.001 y # Average job time: 5s 0.09m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 23s 0.38m 0.01h 0.00d # Submission to last job: 140s 2.33m 0.04h 0.00d # Load into database. 
ssh hgwdev cd /cluster/data/mm7/bed/blastp/rn3/run/out hgLoadBlastTab mm7 rnBlastTab -maxPer=1 *.tab # END OF GENE SORTER STUFF (DONE 11/17/05, Fan) ############################################################################# ### MM7 PROTEOME BROWSER TABLES BUILD #### (DONE - 2005-11-14 - Fan) # These are instructions for re-building tables # needed for the Proteome Browser to be used with mm7. # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table # ARE REBUILT. # This build is based on proteins DBs dated 051015. # Create the working directory ssh hgwdev mkdir /cluster/data/mm7/bed/pb.2005-11-14 cd /cluster/data/mm7/bed rm pb ln -s /cluster/data/mm7/bed/pb.2005-11-14 pb cd pb # Define pep* tables in mm7 DB cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql # edit the .sql file and get rid of non-PB table definitions, e.g. pepPred. vi pepAll.sql hgsql mm7 pepMwAa.tab hgsql mm7 -e 'load data local infile "pepMwAa.tab" into table mm7.pepMwAa ignore 1 lines;' o Build the pepPi table hgsql proteins051015 -e "select info.acc from sp051015.info, sp051015.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis pbCalPi protAcc.lis sp051015 pepPi.tab hgsql mm7 -e 'load data local infile "pepPi.tab" into table mm7.pepPi;' # Calculate and load pep distributions pbCalDist sp051015 proteins051015 10090 mm7 >pbCalDist.out cat pbCalDist.out wc pbCalDist.out hgsql mm7 load data local infile "pepExonCntDist.tab" into table mm7.pepExonCntDist; load data local infile "pepCCntDist.tab" into table mm7.pepCCntDist; load data local infile "pepHydroDist.tab" into table mm7.pepHydroDist; load data local infile "pepMolWtDist.tab" into table mm7.pepMolWtDist; load data local infile "pepResDist.tab" into table mm7.pepResDist; load data local infile "pepIPCntDist.tab" into table mm7.pepIPCntDist; load data local infile "pepPiDist.tab" into table mm7.pepPiDist; quit # Calculate frequency distributions pbCalResStd sp051015 10090 mm7 # Create pbAnomLimit and 
pbResAvgStd tables hgsql mm7 < ~/src/hg/lib/pbAnomLimit.sql hgsql mm7 < ~/src/hg/lib/pbResAvgStd.sql hgsql mm7 -e 'load data local infile "pbResAvgStd.tab" into table mm7.pbResAvgStd;' hgsql mm7 -e 'load data local infile "pbAnomLimit.tab" into table mm7.pbAnomLimit;' # UPDATE kgSpAlias TABLE TO BE USED BY PB cd /cluster/data/mm7/bed/pb hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql mm7 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >mm7.kgSpAlias.tab rm j.tmp hgsql mm7 -e 'drop table kgSpAlias'; hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias' gzip mm7.kgSpAlias.tab # Create pbStamp table for PB hgsql mm7 < ~/src/hg/lib/pbStamp.sql hgsql mm6 -N -e 'select * from pbStamp' > pbStamp.tab hgsql mm7 -e 'delete from pbStamp' hgsql mm7 -e 'load data local infile "pbStamp.tab" into table mm7.pbStamp' # ENABLE PROTEOME BROWSER FOR mm7 IN HGCENTRALTEST (already done previously) echo "update dbDb set hgPbOk = 1 where name = 'mm7';" \ | hgsql -h genome-testdb hgcentraltest # Adjust drawing parameters for Proteome Browser stamps Now invoke Proteome Browser and adjust various drawing parameters (mostly the ymax of each stamp) if necessary, by updating the pbStamp.tab file and then delete and reload the pbStamp table. # Perform preliminary review of Proteome Browser for mm7, then notify QA for formal review. 
# Update default Browser position # bring up mySQL on genome-testdb and use hgcentraltest DB (done previously): update dbDb set defaultPos="chr2:146923205-146928018" where name="mm7"; # Create QA Push Queue entry with the following tables: ceBlastTab cgapAlias cgapBiocDesc cgapBiocPathway dmBlastTab drBlastTab dupSpMrna foldUtr3 foldUtr5 gnfAtlas2Distance hgBlastTab keggMapDesc keggPathway kgAlias kgProtAlias kgProtMap kgXref knownBlastTab knownCanonical knownGene knownGeneMrna knownGenePep knownIsoforms knownToVisiGene knownToGnf1m knownToGnfAtlas2 knownToLocusLink knownToMOE430 knownToMOE430A knownToPfam knownToRefSeq knownToU74 knownToXmBest rinnSex rnBlastTab scBlastTab spMrna # END OF mm7 KG/GS/PB RE-BUILD. 11/17/05 Fan. ##################################################################### ############################################################################ ## build multiz17way mafFrames (multiz17wayFrames) (markd 2005/11/13) ## rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd) cd /cluster/data/mm7/bed/ mkdir multiz17wayFrames/ cd multiz17wayFrames/ # created Makefile and mkMafFrames based on one for canHg11 # create frames usins knownGenes if available, if not refseqs if availble, # otherwise mRNA with CDS annotation # created for these databases: rn3 hg17 panTro1 rheMac1 oryCun1 canFam2 bosTau2 galGal2 xenTro1 fr1 danRer3 nice make getGenes nice make -j 3 getFrames nice make loadDb # need to do a make sanClean after track verified ### # rebuild frames to get bug fix, using 1-pass maf methodology # (2006-06-09 markd) ssh kkstore02 cd /cluster/data/mm7/bed/multiz17wayFrames mv mafFrames/ mafFrames.old nice tcsh # easy way to get process niced (cat ../maf/*.maf | time genePredToMafFrames mm7 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz galGal2 genes/galGal2.gp.gz hg17 genes/hg17.gp.gz mm7 genes/mm7.gp.gz oryCun1 genes/oryCun1.gp.gz panTro1 
genes/panTro1.gp.gz rheMac1 genes/rheMac1.gp.gz rn3 genes/rn3.gp.gz xenTro1 genes/xenTro1.gp.gz | gzip >multiz17way.mafFrames.gz)>&log& ssh hgwdev cd /cluster/data/mm7/bed/multiz17wayFrames hgLoadMafFrames mm7 multiz17wayFrames multiz17way.mafFrames.gz >&frameslog& ############################################################################ # BLASTZ Chimp panTro1 second time (DONE - 2005-11-20 - 2005-11-22 - Hiram) # The first attempt didn't work well when it got into the multiple # alignment, it seemed to get lost in gap areas of the randoms. ssh pk mkdir /cluster/data/mm7/bed/blastzPanTro1.2005-11-20 cd /cluster/data/mm7/bed rm -f blastz.panTro1 ln -s blastzPanTro1.2005-11-20 blastz.panTro1 cd blastzPanTro1.2005-11-20 cat << '_EOF_' > DEF # mouse vs chimp export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_M=50 # TARGET: Mouse Mm7 SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chimp PanTro1 SEQ2_DIR=/scratch/hg/panTro1/nib SEQ2_LEN=/scratch/hg/panTro1/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastzPanTro1.2005-11-20 TMPDIR=/scratch/tmp '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load `pwd`/DEF > blastz.to.load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=cat -stop=load `pwd`/DEF > cat.to.load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainMerge -stop=load \ `pwd`/DEF > chainMerge.to.load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > swap.to.load.out 2>&1 & 
# Measurements: ssh kolossus cd /cluster/data/mm7/bed/blastzPanTro1.2005-11-20 time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \ chainPanTro1Link > fb.mm7.chainPanTro1Link 2>&1 cat fb.mm7.chainPanTro1Link # 906762987 bases of 2583394090 (35.100%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits panTro1 \ chainMm7Link > fb.panTro1.chainMm7Link 2>&1 cat fb.panTro1.chainMm7Link # 899743967 bases of 2733948177 (32.910%) in intersection time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > downloadSwap.out 2>&1 & ############################################################################ # MM3/MM4/MM5/MM6 -> MM7 LIFTOVER CHAINS (DONE 11/21/2005 Andy) # These chains hopefully don't suck. # Sorry I only used the makeLoChain-align script from the set of scripts # already created for this task. I wanted more control. I should mention # I used a size of 10kb instead of 3kb for the split (blat query) sizes in # mm7. This had a huge effect on the amount of hits in the blat, which # then had a huge effect on the amount of chains. I should also mention # that mm7 chromosomes chr1, chrX, and chrUn_random were split further # into more than a single query file. This helped a LOT in avoiding # cluster hippos classically associated with those chroms. ######## LIFTOVER PREPARATION # Split up mm7 ssh hgwdev cd /san/sanVol1/scratch/mm7 mkdir -p liftSplits/{split,lift} for fa in /cluster/data/mm7/?{,?}/*.fa; do c=`basename $fa .fa` echo $c faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c done mkdir -p biggerSplits/split cd biggerSplits/ ln -s ../liftSplits/lift cd split/ ln -s ../../liftSplits/split/* . 
faSplit sequence chr1.fa 5 chr1_ faSplit sequence chrX.fa 10 chrX_ rm chr{1,X}.fa # Make some dirs cd /san/sanVol1/scratch mkdir -p mm{3,4,5} # Copy 11.ooc files to each of mm3, mm4, mm5, mm6 dirs. ######## LIFTOVER BLATING # MM3 ssh pk cd /cluster/data/mm3 makeLoChain-align mm3 /san/sanVol1/scratch/mm3/nib mm7 /san/sanVol1/scratch/mm7/biggerSplits/split cd bed/ mv blat.mm7.2005-11-17/ /san/sanVol1/scratch/mm3 cd /san/sanVol1/scratch/mm3/blat.mm7.2005-11-17/run/ sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/andy/" $0 " mm3ToMm7"}' > newspec para create newspec para -maxNode=200 -priority=25 push para time #Completed: 2109 of 2109 jobs #CPU time in finished jobs: 2322886s 38714.77m 645.25h 26.89d 0.074 y #IO & Wait Time: 20395s 339.92m 5.67h 0.24d 0.001 y #Average job time: 1111s 18.52m 0.31h 0.01d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 21949s 365.82m 6.10h 0.25d #Submission to last job: 41513s 691.88m 11.53h 0.48d # MM4 ssh pk cd /cluster/data/mm4 makeLoChain-align mm4 /scratch/mus/mm4/softNib mm7 /san/sanVol1/scratch/mm7/biggerSplits/split cd bed/ mv blat.mm7.2005-11-17/ /san/sanVol1/scratch/mm4 cd /san/sanVol1/scratch/mm4/blat.mm7.2005-11-17/run/ sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/andy/" $0 " mm4ToMm7"}' > newspec para create newspec para -maxNode=200 -priority=25 push para time #Completed: 2508 of 2508 jobs #CPU time in finished jobs: 1351933s 22532.22m 375.54h 15.65d 0.043 y #IO & Wait Time: 13885s 231.41m 3.86h 0.16d 0.000 y #Average job time: 545s 9.08m 0.15h 0.01d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 6516s 108.60m 1.81h 0.08d #Submission to last job: 25881s 431.35m 7.19h 0.30d # MM5 ssh pk cd /cluster/data/mm5 makeLoChain-align mm5 /scratch/mus/mm5/softNib mm7 /san/sanVol1/scratch/mm7/biggerSplits/split cd bed/ mv blat.mm7.2005-11-17/ /san/sanVol1/scratch/mm5 cd /san/sanVol1/scratch/mm5/blat.mm7.2005-11-17/run/ sed 's/^blat/blat.sh/; 
s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/andy/" $0 " mm5ToMm7"}' > newspec para create newspec para -maxNode=200 -priority=25 push para time #Completed: 2451 of 2451 jobs #CPU time in finished jobs: 1266001s 21100.02m 351.67h 14.65d 0.040 y #IO & Wait Time: 13972s 232.87m 3.88h 0.16d 0.000 y #Average job time: 522s 8.70m 0.15h 0.01d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 6769s 112.82m 1.88h 0.08d #Submission to last job: 26506s 441.77m 7.36h 0.31d # MM6 cd /cluster/data/mm6 makeLoChain-align mm6 /scratch/mus/mm6/nib mm7 /san/sanVol1/scratch/mm7/biggerSplits/split cd bed/ mv blat.mm7.2005-11-17/ /san/sanVol1/scratch/mm6 cd /san/sanVol1/scratch/mm6/blat.mm7.2005-11-17/run/ sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/andy/" $0 " mm6ToMm7"}' > newspec para create newspec para -maxNode=200 -priority=25 push para time #Completed: 2280 of 2280 jobs #CPU time in finished jobs: 1250929s 20848.81m 347.48h 14.48d 0.040 y #IO & Wait Time: 12983s 216.39m 3.61h 0.15d 0.000 y #Average job time: 554s 9.24m 0.15h 0.01d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 6575s 109.58m 1.83h 0.08d #Submission to last job: 26374s 439.57m 7.33h 0.31d ######## LIFTOVER CHAINING # LIFTING ssh pk cd /san/sanVol1/scratch/andy cat << "EOF" > mm7SplitLift.sh EOF chmod +x mm7SplitLift.sh cat << "EOF" > mm7ChainMergeSplit.sh #!/bin/bash cp -r chainRaw/ /scratch/andy/mm7Lifts pushd /scratch/andy/mm7Lifts mkdir chain /cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin cp -r chain `dirs +1` rm -rf chain chainRaw EOF chmod +x mm7ChainMergeSplit.sh # MM3 cd /san/sanVol1/scratch/mm3/blat.mm7.2005-11-17/raw /san/sanVol1/scratch/andy/mm7SplitLift.sh cd ../ mkdir chainRun chainRaw cd chainRun cat > gsub << "EOF" #LOOP /cluster/bin/x86_64/axtChain -verbose=0 -psl $(path1) /san/sanVol1/scratch/mm3/nib /san/sanVol1/scratch/mm7/nib {check out line+ ../chainRaw/$(root1).chain} 
#ENDLOOP EOF ls -1S ../psl/*.psl > in.lst gensub2 in.lst single gsub spec para create spec para push para time #Completed: 40 of 40 jobs #CPU time in finished jobs: 8687s 144.79m 2.41h 0.10d 0.000 y #IO & Wait Time: 2751s 45.85m 0.76h 0.03d 0.000 y #Average job time: 286s 4.77m 0.08h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 920s 15.33m 0.26h 0.01d #Submission to last job: 921s 15.35m 0.26h 0.01d # MM4 cd /san/sanVol1/scratch/mm4/blat.mm7.2005-11-17/raw /san/sanVol1/scratch/andy/mm7SplitLift.sh cd ../ mkdir chainRun chainRaw cd chainRun cat > gsub << "EOF" #LOOP /cluster/bin/x86_64/axtChain -verbose=0 -psl $(path1) /scratch/mus/mm4/softNib /san/sanVol1/scratch/mm7/nib {check out line+ ../chainRaw/$(root1).chain} #ENDLOOP EOF ls -1S ../psl/*.psl > in.lst gensub2 in.lst single gsub spec para create spec para push para time #Completed: 40 of 40 jobs #CPU time in finished jobs: 7678s 127.96m 2.13h 0.09d 0.000 y #IO & Wait Time: 4254s 70.90m 1.18h 0.05d 0.000 y #Average job time: 298s 4.97m 0.08h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 3087s 51.45m 0.86h 0.04d #Submission to last job: 3088s 51.47m 0.86h 0.04d # MM5 cd /san/sanVol1/scratch/mm5/blat.mm7.2005-11-17/raw /san/sanVol1/scratch/andy/mm7SplitLift.sh cd ../ mkdir chainRun chainRaw cd chainRun cat > gsub << "EOF" #LOOP /cluster/bin/x86_64/axtChain -verbose=0 -psl $(path1) /scratch/mus/mm5/softNib /san/sanVol1/scratch/mm7/nib {check out line+ ../chainRaw/$(root1).chain} #ENDLOOP EOF ls -1S ../psl/*.psl > in.lst gensub2 in.lst single gsub spec para create spec para push para time #Completed: 40 of 40 jobs #CPU time in finished jobs: 8450s 140.83m 2.35h 0.10d 0.000 y #IO & Wait Time: 9259s 154.32m 2.57h 0.11d 0.000 y #Average job time: 443s 7.38m 0.12h 0.01d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 2929s 48.82m 0.81h 0.03d #Submission to last job: 2968s 49.47m 0.82h 0.03d # MM6 cd /san/sanVol1/scratch/mm6/blat.mm7.2005-11-17/raw 
/san/sanVol1/scratch/andy/mm7SplitLift.sh cd ../ mkdir chainRun chainRaw cd chainRun cat > gsub << "EOF" #LOOP /cluster/bin/x86_64/axtChain -verbose=0 -psl $(path1) /scratch/mus/mm6/nib /san/sanVol1/scratch/mm7/nib {check out line+ ../chainRaw/$(root1).chain} #ENDLOOP EOF ls -1S ../psl/*.psl > in.lst gensub2 in.lst single gsub spec para create spec para push para time #Completed: 40 of 40 jobs #CPU time in finished jobs: 8140s 135.66m 2.26h 0.09d 0.000 y #IO & Wait Time: 13372s 222.87m 3.71h 0.15d 0.000 y #Average job time: 538s 8.96m 0.15h 0.01d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 2822s 47.03m 0.78h 0.03d #Submission to last job: 2857s 47.62m 0.79h 0.03d ######### CHAINMERGE/NET/NETSUBSET ssh kolossus cd /scratch/andy/mm7Lifts cp -r ~/san/mm6/blat.mm7.2005-11-17/chainRaw/ . mkdir chain time /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin #real 30m32.406s #user 25m57.779s #sys 1m57.769s cp -r chain ~/san/mm6/blat.mm7.2005-11-17/ rm -rf chain* cp -r ~/san/mm5/blat.mm7.2005-11-17/chainRaw/ . mkdir chain time /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin #real 31m48.856s #user 27m19.356s #sys 2m4.895s cp -r chain ~/san/mm5/blat.mm7.2005-11-17/ rm -rf chain* cp -r ~/san/mm4/blat.mm7.2005-11-17/chainRaw/ . mkdir chain time /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin #real 26m57.181s #user 23m22.516s #sys 1m45.750s cd chain/ for c in *.chain; do echo ${c%.chain}; /cluster/bin/x86_64/chainNet $c /cluster/data/mm4/chrom.sizes \ /cluster/data/mm7/chrom.sizes ../net/${c%.chain}.net /dev/null echo done $c done for chain in *; do c=${chain%.chain} /cluster/bin/x86_64/netChainSubset ../net/$c.net $chain ../over/$c.over done cp -r * ~/san/mm4/blat.mm7.2005-11-17/ rm -rf * # Ehhhh... do the other ones in a cluster job cp -r ~/san/mm3/blat.mm7.2005-11-17/chainRaw/ . 
mkdir chain
time /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
#real 33m13.555s
#user 28m17.531s
#sys 2m5.165s
cp -r chain ~/san/mm3/blat.mm7.2005-11-17/
rm -rf chain*

# Net the per-chromosome chains and extract the liftOver subset as a
# cluster job: one netOver.sh invocation per .chain file.
ssh pk
cd ~/san/andy
cat << "EOF" > netOver.sh
#!/bin/bash
# usage: netOver.sh <blatDir>/chain/<chrom>.chain <old-assembly chrom.sizes>
# Runs chainNet (old assembly vs mm7) on one chain file, then
# netChainSubset, writing <blatDir>/net/<chrom>.net and
# <blatDir>/over/<chrom>.over next to the chain directory.
chain=$1
chrom=$(basename "$chain" .chain)
sizesMMOld=$2
sizesMM7=/cluster/data/mm7/chrom.sizes
chainDir=$(dirname "$chain")
blatDir=$(dirname "$chainDir")
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over
mkdir -p "${blatDir}"/{over,net}
/cluster/bin/x86_64/chainNet "$chain" "$sizesMMOld" "$sizesMM7" "$net" /dev/null
/cluster/bin/x86_64/netChainSubset "$net" "$chain" "$over"
EOF
# << for emacs
chmod +x netOver.sh
mkdir netRun
cd netRun/
# One job line per chain file for mm3, mm5 and mm6 (mm4 is not added here).
find /san/sanVol1/scratch/mm3/blat.mm7.2005-11-17/chain -name "*.chain" \
  | awk '{print "/san/sanVol1/scratch/andy/netOver.sh " $1 " /cluster/data/mm3/chrom.sizes"}' >> spec
find /san/sanVol1/scratch/mm5/blat.mm7.2005-11-17/chain -name "*.chain" \
  | awk '{print "/san/sanVol1/scratch/andy/netOver.sh " $1 " /cluster/data/mm5/chrom.sizes"}' >> spec
find /san/sanVol1/scratch/mm6/blat.mm7.2005-11-17/chain -name "*.chain" \
  | awk '{print "/san/sanVol1/scratch/andy/netOver.sh " $1 " /cluster/data/mm6/chrom.sizes"}' >> spec
para create spec
para push
para time
#Completed: 120 of 120 jobs
#CPU time in finished jobs: 4826s 80.44m 1.34h 0.06d 0.000 y
#IO & Wait Time: 6816s 113.59m 1.89h 0.08d 0.000 y
#Average job time: 97s 1.62m 0.03h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 247s 4.12m 0.07h 0.00d
#Submission to last job: 340s 5.67m 0.09h 0.00d

########## FINISHING
ssh hgwdev

# MM3
cd ~/san/mm3/blat.mm7.2005-11-17/over
# Write the merged chain one level up: over/ is removed just below, and
# the symlinks expect blat.mm7/mm3ToMm7.over.chain.
cat * >> ../mm3ToMm7.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -r blat.mm7.2005-11-17/ /cluster/data/mm3/bed
cd /cluster/data/mm3/bed
ln -s blat.mm7.2005-11-17 blat.mm7
ln -s `pwd`/blat.mm7/mm3ToMm7.over.chain liftOver/mm3ToMm7.over.chain
ln -s `pwd`/liftOver/mm3ToMm7.over.chain
/gbdb/mm3/liftOver/mm3ToMm7.over.chain cd /usr/local/apache/htdocs/goldenPath/mm3/liftOver cp /gbdb/mm3/liftOver/mm3ToMm7.over.chain . gzip mm3ToMm7.over.chain hgAddLiftOverChain mm3 mm7 /gbdb/mm3/liftOver/mm3ToMm7.over.chain # MM4 cd ~/san/mm4/blat.mm7.2005-11-17/over cat * >> ../mm4ToMm7.over.chain cd ../ rm -rf psl/ net/ chain/ chainRaw/ over/ cd ../ cp -r blat.mm7.2005-11-17/ /cluster/data/mm4/bed cd /cluster/data/mm4/bed ln -s blat.mm7.2005-11-17 blat.mm7 ln -s `pwd`/blat.mm7/mm4ToMm7.over.chain liftOver/mm4ToMm7.over.chain ln -s `pwd`/liftOver/mm4ToMm7.over.chain /gbdb/mm4/liftOver/mm4ToMm7.over.chain mkdir -p /usr/local/apache/htdocs/goldenPath/mm4/liftOver cd /usr/local/apache/htdocs/goldenPath/mm4/liftOver cp /gbdb/mm4/liftOver/mm4ToMm7.over.chain . gzip mm4ToMm7.over.chain hgAddLiftOverChain mm4 mm7 /gbdb/mm4/liftOver/mm4ToMm7.over.chain # MM5 cd ~/san/mm5/blat.mm7.2005-11-17/over cat * >> ../mm5ToMm7.over.chain cd ../ rm -rf psl/ net/ chain/ chainRaw/ over/ cd ../ cp -r blat.mm7.2005-11-17/ /cluster/data/mm5/bed cd /cluster/data/mm5/bed ln -s blat.mm7.2005-11-17 blat.mm7 ln -s `pwd`/blat.mm7/mm5ToMm7.over.chain liftOver/mm5ToMm7.over.chain ln -s `pwd`/liftOver/mm5ToMm7.over.chain /gbdb/mm5/liftOver/mm5ToMm7.over.chain mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/liftOver cd /usr/local/apache/htdocs/goldenPath/mm5/liftOver cp /gbdb/mm5/liftOver/mm5ToMm7.over.chain . 
gzip mm5ToMm7.over.chain hgAddLiftOverChain mm5 mm7 /gbdb/mm5/liftOver/mm5ToMm7.over.chain # MM6 cd ~/san/mm6/blat.mm7.2005-11-17/over cat * >> ../mm6ToMm7.over.chain cd ../ rm -rf psl/ net/ chain/ chainRaw/ over/ cd ../ cp -r blat.mm7.2005-11-17/ /cluster/data/mm6/bed cd /cluster/data/mm6/bed ln -s blat.mm7.2005-11-17 blat.mm7 ln -s `pwd`/blat.mm7/mm6ToMm7.over.chain liftOver/mm6ToMm7.over.chain ln -s `pwd`/liftOver/mm6ToMm7.over.chain /gbdb/mm6/liftOver/mm6ToMm7.over.chain mkdir -p /usr/local/apache/htdocs/goldenPath/mm6/liftOver cd /usr/local/apache/htdocs/goldenPath/mm6/liftOver cp /gbdb/mm6/liftOver/mm6ToMm7.over.chain . gzip mm6ToMm7.over.chain hgAddLiftOverChain mm6 mm7 /gbdb/mm6/liftOver/mm6ToMm7.over.chain ############################################################################ # RIKEN CAGE STUFF (DONE 11-16-2005 Andy) cd /cluster/data/mm7/bed mkdir rikenCageCtss cd rikenCageCtss/ hgsql mm5 -e 'select * from rikenCageTc' | cut -f2- | tail +2 > rikenCageTc.mm5.bed hgsql mm5 -e 'select chrom,chromStart,chromEnd,dataValue from rikenCageCtssMinus' \ | tail +2 > minus.mm5.bed hgsql mm5 -e 'select chrom,chromStart,chromEnd,dataValue from rikenCageCtssPlus' \ | tail +2 > plus.mm5.bed liftOver rikenCageTc.mm5.bed /gbdb/mm5/liftOver/mm5ToMm7.over.chain rikenCageTc.mm7.bed \ rikenCageTc.mm7.missed liftOver plus.mm5.bed /gbdb/mm5/liftOver/mm5ToMm7.over.chain plus.mm7.bed \ plus.mm7.missed liftOver minus.mm5.bed /gbdb/mm5/liftOver/mm5ToMm7.over.chain minus.mm7.bed \ minus.mm7.missed wc -l *.missed hgLoadBed mm7 rikenCageTc rikenCageTc.mm7.bed hgLoadBed -strict -bedGraph=4 mm7 rikenCageCtssMinus minus.mm7.bed hgLoadBed -strict -bedGraph=4 mm7 rikenCageCtssPlus plus.mm7.bed ############################################################################ # Hg17 BLASTZ Mm7 Lineage specific repeats comparison: # using Lineage Specific Repeats: # Mm7 target chunk size: 10,000,000 overlap 10,000 # Hg17 query size: 30,000,000 # chain minScore=3000 linearGap=medium # Hg17 on 
Mm7 featureBits mm7 chainHg17Link # 996434728 bases of 2583394090 (38.571%) in intersection featureBits refGene:cds mm7 chainHg17Link -enrichment # refGene:cds 1.022%, chainHg17Link 38.571%, both 1.004%, # cover 98.24%, enrich 2.55x # Mm7 on Hg17 featureBits hg17 chainMm7Link # 994737081 bases of 2866216770 (34.706%) in intersection featureBits refGene:cds hg17 chainMm7Link -enrichment # refGene:cds 1.060%, chainMm7Link 34.706%, both 1.032%, # cover 97.35%, enrich 2.80x # Not using Lineage Specific Repeats, using BLASTZ dynamic masking # with M=50, chain minScore=1000, linearGap=loose # Hg17 target chunk size: 500,000 overlap 50 # Mm7 query chunk size: entire genome # Hg17 on Mm7 featureBits mm7 chainHg17noLSRLink # 952987983 bases of 2583394090 (36.889%) in intersection featureBits refGene:cds mm7 chainHg17noLSRLink -enrichment # refGene:cds 1.022%, chainHg17noLSRLink 36.889%, both 0.987%, # cover 96.61%, enrich 2.62x featureBits hg17 chainMm7noLSRLink # 955168137 bases of 2866216770 (33.325%) in intersection # Mm7 on Hg17 featureBits refGene:cds hg17 chainMm7noLSRLink -enrichment # refGene:cds 1.060%, chainMm7noLSRLink 33.325%, both 1.031%, # cover 97.31%, enrich 2.92x # Intersection of the two experiments: featureBits hg17 chainMm7noLSRLink chainMm7Link # 899176517 bases of 2866216770 (31.372%) in intersection featureBits mm7 chainHg17noLSRLink chainHg17Link # 896080475 bases of 2583394090 (34.686%) in intersection # Chain length measurements: # With lineage specific repeats # Mm7 on Hg17: # Number of Chains: 3,073,210 # min: 32 max: 242 median: 880 stddev: 206,679 # Number of Chain Links: 61,184,737 # min: 1 max: 17,154 median: 23 stddev: 64.4087 # Hg17 on Mm7: # Number of Chains: 3,073,210 # min: 32 max: 1.22701e+08 median: 883 stddev: 177,227 # Number of Chain Links: 61,184,737 # min: 1 max: 17,154 median: 23 stddev: 64.4087 # Without lineage specific repeats: # Mm7 on Hg17: # Number of Chains: 990,397 # min: 26 max: 1.45906e+08 median: 330 stddev: 458,649 #
min: 4,262 max: 2.45523e+08 median: 1.35414e+08 stddev: 6.02228e+07 # Number of Chain Links: 41,223,632 # min: 1 max: 17,154 median: 22 stddev: 43.5466 # table sizes: # on hg17: # chainMm7Link 2,586,839,468 bytes 61,184,737 rows # chainMm7noLSRLink 1,743,097,348 bytes 41,223,632 rows # chainMm7 267,042,956 bytes 3,073,210 rows # chainMm7noLSR 86,227,444 bytes 990,397 rows # on mm7: # chainHg17Link 2,593,329,860 bytes 61,184,737 rows # chainHg17noLSRLink 1,958,082,264 bytes 41,223,632 rows # chainHg17 267,023,452 bytes 3,073,210 rows # chainHg17noLSR 89,196,048 bytes 990,397 rows ############################################################################ # ADD LINK TO GENENETWORK (DONE. 2/9/06 Fan). # Copy data from mm6 hgsql mm6 -N -e 'select * from geneNetworkId' > geneNetworkId.tab hgsql mm7 -e 'drop table geneNetworkId' hgsql mm7 < ~/src/hg/lib/geneNetworkId.sql hgsql mm7 -e 'load data local infile "geneNetworkId.tab" into table geneNetworkId' # BLASTZ/CHAIN/NET RN4 (DONE 2/23/06 angie) ssh pk mkdir /cluster/data/mm7/bed/blastz.rn4.2006-02-23 cd /cluster/data/mm7/bed/blastz.rn4.2006-02-23 cat << '_EOF_' > DEF # mouse vs rat BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm7 SEQ1_DIR=/scratch/hg/mm7/nib SEQ1_SMSK=/scratch/hg/mm7/linSpecRep/notInRat SEQ1_LEN=/scratch/hg/mm7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself SEQ2_DIR=/scratch/hg/rn4/nib SEQ2_SMSK=/cluster/bluearc/scratch/hg/rn4/linSpecRep.notInMouse SEQ2_LEN=/cluster/data/rn4/chrom.sizes SEQ2_CHUNK=300000000 SEQ2_LAP=0 BASE=/cluster/data/mm7/bed/blastz.rn4.2006-02-23 TMPDIR=/scratch/tmp '_EOF_' # << for emacs doBlastzChainNet.pl DEF -chainMinScore 3000 -chainLinearGap medium \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/blastzMm7Rn4Out >& do.log & tail -f do.log rm -f /cluster/data/mm7/bed/blastz.rn4 ln -s blastz.rn4.2006-02-23 /cluster/data/mm7/bed/blastz.rn4 # Update table maps 
# known genes to visiGene images (2006-03-07 galt)
knownToVisiGene mm7

# UPDATED mm7.knownToVisiGene (2006-03-14 galt)
ssh hgwdev
knownToVisiGene mm7

# UPDATED mm7.knownToVisiGene (2006-04-05 galt)
ssh hgwdev
knownToVisiGene mm7

###########################################################################
# SPLIT SEQUENCE FOR LIFTOVER (DROPUNDER) CHAINS FROM MM8 (2006-04-06 kate)
ssh kkr1u00
cd /cluster/data/mm7/bed
# Already inside bed/, so create liftOver directly; the later liftOver
# steps use /cluster/data/mm7/bed/liftOver.
mkdir liftOver
cd liftOver
makeLoChain-split mm7 /cluster/data/mm7/nib >&! split.log &

#######################################################################
## LIFTOVER To Mm8 (DONE - 2006-04-21 - 2006-04-24 - Hiram)
ssh kkr1u00
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
  mm8 /cluster/data/mm8/nib
# as it says, DO THIS NEXT:
ssh kk
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
  mm7 /scratch/hg/mm7/nib mm8 /iscratch/i/mm8/split10k \
  /cluster/data/mm8/11.ooc
# as it says, DO THIS NEXT:
cd /cluster/data/mm7/bed/blat.mm8.2006-04-21/run
para try, check, push, check, ...
# Completed: 1360 of 1360 jobs
# CPU time in finished jobs: 4087937s 68132.28m 1135.54h 47.31d 0.130 y
# IO & Wait Time: 15121s 252.02m 4.20h 0.18d 0.000 y
# Average job time: 3017s 50.28m 0.84h 0.03d
# Longest finished job: 25341s 422.35m 7.04h 0.29d
# Submission to last job: 84772s 1412.87m 23.55h 0.98d
# as it says, DO THIS NEXT:
ssh kkr1u00
cd /cluster/data/mm7/bed
ln -s blat.mm8.2006-04-21 blat.mm8
# runs liftUp to create the psl files
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh mm7 mm8
# as it says, DO THIS NEXT:
ssh kki
# prepares the chain batch job
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
  mm7 /cluster/data/mm7/nib mm8 /cluster/data/mm8/nib
# as it says, DO THIS NEXT:
cd /cluster/data/mm7/bed/blat.mm8.2006-04-21/chainRun
para try, check, push, check, ...
# Completed: 34 of 34 jobs # CPU time in finished jobs: 5834s 97.23m 1.62h 0.07d 0.000 y # IO & Wait Time: 852s 14.20m 0.24h 0.01d 0.000 y # Average job time: 197s 3.28m 0.05h 0.00d # Longest finished job: 622s 10.37m 0.17h 0.01d # Submission to last job: 1367s 22.78m 0.38h 0.02d ssh kkstore02 $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh mm7 mm8 # Created /cluster/data/mm7/bed/liftOver/mm7ToMm8.over.chain.gz # as it says, DO THIS NEXT: ssh hgwdev $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh mm7 mm8 # It says this: # Now, add link for # /usr/local/apache/htdocs/goldenPath/mm7/liftOver/mm7ToMm8.over.chain # to hgLiftOver # But I believe that link was already done: cd /gbdb/mm7/liftOver ls -og mm7ToMm8* # lrwxrwxrwx 1 53 Apr 24 14:22 mm7ToMm8.over.chain.gz -> \ # /cluster/data/mm7/bed/liftOver/mm7ToMm8.over.chain.gz # ALTGRAPHX TRACK (sugnet) Wed Apr 26 14:00:54 PDT 2006 # First make altGraphX track for hg18 (see makeHg18.doc) cd /cluster/store1/sugnet/altSplice/hg18-2006.04.13/ mkdir orthoSplice.mm7 cd orthoSplice.mm7/ cp /cluster/data/mm7/bed/blastz.hg18/axtChain/mm7.hg18.all.chain.gz . cp /cluster/data/mm7/bed/blastz.hg18/axtChain/mm7.hg18.net.gz . 
gunzip *.gz
# chrom/size pairs for every mm7 chromosome, header row dropped
echo 'select chrom, size from chromInfo' | hgsql mm7 | grep -v chrom > chromSizes.tab
chainSplit chains mm7.hg18.all.chain
netSplit mm7.hg18.net nets
mkdir agx report logs
# makeRun.sh prints one orthoSplice command line per chromosome listed in
# chromSizes.tab ($w[0] = chrom name, $w[1] = chrom size).
cat << '_EOF_' > makeRun.sh
#!/usr/bin/perl -w
open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n";
# read chromSizes.tab line by line from the IN handle
while(<IN>) {
    chomp;
    @w = split;
    print "/cluster/home/sugnet/bin/i386/orthoSplice -chromSize=$w[1] -trumpNum=3 -chrom=$w[0] -altInFile=../mm7/agxs/mm7.$w[0].agx -orthoAgxFile=../hg18/hg18.agx -db=mm7 -orthoDb=hg18 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].mm7.hg18.cons.t3.agx -reportFile=report/$w[0].mm7.report -edgeFile=report/$w[0].mm7.edge.report >& logs/$w[0].test.log\n";
}
'_EOF_'
# << this line keeps emacs coloring happy
chmod 755 makeRun.sh
./makeRun.sh > orthoSplice.para.spec
# NOTE(review): running orthoSplice.para.spec is not recorded here; the
# next step reads agx/*.agx, which those jobs write -- confirm.
cat agx/*.agx > mm7.hg18.t3.agx
cp ~/latestJk/kent/src/hg/lib/altGraphX.sql .
hgLoadBed -notItemRgb -sqlTable=altGraphX.sql mm7 altGraphX mm7.hg18.t3.agx
# done altGraphX track.

# EXONWALK TRACK (sugnet) Wed Apr 26 14:04:28 PDT 2006
# First build the altGraphX track above.
cd /cluster/store1/sugnet/altSplice/hg18-2006.04.13/orthoSplice.mm7
mkdir exonWalk
cd exonWalk
mkdir beds
# (csh) one exonWalk job per .agx file produced above
foreach file (`ls ../agx/*.agx`)
    set base=`basename $file .agx`
    echo "/cluster/home/sugnet/bin/i386/exonWalk db=mm7 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec
end
para create exonWalk.para.spec
para push
cat beds/*.bed > mm7.hg18.cons.t3.bed
mkdir orfs
cd orfs
mkdir bedOrf beds fa borf
splitFile ../../mm7.hg18.cons.t3.bed 500 exonWalk.
cd ..
cat << '_EOF_' > makeFa.sh #!/bin/sh for file in "$@" do base=`basename $file` echo "Doing $file" echo "sequenceForBed -db=mm7 -bedIn=$file -fastaOut=fa/$base.fa " sequenceForBed -db=mm7 -bedIn=$file -fastaOut=fa/$base.fa done '_EOF_' # << this line keeps emacs coloring happy chmod 755 makeFa.sh makeFa.sh beds/* cat << '_EOF_' > makeGenePred.sh #!/bin/sh for file in "$@" do base=`basename $file` /cluster/home/sugnet/bin/i386/borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp done '_EOF_' # << this line keeps emacs coloring happy cat << '_EOF_' > makeGenePredWithNmd.sh #!/bin/sh for file in "$@" do base=`basename $file` /cluster/home/sugnet/bin/i386/borfMatcher -keepNmd beds/$base borf/$base.borf bedOrfWithNmd/$base.bed genePredWithNmd/$base.gp done '_EOF_' # << this line keeps emacs coloring happy chmod 755 *.sh mkdir genePred mkdir genePredWithNmd mkdir bedOrfWithNmd ./makeGenePred.sh beds/* ./makeGenePredWithNmd.sh beds/* cat beds/* > mm7.hg18.exonWalk.all.bed cat genePred/* > mm7.hg18.exonWalk.gp cat genePredWithNmd/* > mm7.hg18.exonWalk.withNmd.gp hgLoadBed mm7 exonWalkAll mm7.hg18.exonWalk.all.bed ldHgGene -predTab mm7 exonWalk mm7.hg18.exonWalk.noNmd.gp ldHgGene -predTab mm7 exonWalkWithNmd mm7.hg18.exonWalk.withNmd.gp trackGenome mm7 all refGene:cds trackGenome.spec Track Specification track overlap track cov track new cum size size geno track cov cov cov ----------------------------------------------------------------------------- exonWalk:cds 29568035 24053910 1.04% 81.35% 88.02% 88.02% 88.02% exonWalk 50860916 24360287 1.79% 47.90% 89.14% 89.14% 89.14% exonWalkWithNmd:cdf 50860916 24360287 1.79% 47.90% 89.14% 89.14% 89.14% exonWalk 50860916 24360287 1.79% 47.90% 89.14% 89.14% 89.14% exonWalkAll 58621025 25081327 2.06% 42.79% 91.78% 91.78% 91.78% mrna 136069914 26020276 4.78% 19.12% 95.22% 95.22% 95.22% intronEst 52046578 21219340 1.83% 40.77% 77.65% 77.65% 77.65% est 185259362 23838078 6.51% 12.87% 87.23% 87.23% 87.23% 
[hgwdev:orfs> trackGenome mm7 all refGene trackGenome.spec Track Specification track overlap track cov track new cum size size geno track cov cov cov ----------------------------------------------------------------------------- exonWalk:cds 29568035 24517871 1.04% 82.92% 52.77% 52.77% 52.77% exonWalk 50860916 38905251 1.79% 76.49% 83.74% 83.74% 83.74% exonWalkWithNmd:cdf 50860916 38905251 1.79% 76.49% 83.74% 83.74% 83.74% exonWalk 50860916 38905251 1.79% 76.49% 83.74% 83.74% 83.74% exonWalkAll 58621025 40837805 2.06% 69.66% 87.90% 87.90% 87.90% mrna 136069914 44970708 4.78% 33.05% 96.79% 96.79% 96.79% intronEst 52046578 28482307 1.83% 54.72% 61.30% 61.30% 61.30% est 185259362 41486850 6.51% 22.39% 89.30% 89.30% 89.30% # done exonWalk. ############################################################################ # SGP GENES (DONE - 2006-05-05 - Fan) ssh kkstore02 cd mkdir /cluster/data/mm7/bed mv sgp sgp_old mkdir /cluster/data/mm7/bed/sgp cd /cluster/data/mm7/bed/sgp # They don't do chrM bash for CHR in `awk '{print $1}' ../../chrom.sizes | grep -v chrM` do wget --timestamping \ "http://genome.imim.es/genepredictions/M.musculus/mmDec2005/SGP/humangp200603/${CHR}.gtf" \ -O "${CHR}.gtf" done ssh hgwdev cd /cluster/data/mm7/bed/sgp ldHgGene -gtf -genePredExt mm7 sgpGene chr*.gtf # Dropped the sgpPep table, just let hgc calculate predict protein sequence. 
hgsql mm7 -e 'drop table sgpPep' featureBits mm7 -enrichment refGene:CDS sgpGene refGene:CDS 1.059%, sgpGene 1.451%, both 0.912%, cover 86.18%, enrich 59.39x ########################################################################### #### LOAD ENSEMBL GENES (DONE 5/12/06, Fan) # ADDDED STABLE URL TO TRACKDB BLOCK (V38, APR 2006) (2008-01-11, rhead) # needed for Gene Sorter procedure below # Ensembl released Mouse build 34 the week of August 10th, 2005 mkdir -p /cluster/store5/mm7/bed/ensGene # ln -s /cluster/store5/mm7/bed/ensGene /cluster/data/mm7/bed cd /cluster/data/mm7/bed/ensGene # Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview # Choose Ensembl 38 and Mus musculus genes (NCBIM35), click next # Follow this sequence through the pages: 1) Select "Known genes" in the Gene section. Hit next. NOTE: "Known genes" is no longer available for later versions of BioMart. 2) Uncheck everything on "Filter" page. Then hit next. 3) Select "Structures". 4) Choose GTF as the output, choose gzip compression, name the output file ensGeneMm7.gtf.gz and then hit Export # Ensembl handles random chromosomes differently than us, so we # strip this data. Fortunately it just loses a couple of genes. gzip -d ensGeneMm7.gtf.gz cat ensGeneMm7.gtf | grep -v ^6_DR51 | grep -v NT_ > unrandom.gtf # Let's see how much it loses: # 614986 8609804 87362912 ensGeneMm7.gtf # 603148 8444072 85668522 unrandom.gtf # Add "chr" to front of each line in the gene data gtf file to make # it compatible with ldHgGene sed -e "s/^/chr/" unrandom.gtf | sed -e "s/chrMT/chrM/"| sed -e 's/\..\"/\"/g' > ensGene.gtf ldHgGene mm7 ensGene ensGene.gtf # Read 33902 transcripts in 603148 lines in 1 files # 33902 groups 22 seqs 1 sources 4 feature types # 33902 gene predictions # save space, gzip them: gzip unrandom.gtf gzip ensGene.gtf # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" box. 
In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format. Result name ensGtp. # Save file as ensGtp.tsv.gz gunzip ensGtp.tsv.gz hgsql mm7 < ~/kent/src/hg/lib/ensGtp.sql # remove header line from ensGtp.txt echo "load data local infile 'ensGtp.tsv' into table ensGtp ignore 1 lines" | hgsql -N mm7 # Load Ensembl peptides: # Get them from ensembl as above in the gene section except for # Page 3) Choose the "Sequences" box. # Page 4) check Peptide, then Ensembl Gene ID, Ensembl Transcript ID, # and Ensembl Peptide ID, uncheck others (chrom, bioType, etc). Format = FASTA. # Save file as ensemblPep5.fa.gz (several previous trials failed). gunzip ensemblPep5.fa.gz cat ensemblPep5.fasta | faToTab -type=protein stdin stdout | grep -v SEQXENCEXNAVAILAXLE > j.tab cat j.tab |grep -v "Reading"|awk '{print ">" $1;print $2}' > ensPep.fa hgPepPred mm7 ensembl ensPep.fa rm j.tab gzip * # Create knownToEnsembl column hgMapToGene mm7 ensGene knownGene knownToEnsembl # Compress everthing to save space gzip * #### BUILD Ensembl cross-reference table, ensemblXref3 (TBD) # PLEASE NOTE THAT THE ENSEMBLXREF3 TABLE IS BUILT USING ENSEMBL BIOMART DATA OF MOUSE BUILD 34. # THIS TABLE IS NEEDED TO SUPPORT SUPERFAMILY TRACK OF THE PROTEOME BROWSER. # SINCE ENSEMBL CHANGED THE DATA FORMAT AGAIN (AS USUAL :-( ), THERE IS NO VERSION NUMBER # IN THEIR IDs, A FAKE "0" IS GENERATED FOR EACH ID IN ensemblXref3 TABLE. # Get the ensembl gene/protein cross-reference data BioMart # Follow this sequence through the pages: # 1) Make sure that the Mus musculus choice is selected. Hit next. # 2) Choose the "Feature" box, select Ensembl gene, Ensembl transcript, # and Ensembl peptid IDs, UniProt/SPTREMBL ID, UniProt/Swiss-Prot ID, # and UniProt/Swiss-Prot Accession # 3) Choose "Text, tab separated". choose gzip compression. hit export. 
# Save as ensXref2.tsv.gz gzip -d ensXref2.tsv.gz hgsql mm7 < ~/hg/lib/ensemblXref3Temp.sql hgsql mm7 -e 'load data local infile "ensXref2.tsv" into table ensemblXref3Temp ignore 1 lines' hgsql mm7 -N -e 'select gene, "0", transcript, "0", protein, "0", tremblAcc, swissDisplayId, swissAcc from ensemblXref3Temp' \ > ensemblXref3.tab hgsql mm7 -e 'drop table ensemblXref3' hgsql mm7 <~/src/hg/lib/ensemblXref3.sql hgsql mm7 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3' # load the table into proteome DB also hgsql proteome <~/src/hg/lib/ensemblXref3.sql hgsql proteome -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3' ######################################################################## ### microRNA targets tracks (DONE - 2006-05-16 - Hiram) ### from: http://pictar.bio.nyu.edu/ Rajewsky Lab ### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu ### Yi-Lu Wang ylw205@nyu.edu ### dg@thp.Uni-Koeln.DE ssh hgwdev mkdir /cluster/data/mm7/bed/picTar cd /cluster/data/mm7/bed/picTar wget --timestamping \ 'http://pictar.bio.nyu.edu/ucsc/mouse/mouse_bed' -O mouse_dog.bed wget --timestamping \ 'http://pictar.bio.nyu.edu/ucsc/mouse/mouse_chicken_bed' \ -O mouse_chicken.bed grep -v "^track" mouse_dog.bed \ | hgLoadBed -strict mm7 picTarMiRNADog stdin # Loaded 127119 elements of size 9 grep -v "^track" mouse_chicken.bed \ | hgLoadBed -strict mm7 picTarMiRNAChicken stdin # Loaded 12298 elements of size 9 nice -n +19 featureBits mm7 picTarMiRNADog # 449871 bases of 2583394090 (0.017%) in intersection nice -n +19 featureBits mm7 picTarMiRNAChicken # 35586 bases of 2583394090 (0.001%) in intersection ######################################################################### ### IGTC (Int'l GeneTrap Consortium) (DONE - 2006-06-07 - angie) ### fasta added 2006-06-21 ### Doug Stryke in Tom Ferrin's lab ### NOTE -- as of 2007-03-01 the igtc track will be automatically ### updated on hgwdev by the scripts monthlyUpdateIgtc.csh and ### updateIgtc.pl in 
kent/src/hg/utils/automation/ . ssh hgwdev mkdir /cluster/data/mm7/bed/igtc cd /cluster/data/mm7/bed/igtc wget http://www.genetrap.org/blattrack/genetrap_mm7.psl grep -v ^track genetrap_mm7.psl \ | hgLoadPsl mm7 -table=igtc stdin # Probe fasta is shared by all assemblies: wget http://www.genetrap.org/blattrack/genetrap.fasta mkdir /gbdb/mm7/igtc ln -s /cluster/data/mm7/bed/igtc/genetrap.fasta /gbdb/mm7/igtc/ hgLoadSeq -replace mm7 /gbdb/mm7/igtc/genetrap.fasta ######################################################################### # LIFTOVER TO MM6 (DONE, 2006-06-11 - 2006-06-12, hartera) # Split of mm6 sequences done using makeLoChain-split and doc'd in # makeMm6.doc: SPLIT MM6 SEQUENCES FOR LIFTOVER FROM OTHER ASSEMBLIS section. # Run on pk as faster than kk. ssh pk mkdir -p /cluster/data/mm7/bed/liftOver cd /cluster/data/mm7/bed/liftOver cat << '_EOF_' > align.csh #!/bin/csh -fe set oldAssembly = $1 set oldNibDir = $2 set newAssembly = $3 set newSplitDir = $4 set ooc = $5 if ("$ooc" != "") then set ooc = '-ooc='$ooc endif set blatDir = /cluster/data/$oldAssembly/bed/blat.$newAssembly.`date +%Y-%m-%d` echo "Setting up blat in $blatDir" rm -fr $blatDir mkdir $blatDir cd $blatDir mkdir raw psl run cd run echo '#LOOP' > gsub echo 'blat $(path1) $(path2) {check out line+ ../raw/$(root1)_$(root2).psl} ' \ '-tileSize=11 '$ooc' -minScore=100 -minIdentity=98 -fastMap' \ >> gsub echo '#ENDLOOP' >> gsub # target ls -1S $oldNibDir/*.{nib,2bit} > old.lst # query ls -1S $newSplitDir/*.{nib,fa} > new.lst gensub2 old.lst new.lst gsub spec /parasol/bin/para create spec echo "" echo "First two lines of para spec:" head -2 spec echo "" echo "DO THIS NEXT:" echo " cd $blatDir/run" echo " para try, check, push, check, ..." echo "" exit 0 '_EOF_' # << emacs chmod +x align.csh align.csh mm7 /san/sanvol1/scratch/mm7/nib mm6 \ /san/sanvol1/scratch/mm6/split10k \ /san/sanvol1/scratch/mm6/11.ooc >&! align.log & # Took a few seconds. 
# Do what its output says to do next (start cluster job)
cd /cluster/data/mm7/bed/blat.mm6.2006-06-11/run
para try, check, push, check, ...
para time >&! run.time
# Completed: 1600 of 1600 jobs
# CPU time in finished jobs: 1483451s 24724.18m 412.07h 17.17d 0.047 y
# IO & Wait Time: 6704s 111.74m 1.86h 0.08d 0.000 y
# Average job time: 931s 15.52m 0.26h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 9049s 150.82m 2.51h 0.10d
# Submission to last job: 12027s 200.45m 3.34h 0.14d

ssh pk
cd /cluster/data/mm7/bed/liftOver
# lift.csh <oldAssembly> <newAssembly>
# Picks the newest blat.<newAssembly>.20* directory of the old assembly and
# runs liftUp -pslQ on the raw psl files, one output psl per chromosome of
# the new assembly, using the .lft files in the new assembly's split10k dir.
cat << '_EOF_' > lift.csh
#!/bin/csh -ef
set oldAssembly = $1
set newAssembly = $2
set newLiftDir = /san/sanvol1/scratch/$newAssembly/split10k

set prefix = /cluster/data/$oldAssembly/bed/blat.$newAssembly
set blatDir = `ls -td $prefix.20* | head -1`
echo "using dir $blatDir"
if ( ! -e $blatDir/raw ) then
    echo "Can't find $blatDir/raw"
    # nothing to lift without the raw blat output; stop like the check below
    exit 1
endif
if (`ls -1 $newLiftDir/*.lft | wc -l` < 1) then
    echo "Can't find any .lft files in $newLiftDir"
    exit 1
endif
cd $blatDir/raw
foreach chr (`awk '{print $1;}' /cluster/data/$newAssembly/chrom.sizes`)
    echo $chr
    liftUp -pslQ ../psl/$chr.psl $newLiftDir/$chr.lft warn chr*_$chr.psl
end

set execDir = $0:h
echo ""
echo "DO THIS NEXT:"
echo " ssh pk"
echo " $execDir/makeLoChain-chain $oldAssembly <$oldAssembly-nibdir> $newAssembly <$newAssembly-nibdir>"
echo ""
exit 0
'_EOF_'
# << emacs
chmod +x lift.csh
lift.csh mm7 mm6 >&! lift.log &
# makeLoChain-chain can be run on pk. chain alignments
makeLoChain-chain mm7 /san/sanvol1/scratch/mm7/nib \
  mm6 /san/sanvol1/scratch/mm6/nib >&! chain.log &
cd /cluster/data/mm7/bed/blat.mm6.2006-06-11/chainRun
para try, check, push, check, ...
para time >&!
run.time # Completed: 40 of 40 jobs # CPU time in finished jobs: 7800s 130.00m 2.17h 0.09d 0.000 y # IO & Wait Time: 15867s 264.45m 4.41h 0.18d 0.001 y # Average job time: 592s 9.86m 0.16h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1323s 22.05m 0.37h 0.02d # Submission to last job: 1323s 22.05m 0.37h 0.02d ssh kkstore02 cd /cluster/data/mm7/bed/liftOver makeLoChain-net mm7 mm6 >&! net.log & # Took about 50 minutes. # load reference to over.chain into database table, # and create symlinks /gbdb and download area ssh hgwdev cd /cluster/data/mm7/bed/liftOver makeLoChain-load mm7 mm6 >&! load.log & # clean up rm *.log # add md5sum.txt to include this new liftOver file cd /usr/local/apache/htdocs/goldenPath/mm7/liftOver rm md5sum.txt md5sum *.gz > md5sum.txt # copy README.txt from another liftOver directory. # test by converting a region using the "convert" link on # the browser, and comparing to blat of the same region ######################################################################### # REGULATORY POTENTIAL (DONE - 2006-06-12 - Hiram) # download data from "James Taylor" ssh kkstore02 cd /cluster/data/mm7/bed mkdir /cluster/store11/mm7/bed/regPotential7X ln -s /cluster/store11/mm7/bed/regPotential7X . 
cd regPotential7X
# This is a lot of data
time for C in 1 2 3 4 5 6 7 8 9 X 10 11 12 13 14 15 16 17 18 19
do
  wget --timestamping \
    "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_mm7/chr${C}.scores.truncated.bz2"
done
# real 77m57.028s

# Fetch the track description page under a local name.
# (--timestamping is not used here: wget does not support -N together
# with -O and only prints a warning for the combination.)
wget \
  "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_mm7/trackDb.html" \
  -O description.html

# Concatenate the per-chromosome scores, in chromosome order, into a
# single wigEncode run producing one .wig/.wib pair.
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X
do
  bzcat "chr${C}.scores.truncated.bz2"
done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 22m7.525s

# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/mm7/bed/regPotential7X
ln -s /cluster/data/mm7/bed/regPotential7X/regPotential7X.wib \
  /gbdb/mm7/wib/regPotential7X.wib
# using the tmpDir is faster since it is on local disk and it will
# clean up any temporary .tab file it creates there
time hgLoadWiggle -tmpDir=/scratch/tmp \
  mm7 regPotential7X regPotential7X.wig
# real 0m28.683s

# create a histogram
ssh kolossus
cd /cluster/data/mm7/bed/regPotential7X
time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \
  -hMinVal=0.0 -db=mm7 regPotential7X > histogram.data 2>&1
# real 1m38.660s
# *only* 65% of the data is 0.0

# create download gzip files from the bz2 files:
for F in chr*.scores.truncated.bz2
do
  # chromosome name = everything before the first '.' in the file name
  C=${F%%.*}
  printf '%s' "${C}.regPotential7X.mm7.gz working ... "
  bzcat "${F}" | gzip > "${C}.regPotential7X.mm7.gz"
  echo
done

#############################################################################
# Create Allen Brain Atlas mapping.  (Done 2007-02-08 Galt)
# We are creating several things: a psl probe-track for the RR on mouse,
# a link out from kg to the probe to the ABA website,
# and a set of gene/probe info which visiGene will use.
# (This needs to be done after having created sequences in
# ncbiXm and tigrMgiTc as above.)
# metadata.log and SRGEsequence.log were provided by
# Susan Sunkin
# this is an update to the visiGene with 6000 new images.
# See mm6.txt for steps not needing to be repeated.

# copy in the data files (directory already exists from previous build)
ssh hgwdev
cd /cluster/data/mm7/bed/allenBrain
# set the previous build's files aside in old/
# NOTE(review): the glob also matches old/ itself, so mv is expected to
# print a "cannot move ... to a subdirectory of itself" complaint for it.
mkdir old
mv * old/
cp /cluster/data/mm6/bed/allenBrain/allen20061204.tab .
cp /cluster/data/mm6/bed/allenBrain/probeSeq.20061204.fasta .
cp /cluster/data/mm6/bed/allenBrain/allProbes.fa .
cp /cluster/data/mm6/bed/allenBrain/allProbes.tab .
cp /cluster/data/mm6/bed/allenBrain/allenBrainUrl.tab .

# Set up a blat run to align the probes.
ssh pk
cd /cluster/data/mm7/bed/allenBrain
mkdir split
faSplit sequence allProbes.fa 200 split/rp
mkdir run
cd run
# job lists: one entry per probe chunk and one per genome nib file
ls -1 ../split/*.fa > mrna.lst
ls -1 /scratch/hg/mm7/nib/*.nib > genome.lst
mkdir psl
# gensub2 template: one blat job per (nib, probe chunk) pair
cat << '_EOF_' > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 genome.lst mrna.lst gsub spec
para create spec
# Then do the usual para try/push/time/check until the run is finished
#Completed: 7760 of 7760 jobs
#CPU time in finished jobs:      27428s     457.13m     7.62h    0.32d  0.001 y
#IO & Wait Time:                 22916s     381.93m     6.37h    0.27d  0.001 y
#Average job time:                   6s       0.11m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:              20s       0.33m     0.01h    0.00d
#Submission to last job:           516s       8.60m     0.14h    0.01d

# Then do sorting and near-best-in-genome step on file server
ssh kkstore
cd /cluster/data/mm7/bed/allenBrain/run
pslSort dirs raw.psl tmp psl
pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null
# order by psl column 14, then numerically by column 16
sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl

# Clean up big files no longer needed
rm raw.psl
rm -r psl
rm -r ../split

# Load up database
ssh hgwdev
cd /cluster/data/mm7/bed/allenBrain
# Make a new table that contains the URLs for the allen brain genes
# Make this one first since all.joiner considers it the master table.
# "if exists" keeps this step usable when the table has not been created yet
hgsql mm7 -e 'drop table if exists allenBrainUrl'
hgsql mm7 < ~/kent/src/hg/lib/allenBrainUrl.sql
hgsql mm7 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl'

# Make probe alignment table, and load sequence.
hgLoadPsl mm7 allenBrainAli.psl
# replace the /gbdb symlink; -f so a missing link is not an error
rm -f /gbdb/mm7/allenBrain/allProbes.fa
ln -s /cluster/data/mm7/bed/allenBrain/allProbes.fa /gbdb/mm7/allenBrain/allProbes.fa
hgLoadSeq -replace mm7 /gbdb/mm7/allenBrain/allProbes.fa

# Make mapping between known genes and allenBrain
hgMapToGene mm7 allenBrainAli -type=psl knownGene knownToAllenBrain

##########################################################################
# xxBlastTab - Help filter out unwanted paralogs  (Galt 2007-01-11)
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with doHgNearBlastp.pl script.
#
# we want to update mm7 for human and rat,
# so check ./hgGeneData/Mouse/mm7/otherOrgs.ra for current settings
ssh hgwdev
synBlastp.csh mm7 hg17
#mm7.hgBlastTab:
#new number of unique query values:
#24760
#new number of unique target values
#15354
#old number of unique query values:
#27511
#old number of unique target values
#15931
#cleanup:

synBlastp.csh mm7 rn3
#mm7.rnBlastTab:
#new number of unique query values:
#8603
#new number of unique target values
#5071
#old number of unique query values:
#19217
#old number of unique target values
#5508

#########################################################################
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page

# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna mm7

# (DONE galt 2007-02-15)
# Create table that maps between known genes and visiGene database
knownToVisiGene mm7

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
    mm7.upstreamGeneTbl = refGene

#############################################################################
# UPDATE KEGG TABLES (DONE, Fan, 6/18/10)
mkdir -p /hive/data/genomes/mm7/bed/pathways/kegg
cd /hive/data/genomes/mm7/bed/pathways/kegg

wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab
# keggMapDesc.tab: column 1 of map_title.tab prefixed with "mmu", a tab,
# then column 2.  One awk pass replaces the former sed/cut/paste sequence
# and its j.tmp, j.mmu and j.1 scratch files.
awk -F'\t' -v OFS='\t' '{print "mmu" $1, $2}' map_title.tab > keggMapDesc.tab

hgsql mm7 -e 'drop table if exists keggMapDesc'
hgsql mm7 < ~/kent/src/hg/lib/keggMapDesc.sql
hgsql mm7 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'

wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/mmu/mmu_pathway.list
# strip the "path:" prefix, then turn the first remaining ':' into a tab
sed -e 's/path://' -e 's/:/\t/' mmu_pathway.list > j.tmp

# Stage the raw list in keggPathway, join it against knownToLocusLink on
# locusID, then empty the table and reload it from the joined rows.
hgsql mm7 -e 'drop table if exists keggPathway'
hgsql mm7 < ~/kent/src/hg/lib/keggPathway.sql
hgsql mm7 -e 'load data local infile "j.tmp" into table keggPathway'
hgsql mm7 -N -e \
  'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \
  > keggPathway.tab
hgsql mm7 -e 'delete from keggPathway'
hgsql mm7 -e 'load data local infile "keggPathway.tab" into table keggPathway'
rm j.tmp

#############################################################################
# Add KEGG column to mm7 Gene Sorter (Done, Fan, 6/18/2010)
mkdir -p /hive/data/genomes/mm7/bed/geneSorter
cd /hive/data/genomes/mm7/bed/geneSorter

# rows are kgId, mapID, "mapID+locusID": the sed joins the last three
# selected columns (mapID, "+", locusID) into one field
hgsql mm7 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' \
  | sort -u \
  | sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab
hgsql mm7 -e 'drop table if exists knownToKeggEntrez'
hgsql mm7 < ~/kent/src/hg/lib/knownToKeggEntrez.sql
hgsql mm7 -e 'load data local infile "knownToKeggEntrez.tab" into table knownToKeggEntrez'

#############################################################################