# for emacs: -*- mode: sh; -*- # This file describes how we made the browser database on the Rattus # Norvegicus genome, November 2004 update (Rnor3.4) from Baylor. # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+-------------------------------------+ # | tableName | type | # +-------------+-------------------------------------+ # | knownGene | genePred knownGenePep knownGeneMrna | # | refGene | genePred refPep refMrna | # | xenoRefGene | genePred xenoRefPep xenoRefMrna | # | mgcGenes | genePred | # | ensGene | genePred ensPep | # | genscan | genePred genscanPep | # +-------------+-------------------------------------+ # CREATE BUILD DIRECTORY (DONE 1/27/06 angie) # df -h /cluster/store*, choose the one with the most space... ssh kkstore01 mkdir /cluster/store9/rn4 ln -s /cluster/store9/rn4 /cluster/data/rn4 # DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 1/27/06 angie) mkdir /cluster/data/rn4/M cd /cluster/data/rn4/M # go to http://www.ncbi.nih.gov/ and search Nucleotide for # "Rattus norvegicus mitochondrion complete genome". # There are more than one of those... I picked NC_001665 whose gi # is # 5835177 # Use that number in the entrez linking interface to get fasta: wget -O chrM.fa \ 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5835177&dopt=FASTA' # Edit chrM.fa: make sure the long fancy header line says it's the # Rattus norvegicus mitochondrion complete genome, and then replace the # header line with just ">chrM". 
# DOWNLOAD SEQUENCE (DONE 1/27/06 angie) ssh kkstore01 cd /cluster/data/rn4 mkdir downloads cd downloads wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/README foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Un X) echo chr$c wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/chromosomes/Rnor3.4chr${c}.fa.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/chromosomes/Rnor3.4chr${c}.fa.qual.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/chromosomes/Rnor3.4chr${c}-random.fa.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/chromosomes/Rnor3.4chr${c}-random.fa.qual.gz echo "" end foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Un X) echo chr$c wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/contigs/chr{$c}.agp end mkdir bacs cd bacs foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Un X) echo chr$c wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/contigs/chr${c}.contig_bacs.fa.gz end cd .. # Distribute into chrom dirs. foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Un X) echo chr$c mkdir ../$c zcat Rnor3.4chr${c}.fa.gz \ | sed -e 's/^>gnl[|].*[|]/>/' > ../$c/chr${c}.fa zcat Rnor3.4chr${c}-random.fa.gz \ | sed -e 's/^>gnl[|].*[|]/>/' > ../$c/chr${c}_random.fa mv chr${c}.agp ../$c/ end #mv: cannot stat `chrUn.agp': No such file or directory # No agp for Un, nor *_random.... guess we'll have to use hgFakeAgp # to at least get gaps for those. 
# checkAgpAndFa prints out way too much info -- keep the end/stderr only: foreach agp (*/chr*.agp) set fa = $agp:r.fa echo checking consistency of $agp and $fa checkAgpAndFa $agp $fa | tail -1 end faSize */chr*.fa #2834127293 bases (267832528 N's 2566294765 real 2566294765 upper 0 lower) in 45 sequences in 45 files #Total size: mean 62980606.5 sd 75162894.7 min 16300 (chrM) max 267910886 (chr1) median 6862066 #N count: mean 5951834.0 sd 6940811.4 #U count: mean 57028772.6 sd 68565220.7 #L count: mean 0.0 sd 0.0 # MAKE FAKE AGP WHERE NECESSARY (DONE 1/27/06 angie) # In the chromosomal AGP, all gaps are marked "fragment" *except* for # gaps of exactly 50000 which are "clone". There are gaps of >>50000 # marked "fragment"! However, in the chr*_random and chrUn here, # that just seems wrong... chrUn has a gap of 2602000, how could that # possibly be bridged? (Why are they wasting that kind of space?) # So just count >= 50000 as "clone no", even though that is not what # they do in their AGP for the chroms. ssh kkstore01 cd /cluster/data/rn4 foreach f (?{,?}/chr*.fa) set agp = $f:r.agp if (! -e $agp) then echo Faking missing AGP $agp hgFakeAgp -minContigGap=50 -minScaffoldGap=50000 $f stdout \ | sed -e 's/contig/fragment/; s/scaffold/clone/' \ > $agp endif end # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 1/27/06 angie) ssh kkstore01 cd /cluster/data/rn4 foreach agp (*/chr*.agp) set fa = $agp:r.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak splitFaIntoContigs $agp $fa . -nSize=5000000 end # splitFaIntoContigs makes new dirs for _randoms. Move their contents # back into the main chrom dirs and get rid of the _random dirs. 
foreach d (*_random) set base = `echo $d | sed -e 's/_random$//'` mv $d/lift/oOut.lst $base/lift/rOut.lst mv $d/lift/ordered.lft $base/lift/random.lft mv $d/lift/ordered.lst $base/lift/random.lst rmdir $d/lift mv $d/* $base rmdir $d end # checkAgpAndFa again to get a warm-fuzzy foreach agp (*/chr*.agp) set fa = $agp:r.fa echo checking consistency of $agp and $fa checkAgpAndFa $agp $fa | tail -1 end # Make a "pseudo-contig" for processing chrM too: mkdir M/chrM_1 sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa mkdir M/lift set mSize=`faSize M/chrM.fa | awk '{print $1;}'` echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst echo "chrM_1" > M/lift/ordered.lst echo "0 M/chrM_1 "$mSize" chrM "$mSize"" > M/lift/ordered.lft # MAKE JKSTUFF AND BED DIRECTORIES (DONE 1/27/06 angie) # This used to hold scripts -- better to keep them inline here so # they're in CVS. Now it should just hold lift file(s) and # temporary scripts made by copy-paste from this file. mkdir /cluster/data/rn4/jkStuff # This is where most tracks will be built: mkdir /cluster/data/rn4/bed # REPEAT MASKING (DONE 3/16/06 angie) # Originally run 1/30/06. Coverage was a bit low (43.020%, compared to # 43.463% for rn3) so I made a test case (see below) for Robert Hubley, # who fixed a bug and re-released RepeatMasker 3/14/06. So I'm # re-running... original chr*.fa.out have been moved to .bak and # lower-level .fa.out's have been removed. Added -ali at Robert H's # request. 
# Record RM version used: ls -l /cluster/bluearc/RepeatMasker #lrwxrwxrwx 1 angie protein 18 Mar 14 16:56 /cluster/bluearc/RepeatMasker -> RepeatMasker060314/ grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl #CC RELEASE 20060314; * # Run RepeatMasker on a dummy input, just to make it initialize its # rat libraries once before the cluster run: /cluster/bluearc/RepeatMasker/RepeatMasker -spec rat /dev/null #Building species libraries in: /cluster/bluearc/RepeatMasker060314/Libraries/20060314/rattus #- Split contigs into 500kb chunks, at gaps if possible: ssh kkstore01 cd /cluster/data/rn4 foreach c (?{,?}) foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end #- Make the run directory and job list: cd /cluster/data/rn4 cat << '_EOF_' > jkStuff/RMRat #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/rn4/$2 /bin/cp $2 /tmp/rn4/$2/ cd /tmp/rn4/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -s -ali -spec rat $2 popd /bin/cp /tmp/rn4/$2/$2.out ./ if (-e /tmp/rn4/$2/$2.tbl) /bin/cp /tmp/rn4/$2/$2.tbl ./ if (-e /tmp/rn4/$2/$2.cat) /bin/cp /tmp/rn4/$2/$2.cat ./ /bin/rm -fr /tmp/rn4/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/rn4/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/rn4 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMRat mkdir RMRun cp /dev/null RMRun/RMJobs foreach c (?{,?}) foreach d ($c/chr${c}{,_random}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/rn4/jkStuff/RMRat \ /cluster/data/rn4/$d $f \ '{'check out line+ /cluster/data/rn4/$d/$f.out'}' \ >> RMRun/RMJobs end end end #- Do the run ssh pk cd /cluster/data/rn4/RMRun para create RMJobs para make RMJobs; para time | mail -s 'RM cluster run finished' $USER #Completed: 6487 of 6488 jobs #Crashed: 1 jobs #CPU time in finished jobs: 27797186s 463286.43m 7721.44h 321.73d 0.881 y #IO & Wait Time: 41276s 687.94m 11.47h 
0.48d 0.001 y #Average job time: 4291s 71.52m 1.19h 0.05d #Longest finished job: 6361s 106.02m 1.77h 0.07d #Submission to last job: 133586s 2226.43m 37.11h 1.55d # The one crash was a divide-by-zero error on chr1_24_04 -- the patch # that Robert sent for open-3-1-3 was missing from open-3-1-4. So I # applied it to open-3-1-4 and manually ran ProcessRepeats on the .cat. #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kkstore01 cd /cluster/data/rn4 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (?{,?}) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif if (-e lift/random.lft && ! -z lift/random.lft) then liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \ > /dev/null endif cd .. end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/rn4 hgLoadOut rn4 ?{,?}/chr*.fa.out # Many warnings like this one: #Strange perc. field -7282.5 line 396666 of 1/chr1.fa.out #Strange perc. field -143.1 line 396666 of 1/chr1.fa.out # That's from chr1_49, not chr1_24 so it is independent of the tweak to # ProcessRepeats. # There was also this warning at the end: #note: 249 records dropped due to repStart > repEnd # run with -verbose=2 for details # Ran with -verbose=2 on kolossus... these are some of the warnings: #bad rep range [4533, 4532] line 955 of 14/chr14.fa.out #bad rep range [6936, 6935] line 15888 of 14/chr14.fa.out #bad rep range [458, 457] line 30645 of 14/chr14.fa.out #bad rep range [5798, 5797] line 85282 of 14/chr14.fa.out #TODO: send those on to Robert too. 
nice featureBits rn4 rmsk #1137387810 bases of 2571531505 (44.230%) in intersection # Good -- it's now higher than rn3: #1117483165 bases of 2571104688 (43.463%) in intersection # CREATING DATABASE (DONE 1/27/06 angie) ssh hgwdev hgsql '' -e 'create database rn4' # Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql df -h /var/lib/mysql #/dev/sdc1 1.8T 1.6T 88G 95% /var/lib/mysql # Wow... we sure have filled that sucker up. # CREATING GRP TABLE FOR TRACK GROUPING (DONE 1/27/06 angie) ssh hgwdev hgsql rn4 -e "create table grp (PRIMARY KEY(NAME)) select * from rn3.grp" # MAKE REPEATMASKER LOW-COVERAGE TEST CASE (DONE 3/10/06 angie) # Eyeball some repeat annotations in the browser, compare to lib seqs. # Run featureBits on rn4 and on previous genome build, and compare: ssh hgwdev nice featureBits rn4 rmsk # Hmmmm... seems wrong for this to decrease...! #1106283290 bases of 2571531505 (43.020%) in intersection nice featureBits rn3 rmsk #1117483165 bases of 2571104688 (43.463%) in intersection # Do some comparisons on the smallest chrom, chr20. featureBits rn3 -chrom=chr20 rmsk #20269238 bases of 49432254 (41.004%) in intersection # liftOver rn3 chr20 rmsk to rn4, find non-overlap and compare: ssh kolossus awk '$1 ~ /[0-9]/ {print $5 "\t" $6 "\t" $7;}' \ /cluster/data/rn3/20/chr20.fa.out > /tmp/rn3.chr20_rmsk.bed liftOver /tmp/rn3.chr20_rmsk.bed \ /cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain.gz \ /tmp/rn4.chr20_rmskLifted.bed /tmp/rn4.chr20.unmapped featureBits rn4 -chrom=chr20 rmsk #19994056 bases of 49409453 (40.466%) in intersection featureBits rn4 -chrom=chr20 /tmp/rn4.chr20_rmskLifted.bed #20117851 bases of 49409453 (40.717%) in intersection # Wow, even after some small liftOver losses, the rn3 rmsk run is # still bigger than the rn4 run! 
Look for large missing pieces: featureBits rn4 -chrom=chr20 /tmp/rn4.chr20_rmskLifted.bed \!rmsk \ -minSize=500 -bed=stdout #chr20 107997 108538 chr20.1 #chr20 15410534 15411405 chr20.2 #chr20 26448452 26448976 chr20.3 #chr20 34984144 34984704 chr20.4 #chr20 41862299 41862843 chr20.5 #chr20 49189694 49190470 chr20.6 #chr20 50960151 50960656 chr20.7 #chr20 53898688 53899203 chr20.8 #chr20 54856700 54857268 chr20.9 #634331 bases of 49409453 (1.284%) in intersection # For the first such region above, the rn3 location is the same # (chr20:107998-108538) and is covered by a LINE... 34.1% divergence. # Well, that should make for a handy little test case because the # sequence has not changed at all -- only the RM version. mkdir /cluster/data/rn4/RMTest cd /cluster/data/rn4/RMTest twoBitToFa ../rn4.2bit -seq=chr20 -start=100000 -end=110000 chr20_frag.fa /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec rat chr20_frag.fa mkdir open-3-1-3 mv chr20_frag.fa.* open-3-1-3/ # The version used for rn3: /cluster/bluearc/RepeatMasker030619/RepeatMasker -s -spec rat chr20_frag.fa mkdir 030619 mv chr20_frag.fa.* 030619/ # The next-latest version: /cluster/bluearc/RepeatMasker051101/RepeatMasker -s -spec rat chr20_frag.fa mkdir open-3-1-2 mv chr20_frag.fa.* open-3-1-2/ # 3-1-2 makes exactly the same .out as 3-1-3. 060319 has one more LINE... # The first open- version that we have -- .out identical to 3-1-3: /cluster/bluearc/RepeatMasker040909/RepeatMasker -s -spec rat chr20_frag.fa mkdir open-3-0-5 mv chr20_frag.fa.* open-3-0-5/ # The only other version between 030619 and 040909: /cluster/bluearc/RepeatMasker040130/RepeatMasker -s -spec rat chr20_frag.fa mkdir 040130 mv chr20_frag.fa.* 040130/ # Sent the test case to Robert Hubley -- he found a rodent-specific bug. 
# MAKE LIFTALL.LFT (DONE 1/27/06 angie) ssh kkstore01 cd /cluster/data/rn4 cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft # GOLD AND GAP TRACKS (DONE 1/27/06 angie) ssh hgwdev cd /cluster/data/rn4 hgGoldGapGl -noGl rn4 /cluster/data/rn4 . # Wow, tons of warnings like this: #unexpected coords (6960, 6960) for frag chrX_random_2 in chrom chrX_random # -- There really are little single non-N bases between large blocks # of N in the random fasta! Sheesh, what a mess. But randoms are # the dregs... # SIMPLE REPEATS (TRF) (DONE 1/30/06 angie) ssh kolossus mkdir /cluster/data/rn4/bed/simpleRepeat cd /cluster/data/rn4/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach d (/cluster/data/rn4/*/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end end csh -ef jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l #591 endsInLf trf/* #trf/chrM_1.bed zero length # When job is done do: liftUp simpleRepeat.bed /cluster/data/rn4/jkStuff/liftAll.lft warn \ trf/*.bed # Load into the database: ssh hgwdev hgLoadBed rn4 simpleRepeat \ /cluster/data/rn4/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql nice featureBits rn4 simpleRepeat #72859247 bases of 2571531505 (2.833%) in intersection # Compare to rn3: nice featureBits rn3 simpleRepeat #70073656 bases of 2571104688 (2.725%) in intersection # SET UP DB ON KOLOSSUS (DONE 1/30/06 angie) # to spare hgwdev, make an rn4 on kolossus so that we have the # option of loading there and pushing to hgwdev. It will need the # gap tables and also chromInfo (added below) so that we can run # featureBits on it. hgsql '' -e 'create database rn4' cd /cluster/data/rn4 hgGoldGapGl -noGl rn4 /cluster/data/rn4 . 
# CYTOBAND (DONE 1/30/06 angie) ssh hgwdev mkdir /cluster/data/rn4/bed/cytoBand cd /cluster/data/rn4/bed/cytoBand wget ftp://ftp.ncbi.nih.gov/genomes/R_norvegicus/mapview/ideogram.gz zcat ideogram.gz | sort -k1,1 -k6n,6n > ideogram.sorted /cluster/bin/scripts/createNcbiCytoBand ideogram.sorted # Load the bed file into both cytoBand and cytoBandIdeo: hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \ rn4 cytoBand cytoBand.bed hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBandIdeo.sql \ rn4 cytoBandIdeo cytoBand.bed # Doh, the file does not have stain information! All of the bands # are the same as in rn3, so grab stain info from there. hgsql rn4 -e '\ create table bandToStain select name,gieStain from rn3.cytoBand; \ update cytoBand,bandToStain \ set cytoBand.gieStain = bandToStain.gieStain \ where cytoBand.name = bandToStain.name; \ update cytoBandIdeo,bandToStain \ set cytoBandIdeo.gieStain = bandToStain.gieStain \ where cytoBandIdeo.name = bandToStain.name; \ drop table bandToStain;' # CREATE MICROSAT TRACK (done 2006-7-5 JK) ssh hgwdev cd /cluster/data/rn4/bed mkdir microsat cd microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed /cluster/bin/i386/hgLoadBed rn4 microsat microsat.bed # PROCESS SIMPLE REPEATS INTO MASK (DONE 1/30/06 angie) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kkstore01 cd /cluster/data/rn4/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd /cluster/data/rn4 mkdir bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn 
`cat $c/lift/oTrf.lst` endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # Here's the coverage for the filtered TRF: ssh kolossus cat /cluster/data/rn4/bed/simpleRepeat/trfMaskChrom/*.bed \ > /tmp/filtTrf.bed featureBits rn4 /tmp/filtTrf.bed #50551402 bases of 2571531505 (1.966%) in intersection # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 3/17/06 angie) # Originally done 1/30 -- redoing after re-run of RepeatMasker. ssh kkstore01 cd /cluster/data/rn4 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (?{,?}/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end foreach c (?{,?}) echo "repeat- and trf-masking contigs of chr$c, chr${c}_random" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end # Make 2bit (for hgBlat, browser): faToTwoBit ?{,?}/chr*.fa rn4.2bit # Make nib (for blastz w/linSpecRep, OK for genbank too): mkdir nib foreach f (?{,?}/chr*.fa) echo $f:t:r faToNib -softMask $f nib/$f:t:r.nib end # PUT NIBS ON /SCRATCH (DONE 3/17/06 angie) # Originally done 1/30 -- redoing after re-run of RepeatMasker. 
ssh kkstore01 mkdir /cluster/bluearc/scratch/hg/rn4 rsync -av /cluster/data/rn4/nib/* /cluster/bluearc/scratch/hg/rn4/nib/ cp -p /cluster/data/rn4/rn4.2bit /cluster/bluearc/scratch/hg/rn4/ # Ask cluster-admin to distribute to /scratch on big & small cluster # MAKE CHROMINFO TABLE WITH 2BIT (DONE 1/30/06 angie) ssh kkstore01 cd /cluster/data/rn4 mkdir bed/chromInfo twoBitInfo rn4.2bit stdout \ | awk '{print $1 "\t" $2 "\t/gbdb/rn4/rn4.2bit";}' \ > bed/chromInfo/chromInfo.tab # Link to 2bit from /gbdb/rn4/: ssh hgwdev mkdir /gbdb/rn4 ln -s /cluster/data/rn4/rn4.2bit /gbdb/rn4/ # Load /gbdb/rn4/rn4.2bit paths into database and save size info. hgsql rn4 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql rn4 -e 'load data local infile \ "/cluster/data/rn4/bed/chromInfo/chromInfo.tab" \ into table chromInfo;' echo "select chrom,size from chromInfo" | hgsql -N rn4 > chrom.sizes # take a look at chrom.sizes size wc chrom.sizes # 45 90 789 chrom.sizes # Load chromInfo on kolossus too (required for featureBits): ssh kolossus hgsql rn4 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql rn4 -e 'load data local infile \ "/cluster/data/rn4/bed/chromInfo/chromInfo.tab" \ into table chromInfo;' # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 1/30/06 angie) # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/kent/src/hg/makeDb/trackDb cvsup # Add trackDb directories and a description.html mkdir rat/rn4 cvs add rat/rn4 cvs add rat/rn4/description.html cvs ci rat/rn4 # Edit that makefile to add rn4 in all the right places and do make update DBS=rn4 mkdir /gbdb/rn4/html cvs ci makefile # Go public on genome-test. 
In a clean tree (no mods, up-to-date): cvs up makefile make alpha # Note: hgcentral*.genome values must correspond # with defaultDb.genome values hgsql -h genome-testdb hgcentraltest \ -e 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("rn4", "Nov. 2004", "/gbdb/rn4", "Rat", \ "chr2:22845558-23004134", 1, 30, "Rat", \ "Rattus norvegicus", "/gbdb/rn4/html/description.html", \ 0, 0, "Baylor HGSC v. 3.4");' # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 1/30/06 angie) # chrom{Fa*,Out}, md5sum.txt REDONE, 3/17/06, after RepeatMasker re-run. # upstream*.fa.gz added 6/23/06 ssh kolossus cd /cluster/data/rn4 #- Build the .tar.gz files -- no genbank for now. cat << '_EOF_' > jkStuff/zipAll.csh rm -rf bigZips mkdir bigZips tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked cd bed/simpleRepeat tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed cd ../.. '_EOF_' # << this line makes emacs coloring happy csh -ef ./jkStuff/zipAll.csh >& zipAll.log & tail -f zipAll.log #- Look at zipAll.log to make sure all file lists look reasonable. cd bigZips md5sum *.gz > md5sum.txt # Make a README.txt cd .. mkdir chromGz foreach f ( ?{,?}/chr*.fa ) echo $f:t:r gzip -c $f > chromGz/$f:t.gz end cd chromGz md5sum *.gz > md5sum.txt # Make a README.txt #- Link the .gz and .txt files to hgwdev:/usr/local/apache/... ssh hgwdev set gp = /usr/local/apache/htdocs/goldenPath/rn4 mkdir -p $gp/bigZips ln -s /cluster/data/rn4/bigZips/{chrom*.tar.gz,*.txt} $gp/bigZips mkdir -p $gp/chromosomes ln -s /cluster/data/rn4/chromGz/{chr*.gz,*.txt} $gp/chromosomes # Take a look at bigZips/* and chromosomes/* mkdir $gp/database # Create README.txt files in database/ to explain the files. # 6/23/06 -- Add upstream*.fa.gz. 
cd /cluster/data/rn4/bigZips foreach size (1000 2000 5000) echo $size featureBits rn4 refGene:upstream:$size -fa=stdout \ | nice gzip -c > upstream$size.fa.gz end nice md5sum up*.gz >> md5sum.txt ln -s /cluster/data/rn4/bigZips/up*.gz $gp/bigZips/ # MAKE 11.OOC (DONE 2/10/06 angie) ssh kolossus cd /cluster/data/rn4 mkdir /cluster/bluearc/rn4 blat rn4.2bit /dev/null /dev/null \ -tileSize=11 -makeOoc=/cluster/bluearc/rn4/11.ooc -repMatch=1024 #Wrote 25608 overused 11-mers to /cluster/bluearc/rn4/11.ooc # and for use in other contexts: cp -p /cluster/bluearc/rn4/11.ooc /cluster/data/rn4 # GENBANK AUTO UPDATE (DONE 2/10/06 angie) # align with revised genbank process. drop xeno ESTs. cd ~/kent/src/makeDb/genbank cvsup # edit etc/genbank.conf to add rn4 # rn4 rn4.serverGenome = /cluster/data/rn4/rn4.2bit rn4.clusterGenome = /iscratch/i/rn4/rn4.2bit rn4.ooc = /cluster/bluearc/rn4/11.ooc rn4.align.unplacedChroms = chrUn,chr*_random rn4.lift = /cluster/data/rn4/jkStuff/liftAll.lft rn4.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} rn4.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} rn4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} rn4.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} rn4.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} rn4.downloadDir = rn4 rn4.refseq.mrna.xeno.load = yes rn4.refseq.mrna.xeno.loadDesc = yes rn4.mgcTables.default = full rn4.mgcTables.mgc = all cvs ci etc/genbank.conf # update /cluster/data/genbank/ make etc-update ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial rn4 & # load database when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad rn4 & # enable daily alignment and update of hgwdev cd ~/kent/src/makeDb/genbank cvsup # add rn4 to: etc/align.dbs etc/hgwdev.dbs cvs commit make etc-update # PRODUCING GENSCAN PREDICTIONS (DONE 1/31/06 
angie) ssh hgwdev mkdir /cluster/data/rn4/bed/genscan cd /cluster/data/rn4/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/rn4/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) cp /dev/null genome.list foreach f ( `ls -1S /cluster/data/rn4/*/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list #591 genome.list # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para make jobList para time #Completed: 589 of 591 jobs #Crashed: 2 jobs #CPU time in finished jobs: 110813s 1846.88m 30.78h 1.28d 0.004 y #IO & Wait Time: 8627s 143.79m 2.40h 0.10d 0.000 y #Average job time: 203s 3.38m 0.06h 0.00d #Longest finished job: 4230s 70.50m 1.18h 0.05d #Submission to last job: 11260s 187.67m 3.13h 0.13d # If there are crashes, diagnose with "para problems" / "para crashed". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". 
ssh kkr5u00 cd /cluster/data/rn4/bed/genscan /cluster/bin/x86_64/gsBig /cluster/data/rn4/10/chr10_3/chr10_3.fa.masked gtf/chr10_3.fa.gtf -trans=pep/chr10_3.fa.pep -subopt=subopt/chr10_3.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 /cluster/bin/x86_64/gsBig /cluster/data/rn4/9/chr9_21/chr9_21.fa.masked gtf/chr9_21.fa.gtf -trans=pep/chr9_21.fa.pep -subopt=subopt/chr9_21.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 # Convert these to chromosome level files as so: ssh kkstore01 cd /cluster/data/rn4/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/rn4/bed/genscan ldHgGene rn4 genscan genscan.gtf hgPepPred rn4 generic genscanPep genscan.pep hgLoadBed rn4 genscanSubopt genscanSubopt.bed # MAKE GCPERCENT (DONE 1/30/06 angie) ssh kolossus mkdir /cluster/data/rn4/bed/gc5Base cd /cluster/data/rn4/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 rn4 \ /cluster/data/rn4 \ | wigEncode stdin gc5Base.wig gc5Base.wib ssh hgwdev mkdir /gbdb/rn4/wib cd /cluster/data/rn4/bed/gc5Base ln -s `pwd`/gc5Base.wib /gbdb/rn4/wib hgLoadWiggle -pathPrefix=/gbdb/rn4/wib rn4 gc5Base gc5Base.wig # ENSEMBL GENES (DONE 1/30/06 angie - UPDATED 6/13/06) mkdir /cluster/data/rn4/bed/ensembl cd /cluster/data/rn4/bed/ensembl # Get the ensembl gene data from # http://www.ensembl.org/Rattus_norvegicus/martview # Follow this sequence through the pages: # Page 1) Make sure that the Rattus_norvegicus choice is selected. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Structures" box. # Page 4) Choose GTF as the output. choose gzip compression. hit export. 
# Save as ensembl.gff.gz # Add "chr" to front of each line in the gene data gtf file to make # it compatible with our software. gunzip -c ensembl.gff.gz \ | perl -wpe 's/^([0-9]+|X|Y|M|Un(_random)?)/chr$1/ \ || die "Line $. doesnt start with rat chrom:\n$_"; \ s/^chrMT/chrM/;' \ > ensGene.gtf ssh hgwdev ldHgGene -gtf -genePredExt rn4 ensGene \ /cluster/data/rn4/bed/ensembl/ensGene.gtf # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" box. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format. Result name ensGtp. # Save file as ensGtp.txt.gz gunzip ensGtp.txt.gz hgLoadSqlTab rn4 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt # Load Ensembl peptides: # Get them from ensembl as above in the gene section except for # Page 3) Choose the "Sequences" box. # Page 4) Transcripts/Proteins. Peptide. Format = FASTA. 
# Save file as ensemblPep.fa.gz gunzip -c ensemblPep.fa.gz \ | perl -wpe 's/^>.*\|(ENSRNOT\d+\.\d+).*/>$1/' \ > ensPep.fa hgPepPred rn4 generic ensPep ensPep.fa # CPGISLANDS (WUSTL) (DONE 1/30/06 angie) ssh hgwdev mkdir /cluster/data/rn4/bed/cpgIsland cd /cluster/data/rn4/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make mv cpglh.exe /cluster/data/rn4/bed/cpgIsland/ ssh kolossus cd /cluster/data/rn4/bed/cpgIsland foreach f (../../*/chr*.fa.masked) set fout=$f:t:r:r.cpg echo running cpglh on $f to $fout nice ./cpglh.exe $f > $fout end # Transform cpglh output to bed + cat << '_EOF_' > filter.awk /* Input columns: */ /* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */ /* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */ /* Output columns: */ /* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */ /* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94 */ { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk chr*.cpg > cpgIsland.bed # load into database: ssh hgwdev cd /cluster/data/rn4/bed/cpgIsland # Reloaded without -noBin (bin added to .sql) 4/7/06. hgLoadBed rn4 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # CPGISLANDS (ANDY LAW) (DONE 1/30/06 angie) # See notes in makeGalGal2.doc ssh kolossus mkdir /cluster/data/rn4/bed/cpgIslandGgfAndy cd /cluster/data/rn4/bed/cpgIslandGgfAndy # Build the preProcGgfAndy program in # kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE # Use masked sequence since this is a mammal... 
cp /dev/null cpgIslandGgfAndyMasked.bed foreach f (../../*/chr*.fa.masked) set chr = $f:t:r:r echo preproc and run on masked $chr ~/bin/x86_64/preProcGgfAndy $f \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyMasked.bed end # load into database: ssh hgwdev cd /cluster/data/rn4/bed/cpgIslandGgfAndy # Reloaded without -noBin (bin added to .sql) 4/7/06. sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql hgLoadBed rn4 cpgIslandGgfAndyMasked -tab \ -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed #Loaded 91097 elements of size 10 featureBits rn4 cpgIslandExt #9629809 bases of 2571531505 (0.374%) in intersection featureBits rn4 cpgIslandGgfAndyMasked #43899646 bases of 2571531505 (1.707%) in intersection wc -l ../cpgIsland/cpgIsland.bed *bed # 15809 ../cpgIsland/cpgIsland.bed # 91097 cpgIslandGgfAndyMasked.bed # LIFTOVER RGD (DONE 2/9/06 angie) # ftp://rgd.mcw.edu/pub/RGD_genome_annotations/ still has 3.1 (rn3) not # 3.4 (rn4). Jim said these are interesting enough that we should lift # over at least the QTLs. 
ssh kolossus mkdir /cluster/data/rn4/bed/rgdLiftover cd /cluster/data/rn4/bed/rgdLiftover wget ftp://rgd.mcw.edu/pub/RGD_genome_annotations/V3.1/GFF_files/rgd_rat_qtl_12052005.gff awk '{print $1"\t"$4-1"\t"$5"\t"$10}' rgd_rat_qtl_12052005.gff \ | sed -e 's/Chr/chr/g; s/"//g; s/RGD://g; s/;//g' \ > rgdQtl_rn3.bed awk '{printf "%s\t%s\t", $12, $10; \ for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' \ rgd_rat_qtl_12052005.gff \ | sed -e 's/"//g; s/RGD://g; s/;\t/\t/g' \ > rgdQtlLink.tab # liftOver rgdQtl_rn3.bed \ # /cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain.gz \ # rgdQtl.bed rgdQtl.unmapped # liftOver of rgdQtl_rn3.bed did not go well -- out of 903 qtls, # only 95 were successfully mapped. 772 got "Partially split in new." # Since these are enormous, presumably fuzzy-edged regions, just lift # start and end. If the start and end of each feature seems to be # liftOver'd coherently, "chain" those back up into big regions in rn4. perl -we 'while (<>) { \ chomp; ($chr, $start, $end, $name) = split; \ print "$chr\t$start\t" . ($start+100) . "\tstart_$name\n"; \ print "$chr\t" . ($end-100) . "\t$end\tend_$name\n"; \ }' \ rgdQtl_rn3.bed > rgdQtl_rn3_endpoints.bed liftOver rgdQtl_rn3_endpoints.bed \ /cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain.gz \ rgdQtl_endpoints.bed rgdQtl_endpoints.unmapped # Now 1706 out of 1806 survive... but 100 are deleted in new?! # I manually checked 10 of them and they all are either in a gap # or off the end of a chrom. Makes me wonder if these really are # rn3 coords... oh well, I guess we can load them up and ask # RGD about them in the meantime. I submitted a question using their # online form -- my message ID is 439 and I can email rgd@rgd.mcw.edu # with that ID if I don't hear from them in 3 biz days. 
perl -we 'while (<>) { \ chomp; ($chr, $start, $end, $name) = split; \ if ($name =~ /^start_(\S+)/) { \ $starts{$1} = [$chr, $start]; \ } elsif ($name =~ /^end_(\S+)/) { \ $ends{$1} = [$chr, $end]; \ } else { die "parse error line $.: name $name\n"; } \ } \ foreach my $name (keys %starts) { \ if (defined $ends{$name}) { \ my ($sChr, $start) = @{$starts{$name}}; \ my ($eChr, $end) = @{$ends{$name}}; \ if (($sChr eq $eChr) && $end > $start) { \ print "$sChr\t$start\t$end\t$name\n"; \ } else { \ print STDERR "reject: [$sChr, $start] / [$eChr, $end] / $name\n"; \ } \ } \ }' \ rgdQtl_endpoints.bed \ | sort -k 1,1 -k 2n,2n > rgdQtl.bed wc -l rgdQtl.bed #806 rgdQtl.bed ssh hgwdev cd /cluster/data/rn4/bed/rgdLiftover hgLoadBed rn4 rgdQtl rgdQtl.bed hgsql rn4 < ~/kent/src/hg/lib/rgdQtlLink.sql hgsql rn4 -e \ 'load data local infile "rgdQtlLink.tab" into table rn4.rgdQtlLink;' # MAKE LINEAGE-SPECIFIC REPEATS VS. HUMAN, MOUSE (DONE 2/8/06 angie) ssh kolossus mkdir /cluster/data/rn4/rmsk cd /cluster/data/rn4/rmsk ln -s ../?{,?}/chr*.fa.out . # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. # Human in extra column 1, Mouse in extra column 2 foreach outfl ( *.out ) echo "$outfl" nice /cluster/bluearc/RepeatMasker/DateRepeats \ ${outfl} -query rat -comp human -comp mouse end # Now extract human (extra column 1), mouse (extra column). cd .. mkdir linSpecRep.notInHuman mkdir linSpecRep.notInMouse foreach f (rmsk/*.out_homo-sapiens_mus-musculus) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 1 $f > \ linSpecRep.notInHuman/$base.out.spec /cluster/bin/scripts/extractLinSpecReps 2 $f > \ linSpecRep.notInMouse/$base.out.spec end wc -l rmsk/*.out # 4417757 total wc -l linSpecRep.notInHuman/* # 2676056 total wc -l linSpecRep.notInMouse/* # 471417 total # Clean up. 
rm -r rmsk # Distribute linSpecRep.* for cluster run ssh kkstore01 mkdir /san/sanvol1/scratch/rn4 rsync -av /cluster/data/rn4/linSpecRep.notInHuman/* \ /san/sanvol1/scratch/rn4/linSpecRep.notInHuman/ rsync -av /cluster/data/rn4/linSpecRep.notInMouse/* \ /san/sanvol1/scratch/rn4/linSpecRep.notInMouse/ # SWAP/CHAIN/NET HG17 (DONE 2/10/06 angie) mkdir /cluster/data/rn4/bed/blastz.hg17.swap cd /cluster/data/rn4/bed/blastz.hg17.swap doBlastzChainNet.pl -swap /cluster/data/hg17/bed/blastz.rn4/DEF \ -workhorse=kkr5u00 >& do.log & tail -f do.log ln -s blastz.hg17.swap /cluster/data/rn4/bed/blastz.hg17 # MAKE LINEAGE-SPECIFIC REPEATS FOR NON-MAMMALS (DONE 2/13/06 angie) # In an email 2/13/04 to Angie, Arian said we could treat all # human repeats as lineage-specific for human-chicken blastz. # Extrapolate to any mammal vs. anything at least as distant as chicken. ssh kkr1u00 mkdir /iscratch/i/rn4/linSpecRep.notInNonMammal foreach f (/cluster/data/rn4/*/chr*.fa.out) cp -p $f /iscratch/i/rn4/linSpecRep.notInNonMammal/$f:t:r:r.out.spec end iSync # BLASTZ CHICKEN (GALGAL2) (DONE 2/14/06 angie) ssh kk mkdir /cluster/data/rn4/bed/blastz.galGal2.2006-02-13 cd /cluster/data/rn4/bed/blastz.galGal2.2006-02-13 # Set L=10000 (higher threshold on blastz's outer loop) and abridge # repeats. cat << '_EOF_' > DEF #rat vs. 
chicken # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Rat SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_SMSK=/iscratch/i/rn4/linSpecRep.notInNonMammal SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/rn4/chrom.sizes # QUERY: Chicken SEQ2_DIR=/iscratch/i/galGal2/nib SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LEN=/cluster/data/galGal2/chrom.sizes BASE=/cluster/data/rn4/bed/blastz.galGal2.2006-02-13 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot=/cluster/bluearc/blastzRn4GalGal2Out \ > do.log & tail -f do.log cd /cluster/data/rn4/bed ln -s /cluster/data/rn4/bed/blastz.galGal2.2006-02-13 blastz.galGal2 ########################################################################## # BACEND PAIRS TRACK # DOWNLOAD CLONEENDS (BACENDS) FROM NCBI (DONE 2/23/06 angie) ssh kkstore01 cd /cluster/data/rn4 # Plenty of room at the moment: df -h . # 1.3T 1009G 234G 82% /cluster/store9 mkdir -p bed/cloneend/ncbi cd bed/cloneend/ncbi wget --timestamping \ ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/rattus_norvegicus/\* cd .. 
# Extract unversioned accessions from headers and combine into one # uncompressed file which we will link to from /gbdb/: zcat ncbi/*ends*.mfa.gz \ | perl -wpe 'if (/^>/) { \ s/^>.*\|(gb|dbj)\|(\w+)\.\d+\|.*/>$2/ || die "parse line $.:$_\n"; }' \ > cloneEnds.fa # Make sure the sequences are intact after the header-munging: faSize ncbi/*.mfa.gz #188448559 bases (4451 N's 188444108 real 102020307 upper 86423801 lower) in 307557 sequences in 18 files #Total size: mean 612.7 sd 198.9 min 83 (gi|30387889|gb|CC181697.1|) max 1158 (gi|24002262|gb|BZ277934.1|) median 634 faSize cloneEnds.fa #188448559 bases (4451 N's 188444108 real 102020307 upper 86423801 lower) in 307557 sequences in 1 files #Total size: mean 612.7 sd 198.9 min 83 (CC181697) max 1158 (BZ277934) median 634 # Extract pairings from info files: zcat ncbi/*info*.txt.gz \ | /cluster/bin/scripts/convertCloneEndInfo stdin #134351 pairs and 38410 singles # split for cluster run mkdir /cluster/bluearc/rn4/cloneEnds faSplit sequence cloneEnds.fa 100 /cluster/bluearc/rn4/cloneEnds/cloneEnds # Check to ensure no breakage: faSize /cluster/bluearc/rn4/cloneEnds/*.fa #188448559 bases (4451 N's 188444108 real 102020307 upper 86423801 lower) in 307557 sequences in 98 files #Total size: mean 612.7 sd 198.9 min 83 (CC181697) max 1158 (BZ277934) median 634 # same numbers as before # load sequences ssh hgwdev mkdir /gbdb/rn4/cloneend ln -s /cluster/data/rn4/bed/cloneend/cloneEnds.fa /gbdb/rn4/cloneend/ cd /tmp hgLoadSeq rn4 /gbdb/rn4/cloneend/cloneEnds.fa #307557 sequences # BACEND SEQUENCE ALIGNMENTS (DONE 2/23/06 angie) ssh kkstore01 # Make unmasked fasta. cd /cluster/data/rn4 mkdir /cluster/bluearc/rn4/noMask foreach f (?{,?}/chr*.fa) echo $f:t:r perl -wpe 'tr/a-z/A-Z/ if (! 
/^>/);' $f \ > /cluster/bluearc/rn4/noMask/$f:t end # kluster run ssh pk mkdir /cluster/data/rn4/bed/bacends cd /cluster/data/rn4/bed/bacends mkdir out # allow blat to run politely in /tmp while it writes output, then # copy results to results file: cat << '_EOF_' > runBlat.csh #!/bin/csh -ef set root1 = $1 set root2 = $2 set result = $3 rm -fr /scratch/tmp/${root1}_${root2} mkdir /scratch/tmp/${root1}_${root2} pushd /scratch/tmp/${root1}_${root2} /cluster/bin/x86_64/blat /cluster/bluearc/rn4/noMask/${root1}.fa \ /cluster/bluearc/rn4/cloneEnds/${root2}.fa \ -ooc=/cluster/bluearc/rn4/11.ooc ${root1}.${root2}.psl popd mkdir -p out/${root2} rm -f ${result} mv /scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl ${result} rm -fr /scratch/tmp/${root1}_${root2} '_EOF_' # << happy emacs chmod +x runBlat.csh cat << '_EOF_' > template #LOOP ./runBlat.csh $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # << emacs happy ls -1S /cluster/bluearc/rn4/cloneEnds/cloneEnds???.fa > bacEnds.lst ls -1S /cluster/bluearc/rn4/noMask/chr*.fa > contig.lst gensub2 contig.lst bacEnds.lst template jobList para make jobList #Completed: 4410 of 4410 jobs #CPU time in finished jobs: 450309s 7505.15m 125.09h 5.21d 0.014 y #IO & Wait Time: 812287s 13538.12m 225.64h 9.40d 0.026 y #Average job time: 286s 4.77m 0.08h 0.00d #Longest finished job: 1834s 30.57m 0.51h 0.02d #Submission to last job: 6982s 116.37m 1.94h 0.08d ssh kkstore01 cd /cluster/data/rn4/bed/bacends mkdir temp time pslSort dirs raw.psl temp out/* #538.065u 55.510s 14:46.80 66.9% 0+0k 0+0io 3pf+0w time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons \ raw.psl bacEnds.psl /dev/null #203.262u 8.877s 3:57.34 89.3% 0+0k 0+0io 2pf+0w # BACEND PAIRS TRACK (DONE 2/23/06 angie) ssh kolossus cd /cluster/data/rn4/bed/bacends time /cluster/bin/x86_64/pslPairs -tInsert=10000 \ -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch 
-verbose bacEnds.psl \ ../cloneend/cloneEndPairs.txt all_bacends bacEnds #19.229u 4.720s 0:26.34 90.8% 0+0k 0+0io 0pf+0w # Filter by score and sort by {chrom,chromStart}: awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed cat bacEnds.{slop,short,long,mismatch,orphan} \ | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed # CHECK bacEndPairs.bed ID's to make sure they have no blanks in them awk '{print $5}' bacEndPairs.bed | sort -u /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \ bacEndPairsBad.bed \ | sorttbl tname tstart | headchg -del \ > bacEnds.load.psl wc -l bacEnds.* # 5590477 bacEnds.psl # 4462768 bacEnds.load.psl # 109315 bacEnds.pairs # 705 bacEnds.long # 98 bacEnds.lst # 3925 bacEnds.mismatch # 88292 bacEnds.orphan # 558 bacEnds.short # 687 bacEnds.slop # load into database ssh hgwdev cd /cluster/data/rn4/bed/bacends # Reloaded with updated .sql (no chromEnd index) 4/7/06. hgLoadBed -strict -notItemRgb rn4 bacEndPairs bacEndPairs.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql #Loaded 109139 elements of size 11 # note - the "Bad" track isn't pushed to RR, just used for assembly QA. 
hgLoadBed -strict -notItemRgb rn4 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql #Loaded 38981 elements of size 11 time hgLoadPsl rn4 -table=all_bacends bacEnds.load.psl #load of all_bacends did not go as planned: 4462768 record(s), 0 row(s) skipped, 1 warning(s) loading psl.tab #165.730u 24.690s 10:57.43 28.9% 0+0k 0+0io 258pf+0w # To diagnose: echo select \* from all_bacends | hgsql -N rn4 > psl.tab.loaded diff psl.tab* #1491015c1491015 #< 120 908 82 0 0 3 -124 6 278579 - BZ232973 866 0 866 chr17 97296363 49307517 49587086 8 21,5,71,86,327,250,116,114, 0,21,27,98,184,382,632,752, 49307517,49307539,49307544,49335694,49336333,49470455,49586851,49586972, #--- #> 120 908 82 0 0 3 0 6 278579 - BZ232973 866 0 866 chr17 97296363 49307517 49587086 8 21,5,71,86,327,250,116,114, 0,21,27,98,184,382,632,752, 49307517,49307539,49307544,49335694,49336333,49470455,49586851,49586972, # Q gap bases was -124 in... clipped to 0 in the db. # Identify and save the out/*.psl file that contains the -124: grep '^>BZ232973' /cluster/bluearc/rn4/cloneEnds/*.fa #/cluster/bluearc/rn4/cloneEnds/cloneEnds079.fa:>BZ232973 grep BZ232973 out/cloneEnds079/*.psl | grep .-124 #out/cloneEnds079/chr17.cloneEnds079.psl:908 82 0 0 3 -124 6 278579 - BZ232973 866 0 866 chr17 97296363 49307517 49587086 8 21,5,71,86,327,250,116,114, 0,21,27,98,184,382,632,752, 49307517,49307539,49307544,49335694,49336333,49470455,49586851,49586972, cp -p out/cloneEnds079/chr17.cloneEnds079.psl . 
featureBits rn4 all_bacends #241203668 bases of 2571531505 (9.380%) in intersection featureBits rn3 all_bacends #232313031 bases of 2571104688 (9.036%) in intersection featureBits rn4 bacEndPairs #2687958438 bases of 2571531505 (104.528%) in intersection featureBits rn3 bacEndPairs #2717444119 bases of 2571104688 (105.692%) in intersection featureBits rn4 bacEndPairsBad #664080718 bases of 2571531505 (25.824%) in intersection featureBits rn3 bacEndPairsBad #table bacEndPairsBad not found for any chroms # Clean up - this recovers >10G! rm psl.tab psl.tab.loaded raw.psl bed.tab rm -r out rmdir temp rm -r /cluster/bluearc/rn4/noMask/ /cluster/bluearc/rn4/cloneEnds/ # end BACEND PAIRS TRACK ########################################################################## # SWAP/CHAIN/NET MM7 (DONE 2/24/06 angie) mkdir /cluster/data/rn4/bed/blastz.mm7.swap cd /cluster/data/rn4/bed/blastz.mm7.swap doBlastzChainNet.pl -swap /cluster/data/mm7/bed/blastz.rn4/DEF \ >& do.log & tail -f do.log ln -s blastz.mm7.swap /cluster/data/rn4/bed/blastz.mm7 # SWAP/CHAIN/NET HG18 (DONE 2/24/06 angie) mkdir /cluster/data/rn4/bed/blastz.hg18.swap cd /cluster/data/rn4/bed/blastz.hg18.swap doBlastzChainNet.pl -swap /cluster/data/hg18/bed/blastz.rn4/DEF \ >& do.log & tail -f do.log ln -s blastz.hg18.swap /cluster/data/rn4/bed/blastz.hg18 # MAKE LINEAGE-SPECIFIC REPEATS VS. DOG, COW, RABBIT (DONE 2/24/06 angie) ssh kolossus mkdir /cluster/data/rn4/rmsk cd /cluster/data/rn4/rmsk ln -s ../?{,?}/chr*.fa.out . # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. # Dog in extra column 1, cow in 2, rabbit in 3 foreach outfl ( *.out ) echo "$outfl" nice /cluster/bluearc/RepeatMasker/DateRepeats \ ${outfl} -query rat -comp dog -comp cow -comp rabbit end # Now extract dog (extra column 1), cow (column 2), rabbit (column 3). cd .. 
mkdir linSpecRep.notIn{Dog,Cow,Rabbit} foreach f (rmsk/*.out_*) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractRepeats 1 $f > \ linSpecRep.notInDog/$base.out.spec /cluster/bin/scripts/extractRepeats 2 $f > \ linSpecRep.notInCow/$base.out.spec /cluster/bin/scripts/extractRepeats 3 $f > \ linSpecRep.notInRabbit/$base.out.spec end wc -l rmsk/*.out # 4417757 total wc -l linSpecRep.notInDog/* # 2676056 total wc -l linSpecRep.notInCow/* # 2676056 total wc -l linSpecRep.notInRabbit/* # 2676056 total # These all look identical to each other (and to human except for header): foreach f (linSpecRep.notInDog/*) cmp $f linSpecRep.notInCow/$f:t cmp $f linSpecRep.notInRabbit/$f:t end # Consolidate to save space (probably could have done this for human too): mv linSpecRep.notInDog linSpecRep.notInNonRodentMammal # Clean up. rm -r rmsk linSpecRep.notInCow linSpecRep.notInRabbit # Distribute linSpecRep.* for cluster run ssh kkstore01 rsync -av /cluster/data/rn4/linSpecRep.notInNonRodentMammal/* \ /cluster/bluearc/rn4/linSpecRep.notInNonRodentMammal/ # BLASTZ/CHAIN/NET CANFAM2 (DONE 2/24/06 angie) ssh kkstore01 mkdir /cluster/data/rn4/bed/blastz.canFam2.2006-02-24 cd /cluster/data/rn4/bed/blastz.canFam2.2006-02-24 cat << '_EOF_' > DEF # rat vs. 
dog BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Rat SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_SMSK=/cluster/bluearc/rn4/linSpecRep.notInNonRodentMammal SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_SMSK=/cluster/bluearc/canFam2/linSpecRep.notInRat SEQ2_LEN=/cluster/data/canFam2/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastz.canFam2.2006-02-24 '_EOF_' # << for emacs doBlastzChainNet.pl DEF -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/blastzRn4CanFam2Out >& do.log & tail -f do.log ln -s blastz.canFam2.2006-02-24 /cluster/data/rn4/bed/blastz.canFam2 ########################################################################## # STS MARKERS # NOTE: Instead of using UniSTS_rat.sts as usual, I made a local version # which uses the name given in UniSTS.sts if none is given in UniSTS_rat.sts # and "UniSTS:$id" if no name is given in UniSTS.sts. Some inconsistencies # between UniSTS.sts and UniSTS_rat.sts reported 2/21/06; haven't heard # back from them yet so I'll just forge ahead with latest UniSTS files. # MAKE STSINFORAT (DONE 3/6/06 angie) # This method was developed by Yontao. ytlu/script is under CVS control # so I'm using ~/ytlu/script/perl/script/match but have copied several # other scripts from /cluster/store4/ratMarker/code into # kent/src/hg/stsMarkers/ to get them in CVS too. 
ssh kkstore01 mkdir /cluster/data/rn4/bed/stsMarkers mkdir /cluster/data/rn4/bed/stsMarkers/info cd /cluster/data/rn4/bed/stsMarkers/info wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Rattus_norvegicus/10116.FHH_x_ACI.7.txt wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Rattus_norvegicus/10116.RH_map.3.4.txt wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Rattus_norvegicus/10116.SHRSP_x_BN.7.txt wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_rat.sts wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases wget --timestamping ftp://rgd.mcw.edu/pub/rhmap/3.4/RHv3_4_MapData.txt wget --timestamping ftp://rgd.mcw.edu/pub/rhmap/3.4/RAW_MARKER_SEQINFO.txt # UniSTS_rat.sts is missing a lot of names that can be found in UniSTS.sts # so load that up and make a fixed-up UniSTS_rat.rat_extracted.sts: wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts awk -F"\t" '$5 != "-" {print $1 "\t" $5 "\t" $8;}' UniSTS.sts \ > UniSTS.names cat > extractRat.pl <<'_EOF_' #!/usr/bin/perl -w use strict; my %id2name; open(NAMES, "<UniSTS.names") || die "Could not open UniSTS.names: $!\n"; while (<NAMES>) { chomp; my @words = split("\t"); $id2name{$words[0]} = $words[1]; } print STDERR "Extracting rat and adding in names...\n"; while (<>) { chomp; my @words = split("\t"); next if ($words[7] ne "Rattus norvegicus"); if ($words[4] eq '-') { $words[4] = $id2name{$words[0]}; $words[4] = "UniSTS:$words[0]" if (! defined $words[4]); } elsif ((defined $id2name{$words[0]}) && ($words[4] ne $id2name{$words[0]})) { print STDERR "Disagreement: name of $words[0] given as " . $id2name{$words[0]} . " and as $words[4]\n"; } print join("\t", @words) . "\n"; } '_EOF_' # << emacs chmod a+x extractRat.pl ./extractRat.pl UniSTS_rat.sts > UniSTS_rat.rat_extracted.sts # Many warnings about different name given in UniSTS_rat.sts vs. # UniSTS.sts -- reported to NCBI. 
# Make UniSTS_rat.alias by joining the first columns of # UniSTS_rat.rat_extracted.sts and UniSTS.aliases so we get just the # aliases of items in UniSTS_rat.rat_extracted.sts: /cluster/home/ytlu/ytlu/script/perl/script/match \ UniSTS_rat.rat_extracted.sts 1 UniSTS.aliases 1 \ > UniSTS_rat.alias # Extract these columns from RHv3_4_MapData.txt: # RGD_Sym TEMPL_SEQ FORWARD_PRIMER REVERSE_PRIMER EXP_SIZE awk '{print $2,$1,$17,$8,$9,$10}' RHv3_4_MapData.txt | sed s/" "/"\t"/g \ > inMapData.txt # Find entries not in RAW_MARKER_SEQINFO.txt but in inMapData.txt, /cluster/home/ytlu/ytlu/script/perl/script/match -M \ RAW_MARKER_SEQINFO.txt 1 inMapData.txt 1 > inMapDataOnly.txt cat inMapDataOnly.txt RAW_MARKER_SEQINFO.txt > marker_seqinfo.txt # Combine sources into a "draft" stsInfoRat.tab file: ~/kent/src/hg/stsMarkers/createStsInfoRat \ UniSTS_rat.rat_extracted.sts marker_seqinfo.txt UniSTS_rat.alias \ 10116.RH_map.3.4.txt 10116.FHH_x_ACI.7.txt 10116.SHRSP_x_BN.7.txt \ > stsInfoRat-draft # clean some redundant and conflicting entries --> final stsInfoRat.tab: ~/kent/src/hg/stsMarkers/cleanInfo.pl -rat stsInfoRat-draft \ > ../stsInfoRat.withSeq.tab # Create the primer.fa, cloneseq.fa, and primerinfo files for alignment: cd .. ~/kent/src/hg/stsMarkers/luConvertPrimerToFa stsInfoRat.withSeq.tab \ ratPrimer.fa ratSeq.fa ratPrimer.info -a 200 # Strip the sequence column off so that it isn't loaded into the db # (causing mysql warnings due to truncation). The seq column is not # used from here on out: perl -wpe '@w=split("\t"); $w[24] = ""; $_ = join("\t", @w) . "\n";' \ stsInfoRat.withSeq.tab > stsInfoRat.tab # BLAT CLONE SEQUENCES (DONE 3/8/06 angie) # Based on what was done for hg17, I think the thing to do is not use # ooc, but filter out the results with tGapSize > 1000. 
ssh kkstore01 cd /cluster/data/rn4 mkdir -p /cluster/bluearc/rn4/ctgFa foreach d (*/chr*_?{,?}) cp -p $d/$d:t.fa /cluster/bluearc/rn4/ctgFa/ end cp -p /cluster/data/rn4/bed/stsMarkers/ratSeq.fa \ /cluster/bluearc/rn4/ ssh pk mkdir /cluster/data/rn4/bed/stsMarkers/clone cd /cluster/data/rn4/bed/stsMarkers/clone # A run with ooc, and then a run without ooc only on the unaligned seqs, # would probably save time next time around... cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/blat {check in line+ $(path1)} {check in line+ /cluster/bluearc/rn4/ratSeq.fa} -stepSize=5 {check out line+ clone.out/$(root1).psl} #ENDLOOP '_EOF_' # << emacs mkdir clone.out ls -1S /cluster/bluearc/rn4/ctgFa/chr*.fa > ctg.lst gensub2 ctg.lst single gsub jobList para make jobList para time #Completed: 591 of 591 jobs #CPU time in finished jobs: 1628303s 27138.38m 452.31h 18.85d 0.052 y #IO & Wait Time: 26091s 434.85m 7.25h 0.30d 0.001 y #Average job time: 2799s 46.66m 0.78h 0.03d #Longest finished job: 6138s 102.30m 1.71h 0.07d #Submission to last job: 8714s 145.23m 2.42h 0.10d ssh kolossus cd /cluster/data/rn4/bed/stsMarkers/clone # filter alignments for (qEnd-qStart) vs. (tEnd-tStart) # should not be more than 100 bases different. pslSort dirs stdout temp clone.out \ | pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons stdin \ clones.unlifted.tmp.psl /dev/null awk '($8 < 1000) {print;}' clones.unlifted.tmp.psl > clones.unlifted.psl wc -l clones.un* # 19097 clones.unlifted.psl # 22275 clones.unlifted.tmp.psl rmdir temp rm /cluster/bluearc/rn4/ratSeq.fa # BLAT.2 STS PRIMERS (DONE 3/7/06 angie) cp -p /cluster/data/rn4/bed/stsMarkers/ratPrimer.fa \ /cluster/bluearc/rn4/ ssh kk mkdir /cluster/data/rn4/bed/stsMarkers/primer cd /cluster/data/rn4/bed/stsMarkers/primer # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. 
cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat.2 {check in line+ $(path1)} {check in line+ /cluster/bluearc/rn4/ratPrimer.fa} -ooc=/cluster/bluearc/rn4/11.ooc -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl} #ENDLOOP '_EOF_' # << emacs mkdir primers.out ls -1S /cluster/bluearc/rn4/ctgFa/chr*.fa > ctg.lst gensub2 ctg.lst single gsub jobList para make jobList para time #Completed: 591 of 591 jobs #CPU time in finished jobs: 466140s 7769.00m 129.48h 5.40d 0.015 y #IO & Wait Time: 63525s 1058.75m 17.65h 0.74d 0.002 y #Average job time: 896s 14.94m 0.25h 0.01d #Longest finished job: 1756s 29.27m 0.49h 0.02d #Submission to last job: 5330s 88.83m 1.48h 0.06d ssh kkstore01 cd /cluster/data/rn4/bed/stsMarkers/primer # filter alignments for (qEnd-qStart) vs. (tEnd-tStart) # should not be more than 100 bases different. pslSort dirs stdout temp primers.out \ | awk -F"\t" ' \ { if (((($13 - $12) - ($17 - $16)) > -100) && \ ((($13 - $12) - ($17 - $16)) < 100)) {print} \ }' \ > primers.psl.100.unlifted rmdir temp rm /cluster/bluearc/rn4/ratPrimer.fa # Would be very nice to compare wc output with rn3 run, but I cannot # find the rn3 run. wc primers.psl.100.unlifted # 8050086 169051724 799770587 primers.psl.100.unlifted # E-PCR STS PRIMERS (DONE 3/7/06 angie) ssh kkstore01 cp -p /cluster/data/rn4/bed/stsMarkers/ratPrimer.info \ /cluster/bluearc/rn4/ ssh kk mkdir /cluster/data/rn4/bed/stsMarkers/ePCR cd /cluster/data/rn4/bed/stsMarkers/ePCR ls -1S /cluster/bluearc/rn4/ctgFa/chr*.fa > ctg.lst # Hiram built latest e-PCR into /cluster/bin/{i386,x86_64}/e-PCR, # see notes in makeMm7.doc. 
mkdir epcr.out cat << '_EOF_' > runPCR.csh #!/bin/csh -fe /cluster/bin/$MACHTYPE/e-PCR $1 $2 N=1 M=50 W=5 > $3 '_EOF_' chmod +x runPCR.csh cat << '_EOF_' > gsub #LOOP ./runPCR.csh {check in line+ /cluster/bluearc/rn4/ratPrimer.info} {check in line+ $(path1)} {check out line+ epcr.out/$(num1).epcr} #ENDLOOP '_EOF_' gensub2 ctg.lst single gsub jobList para make jobList para time # Actually ran on pk -- kk is swamped at the moment. #Completed: 591 of 591 jobs #CPU time in finished jobs: 103485s 1724.75m 28.75h 1.20d 0.003 y #IO & Wait Time: 29018s 483.63m 8.06h 0.34d 0.001 y #Average job time: 224s 3.74m 0.06h 0.00d #Longest finished job: 341s 5.68m 0.09h 0.00d #Submission to last job: 482s 8.03m 0.13h 0.01d ssh kkstore01 cd /cluster/data/rn4/bed/stsMarkers/ePCR cat epcr.out/*.epcr > all.epcr rm /cluster/bluearc/rn4/ratPrimer.info rm -r /cluster/bluearc/rn4/ctgFa # Would compare to rn3 if I could find it... wc all.epcr # 73885 295540 3915703 all.epcr # COMBINE AND FILTER BLAT.2 AND E-PCR RESULTS (DONE 3/8/06 angie -- stsMapRat.bed REDONE 6/15/06) ssh kolossus cd /cluster/data/rn4/bed/stsMarkers/primer # create accession_info.rdb: AGP info. /cluster/bin/scripts/compileAccInfo -rat /cluster/data/rn4 /dev/null #0 processed # "0 processed" sounds a little scary but it's 0 because we didn't pass # in a -pre rdb. mv accession_info.rdb accession_info.rdb.tmp /cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp \ > accession_info.rdb # some lines have "chrN_random_M" in the Contig column but "chrN" # (not chrN_random) in the chrom column.... looks weird, but reading # compileAccInfo it seems intentional to lump ordered and random. rm accession_info.rdb.tmp # Would compare to rn3 if I could find it... wc accession_info.rdb # 138571 1524285 9921966 accession_info.rdb # filterSTSPrimers combines blat.2 and epcr results. # For human there is a program pslFilterPrimers that combines isPcr and # epcr results... might be interesting to try that next time around. 
/cluster/bin/scripts/filterSTSPrimers -rat \ ../stsInfoRat.tab primers.psl.100.unlifted \ ../ratPrimer.info ../ePCR/all.epcr \ | liftUp -nohead -type=.psl primers.psl.filter.blat \ ../../../jkStuff/liftAll.lft warn stdin # Would compare to rn3 if I could find it... wc primers.psl.100.unlifted # 8050086 169051724 799770587 primers.psl.100.unlifted wc primers.psl.filter.blat # 38707 812847 4100619 primers.psl.filter.blat # filterSTSPrimers creates epcr.not.found which actually contains # items that *were* found by epcr but not by blat. Translate to PSL: /cluster/bin/scripts/epcrToPsl -rat epcr.not.found ../ratPrimer.info \ accession_info.rdb /cluster/data/rn4 # epcrToPsl creates epcr.not.found.nomatch and epcr.not.found.psl # Would compare to rn3 if I could find it... wc epcr* # 748 2992 27112 epcr.not.found # 0 0 0 epcr.not.found.nomatch # 748 15708 72452 epcr.not.found.psl # Lift the PSL: mv epcr.not.found.psl epcr.not.found.unlifted.psl liftUp -nohead epcr.not.found.psl \ ../../../jkStuff/liftAll.lft warn epcr.not.found.unlifted.psl wc epcr.not.found.psl # 748 15708 80899 epcr.not.found.psl cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter # create primers.psl.filter.lifted.initial /cluster/bin/scripts/extractPslInfo primers.psl.filter # Would compare to rn3 if I could find it... wc primers.psl.filter.initial # 39451 236706 2071215 primers.psl.filter.initial # create primers.psl.filter.lifted.initial.acc /cluster/bin/scripts/findAccession -agp \ -rat primers.psl.filter.initial /cluster/data/rn4 # it warns about missing AGP for M_random -- OK. # Would compare to rn3 if I could find it... wc primers.psl.filter.initial.acc # 39451 276157 2579345 primers.psl.filter.initial.acc /cluster/bin/scripts/getStsId -rat \ ../stsInfoRat.tab primers.psl.filter.initial.acc \ | sort -k 4n primers.initial.acc.trans > primers.final # Would compare to rn3 if I could find it... 
wc primers.final # 39451 276157 2232848 primers.final # Lift clone sequence alignments cd /cluster/data/rn4/bed/stsMarkers/clone liftUp -nohead clones.psl \ ../../../jkStuff/liftAll.lft warn clones.unlifted.psl /cluster/bin/scripts/extractPslInfo clones.psl # extractPslInfo creates clones.psl.initial /cluster/bin/scripts/findAccession -agp \ -rat clones.psl.initial /cluster/data/rn4 # findAccession creates clones.psl.initial.acc sort -k 4n clones.psl.initial.acc > clones.final # Combine clone sequence alignments with primer alignments cd /cluster/data/rn4/bed/stsMarkers /cluster/bin/scripts/combineSeqPrimerPos \ clone/clones.final primer/primers.final # creates stsMarkers_pos.rdb # Would compare to rn3 if I could find it... wc stsMarkers_pos.rdb # 39291 275037 1893500 stsMarkers_pos.rdb # 6/15/06: re-running from here after fixing createStsMapRat to make # empty chroms instead of "0" when the chrom is not specified -- # hgTracks.c code depends on this. ~/kent/src/hg/stsMarkers/createStsMapRat \ stsInfoRat.tab stsMarkers_pos.rdb > stsMapRat.bed # Would compare to rn3 if I could find it... wc stsMapRat.bed # 38802 490471 3205607 stsMapRat.bed ~/kent/src/hg/stsMarkers/createStsAlias stsInfoRat.tab \ | sort -u > stsAlias.tab # Warnings about aliases with multiple trueNames... would be good to # pass that on to NCBI, if we ever get a resolution for the previous # questions. # Would compare to rn3 if I could find it... 
wc stsAlias.tab # 69051 207153 1687764 stsAlias.tab # compare old and new name lists: ssh -x hgwdev 'hgsql -N rn3 -e "select name from stsMapRat" | sort -u' \ > rn3.nameList awk '{print $4}' stsMapRat.bed | sort -u > rn4.nameList comm -12 rn?.nameList | wc # 32693 32693 290774 [in common] comm -23 rn3.nameList rn4.nameList | wc # 1208 1208 10659 [in rn3 only] comm -13 rn3.nameList rn4.nameList | wc # 5234 5234 48832 [in rn4 only] # LOAD STS MARKER TABLES (DONE 3/8/06 angie - ratSeq.fa added 6/14/06, stsMapRat reloaded 6/15/06) ssh hgwdev cd /cluster/data/rn4/bed/stsMarkers # Add primer sequences to /gbdb and load: mkdir /gbdb/rn4/stsMarker ln -s /cluster/data/rn4/bed/stsMarkers/ratPrimer.fa \ /gbdb/rn4/stsMarker/ratPrimer.fa ln -s /cluster/data/rn4/bed/stsMarkers/ratSeq.fa \ /gbdb/rn4/stsMarker/ratSeq.fa # PLEASE NOTE: if re-running hgLoadSeq, you MUST add # -replace # hgLoadSeq -replace rn4 /gbdb/rn4/stsMarker/ratPrimer.fa # otherwise there will be a problem that the seq and extFile tables # will be out of sync. hgLoadSeq rn4 /gbdb/rn4/stsMarker/ratPrimer.fa hgLoadSeq rn4 /gbdb/rn4/stsMarker/ratSeq.fa hgLoadSqlTab rn4 stsAlias \ ~/kent/src/hg/lib/stsAlias.sql stsAlias.tab hgLoadSqlTab rn4 stsInfoRat \ ~/kent/src/hg/lib/stsInfoRat.sql stsInfoRat.tab # After some work on the stsInfoRat creation process which should help # next time around (do not do this part next time, just for the record): hgLoadSqlTab rn4 stsInfoRat \ ~/kent/src/hg/lib/stsInfoRat.sql stsInfoRat.tab.new #Warning: load of stsInfoRat did not go as planned: 48048 record(s), 0 row(s) skipped, 5 warning(s) loading stsInfoRat.tab echo 'select * from stsInfoRat' | hgsql -N rn4 > /tmp/tmp # compare /tmp/tmp and stsInfoRat.tmp -- looks like 5 alias fields are # truncated at 255 chars, not so terrible because we have stsAlias. 
# -- But since the IDs are off-by-one for a large stretch of that due # -- to getting rid of a garbage line, I reloaded and manually deleted # -- the unused and often truncated clone sequence values: hgLoadSqlTab rn4 stsInfoRat \ ~/kent/src/hg/lib/stsInfoRat.sql stsInfoRat.tab #Warning: load of stsInfoRat did not go as planned: 48049 record(s), 0 row(s) skipped, 187463 warning(s) loading stsInfoRat.tab hgsql rn4 -e 'update stsInfoRat set clone = "";' # Again, next time around the above should not be necessary. cat primer/primers.psl.filter clone/clones.psl \ | hgLoadPsl -table=all_sts_primer rn4 stdin # Reloaded 6/15/06 hgLoadBed rn4 stsMapRat -tab \ -sqlTable=$HOME/kent/src/hg/lib/stsMapRat.sql stsMapRat.bed #Loaded 38802 elements of size 15 nice featureBits rn4 all_sts_primer #10103096 bases of 2571531505 (0.393%) in intersection nice featureBits rn3 all_sts_primer #9722909 bases of 2571104688 (0.378%) in intersection nice featureBits rn4 stsMapRat #11755453 bases of 2571531505 (0.457%) in intersection nice featureBits rn3 stsMapRat #10119869 bases of 2571104688 (0.394%) in intersection # Clean up. rm -r primer/primers.out/ primer/primers.psl.100.unlifted clone/clone.out/ # end STS MARKERS ########################################################################## ############################################################################### # BUILD KNOWN GENES TABLES (DONE 3/2/06 angie) # Use protein databases built by Fan: sp060115 and proteins060115 # See makeProteins060115.doc for details. # Ask cluster-admin to rsync sp060115 and proteins060115 to kolossus # so that database work can be done there; then final results will be # rsync'd back to hgwdev. 
# Create working subdirectories and temporary databases (kgRn4A) ssh kolossus mkdir /cluster/data/rn4/bed/kgRn4A ln -s /cluster/data/rn4/bed/kgRn4A /cluster/store6/kgDB/bed/kgRn4A ln -s /cluster/data/rn4/bed/kgRn4A /cluster/store11/kg/kgRn4A hgsql rn4 -e "create database kgRn4A" hgsql rn4 -e "create database kgRn4ATemp" mkdir /cluster/bluearc/kgDB/kgRn4A mkdir /cluster/bluearc/kgDB/kgRn4A/protBlat ln -s /cluster/bluearc/kgDB/kgRn4A/protBlat \ /cluster/data/rn4/bed/kgRn4A/protBlat # Save a copy of several genbank tables so that we can produce consistent # results if we need to add more tables later. First, ask cluster-admin # to rsync the tables from hgwdev rn4 --> kolossus. Save files too: mkdir /cluster/data/rn4/bed/kgRn4A/genbankBackup cd /cluster/data/rn4/bed/kgRn4A/genbankBackup chmod 777 . foreach t (all_mrna cds gbCdnaInfo gbExtFile gbLoaded gbSeq gbStatus \ mgcGenes \ refFlat refGene refLink refSeqAli refSeqStatus refSeqSummary \ xenoMrna xenoRefFlat xenoRefGene xenoRefSeqAli) echo $t hgsqldump --tab=. rn4 $t end gzip *.txt chmod 775 . 
# Get all rat protein sequences cd /cluster/data/rn4/bed/kgRn4A/protBlat hgsql -N sp060115 -e \ 'select p.acc, p.val from protein p, accToTaxon x \ where x.taxon=10116 and p.acc=x.acc'\ | awk '{print ">" $1; print $2;}' \ > ratProt.fa hgsql -N sp060115 -e \ 'select v.varAcc, p.val from varAcc v, protein p, accToTaxon x \ where v.parAcc = p.acc and x.taxon=10116 and v.parAcc=x.acc'\ | awk '{print ">" $1; print $2;}' \ >> ratProt.fa # Prepare and perform cluster run for protein/genome alignment ssh kolossus cd /cluster/bluearc/kgDB/kgRn4A/protBlat mkdir prot faSplit sequence ratProt.fa 2000 prot/prot ls /cluster/bluearc/kgDB/kgRn4A/protBlat/prot/* > prot.lis hgsql rn4 -N -e 'select chrom from chromInfo' > chrom.lis ssh pk cd /cluster/bluearc/kgDB/kgRn4A/protBlat cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/blat -t=dnax -q=prot /scratch/hg/rn4/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgRn4A/protBlat/result/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << emacs mkdir result gensub2 chrom.lis prot.lis gsub jobList para make jobList #Completed: 89100 of 89100 jobs #CPU time in finished jobs: 4215176s 70252.93m 1170.88h 48.79d 0.134 y #IO & Wait Time: 230997s 3849.96m 64.17h 2.67d 0.007 y #Average job time: 50s 0.83m 0.01h 0.00d #Longest finished job: 5794s 96.57m 1.61h 0.07d #Submission to last job: 16257s 270.95m 4.52h 0.19d # collect BLAT results ssh kolossus cd /cluster/bluearc/kgDB/kgRn4A/protBlat pslSort -nohead dirs raw.psl temp result pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 raw.psl \ protBlat.psl /dev/null hgLoadPsl -fastLoad rn4 protBlat.psl #load of protBlat did not go as planned: 14768 record(s), 0 row(s) skipped, 48 warning(s) loading psl.tab # Some of the lines had negative qGapSizes. ask Jim... 
# create all_mrna.psl and tight_mrna.psl zcat /cluster/data/rn4/bed/kgRn4A/genbankBackup/all_mrna.txt.gz \ | cut -f 2-22 > all_mrna.psl pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 all_mrna.psl \ tight_mrna.psl /dev/null # Use overlapSelect to get protein and mRNA alignment overlaps overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \ -selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.stat overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 \ -selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.out # Create protein/mRNA pair and protein lists cut -f 10,31 protMrna.out | sort -u > spMrna.tab cut -f 10 protMrna.out | sort -u > protein.lis # Load spMrna.tab into spMrna table in temp DB. hgLoadSqlTab -notOnServer kgRn4ATemp spMrna ~/kent/src/hg/lib/spMrna.sql \ spMrna.tab hgsql kgRn4ATemp -e 'create index mrnaID on spMrna(mrnaID)' # Prepare and perform cluster run of protein/mRNA alignment # Get mRNA fa file. cd /cluster/data/rn4/bed/kgRn4A /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=rn4 \ -gbRoot=/cluster/data/genbank genbank mrna mrna.fa # Create mrnaSeq table in kgRn4ATemp DB. hgPepPred kgRn4ATemp generic mrnaSeq mrna.fa # Prepare files for cluster run cd /cluster/bluearc/kgDB/kgRn4A ~/kent/src/hg/protein/KG2.sh kgRn4A rn4 060115 # Perform cluster run of protein/mRNA alignment ~/kent/src/hg/protein/KG3.sh kgRn4A rn4 060115 # Collect cluster run results cd kgBestMrna cp /dev/null protMrnaRaw.psl foreach d (out/*) echo $d cat $d/*.out >> protMrnaRaw.psl end # Filter out low quality alignments pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl \ protMrnaBlat.psl /dev/null #Processed 33623 alignments cut -f 10,14 protMrnaBlat.psl | sort -u > protMrna.lis wc protMrna.lis # 21430 42860 334656 protMrna.lis # Load BLAT results into temp DB. 
ssh kolossus cd /cluster/data/rn4/bed/kgRn4A/kgBestMrna hgsql kgRn4ATemp -e 'create table chromInfo select * from rn4.chromInfo' hgLoadPsl -fastLoad kgRn4ATemp protMrnaBlat.psl #load of protMrnaBlat did not go as planned: 21432 record(s), 0 row(s) skipped, 97 warning(s) loading psl.tab #-- tGapBases (tGapInsert) this time -- ask Jim # Create CDS files from protein/mRNA alignment results. hgsql kgRn4ATemp -N -e \ 'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat \ order by qName,tName,tEnd-tStart desc'\ | sed 's/\t_\t/_/g; s/\t:\t/../g' > protMrna.cds # Create protMrna.psl with proteinID_mrnaID as query ID. cut -f 22-30 ../protBlat/protMrna.out > j1.tmp cut -f 32-42 ../protBlat/protMrna.out > j2.tmp cut -f 10,31 ../protBlat/protMrna.out | sed -e 's/\t/_/g' > j3.tmp paste j1.tmp j3.tmp j2.tmp > protMrna.psl rm j1.tmp j2.tmp j3.tmp # Run mrnaToGene to create protMrna.gp bash mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp \ 2>protMrna.err >protMrna.log exit # No need to move kgBestMrna anywhere else for now: du -sh #240M . df -h . #kkstore01-10:/export/cluster/store9 # 1.3T 1.1T 199G 84% /cluster/store9 # Prepare refGene and all_mrna gp files. cd .. zcat genbankBackup/refGene.txt.gz > ref.gp hgsql rn4 -N -e \ 'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna \ where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \ | sort -u > all_mrna.cds bash mrnaToGene -cdsFile=all_mrna.cds \ /cluster/bluearc/kgDB/kgRn4A/protBlat/all_mrna.psl all_mrna.gp \ 2>all_mrna.err > all_mrna.log exit # Align proteins to RefSeq. 
overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl \ -inFmt=genePred protBlat/protBlat.psl ref.gp ref.stat overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 \ -selectFmt=psl -inFmt=genePred protBlat/protBlat.psl ref.gp protRef.gp overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl \ -overlapThreshold=0.80 -inFmt=psl \ -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out cut -f 10,22 protRef.out | sort -u >spRef.tab cut -f 10 protRef.out | sort -u >protRef.lis hgLoadSqlTab kgRn4ATemp spRef ~/kent/src/hg/lib/spRef.sql spRef.tab # Prepare and perform cluster runs for protein/RefSeq alignments ~/kent/src/hg/protein/KGRef2.sh kgRn4A rn4 060115 ~/kent/src/hg/protein/KGRef3.sh kgRn4A rn4 060115 cd kgBestRef cp /dev/null protRefRaw.psl foreach d (out/*) echo $d cat $d/*.out >> protRefRaw.psl end # Filter out low quality alignments. pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl \ protRefBlat.psl /dev/null #Processed 15266 alignments cut -f 10,14 protRefBlat.psl | sort -u > protRef.lis wc protRef.lis # 11957 23914 216980 protRef.lis hgLoadPsl -fastLoad kgRn4ATemp protRefBlat.psl #load of protRefBlat did not go as planned: 11958 record(s), 0 row(s) skipped, 45 warning(s) loading psl.tab # Negative tGapSizes again. 
# Run gene-check to filter out invalid gp entries cd /cluster/data/rn4/bed/kgRn4A cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp /cluster/bin/i386/gene-check -incl-ok -ok-genepred-out \ kgCandidate0.passed.gp \ -nib-dir /cluster/data/rn4/nib kgCandidate0.gp kgCandidate0.check ldHgGene -predTab kgRn4ATemp kgCandidate0 kgCandidate0.gp tail +3 kgCandidate0.check > geneCheck.tab hgLoadSqlTab kgRn4ATemp geneCheck ~/kent/src/hg/lib/geneCheck.sql \ geneCheck.tab # Run kgCheck to get all KG candidates that pass the KG gene check criteria kgCheck kgRn4ATemp rn4 kgCandidate0 geneCheck kgCandidate.tab hgLoadSqlTab kgRn4ATemp kgCandidate ~/kent/src/hg/lib/kgCandidate.sql \ kgCandidate.tab hgsql kgRn4ATemp -e 'create index alignID on kgCandidate(alignID)' # Construct the kgCandidateX table that has alignID in the name field. cut -f 2-10 kgCandidate.tab > j2.tmp cut -f 11 kgCandidate.tab > j1.tmp paste j1.tmp j2.tmp > kgCandidateX.tab ldHgGene -predTab kgRn4ATemp kgCandidateX kgCandidateX.tab # Score protein/mRna and protein/RefSeq alignments ln -s /cluster/bluearc/kgDB/kgRn4A/protBlat/protein.lis . kgResultBestMrna2 060115 kgRn4ATemp rn4 protMrnaBlat | sort -u \ > protMrnaBlatScore.tab kgResultBestRef2 060115 kgRn4ATemp rn4 protRefBlat | sort -u \ > protRefScore.tab # Combine scoring results and load them into temp DB. cat protMrnaBlatScore.tab protRefScore.tab > protMrnaScore.tab hgLoadSqlTab kgRn4ATemp protMrnaScore ~/kent/src/hg/lib/protMrnaScore.sql \ protMrnaScore.tab hgsql kgRn4ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)' # Run kgGetCds to get CDS structure of each gene kgGetCds kgRn4ATemp 060115 kgCandidateX stdout \ | sort -u > kgCandidateY.tab hgLoadSqlTab kgRn4ATemp kgCandidateY ~/kent/src/hg/lib/kgCandidateY.sql \ kgCandidateY.tab # Run kgPickPrep to replace long cds structure string with cdsId. 
kgPickPrep kgRn4ATemp kgCandidateZ.tab hgLoadSqlTab kgRn4ATemp kgCandidateZ ~/kent/src/hg/lib/kgCandidateZ.sql \ kgCandidateZ.tab hgsql kgRn4ATemp -e 'create index cdsId on kgCandidateZ(cdsId)' # Run kgPick to pick the representative a mrna/protein pair for each # unique CDS structure. kgPick kgRn4ATemp rn4 sp060115 kg3.tmp stdout \ | sort -u > dupSpMrna.tab # Create put back list /cluster/data/genbank/bin/i386/gbGetSeqs \ -gbRoot=/cluster/data/genbank db=rn4 -get=ra -native RefSeq mrna stdout \ | perl -wpe 's/^(\w+) (.*)/$1\t$2/ || next; $acc = $2 if ($1 eq "acc"); \ s/^/$acc\t/;' \ | sort -u | tail +2 > refRa.tab hgLoadSqlTab rn4 refRa ~/kent/src/hg/lib/refRa.sql refRa.tab hgsql rn4 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \ where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and \ r2.val="rev" and r3.acc=r.acc and r3.attr="org" and \ r3.val="Rattus norvegicus"' \ > kgPutBack2.tab hgsql rn4 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \ where r.attr="cno" and r.val like "%ribosomal frameshift%" and \ r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and \ r3.acc=r.acc and r3.attr="org" and r3.val="Rattus norvegicus"' \ >> kgPutBack2.tab hgsql rn4 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \ where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and \ r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and \ r3.attr="org" and r3.val="Rattus norvegicus"' \ >> kgPutBack2.tab hgsql rn4 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \ where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and \ r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and \ r3.val="Rattus norvegicus"' \ >> kgPutBack2.tab hgsql rn4 -N -e \ 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \ where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and \ r2.val="rev" and r2.val="rev" and r3.acc=r.acc and 
r3.attr="org" and \ r3.val="Rattus norvegicus"' \ >> kgPutBack2.tab wc -l kgPutBack2.tab #10 kgPutBack2.tab hgLoadSqlTab kgRn4ATemp kgPutBack2 ~/kent/src/hg/lib/kgPutBack2.sql \ kgPutBack2.tab kgPutBack kgRn4ATemp rn4 sp060115 kgPutBack2 kgPutBack2.gp # Sort KG genes to make the kg4.gp table file. cat kgPutBack2.gp kg3.tmp > kg4.tmp ~/kent/src/hg/protein/sortKg.pl kg4.tmp \ | cut -f 1-12 > knownGene.tab # Load data into knownGene table in both kgRn4ATemp and rn4. hgLoadSqlTab kgRn4ATemp knownGene ~/kent/src/hg/lib/knownGene.sql \ knownGene.tab hgLoadSqlTab rn4 knownGene ~/kent/src/hg/lib/knownGene.sql \ knownGene.tab # Load dupSpMrna table after knownGene table is loaded so that \ # joinerCheck does not complain. hgLoadSqlTab rn4 dupSpMrna ~/kent/src/hg/lib/dupSpMrna.sql dupSpMrna.tab # Perform analysis on KG nice featureBits rn4 -enrichment refGene knownGene #refGene 0.722%, knownGene 0.582%, both 0.539%, cover 74.72%, enrich 128.33x nice featureBits rn4 -enrichment refGene:cds knownGene:cds #refGene:cds 0.500%, knownGene:cds 0.377%, both 0.352%, cover 70.41%, enrich 186.83x nice featureBits rn3 -enrichment refGene knownGene #refGene 0.721%, knownGene 0.523%, both 0.411%, cover 56.96%, enrich 108.94x nice featureBits rn3 -enrichment refGene:cds knownGene:cds #refGene:cds 0.500%, knownGene:cds 0.392%, both 0.294%, cover 58.91%, enrich 150.16x # Build knownGeneMrna and knownGenePep tables. kgPepMrna kgRn4ATemp rn4 060115 hgPepPred rn4 tab knownGeneMrna knownGeneMrna.tab hgLoadSqlTab rn4 knownGenePep ~/kent/src/hg/lib/knownGenePep.sql knownGenePep.tab # Use hgwdev's entrez database (loaded by Fan, see makeHg18.doc) # to build the mrnaRefseq table. 
ssh hgwdev hgsql entrez -N -e \ 'select mrna, refseq from entrezRefseq, entrezMrna \ where entrezRefseq.geneID=entrezMrna.geneID' \ > /cluster/data/rn4/bed/kgRn4A/mrnaRefseq.tmp hgsql rn4 -N -e 'select name, name from refGene' \ >> /cluster/data/rn4/bed/kgRn4A/mrnaRefseq.tmp ssh kolossus cd /cluster/data/rn4/bed/kgRn4A sort -u mrnaRefseq.tmp > mrnaRefseq.tab hgLoadSqlTab rn4 mrnaRefseq ~/kent/src/hg/lib/mrnaRefseq.sql mrnaRefseq.tab # Build kgXref table kgXref2 kgRn4ATemp 060115 rn4 hgLoadSqlTab rn4 kgXref ~/kent/src/hg/lib/kgXref.sql kgXref.tab # Build spMrna table hgsql rn4 -N -e 'select proteinID, name from knownGene' > kgSpMrna.tab hgLoadSqlTab rn4 spMrna ~/kent/src/hg/lib/spMrna.sql kgSpMrna.tab # Build kgProtMap table ln -s /cluster/bluearc/kgDB/kgRn4A/protBlat/tight_mrna.psl . ~/kent/src/hg/protein/kgProtMap2.sh kgRn4A rn4 060115 ##################################### # Build alias tables. kgAliasM rn4 proteins060115 kgAliasKgXref rn4 kgAliasRefseq rn4 hgsql sp060115 -N -e \ 'select name,gene.val from rn4.knownGene,displayId,gene \ where displayId.val=proteinID and displayId.acc=gene.acc' \ | sort -u > kgAliasP.tab hgsql rn4 -N -e 'select name, name from knownGene' > kgAliasDup.tab hgsql rn4 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >> kgAliasDup.tab sort -u kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab \ kgAliasDup.tab > kgAlias.tab hgLoadSqlTab rn4 kgAlias ~/kent/src/hg/lib/kgAlias.sql kgAlias.tab kgProtAlias rn4 060115 hgsql rn4 -N -e \ 'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""' \ | sort -u > kgProtAliasNCBI.tab # include variant splice protein IDs hgsql rn4 -N -e \ 'select name, proteinID, parAcc from knownGene,sp060115.varAcc \ where varAcc=proteinID' \ | sort -u > kgProtAliasDup.tab # include duplicate protein IDs from dupSpMrna table hgsql rn4 -N -e \ 'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna \ where name=mrnaID' \ | sort -u >> kgProtAliasDup.tab # catch parent acc from 
dupProteinID too hgsql rn4 -N -e\ 'select name, knownGene.proteinID, parAcc \ from knownGene,dupSpMrna,sp060115.varAcc \ where name=mrnaID and dupProteinID=varAcc.varAcc' \ | sort -u >> kgProtAliasDup.tab sort -u kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab \ > kgProtAliasAll.tab hgLoadSqlTab rn4 kgProtAlias ~/kent/src/hg/lib/kgProtAlias.sql \ kgProtAliasAll.tab # Build kgSpAlias table hgsql rn4 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias \ where kgXref.kgID=kgAlias.kgID' > j.tmp hgsql rn4 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias \ where kgXref.kgID=kgProtAlias.kgID' >> j.tmp sort -u j.tmp | grep -v 'kgID' > rn4.kgSpAlias.tab rm j.tmp hgLoadSqlTab rn4 kgSpAlias ~/kent/src/hg/lib/kgSpAlias.sql \ rn4.kgSpAlias.tab # Ask cluster-admin to rsync the following tables from kolossus rn4 # back to hgwdev rn4: dupSpMrna kgAlias kgProtAlias kgProtMap kgSpAlias kgXref knownGene knownGeneMrna knownGenePep mrnaRefseq protBlat refRa spMrna # Copy history comments from kolossus to hgwdev's rn4.history table. hgsqldump rn4 history \ | egrep 'dupSpMrna|kgAlias|kgProtAlias|kgProtMap|kgSpAlias|kgXref|knownGene|knownGeneMrna|knownGenePep|mrnaRefseq|protBlat|refRa|spMrna' \ | perl -wpe 's/\([0-9]+,/(NULL,/' \ > kgHistory.sql ssh hgwdev hgsql rn4 < /cluster/data/rn4/bed/kgRn4A/kgHistory.sql # CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 3/6/06 angie) # This depends on the go and uniProt databases as well as # the kgAlias and kgProAlias tables. 
ssh hgwdev mkdir /cluster/data/rn4/bed/kgRn4A/index cd /cluster/data/rn4/bed/kgRn4A/index hgKgGetText rn4 knownGene.text ixIxx knownGene.text knownGene.ix knownGene.ixx ln -s /cluster/data/rn4/bed/kgRn4A/index/knownGene.ix /gbdb/rn4/ ln -s /cluster/data/rn4/bed/kgRn4A/index/knownGene.ixx /gbdb/rn4/ # end KNOWN GENES ############################################################################### # BLASTZ/CHAIN/NET DANRER3 - NO LINSPECREP (DONE 3/2/06 angie) ssh kkstore01 mkdir /cluster/data/rn4/bed/blastz.danRer3.2006-03-02 cd /cluster/data/rn4/bed/blastz.danRer3.2006-03-02 cat << '_EOF_' > DEF # rat vs zebrafish BLASTZ=blastz.v7.x86_64 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Rat Rn4 SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer3) # large enough chunk to do complete chroms at once SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes SEQ2_CHUNK=100000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastz.danRer3.2006-03-02 '_EOF_' # << for emacs doBlastzChainNet.pl DEF -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/blastzRn4DanRer3Out >& do.log & tail -f do.log ln -s blastz.danRer3.2006-03-02 /cluster/data/rn4/bed/blastz.danRer3 # BLASTZ/CHAIN/NET DANRER3 - WITH LINSPECREP (DONE 3/3/06 angie) # The coverage decreased so I ended up not using this run. # It would be interesting to try a run that uses all repeats # *except for simple* as lineage-specific too sometime. 
ssh kkstore01 # Set up rn4/linSpecRep.notInNonMammal mkdir /cluster/bluearc/rn4/linSpecRep.notInNonMammal cd /cluster/data/rn4 foreach f (?{,?}/chr*.fa.out) echo $f:t:r:r cp -p $f /cluster/bluearc/rn4/linSpecRep.notInNonMammal/$f:t:r:r.out.spec end # Rename tables from previous run; remove download dir ssh hgwdev foreach t (netDanRer3 \ `hgsql rn4 -N -e 'show tables like "%chainDanRer3%"'`) set newT = `echo $t | sed -e 's/Rer3/Rer3NoLSR/'` hgsql rn4 -e 'rename table '$t' to '$newT end rm -r /usr/local/apache/htdocs/goldenPath/rn4/vsDanRer3 ssh pk mkdir /cluster/data/rn4/bed/blastz.danRer3.2006-03-03 cd /cluster/data/rn4/bed/blastz.danRer3.2006-03-03 cat << '_EOF_' > DEF # rat vs zebrafish BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Rat Rn4 SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_SMSK=/cluster/bluearc/rn4/linSpecRep.notInNonMammal SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer3) # large enough chunk to do complete chroms at once SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes SEQ2_CHUNK=100000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastz.danRer3.2006-03-03 '_EOF_' # << for emacs doBlastzChainNet.pl DEF -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/blastzRn4DanRer3Out >& do.log & tail -f do.log ln -s blastz.danRer3.2006-03-03 /cluster/data/rn4/bed/blastz.danRer3 # COMPARE danRer3 runs, without vs. with LSR (DONE 3/13/06 angie) ssh kolossus cd /cluster/data/rn4/bed # have to use redirect because vprintf doesn't like being called # twice on x86_64... should fix that sometime. 
netStats /dev/null blastz.danRer3.2006-03-02/axtChain/rn4.danRer3.net.gz \ > blastz.danRer3.2006-03-02/axtChain/netStats.out netStats /dev/null blastz.danRer3.2006-03-03/axtChain/rn4.danRer3.net.gz \ > blastz.danRer3.2006-03-03/axtChain/netStats.out sdiff blastz.danRer3.2006-03-0[23]/axtChain/netStats.out # A few points of comparison from that: # Without: With: # max depth: 11 7 # gap count: 618046 487550 # gap average size T: 809.1 1024.5 # gap average size Q: 529.2 688.0 # fill count: 60347 58415 # top count: 38000 37787 # top percent of total: 84.9% 87.5% # nonSyn percent of total: 11.1% 9.6% # syn percent of total: 2.5% 1.8% # Coverage: netToBed blastz.danRer3.2006-03-02/axtChain/{rn4.danRer3.net.gz,net.bed} netToBed blastz.danRer3.2006-03-03/axtChain/{rn4.danRer3.net.gz,net.bed} featureBits rn4 blastz.danRer3.2006-03-02/axtChain/net.bed #513894027 bases of 2571531505 (19.984%) in intersection featureBits rn4 blastz.danRer3.2006-03-03/axtChain/net.bed #509076601 bases of 2571531505 (19.797%) in intersection # OK, so coverage drops with LSR. Maybe some repeats are being snipped # that could be aligned (ancestral)? # Eyeball the one place where level == 11 in the 02 set # ("select * from netDanRer3NoLSR where level >= 11;" on hgwdev): # chr3 | 97071076 | 97071655, zoom out 1.5 # NoLSR has a lot more chains... more levels filled. # with LSR gets a "Level 2" chain that NoLSR does not get at all. # Who's to say which is right??? NoLSR looks messier to me but that # is a very subjective judgment. # Interestingly, there are no repeats on rn4 in that region. # Panning to the right a ways, just long gaps for NoLSR but a few # blocks for with. NoLSR gets some sizable alignments where with # does not... a stretch of simple/low-complexity repeats there. # Those should probably be excluded when we are using all repeats # as lin-spec! Hopping over to a danRer3 region aligned only by # NoLSR I see some more low-complexity. Should run that by Arian. 
# Could do yet another run with all repeats except simple/low-complexity # (what is DNA? maybe that too?) as linSpec. # Panning to the right some more, a long dry spell for with LSR. # There are olfactory receptor genes around which could indicate a # tougher job than usual. Not much agreement between the nets though, # yikes. High precision, but at what accuracy? # I just worry about the reliability of what comes out of the pipeline... # so sensitive to parameters. # chr3:97,345,533-97,360,012 there is a withLSR chain but the net is empty. # Since the coverage is higher without LSR, go with that for now. # BLASTZ/CHAIN/NET XENTRO1 (DONE 3/20/06 angie) ssh kkstore01 mkdir /cluster/data/rn4/bed/blastz.xenTro1.2006-03-20 cd /cluster/data/rn4/bed/blastz.xenTro1.2006-03-20 cat << '_EOF_' > DEF # rat vs. frog BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 # Mammal to non-mammal, with threshold of 8000 as in mm8-xenTro1 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Rat rn4 SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Frog xenTro1 - single chunk big enough to run two of the # largest scaffolds in one job SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/rn4/bed/blastz.xenTro1.2006-03-20 '_EOF_' # << emacs doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/rn4XenTro1 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \ >& do.log & tail -f do.log ln -s blastz.xenTro1.2006-03-20 /cluster/data/rn4/bed/blastz.xenTro1 # BLASTZ/CHAIN/NET RHEMAC2 (DONE 3/23/06 angie) # Won't put this in Conservation -- special request for ancestor recon. ssh pk mkdir /cluster/data/rn4/bed/blastz.rheMac2.2006-03-22 cd /cluster/data/rn4/bed/blastz.rheMac2.2006-03-22 cat << '_EOF_' > DEF # Rat vs. 
macaque BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Rat (rn4) SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Macaque (rheMac2) SEQ2_DIR=/san/sanvol1/scratch/rheMac2/nib SEQ2_SMSK=/cluster/bluearc/rheMac2/linSpecRep/notInRodent SEQ2_LEN=/cluster/data/rheMac2/chrom.sizes SEQ2_CHUNK=50000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastz.rheMac2.2006-03-22 '_EOF_' # << emacs doBlastzChainNet.pl -blastzOutRoot /san/sanvol1/rn4RheMac2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium DEF \ >& do.log & tail -f do.log ln -s blastz.rheMac2.2006-03-22 /cluster/data/rn4/bed/blastz.rheMac2 ####################################################################### # BLASTZ/CHAIN/NET MONDOM4 (DONE 3/24/06 angie) ssh pk mkdir /cluster/data/rn4/bed/blastz.monDom4.2006-03-23 cd /cluster/data/rn4/bed/blastz.monDom4.2006-03-23 cat << '_EOF_' > DEF # Rat vs. 
opossum BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 # Mammal to non-mammal -- well, I guess non-placental-mammal, with # high L like chicken, and low M to guard against pileups w/monDom4: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=20 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Rat (rn4) SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=100000000 SEQ1_LAP=10000 # QUERY: Opossum monDom4 SEQ2_DIR=/cluster/bluearc/scratch/hg/monDom4/monDom4.2bit SEQ2_LEN=/cluster/bluearc/scratch/hg/monDom4/chrom.sizes SEQ2_CHUNK=50000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastz.monDom4.2006-03-23 '_EOF_' # << emacs doBlastzChainNet.pl -blastzOutRoot /san/sanvol1/rn4MonDom4 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \ -workhorse kkr7u00 >& do.log & tail -f do.log ln -s blastz.monDom4.2006-03-23 /cluster/data/rn4/bed/blastz.monDom4 ssh kolossus cd /cluster/data/rn4/bed/blastz.monDom4 time nice -n +19 featureBits rn4 chainMonDom4Link \ > fb.rn4.chainMonDom4Link 2>&1 cat fb.rn4.chainMonDom4Link # 200166664 bases of 2571531505 (7.784%) in intersection ####################################################################### # BLASTZ/CHAIN/NET BOSTAU2 (DONE 3/24/06 angie) ssh pk mkdir /cluster/data/rn4/bed/blastz.bosTau2.2006-03-23 cd /cluster/data/rn4/bed/blastz.bosTau2.2006-03-23 cat << '_EOF_' > DEF # rat vs cow BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 # TARGET: Rat (rn4) SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Cow (bosTau2) # large enough chunk to do chroms in one piece SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes SEQ2_CHUNK=150000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastz.bosTau2.2006-03-23 '_EOF_' # << emacs doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=pk -workhorse=kkr8u00 DEF \ >& do.log & tail -f do.log 
ln -s blastz.bosTau2.2006-03-23 /cluster/data/rn4/bed/blastz.bosTau2 # SELF CHAINS (DONE - NOT LOADED INTO DB 3/27/06 angie) # Won't put this in Conservation -- special request for ancestor recon. ssh pk mkdir /cluster/data/rn4/bed/blastz.rn4.2006-03-24 cd /cluster/data/rn4/bed/blastz.rn4.2006-03-24 cat << '_EOF_' > DEF # rat vs rat BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_M=200 # TARGET: rat rn4 SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: rat rn4 SEQ2_DIR=/scratch/hg/rn4/nib SEQ2_LEN=/cluster/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastz.rn4.2006-03-24 '_EOF_' # << emacs doBlastzChainNet.pl -chainMinScore=10000 -chainLinearGap=medium \ -bigClusterHub=pk -workhorse=kkr7u00 DEF \ >& do.log & tail -f do.log # It died during loading because hgwdev /var/lib/mysql ran out of room! # There is enough space now but I deleted the tables anyway -- can load # later if anyone wants to browse. Did -continue download so that the # ancestor-reconstruction folks who requested this can find the chains # in the usual place. ln -s blastz.rn4.2006-03-24 /cluster/data/rn4/bed/blastz.rn4 # BLASTZ/CHAIN/NET PANTRO2 (DONE 3/27/06 angie) # Won't put this in Conservation -- special request for ancestor recon. ssh pk mkdir /cluster/data/rn4/bed/blastz.panTro2.2006-03-27 cd /cluster/data/rn4/bed/blastz.panTro2.2006-03-27 cat << '_EOF_' > DEF # Rat vs. 
chimp BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Rat (rn4) SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chimp (panTro2) SEQ2_DIR=/scratch/hg/panTro2/nib SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent SEQ2_LEN=/cluster/data/panTro2/chrom.sizes SEQ2_CHUNK=50000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastz.panTro2.2006-03-27 '_EOF_' # << emacs doBlastzChainNet.pl -blastzOutRoot /san/sanvol1/rn4PanTro2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium DEF \ -workhorse=kkr5u00 >& do.log & tail -f do.log ln -s blastz.panTro2.2006-03-27 /cluster/data/rn4/bed/blastz.panTro2 ############################################################################ ## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram) ssh kk cd /cluster/data/mm8/bed/blastzRn4.2006-02-16 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > swap.out 2>&1 & time nice -n +19 featureBits rn4 chainMm8Link # 1791093685 bases of 2571531505 (69.651%) in intersection # BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED 2006-03-13, Fan) # This should be done after KG tables are complete from known genes build # process. # # Cluster together various alt-splicing isoforms. # Creates the knownIsoforms and knownCanonical tables ssh hgwdev mkdir /cluster/data/rn4/bed/geneSorter.2006-03-13 # remove old symbolic link rm /cluster/data/rn4/bed/geneSorter ln -s /cluster/data/rn4/bed/geneSorter.2006-03-13 /cluster/data/rn4/bed/geneSorter cd /cluster/data/rn4/bed/geneSorter hgClusterGenes rn4 knownGene knownIsoforms knownCanonical # Extract peptides from knownGenes into fasta file # and create a blast database out of them. 
mkdir /cluster/data/rn4/bed/geneSorter/blastp ln -s /cluster/data/rn4/bed/geneSorter/blastp /cluster/data/rn4/bed/blastp cd /cluster/data/rn4/bed/geneSorter/blastp pepPredToFa rn4 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /scratch/blast/formatdb -i known.faa -t known -n known # This command is in /projects/compbio/bin/$MACH/formatdb #TODO: swap rn4 to: xenTro1 # MULTIZ9WAY (DONE 3/30/06 angie) # (((((((rn4 mm8) hg18) (canFam2 bosTau2)) monDom4) galGal2) xenTro1) danRer3) ssh kkstore01 mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30 cd /cluster/data/rn4/bed/multiz9way.2006-03-30 # copy MAF's to cluster-friendly server # These MAF's already on bluearc: # canFam2, fr1, galGal2, panTro1, rn4 mkdir /san/sanvol1/scratch/rn4/mafNet foreach s (mm8 hg18 canFam2 bosTau2 monDom4 galGal2 xenTro1 danRer3) echo $s rsync -av /cluster/data/rn4/bed/blastz.$s/mafNet/* \ /san/sanvol1/scratch/rn4/mafNet/$s/ end # Prune the hg17 17way tree to just these 9 and update db names: /cluster/bin/phast/tree_doctor \ --prune-all-but=rat_rn3,mouse_mm7,human_hg17,dog_canFam2,cow_bosTau2,monodelphis_monDom2,chicken_galGal2,xenopus_xenTro1,zebrafish_danRer3 \ --rename="rat_rn3 -> rat_rn4 ; mouse_mm7 -> mouse_mm8 ; human_hg17 -> human_hg18 ; monodelphis_monDom2 -> monodelphis_monDom4" \ /cluster/data/hg17/bed/multiz17way/17way.nh > 9way.nh # *carefully* edit 9way.nh to move hg18 to appear after (rat mouse), # save as 9way_ratFirst.nh, and make a tree picture: /cluster/bin/phast/draw_tree 9way_ratFirst.nh > 9way.ps ps2pdf 9way.ps > 9way.pdf /cluster/bin/phast/all_dists 9way_ratFirst.nh > 9way.distances.txt grep rn4 9way.distances.txt | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt cat distances.txt #0.1587 mouse_mm8 #0.4724 human_hg18 #0.6277 dog_canFam2 #0.6392 cow_bosTau2 #1.0745 monodelphis_monDom4 #1.3472 chicken_galGal2 #1.7983 xenopus_xenTro1 #2.1105 zebrafish_danRer3 # the order in the browser display will be by tree 
topology, # not by distance, but in this case (unlike human) they happen to match. # create species list and stripped down tree for autoMZ sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 9way_ratFirst.nh \ > tree-commas.nh sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst ssh pk cd /cluster/data/rn4/bed/multiz9way.2006-03-30 mkdir maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = rn4 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/mafNet rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/rn4/bed/multiz9way.2006-03-30/maf/$(root1).maf} #ENDLOOP 'EOF' # << emacs awk '{print $1}' /cluster/data/rn4/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para make jobList para time #Completed: 45 of 45 jobs #CPU time in finished jobs: 60564s 1009.39m 16.82h 0.70d 0.002 y #IO & Wait Time: 2996s 49.94m 0.83h 0.03d 0.000 y #Average job time: 1412s 23.54m 0.39h 0.02d #Longest finished job: 6172s 102.87m 1.71h 0.07d #Submission to last job: 6173s 102.88m 1.71h 0.07d # Make .jpg for tree and install in htdocs/images/phylo/... don't forget # to request a push of that file. 
The treeImage setting in trackDb.ra # is phylo/rn4_9way.jpg (relative to htdocs/images). ssh hgwdev cd /cluster/data/rn4/bed/multiz9way.2006-03-30 pstopnm -stdout 9way.ps | pnmtojpeg > rn4_9way.jpg ln -s /cluster/data/rn4/bed/multiz9way.2006-03-30/rn4_9way.jpg \ /usr/local/apache/htdocs/images/phylo/rn4_9way.jpg # ANNOTATE MULTIZ9WAY MAF AND LOAD TABLES (DONE 4/4/2006 angie) ssh kolossus mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/anno cd /cluster/data/rn4/bed/multiz9way.2006-03-30/anno mkdir maf run cd run rm -f sizes nBeds foreach db (`cat /cluster/data/rn4/bed/multiz9way.2006-03-30/species.lst`) ln -s /cluster/data/$db/chrom.sizes $db.len if (! -e /cluster/data/$db/$db.N.bed) then twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed} endif ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds echo $db.len >> sizes end echo date > jobs.csh # do smaller jobs first: foreach f (`ls -1rS ../../maf/*.maf`) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \ /cluster/data/rn4/rn4.2bit ../maf/`basename $f` >> jobs.csh echo "echo $f" >> jobs.csh end echo date >> jobs.csh csh -efx jobs.csh >&! jobs.log & tail -f jobs.log # 1.5 hrs. # Load anno/maf ssh hgwdev cd /cluster/data/rn4/bed/multiz9way.2006-03-30/anno/maf mkdir -p /gbdb/rn4/multiz9way/anno/maf ln -s /cluster/data/rn4/bed/multiz9way.2006-03-30/anno/maf/*.maf \ /gbdb/rn4/multiz9way/anno/maf cat > loadMaf.csh << 'EOF' date nice hgLoadMaf -pathPrefix=/gbdb/rn4/multiz9way/anno/maf rn4 multiz9way date 'EOF' # << emacs csh -efx loadMaf.csh >&! 
loadMaf.log & tail -f loadMaf.log # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kkr7u00 cd /cluster/data/rn4/bed/multiz9way.2006-03-30/anno/maf cat *.maf \ | nice hgLoadMafSummary rn4 -minSize=30000 -mergeGap=1500 -maxSize=200000 \ -test multiz9waySummary stdin #Created 1408247 summary blocks from 22055250 components and 8577575 mafs from stdin ssh hgwdev cd /cluster/data/rn4/bed/multiz9way.2006-03-30/anno/maf sed -e 's/mafSummary/multiz9waySummary/' ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz9waySummary.sql time nice hgLoadSqlTab rn4 multiz9waySummary /tmp/multiz9waySummary.sql \ multiz9waySummary.tab #0.000u 0.000s 4:04.62 0.0% 0+0k 0+0io 217pf+0w rm *.tab ln -s multiz9way.2006-03-30 /cluster/data/rn4/bed/multiz9way # MULTIZ9WAY DOWNLOADABLES (UNANNOTATED) (DONE 6/9/2006 angie) # Annotated MAF is now documented, so use anno/maf for downloads. ssh hgwdev mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/mafDownloads cd /cluster/data/rn4/bed/multiz9way.2006-03-30/mafDownloads # upstream mafs cat > mafFrags.csh << 'EOF' date foreach i (1000 2000 5000) echo "making upstream$i.maf" nice featureBits rn4 refGene:upstream:$i -fa=/dev/null -bed=up.bad awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed rm up.bad nice mafFrags rn4 multiz9way up.bed upstream$i.maf \ -orgs=../species.lst rm up.bed end date 'EOF' # << emacs time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log #old hgwdev: #75.280u 480.470s 17:07.43 54.0% 0+0k 0+0io 4640pf+0w #new hgwdev: #85.814u 171.368s 7:37.96 56.1% 0+0k 0+0io 0pf+0w ssh kkstore04 cd /cluster/data/rn4/bed/multiz9way.2006-03-30/mafDownloads cat > downloads.csh << 'EOF' date foreach f (../anno/maf/chr*.maf) set c = $f:t:r echo $c nice gzip -c $f > $c.maf.gz end md5sum *.gz > md5sum.txt date 'EOF' # << emacs time csh -efx downloads.csh >&! 
downloads.log #1897.874u 25.317s 32:58.19 97.2% 0+0k 0+0io 2pf+0w nice gzip up*.maf nice md5sum up*.maf.gz >> md5sum.txt ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/rn4/multiz9way mkdir $dir ln -s /cluster/data/rn4/bed/multiz9way.2006-03-30/mafDownloads/{*.gz,md5sum.txt} $dir cp /usr/local/apache/htdocs/goldenPath/mm7/multiz17way/README.txt $dir # edit README.txt # MULTIZ9WAY MAF FRAMES (DONE 4/5/2006 angie) # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd) ssh hgwdev mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/frames cd /cluster/data/rn4/bed/multiz9way.2006-03-30/frames # The following is adapted from MarkD's Makefile used for mm7... #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using mrna table as genes: canFam2 bosTau2 danRer3 mkdir genes foreach queryDb (canFam2 bosTau2 danRer3) set tmpExt = `mktemp temp.XXXXXX` set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt} set tmpMrna = ${queryDb}.mrna.${tmpExt} set tmpCds = ${queryDb}.cds.${tmpExt} echo $queryDb hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ ${queryDb} > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \ stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end # using knownGene for rn4 mm8 hg18 # using refGene for galGal2 # using mgcGenes for xenTro1 # no genes for monDom4 # genePreds; (must keep only the first 10 columns for knownGene) foreach queryDb (rn4 mm8 hg18 galGal2 xenTro1) if ($queryDb == "xenTro1") then set geneTbl = 
mgcGenes else if ($queryDb == "galGal2") then set geneTbl = refGene else set geneTbl = knownGene endif hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end #------------------------------------------------------------------------ # create frames set clusterDir = /cluster/bluearc/rn4/multiz9wayFrames set multizDir = /cluster/data/rn4/bed/multiz9way.2006-03-30 set mafDir = $multizDir/mafDownloads set geneDir = $multizDir/frames/genes set clusterMafDir = ${clusterDir}/maf set clusterGeneDir = ${clusterDir}/genes set clusterFramesDir = ${clusterDir}/mafFrames.kki # copy mafs to cluster storage mkdir $clusterDir ssh -x kkstore04 "rsync -av $mafDir/*.maf.gz $clusterMafDir/" # copy genes to cluster storage ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/" # run cluster jobs set tmpExt = `mktemp temp.XXXXXX` set paraDir = $multizDir/frames/para.${tmpExt} mkdir mafFrames $paraDir rm -f $paraDir/jobList mkdir ${clusterFramesDir} foreach queryDb (`cat /cluster/data/rn4/bed/multiz9way.2006-03-30/species.lst`) mkdir ${clusterFramesDir}/${queryDb} foreach c (`awk '{print $1;}' /cluster/data/rn4/chrom.sizes`) if (-e ${clusterGeneDir}/${queryDb}.gp.gz) then echo /cluster/bin/scripts/mkMafFrames.pl ${queryDb} rn4 \ ${clusterGeneDir}/${queryDb}.gp.gz ${clusterMafDir}/$c.maf.gz \ ${clusterFramesDir}/${queryDb}/$c.mafFrames \ >> $paraDir/jobList endif end end rm -f $tmpExt ssh -x kki "cd ${paraDir} && para make jobList && para time" #Completed: 360 of 360 jobs #CPU time in finished jobs: 2027s 33.78m 0.56h 0.02d 0.000 y #IO & Wait Time: 3816s 63.60m 1.06h 0.04d 0.000 y #Average job time: 16s 0.27m 0.00h 0.00d #Longest finished job: 124s 2.07m 0.03h 0.00d #Submission to last job: 493s 8.22m 0.14h 0.01d # combine results from cluster foreach queryDb (`cat ../species.lst`) echo $queryDb ssh -x kolossus "cat 
${clusterFramesDir}/${queryDb}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${queryDb}.mafFrames.gz" end #------------------------------------------------------------------------ # load the database hgLoadMafFrames rn4 multiz9wayFrames mafFrames/*.mafFrames.gz #------------------------------------------------------------------------ # clean up rm -rf ${clusterDir} # rebuild frames to get bug fix, using 1-pass maf methodology # (2006-06-09 markd) ssh kkstore04 cd /cluster/data/rn4/bed/multiz9way/frames mv mafFrames/ mafFrames.old nice tcsh # easy way to get process niced (cat ../anno/maf/*.maf | time genePredToMafFrames rn4 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer3 genes/danRer3.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz rn4 genes/rn4.gp.gz xenTro1 genes/xenTro1.gp.gz | gzip >multiz9way.mafFrames.gz)>&log& ssh hgwdev cd /cluster/data/rn4/bed/multiz9way/frames hgLoadMafFrames rn4 multiz9wayFrames multiz9way.mafFrames.gz >&log& # PHASTCONS (DONE 4/2/2006 angie) # Using Kate's process from makeHg17.doc. # This process is distilled from Hiram and Adam's experiments # on mouse (mm7) 17way track. Many parameters are now fixed, without # being experimentally derived, either because the experiments # were lengthy and produced similar results, or because they # weren't runnable given the alignment size. # These parameters are: # --rho # --expected-length # --target-coverage # Also, instead of generating cons and noncons tree models, # we use a single, pre-existing tree model -- Elliot Margulies' model # from the (37-way) ENCODE alignments. 
# # NOTE: reusing cluster-friendly chrom fasta files created earlier #cd /cluster/data/rn4 #foreach f (`cat chrom.lst`) #echo $f #cp $f/*.fa /cluster/bluearc/rn4/chrom #end # Split chromosome MAF's into windows and use to generate # "sufficient statistics" (ss) files for phastCons input # NOTE: as the SAN fs has lotsa space, we're leaving these # big (temp) files unzipped, to save time during phastCons run. # Note also the larger chunk sizes from previous runs -- this # reduces run-time on the split, slows down the actual phastCons # enough so jobs don't crash (jobs are very quick, just a minute # or so), and according to Adam, will produce better results. # The previous small chunks were probably required by # the phyloFit step, which we are no longer using for the # human alignments. ssh pk mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons /cluster/bin/phast/tree_doctor \ --prune-all-but=rn3,mm7,hg17,canFam2,bosTau2,monDom2,galGal2,xenTro1,danRer3 \ --rename="rn3 -> rn4 ; hg17 -> hg18 ; mm7 -> mm8 ; monDom2 -> monDom4" \ /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod \ > elliotsEncodePruned.mod mkdir run.split cd run.split set WINDOWS = /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/ss rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/rn4/bed/multiz9way.2006-03-30/maf set WINDOWS = /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/ss cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \ -M /cluster/bluearc/rn4/chrom/$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 echo "Done" >> $c.done 'EOF' # << emacs chmod +x doSplit.csh rm -f jobList foreach f (../../maf/*.maf) set c = $f:t:r echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList end para make jobList para time #Completed: 45 of 45 jobs #CPU time in finished jobs: 2102s 35.03m 0.58h 0.02d 0.000 y #IO & 
Wait Time: 8185s 136.42m 2.27h 0.09d 0.000 y #Average job time: 229s 3.81m 0.06h 0.00d #Longest finished job: 661s 11.02m 0.18h 0.01d #Submission to last job: 661s 11.02m 0.18h 0.01d # check tree model on 5MB chunk, using params recommended by Adam, # (to verify branch lengths on 2X species) ssh kolossus cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \ --tree "`cat ../tree-commas.nh`" \ /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/ss/chr7/chr7.119999202-129997313.ss \ -o phyloFit.tree # Comment from makeHg17.doc: # # he ok'ed the results -- not necessary for next human run # TODO: maybe run these by Adam... the numbers in the RATE_MAT and # TREE are different between the phyloFit and elliotsEncode models, # not sure if the differences are significant. I will proceed with # elliotsEncode as in makeHg17.doc. # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/run.cons cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/run.cons cat > doPhast.csh << 'EOF' #!/bin/csh -fe set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set tmp = /scratch/tmp/$f mkdir -p $tmp set san = /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons cp -p $san/ss/$c/$f.ss ../elliotsEncodePruned.mod $tmp pushd $tmp > /dev/null /cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncodePruned.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp popd > /dev/null mkdir -p $san/pp/$c $san/bed/$c sleep 1 mv $tmp/$f.pp $san/pp/$c mv $tmp/$f.bed $san/bed/$c rm -fr $tmp 'EOF' # << emacs chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << 'EOF' #LOOP doPhast.csh $(root1) $(file1) 14 .007 .27 #ENDLOOP 'EOF' # << emacs # Create parasol batch 
and run it ssh pk cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/run.cons pushd /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons ls -1S ss/chr*/chr*.ss \ | sed 's/.ss$//' \ > /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/run.cons/in.list popd gensub2 in.list single template jobList para make jobList para time #Completed: 314 of 313 jobs #CPU time in finished jobs: 9623s 160.38m 2.67h 0.11d 0.000 y #IO & Wait Time: 12815s 213.59m 3.56h 0.15d 0.000 y #Average job time: 72s 1.19m 0.02h 0.00d #Longest finished job: 108s 1.80m 0.03h 0.00d #Submission to last job: 148s 2.47m 0.04h 0.00d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons # The sed's and the sort get the file names in chrom,start order # (Hiram tricks -- split into columns on [.-/] with # identifying x,y,z, to allow column sorting and # restoring the filename. Warning: the sort column # will depend on how deep you are in the dir find ./bed -name "chr*.bed" \ | sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \ | sort -k7,7 -k9,9n \ | sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \ | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > phastConsElements9way.bed cp -p phastConsElements9way.bed /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons # Measure coverage. If good, load elements into database and proceed with wiggle. # Try for 5% overall cov, and 70% CDS cov. 
ssh hgwdev cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons featureBits rn4 -enrichment refGene:cds phastConsElements9way.bed # FIRST ITERATION: doPhast (len cov rho) = (14 .008 .28) #refGene:cds 0.499%, phastConsElements9way.bed 5.101%, both 0.359%, cover 71.97%, enrich 14.11x mv phastConsElements9way.bed phastConsElements9way_14_008_28.bed # SECOND ITERATION: doPhast (len cov rho) = (13 .007 .27) #refGene:cds 0.499%, phastConsElements9way.bed 4.866%, both 0.355%, cover 71.07%, enrich 14.60x mv phastConsElements9way.bed phastConsElements9way_13_007_27.bed # THIRD ITERATION: doPhast (len cov rho) = (14 .007 .27) #refGene:cds 0.499%, phastConsElements9way.bed 4.982%, both 0.356%, cover 71.32%, enrich 14.31x mv phastConsElements9way.bed phastConsElements9way_14_007_27.bed # FOURTH ITERATION: doPhast (len cov rho) = (14 .007 .28) #refGene:cds 0.499%, phastConsElements9way.bed 5.073%, both 0.359%, cover 71.96%, enrich 14.19x mv phastConsElements9way.bed phastConsElements9way_14_007_28.bed # FIFTH ITERATION: doPhast (len cov rho) = (14 .006 .28) #refGene:cds 0.499%, phastConsElements9way.bed 5.042%, both 0.359%, cover 71.95%, enrich 14.27x mv phastConsElements9way.bed phastConsElements9way_14_006_28.bed # Wow, the CDS coverage doesn't move much. Go with 14 .007 .27. # When happy: hgLoadBed -strict rn4 phastConsElements9way phastConsElements9way.bed # Create merged posterior probability file and wiggle track data files # pk is currently closer to the san than any other machine ssh kolossus cd /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/ # sort by chromName, chromStart so that items are in numerical order # for wigEncode #next time try Angie's simpler sort, below time find ./pp -name "chr*.pp" | \ sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \ sort -k7,7 -k9,9n | \ sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \ xargs cat | \ nice wigEncode stdin phastCons9way.wig phastCons9way.wib # about 23 minutes for above cp -p phastCons9way.wi? 
/cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons ln -s `pwd`/phastCons9way.wib /gbdb/rn4/multiz9way/phastCons9way.wib hgLoadWiggle -pathPrefix=/gbdb/rn4/multiz9way rn4 \ phastCons9way phastCons9way.wig # ~ 3 minute load # PHASTCONS SCORES DOWNLOADABLES FOR 9WAY (DONE 4/3/2006 angie) ssh kolossus cd /cluster/data/rn4/bed/multiz9way.2006-03-30 mkdir phastConsDownloads cd phastConsDownloads cat > downloads.csh << 'EOF' date cd /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/pp foreach chr (`awk '{print $1}' /cluster/data/rn4/chrom.sizes`) echo $chr cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \ | nice gzip -c \ > /cluster/data/rn4/bed/multiz9way.2006-03-30/phastConsDownloads/$chr.gz end date 'EOF' # << emacs csh -efx downloads.csh >&! downloads.log & tail -f downloads.log # ~20min md5sum *.gz > md5sum.txt ssh hgwdev cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastConsDownloads set dir = /usr/local/apache/htdocs/goldenPath/rn4/phastCons9way mkdir $dir ln -s /cluster/data/rn4/bed/multiz9way.2006-03-30/phastConsDownloads/{*.gz,md5sum.txt} $dir cp /usr/local/apache/htdocs/goldenPath/mm7/phastCons17Scores/README.txt $dir # edit README.txt # Clean up after phastCons run. 
ssh kkstore04 rm /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/*.tab rm -r /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons # MIRNA (DONE 4/10/06 angie) ssh hgwdev mkdir /cluster/data/rn4/bed/miRNA cd /cluster/data/rn4/bed/miRNA wget ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/8.0/database_files/tables.sql foreach t (mirna mirna_species mirna_mature mirna_pre_mature \ mirna_chromosome_build) wget ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/8.0/database_files/$t.dat.gz end nice gunzip *.gz hgsql '' -e 'create database miRNAtmp' hgsql miRNAtmp < tables.sql foreach t (mirna mirna_species mirna_mature mirna_pre_mature \ mirna_chromosome_build) hgsql miRNAtmp -e 'load data local infile "'$t.dat'" into table '$t end set species = `hgsql miRNAtmp -N -e 'select auto_id from mirna_species where name = "Rattus norvegicus"'` hgsql miRNAtmp -e "create table mirna_rat select * from mirna where auto_species = $species" hgsql miRNAtmp -N -e "select count(*) from mirna_rat" #| 234 | # This is a tiny database so this query takes almost no time at all: time nice hgsql miRNAtmp -N -e \ "select mirna_chromosome_build.xsome, \ mirna_chromosome_build.contig_start - 1, \ mirna_chromosome_build.contig_end, \ mirna_rat.mirna_id, 0, \ mirna_chromosome_build.strand, \ (mirna_chromosome_build.contig_start + mirna_mature.mature_from - 2), \ (mirna_chromosome_build.contig_start + mirna_mature.mature_to - 1) \ from mirna_rat, mirna_chromosome_build, mirna_mature, mirna_pre_mature \ where mirna_rat.auto_mirna = mirna_chromosome_build.auto_mirna and \ mirna_rat.auto_mirna = mirna_pre_mature.auto_mirna and \ mirna_pre_mature.auto_mature = mirna_mature.auto_mature" \ > mirna.almostBed #0.000u 0.010s 0:00.03 33.3% 0+0k 0+0io 211pf+0w awk '{$5 = 960;} $6 == "-" {$5 = 480;} {$1 = "chr" $1; print;}' \ mirna.almostBed > miRNA.bed hgLoadBed rn4 miRNA miRNA.bed hgsql miRNAtmp -e 'drop database miRNAtmp' # AFFYRAE230 (DONE 4/11/06 angie) # Rachel already set up consensus sequences in # 
/iscratch/i/affy/RAE230_all.fa - see makeRn3.doc. # Align consensus sequences to genome. ssh kk mkdir /cluster/data/rn4/bed/affyRat.2006-04-11 cd /cluster/data/rn4/bed/affyRat.2006-04-11 ls -1 /iscratch/i/affy/RAE230_all.fa > affy.lst ls -1S /panasas/store/rn4/ctgFa/* > allctg.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/rn4/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 allctg.lst affy.lst template.sub para.spec mkdir psl para make para.spec para time #Completed: 591 of 591 jobs #CPU time in finished jobs: 10704s 178.40m 2.97h 0.12d 0.000 y #IO & Wait Time: 9584s 159.73m 2.66h 0.11d 0.000 y #Average job time: 34s 0.57m 0.01h 0.00d #Longest finished job: 83s 1.38m 0.02h 0.00d #Submission to last job: 270s 4.50m 0.07h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyRAE230.psl pslSort dirs raw.psl tmp psl # only use alignments that cover 30% of sequence and have at least # 95% identity in aligned region. minAli = 0.97 too high. # low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl \ /dev/null liftUp affyRAE230.psl ../../jkStuff/liftAll.lft warn contig.psl # shorten names in psl file sed -e 's/RAE230//' affyRAE230.psl > affyRAE230.psl.bak mv affyRAE230.psl.bak affyRAE230.psl ssh hgwdev cd /cluster/data/rn4/bed/affyRat.2006-04-11 # Rachel already set up /gbdb/hgFixed/affyProbes/RAE230_all.fa - # see makeRn3.doc. 
hgLoadSeq -abbr=RAE230 rn4 /gbdb/hgFixed/affyProbes/RAE230_all.fa # load track into database hgLoadPsl rn4 affyRAE230.psl # Load knownToRAE230: hgMapToGene rn4 affyRAE230 knownGene knownToRAE230 # Clean up rm -r psl tmp err batch.bak contig.psl raw.psl seq.tab # AFFYRGU34A (DONE 4/11/06 angie) # RG-U34A is the chip used for the GNF Atlas 2 expression data for Rat # Rachel already set up consensus sequences in # /iscratch/i/affy/RG-U34A_all.fa -see makeRn3.doc. # Align consensus sequences to genome. ssh kk mkdir /cluster/data/rn4/bed/affyU34A cd /cluster/data/rn4/bed/affyU34A ls -1 /iscratch/i/affy/RG-U34A_all.fa > affy.lst ls -1 /panasas/store/rn4/ctgFa/* > allctg.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/rn4/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 allctg.lst affy.lst template.sub para.spec mkdir psl para make para.spec #Completed: 591 of 591 jobs #CPU time in finished jobs: 4911s 81.85m 1.36h 0.06d 0.000 y #IO & Wait Time: 10019s 166.99m 2.78h 0.12d 0.000 y #Average job time: 25s 0.42m 0.01h 0.00d #Longest finished job: 64s 1.07m 0.02h 0.00d #Submission to last job: 388s 6.47m 0.11h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyRGU34A.psl pslSort dirs raw.psl tmp psl # only use alignments that cover 30% of sequence and have at least # 95% identity in aligned region. minAli = 0.97 too high. # low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl \ /dev/null liftUp affyU34A.psl ../../jkStuff/liftAll.lft warn contig.psl # shorten names in psl file sed -e 's/RG-U34A://' affyU34A.psl > affyU34A.psl.bak mv affyU34A.psl.bak affyU34A.psl ssh hgwdev cd /cluster/data/rn4/bed/affyU34A # Rachel already set up /gbdb/hgFixed/affyProbes/RG-U34A_all.fa - # see makeRn3.doc. 
hgLoadSeq -abbr=RG-U34A: rn4 /gbdb/hgFixed/affyProbes/RG-U34A_all.fa # load track into database hgLoadPsl rn4 affyU34A.psl # create and load knownToU34A hgMapToGene rn4 affyU34A knownGene knownToU34A # Clean up rm -r psl tmp err batch.bak contig.psl raw.psl seq.tab # MERGE GNF ATLAS 2 EXPRESSION DATA AND ALIGNMENT (DONE 4/13/06 angie) ssh hgwdev cd /cluster/data/rn4/bed/affyU34A hgMapMicroarray gnfRatAtlas2.bed hgFixed.gnfRatAtlas2MedianRatio \ affyU34A.psl #Loaded 8022 rows of expression data from hgFixed.gnfRatAtlas2MedianRatio #Mapped 7447, multiply-mapped 216, missed 695, unmapped 575 hgLoadBed rn4 gnfAtlas2 gnfRatAtlas2.bed rm bed.tab # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene rn4 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' hgExpDistance rn4 hgFixed.gnfRatAtlas2MedianRatio \ hgFixed.gnfRatAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnfAtlas2 # HGNEAR PROTEIN BLAST TABLES -- TARGET (DONE 4/12/06 angie) ssh hgwdev mkdir /cluster/data/rn4/bed/hgNearBlastp cd /cluster/data/rn4/bed/hgNearBlastp cat << _EOF_ > config.ra # Latest rat vs. other Gene Sorter orgs: # human, mouse, zebrafish, fly, worm, yeast targetGenesetPrefix known targetDb rn4 queryDbs hg18 mm8 danRer3 dm2 ce2 sacCer1 rn4Fa /cluster/data/rn4/bed/blastp/known.faa hg18Fa /cluster/data/hg18/bed/blastp/known.faa mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa ce2Fa /cluster/data/ce2/bed/blastp/wormPep151.faa sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa buildDir /cluster/data/rn4/bed/hgNearBlastp scratchDir /san/sanvol1/scratch/rn4HgNearBlastp _EOF_ # Do -targetOnly until rn4 has been released -- then it will be OK to # update rnBlastTab in the other species. 
doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log # *** Check these tables in rn4: # *** knownBaseBlastTab hgBlastTab mmBlastTab drBlastTab dmBlastTab ceBlastTab scBlastTab # HGNEAR PROTEIN BLAST TABLES -- QUERIES (DONE 6/26/06 angie) ssh hgwdev cd /cluster/data/rn4/bed/hgNearBlastp # Do -queryOnly after rn4 has been released: doHgNearBlastp.pl -queryOnly config.ra >& do.queries.log & tail -f do.queries.log # *** Check rnBlastTab in these databases: # *** hg18 mm8 danRer3 dm2 ce2 sacCer1 # MORE HGNEAR TABLES (DONE 4/13/06 angie) ssh hgwdev cd /tmp # Create table that maps between known genes and RefSeq hgMapToGene rn4 refGene knownGene knownToRefSeq # Create knownToEnsembl column -- redone 6/13/06 after ensGene update hgMapToGene rn4 ensGene knownGene knownToEnsembl # MGI? # Recomb rate - no doc, but ls -l /cluster/bin/scripts/*ecomb* and see # ftp://ftp.ncbi.nih.gov/genomes/R_norvegicus/Mapping_data/ # SNPS - TBD by Heather's new process. #TODO after SNPS: # Make knownToCdsSnp column. hgMapToGene rn4 snpMap knownGene knownToCdsSnp -all -cds # BLASTZ/CHAIN/NET XENTRO2 (DONE 4/19/06 angie) ssh kkstore04 mkdir /cluster/data/rn4/bed/blastz.xenTro2.2006-04-19 cd /cluster/data/rn4/bed/blastz.xenTro2.2006-04-19 cat << '_EOF_' > DEF # rat vs. 
frog BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 # Mammal to non-mammal, with threshold of 8000 as in mm8-xenTro2 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Rat rn4 SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Frog xenTro2 - single chunk big enough to run two of the # largest scaffolds in one job SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/rn4/bed/blastz.xenTro2.2006-04-19 '_EOF_' # << emacs doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/rn4XenTro2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \ >& do.log & tail -f do.log ln -s blastz.xenTro2.2006-04-19 /cluster/data/rn4/bed/blastz.xenTro2 ######################################################################### # BLASTZ CHICKEN galGal3 (DONE 5/25/06 angie) ssh pk mkdir /cluster/data/rn4/bed/blastz.galGal3.2006-05-24 cd /cluster/data/rn4/bed/blastz.galGal3.2006-05-24 cat << '_EOF_' > DEF # rat vs chicken BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Rat rn4 SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInNonMammal SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom SEQ2_DIR=/san/sanvol1/galGal3/nib SEQ2_LEN=/cluster/data/galGal3/chrom.sizes SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastz.galGal3.2006-05-24 '_EOF_' # << emacs doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/gg3vsrn4 \ -bigClusterHub=pk -smallClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose \ >& do.log & tail -f 
do.log ln -s blastz.galGal3.2006-05-24 /cluster/data/rn4/bed/blastz.galGal3 ######################################################################### ###### ADD gdbPdb ENTRY FOR rn4 INTO THE CENTRAL DB. (DONE, 6/1/06, Fan) mysql -u hgcat -p$HGPSWD -h genome-testdb -A hgcentraltest delete from gdbPdb where genomeDb='rn4'; insert into gdbPdb values('rn4', 'proteins060115'); ########################################################### #BUILD KEGG PATHWAY TABLES. DONE 6/12/06. Fan. ssh hgwdev cd /cluster/store11/kg/kgRn4A mkdir kegg cd kegg ~/src/hg/protein/KGpath.sh kgRn4A rn4 060115 # This script exited early because the perl SOAP module seems to be missing # So go to hgwdevold and manually ran the following: ~/src/hg/protein/getKeggList2.pl rno > keggList.tab hgsql rn4 < ~/kent/src/hg/lib/keggList.sql hgsql -e 'LOAD DATA local INFILE "keggList.tab" into table keggList;' rn4 # go back to hgwdev hgKegg3 rn4 rn4 hgsql rn4 -e "drop table keggMapDesc" hgsql rn4 -e "drop table keggPathway" hgsql rn4 <~/src/hg/lib/keggMapDesc.sql hgsql rn4 <~/src/hg/lib/keggPathway.sql hgsql rn4 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc' hgsql rn4 -e 'load data local infile "keggPathway.tab" into table keggPathway' ######################################################################### # BUILD SOME KNOWNTO... TABLES, DONE 6/12/06 Fan. 
cd /cluster/data/rn4/bed/geneSorter # Create table that maps between known genes and RefSeq hgMapToGene rn4 refGene knownGene knownToRefSeq # Create table that maps between known genes and LocusLink hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" rn4 > refToLl.txt hgMapToGene rn4 refGene knownGene knownToLocusLink -lookup=refToLl.txt # Create table that maps between known genes and Pfam domains hgMapViaSwissProt rn4 knownGene name proteinID Pfam knownToPfam hgsql -e "select count(*) from knownToPfam;" rn4 ######################################################################### # BLASTZ/CHAIN/NET DANRER4 - NO LINSPECREP (DONE, 2006-06-19, hartera) ssh pk mkdir /cluster/data/rn4/bed/blastz.danRer4.2006-06-19 cd /cluster/data/rn4/bed/blastz.danRer4.2006-06-19 # There are no lineage-specific repeats for these two species. All repeats # were not used as this was found to lower coverage with rn4 and danRer3 # so use dynamic masking instead (M parameter for Blastz). cat << '_EOF_' > DEF # rat (rn4) vs zebrafish (danRer4) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=0 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Rat Rn4 SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer4) # large enough chunk to do complete chroms at once SEQ2_DIR=/san/sanvol1/scratch/danRer4/nib SEQ2_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes SEQ2_CHUNK=250000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastz.danRer4.2006-06-19 '_EOF_' # << for emacs chmod +x DEF nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF >& do.log & tail -f do.log # Took 2 hours and 50 minutes to 
run. ln -s blastz.danRer4.2006-06-19 /cluster/data/rn4/bed/blastz.danRer4 # Do swap to create danRer4 vs rn4: ssh pk cd /cluster/data/rn4/bed/blastz.danRer4.2006-06-19 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& swap.log & # Took about 10 minutes to run. rm *.log # Check README.txt is correct for alignment downloads. featureBits rn4 chainDanRer4Link # 67765280 bases of 2571531505 (2.635%) in intersection featureBits rn4 refGene:cds chainDanRer4Link -chrom=chr1 -enrichment # refGene:cds 0.731%, chainDanRer4Link 3.337%, both 0.518%, # cover 70.86%, enrich 21.24x featureBits rn4 refGene:cds chainDanRer3Link -chrom=chr1 -enrichment # refGene:cds 0.731%, chainDanRer3Link 2.699%, both 0.475%, # cover 64.89%, enrich 24.04x featureBits rn4 refGene:cds netDanRer4 -chrom=chr1 -enrichment # refGene:cds 0.731%, netDanRer4 29.999%, both 0.575%, cover 78.65%, # enrich 2.62x featureBits rn4 refGene:cds netDanRer3 -chrom=chr1 -enrichment # refGene:cds 0.731%, netDanRer3 25.934%, both 0.530%, cover 72.52%, # enrich 2.80x ######################################################################### #### REBUILD kgProtMap TABLE (DONE. 6/19/06 Fan) # Existing kgProtMap table only had 33 entries. 
ssh kolossus cd /cluster/store9/rn4/bed/kgRn4A ~/kent/src/hg/protein/kgProtMap2.sh kgRn4A rn4 060115 cd kgProtMap hgLoadPsl -tNameIx rn4 psl.tmp/kgProtMap.psl ######################################################################## #### BUILD SUPERFAMILY RELATED TABLES (DONE - 2006-06-19 - Fan) # Download latest Superfamily data files and build the Superfamily DB # from supfam.mrc-lmb.cam.ac.uk mkdir /cluster/store10/superfamily/060619 ln -s /cluster/store10/superfamily/060619 /cluster/data/superfamily/060619 cd /cluster/data/superfamily/060619 # ftp over the following two files: ass_18-Jun-2006.tab.gz supfam_18-Jun-2006.sql.gz gzip -d *.gz # Load the Superfamily database hgsql rn4 -e "create database superfam060619" nice hgsql superfam060619 < supfam_18-Jun-2006.sql & # This may take about an hour. # Make sure to add an index on id of the des table of superfam060619. hgsql superfam060619 -e "create index id on des(id);" hgsql superfam060619 < ~/src/hg/lib/sfAssign.sql hgsql superfam060619 -e 'load data local infile "ass_18-Jun-2006.tab" into table superfam060619.sfAssign;' # Build or rebuild Superfamily track and create sf tables needed for PB hgsql rn4 < ~/src/hg/lib/sfAssign.sql cd /cluster/data/superfamily/060619 hgsql rn4 -e 'load data local infile "ass_18-Jun-2006.tab" into table rn4.sfAssign;' # If rn4.sfDes already exists, drop it. hgsql superfam060619 -N -e "select * from des" >sfDes.tab hgsql rn4 < ~/src/hg/lib/sfDes.sql hgsql rn4 -e 'load data local infile "sfDes.tab" into table sfDes' # Build ensemblXref3 # Get the ensembl gene/protein cross-reference data from Ensembl BioMart # http://www.ensembl.org/Multi/martview # Follow this sequence through the pages: # Page 1) Select Ensembl39 and Rat. Hit next. # Page 2) Do not select anything. Hit next. # Page 3) Choose the "Feature" box, select Ensembl gene ID, transcript ID, peptide ID, UniProt/TrEMBL ID, UniProt/SWISSPROT ID, and UniProt/SWISSPROT Accession # Page 4) Choose "Text, tab separated". 
choose gzip compression. hit export. # Save as ensXref.tsv.gz ssh hgwdev cd /cluster/data/rn4/bed/sf hgsql rn4 < ~/src/hg/lib/ensemblXref3Temp.sql hgsql rn4 -e \ 'load data local infile "ensXref.tsv" into table ensemblXref3Temp ignore 1 lines' hgsql rn4 -N -e \ 'select gene, "0", transcript, "0", protein, "0", tremblAcc, swissDisplayId, swissAcc from ensemblXref3Temp' \ > ensemblXref3.tab hgsql rn4 -e 'drop table ensemblXref3' hgsql rn4 <~/src/hg/lib/ensemblXref3.sql hgsql rn4 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3' # If rn4.superfamily already exists, drop it. cd /cluster/data/rn4/bed mkdir /cluster/data/rn4/bed/sf.20060619 ln -s sf.20060619 sf cd sf hgSuperfam rn4 superfam060619 > sf.log # It is normal that many proteins do not have corresponding Superfamily entries. # If rn4.sfDescription exists, drop it. hgsql rn4 < ~/src/hg/lib/sfDescription.sql hgsql rn4 -e 'LOAD DATA local INFILE "sfDescription.tab" into table rn4.sfDescription;' # Finally, load the superfamily table. hgLoadBed rn4 superfamily superfamily.tab -tab # Create knownToSuperfamily table # Note hs is changed into ht for this Superfamily release. cat /cluster/data/superfamily/060619/ass_18-Jun-2006.tab \ | hgKnownToSuper rn4 rn stdin # created 25287 rows in knownToSuper # Build tables needed by pbGlobal in proteins060115 cd /cluster/data/superfamily/060619 hgsql proteins060115 -e 'delete from sfAssign' hgsql proteins060115 -e 'delete from sfDes' hgsql proteins060115 -e 'load data local infile "ass_18-Jun-2006.tab" into table sfAssign' hgsql proteins060115 -e 'load data local infile "sfDes.tab" into table sfDes' ######################################################################## # Build rn4 PROTEOME BROWSER TABLES (DONE 6/20/06, Fan) # These are instructions for building tables # needed for the Proteome Browser. # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table # ARE REBUILT. # This build is based on proteins DBs dated 060115. 
# Create the working directory ssh hgwdev mkdir /cluster/store11/kg/kgRn4A/pb-2006-06-19 cd /cluster/data/rn4/bed rm pb ln -s /cluster/store11/kg/kgRn4A/pb-2006-06-19 pb cd pb # Define pep* tables in rn4 DB cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql # First edit out pepPred table definition, then hgsql rn4 < pepAll.sql # Build the pepMwAa table hgsql proteins060115 -N -e \ "select info.acc, molWeight, aaSize from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10116 and accToTaxon.acc = info.acc" > pepMwAa.tab hgsql rn4 -e 'load data local infile "pepMwAa.tab" into table pepMwAa' o Build the pepPi table hgsql proteins060115 -e \ "select info.acc from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10116 and accToTaxon.acc = info.acc" > protAcc.lis hgsql rn4 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis pbCalPi protAcc.lis sp060115 pepPi.tab hgsql rn4 -e 'delete from pepPi' hgsql rn4 -e 'load data local infile "pepPi.tab" into table rn4.pepPi' # Calculate and load pep distributions pbCalDist sp060115 proteins060115 10116 rn4 >pbCalDist.out wc pbCalDist.out hgsql rn4 load data local infile "pepExonCntDist.tab" into table rn4.pepExonCntDist; load data local infile "pepCCntDist.tab" into table rn4.pepCCntDist; load data local infile "pepHydroDist.tab" into table rn4.pepHydroDist; load data local infile "pepMolWtDist.tab" into table rn4.pepMolWtDist; load data local infile "pepResDist.tab" into table rn4.pepResDist; load data local infile "pepIPCntDist.tab" into table rn4.pepIPCntDist; load data local infile "pepPiDist.tab" into table rn4.pepPiDist; quit # Calculate frequency distributions pbCalResStd sp060115 10116 rn4 # Create pbAnomLimit and pbResAvgStd tables hgsql rn4 -e "drop table pbAnomLimit" hgsql rn4 -e "drop table pbResAvgStd" hgsql rn4 < ~/src/hg/lib/pbAnomLimit.sql hgsql rn4 < ~/src/hg/lib/pbResAvgStd.sql hgsql rn4 -e 'load data local infile "pbResAvgStd.tab" into table rn4.pbResAvgStd;' 
hgsql rn4 -e 'load data local infile "pbAnomLimit.tab" into table rn4.pbAnomLimit;' # Create pbStamp table for PB hgsql rn4 -e "drop table pbStamp" hgsql rn4 < ~/src/hg/lib/pbStamp.sql hgsql rn3 -N -e 'select * from pbStamp' > pbStamp.tab hgsql rn4 -e 'delete from pbStamp' hgsql rn4 -e 'load data local infile "pbStamp.tab" into table rn4.pbStamp' # Turn on Proteome Browser for rn4. hgsql -e 'update dbDb set hgPbOk=1 where name="rn4"' \ -h genome-testdb hgcentraltest # Adjust drawing parameters for Proteome Browser stamps Now invoke Proteome Browser and adjust various drawing parameters (mostly the ymax of each stamp) if necessary, by updating the pbStamp.tab file and then delete and reload the pbStamp table. hgsql rn4 -e "drop table pbStamp" hgsql rn4 < ~/src/hg/lib/pbStamp.sql hgsql rn4 -e 'load data local infile "pbStamp.tab" into table rn4.pbStamp' # Perform preliminary review of Proteome Browser for rn4, then notify QA for formal review. ################################################################################ # BUILD KNOWN GENE LIST FOR GOOGLE. 
(DONE, 6/6/06, Fan) # make knownGeneLists.html rn4GeneList.html mm5GeneList.html rm3GeneList.html cd /cluster/data/rn4/bed rm -rf knownGeneList/rn4 # Run hgKnownGeneList to generate the tree of HTML pages # under ./knownGeneList/rn4 hgKnownGeneList rn4 # copy over to /usr/local/apache/htdocs rm -rf /usr/local/apache/htdocs/knownGeneList/rn4 mkdir -p /usr/local/apache/htdocs/knownGeneList/rn4 cp -Rfp knownGeneList/rn4/* /usr/local/apache/htdocs/knownGeneList/rn4 ################################################################################ ########################################################################## # N-SCAN gene predictions (nscanGene) - (2006-09-11 markd) cd /cluster/data/rn4/bed/nscan/ # obtained NSCAN predictions from michael brent's group # at WUSTL wget -nv -r -l 1 -np --accept=gtf http://ardor.wustl.edu/predictions/rat/rn4/chr_gtf wget -nv -r -l 1 -np --accept=fa http://ardor.wustl.edu/predictions/rat/rn4/chr_ptx # clean up downloaded directorys and compress files. mv ardor.wustl.edu/predictions/rat/rn4/* . rm -rf ardor.wustl.edu gzip chr_*/* chmod a-w chr_*/*.gz # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. ldHgGene -bin -gtf -genePredExt rn4 nscanGene chr_gtf/chr*.gtf.gz # load protein, add .a suffix to match transcript id hgPepPred -suffix=.a rn4 generic nscanPep chr_ptx/chr*.fa.gz rm *.tab # update trackDb; need a rn4-specific page to describe informants rat/rn4/nscanGene.html (copy from mm8 and edit) rat/rn4/trackDb.ra ########################################################################### # dbSNP BUILD 125 (Heather, September 2006) # We didn't have ctgPos. # dbSNP build 125 uses chrom coords (phys_pos_from and phys_pos) in ContigLoc # but not for chrUn. chrUn has contig coords only, and without ctgPos # for supercontigs, I can't translate the chrUn contig coords into chrom coords. 
# Also note: rn4 has no chrY, and there were no SNPs for chrM. # Not having ctgPos also meant I couldn't process the new locTypes # (4,5,6: rangeInsertion, rangeSubstitution, rangeDeletion) # because those are also only given in contig coords # These were a small number. # Set up directory structure ssh kkstore02 cd /cluster/data/dbSNP/125/organisms/rat_10116 mkdir data mkdir schema mkdir rs_fasta # Get data from NCBI (anonymous FTP) cd /cluster/data/dbSNP/125/organisms/rat_10116/data ftp ftp.ncbi.nih.gov cd snp/organisms/rat_10116/database/organism_data # ContigLoc table has coords, orientation, loc_type, and refNCBI allele get b125_SNPContigLoc_3_1.bcp.gz # ContigLocusId has function get b125_SNPContigLocusId_3_1.bcp.gz get b125_ContigInfo_3_1.bcp.gz # MapInfo has alignment weights get b125_SNPMapInfo_3_1.bcp.gz # SNP has univar_id, validation status and heterozygosity get SNP.bcp.gz # Get schema from NCBI cd /cluster/data/dbSNP/125/organisms/rat_10116/schema ftp ftp.ncbi.nih.gov cd snp/organisms/rat_10116/database/organism_schema get rat_10116_table.sql.gz # Get fasta files from NCBI # using headers of fasta files for molType cd /cluster/data/dbSNP/125/organisms/rat_10116/rs_fasta ftp ftp.ncbi.nih.gov cd snp/organisms/rat_10116/rs_fasta prompt mget *.gz # add rs_fasta to seq/extFile # 2 edits first: strip header to just rsId, and remove duplicates # work on /cluster/store12 (kkstore05) which has more disk space cp rs_ch*.fas.gz /cluster/store12/snp/125/rat/rs_fasta ssh kkstore05 cd /cluster/store12/snp/125/rat/rs_fasta # concat into rsAll.fas cat << '_EOF_' > concat.csh #!/bin/csh -ef rm -f rsAll.fas foreach file (rs_ch*.fas) echo $file zcat $file >> rsAll.fas end '_EOF_' # << emacs #snpCleanSeq strips the header and skips duplicates /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa rm rsAll.fas # load on hgwdev ssh hgwdev mkdir /gbdb/rn4/snp ln -s /cluster/store12/snp/125/rat/rs_fasta/snp.fa /gbdb/rn4/snp/snp.fa cd 
/cluster/store12/snp/125/rat/rs_fasta hgLoadSeq rn4 /gbdb/rn4/snp/snp.fa # look up id in extFile hgsql rn4 < snpSeq.sql hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 420007' rn4 hgsql -e 'delete from seq where extFile = 420007' rn4 hgsql -e 'alter table snpSeq add index acc (acc)' rn4 # clean up after hgLoadSeq rm seq.tab # Simplify names of data files cd /cluster/data/dbSNP/125/organisms/rat_10116/data mv b125_ContigInfo_3_1.bcp.gz ContigInfo.gz mv b125_SNPContigLoc_3_1.bcp.gz ContigLoc.gz mv b125_SNPContigLocusId_3_1.bcp.gz ContigLocusId.gz mv b125_SNPMapInfo_3_1.bcp.gz MapInfo.gz mv SNP.bcp.gz SNP.gz ls -1 *.gz > filelist # edit table descriptions cd /cluster/data/dbSNP/125/organisms/rat_10116/schema # get CREATE statements from rat_10116_table.sql for our 5 tables # store in table.tmp # convert and rename tables sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp rm table.tmp sed -f 'tableRename.sed' table2.tmp > table.sql rm table2.tmp # get header lines from rs_fasta cd /cluster/data/dbSNP/125/organisms/rat_10116/rs_fasta /bin/csh gnl.csh # load on kkr5u00 ssh kkr5u00 hgsql -e mysql 'create database rn4snp125' cd /cluster/data/dbSNP/125/organisms/rat_10116/schema hgsql rn4snp125 < table.sql cd ../data /bin/csh load.csh # note rowcount # ContigLoc 94945 # SNP 41658 # MapInfo 41658 # ContigLocusId 32954 # Get UniVariation from 126 (recently downloaded for mm8snp126) cd /cluster/data/dbSNP/126/shared hgsql rn4snp125 < UniVariation.sql zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' rn4snp125 # create working /scratch dir cd /scratch/snp/125 mkdir rat cd rat # no rn4.ctgPos table to compare against # get gnl files cp /cluster/data/dbSNP/125/organisms/rat_10116/rs_fasta/*.gnl . 
# examine ContigInfo for group_term and edit pipeline.csh # use "ref_assembly" # filter ContigLoc into ContigLocFilter # this lifts from contig coords to chrom coords # phys_pos_from is used to check coords for non-random chroms # errors reported to stdout # this gets rid of alternate assemblies (using ContigInfo) # this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo) # assumes all contigs are positively oriented; will abort if not true mysql> desc ContigLocFilter; # +---------------+-------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +---------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromName | varchar(32) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | start | int(11) | NO | | | | # | end | int(11) | YES | | NULL | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # +---------------+-------------+------+-----+---------+-------+ /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter125 rn4snp125 ref_assembly RGSC_v3.4.0 # note rowcount # ContigLocFilter 45735 # how many are positive strand? hopefully 90% # we get less than half here! 
mysql> select count(*) from ContigLocFilter where orientation = 0; # 23353 # note count by loc_type mysql> select count(*), loc_type from ContigLocFilter group by loc_type; # +----------+----------+ # | count(*) | loc_type | # +----------+----------+ # | 57 | 1 | # | 45482 | 2 | # | 196 | 3 | # +----------+----------+ # filter ContigLocusId into ContigLocusIdFilter # need to filter on contig_acc because ctg_id = 0 for all rows in ContigLocusId /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter125 rn4snp125 ref_assembly # note rowcount # 32954 (same as ContigLocusId) # condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions) # assumes SNPs are in numerical order; will errAbort if not true /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense rn4snp125 # note rowcount; expect about 50% (ascertainment bias for SNPs within genes) # actually we get about 80% here # ContigLocusIdCondense 23754 # could delete ContigLocusIdFilter table here # create chrN_snpFasta tables from *.gnl files # we are just using molType, but also storing class and observed # need chromInfo for this /cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta rn4snp125 # (could start using pipeline.csh here) # (pipeline.csh takes about 5 minutes to run) # split ContigLocFilter by chrom # create the first chrN_snpTmp # we will reuse this table name, adding/changing columns as we go # at this point chrN_snpTmp will have the same description as ContigLocFilter # this opens a file handle for every chrom, so will not scale to scaffold-based assemblies /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom rn4snp125 ref_assembly # check coords using loc_type # possible errors logged to snpLocType.error: # Unknown locType # Between with allele != '-' # Exact with end != start + 1 # Range with end < start # possible exceptions logged to snpLocType.exceptions: # RefAlleleWrongSize # This run no errors, no exceptions # morph 
chrN_snpTmp mysql> desc chr1_snpTmp; # +---------------+-------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +---------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # +---------------+-------------+------+-----+---------+-------+ /cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype125 rn4snp125 ref_assembly # expand allele as necessary # report syntax errors to snpExpandAllele.errors # possible exceptions logged to snpExpandAllele.exceptions: # RefAlleleWrongSize # This run just 1 error # 0 alleles expanded /cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele rn4snp125 ref_assembly # the next few steps prepare for working in UCSC space # sort by position /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort rn4snp125 ref_assembly # get nib files ssh hgwdev cd /cluster/data/rn4 cp rn4.2bit /cluster/home/heather/transfer/snp/rn4 ssh kkr5u00 cd /scratch/snp/125/rat cp /cluster/home/heather/transfer/snp/rn4/rn4.2bit . 
# edit path in chromInfo, load into rn4snp125 with edited path
# 0 class > 8 in UniVariation # 0 univar_id = 0 in SNP # 0 no row in SNP for snp_id in chrN_snpTmp # dbSNP has class = 'in-del' # we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved rn4snp125 # morph chrN_snpTmp # +--------------------+---------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +--------------------+---------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | class | varchar(255) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # | refUCSC | blob | YES | | NULL | | # | refUCSCReverseComp | blob | YES | | NULL | | # | observed | blob | YES | | NULL | | # +--------------------+---------------+------+-----+---------+-------+ # generate exceptions for class and observed # SingleClassBetweenLocType # SingleClassRangeLocType # NamedClassWrongLocType # ObservedWrongFormat # ObservedWrongSize # ObservedMismatch # RangeSubstitutionLocTypeExactMatch # SingleClassTriAllelic # SingleClassQuadAllelic # This will also detect IUPAC symbols in allele /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved rn4snp125 # add function /cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction rn4snp125 # add validation status and heterozygosity # log error if validation status > 31 or missing # no errors this run /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP rn4snp125 # add molType # errors detected: missing or duplicate molType # this run no errors /cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype rn4snp125 # generate chrN_snp125 and snp125Exceptions tables cp snpCheckAlleles.exceptions snpCheckAlleles.tab cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab cp snpExpandAllele.exceptions snpExpandAllele.tab cp 
snpLocType.exceptions snpLocType.tab /cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable rn4snp125 125 # concat into snp125.tab # cat chr*_snp125.tab >> snp125.tab /bin/sh concat.sh # check for multiple alignments /cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple rn4snp125 125 mysql> load data local infile 'snpMultiple.tab' into table snp125Exceptions; # load on hgwdev cp snp125.tab /cluster/home/heather/transfer/snp/rn4 hgsql rn4snp125 -e 'select * from snp125Exceptions' > /cluster/home/heather/transfer/snp/rn4/snp125Exceptions.tab ssh hgwdev cd transfer/snp/rn4 mysql> load data local infile 'snp125.tab' into table snp125; mysql> load data local infile 'snp125Exceptions.tab' into table snp125Exceptions; # create indexes mysql> alter table snp125 add index name (name); mysql> alter table snp125 add index chrom (chrom, bin); mysql> alter table snp125Exceptions add index name(name); # create snp125ExceptionDesc table cd /cluster/data/dbSNP hgsql rn4 < snp125ExceptionDesc.sql # add counts to exception.human.125, can start with exception.template hgsql -e 'select count(*), exception from snp125Exceptions group by exception' canfam1 mysql> load data local infile 'exception.rn4.snp125' into table snp125ExceptionDesc; mysql> select count(*), exception from snp125Exceptions group by exception; +----------+---------------------------+ | count(*) | exception | +----------+---------------------------+ | 15252 | MultipleAlignments | | 517 | ObservedMismatch | | 1 | ObservedWrongFormat | | 1 | ObservedWrongSize | | 1830 | RefAlleleNotRevComp | | 1 | RefAlleleWrongSize | | 194 | SingleClassBetweenLocType | | 1 | SingleClassQuadAllelic | | 48 | SingleClassRangeLocType | | 92 | SingleClassTriAllelic | +----------+---------------------------+ ########################################################################## # xxBlastTab - Help filter out unwanted paralogs (Galt 2007-01-11) # # We are starting with xxBlastTab tables already built in the usual way with # 
blastall/blastp, probably with doHgNearBlastp.pl script. # # we want to update rn4 for human and mouse, # so check ./hgGeneData/Rat/rn4/otherOrgs.ra for current settings ssh hgwdev synBlastp.csh rn4 hg18 #rn4.hgBlastTab: #new number of unique query values: #7379 #new number of unique target values #6373 #old number of unique query values: #7977 #old number of unique target values #6657 synBlastp.csh rn4 mm8 #rn4.mmBlastTab: #new number of unique query values: #7397 #new number of unique target values #6502 #old number of unique query values: #7988 #old number of unique target values #6788 ########################################################################## # GenBank gbMiscDiff table (markd 2007-01-10) # Supports `NCBI Clone Validation' section of mgcGenes details page # genbank release 157.0 now contains misc_diff fields for MGC clones # reloading mRNAs results in gbMiscDiff table being created. ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna rn4 ########################################################################### # REVERSE LIFTOVER CHAINS RN4 -> RN3 (DONE 2007-02-09 kate) # copy rn3 2bit to cluster-friendly dir ssh kkstore06 cp -p /cluster/data/rn3/rn3.2bit /san/sanvol1/scratch/rn3 # create the directory and scripts ssh kkstore04 cd /cluster/data/rn4 doSameSpeciesLiftOver.pl rn4 rn3 -debug -bigClusterHub pk # go to created dir and run for real cd /cluster/data/rn4/bed/blat.rn3.2007-02-09 doSameSpeciesLiftOver.pl rn4 rn3 -bigClusterHub pk >& do.log & tail -f do.log # failed for unknown region during chainNet of chr4. Error message # indicated the file /cluster/data/rn3/chrom.sizes couldn't be found. # I extracted the remaining steps from the doNet.csh file and ran those. doNet.csh >&! 
doNet.log doSameSpeciesLiftOver.pl rn4 rn3 -continue load -bigClusterHub pk >& doLoad.log ########################################################################### # QUALITY SCORES (DONE 2/22/07 angie) ssh kolossus nice tcsh mkdir /cluster/data/rn4/bed/qual cd /cluster/data/rn4/bed/qual zcat ../../downloads/*.qual.gz \ | sed -re 's/^>gnl\|Rnor3.[0-9_]+\|/>/' \ | qaToQac stdin stdout \ | qacToWig -fixed stdin stdout \ | wigEncode stdin qual.{wig,wib} #Made 1 .wig files in stdout #Converted stdin, upper limit 99.00, lower limit 0.00 ssh hgwdev cd /cluster/data/rn4/bed/qual rm -f /gbdb/rn4/wib/qual.wib ln -s /cluster/data/rn4/bed/qual/qual.wib /gbdb/rn4/wib/ hgLoadWiggle -pathPrefix=/gbdb/rn4/wib rn4 quality qual.wig ########################################################################## # BLASTZ/CHAIN/NET HORSE AND RAT (DONE 2/18/07 Fan) ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.rn4.2007-02-18 cd /cluster/data/equCab1/bed/blastz.rn4.2007-02-18 cat << '_EOF_' > DEF # Horse vs. 
Rat BLASTZ_M=50 # TARGET: Horse equCab1 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=500 SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Mouse rn4 SEQ2_DIR=/scratch/hg/rn4/nib SEQ2_LEN=/cluster/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.rn4.2007-02-18 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/equCab1Rn4 >& do.log & tail -f do.log ssh hgwdev cd /cluster/data/equCab1/bed ln -s blastz.rn4.2007-02-18 /cluster/data/equCab1/bed/blastz.rn4 nice featureBits equCab1 -chrom=chr1 chainRn4Link # 67408374 bases of 177498097 (37.977%) in intersection bash time nice -n 19 featureBits equCab1 chainRn4Link \ > fb.equCab1.chainRn4Link.txt 2>&1 # 859116859 bases of 2421923695 (35.472%) in intersection ssh kkstore05 mkdir /cluster/data/rn4/bed/blastz.equCab1.swap cd /cluster/data/rn4/bed/blastz.equCab1.swap bash time doBlastzChainNet.pl \ /cluster/data/equCab1/bed/blastz.rn4.2007-02-18/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -verbose=2 -swap -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/equCab1Rn4 > swap.log 2>&1 & ssh hgwdev cd /cluster/data/rn4/bed/blastz.equCab1.swap bash time nice -n 19 featureBits rn4 chainEquCab1Link \ > fb.rn4.chainEquCab1Link.txt 2>&1 # 867029914 bases of 2571531505 (33.716%) in intersection ######################################################################### # BLASTZ/CHAIN/NET BOSTAU3 (Done March 2007 heather) mkdir /cluster/data/rn4/bed/blastz.bosTau3.2007-03-25 ln -s /cluster/data/rn4/bed/blastz.bosTau3.2007-03-25 /cluster/data/rn4/bed/blastz.bosTau3 cd /cluster/data/rn4/bed/blastz.bosTau3 cat << '_EOF_' > DEF BLASTZ_M=50 # TARGET: Rat rn4 SEQ1_DIR=/scratch/hg/rn4/nib SEQ1_LEN=/scratch/hg/rn4/chrom.sizes SEQ1_CHUNK=10000000 
SEQ1_LAP=10000 # QUERY: Cow bosTau3 SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit SEQ2_LEN=/san/sanvol1/scratch/bosTau3/chrom.sizes SEQ2_LIMIT=500 SEQ2_CHUNK=50000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastz.bosTau3.2007-03-25 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/bosTau3/blastz.rn4 >& do.log & tail -f do.log nice featureBits -chrom=chr1 rn4 chainBosTau3Link # 61426292 bases of 243058842 (25.272%) in intersection ######################################################################### # BLASTZ Mouse mm9 swap (DONE - 2007-09-11 - Hiram) # the original ssh kkstore06 cd /cluster/data/mm9/bed/blastz.rn4 cat fb.mm9.chainRn4Link.txt # 1713186474 bases of 2620346127 (65.380%) in intersection # and the swap mkdir /cluster/data/rn4/bed/blastz.mm9.swap cd /cluster/data/rn4/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzRn4.2007-08-31/DEF \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \ -swap -syntenicNet > swap.out 2>&1 & ######################################################################### # RGD QTL (DONE 9/24/07 angie -- RELOADED 9/26/07) ssh kkstore04 mkdir /cluster/data/rn4/bed/rgdQtl cd /cluster/data/rn4/bed/rgdQtl wget ftp://rgd.mcw.edu:21/pub/data_release/QTLS # Make bed4 and rgdQtlLink: perl -we 'open(BED, ">rgdQtl.bed") || die; \ open(LINK, ">rgdQtlLink.txt") || die; \ while (<>) { \ chomp; my @w = split("\t"); \ next unless ($w[1] =~ /^rat/ && $w[15]); \ $w[5] =~ s/^/chr/; \ $w[15] =~ s/^([-\d]+).*$/$1/ || die "parse start pos"; \ $w[16] =~ s/^(\d+).*$/$1/ || die "parse end pos"; \ if ($w[15] > $w[16]) { \ $tmp = $w[15]; $w[15] = $w[16]; $w[16] = $tmp; \ } \ $w[15]--; \ $w[15] = 0 if ($w[15] < 0); \ print BED "$w[5]\t$w[15]\t$w[16]\t$w[2]\n"; \ print LINK "$w[0]\t$w[2]\t$w[3]\n"; \ } \ close(BED); close(LINK);' \ QTLS ssh hgwdev cd 
/cluster/data/rn4/bed/rgdQtl # 9/26/07: RGD said that the duplicate Gluco13 was supposed to be # Gluco13 + Gluco31. Edited rgdQtl.bed to change Gluco13 -> 31 for # the chr15 copy, and rgdQtlLink.txt for id 631516. hgLoadBed rn4 rgdQtl rgdQtl.bed hgLoadSqlTab rn4 rgdQtlLink ~/kent/src/hg/lib/rgdQtlLink.sql rgdQtlLink.txt # Fix coords > chromSize: perl -wpe 's/^(\w+)\t(\d+)$/ \ delete from rgdQtl where chrom="$1" and chromStart >= $2; \ update rgdQtl set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \ ../../chrom.sizes \ | hgsql rn4 checkTableCoords -verbose=2 rn4 rgdQtl runJoiner.csh rn4 rgdQtl ############################################################################# # CONTRAST GENES (2007-10-02 markd) # recieved predictions from Sam Gross cd /cluster/data/rn4/bed/contrastGene/ wget http://www.stanford.edu/~ssgross/contrast.rn4.bed # this is a custom track, not a pure BED tail +2 contrast.rn4.bed | hgLoadBed -tab rn4 contrastGene stdin # verify # load track db (ra and contrastGene.html are global # request push of contrastGene ########################################################################### # loading affy mouse Exon probes and transcripts (DONE - 2007-10-04 - Hiram) # data was supplied from Venu Valmeekam Venu_Valmeekam@affymetrix.com # dropped via FTP to genome-test ssh hgwdev mkdir /cluster/data/rn4/bed/affyRaEx1 cd /cluster/data/rn4/bed/affyRaEx1 # the files received: # -rw-r--r-- 1 10151819 Oct 3 10:53 transcript_cluster_rn.bed.gz # -rw-r--r-- 1 42240033 Oct 4 13:32 probe_rn_score.bed.gz # loading: hgLoadBed -tmpDir=/scratch/tmp rn4 affyRaEx1Probe probe_rn_score.bed.gz # Loaded 4025442 elements of size 6 hgLoadBed -tmpDir=/scratch/tmp rn4 affyRaEx1Transcript \ transcript_cluster_rn.bed.gz # Loaded 360668 elements of size 12 # working on description pages for these with Venu. 
# needed for ucscGenes10 building
"http://genome.imim.es/genepredictions/R.norvegicus/rnNov2004/SGP/humangp200603/${C}.gtf" \ -O "gtf/${C}.gtf" done ssh hgwdev cd /cluster/data/rn4/bed/sgp ldHgGene -gtf -genePredExt rn4 sgpGene gtf/chr*.gtf # Read 36569 transcripts in 286299 lines in 45 files # 36569 groups 44 seqs 1 sources 3 feature types # 36569 gene predictions featureBits rn4 sgpGene # 36219236 bases of 2571531505 (1.408%) in intersection featureBits rn4 -enrichment refGene:CDS sgpGene # refGene:CDS 0.770%, sgpGene 1.408%, both 0.672%, cover 87.35%, enrich 62.02x ##################################################################### # LOAD GENEID GENES (DONE - 2007-12-20 - Hiram) ssh kkstore04 mkdir -p /cluster/data/rn4/bed/geneid cd /cluster/data/rn4/bed/geneid mkdir gtf prot for C in `awk '{print $1}' /cluster/data/rn4/chrom.sizes` do echo $C wget --timestamping \ "http://genome.imim.es/genepredictions/R.norvegicus/rnNov2004/geneid_v1.2/${C}.gtf" \ -O "gtf/${C}.gtf" wget --timestamping \ "http://genome.imim.es/genepredictions/R.norvegicus/rnNov2004/geneid_v1.2/${C}.prot" \ -O "prot/${C}.prot" done # Add missing .1 to protein id's cd prot foreach f (*.prot) perl -wpe 's/^(>chr\S+)$/$1.1/' $f > $f:r-fixed.prot end ssh hgwdev cd /cluster/data/rn4/bed/geneid ldHgGene -genePredExt -gtf rn4 geneid gtf/*.gtf # Read 38143 transcripts in 287158 lines in 45 files # 38143 groups 45 seqs 1 sources 3 feature types # 38143 gene predictions hgPepPred rn4 generic geneidPep prot/chr*-fixed.prot featureBits rn4 geneid # 40255245 bases of 2571531505 (1.565%) in intersection featureBits rn4 -enrichment refGene:CDS geneid # refGene:CDS 0.770%, geneid 1.565%, both 0.615%, cover 79.92%, enrich 51.05x ########################################################################## # AGILENT CGH PROBES (Done 2008-05-13, Andy) # (see hg18.txt) ############################################################################ ############################################################################ # TRANSMAP 
vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ ############################################################################# # RAT TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08) # (to build the affyExonTissues track, see the steps outlined in hg18.txt) ############################################################################# ######################################################################## ## AFFY ALL EXON PROBESETS (HG18/MM9/RN4) (DONE 2009-01-29, Andy) ## (instructions are in hg18.txt) ######################################################################## ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: rn4.upstreamGeneTbl = refGene rn4.upstreamMaf = multiz9way /hive/data/genomes/rn4/bed/multiz9way/species.lst ############################################################################# # TEST LASTZ Mouse Mm9 (DONE - 2008-11-26,12-01 - Hiram) # to compare like with like for Rn5 lastz screen # use screen to manage this job mkdir /hive/data/genomes/rn4/bed/blastzMm9.2008-11-26 cd /hive/data/genomes/rn4/bed/blastzMm9.2008-11-26 cat << '_EOF_' > DEF # 
rat vs mouse # Specially tuned blastz parameters from Webb Miller BLASTZ=lastz BLASTZ_M=254 BLASTZ_ABRIDGE_REPEATS=0 BLASTZ_O=600 BLASTZ_E=55 BLASTZ_Y=15000 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/hive/data/genomes/rn4/bed/lastzMm9.2008-11-26/mouse_rat.q # TARGET: Rat Rn4 SEQ1_DIR=/scratch/data/rn4/nib SEQ1_LEN=/scratch/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Mouse Mm9 SEQ2_DIR=/scratch/data/mm9/nib SEQ2_LEN=/scratch/data/mm9/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/rn4/bed/lastzMm9.2008-11-26 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=medium \ -stop=net `pwd`/DEF > do.log 2>&1 & # real 313m17.777s time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=medium \ -continue=cat -stop=net `pwd`/DEF >cat.log 2>&1 # create a loadUp.csh file so it can be modified to change # the table names to not clash with existing Mm9 tables time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=medium \ -debug -continue=load -stop=load `pwd`/DEF >load.log 2>&1 # real 79m48.305s cd /hive/data/genomes/rn4/bed/lastzMm9.2008-11-26/axtChain time ./load.csh # real 23m39.747s cat fb.rn4.chainMm9LastzLink.txt # 1710794258 bases of 2571531505 (66.528%) in intersection mkdir /hive/data/genomes/mm9/bed/blastz.rn4.swap cd /hive/data/genomes/mm9/bed/blastz.rn4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn4/bed/lastzMm9.2008-11-26/DEF \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=medium \ -swap -stop=net > swap.log 2>&1 & # real 120m41.998s time nice -n +19 doBlastzChainNet.pl -verbose=2 \ 
/hive/data/genomes/rn4/bed/lastzMm9.2008-11-26/DEF \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=medium \ -debug -swap -continue=load -stop=load > load.log 2>&1 & # real 27m24.819s cat fb.mm9.chainRn4LastzLink.txt # 1713323126 bases of 2620346127 (65.385%) in intersection ############################################################################# # LIFTOVER TO Rn5 (DONE - 2008-12-14 - Hiram ) mkdir /hive/data/genomes/rn4/bed/blat.rn5.2008-12-14 cd /hive/data/genomes/rn4/bed/blat.rn5.2008-12-14 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug rn4 rn5 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl rn4 rn5 > do.log 2>&1 ############################################################################# ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # ADD LINK TO GENENETWORK (DONE. 11/06/09 Fan). # Received geneNetwork ID list file, GN_rat_RefSeq.txt, for rn4 from GeneNetwork, Zhou Xiaodong [xiaodong.zhou@gmail.com]. 
# interrupted by storm-induced power failures; finished the lastz batch, # then continuing:
2>&1 & # real 77m18.645s cat fb.bosTau4.chainRn4Link.txt # 664253901 bases of 2731830700 (24.315%) in intersection ######################################################################### # ailMel1 Panda alignment (DONE - 2010-02-04 - Hiram) mkdir /hive/data/genomes/rn4/bed/lastzAilMel1.2010-02-04 cd /hive/data/genomes/rn4/bed/lastzAilMel1.2010-02-04 cat << '_EOF_' > DEF # Rat vs. Panda # parameters from the Panda paper supplemental where they describe # their lastz parameters BLASTZ_K=2200 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_H=2000 BLASTZ_C=2 BLASTZ_T=2 # our usual M BLASTZ_M=50 # TARGET: Rat Rn4 SEQ1_DIR=/scratch/data/rn4/nib SEQ1_LEN=/scratch/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Panda SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/rn4/bed/lastzAilMel1.2010-02-04 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 501m27.760s cat fb.rn4.chainAilMel1Link.txt # 708197812 bases of 2571531505 (27.540%) in intersection mkdir /hive/data/genomes/ailMel1/bed/blastz.rn4.swap cd /hive/data/genomes/ailMel1/bed/blastz.rn4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn4/bed/lastzAilMel1.2010-02-04/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 73m37.751s cat fb.ailMel1.chainRn4Link.txt # 695366144 bases of 2245312831 (30.970%) in intersection ############################################################################ # tRNAs track (2010-03-12, Fan RE-BUILT) # ssh hgwdev cd /hive/data/genomes/rn4/bed mkdir tRNAs cd tRNAs # Get data files from /projects/lowelab/users/lowe/Browser/vertebrates/ 
# First, remove gene records that contain exons of different strands.
# create two temp tables rgdGeneRaw2 and rgdGeneRaw3 cut -f 1-8 RGDgenes_all.gff3 >j.temp8 paste j.temp8 rgdId.tab >rgdGeneRaw2.tab hgsql rn4 -e 'drop table rgdGeneRaw2' hgsql rn4 -e 'drop table rgdGeneRaw3' hgsql rn4 < ~/kent/src/hg/lib/rgdGeneRaw2.sql hgsql rn4 < ~/kent/src/hg/lib/rgdGeneRaw3.sql hgsql rn4 -e 'load data local infile "rgdGeneRaw2.tab" into table rgdGeneRaw2' hgsql rn4 -e 'load data local infile "rgdGeneRaw2.tab" into table rgdGeneRaw3' # create the one line script del1 containing the following line: hgsql rn4 -N -e "delete from rgdGeneRaw2 where rgdId='${2}' and feature='exon' and seqname='${1}' and strand = '${3}'" chmod +x del1 hgsql rn4 -N -e \ "select 'del1', x.seqname, x.rgdId, x.strand from rgdGeneRaw2 x, rgdGeneRaw3 y where x.rgdId=y.rgdId and x.seqname=y.seqname and x.feature='exon' and y.feature='gene' and x.strand!=y.strand;" |\ sort -u > delAll chmod +x delAll ./delAll hgsql rn4 -N -e 'select "chr" from rgdGeneRaw2 where feature != "CDS"' >j.chr hgsql rn4 -N -e 'select * from rgdGeneRaw2 where feature != "CDS"' >j.1 paste j.chr j.1 |sed -e 's/chr\t/chr/'>j.2 # remove one problematic gene cat j.2|grep -v "RGD:3683" >rgdGeneTemp.gff ldHgGene -gtf rn4 rgdGeneTemp rgdGeneTemp.gff hgsql rn4 -N -e 'select x.*, y.start-1, y.end from rgdGeneTemp x, rgdGeneRaw2 y where x.name=y.rgdId and y.feature="CDS"' >j.0 cut -f 2-6 j.0 >j.1 cut -f 12-13 j.0 >j.2 cut -f 9-11 j.0 >j.3 #paste j.1 j.2 j.3 >rgdGene2.tab paste j.1 j.2 j.3 >j.10 cut -f 1-3 j.10 >j.1-3 cut -f 9 j.10 |sed -e 's/,/\t/'|cut -f 1 >j.start cut -f 5-10 j.10 >j.5-10 paste j.1-3 j.start j.5-10 >rgdGene2.tab hgsql rn4 -e 'drop table rgdGene2' hgsql rn4 < ~/kent/src/hg/lib/rgdGene2.sql hgsql rn4 -e 'load data local infile "rgdGene2.tab" into table rgdGene2' hgsql rn4 -e 'drop table rgdGeneTemp' rm j.* ####### temp fix to make txEnd the same as last exonStart getLastExonEnd rn4 |sed -e 's/,//' >j.out cut -f 1 j.out >j.1 cut -f 1 rgdGene2.tab >j.0 diff j.0 j.1 cut -f 2 j.out >j.last cut -f 1-4 
rgdGene2.tab > j.1-4 cut -f 6-10 rgdGene2.tab > j.6-10 paste j.1-4 j.last j.6-10 >rgdGene2.tab.new hgsql rn4 -e 'drop table rgdGene2' hgsql rn4 < ~/kent/src/hg/lib/rgdGene2.sql hgsql rn4 -e 'load data local infile "rgdGene2.tab.new" into table rgdGene2' rm j.* ############################################################################# # UPDATE KEGG TABLES (DONE, Fan, 6/18/10) mkdir -p /hive/data/genomes/rn4/bed/pathways/kegg cd /hive/data/genomes/rn4/bed/pathways/kegg wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab cat map_title.tab | sed -e 's/\t/\trno\t/' > j.tmp cut -f 2 j.tmp >j.rno cut -f 1,3 j.tmp >j.1 paste j.rno j.1 |sed -e 's/\t//' > keggMapDesc.tab rm j.rno j.1 rm j.tmp hgsql rn4 -e 'drop table keggMapDesc' hgsql rn4 < ~/kent/src/hg/lib/keggMapDesc.sql hgsql rn4 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc' wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/rno/rno_pathway.list cat rno_pathway.list| sed -e 's/path://'|sed -e 's/:/\t/' > j.tmp hgsql rn4 -e 'drop table keggPathway' hgsql rn4 < ~/kent/src/hg/lib/keggPathway.sql hgsql rn4 -e 'load data local infile "j.tmp" into table keggPathway' hgsql rn4 -N -e \ 'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \ >keggPathway.tab hgsql rn4 -e 'delete from keggPathway' hgsql rn4 -e 'load data local infile "keggPathway.tab" into table keggPathway' rm j.tmp ############################################################################# # Add KEGG column to rn4 Gene Sorter (Done, Fan, 6/18/2010) mkdir -p /hive/data/genomes/rn4/bed/geneSorter cd /hive/data/genomes/rn4/bed/geneSorter hgsql rn4 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' |sort -u|sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab hgsql rn4 -e 'drop table knownToKeggEntrez' hgsql rn4 < ~/kent/src/hg/lib/knownToKeggEntrez.sql hgsql rn4 -e 'load data local infile "knownToKeggEntrez.tab" into table knownToKeggEntrez' 
############################################################################# # RE-BUILD HGNEAR PROTEIN BLAST TABLES -- TARGET (DONE 8/7/10, Fan) ssh hgwdev mkdir /hive/data/genomes/rn4/bed/hgNearBlastp/100807 cd /hive/data/genomes/rn4/bed/hgNearBlastp/100807 cat << _EOF_ > config.ra # Latest rat vs. other Gene Sorter orgs: # human, mouse, zebrafish, fly, worm, yeast targetGenesetPrefix known targetDb rn4 queryDbs hg19 mm9 danRer6 dm3 ce6 sacCer2 rn4Fa /hive/data/genomes/rn4/bed/blastp/known.faa hg19Fa /hive/data/genomes/hg19/bed/ucsc.12/ucscGenes.faa mm9Fa /hive/data/genomes/mm9/bed/ucsc.12/ucscGenes.faa danRer6Fa /hive/data/genomes/danRer6/bed/blastp/danRer6.ensPep.faa dm3Fa /hive/data/genomes/dm3/bed/flybase5.3/flyBasePep.fa ce6Fa /hive/data/genomes/ce6/bed/hgNearBlastp/080711/ce6.sangerPep.faa sacCer2Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/sgdPep.faa buildDir /hive/data/genomes/rn4/bed/hgNearBlastp/100807 scratchDir /hive/data/genomes/rn4/bed/hgNearBlastp/100807/tmp _EOF_ doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log # *** All done! 
mkdir -p /hive/data/outside/rgd/rat/093010
# create two temp tables rgdGeneRaw2 and rgdGeneRaw3 hgsql rn4 -N -e 'select * from rgdGeneRaw0 where feature = "gene" or feature ="CDS" or feature = "exon"' >j.tmp cut -f 1 j.tmp|sed -e 's/Chr//' >j.1 cut -f 2-8 j.tmp >j.2 paste j.1 j.2 >j.temp8 cut -f 9 j.tmp >j.9 cat j.9 |sed -e 's/RGD:/\tRGD:/'|cut -f 2 >j.1 cat j.1 |sed -e 's/,/\t/'|sed -e 's/;/\t/'|cut -f 1 >j.temp9 paste j.temp8 j.temp9 >rgdGeneRaw2.tab hgsql rn4 -e 'drop table rgdGeneRaw2' hgsql rn4 -e 'drop table rgdGeneRaw3' hgsql rn4 < ~/kent/src/hg/lib/rgdGeneRaw2.sql hgsql rn4 < ~/kent/src/hg/lib/rgdGeneRaw3.sql hgsql rn4 -e 'load data local infile "rgdGeneRaw2.tab" into table rgdGeneRaw2' hgsql rn4 -e 'load data local infile "rgdGeneRaw2.tab" into table rgdGeneRaw3' hgsql rn4 -N -e 'select * from rgdGeneRaw2 where feature = "gene"' > rgdGeneRaw2gene.tab hgsql rn4 -N -e 'select * from rgdGeneRaw2 where feature = "exon"' > rgdGeneRaw2exon.tab hgsql rn4 -N -e 'select * from rgdGeneRaw2 where feature = "CDS"' > rgdGeneRaw2cds.tab hgsql rn4 -e 'drop table rgdGeneRaw2gene' hgsql rn4 -e 'drop table rgdGeneRaw2exon' hgsql rn4 -e 'drop table rgdGeneRaw2cds' hgsql rn4 < ~/src/hg/lib/rgdGeneRaw2gene.sql hgsql rn4 < ~/src/hg/lib/rgdGeneRaw2exon.sql hgsql rn4 < ~/src/hg/lib/rgdGeneRaw2cds.sql hgsql rn4 -e 'load data local infile "rgdGeneRaw2gene.tab" into table rgdGeneRaw2gene' hgsql rn4 -e 'load data local infile "rgdGeneRaw2exon.tab" into table rgdGeneRaw2exon' hgsql rn4 -e 'load data local infile "rgdGeneRaw2cds.tab" into table rgdGeneRaw2cds' hgsql rn4 -N -e \ 'select g.rgdId, g.strand, e.strand from rgdGeneRaw2gene g, rgdGeneRaw2exon e where e.rgdId=g.rgdId and g.strand <> e.strand'\ |cut -f 1|sort -u|sed -e 's/RGD/del1 RGD/' >delAll chmod +x delAll # create the one line script del1 containing the following line: hgsql rn4 -N -e "delete from rgdGeneRaw2 where rgdId='${1}'" chmod +x del1 ./delAll hgsql rn4 -N -e 'select "chr" from rgdGeneRaw2 where feature != "CDS"' >j.chr hgsql rn4 -N -e 'select * from 
# remove two problematic genes
cd /hive/data/genomes/rn4/bed/rgdGene/093010 cp -P /hive/data/outside/rgd/rat/093010/GENES_RAT . hgsql rn4 -e 'drop table genes_rat' hgsql rn4 < ~/kent/src/hg/lib/genes_rat.sql hgsql rn4 -e \ 'load data local infile "GENES_RAT" into table genes_rat ignore 1 lines' doRgdGene2UniProt rn4 j.tmp cat j.tmp |sort -u > rgdGene2ToUniProt.tab hgsql rn4 -e 'drop table rgdGene2ToUniProt' hgsql rn4 < ~/kent/src/hg/lib/rgdGene2ToUniProt.sql hgsql rn4 -e "load data local infile 'rgdGene2ToUniProt.tab' into table rgdGene2ToUniProt" hgsql rn4 -N -e \ 'select name, u.val from rgdGene2ToUniProt, uniProt.protein u where acc=value' > rgdGene2Pep.tab hgsql rn4 -e 'drop table rgdGene2Pep' hgsql rn4 < ~/kent/src/hg/lib/rgdGene2Pep.sql hgsql rn4 -e 'load data local infile "rgdGene2Pep.tab" into table rgdGene2Pep' hgMapToGene rn4 gnfAtlas2 rgdGene2 rgdGene2ToGnfAtlas2 '-type=bed 12' hgExpDistance rn4 hgFixed.gnfRatAtlas2MedianRatio \ hgFixed.gnfRatAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnfAtlas2 hgMapToGene rn4 affyRAE230 rgdGene2 rgdGene2ToRAE230 hgMapToGene rn4 affyU34A rgdGene2 rgdGene2ToU34A # looking for this doRgdGene2Col program and files # doRgdGene2Col rn4 entrez_gene rgdGene2ToLl.txt # hgMapToGene rn4 rgdGene2 rgdGene2 rgdGene2ToLocusLink -lookup=rgdGene2ToLl.txt ############################################################################# # continue rgdGenes2 (DONE - Hiram - 2012-01-26) # change genes_rat table name to be rgdGene2Raw # fixup source code in src/hg/lib/ from genes_rat to be rgdGene2Raw # and in hg/oneShot/doRgdGene2UniProt/doRgdGene2UniProt.c # hg/hgGene/synonym.c # hg/hgGene/rgdInfo.c # hg/makeDb/schema/all.joiner cd /hive/data/genomes/rn4/bed/rgdGene/093010 hgsql rn4 < ~/kent/src/hg/lib/rgdGene2Raw.sql hgsql rn4 -e \ 'load data local infile "GENES_RAT" into table rgdGene2Raw ignore 1 lines' doRgdGene2UniProt rn4 tmp.rgdGene2ToUniProt cat j.tmp |sort -u > rgdGene2ToUniProt.tab hgsql rn4 -N -e \ 'select "RGD:", gene_rgd_id, "gene_desc", gene_desc from 
# are their proteins unique? there are 8 dups:
uniProt: hgsql -N -e "select acc from protein;" uniProt \ | sort -u > protein.uniProt.txt cut -f2 rgd.gp2protein.txt | sort -u > rgd.protein.names.txt wc -l protein.uniProt.txt rgd.protein.names.txt # 17449633 protein.uniProt.txt # 27063 rgd.protein.names.txt comm -12 protein.uniProt.txt rgd.protein.names.txt | wc -l # 19652 # how about the proteins we want to use: cut -f1 rgdGene2ToUniProt.tab | sort -u | while read G do grep -w "${G}" rgd.gp2protein.txt | cut -f2 done | sort -u > to.select.rgd.protein.names.txt # we can get 17973 out of their new list comm -12 protein.uniProt.txt to.select.rgd.protein.names.txt | wc -l # 17973 comm -12 protein.uniProt.txt to.select.rgd.protein.names.txt | while read P do grep -w "${P}" rgd.gp2protein.txt done | sort > corrected.rgdGene2ToUniProt.tab hgsql -N -e "select name from rgdGene2;" rn4 | sort -u > name.rgdGene2.txt # interesting, only 14,339 names in common ? join name.rgdGene2.txt corrected.rgdGene2ToUniProt.tab | wc -l # 14339 # there is already a great difference between rgdGene2ToUniProt and # rgdGene2: hgsql -N -e "select name from rgdGene2ToUniProt;" rn4 \ | sort -u > name.rgdGene2ToUniProt.txt wc -l name.rgdGene2.txt name.rgdGene2ToUniProt.txt # 17487 name.rgdGene2.txt # 18356 name.rgdGene2ToUniProt.txt comm -12 name.rgdGene2.txt name.rgdGene2ToUniProt.txt | wc # 14509 14509 166686 # load up this intersected list: join name.rgdGene2.txt corrected.rgdGene2ToUniProt.tab \ | tr '[ ]' '[\t]' > toLoad.rgdGene2ToUniProt.tab hgsql rn4 -e "delete from rgdGene2ToUniProt" hgsql rn4 -e 'load data local infile "toLoad.rgdGene2ToUniProt.tab" into table rgdGene2ToUniProt' ### XXX ### the following below was a different path to try this elimination ### check this file history to see that business ############################################################################# # cleaning hgNear related tables for rgdGenes2 (DONE - Hiram - 2012-03-02) mkdir /hive/data/genomes/rn4/bed/rgdGene/093010/cleaningTables cd 
/hive/data/genomes/rn4/bed/rgdGene/093010/cleaningTables hgsql -N -e "select name from rgdGene2;" rn4 | sort > name.rgdGene2.rn4 hgsql -N -e "select * from affyExonTissuesGsMedianDistRgdGene2;" rn4 \ | sort > affyExonTissuesGsMedianDistRgdGene2.tab join name.rgdGene2.rn4 affyExonTissuesGsMedianDistRgdGene2.tab | sort -k2 \ > col1.clean.tab join name.rgdGene2.rn4 -2 2 col1.clean.tab -o "2.1,2.2,2.3" | sort \ > col2.clean.tab awk '{printf "%s\n%s\n", $1,$2}' col2.clean.tab | sort -u > col1.col2.list wc -l col1.col2.list name.rgdGene2.rn4 # 17406 col1.col2.list # 17487 name.rgdGene2.rn4 # 34893 total comm -12 col1.col2.list name.rgdGene2.rn4 | wc # 17406 17406 201414 hgsqldump --create-options --no-data --tab=. rn4 affyExonTissuesGsMedianDistRgdGene2 hgsql -e "drop table affyExonTissuesGsMedianDistRgdGene2;" rn4 cat col2.clean.tab | tr '[ ]' '[\t]' > toLoad.tab hgsql rn4 < affyExonTissuesGsMedianDistRgdGene2.sql hgsql -e 'load data local infile "toLoad.tab" into table affyExonTissuesGsMedianDistRgdGene2;' rn4 hgsql -N -e "select * from gnfAtlas2RgdGene2Distance;" rn4 | sort > gnfAtlas2RgdGene2Distance.tab join name.rgdGene2.rn4 gnfAtlas2RgdGene2Distance.tab | sort -k2 \ > col1.gnfAtlas2RgdGene2Distance.tab join name.rgdGene2.rn4 -2 2 col1.gnfAtlas2RgdGene2Distance.tab -o "2.1,2.2,2.3" | sort \ > col2.gnfAtlas2RgdGene2Distance.tab awk '{printf "%s\n%s\n", $1,$2}' col2.gnfAtlas2RgdGene2Distance.tab | sort -u > col2.gnfAtlas2RgdGene2Distance.list wc -l col2.gnfAtlas2RgdGene2Distance.list name.rgdGene2.rn4 # 17406 col1.col2.list # 17487 name.rgdGene2.rn4 # 34893 total comm -12 col2.gnfAtlas2RgdGene2Distance.list name.rgdGene2.rn4 | wc # 17406 17406 201414 hgsql -e "drop table gnfAtlas2RgdGene2Distance;" rn4 cat col2.gnfAtlas2RgdGene2Distance.tab | tr '[ ]' '[\t]' \ > toLoad.gnfAtlas2RgdGene2Distance.tab hgsqldump --create-options --no-data --tab=. 
rn4 gnfAtlas2RgdGene2Distance hgsql rn4 < gnfAtlas2RgdGene2Distance.sql hgsql -e 'load data local infile "toLoad.gnfAtlas2RgdGene2Distance.tab" into table gnfAtlas2RgdGene2Distance;' rn4 hgsql -N -e "select * from rgdGene2BlastTab;" rn4 \ | sort > rgdGene2BlastTab.tab join name.rgdGene2.rn4 rgdGene2BlastTab.tab | sort -k2 \ > col1.rgdGene2BlastTab.tab join -2 2 name.rgdGene2.rn4 col1.rgdGene2BlastTab.tab \ -o "2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12" \ | tr '[ ]' '[\t]' | sort > col2.rgdGene2BlastTab.tab awk '{printf "%s\n%s\n", $1,$2}' col2.rgdGene2BlastTab.tab | sort -u \ > col.rgdGene2BlastTab.list wc -l col.rgdGene2BlastTab.list name.rgdGene2.rn4 comm -12 col.rgdGene2BlastTab.list name.rgdGene2.rn4 | wc hgsqldump --create-options --no-data --tab=. rn4 rgdGene2BlastTab hgsql rn4 < rgdGene2BlastTab.sql hgsql -e 'load data local infile "col2.rgdGene2BlastTab.tab" into table rgdGene2BlastTab;' rn4 hgsql -N -e "select * from rgdGene2ToPfam;" rn4 | sort > rgdGene2ToPfam.tab join name.rgdGene2.rn4 rgdGene2ToPfam.tab | sort | tr '[ ]' '[\t]' \ > clean.rgdGene2ToPfam.tab hgsqldump --create-options --no-data --tab=. rn4 rgdGene2ToPfam hgsql rn4 < rgdGene2ToPfam.sql hgsql -e 'load data local infile "clean.rgdGene2ToPfam.tab" into table rgdGene2ToPfam;' rn4 hgsql -N -e "select * from rgdGene2ToSymbol;" rn4 | sort > rgdGene2ToSymbol.tab join name.rgdGene2.rn4 rgdGene2ToSymbol.tab | sort | tr '[ ]' '[\t]' \ > clean.rgdGene2ToSymbol.tab hgsqldump --create-options --no-data --tab=. 
rn4 rgdGene2ToSymbol hgsql rn4 < rgdGene2ToSymbol.sql hgsql -e 'load data local infile "clean.rgdGene2ToSymbol.tab" into table rgdGene2ToSymbol;' rn4 # verified that rgdGene2ToUniProt was already OK, then to reload # both rgdGene2Isoforms and rgdGene2Canonical: hgClusterGenes rn4 rgdGene2 rgdGene2Isoforms rgdGene2Canonical \ -protAccQuery='select name,value from rgdGene2ToUniProt' hgMapToGene rn4 ensGene rgdGene2 rgdGene2ToEnsembl hgsql rn4 -N -e \ 'select gene_rgd_id,gene_desc from rgdGene2Raw where gene_desc <>""'\ | sed -e 's/^/RGD:/' | sort -u > raw.rgdGene2ToDescription.tab join name.rgdGene2.rn4 raw.rgdGene2ToDescription.tab | sed -e 's/ /\t/' \ > rgdGene2ToDescription.tab hgsql -e "delete from rgdGene2ToDescription;" rn4 hgsql rn4 -e 'load data local infile "rgdGene2ToDescription.tab" into table rgdGene2ToDescription' hgsql -N -e "select * from rgdGene2Xref;" rn4 | sort > rgdGene2Xref.tab.0 # sed the first two blanks to tabs join name.rgdGene2.rn4 rgdGene2Xref.tab.0 | sed -e 's/ /\t/' \ | sed -e 's/ /\t/' > rgdGene2Xref.tab hgsql -e "delete from rgdGene2Xref;" rn4 hgsql rn4 -e 'load data local infile "rgdGene2Xref.tab" into table rgdGene2Xref' hgsql -N -e "select * from rgdGene2KeggPathway;" rn4 | sort -u \ > rgdGene2KeggPathway.tab.0 join name.rgdGene2.rn4 rgdGene2KeggPathway.tab.0 | sed -e 's/ /\t/g' \ > rgdGene2KeggPathway.tab hgsql -e "delete from rgdGene2KeggPathway;" rn4 hgsql rn4 -e 'load data local infile "rgdGene2KeggPathway.tab" into table rgdGene2KeggPathway' hgsql -N -e "select * from rgdGene2ToKeggEntrez;" rn4 | sort -u \ > rgdGene2ToKeggEntrez.tab.0 hgsql rn4 -N -e \ 'select rgdId, mapID, mapID, "+", locusID from rgdGene2KeggPathway' \ | sort -u | sed -e 's/\t+\t/+/' > rgdGene2ToKeggEntrez.tab hgsql rn4 -e 'delete from rgdGene2ToKeggEntrez' hgsql rn4 -e 'load data local infile "rgdGene2ToKeggEntrez.tab" into table rgdGene2ToKeggEntrez' hgsql -N -e "select * from rgdGene2ToGenbankAll;" rn4 | sort \ > rgdGene2ToGenbankAll.tab.0 join 
name.rgdGene2.rn4 rgdGene2ToGenbankAll.tab.0 | sed -e 's/ /\t/g' \ > rgdGene2ToGenbankAll.tab hgsql rn4 -e 'delete from rgdGene2ToGenbankAll' hgsql rn4 -e 'load data local infile "rgdGene2ToGenbankAll.tab" into table rgdGene2ToGenbankAll' # this small script takes a two column input, the second column may # be a comma separated list. It will break up the second column and # print out each with the first column. cat << '_EOF_' > comma.pl #!/usr/bin/env perl use strict; use warnings; while (my $line = <>) { chomp $line; my ($rgdId, $other) = split('\s+', $line); if ($other =~ m/,/) { my @a = split(',', $other); for (my $i = 0; $i < scalar(@a); ++$i) { printf "%s\t%s\n", $rgdId, $a[$i]; } } else { printf "%s\t%s\n", $rgdId, $other; } } '_EOF_' # << happy emacs # this select is the equivalent of Fan's command: # doRgdGene2Col rn4 entrez_gene rgdGene2ToLl.txt hgsql -N -e 'select gene_rgd_id,entrez_gene from rgdGene2Raw where entrez_gene<>"";' rn4 \ | sed -e "s/^/RGD:/" | ./comma.pl | sort -u > rgdGene2ToLl.txt hgsql -N -e "select * from rgdGene2ToLocusLink;" rn4 \ | sort > rgdGene2ToLocusLink.tab.0 hgMapToGene rn4 rgdGene2 rgdGene2 rgdGene2ToLocusLink \ -lookup=rgdGene2ToLl.txt # this select is the equivalent of Fan's command: # doRgdGene2Col rn4 genbank_nucleotide j hgsql -N -e 'select gene_rgd_id,genbank_nucleotide from rgdGene2Raw where genbank_nucleotide<>"";' rn4 \ | sed -e "s/^/RGD:/" | grep "NM_" | ./comma.pl | grep "NM_" \ | sort -u > rgdGene2ToRefseq.txt hgMapToGene rn4 rgdGene2 rgdGene2 rgdGene2ToRefSeq \ -lookup=rgdGene2ToRefseq.txt # build rgdGene2ToGenbank table # this select is the equivalent of Fan's command: # doRgdGene2Col rn4 genbank_nucleotide j.tmp hgsql -N -e 'select gene_rgd_id,genbank_nucleotide from rgdGene2Raw where genbank_nucleotide<>"";' rn4 \ | sed -e "s/^/RGD:/" | ./comma.pl | grep -v "_" \ | sort -u > rgdGene2ToGenbank.txt hgMapToGene rn4 rgdGene2 rgdGene2 rgdGene2ToGenbank \ -lookup=rgdGene2ToGenbank.txt hgsql rn4 -N -e\ 'select name, val 
from rgdGene2ToUniProt, uniProt.displayId where acc=value' \ | sort > rgdGene2ToDisplayId.tab hgsql rn4 -e 'delete from rgdGene2ToDisplayId' hgsql rn4 -e 'load data local infile "rgdGene2ToDisplayId.tab" into table rgdGene2ToDisplayId' hgsql rn4 -N -e\ 'select name, extAC from rgdGene2ToUniProt, proteome.spXref2 where value=accession and extDB="PDB"' | sort > rgdGene2ToPDB.tab hgsql rn4 -e 'delete from rgdGene2ToPDB' hgsql rn4 -e 'load data local infile "rgdGene2ToPDB.tab" into table rgdGene2ToPDB' hgsql -N -e "select * from rgdGene2ToSuper;" rn4 \ | sort > rgdGene2ToSuper.tab.0 join name.rgdGene2.rn4 rgdGene2ToSuper.tab.0 | tr '[ ]' '[\t]' \ > rgdGene2ToSuper.tab hgsql rn4 -e 'delete from rgdGene2ToSuper' hgsql rn4 -e 'load data local infile "rgdGene2ToSuper.tab" into table rgdGene2ToSuper' ############################################################################# # RE-BUILD HGNEAR PROTEIN BLAST TABLES for RGD GENES -- TARGET (DONE 10/18/10, Fan) ssh hgwdev mkdir /hive/data/genomes/rn4/bed/blastpRgdGene2 cd /hive/data/genomes/rn4/bed/blastpRgdGene2 hgsql rn4 -N -e 'select * from rgdGene2Pep' |\ sed -e 's/\t/\n/'|sed -e 's/RGD\:/\>RGD\:/' > rgdGene2Pep.faa /hive/data/outside/blast229/formatdb -i rgdGene2Pep.faa -t rgdGene2Pep -n rgdGene2Pep mkdir /hive/data/genomes/rn4/bed/hgNearBlastp/101018rgdGene2 cd /hive/data/genomes/rn4/bed/hgNearBlastp/101018rgdGene2 cat << _EOF_ > config.ra # Latest rat vs. 
other Gene Sorter orgs: # human, mouse, zebrafish, fly, worm, yeast targetGenesetPrefix rgdGene2 targetDb rn4 queryDbs hg19 mm9 danRer6 dm3 ce6 sacCer2 rn4Fa /hive/data/genomes/rn4/bed/blastpRgdGene2/rgdGene2Pep.faa hg19Fa /hive/data/genomes/hg19/bed/ucsc.12/ucscGenes.faa mm9Fa /hive/data/genomes/mm9/bed/ucsc.12/ucscGenes.faa danRer6Fa /hive/data/genomes/danRer6/bed/blastp/danRer6.ensPep.faa dm3Fa /hive/data/genomes/dm3/bed/flybase5.3/flyBasePep.fa ce6Fa /hive/data/genomes/ce6/bed/hgNearBlastp/080711/ce6.sangerPep.faa sacCer2Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/sgdPep.faa buildDir /hive/data/genomes/rn4/bed/hgNearBlastp/101018rgdGene2 scratchDir /hive/data/genomes/rn4/bed/hgNearBlastp/101018rgdGene2/tmp _EOF_ doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log # *** All done! # *** Check these tables in rn4: # *** rgdGene2BlastTab hgBlastTab mmBlastTab drBlastTab dmBlastTab ceBlastTab scBlastTab ################################### # Build tables for hgNear with RGD Genes as the main gene set for rn4 mkdir /hive/data/genomes/rn4/bed/rgdGene/093010/near cd /hive/data/genomes/rn4/bed/rgdGene/093010/near # build Affy related tables hgMapToGene rn4 affyRAE230 rgdGene2 rgdGene2ToRAE230 hgMapToGene rn4 affyU34A rgdGene2 rgdGene2ToU34A # build rgdGene2Canonical and rgdGene2Isoforms tables hgClusterGenes rn4 rgdGene2 rgdGene2Isoforms rgdGene2Canonical \ -protAccQuery='select name,value from rgdGene2ToUniProt' # build rgdGene2ToDisplayId table hgsql rn4 -N -e\ 'select name, val from rgdGene2ToUniProt, uniProt.displayId where acc=value' >rgdGene2ToDisplayId.tab hgsql rn4 -e 'drop table rgdGene2ToDisplayId' hgsql rn4 < ~/kent/src/hg/lib/rgdGene2ToDisplayId.sql hgsql rn4 -e 'load data local infile "rgdGene2ToDisplayId.tab" into table rgdGene2ToDisplayId' # build rgdGene2ToEnsembl table doRgdGene2Col rn4 ensembl_id rgdGene2ToEnsembl.txt hgMapToGene rn4 rgdGene2 rgdGene2 rgdGene2ToEnsembl -lookup=rgdGene2ToEnsembl.txt # transcript ID instead of 
gene ID should be used. Fix it here. (Fan, 9/27/2011) hgsql rn4 -N -e 'select r.name, gtp.transcript from ensGtp gtp, rgdGene2ToEnsembl r where r.value=gtp.gene' >rgdGene2ToEnsTranscript.tab hgsql rn4 -e 'delete from rgdGene2ToEnsembl' hgsql rn4 -e 'load data local infile "rgdGene2ToEnsTranscript.tab" into table rgdGene2ToEnsembl' # build rgdGene2ToLocusLink table doRgdGene2Col rn4 entrez_gene rgdGene2ToLl.txt hgMapToGene rn4 rgdGene2 rgdGene2 rgdGene2ToLocusLink -lookup=rgdGene2ToLl.txt # build rgdGene2ToGenbank table doRgdGene2Col rn4 genbank_nucleotide j.tmp cat j.tmp|grep -v "_" >rgdGene2ToGenbank.txt hgMapToGene rn4 rgdGene2 rgdGene2 rgdGene2ToGenbank -lookup=rgdGene2ToGenbank.txt # build rgdGene2ToGenbankAll table doRgdGene2Col rn4 genbank_nucleotide rgdGene2ToGenbankAll.txt hgMapToGene rn4 rgdGene2 rgdGene2 rgdGene2ToGenbankAll -lookup=rgdGene2ToGenbankAll.txt # build rgdGene2ToPDB table hgsql rn4 -N -e\ 'select name, extAC from rgdGene2ToUniProt, proteome.spXref2 where value=accession and extDB="PDB"' >rgdGene2ToPDB.tab hgsql rn4 -e 'drop table rgdGene2ToPDB' hgsql rn4 < ~/kent/src/hg/lib/rgdGene2ToPDB.sql hgsql rn4 -e 'load data local infile "rgdGene2ToPDB.tab" into table rgdGene2ToPDB' # build rgdGene2ToPfam table hgMapViaSwissProt rn4 rgdGene2ToUniProt name value Pfam rgdGene2ToPfam # build rgdGene2ToRefSeq table doRgdGene2Col rn4 genbank_nucleotide j fgrep "NM_" j > rgdGene2ToRefseq.txt hgMapToGene rn4 rgdGene2 rgdGene2 rgdGene2ToRefSeq -lookup=rgdGene2ToRefseq.txt # build rgdGene2ToCdsSnp table hgMapToGene rn4 snp130 rgdGene2 rgdGene2ToCdsSnp -all -cds # build rgdGene2ToSuper table hgsql rn4 -N -e \ 'select r.name, s.superfamily from rgdGene2ToGenbankAll r, knownToSuper s where s.gene=r.value ' >j1 hgsql rn4 -N -e \ 'select r.name, s.superfamily from rgdGene2ToRefSeq r, knownToSuper s where s.gene=r.value ' >j2 cat j1 j2 |sort -u >rgdGene2ToSuper.tab hgsql rn4 -e 'drop table rgdGene2ToSuper' hgsql rn4 < ~/kent/src/hg/lib/rgdGene2ToSuper.sql hgsql rn4 
-e 'load data local infile "rgdGene2ToSuper.tab" into table rgdGene2ToSuper' # update KEGG pathway related tables mkdir -p /hive/data/genomes/hg18/bed/pathways/kegg/102510 cd /hive/data/genomes/hg18/bed/pathways/kegg/102510 wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab cat map_title.tab | sed -e 's/\t/\thsa\t/' > j.tmp cut -f 2 j.tmp >j.hsa cut -f 1,3 j.tmp >j.1 paste j.hsa j.1 |sed -e 's/\t//' > keggMapDesc.tab rm j.hsa j.1 rm j.tmp hgsql hg18 -e 'drop table keggMapDesc' hgsql hg18 < ~/kent/src/hg/lib/keggMapDesc.sql hgsql hg18 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc' wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/hsa/hsa_pathway.list cat hsa_pathway.list| sed -e 's/path://'|sed -e 's/:/\t/' > j.tmp hgsql hg18 -e 'drop table keggPathway' hgsql hg18 < ~/kent/src/hg/lib/keggPathway.sql hgsql hg18 -e 'load data local infile "j.tmp" into table keggPathway' hgsql hg18 -N -e \ 'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \ >keggPathway.tab hgsql hg18 -e 'delete from keggPathway' hgsql hg18 -e 'load data local infile "keggPathway.tab" into table keggPathway' rm j.tmp # Add KEGG column to rn4 Gene Sorter (Done, Fan, 10/25/2010) mkdir -p /hive/data/genomes/rn4/bed/geneSorter/102510 cd /hive/data/genomes/rn4/bed/geneSorter/102510 hgsql rn4 -N -e 'select rgdId, mapID, mapID, "+", locusID from rgdGene2KeggPathway' |\ sort -u|sed -e 's/\t+\t/+/' > rgdGene2ToKeggEntrez.tab hgsql rn4 -e 'drop table rgdGene2ToKeggEntrez' hgsql rn4 < ~/kent/src/hg/lib/rgdGene2ToKeggEntrez.sql hgsql rn4 -e 'load data local infile "rgdGene2ToKeggEntrez.tab" into table rgdGene2ToKeggEntrez' ############################################################################# # Agilent arrays (2010-12-01 Andy) cd /hive/data/genomes/rn4/bed/agilentCgh/ # FTP download from ftp.agilent.com using given user/pass from Anniek De-witte # (anniek_de-witte@agilent.com) # downloaded files are gzipped 
beds. The files are typically located in a # directory called "FOR_UCSC" or something like that. The user/pass and the # directory are deleted after it's confirmed they're received, so it's not # too helpful to mention specifics here. ftp -u user -p password ftp.agilent.com > cd directory > get 027064_D_BED_20100225.bed.gz > get 027065_D_BED_20100318.bed.gz # unzip everything gunzip 027*.bed.gz ln -s 027065_D_BED_20100318.bed agilentCgh1x1m.bed ln -s 027064_D_BED_20100225.bed agilentCgh4x180k.bed for bed in agilent*.bed; do tail -n +2 $bed | hgLoadBed rn4 ${bed%.bed} stdin done rm bed.tab ######################################################################### # LASTZ Turkey MelGal1 ( DONE - 2011-04-04 - Chin) mkdir /hive/data/genomes/rn4/bed/lastzMelGal1.2011-04-04 cd /hive/data/genomes/rn4/bed/lastzMelGal1.2011-04-04 cat << '_EOF_' > DEF # Turkey vs Rat # TARGET: Rat rn4 SEQ1_DIR=/scratch/data/rn4/nib SEQ1_LEN=/scratch/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Turkey melGal1 - single chunk big enough to run entire chrom SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit SEQ2_LEN=/scratch/data/melGal1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/rn4/bed/lastzMelGal1.2011-04-04 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 # real 71m8.450s cat fb.rn4.chainMelGal1Link.txt # 60383476 bases of 2571531505 (2.348%) in intersection cd /hive/data/genomes/rn4/bed ln -s lastzMelGal1.2011-04-04 lastz.melGal1 # running the swap mkdir /hive/data/genomes/melGal1/bed/blastz.rn4.swap cd /hive/data/genomes/melGal1/bed/blastz.rn4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn4/bed/lastzMelGal1.2011-04-04/DEF \ -swap \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ 
-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 # real 6m49.871s cat fb.melGal1.chainRn4Link.txt # 48636842 bases of 935922386 (5.197%) in intersection cd /hive/data/genomes/melGal1/bed ln -s blastz.rn4.swap lastz.rn4 ######################################################################### # LASTZ Cow bosTau6 (DONE - 2011-05-31 - Chin) ssh hgwdev screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/rn4/bed/blastzBosTau6.2011-05-31 cd /hive/data/genomes/rn4/bed/blastzBosTau6.2011-05-31 cat << '_EOF_' > DEF # Rat vs. Cow BLASTZ_M=50 # TARGET: Rat Rn4 SEQ1_DIR=/scratch/data/rn4/nib SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau6 SEQ2_DIR=/scratch/data/bosTau6/bosTau6.2bit SEQ2_LEN=/cluster/data/bosTau6/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=100 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastzBosTau6.2011-05-31 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 241m0.549s cat fb.rn4.chainBosTau6Link.txt # 660647959 bases of 2571531505 (25.691%) in intersection # Create link cd /hive/data/genomes/rn4/bed ln -s blastzBosTau6.2011-05-31 lastz.bosTau6 # and the swap mkdir /hive/data/genomes/bosTau6/bed/blastz.rn4.swap cd /hive/data/genomes/bosTau6/bed/blastz.rn4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn4/bed/blastzBosTau6.2011-05-31/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 49m43.881 cat fb.bosTau6.chainRn4Link.txt # 648232024 bases of 2649682029 (24.465%) in intersection cd /hive/data/genomes/bosTau6/bed ln 
-s blastz.rn4.swap lastz.rn4 ############################################################################# # lastz Frog xenTro3 (WORKING - 2011-09-22 - Hiram) mkdir /hive/data/genomes/rn4/bed/lastzXenTro3.2011-09-22 cd /hive/data/genomes/rn4/bed/lastzXenTro3.2011-09-22 cat << '_EOF_' > DEF # rat vs. frog # Mammal to non-mammal, with threshold of 8000 as in mm8-xenTro1 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Rat rn4 SEQ1_DIR=/scratch/data/rn4/rn4.2bit SEQ1_LEN=/scratch/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Frog xenTro3 SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/rn4/bed/lastzXenTro3.2011-09-22 '_EOF_' # << emacs screen # manage long running job time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 549m35.173s cat fb.rn4.chainXenTro3Link.txt # 66202042 bases of 2571531505 (2.574%) in intersection cd /hive/data/genomes/rn4/bed ln -s lastzXenTro3.2011-09-22 lastz.xenTro3 mkdir /hive/data/genomes/xenTro3/bed/blastz.rn4.swap cd /hive/data/genomes/xenTro3/bed/blastz.rn4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn4/bed/lastzXenTro3.2011-09-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # Elapsed time: 20m33s cat fb.xenTro3.chainRn4Link.txt # 70846825 bases of 1358334882 (5.216%) in intersection cd /hive/data/genomes/xenTro3/bed ln -s blastz.rn4.swap lastz.rn4 ############################################################################# # hgPal downloads (DONE braney 2011-09-30) # FASTA from 9way for rgdGene2 ssh hgwdev screen bash rm -rf /cluster/data/rn4/bed/multiz9way/pal mkdir 
/cluster/data/rn4/bed/multiz9way/pal cd /cluster/data/rn4/bed/multiz9way/pal for i in `cat ../species.lst`; do echo $i; done > order.lst mz=multiz9way gp=refGene db=rn4 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log mz=multiz9way gp=refGene db=rn4 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz9way gp=refGene db=rn4 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz9way gp=rgdGene2 db=rn4 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log mz=multiz9way 
gp=rgdGene2 db=rn4 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz9way gp=rgdGene2 db=rn4 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz # now for the canonical set cd /cluster/data/rn4/bed/multiz9way/pal mz=multiz9way cb=rgdGene2Canonical gp=rgdGene2 db=rn4 for j in `awk '{print $1}' /cluster/data/rn4/chrom.sizes` do echo "select chrom, chromStart, chromEnd, transcript from $cb where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed done mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -geneBeds=$j.known.bed $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $cb.jobs time sh -x $cb.jobs > $cb.jobs.log 2>&1 & sleep 1 tail -f $cb.jobs.log mz=multiz9way gp=rgdGene2 cb=rgdGene2Canonical db=rn4 zcat exonAA/*.gz | gzip -c > $cb.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $cb.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $cb.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $cb.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz9way gp=rgdGene2 cb=rgdGene2Canonical db=rn4 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd ln 
-s `pwd`/$cb.$mz.exonAA.fa.gz $pd/$cb.exonAA.fa.gz ln -s `pwd`/$cb.$mz.exonNuc.fa.gz $pd/$cb.exonNuc.fa.gz ######################################################################### # LASTZ Cow bosTau7 (DONE - 2012-01-24 - Chin) ssh hgwdev screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/rn4/bed/blastzBosTau7.2012-01-24 cd /hive/data/genomes/rn4/bed/blastzBosTau7.2012-01-24 cat << '_EOF_' > DEF # Rat vs. Cow BLASTZ_M=50 # TARGET: Rat Rn4 SEQ1_DIR=/scratch/data/rn4/nib SEQ1_LEN=/cluster/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau7 SEQ2_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ2_LEN=/cluster/data/bosTau7/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=100 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastzBosTau7.2012-01-24 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 200m57.138 cat fb.rn4.chainBosTau7Link.txt # 656136973 bases of 2571531505 (25.515%) in intersection # Create link ln -s blastzBosTau7.2012-01-24 lastz.bosTau7 # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.rn4.swap cd /hive/data/genomes/bosTau7/bed/blastz.rn4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn4/bed/blastzBosTau7.2012-01-24/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 48m51.145 cat fb.bosTau7.chainRn4Link.txt # 668440773 bases of 2804673174 (23.833%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.rn4.swap lastz.rn4 ######################################################################### # POLYA-SEQ TRACK (from Adnan Derti, Merck) 
(DONE, Andy 2012-02-06) # (see hg19.txt for multi-species build notes) ############################################################################## ##########################################################################pubStart # Publications track (DONE - 04-27-12 - Max) # article download and conversion is run every night on hgwdev: # 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh # the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/ # then converts them to text into /hive/data/outside/literature/{pmc,elsevier} # all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py # data processing was run manually like this export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/ # pmc cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc ssh swarm cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat steps:annot-tables exit pubBlat load # elsevier cd /hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier ssh swarm cd /hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat steps:annot-tables exit pubBlat load #--pubEnd 
############################################################################## # LIFTOVER TO rn5 (DONE - 2012-07-24 - Hiram ) mkdir /hive/data/genomes/rn4/bed/blat.rn5.2012-07-24 cd /hive/data/genomes/rn4/bed/blat.rn5.2012-07-24 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug \ -ooc /hive/data/genomes/rn4/11.ooc rn4 rn5 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /hive/data/genomes/rn4/11.ooc \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ rn4 rn5 > do.log 2>&1 # real 62m5.765s #############################################################################