# for emacs: -*- mode: sh; -*-

# This file describes how we made the browser database on the Rattus
# Norvegicus genome, June 2003 update.
#
# Things got a bit confused because the first Rat for June
# got started about June 22.  Processing proceeded on that for
# that week, then it was announced that portions of Chr 7 and Chr X
# were being moved around and a new Rat was released June 29.
# Also, had some difficulty with RepeatMasker.
# And, there were cluster difficulties to work around.
# What happens is that some things may not be exactly where they
# are claimed to be.  Although I believe everything has been
# corrected.

DOWNLOAD SEQUENCE (DONE Rnor3.1 - 2003-06-29 - Hiram)

    ssh kkstore
    mkdir /cluster/store3/rn3
    cd /cluster/store3/rn3
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/README
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/conditions_for_use
    # Three extra files that were not in Rn2:
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/bac.clones
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/bacfile.gz
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/bactigs.bacs
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/bactigs.contigs

    # Get BCM's chrom assemblies -- we will assemble our own chr*.fa from
    # contig fa + agp, and cross-check against this.
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 X Un)
      mkdir $c
      wget -O $c/chr$c.fa.bcm.gz \
        ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/chromosome/chr$c.fa.gz
      wget -O $c/chr${c}_random.fa.bcm.gz \
        ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/chromosome/chr$c.random.fa.gz
    end

    # Get BCM's contig fa + agp.  We will split into our own
    # conveniently-sized pseudo-contigs, and assemble chrom fa.
# These two files are not available in this rat June 2003 # But they don't seem to be used anywhere else here anyway # wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/bacfile2-1.gz # wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/record.dat.gz # Perhaps the extra three files bac.clones, bactigs.bacs and # bactigs.contigs can be used ? foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 X Un) wget -O $c/chr$c.agp \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.agp wget -O $c/chr$c.contig.fa.gz \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.contig.fa.gz wget -O $c/chr${c}_random.agp \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.random.agp wget -O $c/chr${c}_random.contig.fa.gz \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.random.contig.fa.gz end BUILD AND CHECK CHROM-LEVEL SEQUENCE (DONE - 2003-06-29 - Hiram) # Make chrom fa: ssh kkstore # The contig sequences are delivered with various upper and lower # case. We need them all in upper case to begin with. The first # line of the file is not a problem, it is already all upper case. # According to Kim Worley: # "The lowercase letters are regions where Phrap assigns a quality score # of less than 20. These can safely be ignored." 
# Translate each contig fasta to upper case (per Kim Worley's note:
# lower case only marks Phrap quality < 20 and can safely be ignored).
foreach c (?{,?})
  set f = $c/chr$c.contig.fa.gz
  set g = $f:r
  echo "${f}"
  zcat ${f} | tr '[a-z]' '[A-Z]' > ${g}
end
foreach c (?{,?})
  set f = $c/chr${c}_random.contig.fa.gz
  set g = $f:r
  if (-e ${f}) then
    echo "${f}"
    zcat ${f} | tr '[a-z]' '[A-Z]' > ${g}
  endif
end

# Assemble chrom-level fa from the contig fa + agp.
foreach c (?{,?})
  echo "Processing ${c}"
  $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.agp chr$c \
    $c/chr$c.fa $c/chr$c.contig.fa
  if (-e $c/chr${c}_random.agp) then
    $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr${c}_random.agp \
      chr${c}_random $c/chr${c}_random.fa $c/chr${c}_random.contig.fa
  endif
  echo "${c} - DONE"
end

# Check that the size of each chromosome .fa file is equal to the
# last coord of the .agp:
foreach f ( ?{,?}/*.agp )
  set agpLen = `tail -1 $f | awk '{print $3;}'`
  set g = $f:r
  set faLen = `faSize $g.fa | awk '{print $1;}'`
  if ($agpLen == $faLen) then
    echo "   OK: $f length = $g length = $faLen"
  else
    echo "ERROR: $f length = $agpLen, but $g length = $faLen"
  endif
end

# Check that our assembled chrom fa jive with the BCM chrom fa
# You will want a long window size in your terminal to run this
# It outputs quite a few lines
foreach c ( ?{,?} )
  set ucscLen = `faSize $c/chr$c.fa | awk '{print $1;}'`
  set bcmLen = `gunzip -c $c/chr$c.fa.bcm.gz | faSize stdin \
    | awk '{print $1;}'`
  if ($ucscLen == $bcmLen) then
    echo "   OK: chr$c.fa length = chr$c.fa.bcm length = $bcmLen"
  else
    # (fixed: this ERROR message pair was accidentally duplicated,
    # printing the same error twice)
    echo -n "ERROR: chr$c.fa length = $ucscLen, but chr$c.fa.bcm length"
    echo " = $bcmLen"
  endif
  if (-e $c/chr${c}_random.fa) then
    set ucscLen = `faSize $c/chr${c}_random.fa | awk '{print $1;}'`
    set bcmLen = `gunzip -c $c/chr${c}_random.fa.bcm.gz | faSize stdin \
      | awk '{print $1;}'`
    if ($ucscLen == $bcmLen) then
      echo -n "   OK: chr${c}_random.fa length = chr${c}_random.fa.bcm"
      echo " length = $bcmLen"
    else
      echo -n "ERROR: chr${c}_random.fa length = $ucscLen, but"
      echo " chr${c}_random.fa.bcm length = $bcmLen"
    endif
  endif
end

BREAK UP SEQUENCE
INTO 5 MB CHUNKS AT NON_BRIDGED CONTIGS (DONE - 2003-06-29) (DONE - 2003-06-29 - Hiram) # This will split the rat sequence into approx. 5 Mbase # supercontigs between non-bridged clone contigs and drop the # resulting dir structure in /cluster/store3/rn3. The resulting # dir structure will include 1 dir for each chromosome, each of # which has a set of subdirectories, one subdir per supercontig. ssh kkstore cd /cluster/store3/rn3 foreach c (?{,?}) echo "Working ${c}" cp -p $c/chr$c.agp $c/chr$c.agp.bak cp -p $c/chr$c.fa $c/chr$c.fa.bak splitFaIntoContigs $c/chr$c.agp $c/chr$c.fa . -nSize=5000000 if (-e $c/chr${c}_random.fa) then cp -p $c/chr${c}_random.agp $c/chr${c}_random.agp.bak cp -p $c/chr${c}_random.fa $c/chr${c}_random.fa.bak rm -fr ${c}_random splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . \ -nSize=5000000 mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst mv ${c}_random/lift/ordered.lft $c/lift/random.lft mv ${c}_random/lift/ordered.lst $c/lift/random.lst rmdir ${c}_random/lift rm ${c}_random/chr${c}_random.{agp,fa} rm -fr $c/chr${c}_random_1 rm -fr $c/chr${c}_random_2 mv ${c}_random/* $c rmdir ${c}_random endif echo "DONE ${c}" end # Make sure the reconstructed .fa jives with the original: # four lines output for each chrom, long window required to be # able to catch all the output. Look for non-zero numbers. The # wc -l should always output zero foreach f ( */*.fa.bak ) echo -n $f:r " " diff $f $f:r | wc -l end # The .agp goes through a slight format change, but make sure it # at least ends up with the same number of lines: foreach f ( ?{,?}/*.agp.bak ) set l1 = `wc -l $f | awk '{print $1}'` set l2 = `wc -l $f:r | awk '{print $1}'` if ($l1 == $l2) then echo " OK: $f and $f:r have the same #lines" else echo "ERROR: $f has $l1 lines, but $f:r has $l2" endif end # Save some space - NO - the *.contig.fa.gz files already existing # here are the original downloads. These non-zipped versions are # the translated to upper case versions. 
# foreach c (?{,?}) # echo $c # gzip $c/chr*.contig.fa # end # Do need to at least move these out of the way because later # wild card expansions are looking for things like *.fa and we # don't want these in those expansions. foreach c (?{,?}) echo "$c/chr${c}.contig.fa -> $c/chr${c}.contig.fa.UC" mv $c/chr${c}_random.contig.fa $c/chr${c}_random.contig.fa.UC mv $c/chr${c}.contig.fa $c/chr${c}.contig.fa.UC end # This rm may be a good thing to do. But later, after repeat # masking is done and thus the soft masked results can be compared # with the *.bak originals. # rm */*.bak COPY OVER JKSTUFF SCRIPTS DIRECTORY (DONE - 2003-06-29 - Hiram) # Some of these scripts are being edited after they were # copied to here. Some cases of binaries in # ~kent/bin/i386 need to be changed either to # /cluster/bin/i386 or ~hiram/bin/i386 depending upon what # was being fixed as all this proceeded. # The "real" place should be /cluster/bin/i386 but before # we go that route, we need to have our build processes # working properly to keep that bin up to date. ssh kkstore ln -s /cluster/store3/rn3 ~/rn3 rm -f ~/lastRn ln -s /cluster/store4/rn2 ~/lastRn cd ~/rn3 cp -Rp ~/lastRn/jkStuff . rm jkStuff/*.{out,lst,lft} jkStuff/*~ CREATING DATABASE (DONE - 2003-06-23 - Hiram) # Create the database. ssh hgwdev echo 'create database rn3' | hgsql '' # make a semi-permanent read-only alias: # (I haven't noticed any use of this ? Useless ? 
- Hiram)
    alias rn3 "mysql -u hguser -phguserstuff -A rn3"

    # Use df to make sure there is at least 5 gig free on
    # hgwdev:/var/lib/mysql
    # [hiram@hgwdev /cluster/home/hiram] df -h /var/lib/mysql
    # Filesystem            Size  Used Avail Use% Mounted on
    # /dev/sda1             472G  384G   64G  86% /var/lib/mysql

CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2003-06-23 - Hiram)

    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
      | hgsql rn3

REPEAT MASKING (RUNNING - 2003-06-30 - Hiram)

Split contigs, run RepeatMasker, lift results

Notes:
 * If there is a new version of RepeatMasker, build it and ask the
   admins to binrsync it (kkstore:/scratch/hg/RepeatMasker/*).
 * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make
   RepeatMasker runs manageable on the cluster ==> results need lifting.
 * For the NCBI assembly we repeat mask on the sensitive mode setting
   (RepeatMasker -m -s)

#- Split contigs into 500kb chunks:
    ssh kkstore
    cd ~/rn3
    foreach d ( */chr*_?{,?} )
      cd $d
      echo "splitting $d"
      set contig = $d:t
      faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \
        -maxN=500000
      cd ../..
    end

#- Make the run directory and job list:
    cd ~/rn3
    cat << '_EOF_' > jkStuff/RMRat
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/rn3/$2 /bin/cp $2 /tmp/rn3/$2/ cd /tmp/rn3/$2 /cluster/bluearc/RepeatMasker030619/RepeatMasker -ali -s -spec rat -comp mouse $2 popd /bin/cp /tmp/rn3/$2/$2.out ./ if (-e /tmp/rn3/$2/$2.align) /bin/cp /tmp/rn3/$2/$2.align ./ if (-e /tmp/rn3/$2/$2.tbl) /bin/cp /tmp/rn3/$2/$2.tbl ./ if (-e /tmp/rn3/$2/$2.cat) /bin/cp /tmp/rn3/$2/$2.cat ./ /bin/rm -fr /tmp/rn3/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/rn3/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/rn3 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMRat mkdir RMRun rm -f RMRun/RMJobs touch RMRun/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/store3/rn3/jkStuff/RMRat \ /cluster/store3/rn3/$d $f \ '{'check out line+ /cluster/store3/rn3/$d/$f.out'}' \ >> RMRun/RMJobs end end #- Do the run ssh kk cd ~/rn3/RMRun para create RMJobs para try, para check, para check, para push, para check,... # Completed: 6209 of 6209 jobs # CPU time in finished jobs: 37009055s 616817.58m 10280.29h 428.35d 1.174 y # IO & Wait Time: 96849s 1614.15m 26.90h 1.12d 0.003 y # Average job time: 5976s 99.60m 1.66h 0.07d # Longest job: 16048s 267.47m 4.46h 0.19d # Submission to last job: 49589s 826.48m 13.77h 0.57d #- Lift up the split-contig .out's to contig-level .out's ssh kkstore cd ~/rn3 foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t /cluster/home/hiram/bin/i386/liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. end #- Lift up the contig-level .out's to chr-level cd ~/rn3 ./jkStuff/liftOut5.sh # soft-mask contig .fa's with .out's foreach i (?{,?}) foreach j ($i/chr${i}_?{,?}/chr${i}_?{,?}.fa \ $i/chr${i}_random_?{,?}/chr${i}_random_?{,?}.fa) /cluster/home/hiram/bin/i386/maskOutFa $j $j.out $j -soft end echo done $i end # You will see a few WARNING lines from maskOutFa. This is a # a good sign. Much better than when maskOutFa used to exit. # With the fix, everything is getting masked properly. 
# Fixing a temporary glitch in repeat masker. This hasn't # affected anything above, but it will change the result of # the load below. Since the repeat masker run above, the # processrepeats script of RepeatMasker has been fixed. This # would not be necessary after that fix. foreach i (?{,?}) /cluster/bluearc/RepeatMasker030619/MakeRMoutSane.pl ${i}/*.fa.out foreach k ($i/chr${i}_?{,?}/chr${i}_?{,?}.fa.out \ $i/chr${i}_random_?{,?}/chr${i}_random_?{,?}.fa.out) /cluster/bluearc/RepeatMasker030619/MakeRMoutSane.pl ${k} end end #- Load the .out files into the database with: ssh hgwdev cd ~/rn3 hgLoadOut rn3 ?{,?}/*.fa.out MAKE LIFTALL.LFT (DONE - 2003-07-01 - Hiram) ssh kkstore cd ~/rn3 cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft cp -p jkStuff/liftAll.lft /cluster/bluearc/rat/rn3 VERIFY REPEATMASKER RESULTS (DONE - 2003-06-24 - Hiram) # Something about this is too early at this point. It errors with: # Can't start query: # select chrom,size,fileName from chromInfo # mySQL error 1146: Table 'rn3.chromInfo' doesn't exist # you have to wait until the nibs have been loaded below and the # chrom.sizes works # Run featureBits on rn3 and on a comparable genome build, and compare: ssh hgwdev featureBits rn3 rmsk (rat version 3.1) # 1117483165 bases of 2571104688 (43.463%) in intersection featureBits rn3 rmsk (rat version 3.0) # 1117328265 bases of 2571104688 (43.457%) in intersection featureBits rn2 rmsk (Jan 2003) # 1100534407 bases of 2495551408 (44.100%) in intersection featureBits rn1 rmsk (Nov 2002) # 1081814344 bases of 2555848684 (42.327%) in intersection STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE - 2003-07-02 - Hiram) # XXXXX - These unmasked nibs are unnecessary. They will be # replaced below by soft masked nibs. This is only done here in # order to quickly get a browser going so that other tracks and # tests can be begun. 
# Make (unmasked) nibs ssh kkstore cd ~/rn3 mkdir nibNotMasked foreach i (?{,?}) cd $i foreach j ( chr?{,?}{,_random}.fa ) echo "nibbing $j" ~/bin/i386/faToNib $j ../nibNotMasked/$j:r.nib end cd .. end # Make symbolic links from /gbdb/rn3/nib to the real nibs. # Below, after the soft masked nibs are created, these # links will be reset to point to the soft masked nibs. ssh hgwdev mkdir -p /gbdb/rn3/nib foreach f (/cluster/store3/rn3/nibNotMasked/chr*.nib) ln -s $f /gbdb/rn3/nib end # Load /gbdb/rn3/nib paths into database and save size info. ssh hgwdev hgsql rn3 < ~/kent/src/hg/lib/chromInfo.sql cd ~/rn3 hgNibSeq -preMadeNib rn3 /gbdb/rn3/nib ?{,?}/chr?{,?}{,_random}.fa # 2835152425 total bases echo "select chrom,size from chromInfo" | hgsql -N rn3 > chrom.sizes # take a look at chrom.sizes, should be 44 lines and reasonable # numbers wc chrom.sizes # 44 88 778 chrom.sizes GOLD AND GAP TRACKS (DONE - 2003-07-02 - Hiram)(Re-load 2004-10-19 - Hiram) # since these were first loaded, the hgGoldGapGl has changed to # fixup the fragStart position to be 0 relative, hence the re-load ssh hgwdev cd /cluster/data/rn3 hgGoldGapGl -noGl rn3 /cluster/store3/rn3 . MAKE GCPERCENT (DONE - 2003-07-02 - Hiram) ssh hgwdev mkdir -p /cluster/store3/rn3/bed/gcPercent cd /cluster/store3/rn3/bed/gcPercent hgsql rn3 < ~/kent/src/hg/lib/gcPercent.sql hgGcPercent rn3 ../../nibNotMasked MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR RN3 (DONE - 2003-06-24 - Hiram) # Enter rn3 into hgcentraltest.dbDb so test browser knows about it: echo 'insert into dbDb values("rn3", "Jun 2003", \ "/gbdb/rn3/nib", "Rat", "Espn", 1, \ 30, "Rat");' | hgsql -h genome-testdb hgcentraltest # Temporary entry until mrna tables come along. 
Approximate # calculated position the same as rn2 # Note: for next assembly, set scientificName column to "Rattus norvegicus" echo 'insert into dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName) values("rn3", "June 2003", \ "/gbdb/rn3/nib", "Rat", "chr5:169359914-169392264", 1, \ 30, "Rat", "Rattus norvegicus");' | hgsql -h genome-testdb hgcentraltest # Trying to request Espn in this new rat results in 4 different # hits on the genome. Thus, the specified position here is the # desired Espn position as was in the previous Rat # If you need to delete this dbDb entry to rewrite it: echo 'delete from dbDb where name="rn3";' \ | hgsql -h genome-testdb hgcentraltest # To change the default version of the Rat on the organism # menu. First delete it, then add the new one: echo 'delete from defaultDb where genome="Rat";' \ | hgsql -h genome-testdb hgcentraltest echo 'insert into defaultDb values("Rat", "rn3");' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add rn3 in all the right places and do make update make alpha cvs commit makefile MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR RN3 (DONE - 2003-07-04 - Jorge) ssh hgwdev echo 'insert into blatServers values("rn3", "blat11", "17778", "1"); \ insert into blatServers values("rn3", "blat11", "17779", "0");' \ | hgsql -h genome-testdb hgcentraltest SIMPLE REPEAT TRACK (DONE - 2003-07-01 - Hiram) # Need to be careful with this one on the cluster. This trf # thing does a lot of I/O on the input and output file. 
# Place the splits on /cluster/bluearc for cluster access ssh kkstore mkdir -p /cluster/bluearc/rat/rn3 cd /cluster/store3/rn3/ ls ?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa \ | tar -c -T - -f - | (cd /cluster/bluearc/rat/rn3.1.split; tar xfBp -) # now, a safe cluster job to trf mask ssh kk mkdir ~/rn3/bed/simpleRepeat cd ~/rn3/bed/simpleRepeat mkdir trf ls /cluster/bluearc/rat/rn3.1.split/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa \ > genome.lst cat << '_EOF_' > runTrf #!/bin/csh -fe # set path1 = $1 set inputFN = $1:t set outpath = $2 set outputFN = $2:t mkdir -p /tmp/$outputFN cp $path1 /tmp/$outputFN pushd . cd /tmp/$outputFN /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp popd rm -f $outpath cp -p /tmp/$outputFN/$outputFN $outpath rm -fr /tmp/$outputFN/* rmdir --ignore-fail-on-non-empty /tmp/$outputFN '_EOF_' # << this line makes emacs coloring happy chmod +x runTrf cat << '_EOF_' > gsub #LOOP ./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.lst single gsub spec para create spec para try para check para push ... etc ... 
Completed: 591 of 591 jobs CPU time in finished jobs: 27837s 463.95m 7.73h 0.32d 0.001 y IO & Wait Time: 13105s 218.41m 3.64h 0.15d 0.000 y Average job time: 69s 1.15m 0.02h 0.00d Longest job: 205s 3.42m 0.06h 0.00d Submission to last job: 263s 4.38m 0.07h 0.00d ssh kkstore cd ~/rn3/bed/simpleRepeat liftUp simpleRepeat.bed ~/rn3/jkStuff/liftAll.lft warn trf/*.bed # Load this into the database as so ssh hgwdev cd ~/rn3/bed/simpleRepeat hgLoadBed rn3 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql PROCESS SIMPLE REPEATS INTO MASK (DONE - 2003-07-02 - Hiram) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: # XXXX The next time we do this, we want to experiment with period <= 8 ssh kkstore cd ~/rn3/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd ~/rn3 mkdir -p bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst endif liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` if (-e $c/lift/rTrf.lst) then liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2003-07-02) (DONE - 2003-07-02 - Hiram) # This used to be done right after RepeatMasking. Now, we mask with # TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above. 
ssh kkstore cd ~/rn3 #- Soft-mask (lower-case) the contig and chr .fa's ./jkStuff/makeFaMasked.sh # You will see warnings from the above command, these are OK: # WARNING: negative rEnd: -203 chr1:36149039-36149198 L1M3c # WARNING: negative rEnd: -14 chr1:185432802-185432864 PB1D10 #- Make hard-masked .fa.masked files as well: ./jkStuff/makeHardMasked.sh #- Rebuild the nib, mixedNib, maskedNib files: ./jkStuff/makeNib.sh # Copy the masked contig fa to /scratch: ssh kkstore rm -rf /cluster/bluearc/rat/rn3/trfFa mkdir -p /cluster/bluearc/rat/rn3/trfFa cp -p ~/rn3/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa \ /cluster/bluearc/rat/rn3/trfFa RESET NIB POINTERS TO softNib (DONE - 2003-07-02 - Hiram) # As mentioned above the first time this was done, the first set # of nibs linked to were unnecessary. Here, we change the links to # point to softNib which is both repeat masker and trf masked ssh hgwdev cd /gbdb/rn3/nib rm -f * foreach f (/cluster/store3/rn3/softNib/chr*.nib) ln -s $f /gbdb/rn3/nib end # I wonder if this needs to be run again ? I'm going to do it # anyway. # Load /gbdb/rn3/nib paths into database and save size info. ssh hgwdev cd ~/rn3 hgNibSeq -preMadeNib rn3 /gbdb/rn3/nib ?{,?}/chr?{,?}{,_random}.fa # 2835152425 total bases echo "select chrom,size from chromInfo" | hgsql -N rn3 > chrom.sizes # take a look at chrom.sizes, should be 44 lines and reasonable # numbers wc chrom.sizes # 44 88 778 chrom.sizes MAKE DOWNLOADABLE SEQUENCE FILES (DONE - 2003-07-03 - 2003-07-09 - Hiram) ssh kkstore cd ~/rn3 #- Build the .zip files ./jkStuff/zipAll.sh |& tee zipAll.log mkdir zip mv *.zip* zip cd zip #- Look at zipAll.log to make sure all file lists look reasonable. #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test #- Copy the .zip files to hgwdev:/usr/local/apache/... 
ssh hgwdev cd ~/rn3/zip ../jkStuff/cpToWeb.sh cd /usr/local/apache/htdocs/goldenPath/rnJun2003 #- Take a look at bigZips/* and chromosomes/*, update their README.txt's # Then make the upstream sequence files. cd bigZips featureBits rn3 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa rm upstream1000.fa featureBits rn3 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa rm upstream2000.fa featureBits rn3 refGene:upstream:5000 -fa=upstream5000.fa zip upstream5000.zip upstream5000.fa rm upstream5000.fa # CREATE CYTOBAND TRACK # Can be done at any time ssh hgwdev cd /cluster/data/rn3 mkdir cytoBand cd cytoBand # Get file from NCBI wget ftp://ftp.ncbi.nih.gov/genomes/R_norvegicus/maps/mapview/BUILD.2/ideogram.gz gunzip ideogram.gz # Create bed file /cluster/bin/scripts/createNcbiCytoBand ideogram # Load the bed file hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql rn3 cytoBand cytoBand.bed # extend to cytoBandIdeo ssh hgwdev cd kent/src/hg/lib hgsql rn3 < cytoBandIdeo.sql # mysql: insert into cytoBandIdeo select * from cytoBand # add entry in kent/src/hg/makeDb/trackDb/rat/rn3/trackDb.ra # PREPARE CLUSTER FOR BLASTZ RUN (IN PROGRESS - 2003-07-10 - kate) # This needs to be done after trf-masking and nib generation. # NOTE: generated lineage-specific .out files # using "old" script, for use vs. human # Will need species-specific version for mouse # 2003-07-09 kate # Before mouse run: # XXXXXXXXX Currently stuck on this one 2003-07-09 - the lineage specific # XXXXXXXXX repeats business is now different. # XXXXXXXXX Trying to work the kinks out of that. Hiram ssh kkstore # Extract lineage-specific repeats using Arian Smit's script: mkdir -p /cluster/data/rn3/bed/linSpecRep cd /cluster/data/rn3/bed/linSpecRep # foreach f (~/rn3/*/chr*.out) # ln -sf $f . # end # don't step on existing .out's foreach f (/cluster/data/rn3/*/chr*.out) cp $f . 
end /cluster/bin/scripts/rodentSpecificRepeats.pl *.out /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec rm *.out cd /cluster/data/rn3/bed rm -rf /cluster/bluearc/rat/rn3/linSpecRep # This dir already exists from above # mkdir -p /cluster/bluearc/rat/rn3 cp -Rp linSpecRep /cluster/bluearc/rat/rn3 # RepeatMasker .out: cd ~/rn3 rm -rf /cluster/bluearc/rat/rn3/rmsk mkdir -p /cluster/bluearc/rat/rn3/rmsk cp -p ?{,?}/chr?{,?}{,_random}.fa.out /cluster/bluearc/rat/rn3/rmsk # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /cluster/bluearc/rat/rn3/softNib mkdir -p /cluster/bluearc/rat/rn3/softNib cp -p softNib/chr*.nib /cluster/bluearc/rat/rn3/softNib # New RepeatMasker lineage-specific annotation # Hiram ran /cluster/bluearc/rat/rn3/rmsk.spec/runArian # to generate /cluster/bluearc/rat/rn3/rmsk.spec/chrN.fa.out_mus_hum # This script loops through .out's, running # DateRepsinRMoutput.pl <.out> -query rat -comp mouse -comp human # Extract repeats not in mouse (0 in column 1) cd /cluster/bluearc/rat/rn3 mkdir linSpecRep.notInMouse foreach f (rmsk.spec/*.out_mus_hum) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 1 $f > \ linSpecRep.notInMouse/$base.out.spec end # copy to iscratch and sync ssh kkr1u00 cd /iscratch/i mkdir -p mm3.RM030619 cd mm3.RM030619 #cp -r /cluster/bluearc/rat/rn3/linSpecRep.notInMouse . /cluster/bin/scripts/iSync # BLASTZ MOUSE MM3 (DONE 2003-07-28 kate) # NOTE: This uses mm3.RM030619, which is MM3 masked with new RepeatMasker ssh kkstore mkdir -p /cluster/data/rn3/bed/blastz.mm3 mkdir -p /cluster/bluearc/hg/rn3/bed/blastz.mm3 cd /cluster/data/rn3/bed/blastz.mm3 cat << '_EOF_' > DEF # mouse vs. 
rat export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=50000 BLASTZ_T=2 # scoring matrix BLASTZ_Q=/cluster/data/blastz/mus_rat.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Rat SEQ1_DIR=/scratch/rat/rn3/softNib/ # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse/ SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Mouse SEQ2_DIR=/iscratch/i/mm3.RM030619/mixedNib/ # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK=/cluster/bluearc/mm3.RM030619/linSpecRep.notInRat/ SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/bluearc/hg/rn3/bed/blastz.mm3 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.rn3-mm3.RM030619 ssh kk cd /cluster/data/rn3/bed/blastz.mm3 # source the DEF file bash . ./DEF cp DEF $BASE # follow the next set of directions slavishly mkdir -p run # give up on avoiding angie's directories # tcl script # creates xdir.sh and joblist run/j ~angie/hummus/make-joblist $DEF > run/j # xdir.sh makes a bunch of result directories in $BASE/raw/ # based on chrom name and CHUNK size cp $BASE/xdir.sh . sh xdir.sh cd run # now edit j to prefix path to executable name # NOTE: we should have a controlled version of schwartz bin executables sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 # make sure the j2 edits are OK, then use it: mv j2 j # para create will create the file: 'batch' for the cluster run para create j # 36504 jobs para try para check para push # ... etc ... # 37 min longest job # 2.3 hours elapsed # post-process blastz ssh kkstore cd /cluster/data/rn3/bed/blastz.mm3 # source the DEF file again in case you are coming back to this bash . 
./DEF # a new run directory mkdir -p run.1 mkdir -p $BASE/lav # create a new job list to convert out files to lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \ > run.1/jobList cd run.1 # make sure the job list is OK wc -l jobList # 312 jobs head jobList # run on cluster #ssh kkr9u01 ssh kk cd /cluster/data/rn3/bed/blastz.mm3/run.1 para create jobList para try para check para push # etc. # 10 minutes total # convert lav files to axt ssh kkstore cd /cluster/data/rn3/bed/blastz.mm3 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 # create template file for gensub2 cat << '_EOF_' > gsub #LOOP /cluster/bin/scripts/blastz-chromlav2axt /cluster/bluearc/hg/rn3/bed/blastz.mm3/lav/$(root1) {check out line+ /cluster/data/rn3/bed/blastz.mm3/axtChrom/$(root1).axt} /cluster/bluearc/rat/rn3/softNib /iscratch/i/mm3.RM030619/mixedNib #ENDLOOP '_EOF_' # << this line makes emacs coloring happy ls -1S /cluster/bluearc/hg/rn3/bed/blastz.mm3/lav > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList # 44 jobs head jobList ssh kk # NOTE: would have run this on rack 9, but /iscratch/i/mm3.RM030619 # not available there cd /cluster/data/rn3/bed/blastz.mm3 cd run.2 para create jobList para try para check para push # 10 minutes ? 
# copy chrom axt's back to fileserver
cd /cluster/data/rn3/bed/blastz.mm3
mkdir -p axtChrom
# NOTE(review): $base is not set in this csh session ($BASE was only
# defined while sourcing DEF under bash, as
# /cluster/bluearc/hg/rn3/bed/blastz.mm3) -- confirm the intended
# source directory before re-using this step.
cp $base/axtChrom/*.axt axtChrom

# translate sorted axt files into psl
ssh kkstore
cd /cluster/data/rn3/bed/blastz.mm3
mkdir -p pslChrom
set tbl = "blastzMm3"
foreach f (axtChrom/chr*.axt)
  set c=$f:t:r
  echo "Processing chr $c"
  /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end

# Load database tables
ssh hgwdev
set tbl = "blastzMm3"
cd /cluster/data/rn3/bed/blastz.mm3
cd pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx rn3 chr*_${tbl}.psl

# create trackDb/rat/rn3/trackDb.ra entry:
# track blastzMm3
# shortLabel BLASTZ Mouse
# longLabel All BLASTZ Mouse alignments
# group compGeno
# priority 111
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno hg15

# MAKE BLASTZ BEST MOUSE MM3 (DONE 2003-07-23 kate)
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh kkstore
cd /cluster/data/rn3/bed/blastz.mm3
# copy chrom axt's to bluearc, to avoid hitting fileserver too hard
cp -r axtChrom /cluster/bluearc/hg/rn3/bed/blastz.mm3
mkdir -p axtBest pslBest
mkdir run.3
cd run.3
# create script to filter files:
# doBestAxt keeps only the best-in-genome mouse alignment for each rat
# region, then converts the filtered axt to psl.
cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
# (fixed: was "-minScore-300", a typo; axtBest takes -minScore=N)
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/rn3/bed/blastz.mm3/S1.len /cluster/data/rn3/bed/blastz.mm3/S2.len $4
'_EOF_'
# << this line makes emacs coloring happy
chmod +x doBestAxt
cd axtChrom
ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
cd ../run.3
# create template for cluster job
cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) /cluster/bluearc/hg/rn3/bed/blastz.mm3/axtChrom/$(root1).axt {check out line+ /cluster/data/rn3/bed/blastz.mm3/axtBest/$(root1).axt} {check out line+ /cluster/data/rn3/bed/blastz.mm3/pslBest/$(root1)_blastzBestMm3.psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 chrom.list single gsub jobList
wc -l jobList
# 44 jobs
head jobList ssh kk cd /cluster/data/rn3/bed/blastz.mm3 cd run.3 para create jobList para try para check para push # 1 hour ? # chr1 too large for kk nodes -- reran on kkr1u00 (fileserver would do) ./doBestAxt chr1 /cluster/bluearc/hg/rn3/bed/blastz.mm3/axtChrom/chr1.axt /cluster/data/rn3/bed/blastz.mm3/axtBest/chr1.axt /cluster/data/rn3/bed/blastz.mm3/pslBest/chr1_blastzBestMm3.psl # Read 186121 elements from /cluster/bluearc/hg/rn3/bed/blastz.mm3/axtChrom/chr1.axt # Allocated 268113159 elements in array # 118172 elements in after soft filter. # Output 115210 alignments including 0 trimmed from overlaps # Bases in 236687179, bases out 164536193 (69.52%) # Load tables ssh hgwdev set base="/cluster/data/rn3/bed/blastz.mm3" set tbl="blastzBestMm3" cd $base/pslBest /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 chr*_${tbl}.psl # check results featureBits rn3 blastzBestMm3 # 1667035238 bases of 2571104688 (64.837%) in intersection # no mouse alignment on rn2 featureBits rn3 blastzMm3 # 1669090687 bases of 2571104688 (64.917%) in intersection # create trackDb/rat/rn3/trackDb.ra entry: # track blastzBestMm3 # shortLabel BLASTZ Mouse # longLabel All BLASTZ Mouse alignments # group compGeno # priority 111 # visibility hide # color 100,50,0 # altColor 255,240,200 # spectrum on # type psl xeno hg15 # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/rn3/axtBestMm3 cd /gbdb/rn3/axtBestMm3 foreach f ($base/axtBest/chr*.axt) ln -s $f . 
end cd $base/axtBest rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/rn3/axtBestMm3/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('mm3','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end # already done, above # hgsql rn3 < ~/kent/src/hg/lib/axtInfo.sql hgsql rn3 < axtInfoInserts.sql # CHAIN MOUSE BLASTZ (DONE 2003-07-24 kate) ssh kkstore cd /cluster/data/rn3/bed/blastz.mm3 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain # Run axtChain on little cluster ssh kkr1u00 cd /cluster/data/rn3/bed/blastz.mm3 mkdir -p axtChain/run1 cd /cluster/data/rn3/bed/blastz.mm3/axtChain/run1 ls -1S /cluster/data/rn3/bed/blastz.mm3/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ=chrUn_random $1 | axtChain stdin /iscratch/i/rn3/bothMaskedNibs /iscratch/i/mm3.RM030619/mixedNib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain mkdir out chain gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... # now on the cluster server, sort chains ssh kkstore cd /cluster/data/rn3/bed/blastz.mm3/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm3/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain rn3 ${c}_chainMm3 $i echo done $c end # NET MOUSE BLASTZ (DONE 2003-07-24 kate) ssh kkstore cd /cluster/data/rn3/bed/blastz.mm3/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i chainPreNet $i /cluster/data/rn3/chrom.sizes \ /cluster/data/mm3/chrom.sizes ../preNet/$i end cd .. 
# This foreach loop will take about 15 min to execute. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i chainNet $i -minSpace=1 /cluster/data/rn3/chrom.sizes \ /cluster/data/mm3/chrom.sizes ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net ssh hgwdev cd /cluster/store3/rn3/bed/blastz.mm3/axtChain netClass hNoClass.net rn3 mm3 mouse.net -tNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse -qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInRat # If things look good do ssh kkstore cd /cluster/data/rn3/bed/blastz.mm3/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with netFilter -syn mouse.net > mouseSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm3/axtChain netFilter -minGap=10 mouse.net | hgLoadNet rn3 netMm3 stdin netFilter -minGap=10 mouseSyn.net | hgLoadNet rn3 syntenyMm3 stdin # Add entries for net and chain to rat/rn3 trackDb # MOUSE MM3 BLASTZ CLEANUP (DONE 2004-09-20 kate) ssh eieio cd /cluster/data/rn3/bed/blastz.mm3 nice rm -rf raw lav nice rm axtChain/run1/chain/* axtChain/preNet axtChain/hNoClass.net nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} axtChain/*Net/*.net nice gzip axt*/*.axt psl*/*.psl # BLASTZ Self (DONE 2003-09-22 angie) ssh kkstore mkdir -p /cluster/data/rn3/bed/blastz.rn3 cd /cluster/data/rn3/bed/blastz.rn3 cat << '_EOF_' > DEF # rat vs. 
rat export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET # Rat SEQ1_DIR=/iscratch/i/rn3/bothMaskedNibs/ # not used SEQ1_RMSK= # not used SEQ1_FLAG= # not used SEQ1_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse/ SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Mouse SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs/ # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse/ SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=10000 BASE=/cluster/store3/rn3/bed/blastz.rn3 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.rn3-rn3 ssh kk cd /cluster/data/rn3/bed/blastz.rn3 # source the DEF file bash . ./DEF # follow the next set of directions slavishly mkdir -p $BASE/run # give up on avoiding angie's directories # tcl script # creates xdir.sh and joblist run/j ~angie/hummus/make-joblist $DEF > $BASE/run/j # xdir.sh makes a bunch of result directories in $BASE/raw/ # based on chrom name and CHUNK size sh $BASE/xdir.sh cd $BASE/run # now edit j to prefix path to executable name # NOTE: we should have a controlled version of schwartz bin executables sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 # make sure the j2 edits are OK, then use it: mv j2 j # para create will create the file: 'batch' for the cluster run para create j # 97344 jobs para try para check para push # ... etc ... 
# 1.3 hr longest job # make tSizes, qSizes files ssh hgwdev cd /cluster/store3/rn3/bed/blastz.rn3 echo 'select chrom,size from chromInfo' | hgsql -A -N rn3 \ > tSizes echo 'select chrom,size from chromInfo' | hgsql -A -N rn3 \ > qSizes # post-process blastz # --- normalize (PennSpeak for lift) ssh kk cd /cluster/store3/rn3/bed/blastz.rn3 # run bash shell if not running it already source DEF mkdir -p $BASE/run.1 mkdir -p $BASE/lav # create a new job list to convert out files to lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \ > run.1/jobList cd run.1 # make sure the job list is OK wc -l jobList # 312 jobList head jobList para create jobList para push #Completed: 312 of 312 jobs #CPU time in finished jobs: 42835s 713.92m 11.90h 0.50d 0.001 y #IO & Wait Time: 940201s 15670.02m 261.17h 10.88d 0.030 y #Average job time: 3151s 52.51m 0.88h 0.04d #Longest job: 4794s 79.90m 1.33h 0.06d #Submission to last job: 4795s 79.92m 1.33h 0.06d # convert lav files to axt ssh kk cd /cluster/store3/rn3/bed/blastz.rn3 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 # create template file for gensub2 # Use blastz-self-chromlav2axt, which includes axtDropOverlap in # the pipe (that helps to keep axtSort from barfing). 
# usage: blastz-self-chromlav2axt lav-dir axt-file seq1-dir seq2-dir cat << '_EOF_' > gsub #LOOP /cluster/bin/scripts/blastz-self-chromlav2axt /cluster/store3/rn3/bed/blastz.rn3/lav/$(root1) {check out line+ /cluster/store3/rn3/bed/blastz.rn3/axtChrom/$(root1).axt} /iscratch/rat/rn3/softNib /iscratch/rat/rn3/softNib /cluster/store3/rn3/bed/blastz.rn3/tSizes /cluster/store3/rn3/bed/blastz.rn3/qSizes #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1S /cluster/store3/rn3/bed/blastz.rn3/lav > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList # 44 jobs head jobList cd /cluster/data/rn3/bed/blastz.rn3/run.2 para create jobList para try para check para push #Completed: 38 of 44 jobs #Crashed: 6 jobs #CPU time in finished jobs: 11390s 189.83m 3.16h 0.13d 0.000 y #IO & Wait Time: 179130s 2985.51m 49.76h 2.07d 0.006 y #Average job time: 5014s 83.56m 1.39h 0.06d #Longest job: 22358s 372.63m 6.21h 0.26d #Submission to last job: 22377s 372.95m 6.22h 0.26d # Chroms 1,2,3,4,Un,X crashed... run in 2 passes: ssh kkstore cd /cluster/data/rn3/bed/blastz.rn3 set base=/cluster/data/rn3/bed/blastz.rn3 set seq1_dir=/cluster/data/rn3/softNib set seq2_dir=/cluster/data/rn3/softNib foreach c (lav/chr1 lav/chr2 lav/chr3 lav/chr4 lav/chrX lav/chrUn) pushd $c set chr=$c:t set out=$base/axtChrom/$chr.axt echo "Translating $chr lav to $out" foreach d (*.lav) set smallout=$d.axt lavToAxt $d $seq1_dir $seq2_dir stdout \ | axtDropOverlap stdin /cluster/data/rn3/bed/blastz.rn3/tSizes \ /cluster/data/rn3/bed/blastz.rn3/qSizes stdout \ | axtSort stdin $smallout end cat `ls -1 *.lav.axt | sort -g` > $out popd end foreach f (axtChrom/chr*.axt) echo gzipping $f... gzip $f end # Make the axt's available on genome-test: ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/rnJun2003/alignments/vsRn3 cd /usr/local/apache/htdocs/goldenPath/rnJun2003/alignments/vsRn3 cp -p /cluster/data/rn3/bed/blastz.rn3/axtChrom/*.axt.gz . 
# CHAIN SELF BLASTZ (DONE 2/28/05 angie) ssh kki mkdir -p /cluster/data/rn3/bed/blastz.rn3/axtChain/run1 cd /cluster/data/rn3/bed/blastz.rn3/axtChain/run1 mkdir out chain ls -1S /cluster/data/rn3/bed/blastz.rn3/axtChrom/*.axt.gz > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef axtChain -verbose=0 $1 \ /iscratch/i/rn3/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs stdout \ | chainAntiRepeat \ /iscratch/i/rn3/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check, ... # Both chrUn and chr1 crashed with out-of-mem.... #Completed: 42 of 44 jobs #Crashed: 2 jobs #Average job time: 342s 5.70m 0.09h 0.00d #Longest job: 1205s 20.08m 0.33h 0.01d #Submission to last job: 1205s 20.08m 0.33h 0.01d # We can tolerate the lack of chrUn self chains, but not chr1. # Try to split it in half around a large gap and chain halves, then merge. # chr1's largest assembly gap is 664000bp [68333401, 68997401), # but that doesn't divide it very evenly in half (chr1 is 268Mbp). # There is a 316000bp gap [129715113, 130031113), use that. 
ssh kolossus cd /cluster/data/rn3/bed/blastz.rn3/axtChain axtFilter ../axtChrom/chr1.axt.gz -tStartMax=129873113 > chr1_1.axt axtFilter ../axtChrom/chr1.axt.gz -tStartMin=129873113 > chr1_2.axt # line count sanity check: original == total new: zcat ../axtChrom/chr1.axt.gz | wc -l #21602304 wc -l chr1_?.axt # 11973760 chr1_1.axt # 9628544 chr1_2.axt # 21602304 total # Chain the two halves independently, assuming no self chain would # span such a large gap: axtChain -verbose=0 chr1_1.axt \ /iscratch/i/rn3/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs stdout \ | chainAntiRepeat \ /iscratch/i/rn3/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs stdin \ run1/chain/chr1_1.chain axtChain -verbose=0 chr1_2.axt \ /iscratch/i/rn3/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs stdout \ | chainAntiRepeat \ /iscratch/i/rn3/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs stdin \ run1/chain/chr1_2.chain rm chr1_1.axt chr1_2.axt # Sort and split: ssh kolossus cd /cluster/data/rn3/bed/blastz.rn3/axtChain chainMergeSort run1/chain/*.chain > all.chain rm run1/chain/* ssh eieio cd /cluster/data/rn3/bed/blastz.rn3/axtChain chainSplit chain all.chain # DONE TO HERE 2/28/05 angie Looks like Brian filtered -minScore=20000 and netted, 10/19/05. # BLASTZ Human hg15 (IN PROGRESS 2003-07-20 kate) # Extract lineage-specific repeats vs. human ssh kkstore cd /cluster/bluearc/rat/rn3 mkdir linSpecRep.notInHuman foreach f (rmsk.spec/*.out_mus_hum) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 2 $f > \ linSpecRep.notInHuman/$base.out.spec end mkdir -p /cluster/data/rn3/bed/blastz.hg15 cd /cluster/data/rn3/bed/blastz.hg15 cat << '_EOF_' > DEF # rat vs. 
human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Rat SEQ1_DIR=/iscratch/i/rn3/bothMaskedNibs/ # not used SEQ1_RMSK= # not used SEQ1_FLAG= # not used SEQ1_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman/ SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY SEQ2_DIR=/iscratch/i/gs.16/build33/chromTrfMixedNib/ SEQ2_RMSK=/iscratch/i/gs.16/build33/rmsk/ SEQ2_SMSK=/iscratch/i/gs.16/build33/linSpecRep/ SEQ2_FLAG=-primate SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/store3/rn3/bed/blastz.hg15 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.rn3-hg15 ssh kk cd /cluster/data/rn3/bed/blastz.hg15 # source the DEF file bash . ./DEF # follow the next set of directions slavishly mkdir -p $BASE/run # give up on avoiding angie's directories # tcl script # creates xdir.sh and joblist run/j ~angie/hummus/make-joblist $DEF > $BASE/run/j # xdir.sh makes a bunch of result directories in $BASE/raw/ # based on chrom name and CHUNK size sh $BASE/xdir.sh cd $BASE/run # now edit j to prefix path to executable name # NOTE: we should have a controlled version of schwartz bin executables sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 # make sure the j2 edits are OK, then use it: mv j2 j # para create will create the file: 'batch' for the cluster run para create j # 42120 jobs para try para check para push # ... etc ... # post-process blastz # SWAPPING HUMAN-RAT BLASTZ ALIGNMENTS TO RAT-HUMAN (DONE 2003-07-11 kate) ssh kkstore # Human-rat alignments were already run and processed into axt. # Swap target and query to get rat-human alignments. 
set aliDir = "/cluster/data/hg15/bed/blastz.rn3" set revAliDir = "/cluster/data/rn3/bed/blastz.hg15.swap" mkdir $revAliDir cd $revAliDir # axtBest will need .len files - copy those, swap S1<->S2 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom # Swap target and query coords, then re-apportion alignments so that # unsorted/chrN.axt has all the alignments with chrN as target. cat $aliDir/axtChrom/chr*.axt \ | /cluster/bin/i386/axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | /cluster/bin/i386/axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end rm -r unsorted # Create psl from sorted axt ssh kkstore cd /cluster/data/rn3/bed/blastz.hg15.swap set tbl = "blastzHg15" mkdir -p pslChrom foreach f (axtChrom/chr*.axt) set c=$f:t:r /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # load blastz results ssh hgwdev cd /cluster/data/rn3/bed mv blastz.hg15.swap blastz.hg15 cd blastz.hg15 cd pslChrom /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 chr*_*.psl # create trackDb/rat/rn3/trackDb.ra entry: # track blastzHg15 # shortLabel BLASTZ Human # longLabel All BLASTZ Human alignments # group compGeno # priority 111 # visibility hide # color 100,50,0 # altColor 255,240,200 # spectrum on # type psl xeno hg15 # CHAIN HUMAN BLASTZ (DONE 2003-07-16 kate) ssh kkstore cd /cluster/data/rn3/bed/blastz.hg15 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain # create shell script to run jobs on file server # cat << '_EOF_' > serialJobList #LOOP # something like this #doChain $1 chain/$1.chain out/$1.out} #ENDLOOP # '_EOF_' cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ=chrUn_random $1 | axtChain stdin /cluster/bluearc/rn3/softNib /cluster/data/hg15/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain csh serialJobList & # this took 8.5 hours on kkstore # Run axtChain on little cluster # ssh kkr1u00 # cd 
/cluster/data/rn3/bed/blastz.hg15 # mkdir -p axtChain/run1 # cd /cluster/data/rn3/bed/blastz.hg15/axtChain/run1 # ls -1S /cluster/data/rn3/bed/blastz.hg15/axtChrom/*.axt > input.lst # cat << '_EOF_' > gsub #LOOP # doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP #'_EOF_' # cat << '_EOF_' > doChain #!/bin/csh # axtFilter -notQ=chrUn_random $1 | axtChain stdin /iscratch/i/rn3/bothMaskedNibs /scratch/hg/gs.16/build33/chromTrfMixedNib $2 > $3 #'_EOF_' # chmod a+x doChain # mkdir out chain # gensub2 input.lst single gsub jobList # para create jobList # para try # para push # ... etc ... # now on the cluster server, sort chains ssh kkstore cd /cluster/data/rn3/bed/blastz.hg15/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg15/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain rn3 ${c}_chainHg15 $i echo done $c end # NET HUMAN BLASTZ (DONE 2003-07-16 kate) ssh kkstore cd /cluster/data/rn3/bed/blastz.hg15/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i chainPreNet $i /cluster/data/rn3/chrom.sizes \ /cluster/data/hg15/chrom.sizes ../preNet/$i end cd .. # This foreach loop will take about 15 min to execute. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i chainNet $i -minSpace=1 /cluster/data/rn3/chrom.sizes \ /cluster/data/hg15/chrom.sizes ../n1/$n /dev/null end cd .. 
cat n1/*.net | netSyntenic stdin hNoClass.net ssh hgwdev cd /cluster/store3/rn3/bed/blastz.hg15/axtChain netClass hNoClass.net rn3 hg15 human.net -tNewR=/cluster/data/rn3/bed/linSpecRep -qNewR=/cluster/data/hg15/bed/linSpecRep # If things look good do ssh kkstore cd /cluster/data/rn3/bed/blastz.hg15/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with netFilter -syn human.net > humanSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg15/axtChain netFilter -minGap=10 human.net | hgLoadNet rn3 netHg15 stdin netFilter -minGap=10 humanSyn.net | hgLoadNet rn3 syntenyHg15 stdin # Add entries for net and chain to rat/rn3 trackDb # MAKE THE BLASTZBESTHUMAN TRACK FROM RN3 AXT FILES (DONE 2003-07-16 baertsch, kate) # Consolidate AXT files to chrom level, sort, pick best, make psl. ssh kkstore set base="/cluster/data/rn3/bed/blastz.hg15" set seq1_dir="/cluster/data/rn3/mixedNib/" set seq2_dir="/cluster/data/hg15/mixedNib/" set tbl="blastzBestHg15" cd $base mkdir -p axtBest pslBest foreach f (axtChrom/chr*.axt) set chr=$f:t:r echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 echo translating axtBest to psl for $chr axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # If some chromosome's alignments were too big and caused axtSort to # run out of memory, split it in half (by 4-line axt records) and # run axtBest just on the halves. 
foreach chr (chr1) echo two-pass axtBesting $chr set len = `wc -l < axtChrom/$chr.axt` set numRec = `expr $len / 4` if (($numRec * 4) != $len) then echo "Uh-oh: length of axtChrom/$chr.axt is $len, not a multiple of 4" break endif set halfRec = `expr $numRec / 2` set halfLen = `expr $halfRec \* 4` set halfLenp1 = `expr $halfLen + 1` head -$halfLen axtChrom/$chr.axt > axtChrom/$chr.h0.axt tail +$halfLenp1 axtChrom/$chr.axt > axtChrom/$chr.h1.axt axtBest axtChrom/$chr.h0.axt $chr axtChrom/$chr.h0.axtBest -minScore=300 axtBest axtChrom/$chr.h1.axt $chr axtChrom/$chr.h1.axtBest -minScore=300 cat axtChrom/$chr.h{0,1}.axtBest > axtBest/$chr.axt axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl rm axtChrom/$chr.h* end # Load tables ssh hgwdev set base="/cluster/store3/rn3/bed/blastz.hg15" set tbl="blastzBestHg15" cd $base/pslBest hgLoadPsl -noTNameIx rn3 chr*_${tbl}.psl # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/rn3/axtBestHg15 cd /gbdb/rn3/axtBestHg15 foreach f ($base/axtBest/chr*.axt) ln -s $f . 
end cd $base/axtBest rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/rn3/axtBestHg15/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('hg15','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql rn3 < ~/kent/src/hg/lib/axtInfo.sql hgsql rn3 < axtInfoInserts.sql # MAKE AXTTIGHT FROM AXTBEST (DONE 2003-07-21 kate) # After creating axtBest alignments above, use subsetAxt to get axtTight: # NOTE: currently the axtBest's are the blastz.hg15.2003-07-09 directory # since blastz.hg15 is in use for rerun of blastz with # updated lineage-specific repeats for new RepeatMasker ssh kkstore cd ~/rn3/bed/blastz.hg15.2003-07-09/axtBest mkdir -p ../axtTight foreach i (*.axt) echo $i subsetAxt $i ../axtTight/$i \ /cluster/data/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir -p ../pslTight foreach i (*.axt) echo "Processing $i" set c = $i:r /cluster/bin/i386/axtToPsl $i \ ../S1.len ../S2.len ../pslTight/${c}_blastzTightHg15.psl end # Load tables into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg15.2003-07-09 cd pslTight /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 chr*_blastzTightHg15.psl # MAKE MULTIPLE ALIGNMENT (MAF) FILES FROM BEST AXT'S (DONE 2003-07-22 kate) # Reran multiz and reloaded table on 8/27, as some chroms were truncated (2003-08-27 kate) ssh kkstore # rat/human cd /cluster/data/rn3/bed/blastz.hg15.2003-07-09 mkdir maf cd axtBest foreach i (*.axt) set c = $i:r echo "Making maf for chr $c" /cluster/bin/i386/axtToMaf -tPrefix=rn3. -qPrefix=hg15. 
$i \ /cluster/data/rn3/chrom.sizes /cluster/data/hg15/chrom.sizes \ ../maf/${c}.rh.maf /cluster/bin/scripts/fixMaf.pl \ < ../maf/${c}.rh.maf > ../maf/${c}.rh.maf.fixed mv ../maf/${c}.rh.maf ../maf/${c}.rh.maf.unfixed mv ../maf/${c}.rh.maf.fixed ../maf/${c}.rh.maf end # NOTE: remove .unfixed files after checking # rat/mouse cd /cluster/data/rn3/bed/blastz.mm3 mkdir maf cd axtBest foreach i (*.axt) set c = $i:r echo "Making maf for chr $c" /cluster/bin/i386/axtToMaf -tPrefix=rn3. -qPrefix=mm3. $i \ /cluster/data/rn3/chrom.sizes /cluster/data/mm3/chrom.sizes \ ../maf/${c}.rm.maf /cluster/bin/scripts/fixMaf.pl \ < ../maf/${c}.rm.maf > ../maf/${c}.rm.maf.fixed mv ../maf/${c}.rm.maf ../maf/${c}.rm.maf.unfixed mv ../maf/${c}.rm.maf.fixed ../maf/${c}.rm.maf end # 30 minutes ? # Run multiple alignment using Webb Miller's multiz # Note: use - for 3rd arg when doing rodent/rodent (close species) # See penn/multiz/README for details ssh kkstore cd /cluster/data/rn3/bed mkdir -p multiz.hg15-mm3/maf cd multiz.hg15-mm3 cat << "EOF" > doMultiz #!/bin/csh -f cd /cluster/data/rn3/bed cd blastz.hg15.2003-07-09/maf foreach f (*.rh.maf) set c = $f:r:r echo "Multiz on $c" nice /cluster/bin/penn/multiz $f ../../blastz.mm3/maf/${c}.rm.maf - \ > ../../multiz.hg15-mm3/maf/${c}.rmh.maf end "EOF" # << this line makes emacs coloring happy csh doMultiz >&! doMultiz.log & tail -100f doMultiz.log # 3 hours ? 
# copy files to cluster I/O server mkdir -p /cluster/bluearc/hg/rn3/bed/blastz.hg15.2003-07-09/maf cp -r /cluster/data/rn3/bed/blastz.hg15.2003-07-09/maf /cluster/bluearc/hg/rn3/bed/blastz.hg15.2003-07-09 mkdir -p /cluster/bluearc/hg/rn3/bed/blastz.mm3/maf cp -r /cluster/data/rn3/bed/blastz.mm3/maf /cluster/bluearc/hg/rn3/bed/blastz.mm3 set multizDir = /cluster/data/rn3/bed/multiz.hg15-mm3 cd $multizDir mkdir run cd run cat << "EOF" > doMultiz.kk /cluster/bin/penn/multiz /cluster/bluearc/hg/rn3/bed/blastz.hg15.2003-07-09/maf/$1.rh.maf /cluster/bluearc/hg/rn3/bed/blastz.mm3/maf/$1.rm.maf - > /cluster/data/rn3/bed/multiz.hg15-mm3/maf/$1.rmh.maf "EOF" # << this line makes emacs coloring happy chmod +x doMultiz.kk cat << "EOF" > gsub #LOOP doMultiz.kk $(root1) {check out line+ /cluster/data/rn3/bed/multiz.hg15-mm3/maf/$(root1).rmh.maf} #ENDLOOP "EOF" # << this line makes emacs coloring happy cd /cluster/bluearc/hg/rn3/bed/blastz.hg15.2003-07-09/maf # NOTE: probably want a better way to make the chrom list ls *.maf | awk -F. '{print $1}' > $multizDir/run/chrom.list gensub2 chrom.list single gsub jobList ssh kkr9u01 set multizDir = /cluster/data/rn3/bed/multiz.hg15-mm3 cd $multizDir/run para create jobList para try para check para push # setup up external files for database to reference ssh hgwdev mkdir -p /gbdb/rn3/multizHg15Mm3 cd /gbdb/rn3/multizHg15Mm3 foreach f (/cluster/data/rn3/bed/multiz.hg15-mm3/maf/*.maf) ln -s $f . end # load into database # NOTE: use "warn" option due to presence of score=0 aligments in MAF files /cluster/bin/i386/hgLoadMaf -warn rn3 multizHg15Mm3 #Couldn't find rn3. sequence line 819945 of /gbdb/rn3/multizHg15Mm3/chrX.rmh.maf #Indexing and tabulating /gbdb/rn3/multizHg15Mm3/chrX_random.rmh.maf #Couldn't find rn3. 
sequence line 4341 of /gbdb/rn3/multizHg15Mm3/chrX_random.rmh.maf #Loading multizHg15Mm3 into database #Warning: load of multizHg15Mm3 did not go as planned: 3512170 record(s), 0 row(s) skipped, 27410 warning(s) loading ./multizHg15Mm3.tab # AUTO UPDATE GENBANK MRNA RUN (DONE - 2003-07-07 - Hiram) ssh eieio cd /cluster/store5/genbank set db = rn3 set nibGlob = '/cluster/bluearc/rat/rn3/softNib/chr*.nib' set liftFile = /cluster/bluearc/rat/rn3/liftAll.lft # the iservers were acting up, the following uses the bluearc instead # make sure 'ssh localhost' works before running this: nice bin/gbAlignStep -verbose=1 -initial -iserver=localhost \ -clusterdir=/cluster/bluearc/genbank/work \ -srcDb=genbank $db "$nibGlob" $liftFile & # This job took almost two days # Checking finished jobsCompleted: 216306 of 216306 jobs # CPU time in finished jobs: 74560544s 1242675.73m 20711.26h 862.97d 2.364 y # IO & Wait Time: 1371840s 22864.00m 381.07h 15.88d 0.044 y # Average job time: 351s 5.85m 0.10h 0.00d # Longest job: 2478s 41.30m 0.69h 0.03d # Submission to last job: 151934s 2532.23m 42.20h 1.76d # To watch the progress of your cluster job, go to machine kk and # cd /cluster/store5/genbank/work/initial.rn3/align # where the batch file is. You can now do normal parasol checking # operations. Beware of "para check" it can take quite a bit of time # if your batch job is very large. 'parasol list batches' or # 'parasol list users' may be quicker to take a look at status. # After that is finished successfully, load the mRNAs: # The drop and load is faster if tables have been loaded before. ssh hgwdev cd /cluster/store5/genbank ./bin/i386/gbLoadRna -drop rn3 nice ./bin/gbDbLoadStep -verbose=1 -initialLoad rn3 # This load took about 7 hours # native mRNAs were suspect due to bug in gbBlat script, redoing them. # alignment now uses a config file, so add rn3 to etc/genbank.conf # remove suspect mRNAs. 
rm -f data/aligned/genbank.136.0/rn3/*/mrna.* # align genbank and refseq mrnas, using bluearc for cluster fs: nice bin/gbAlignStep -type=mrna -verbose=1 -initial -iserver=no -clusterRootDir=/cluster/bluearc/genbank rn3& # need to clean out all mRNAs from database, but leave ESTs inplace. - drop these tables: chr*_mrna all_mrna xenoMrna mrnaOrientInfo - selectively delete mrna related rows: delete from imageClone where type = 'mRNA'; delete from mrna where type = 'mRNA'; delete from gbStatus where type = 'mRNA'; delete from gbSeq where type = 'mRNA'; delete from gbExtFile where path like '%/mrna.fa'; # load genbank and refseq mRNAs into database nice ./bin/gbDbLoadStep -type=mrna -verbose=1 rn3 # refseq protein ids included version, which caused searches to fail. # clean our refseq sequences and reload: drop table refFlat,refGene,refLink,refSeqAli,refSeqStatus; delete from gbSeq where acc like 'NM_%' or acc like 'NP_%'; delete from gbStatus where acc like 'NM_%' or acc like 'NP_%'; delete from mrna where acc like 'NM_%'; delete from gbLoaded where srcDb='RefSeq'; nice ./bin/gbDbLoadStep -verbose=1 rn3 # GENBANK: add Non-Rat Refseqs (DONE May 9, 2005, Heather) # add to kent/src/hg/makeDb/genbank/etc/genbank.conf and commit: # rn3.refseq.mrna.xeno.load = yes cd /cluster/data/genbank/etc cvs update # new tables: xenoRefGene, xenoRefFlat, xenoRefSeqAli, refLink, refSeqStatus # refSeqSummary CREATE RNACLUSTER TABLE (TO DO) # Make sure that refSeqAli, estOrientInfo and mrnaOrientInfo tables are # made already (see above). ssh hgwdev mkdir -p ~/rn3/bed/rnaCluster/chrom cd ~/rn3/bed/rnaCluster foreach i (~/rn3/?{,?}) foreach f ($i/chr*.fa) set c = $f:t:r clusterRna rn3 /dev/null chrom/$c.bed -chrom=$c echo done $c end end hgLoadBed rn3 rnaCluster chrom/*.bed PRODUCING GENSCAN PREDICTIONS (DONE - 2003-07-09 - Hiram) # Log into kkr1u00 (not kk!). kkr1u00 is the driver node for the small # cluster (kkr2u00 -kkr8u00. 
(genscan has problem running on the # big cluster, due to limitation of memory and swap space on each # processing node). ssh kkr1u00 mkdir -p ~/rn3/bed/genscan cd ~/rn3/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f genome.list touch genome.list foreach f ( `ls -1S /cluster/store3/rn3/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/home/hiram/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try para check para push # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". # chr14_21, chr16_4 # There was one job that did crash in this manner. 
# It was run manually: # /cluster/home/hiram/bin/i386/gsBig /cluster/store3/rn3/10/chr10_3/chr10_3.fa.masked gtf/chr10_3.fa.gtf -trans=pep/chr10_3.fa.pep -subopt=subopt/chr10_3.fa.bed -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=1200000 # Checking finished jobsCompleted: 590 of 591 jobs # Crashed: 1 jobs # CPU time in finished jobs: 312553s 5209.22m 86.82h 3.62d 0.010 y # IO & Wait Time: 1927s 32.11m 0.54h 0.02d 0.000 y # Average job time: 533s 8.88m 0.15h 0.01d # Longest job: 23072s 384.53m 6.41h 0.27d # Submission to last job: 43744s 729.07m 12.15h 0.51d # Convert these to chromosome level files as so: ssh kkstore cd ~/rn3/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd ~/rn3/bed/genscan ldHgGene rn3 genscan genscan.gtf hgPepPred rn3 generic genscanPep genscan.pep hgLoadBed rn3 genscanSubopt genscanSubopt.bed TWINSCAN GENE PREDICTIONS (reloaded -gtf 2004-04-01 kate) mkdir -p ~/rn3/bed/twinscan cd ~/rn3/bed/twinscan wget http://genome.cse.wustl.edu/~bio/rat/Jan03/rat_Jan03_03-26-03.tgz gunzip -c *.tgz | tar xvf - rm -r chr_tx # clean up chrom field of GTF files foreach f (chr_gtf/chr*.gtf) set chr = $f:t:r sed -e "s/^[a-zA-Z0-9]*/$chr/" $f > chr_gtf/$chr-fixed.gtf end # pare down protein FASTA header to id and add missing .a: foreach f (chr_ptx/chr*.ptx) set chr = $f:t:r perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+).*$/\>$1.a/;' < \ chr_ptx/$chr.ptx > chr_ptx/$chr-fixed.fa end ldHgGene rn3 twinscan chr_gtf/chr*-fixed.gtf -gtf hgPepPred rn3 generic twinscanPep chr_ptx/chr*-fixed.fa /cluster/bin/scripts/runGeneCheck /cluster/data/rn3/bed/twinscan # 526 badFrame # 452 gap # 320 noStart # 305 noStop # 9 inFrameStop PRODUCING TETRAODON FISH ALIGNMENTS (TO DO) o - Download sequence from ... 
and put it on the cluster local disk at /scratch/hg/fish o - Do fish/rat alignments. ssh kk cd ~/rn3/bed mkdir blatFish cd blatFish mkdir psl ls -1S /scratch/hg/fish/* > fish.lst ls -1S /scratch/hg/rn3/trfFa/* > rat.lst cp ~/lastRn/blatFish/gsub . gensub2 rat.lst fish.lst gsub spec para create spec para try Make sure jobs are going ok with para check. Then para push wait about 2 hours and do another para push do para checks and if necessary para pushes until done or use para shove. o - Sort alignments as so pslCat -dir psl | liftUp -type=.psl stdout ~/rn3/jkStuff/liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/store2/temp stdin o - Copy to hgwdev:/scratch. Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/rn3/bed/blatFish/chrom foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_blatFish.psl end hgLoadPsl -noTNameIx rn3 *.psl hgLoadRna addSeq rn3 /cluster/store2/fish/seq15jun2001/*.fa # PRODUCING SQUIRT ALIGNMENTS (TO DO) ssh kkstore mkdir -p ~/rn3/bed/blatCi1 cd ~/rn3/bed/blatCi1 ls -1S /iscratch/i/squirt/ci1/queryFa/*.fa > squirt.lst ls -1S /scratch/hg/rn3/trfFa/* > rat.lst rm -rf psl foreach ctg (`cat rat.lst`) mkdir -p psl/$ctg:t:r end # get gsub2D from someplace gensub2 rat.lst squirt.lst gsub2D spec ssh kk cd ~/rn3/bed/blatCi1 para create spec .... 
# When cluster run is done, sort alignments: ssh kkstore cd ~/rn3/bed/blatCi1 mkdir /tmp/$LOGNAME pslSort dirs raw.psl /tmp/$LOGNAME psl/* pslReps raw.psl cooked.psl /dev/null -minAli=0.05 liftUp -nohead lifted.psl ../../jkStuff/liftAll.lft warn cooked.psl pslSortAcc nohead chrom /tmp/$LOGNAME lifted.psl # Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/rn3/bed/blatCi1/chrom rm -f chr*_blatCi1.psl foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_blatCi1.psl end hgLoadPsl -noTNameIx rn3 *.psl # Make squirt /gbdb/ symlink mkdir /gbdb/rn3/squirtSeq cd /gbdb/rn3/squirtSeq ln -s /cluster/store5/squirt/ci1/ciona.rm.fasta PRODUCING FUGU FISH ALIGNMENTS (TO DO) # (Already done, for mm2:) # Download sequence to /cluster/store3/fuguSeq from ... and put it on the # cluster local disk at /scratch/hg/fugu on kkstore. # Sequence was downloaded from: # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_mask.fasta.Z # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_prot.fasta.Z # mkdir split2.5Mb; cd split2.5Mb; # faSplit about ../fugu_v3_mask.fasta 2500000 fuguSplit ssh kkr1u00 rm -rf /iscratch/i/fugu mkdir /iscratch/i/fugu cp -p /cluster/store3/fuguSeq/split2.5Mb/*.fa /iscratch/i/fugu ~kent/bin/iSync ssh kk mkdir ~/rn3/bed/blatFugu cd ~/rn3/bed/blatFugu ls -1S /iscratch/i/fugu/* > fugu.lst ls -1S /scratch/hg/rn3/trfFa/* > rat.lst cp ~/lastRn/bed/blatFugu/gsub . 
mkdir psl
foreach f (~/rn3/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa)
    set c=$f:t:r
    mkdir psl/$c
end
gensub2 rat.lst fugu.lst gsub spec
para create spec
para try
para check
para push
para check

# Sort alignments:
ssh kkstore
cd ~/rn3/bed/blatFugu
pslCat -dir psl/* \
  | liftUp -type=.psl stdout ~/rn3/jkStuff/liftAll.lft warn stdin \
  | pslSortAcc nohead chrom /cluster/store2/temp stdin

# load into database:
ssh hgwdev
cd ~/rn3/bed/blatFugu/chrom
foreach i (chr?{,?}{,_random}.psl)
    set r = $i:r
    mv $i ${r}_blatFugu.psl
end
hgLoadPsl -noTNameIx rn3 *.psl
mkdir -p /gbdb/rn3/fuguSeq
cd /gbdb/rn3/fuguSeq
ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta
cd /cluster/store2/temp
hgLoadRna addSeq rn3 /gbdb/rn3/fuguSeq/fugu_v3_mask.fasta

MAKE LIFT FILE FOR AGPS (TO DO)
    ssh kkstore
    # NOTE(review): the relative paths below (chrom.sizes, ?{,?}/chr*.agp,
    # jkStuff/liftRNOR.lft) only resolve from ~/rn3, not ~/rn3/jkStuff,
    # so cd to ~/rn3 here (original said "cd ~/rn3/jkStuff")
    cd ~/rn3
    ./jkStuff/agpToLift.pl chrom.sizes ?{,?}/chr?{,?}{,_random}.agp \
        > jkStuff/liftRNOR.lft

LOAD BACTIG POSITIONS (DONE - 2003-07-14 - Hiram)
    ssh hgwdev
    mkdir -p ~/rn3/bed/bactigPos
    cd ~/rn3/bed/bactigPos
    awk "-F\t" '{printf "%s\t%d\t%s\t%s\t%s\t%s\n", $1, $2-1, $3, $4, $5, $6;}' < ../../bactigs.contigs > bactigPos.bed
    hgLoadBed rn3 bactigPos bactigPos.bed \
        -noBin -sqlTable=$HOME/kent/src/hg/lib/bactigPos.sql

LOAD CPGISLANDS (DONE - 2003-07-08 - Hiram)
    ssh hgwdev
    mkdir -p ~/rn3/bed/cpgIsland
    cd ~/rn3/bed/cpgIsland
    # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
    # copy the tar file to the current directory
    cp -p /cluster/store4/rn2/bed/cpgIsland/cpg_dist.tar .
    tar xvf cpg_dist.tar
    cd cpg_dist
    gcc readseq.c cpg_lh.c -o cpglh.exe
    ssh kkstore
    cd ~/rn3/bed/cpgIsland
    foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
        set fout=$f:t:r:r.cpg
        echo running cpglh on $f to $fout
        # $fout already ends in .cpg, so redirect to $fout
        # (original ">$fout.cpg" would have produced doubled
        # chrN.cpg.cpg file names, contradicting the echo above)
        ./cpg_dist/cpglh.exe $f > $fout
    end
    # copy filter.awk from a previous release
    cp -p /cluster/store4/rn2/bed/cpgIsland/filter.awk .
awk -f filter.awk chr*.cpg > cpgIsland.bed # load into database: ssh hgwdev cd ~/rn3/bed/cpgIsland hgLoadBed rn3 cpgIsland -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed LOAD SOFTBERRY GENES (DONE - 2003-07-17 - Hiram) cd /cluster/store3/rn3/bed mkdir softberry cd softberry wget \ ftp://www.softberry.com/pub/SC_RAT_JUN03/Softb_fgenesh_rat_jun03.tar.gz tar xvzf Softb_fgenesh_rat_jun03.tar.gz ldHgGene rn3 softberryGene chr*.gff # 43912 groups 22 seqs 1 sources 4 feature types # 43912 gene predictions hgPepPred rn3 softberry *.protein hgSoftberryHom rn3 *.protein LOAD GENEID GENES (DONE - 2003-07-08 - Hiram RELOADED -gtf 2004-04-02 kate) mkdir -p ~/rn3/bed/geneid/download cd ~/rn3/bed/geneid/download foreach f (~/rn3/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget http://genome.imim.es/genepredictions/R.norvegicus/rnJun2003/geneid_v1.1/$chr.gtf wget http://genome.imim.es/genepredictions/R.norvegicus/rnJun2003/geneid_v1.1/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. ldHgGene rn3 geneid download/*.gtf -gtf # 37571 gene predictions hgPepPred rn3 generic geneidPep download/*-fixed.prot SGP GENE PREDICTIONS (DONE - 2003-07-30 - Hiram) mkdir -p ~/rn3/bed/sgp/download cd ~/rn3/bed/sgp/download foreach f (~/rn3/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget http://monstre1.imim.es/genepredictions/R.norvegicus/rnJun2003/SGP/humangp20030410/$chr.gtf wget http://monstre1.imim.es/genepredictions/R.norvegicus/rnJun2003/SGP/humangp20030410/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. 
ldHgGene rn3 sgpGene download/*.gtf -exon=CDS hgPepPred rn3 generic sgpPep download/*-fixed.prot SGP GENES (UPDATE 1/18/2006) sgpPep table dropped, replaced by hgc generated protein seq in browser LOAD STS MAP (TO DO ) - login to hgwdev cd ~/rn3/bed rn3 < ~/src/hg/lib/stsMap.sql mkdir stsMap cd stsMap bedSort /projects/cc/hg/mapplots/data/tracks/build28/stsMap.bed stsMap.bed - Enter database with "rn3" command. - At mysql> prompt type in: load data local infile 'stsMap.bed' into table stsMap; - At mysql> prompt type LOAD MGI IDs (TO DO) - The Locuslink ID to MGI IDs converstion data file, LL2MGI.txt, from Jackson Lab should be found under ~/rn3/bed/refSeq - login to hgwdev cd ~/rn3/bed/refSeq rn3 < ~/src/hg/lib/mgiID.sql - Enter database with "rn3" command. - At mysql> prompt type in: load data local infile 'LL2MGI.txt' into table MGIid; - At mysql> prompt type quit LOAD RATREF TRACK (TO DO) First copy in data from kkstore to ~/rn3/bed/ratRef. Then substitute 'genome' for the appropriate chromosome in each of the alignment files. Finally do: hgRefAlign webb rn3 ratRef *.alignments LOAD AVID RAT TRACK (TO DO) ssh cc98 cd ~/rn3/bed mkdir avidRat cd avidRat wget http://pipeline.lbl.gov/tableCS-LBNL.txt hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed hgLoadBed avidRepeat avidRepeat.bed hgLoadBed avidUnique avidUnique.bed LOAD SNPS (TO DO) - ssh hgwdev - cd ~/rn3/bed - mkdir snp - cd snp - Download SNPs from ftp://ftp.ncbi.nlm.nih.gov/pub/sherry/rat.b27.out.gz - Unpack. createBed < rat.b27.out > snpNih.bed hgLoadBed rn3 snpNih snpNih.bed LOAD ENSEMBL ESTs (TO DO) ln -s /cluster/store3/rn3 ~/rn3 mkdir -p ~/rn3/bed/ensembl cd ~/rn3/bed/ensembl wget http://www.ebi.ac.uk/~stabenau/rat-est.gz wget http://www.ebi.ac.uk/~stabenau/rat-est.pep.gz gunzip -c rat-est.gz | \ perl -w -p -e 's/^(\w)/chr$1/' > rat-est-fixed.gtf ldHgGene rn3 ensEst rat-est-fixed.gtf > The id behind '>' is internal and was not in our gtf dump, so > you have to do some more parsing. 
# pick out the transcript= attribute -- that's the id to use: # also remove the first line: gunzip -c rat-est.pep.gz | tail +2 | \ perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \ rat-est-fixed.pep hgPepPred rn3 generic ensEstPep rat-est-fixed.pep #### LOAD ENSEMBL GENES (DONE - 2004-04-29 - Fan) # ensGene is needed for Superfamily track, Gene Sorter, and Proteome Browser # Ensembl released Rat build 3.1 the week of Feb 9th, 2004 mkdir /cluster/data/rn3/bed/ensembl cd /cluster/data/rn3/bed/ensembl Get the ensembl gene data from http://www.ensembl.org/ Go to the EnsMart link Choose Rattus norvegicus as the organism Follow this sequence through the pages: Page 1) Choose the Ensembl Genes choice. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. Page 3) Choose the "Structures" tab. Page 4) Choose GTF as the ouput, choose gzip compression , name the output file ensGeneRn3 and then hit Export # Ensembl handles random chromosomes differently than us, so we # strip this data. Fortunately it just loses a couple of genes. zcat ensGeneRn3.gff.gz | grep -v ^6_DR51 | grep -v _NT_ > unrandom.gtf # Let's see how much it loses: # For Rn3 there is no difference # # Add "chr" to front of each line in the gene data gtf file to make # it compatible with ldHgGene sed -e "s/^/chr/" unrandom.gtf > ensGene.gtf # change chr*.random to chr*_random perl -wpe 's/(chr.*)\.random/$1_random/' ensGene.gtf > ensGeneFixed.gtf ldHgGene rn3 ensGene ensGeneFixed.gtf # Read 28545 transcripts in 475909 lines in 1 files # 28545 groups 36 seqs 1 sources 4 feature types # 28545 gene predictions # save space, gzip them: gzip unrandom.gtf gzip ensGene.gtf o - Load Ensembl peptides: Get the ensembl protein data from http://www.ensembl.org/ Go to the EnsMart link Choose Rattus norvegicus as the organism Follow this sequence through the pages: Page 1) Choose the Ensembl Genes choice. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. 
Page 3) Choose the "Sequences" tab. Page 4) Choose Transcripts/Proteins and peptide Only as the output, choose text/fasta and gzip compression, name the file ensGeneRn3.pep and then hit export. If needed, substitute ENST for ENSP in ensPep with the program called subs edit subs.in to read: ENSP|ENST #subs -e rn3EnsGene.pep > /dev/null #delete * at end of each protein zcat ensGeneRn3.pep.fasta.gz | sed "s/\*$//" > ensembl.pep ~matt/bin/fixPep.pl ensembl.pep fixPep_ensembl.pep hgPepPred rn3 generic ensPep fixPep_ensembl.pep o - Load ensGtp table. # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" tab. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format, gzip. # Result name file as ensGtpRn3.tsv.gz gunzip ensGtpRn3.tsv.gz hgsql rn3 < ~/kent/src/hg/lib/ensGtp.sql hgsql -N -e 'load data local infile "ensGtpRn3.tsv" into table ensGtp ignore 1 lines;' rn3 LOAD RNAGENES (TO DO) - login to hgwdev - cd ~kent/src/hg/lib - rn3 < rnaGene.sql - cd /cluster/store3/rn3/bed - mkdir rnaGene - cd rnaGene - download data from ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-oo27.gff.gz - gunzip *.gz - liftUp chrom.gff ../../jkStuff/liftAll.lft carry ncrna-oo27.gff - hgRnaGenes rn3 chrom.gff LOAD EXOFISH (TO DO) (SEE ECORES TRACKS BELOW, hartera) - login to hgwdev - cd /cluster/store3/rn3/bed - mkdir exoFish - cd exoFish - rn3 < ~kent/src/hg/lib/exoFish.sql - Put email attatchment from Olivier Jaillon (ojaaillon@genoscope.cns.fr) into /cluster/store3/rn3/bed/exoFish/all_maping_ecore - awk -f filter.awk all_maping_ecore > exoFish.bed - hgLoadBed rn3 exoFish exoFish.bed LOAD GENIE (TO DO) mkdir -p ~/rn3/bed/genieAlt cd ~/rn3/bed/genieAlt wget http://www.neomorphic.com/mgap/mgscv3/gtf/mgscv3.genie.gtf.tgz gunzip -c mgscv3.genie.gtf.tgz | tar xvf - ldHgGene rn3 genieAlt mgscv3.genie.gtf/chr*.gtf 
wget http://www.neomorphic.com/mgap/mgscv3/fa/mgscv3.aa.tgz
gunzip -c mgscv3.aa.tgz | tar xvf -
hgPepPred rn3 genie geniePep chr*.aa.fa

LOAD GENIE CLONE BOUNDS (TO DO)
    mkdir -p ~/rn3/bed/genieBounds
    cd ~/rn3/bed/genieBounds
    wget http://www.neomorphic.com/mgap/mgscv3/cb.bed/mgscv3_cb.bed.tgz
    gunzip -c mgscv3_cb.bed.tgz | tar xvf -
    - Trim the track definition from each file (these are actually
      custom track files):
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un)
        tail +2 chr${c}_cb.bed > chr${c}_cb-fixed.bed
    end
    hgLoadBed rn3 genieBounds *-fixed.bed

LOAD SOFTBERRY GENES (TO DO)
    - ln -s /cluster/store3/rn3 ~/rn3
    - cd ~/rn3/bed
    - mkdir softberry
    - cd softberry
    - get ftp://www.softberry.com/pub/SC_MOU_NOV01/softb_mou_genes_nov01.tar.gz
    ldHgGene rn3 softberryGene chr*.gff
    hgPepPred rn3 softberry *.protein
    hgSoftberryHom rn3 *.protein

LOAD GENOMIC DUPES (TO DO)
    o - Load genomic dupes
    ssh hgwdev
    cd ~/rn3/bed
    mkdir genomicDups
    cd genomicDups
    wget http://codon/jab/web/takeoff/oo33_dups_for_kent.zip
    unzip *.zip
    awk -f filter.awk oo33_dups_for_kent > genomicDups.bed
    hgsql rn3 < ~/src/hg/lib/genomicDups.sql
    # (fixed typo: load the genomicDups.bed file created above,
    # original said "genomicDupes.bed" which is never created)
    hgLoadBed rn3 -oldTable genomicDups genomicDups.bed

TWINSCAN GENE PREDICTIONS (DONE 07/14/03)
    mkdir -p ~/rn3/bed/twinscan
    cd ~/rn3/bed/twinscan
    wget http://genome.cse.wustl.edu/~bio/rat/Jun03/rat_Jun03_07-11-03.tgz
    gunzip -c *.tgz | tar xvf -
    rm -r chr_tx
    # clean up chrom field of GTF files
    foreach f (chr_gtf/chr*.gtf)
        set chr = $f:t:r
        sed -e "s/^[a-zA-Z0-9]*/$chr/" $f > chr_gtf/$chr-fixed.gtf
    end
    # pare down protein FASTA header to id and add missing .a:
    foreach f (chr_ptx/chr*.ptx)
        set chr = $f:t:r
        perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+).*$/\>$1.a/;' < \
            chr_ptx/$chr.ptx > chr_ptx/$chr-fixed.fa
    end
    ldHgGene rn3 twinscan chr_gtf/chr*-fixed.gtf -exon=CDS
    hgPepPred rn3 generic twinscanPep chr_ptx/chr*-fixed.fa

AXTBEST
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh kkstore tcsh set base="/cluster/store3/rn3/bed/blastz.hg15" set tbl="blastzBestHg15" cd $base mkdir -p axtBest pslBest #chrom 1 is too big axtFilter -tStartMax=99999999 chr1.axt >chr1FirstHalf.axt axtFilter -tStartMin=100000000 chr1.axt >chr1SecondHalf.axt foreach chrdir (`cat chrom.lst`) set chr=$chrdir:t echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 end axtBest axtChrom/chr1FirstHalf.axt chr1 axtBest/chr1FirstHalf.axt -minScore=300 axtBest axtChrom/chr1SecondHalf.axt chr1 axtBest/chr1SecondHalf.axt -minScore=300 cd axtBest cat chr1FirstHalf.axt chr1SecondHalf.axt > chr1.axt rm chr1FirstHalf.axt chr1SecondHalf.axt cd ../axtChrom rm chr1FirstHalf.axt chr1SecondHalf.axt cd .. foreach chrdir (`cat chrom.lst`) set chr=$chrdir:t echo translating axtBest to psl for $chr axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store3/rn3/bed/blastz.hg15" set tbl="blastzBestHg15" cd $base/pslBest hgLoadPsl -noTNameIx hg15 chr*_${tbl}.psl # TBA ALIGNMENT - NON-MAMMAL SPECIES INCLUDED (Fugu & Chicken) 2003-10-20 kate ssh hgwdev mkdir -p /cluster/data/rn3/bed/nisc/cftr ln -s /cluster/data/nisc/targets/cftr/CFTR.non-mammal/rat.out \ /cluster/data/rn3/bed/nisc/cftr/tbaFishBird.maf set table = tbaFishBirdCFTR mkdir -p /gbdb/rn3/$table cd /gbdb/rn3/$table ln -s /cluster/data/rn3/bed/nisc/cftr/tbaFishBird.maf tba.maf cd /cluster/data/rn3/bed/nisc/cftr /cluster/bin/i386/hgLoadMaf -WARN rn3 $table # 463 warnings # 4604 rows loaded # BLASTZ MOUSE MM4 (DONE - 2003-11-03 - Hiram) ssh kk mkdir -p /cluster/data/rn3/bed/blastz.mm4.2003-10-29 cd /cluster/data/rn3/bed ln -s blastz.mm4.2003-10-29 blastz.mm4 cd blastz.mm4 cat << '_EOF_' > DEF # rat vs. 
mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Rat SEQ1_DIR=/iscratch/i/rn3/bothMaskedNibs # RMSK not currently used SEQ1_RMSK=/cluster/bluearc/rat/rn3/rmsk # FLAG not currently used SEQ1_FLAG=-rodent SEQ1_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Mouse SEQ2_DIR=/scratch/mus/mm4/softNib # RMSK not currently used SEQ2_RMSK=/scratch/mus/mm4/rmsk # FLAG not currently used SEQ2_FLAG=-rodent SEQ2_SMSK=/scratch/mus/mm4/linSpecRep.notInRat SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/rn3/bed/blastz.mm4 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # prepare first cluster run source DEF cd /cluster/data/rn3/bed/blastz.mm4 /cluster/data/mm4/jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... # Completed: 39935 of 39936 jobs # Crashed: 1 jobs # CPU time in finished jobs: 16180312s 269671.87m 4494.53h 187.27d 0.513 y # IO & Wait Time: 687304s 11455.06m 190.92h 7.95d 0.022 y # Average job time: 422s 7.04m 0.12h 0.00d # Longest job: 5808s 96.80m 1.61h 0.07d # Submission to last job: 44812s 746.87m 12.45h 0.52d # the crashed job: # /cluster/home/angie/schwartzbin/blastz-run chr5.nib 1 10010000 chr14.nib 60000001 90000000 /cluster/data/rn3/bed/blastz.mm4/DEF # seq_read(/tmp/blastz.DkVf00/s1.fa): Input/output error # try it again and it works OK # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kkr1u00 cd /cluster/data/rn3/bed/blastz.mm4 source DEF /cluster/data/mm4/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# Completed: 312 of 312 jobs # CPU time in finished jobs: 35956s 599.27m 9.99h 0.42d 0.001 y # IO & Wait Time: 15078s 251.30m 4.19h 0.17d 0.000 y # Average job time: 164s 2.73m 0.05h 0.00d # Longest job: 642s 10.70m 0.18h 0.01d # Submission to last job: 5811s 96.85m 1.61h 0.07d # Third cluster run to convert lav's to axt's ssh kk cd /cluster/data/rn3/bed/blastz.mm4 source DEF /cluster/data/mm4/jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... # Completed: 37 of 44 jobs # Crashed: 7 jobs # CPU time in finished jobs: 5646s 94.10m 1.57h 0.07d 0.000 y # IO & Wait Time: 74584s 1243.07m 20.72h 0.86d 0.002 y # Average job time: 2168s 36.14m 0.60h 0.03d # Longest job: 18498s 308.30m 5.14h 0.21d # Submission to last job: 97292s 1621.53m 27.03h 1.13d # and a recover run of the 7 failing jobs # Completed: 4 of 7 jobs # Crashed: 3 jobs # CPU time in finished jobs: 2333s 38.89m 0.65h 0.03d 0.000 y # IO & Wait Time: 32031s 533.84m 8.90h 0.37d 0.001 y # Average job time: 8591s 143.18m 2.39h 0.10d # Longest job: 9314s 155.23m 2.59h 0.11d # Submission to last job: 10283s 171.38m 2.86h 0.12d # FAILED: chr1, chr4, chr5 # try these on kolossus ssh kolossus cd /cluster/data/rn3/bed/blastz.mm4/run.2 time /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \ /cluster/data/rn3/bed/blastz.mm4/lav/chr1 \ /cluster/data/rn3/bed/blastz.mm4/axtChrom/chr1.axt \ /cluster/data/rn3/nib /cluster/data/mm4/nib time /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \ /cluster/data/rn3/bed/blastz.mm4/lav/chr4 \ /cluster/data/rn3/bed/blastz.mm4/axtChrom/chr4.axt \ /cluster/data/rn3/nib /cluster/data/mm4/nib time /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \ /cluster/data/rn3/bed/blastz.mm4/lav/chr5 \ /cluster/data/rn3/bed/blastz.mm4/axtChrom/chr5.axt \ /cluster/data/rn3/nib /cluster/data/mm4/nib # about 75 minutes in the above jobs # translate sorted axt files into psl ssh kkstore cd /cluster/data/rn3/bed/blastz.mm4 mkdir -p pslChrom set tbl = "blastzMm4" foreach f (axtChrom/chr*.axt) set c=$f:t:r 
if (-e pslChrom/${c}_${tbl}.psl) then echo "Done pslChrom/${c}_${tbl}.psl" else echo "Processing chr ${c}" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl endif end # That takes about 20 minutes # Load database tables ssh hgwdev set tbl = "blastzMm4" cd /cluster/data/rn3/bed/blastz.mm4/pslChrom /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 chr*_${tbl}.psl # real 162m7.520s # user 32m40.530s # sys 5m31.500s # check results, need to run this on kolossus to avoid an # out-of-memory problem: # featureBits rn3 blastzMm4 # 1804072721 bases of 2571104688 (70.167%) in intersection # featureBits rn3 blastzMm3 # 1669090687 bases of 2571104688 (64.917%) in intersection # CHAIN Mm4 BLASTZ (WORKING - 2003-11-03 - Hiram) # The axtChain is best run on the small kluster, or the kk9 kluster ssh kkr1u00 mkdir -p /cluster/data/rn3/bed/blastz.mm4/axtChain/run1 cd /cluster/data/rn3/bed/blastz.mm4/axtChain/run1 mkdir out chain ls -1S /cluster/data/rn3/bed/blastz.mm4/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/sh axtFilter -notQ_random $1 | axtChain stdin \ /iscratch/i/rn3/bothMaskedNibs \ /scratch/mus/mm4/softNib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain # 41 jobs gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... 
# Completed: 44 of 44 jobs # CPU time in finished jobs: 53019s 883.66m 14.73h 0.61d 0.002 y # IO & Wait Time: 33812s 563.53m 9.39h 0.39d 0.001 y # Average job time: 1973s 32.89m 0.55h 0.02d # Longest job: 22348s 372.47m 6.21h 0.26d # Submission to last job: 22348s 372.47m 6.21h 0.26d # now on the cluster server, sort chains ssh kkstore cd /cluster/data/rn3/bed/blastz.mm4/axtChain time chainMergeSort run1/chain/*.chain > all.chain # real 8m55.100s # user 6m46.680s # sys 0m58.250s time chainSplit chain all.chain # real 8m36.259s # user 5m46.920s # sys 0m59.700s # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm4/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain rn3 ${c}_chainMm4 $i echo done $c end # NET Mm4 BLASTZ (TBD) ssh kkstore cd /cluster/data/rn3/bed/blastz.mm4/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/rn3/chrom.sizes \ /cluster/data/mm4/chrom.sizes ../preNet/$i end # real 11m58.018s # user 4m10.390s # sys 2m10.780s cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/rn3/chrom.sizes \ /cluster/data/mm4/chrom.sizes ../n1/$n /dev/null end cd .. 
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 1591906304, utime 6877 s/100, stime 1977

ssh hgwdev
cd /cluster/data/rn3/bed/blastz.mm4/axtChain
# NOTE(review): the -tNewR/-qNewR paths below look copied from a
# human/mouse make doc.  For this rat(rn3)-vs-mouse(mm4) net, the
# target lineage-specific repeats should presumably be rn3's
# linSpecRep.notInMouse and the query's mm4's linSpecRep.notInRat,
# not "notInHuman" / human gs.17 paths -- verify before reusing.
time netClass hNoClass.net rn3 mm4 mouse.net \
    -tNewR=/cluster/bluearc/scratch/mus/rn3/linSpecRep.notInHuman \
    -qNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse
# real 11m35.367s
# user 7m20.880s
# sys 1m33.360s

# If things look good do
ssh kkstore
cd /cluster/data/rn3/bed/blastz.mm4/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
netFilter -syn mouse.net > mouseSyn.net

# Load the nets into database
ssh hgwdev
cd /cluster/data/rn3/bed/blastz.mm4/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet rn3 netMm4 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet rn3 syntenyNetMm4 stdin
# Add entries for net and chain to rat/rn3 trackDb

# make net
ssh kkstore
cd /cluster/data/rn3/bed/blastz.mm4/axtChain
mkdir mouseNet
time netSplit mouse.net mouseNet
# real 6m13.469s
# user 3m22.670s
# sys 0m44.880s
mkdir ../axtNet
foreach n (mouseNet/chr*.net)
    set c=$n:t:r
    echo "netToAxt: $c.net -> $c.axt"
    rm -f ../axtNet/$c.axt
    # NOTE(review): the query nib directory below is the human
    # gs.17/build34 path; for the mm4 query it should presumably be
    # /cluster/data/mm4/nib -- verify before reusing.
    netToAxt mouseNet/$c.net chain/$c.chain \
        /cluster/data/rn3/nib \
        /cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
        ../axtNet/$c.axt
    echo "Complete: $c.net -> $c.axt"
end

ssh hgwdev
mkdir -p /cluster/data/rn3/bed/blastz.mm4/axtBest
cd /cluster/data/rn3/bed/blastz.mm4/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm4/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/rn3/vsMm4/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/rn3/vsMm4/axtNet cd /usr/local/apache/htdocs/goldenPath/rn3/vsMm4/axtNet gzip *.axt # add README.txt file to dir, if needed # Convert those axt files to psl ssh kkstore cd /cluster/data/rn3/bed/blastz.mm4 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestMm4.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestMm4.psl echo "Done: ${c}_blastzBestMm4.psl" end # Load tables ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm4/pslBest time /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 chr*_blastzBestMm4.psl # real 10m47.853s # user 2m48.700s # sys 0m24.250s # check results # featureBits rn3 blastzBestMm4 # 1030510540 bases of 2627444668 (39.221%) in intersection # featureBits rn3 blastzBestMm3 # 1019460260 bases of 2505900260 (40.682%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/rn3/axtBestMm4 cd /gbdb/rn3/axtBestMm4 ln -s /cluster/data/rn3/bed/blastz.mm4/axtNet/chr*.axt . cd /cluster/data/rn3/bed/blastz.mm4/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/rn3/axtBestMm4/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('mm4','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql rn3 < ~/kent/src/hg/lib/axtInfo.sql # table axtInfo may already exist, ignore create error. 
hgsql rn3 < axtInfoInserts.sql

# MAKING THE AXTTIGHT FROM AXTBEST (TBD)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh kkstore
    cd /cluster/data/rn3/bed/blastz.mm4/axtNet
    mkdir -p ../axtTight
    tcsh
    foreach i (*.axt)
        echo $i
        subsetAxt $i ../axtTight/$i \
            ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir ../pslTight
    foreach i (*.axt)
        set c = $i:r
        axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm4.psl
        echo "Done: $i"
    end
    # Load tables into database
    ssh hgwdev
    cd /cluster/data/rn3/bed/blastz.mm4/pslTight
    hgLoadPsl -noTNameIx rn3 chr*_blastzTightMm4.psl
    # copy the axt's to download area
    cd /cluster/data/rn3/bed/blastz.mm4/axtTight
    mkdir -p /usr/local/apache/htdocs/goldenPath/rn3/vsMm4/axtTight
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/rn3/vsMm4/axtTight
    cd /usr/local/apache/htdocs/goldenPath/rn3/vsMm4/axtTight
    gzip *.axt
    # add README.txt file to dir, if needed

# MOUSE MM4 BLASTZ CLEANUP (WORKING 2004-09-20 kate)
    ssh eieio
    # (fixed copy/paste slip: this is rn3's blastz.mm4 directory;
    # the original said /cluster/data/hg17/bed/blastz.mm4)
    cd /cluster/data/rn3/bed/blastz.mm4
    nice rm -rf raw lav
    nice rm axtChain/run1/chain/* axtChain/preNet axtChain/hNoClass.net
# GOT HERE
    nice gzip axtChrom/*.axt axtChain/{all.chain,*.net} axtChain/*Net/*.net
    nice gzip pslChrom/*.psl axtBest/*.axt

# LOAD ECgene tables (re-done with sort (braney, 2004-01-30))
    cd /cluster/data/rn3/bed
    rm -f ECgene
    mkdir ECgene.2003-12-19
    ln -s ECgene.2003-12-19 ECgene
    cd ECgene
    wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_rn3_v1.1_25oct2003_genes.txt.gz"
    wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_rn3_v1.1_25oct2003_genepep.txt.gz"
    gunzip *.gz
    ldHgGene -predTab rn3 ECgene ECgene_rn3_v1.1_25oct2003_genes.txt
    hgPepPred rn3 tab ECgenePep ECgene_rn3_v1.1_25oct2003_genepep.txt
    rm genePred.tab
    gzip *

LOAD SNPS (Done.
Daryl Thomas; February 18, 2004) # SNP processing has been condensed into a single script, # which makes snpNih, snpTsc, and snpMap # ${HOME}/kent/src/hg/snp/locations/processSnpLocations.csh # snpBuild = 119 # Run from directory $oo/bed/snp/build$snpBuild/snpMap mkdir -p $oo/bed/snp/build$snpBuild/snpMap cd $oo/bed/snp/build$snpBuild/snpMap processSnpLocations.csh rn3 rat 2 119 >& log & # check data: # wc -l snpNih.bed; rn3 -e "select count(*) from snpNih; # wc -l snpMap.bed; rn3 -e "select count(*) from snpMap; # select * from snpNih limit 5; desc snpNih; show indexes from snpNih" # select * from snpMap limit 5; desc snpMap; show indexes from snpMap" # remove temp files # rm rat* *bed.gz LOAD SNP DETAILS (Done. Daryl Thomas; February 18, 2004) # SNP processing has been condensed into a single script, # which makes dbSnpRsMm # ${HOME}/kent/src/hg/snp/details/processSnpDetails.csh # snpBuild = 119 # Run from directory $oo/bed/snp/build$snpBuild/snpMap mkdir -p $oo/bed/snp/build$snpBuild/details/Done mkdir -p $oo/bed/snp/build$snpBuild/details/Observed cd $oo/bed/snp/build$snpBuild/details processSnpDetails.csh rn3 rat 119 >& log & load data local infile "$fileBase.out" into table $database.$table gzip $fileBase.out # check data: # hgFixed -e "select count(*) from dbSnpRsRn; # select * from dbSnpRsRn limit 5; desc dbSnpRsRn; show indexes from dbSnpRSRn" # remove temp files # rm dbSnpRs* # ECORES FROM GENOSCOPE [DONE, hartera, 2004-03-31] # download data from http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecores# ecotigRF - ecores on Rat, genome conserved with Fugu # ecotigRT - ecores on Rat, genome conserved with Tetraodon ssh hgwdev mkdir /cluster/data/rn3/bed/ecores/ # add parse_ecotig.pl to this directory # FUGU mkdir /cluster/data/rn3/bed/ecores/fr1 cd /cluster/data/rn3/bed/ecores/fr1/ # download data for ecotigRF to this directory # parse ecotig files to produce a bed format file perl ../parse_ecotig.pl < ecotigRF > ecotigRF.bed # change from upper to 
lower case for "CHR"
    perl -pi.bak -e 's/CHR/chr/g' ecotigRF.bed
    hgLoadBed -tab rn3 ecoresFr1 ecotigRF.bed
    # TETRAODON
    mkdir /cluster/data/rn3/bed/ecores/tetraodon
    cd /cluster/data/rn3/bed/ecores/tetraodon/
    # download data for ecotigRT to this directory
    # parse ecotig files to produce a bed format file
    perl ../parse_ecotig.pl < ecotigRT > ecotigRT.bed
    # change from upper to lower case for "CHR"
    perl -pi.bak -e 's/CHR/chr/g' ecotigRT.bed
    hgLoadBed -tab rn3 ecoresTetraodon ecotigRT.bed
    # clean up
    rm *.bak
    # add entries in kent/src/hg/makeDb/trackDb/rat/rn3/trackDb.ra
    # add html for details pages to this directory:
    # ecoresFr1.html and ecoresTetraodon.html

# MAKE RAT 11.OOC FILE [DONE, 2004-05-04, hartera]
# OOC FILE REQUIRED FOR BLAT
# Use -repMatch=1024 (based on genome size, for human use 1024)
    ssh kkr1u00
    mkdir -p /cluster/bluearc/rat/rn3
    mkdir -p /cluster/data/rn3/bed/ooc
    cd /cluster/data/rn3/bed/ooc
    mkdir -p /cluster/bluearc/rat/rn3/softNib
    cp -p /cluster/data/rn3/softNib/chr*.nib /cluster/bluearc/rat/rn3/softNib
    ls -1 /cluster/data/rn3/softNib/chr*.nib > nib.lst
    /cluster/bin/i386/blat nib.lst /dev/null /dev/null -tileSize=11 \
        -makeOoc=/cluster/bluearc/rat/rn3/11.ooc -repMatch=1024
    # Writing /cluster/bluearc/rat/rn3/11.ooc
    # Wrote 25600 overused 11-mers to /cluster/bluearc/rat/rn3/11.ooc
    cp -p /cluster/bluearc/rat/rn3/11.ooc rn3_11.ooc
    cp -p /cluster/bluearc/rat/rn3/11.ooc /iscratch/i/rn3/rn3_11.ooc
    iSync

# Affymetrix RAE230 GeneChip track - affyRAE230
# RE-DO alignments using rat specific 11.ooc file
# [DONE, hartera, 2004-05-04]
    mkdir /projects/compbio/data/microarray/affyRat
    # Download RAE230A and RAE230B consensus sequences from Affymetrix web site
    # http://www.affymetrix.com/support/technical/byproduct.affx?product=rae230
    # check for duplicate probes: 100 from 136745_at to 1367551_a_at
    # remove "consensus:" and ";" from FASTA headers to shorten probeset
    # names for database
    sed -e 's/consensus://' RAE230A_consensus | sed -e 's/;/ /' > RAE230_all.fa
    sed -e
's/consensus://' RAE230B_consensus | sed -e 's/;/ /' >> RAE230_all.fa # copy sequences to a directory visible on kkr1u00 cp /projects/compbio/data/microarray/affyRat/RAE230_all.fa /cluster/bluearc/affy/ # Set up cluster job to align RAE230 consensus sequences to rn3 ssh kkr1u00 cd /cluster/data/rn3/bed mkdir affyRat.2004-05-04 cd affyRat.2004-05-04 mkdir -p /iscratch/i/affy cp /cluster/bluearc/affy/RAE230_all.fa /iscratch/i/affy iSync ssh kk cd /cluster/data/rn3/bed/affyRat.2004-05-04 ls -1 /iscratch/i/affy/RAE230_all.fa > affy.lst ls -1 /scratch/rat/rn3/trfFa/ > allctg.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/rn3/rn3_11.ooc /scratch/rat/rn3/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 allctg.lst affy.lst template.sub para.spec mkdir psl para create para.spec # Actually do the job with usual para try/check/push/time etc. # para time 2004-05-04 #Completed: 591 of 591 jobs # CPU time in finished jobs: 9950s 165.83m 2.76h 0.12d 0.000 y # IO & Wait Time: 5522s 92.04m 1.53h 0.06d 0.000 y # Average job time: 26s 0.44m 0.01h 0.00d # Longest job: 49s 0.82m 0.01h 0.00d # Submission to last job: 165s 2.75m 0.05h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyRAE230.psl pslSort dirs raw.psl tmp psl # only use alignments that cover 30% of sequence and have at least # 95% identity in aligned region. minAli = 0.97 too high. 
    # low minCover as a lot of n's in these sequences
    pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    liftUp affyRAE230.psl ../../jkStuff/liftAll.lft warn contig.psl
    # shorten names in psl file
    sed -e 's/RAE230//' affyRAE230.psl > affyRAE230.psl.bak
    mv affyRAE230.psl.bak affyRAE230.psl
    # load track into database
    ssh hgwdev
    cd /cluster/data/rn3/bed/affyRat.2004-05-04
    hgsql -e 'drop table affyRAE230' rn3
    hgsql -e 'select * from history' rn3 | less
    # affyRAE230 was added as entry ix = 23
    # so add errata to say these alignments have been removed
    hgsql -e 'update history set errata = "Removed alignmnents and re-align sequences using rat specific 11.ooc file for Blat" where ix = 23' rn3
    hgLoadPsl rn3 affyRAE230.psl
    # Add consensus sequences for RAE230
    # Copy sequences to gbdb if they are not there already
    mkdir -p /gbdb/hgFixed/affyProbes
    cd /gbdb/hgFixed/affyProbes
    ln -s /projects/compbio/data/microarray/affyRat/RAE230_all.fa .
    # sequences are still the same so no need to reload these
    hgLoadSeq -abbr=RAE230 rn3 /gbdb/hgFixed/affyProbes/RAE230_all.fa
    # Clean up
    rm -r psl tmp err batch.bak contig.psl raw.psl seq.tab psl.tab

# UPDATE knownToRAE230 TABLE - USED BY THE FAMILY BROWSER [DONE, 2004-05-04, hartera]
# knownToRAE230 depends on affyRAE230 so update this table
    ssh hgwdev
    cd /cluster/data/rn3/bed/affyRat.2004-05-04
    hgsql -e 'drop table knownToRAE230' rn3
    # create and load knownToRAE230 (fixed truncated table name:
    # original said "knownToRAE23", but this section drops and
    # rebuilds knownToRAE230)
    hgMapToGene rn3 affyRAE230 knownGene knownToRAE230

# gc5Base wiggle TRACK (DONE - 2004-04-07 - Hiram)
# Perform a gc count with a 5 base
# window. Also compute a "zoomed" view for display efficiency.
    mkdir /cluster/data/rn3/bed/gc5Base
    cd /cluster/data/rn3/bed/gc5Base
    # in the script below, the 'grep -w GC' selects the lines of
    # output from hgGcPercent that are real data and not just some
    # information from hgGcPercent.
The awk computes the number # of bases that hgGcPercent claimed it measured, which is not # necessarily always 5 if it ran into gaps, and then the division # by 10.0 scales down the numbers from hgGcPercent to the range # [0-100]. Two columns come out of the awk print statement: # and which are fed into wigAsciiToBinary through # the pipe. It is set at a dataSpan of 5 because each value # represents the measurement over five bases beginning with # . The result files end up in ./wigData5. cat << '_EOF_' > runGcPercent.sh #!/bin/sh mkdir -p wigData5 mkdir -p dataLimits5 for n in ../../nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 rn3 ../../nib | grep -w GC | \ awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \ wigAsciiToBinary \ -dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \ -name=${C} stdin 2> dataLimits5/${c} echo "done" done '_EOF_' # << emacs chmod +x runGcPercent.sh # This is going to take perhaps two hours to run. It is a lot of # data. make sure you do it on the fileserver: ssh eieio cd /cluster/data/rn3/bed/gc5Base ./runGcPercent.sh # load the .wig files back on hgwdev: ssh hgwdev cd /cluster/data/rn3/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/rn3/wib/gc5Base rn3 gc5Base wigData5/*.wig # and symlink the .wib files into /gbdb mkdir /gbdb/rn3/wib/gc5Base ln -s `pwd`/wigData5/*.wib /gbdb/rn3/wib/gc5Base # to speed up display for whole chromosome views, compute a "zoomed" # view and load that on top of the existing table. The savings # comes from the number of data table rows the browser needs to load # for a full chromosome view. Without the zoomed view there are # over 43,000 data rows for chrom 1. With the zoomed view there are # only 222 rows needed for the display. If your original data was # at 1 value per base the savings would be even greater. 
# Pretty much the same data calculation # situation as above, although this time note the use of the # 'wigZoom -dataSpan=1000 stdin' in the pipeline. This will average # together the data points coming out of the awk print statment over # a span of 1000 bases. Thus each coming out of wigZoom # will represent the measurement of GC in the next 1000 bases. Note # the use of -dataSpan=1000 on the wigAsciiToBinary to account for # this type of data. You want your dataSpan here to be an exact # multiple of your original dataSpan (5*200=1000) and on the order # of at least 1000, doesn't need to go too high. For data that is # originally at 1 base per value, a convenient span is: -dataSpan=1024 # A new set of result files ends up in ./wigData5_1K/*.wi[gb] cat << '_EOF_' > runZoom.sh #!/bin/sh mkdir -p wigData5_1K mkdir -p dataLimits5_1K for n in ../../nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 rn3 ../../nib | grep -w GC | \ awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \ wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \ -dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \ -name=${C} stdin 2> dataLimits5_1K/${c} echo "done" done '_EOF_' # << emacs chmod +x runZoom.sh # This is going to take even longer than above, certainly do this # on the fileserver ssh eieio time ./runZoom.sh real 232m3.265s user 302m37.050s sys 16m13.770s # Then load these .wig files into the same database as above ssh hgwdev hgLoadWiggle -pathPrefix=/gbdb/rn3/wib/gc5Base -oldTable rn3 gc5Base \ wigData5_1K/*.wig # and symlink these .wib files into /gbdb ln -s `pwd`/wigData5_1K/*.wib /gbdb/rn3/wib/gc5Base # Affymetrix RG-U34A GeneChip track - affyRGU34A # RG-U34A is the chip used for the GNF Atlas 2 expression data for Rat # (DONE, 2004-05-11, hartera) mkdir -p /projects/compbio/data/microarray/affyRat # Download RG-U34A exemplar sequences from Affymetrix web site 
# http://www.affymetrix.com/support/technical/byproduct.affx?product=rgu34 # remove "consensus:" and ";" from FASTA headers to shorten probeset # names for database sed -e 's/exemplar://' RG-U34A_exemplar | sed -e 's/;/ /' > RG-U34A_all.fa # copy sequences to a directory visible on kkr1u00 cp /projects/compbio/data/microarray/affyRat/RG-U34A_all.fa \ /cluster/bluearc/affy/ # Set up cluster job to align RG-U34A exemplar sequences to rn3 ssh kkr1u00 cd /cluster/data/rn3/bed mkdir affyU34A cd affyU34A mkdir -p /iscratch/i/affy cp /cluster/bluearc/affy/RG-U34A_all.fa /iscratch/i/affy iSync ssh kk cd /cluster/data/rn3/bed/affyU34A ls -1 /iscratch/i/affy/RG-U34A_all.fa > affy.lst ls -1 /scratch/rat/rn3/trfFa/ > allctg.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/rn3/rn3_11.ooc /scratch/rat/rn3/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 allctg.lst affy.lst template.sub para.spec mkdir psl para create para.spec # Actually do the job with usual para try/check/push/time etc. # para time 2004-05-11 #Completed: 591 of 591 jobs # CPU time in finished jobs: 4705s 78.42m 1.31h 0.05d 0.000 y # IO & Wait Time: 6498s 108.29m 1.80h 0.08d 0.000 y # Average job time: 19s 0.32m 0.01h 0.00d # Longest job: 38s 0.63m 0.01h 0.00d # Submission to last job: 481s 8.02m 0.13h 0.01d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyRGU34A.psl pslSort dirs raw.psl tmp psl # only use alignments that cover 30% of sequence and have at least # 95% identity in aligned region. minAli = 0.97 too high. 
# low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp affyU34A.psl ../../jkStuff/liftAll.lft warn contig.psl # shorten names in psl file sed -e 's/RG-U34A://' affyU34A.psl > affyU34A.psl.bak mv affyU34A.psl.bak affyU34A.psl # load track into database ssh hgwdev cd /cluster/data/rn3/bed/affyU34A hgLoadPsl rn3 affyU34A.psl # Add exemplar sequences for RG-U34A # Copy sequences to gbdb if they are not there already mkdir -p /gbdb/hgFixed/affyProbes cd /gbdb/hgFixed/affyProbes ln -s /projects/compbio/data/microarray/affyRat/RG-U34A_all.fa . # sequences are still the same so no need to reload these cd /cluster/data/rn3/bed/affyU34A hgLoadSeq -abbr=RG-U34A: rn3 /gbdb/hgFixed/affyProbes/RG-U34A_all.fa # Clean up rm -r psl tmp err batch.bak contig.psl raw.psl seq.tab psl.tab # add trackDb.ra entry and affyU34A.html # CREATE knownToU34A TABLE - USED BY THE FAMILY BROWSER # [DONE, 2004-05-12, hartera] ssh hgwdev cd /cluster/data/rn3/bed/affyU34A # create and load knownToU34A hgMapToGene rn3 affyU34A knownGene knownToU34A # ADD EXPRESSION DATA TRACK BY MERGING EXPRESSION AND ALIGNMENT DATA # GNF ATLAS 2 TRACK (DONE, 2004-05-25, hartera) ssh hgwdev cd /cluster/data/rn3/bed/affyU34A hgMapMicroarray gnfRatAtlas2.bed hgFixed.gnfRatAtlas2MedianRatio \ affyU34A.psl # Loaded 8022 rows of expression data from hgFixed.gnfRatAtlas2MedianRatio # Mapped 7442, multiply-mapped 239, missed 694, unmapped 580 hgLoadBed rn3 gnfAtlas2 gnfRatAtlas2.bed ######## PROTEOME BROWSER BUILD ####### (PARTIALLY DONE - 2004-04-14 - Fan) o Create the working directory mkdir /cluster/store3/rn3/bed/pb cd /cluster/store3/rn3/bed/pb o save old PB tables (if they exist) to the backup DB, kgRn3Sav From mysql prompt: alter table rn3.pepCCntDist rename as kgRn3Sav.pepCCntDist; alter table rn3.pepExonCntDist rename as kgRn3Sav.pepExonCntDist; alter table rn3.pepHydroDist rename as kgRn3Sav.pepHydroDist; alter table 
rn3.pepIPCntDist rename as kgRn3Sav.pepIPCntDist; alter table rn3.pepMolWtDist rename as kgRn3Sav.pepMolWtDist; alter table rn3.pepMwAa rename as kgRn3Sav.pepMwAa; alter table rn3.pepPi rename as kgRn3Sav.pepPi; alter table rn3.pepPiDist rename as kgRn3Sav.pepPiDist; alter table rn3.pepResDist rename as kgRn3Sav.pepResDist; alter table rn3.pbAaDistA rename as kgRn3Sav.pbAaDistA; alter table rn3.pbAaDistC rename as kgRn3Sav.pbAaDistC; alter table rn3.pbAaDistD rename as kgRn3Sav.pbAaDistD; alter table rn3.pbAaDistE rename as kgRn3Sav.pbAaDistE; alter table rn3.pbAaDistF rename as kgRn3Sav.pbAaDistF; alter table rn3.pbAaDistG rename as kgRn3Sav.pbAaDistG; alter table rn3.pbAaDistH rename as kgRn3Sav.pbAaDistH; alter table rn3.pbAaDistI rename as kgRn3Sav.pbAaDistI; alter table rn3.pbAaDistK rename as kgRn3Sav.pbAaDistK; alter table rn3.pbAaDistL rename as kgRn3Sav.pbAaDistL; alter table rn3.pbAaDistM rename as kgRn3Sav.pbAaDistM; alter table rn3.pbAaDistN rename as kgRn3Sav.pbAaDistN; alter table rn3.pbAaDistP rename as kgRn3Sav.pbAaDistP; alter table rn3.pbAaDistQ rename as kgRn3Sav.pbAaDistQ; alter table rn3.pbAaDistR rename as kgRn3Sav.pbAaDistR; alter table rn3.pbAaDistS rename as kgRn3Sav.pbAaDistS; alter table rn3.pbAaDistT rename as kgRn3Sav.pbAaDistT; alter table rn3.pbAaDistV rename as kgRn3Sav.pbAaDistV; alter table rn3.pbAaDistW rename as kgRn3Sav.pbAaDistW; alter table rn3.pbAaDistY rename as kgRn3Sav.pbAaDistY; alter table rn3.pbAnomLimit rename as kgRn3Sav.pbAnomLimit; alter table rn3.pbResAvgStd rename as kgRn3Sav.pbResAvgStd; alter table rn3.pbStamp rename as kgRn3Sav.pbStamp; o Define pep* tables in rn3 DB cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql First edit out pepPred table definition, then hgsql rn3 < pepAll.sql o Build the pepMwAa table hgsql proteins040315 -e "select info.acc, molWeight, aaSize from sp040315.info, sp040315.accToTaxon where accToTaxon.taxon=10116 and accToTaxon.acc = info.acc" > pepMwAa.tab hgsql rn3 < protAcc.lis pbCalPi 
protAcc.lis sp040315 pepPi.tab hgsql rn3 <pbAaAll.sql hgsql rn3 < pbAaAll.sql From mysql prompt: use rn3; load data local infile "pbAaDistW.tab" into table rn3.pbAaDistW; load data local infile "pbAaDistC.tab" into table rn3.pbAaDistC; load data local infile "pbAaDistM.tab" into table rn3.pbAaDistM; load data local infile "pbAaDistH.tab" into table rn3.pbAaDistH; load data local infile "pbAaDistY.tab" into table rn3.pbAaDistY; load data local infile "pbAaDistN.tab" into table rn3.pbAaDistN; load data local infile "pbAaDistF.tab" into table rn3.pbAaDistF; load data local infile "pbAaDistI.tab" into table rn3.pbAaDistI; load data local infile "pbAaDistD.tab" into table rn3.pbAaDistD; load data local infile "pbAaDistQ.tab" into table rn3.pbAaDistQ; load data local infile "pbAaDistK.tab" into table rn3.pbAaDistK; load data local infile "pbAaDistR.tab" into table rn3.pbAaDistR; load data local infile "pbAaDistT.tab" into table rn3.pbAaDistT; load data local infile "pbAaDistV.tab" into table rn3.pbAaDistV; load data local infile "pbAaDistP.tab" into table rn3.pbAaDistP; load data local infile "pbAaDistG.tab" into table rn3.pbAaDistG; load data local infile "pbAaDistE.tab" into table rn3.pbAaDistE; load data local infile "pbAaDistA.tab" into table rn3.pbAaDistA; load data local infile "pbAaDistL.tab" into table rn3.pbAaDistL; load data local infile "pbAaDistS.tab" into table rn3.pbAaDistS; o Create pbAnomLimit and pbResAvgStd tables hgsql rn3 < ~/src/hg/lib/pbAnomLimit.sql hgsql rn3 < ~/src/hg/lib/pbResAvgStd.sql hgsql rn3 -e 'load data local infile "pbResAvgStd.tab" into table rn3.pbResAvgStd;' hgsql rn3 -e 'load data local infile "pbAnomLimit.tab" into table rn3.pbAnomLimit;' o Download data files from Ensembl and create ensGeneXref and ensTranscript tables mkdir /cluster/bluearc/fan/ensembl/19_3bR cd /cluster/bluearc/fan/ensembl/19_3bR wget --timestamping \ "ftp://ftp.ensembl.org/pub/rat-19.3b/data/mysql/rattus_norvegicus_lite_19_3b/gene_xref.txt.table.gz" wget 
--timestamping \ "ftp://ftp.ensembl.org/pub/rat-19.3b/data/mysql/rattus_norvegicus_lite_19_3b/transcript.txt.table.gz" wget --timestamping \ "ftp://ftp.ensembl.org/pub/rat-19.3b/data/mysql/rattus_norvegicus_lite_19_3b/rattus_norvegicus_lite_19_3b.sql.gz" gzip -d *.gz If rn3.ensGeneXref table already exists alter table rn3.ensGeneXref rename as kgRn3Sav.ensGeneXref; ENSEMBL IS KNOWN FOR CHANGING THEIR TABLE DEFINITION AND CONTENT!!! So, first check if the table definition of gene_xref in rattus_norvegicus_lite_19_3b.sql is the same as ~/src/hg/lib/ensGeneXref.sql. If they are not the same, some programming change to hgTracks.c and possibly other programs may be needed. If they are the same, create ensGeneXref table in rn3 by: hgsql rn3 < ~/src/hg/lib/ensGeneXref.sql hgsql rn3 -e 'load data local infile "gene_xref.txt.table" into table rn3.ensGeneXref;' If rn3.ensTranscript exists alter table rn3.ensTranscript rename as kgRn3Sav.ensTranscript; Again, check if the table definition of transcript in rattus_norvegicus_lite_19_3b.sql.gz is the same as ~/src/hg/lib/ensTranscript.sql. If they are not the same, some programming change to hgTracks.c and possibly other programs may be needed. If they are the same, create ensTranscript table in rn3 by: hgsql rn3 < ~/src/hg/lib/ensTranscript.sql hgsql rn3 -e 'load data local infile "transcript.txt.table" into table rn3.ensTranscript;' o Download Superfamily data files and build the Superfamily DB Make sure an index on id is created for the des table of superfam040321. Skip this step if it is already done. 
mkdir /cluster/store1/superFamily/040321 cd /cluster/store1/superFamily/040321 ftp over the following two files: ass_28-Mar-2004.tab.gz supfam_21-Mar-2004.sql.gz hgsql superfam040321 < ~/src/hg/lib/sfAssign.sql hgsql superfam040321 -e \ 'load data local infile "ass_28-Mar-2004.tab" into table superfam040321.sfAssign;' o Load sfAssign and sfDes tables into rn3, they are needed by PB and Superfamily track hgsql rn3 < ~/src/hg/lib/sfAssign.sql hgsql rn3 -e 'load data local infile "ass_28-Mar-2004.tab" into table rn3.sfAssign;' If rn3.sfDes already exists, alter table rn3.sfDes rename as kgRn3Sav.sfDes; hgsql superfam040321 -e "select * from des" >des.tab hgsql rn3 < ~/src/hg/lib/sfDes.sql hgsql rn3 -e 'load data local infile "des.tab" into table rn3.sfDes ignore 1 lines' o Build or rebuild Superfamily track (Superfamily track needs Ensembl Genes track (ensGene table) to be built first) If rn3.superfamily already exists alter table rn3.superfamily rename as kgRn3Sav.superfamily; Go to rn3 build working directory. hgSuperfam rn3 > sf.log It is normal that many proteins do not have corresponding Superfamily entries. If rn3.sfDescription exists, alter table rn3.sfDescription rename as kgRn3Sav.sfDescription; hgsql rn3 < ~/src/hg/lib/sfDescription.sql hgsql rn3 -e 'LOAD DATA local INFILE "sfDescription.tab" into table rn3.sfDescription;' Finally, load the superfamily table. hgLoadBed rn3 superfamily superfamily.tab -tab o Create pbStamp table for PB If rn3.pbStamp already exists, hgsql rn3 -e "alter table rn3.pbStamp rename as kgRn3Sav.pbStamp;" Create pbStamp table hgsql rn3 < ~/src/hg/lib/pbStamp.sql cd ~/kent/src/hg/protein/pbTracks hgsql rn3 -e 'load data local infile "pbStampRn3.tab" into table rn3.pbStamp;' o Enable Proteome Browser if ~/kent/src/hg/hgGene/hgGeneData/Rat/links.ra does not exist: cd ~/kent/src/hg/hgGene/hgGeneData/Rat mkdir rn3 cd rn3 Create links.ra with the following content ----------------------------- # rn3 link info. 
name protBrowser shortLabel Proteome Browser tables kgXref idSql select spDisplayID from kgXref where kgID = '%s' nameSql select spId from kgXref where kgID = '%s' url ../cgi-bin/pbTracks?proteinID=%s hgsid on dbInUrl on priority 12 ----------------------------- cd ../.. make o Adjust drawing parameters for Proteome Browser stamps Now invoke Proteome Browser and adjust various drawing parameters (mostly the ymax of each stamp) by updating the pbStampMouse.tab file and then reload the pbStamp table. o Perform preliminary review of Proteome Browser for rn3 o Notify QA for formal review, when all PB build steps done. ####### BUILD/REBUILD RGD TRACKS (DONE 5/26/04 Fan) ############## # download data files from RGD wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/RGD_EST.gff wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/RGD_QTL.gff wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/RGD_QTL.gff wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/RGD_RH3.4_Markers.gff wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/RGD_SSLP.gff wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/RGD_curated_genes.gff wget ftp://rgd.mcw.edu:21/pub/RGD_genome_annotations/V3.1/GFF_files/chr_coordinates.gff # create rgdGeneLink.tab echo "Processing RGD_curated_genes.gff ..." 
awk '{print $12"\t"$10"\t"$14}' RGD_curated_genes.gff | sed -e 's/"//g' | sed -e 's/RGD://g'|sed -e 's/;//g' |sort -u > rgdGeneLink.tab # load rgdGeneLink table hgsql rn3 -e "drop table rn3.rgdGeneLink" hgsql rn3 < ~/kent/src/hg/lib/rgdGeneLink.sql hgsql rn3 -e 'load data local infile "rgdGeneLink.tab" into table rn3.rgdGeneLink;' # create rgdGene.tab hgsql rn3 -e "select refGene.name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from refGene,rgdGeneLink where refGene.name=rgdGeneLink.refSeq;" >rgdGene.tab # load rgdGene table hgsql rn3 -e "drop table rn3.rgdGene" hgsql rn3 < ~/kent/src/hg/lib/rgdGene.sql hgsql rn3 -e 'load data local infile "rgdGene.tab" into table rn3.rgdGene ignore 1 lines;' # check rgdGene table echo "Performing checkTableCoords rn3 rgdGene ..." checkTableCoords rn3 rgdGene # create rgdEstLink.tab echo "Processing RGD_EST.gff ..." awk '{print $12"\t"$10"\t"$14}' RGD_EST.gff | sed -e 's/"//g' | sed -e 's/RGD://g'|sed -e 's/;//g' |sort -u > rgdEstLink.tab # load rgdEstLink table hgsql rn3 -e "drop table rn3.rgdEstLink" hgsql rn3 < ~/kent/src/hg/lib/rgdEstLink.sql hgsql rn3 -e 'load data local infile "rgdEstLink.tab" into table rn3.rgdEstLink;' # create rgdEst.tab hgsql rn3 -e "select all_est.* from all_est,rgdEstLink where all_est.qName=rgdEstLink.name;" >rgdEst.tab # load rgdEst table hgsql rn3 -e "drop table rn3.rgdEst" hgsql rn3 < ~/kent/src/hg/lib/rgdEst.sql hgsql rn3 -e 'load data local infile "rgdEst.tab" into table rn3.rgdEst ignore 1 lines;' # check rgdEst table echo "Performing checkTableCoords rn3 rgdEst ..." checkTableCoords rn3 rgdEst # create rgdQtl.tab echo "Processing RGD_QTL.gff ..." 
awk '{print $1"\t"$4-1"\t"$5"\t"$10}' RGD_QTL.gff |sed -e 's/Chr/chr/g'| sed -e 's/"//g' | sed -e 's/RGD://g' | sed -e 's/;//g' > rgdQtl.tab # create rgdQtlLink.tab awk '{printf "%s\t%s\t", $12, $10; for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' RGD_QTL.gff | sed -e 's/"//g' | sed -e 's/RGD://g' | sed -e 's/;//g' > rgdQtlLink.tab # load rgdQtl table hgLoadBed rn3 rgdQtl rgdQtl.tab # check rgdQtl table echo "Performing checkTableCoords rn3 rgdQtl ..." checkTableCoords rn3 rgdQtl # load rgdQtlLink table hgsql rn3 -e "drop table rn3.rgdQtlLink;" hgsql rn3 <~/kent/src/hg/lib/rgdQtlLink.sql hgsql rn3 -e 'load data local infile "rgdQtlLink.tab" into table rn3.rgdQtlLink;' # create rgdSslp.tab echo "Processing RGD_SSLP.gff ..." awk '{print $1"\t"$4-1"\t"$5"\t"$10}' RGD_SSLP.gff|sed -e 's/Chr/chr/g'| sed -e 's/;//g'|sed -e 's/"//g'|sed -e 's/RGD://g' >rgdSslp.tab # create rgdSslpLink.tab awk '{print $12"\t"$10}' RGD_SSLP.gff|sed -e 's/"//g'|sed -e 's/RGD://g'|sed -e 's/;//g' >rgdSslpLink.tab # load rgdSslp table hgLoadBed rn3 rgdSslp rgdSslp.tab # check rgdSslp table echo "Performing checkTableCoords rn3 rgdSslp ..." checkTableCoords rn3 rgdSslp # load rgdSslpLink table hgsql rn3 -e "drop table rn3.rgdSslpLink" hgsql rn3 < ~/kent/src/hg/lib/rgdSslpLink.sql hgsql rn3 -e 'load data local infile "rgdSslpLink.tab" into table rn3.rgdSslpLink;' # BLASTZ HUMAN HG17 (DONE - 2004-06-14 - Hiram) ssh kk mkdir /cluster/data/rn3/bed/blastz.hg17.2004-06-13 cd /cluster/data/rn3/bed ln -s blastz.hg17.2004-06-13 blastz.hg17 cd blastz.hg17 cat << '_EOF_' > DEF # rat vs. 
human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Rat SEQ1_DIR=/iscratch/i/rn3/bothMaskedNibs # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Human SEQ2_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/rn3/bed/blastz.hg17 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # prepare first cluster run ssh kk cd /cluster/data/rn3/bed/blastz.hg17 source DEF # This is a generic script and works for any assembly /cluster/data/hg17/jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... # Completed: 42744 of 42744 jobs # CPU time in finished jobs: 15578956s 259649.27m 4327.49h 180.31d 0.494 y # IO & Wait Time: 924985s 15416.41m 256.94h 10.71d 0.029 y # Average job time: 386s 6.44m 0.11h 0.00d # Longest job: 4140s 69.00m 1.15h 0.05d # Submission to last job: 19077s 317.95m 5.30h 0.22d # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kki cd /cluster/data/rn3/bed/blastz.hg17 source DEF # This is a generic script and works for any assembly /cluster/data/hg17/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# Completed: 312 of 312 jobs # CPU time in finished jobs: 1902s 31.70m 0.53h 0.02d 0.000 y # IO & Wait Time: 4320s 72.00m 1.20h 0.05d 0.000 y # Average job time: 20s 0.33m 0.01h 0.00d # Longest job: 85s 1.42m 0.02h 0.00d # Submission to last job: 453s 7.55m 0.13h 0.01d # Third cluster run to convert lav's to axt's source DEF cd /cluster/data/rn3/bed/blastz.hg17 # This is a generic script and works for any assembly /cluster/data/hg17/jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... # Completed: 44 of 44 jobs # CPU time in finished jobs: 427s 7.11m 0.12h 0.00d 0.000 y # IO & Wait Time: 4707s 78.45m 1.31h 0.05d 0.000 y # Average job time: 117s 1.94m 0.03h 0.00d # Longest job: 536s 8.93m 0.15h 0.01d # Submission to last job: 536s 8.93m 0.15h 0.01d # translate sorted axt files into psl ssh eieio cd /cluster/data/rn3/bed/blastz.hg17 mkdir pslChrom set tbl = "blastzHg17" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # That takes about 30 minutes # Load database tables ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/pslChrom # this is a 55 minute job for I in *.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 ${I} echo "done: ${I}" done # CHAIN HG17 BLASTZ (DONE - 2004-06-14 - Hiram) # The axtChain is best run on the small kluster, or the kk9 kluster ssh kki mkdir -p /cluster/data/rn3/bed/blastz.hg17/axtChain/run1 cd /cluster/data/rn3/bed/blastz.hg17/axtChain/run1 mkdir out chain ls -1S /cluster/data/rn3/bed/blastz.hg17/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ_random $1 | axtChain stdin \ /iscratch/i/rn3/bothMaskedNibs \ /iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod +x doChain # 44 jobs gensub2 
input.lst single gsub jobList para create jobList para try para push # ... etc ... # Completed: 44 of 44 jobs # CPU time in finished jobs: 5395s 89.91m 1.50h 0.06d 0.000 y # IO & Wait Time: 2637s 43.96m 0.73h 0.03d 0.000 y # Average job time: 183s 3.04m 0.05h 0.00d # Longest job: 1032s 17.20m 0.29h 0.01d # Submission to last job: 1032s 17.20m 0.29h 0.01d # now on the file server, sort chains ssh eieio cd /cluster/data/rn3/bed/blastz.hg17/axtChain time chainMergeSort run1/chain/*.chain > all.chain # real 5m45.285s # user 4m51.730s # sys 0m34.290s time chainSplit chain all.chain # real 5m34.606s # user 4m47.860s # sys 0m26.010s # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain rn3 ${c}_chainHg17 $i echo done $c end # real 26m9.947s # user 14m36.240s # sys 1m54.300s # NET HG17 (WORKING - 2004-06-15 - Hiram) ssh eieio cd /cluster/data/rn3/bed/blastz.hg17/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/rn3/chrom.sizes \ /cluster/data/hg17/chrom.sizes ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/rn3/chrom.sizes \ /cluster/data/hg17/chrom.sizes ../n1/$n /dev/null end cd .. 
cat n1/*.net | time /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 1499635712, utime 11819 s/100, stime 1808 ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/axtChain time netClass hNoClass.net rn3 hg17 human.net \ -tNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman \ -qNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInRat # real 13m41.971s # user 7m30.930s # sys 1m53.450s # If things look good do ssh eieio cd /cluster/data/rn3/bed/blastz.hg17/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with time netFilter -syn human.net > humanSyn.net # real 6m0.129s # user 5m1.590s # sys 0m30.010s # Load the nets into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/axtChain netFilter -minGap=10 human.net | hgLoadNet rn3 netHg17 stdin netFilter -minGap=10 humanSyn.net | hgLoadNet rn3 syntenyNetHg17 stdin # real 17m7.948s # user 9m39.800s # sys 1m16.990s # check results # featureBits -countGaps rn3 netHg17 # 2633759972 bases of 2835152425 (92.897%) in intersection # featureBits -countGaps rn3 netHg15 # 2628441031 bases of 2835152425 (92.709%) in intersection # featureBits rn3 syntenyNetHg17 # featureBits -countGaps rn3 syntenyNetHg17 # 2581754861 bases of 2835152425 (91.062%) in intersection # featureBits -countGaps rn3 hg15SynNet # 2578335860 bases of 2835152425 (90.942%) in intersection # Add entries for net and chain to rat/rn3 trackDb # make net ssh eieio cd /cluster/data/rn3/bed/blastz.hg17/axtChain mkdir humanNet time netSplit human.net humanNet # real 6m33.389s # user 5m7.570s # sys 0m52.390s mkdir ../axtNet foreach n (humanNet/chr*.net) set c=$n:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt humanNet/$c.net chain/$c.chain \ /cluster/data/rn3/nib \ /cluster/data/hg17/nib \ ../axtNet/$c.axt echo "Complete: $c.net -> axtNet/$c.axt" end ssh hgwdev mkdir -p /cluster/data/rn3/bed/blastz.hg17/axtBest cd /cluster/data/rn3/bed/blastz.hg17/axtBest ln -s ../axtNet/chr*.axt . 
# NOTE: some of these axt files were corrupted (chr4, 8, and X), # (I noticed this while generating maf's from them) # Remade below (kate) # copy net axt's to download area ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/axtNet cd /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/axtNet gzip *.axt # add README.txt file to dir (use previous assembly's copy as template) # Convert those axt files to psl ssh eieio cd /cluster/data/rn3/bed/blastz.hg17 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestHg17.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestHg17.psl echo "Done: ${c}_blastzBestHg17.psl" end # Load tables ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/pslBest for I in chr*BestHg17.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 ${I} echo "done ${I}" done # check results # featureBits -countGaps rn3 blastzBestHg17 # 978297270 bases of 2835152425 (34.506%) in intersection # featureBits -countGaps rn3 blastzBestHg15 # 998080478 bases of 2835152425 (35.204%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/rn3/axtBest/Hg17 cd /gbdb/rn3/axtBest/Hg17 ln -s /cluster/data/rn3/bed/blastz.hg17/axtNet/chr*.axt . cd /cluster/data/rn3/bed/blastz.hg17/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/rn3/axtBest/Hg17/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('hg17','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql rn3 < ~/kent/src/hg/lib/axtInfo.sql # table axtInfo may already exist, ignore create error. 
hgsql rn3 < axtInfoInserts.sql # ADD CHAIN AND NET TO VSHG17 DOWNLOAD AREAS (DONE May 13, 2005, Heather) ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/axtChain gzip all.chain cp -p all.chain.gz /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/human.chain.gz gzip human.net cp -p human.net.gz /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/human.net.gz cd /usr/local/apache/htdocs/goldenPath/rn3/vsHg17 md5sum *.gz */*.gz > md5sum.txt # Update the README.txt # MAKING HUMAN SYNTENY (DONE - 2004-06-16 - Hiram) ssh hgwdev mkdir /cluster/data/rn3/bed/syntenyHg17 cd /cluster/data/rn3/bed/syntenyHg17 # Copy all the needed scripts from /cluster/data/hg16/bed/syntenyMm3 cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl . cp -p /cluster/data/hg17/bed/syntenyRn3/*.sh . ./syntenicBest.pl -db=rn3 -table=blastzBestHg17 ./smooth.pl ./joinsmallgaps.pl ./fillgap.pl -db=g17 -table=blastzBestHg17 ./synteny2bed.pl # The five commands above # real 185m3.399s # user 0m18.620s # sys 0m3.770s # Used to load this in syntenyHg17, but that type is misleading to # the table browser and fails the checkTableCoords check. 
# Better to use this ensRatMusHom type: sed -e 's/ensPhusionBlast/ensRatMusHom/g' \ $HOME/kent/src/hg/lib/ensPhusionBlast.sql \ > ensRatMusHom.sql hgLoadBed rn3 ensRatMusHom ucsc100k.bed -sqlTable=ensRatMusHom.sql # MAKING HUMAN AXTTIGHT FROM AXTBEST (WORKING - 2004-06-15 - Hiram) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh eieio cd /cluster/data/rn3/bed/blastz.hg17/axtNet mkdir -p ../axtTight foreach i (*.axt) echo $i subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHg17.psl echo "Done: $i" end # Load tables into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/pslTight for I in chr*TightHg17.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx rn3 ${I} echo "done ${I}" done # copy axt's to download area ssh hgwdev cd /cluster/data/rn3/bed/blastz.hg17/axtTight mkdir -p /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/axtTight cp -p *.axt /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/axtTight cd /usr/local/apache/htdocs/goldenPath/rn3/vsHg17/axtTight gzip *.axt XXX - README file not done yet - 2004-06-15 # add README.txt file to dir # HUMAN HG17 BLASTZ NET MAF FILES (WORKING 2004-09-21 kate) # NOTE: remade axt's on chr4, chrX and chr8, as they contained some garbage # probably due to file corruption on store3, as copies in download # directory are fine ssh eieio cd /cluster/data/rn3/bed/blastz.hg17/axtChain foreach f (humanNet/chr{4,8,X}.net) set c = $f:t:r echo "axt of $c" netToAxt $f chain/$c.chain /cluster/data/rn3/nib \ /cluster/data/hg17/nib ../axtNet/$c.axt end cat > makeMaf.csh << 'EOF' foreach f (humanNet/chr*.net) set c = $f:t:r echo "$c" axtSort ../axtNet/$c.axt stdout | axtToMaf stdin \ /cluster/data/rn3/chrom.sizes /cluster/data/hg17/chrom.sizes \ ../mafNet/$c.maf -tPrefix=rn3. -qPrefix=hg17. end 'EOF' # << emacs csh makeMaf.csh >&! 
makeMaf.log & tail -100f makeMaf.log # ANDY LAW CPGISSLANDS (DONE 6/15/04 angie) # See notes about this in makeGalGal2.doc. ssh eieio mkdir /cluster/data/rn3/bed/cpgIslandGgfAndy cd /cluster/data/rn3/bed/cpgIslandGgfAndy cp /dev/null cpgIslandAndy.bed cp /dev/null cpgIslandGgfAndy.bed foreach f (../../*/chr*.fa) set chr = $f:t:r echo preproc $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.preproc echo running original on $chr awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.preproc \ | /cluster/home/angie/andy-cpg-island.pl \ | perl -wpe '$i=0 if (not defined $i); \ chomp; ($s,$e) = split("\t"); $s--; \ $_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \ >> cpgIslandAndy.bed echo running modified on $chr /cluster/home/angie/ggf-andy-cpg-island.pl $chr.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndy.bed end # load into database: ssh hgwdev cd /cluster/data/rn3/bed/cpgIslandGgfAndy # this one is a bed 4: hgLoadBed rn3 cpgIAndy -tab -noBin cpgIslandAndy.bed # this one is a cpgIslandExt but with a different table name: sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql hgLoadBed rn3 cpgIslandGgfAndy -tab -noBin \ -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed # WOW, even masking out repeat bases from the results, there's a huge # increase in reported islands!! 
featureBits rn3 cpgIsland #9679881 bases of 2571104688 (0.376%) in intersection featureBits rn3 cpgIslandGgfAndy #70368349 bases of 2571104688 (2.737%) in intersection featureBits rn3 cpgIslandGgfAndy \!rmsk #51841401 bases of 2571104688 (2.016%) in intersection wc -l ../cpgIsland/cpgIsland.bed *bed # 15832 ../cpgIsland/cpgIsland.bed # 206593 cpgIslandAndy.bed # 152003 cpgIslandGgfAndy.bed # LaDeana asked for a run on masked seq for comparison. cp /dev/null cpgIslandAndyMasked.bed cp /dev/null cpgIslandGgfAndyMasked.bed foreach f (../../?{,?}/chr*.fa.masked) set chr = $f:t:r:r echo preproc masked $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.masked.preproc echo running original on $chr masked awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.masked.preproc \ | /cluster/home/angie/andy-cpg-island.pl \ | perl -wpe '$i=0 if (not defined $i); \ chomp; ($s,$e) = split("\t"); $s--; \ $_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \ >> cpgIslandAndyMasked.bed echo running modified on $chr masked /cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyMasked.bed end # load into database: ssh hgwdev cd /cluster/data/rn3/bed/cpgIslandGgfAndy hgLoadBed rn3 cpgIAndyMasked -tab -noBin cpgIslandAndyMasked.bed sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql hgLoadBed rn3 cpgIslandGgfAndyMasked -tab -noBin \ -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed featureBits rn3 cpgIAndyMasked #62121817 bases of 2571104688 (2.416%) in intersection featureBits rn3 cpgIslandGgfAndyMasked #43903454 bases of 2571104688 (1.708%) in intersection wc -l cpgIslandAndyMasked.bed cpgIslandGgfAndyMasked.bed # 124457 cpgIslandAndyMasked.bed # 91015 cpgIslandGgfAndyMasked.bed # 6/28/04 -- masking simpleRepeats, and even repeats other than Alu's, # might not be the right thing to do (?). Give it a try with less-masked # sequence. ssh eieio cd /cluster/data/rn3/bed/cpgIslandGgfAndy cp /dev/null cpgIslandGgfAndyOnlyRM.bed cp /dev/null cpgIslandGgfAndyOnlyRMAlu.bed foreach f (../../?{,?}/chr*.fa) set chr = $f:t:r echo preproc, ggf-andy $chr onlyRM maskOutFa $f $f.out stdout \ | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyOnlyRM.bed echo preproc, ggf-andy $chr onlyRMAlu head -3 $f.out > /tmp/tmp2.fa.out awk '$11 == "SINE/Alu" {print;}' $f.out >> /tmp/tmp2.fa.out maskOutFa $f /tmp/tmp2.fa.out stdout \ | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyOnlyRMAlu.bed end # 91272 cpgIslandGgfAndyOnlyRM.bed # 149225 cpgIslandGgfAndyOnlyRMAlu.bed ssh hgwdev cd /cluster/data/rn3/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRM/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql hgLoadBed rn3 cpgIslandGgfAndyOnlyRM -tab -noBin -sqlTable=/tmp/c.sql \ cpgIslandGgfAndyOnlyRM.bed sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRMAlu/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql hgLoadBed rn3 cpgIslandGgfAndyOnlyRMAlu -tab -noBin -sqlTable=/tmp/c.sql \ cpgIslandGgfAndyOnlyRMAlu.bed featureBits rn3 cpgIslandGgfAndyOnlyRM.bed #44021003 bases of 2571104688 (1.712%) in intersection featureBits rn3 cpgIslandGgfAndyOnlyRMAlu.bed #68483127 bases of 2571104688 (2.664%) in intersection # 1/26/05: Make better island names in cpgIslandGgfAndyMasked, # for Dave Burt's cross-species island comparisons. ssh eieio cd /cluster/data/rn3/bed/cpgIslandGgfAndy mv cpgIslandGgfAndyMasked.bed cpgIslandGgfAndyMasked.bed.orig perl -wpe '@w=split("\t"); $w[3] = "rn3.$w[0]." . ($w[1]+1) . ".$w[2]"; \ $_ = join("\t", @w);' \ cpgIslandGgfAndyMasked.bed.orig \ > cpgIslandGgfAndyMasked.bed ssh hgwdev cd /cluster/data/rn3/bed/cpgIslandGgfAndy hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \ rn3 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed ######## GENE SORTER BUILD ####### (DONE - 2004-06-01 - Fan) # This update is to complement the KG update using SWISS-PROT data release of 3/15/04. # These are instructions for updating the # neighborhood browser. Don't start these until # there is a knownGene track and the affy tracks # Cluster together various alt-splicing isoforms. 
# Creates the knownIsoforms and knownCanonical tables ssh hgwdev mkdir /cluster/data/rn3/bed/famBro.2004-04-05 rm /cluster/data/rn3/bed/famBro ln -s /cluster/data/rn3/bed/famBro.2004-04-05 /cluster/data/rn3/bed/famBro cd /cluster/data/rn3/bed/famBro hgClusterGenes rn3 knownGene knownIsoforms knownCanonical # Got 25004 clusters, from 41693 genes in 44 chromosomes # row count on knownIsoforms is 41669 # row count on knownCanonical is 25004 # Extract peptides from knownGenes into fasta file # and create a blast database out of them. mkdir /cluster/data/rn3/bed/famBro/blastp cd /cluster/data/rn3/bed/famBro/blastp pepPredToFa rn3 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa # # expect to find /scratch/blast loaded with all required blast files /scratch/blast/formatdb -i known.faa -t known -n known # Copy over database to bluearc rm -fr /cluster/bluearc/rn3/blastp mkdir /cluster/bluearc/rn3 mkdir /cluster/bluearc/rn3/blastp cp -p /cluster/data/rn3/bed/famBro/blastp/known.* /cluster/bluearc/rn3/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/rn3/bed/famBro/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/rn3/bed/famBro/blastp/self cd /cluster/data/rn3/bed/famBro/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \ -d /cluster/bluearc/rn3/blastp/known -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << emacs chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << emacs # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # This should finish in ~5 minutes # Completed: 7738 of 7738 jobs # CPU time in 
finished jobs: 108579s 1809.65m 30.16h 1.26d 0.003 y # IO & Wait Time: 76232s 1270.54m 21.18h 0.88d 0.002 y # Average job time: 24s 0.40m 0.01h 0.00d # Longest job: 188s 3.13m 0.05h 0.00d # Submission to last job: 254s 4.23m 0.07h 0.00d # Load into database. This takes about 15 minutes. ssh hgwdev cd /cluster/data/rn3/bed/famBro/blastp/self/run/out time hgLoadBlastTab rn3 knownBlastTab *.tab # Scanning through 7738 files # Loading database with 8019510 rows # 184.360u 28.660s 17:01.40 20.8% 0+0k 0+0io 202pf+0w # Create table that maps between known genes and RefSeq hgMapToGene rn3 refGene knownGene knownToRefSeq # row count goes from 27744 to 30116 # Part 2, started 5/25/04 # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene rn3 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' hgExpDistance rn3 hgFixed.gnfRatAtlas2MedianRatio \ hgFixed.gnfRatAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnfAtlas2 # Make sure that GO database is up to date. See README in /cluster/store1/geneOntology. GO database, go, downloaded and updated. # Create knownToEnsembl column cd /cluster/data/rn3/bed/famBro hgMapToGene rn3 ensGene knownGene knownToEnsembl # row count went to 34357 # Make knownToCdsSnp column. hgMapToGene rn3 snpMap knownGene knownToCdsSnp -all -cds # row count went to 12061 # CREATING KNOWNTOSUPER (which enables Superfamily stuff in hgNear/hgGene) (DONE 6/1/04 Fan.) # First see if need to update superfamily data from # ftp server at supfam.mrc-lmb.cam.ac.uk following instructions # in /cluster/store1/superFamily/genomes/README.ucsc. Then # make sure that knownToEnsembl and ensGtp tables are created, then: ssh hgwdev cat /cluster/store1/superFamily/040516/ass_16-May-2004.tab| hgKnownToSuper rn3 rn stdin # 5984 records output # CREATE KNOWNTOLOCUSLINK (DONE 6/1/04 Fan.) 
# that maps between known genes and LocusLink hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" rn3 \ > refToLl.txt hgMapToGene rn3 refGene knownGene knownToLocusLink -lookup=refToLl.txt # row count is 6084 # MAP ENCODE ORTHOLOG REGIONS (2004-07-09 kate) ssh eieio cd /cluster/data/rn3/bed mkdir encodeRegions cd encodeRegions # Of 44 regions: # -minMatch=.20 -> 36 mapped liftOver -minMatch=.20 \ /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \ /cluster/data/hg16/bed/liftOver/hg16Torn3.chain \ encodeRegions.bed encodeRegions.unmapped ssh hgwdev cd /cluster/data/rn3/bed/encodeRegions hgLoadBed rn3 encodeRegions encodeRegions.bed -noBin ssh eieio liftOver -minMatch=.20 \ /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \ /cluster/data/hg16/bed/liftOver/hg16Torn3.chain \ encodeRegions.bed encodeRegions.unmapped ssh hgwdev cd /cluster/data/rn3/bed/encodeRegions hgLoadBed rn3 encodeRegions encodeRegions.bed -noBin # ACCESSIONS FOR CONTIGS (DONE 2004-08-24 kate) # Used for Assembly track details, and also ENCODE AGP's cd /cluster/data/rn3/bed mkdir -p contigAcc cd contigAcc # extract WGS contig to accession mapping from Entrez Nucleotide summaries # To get the summary file, access the Genbank page for the project # by searching: # genus[ORGN] AND WGS[KYWD] # At the end of the page, click on the list of accessions for the contigs. # Select summary format, and send to file. 
# output to file .acc contigAccession rat.acc > contigAcc.tab wc -l contigAcc.tab # 137910 contigAcc.tab # extract all records from chrN_gold, using table browser # (includes header line) wc -l rat* # 137911 ratcontigs.tab hgsql rn3 < $HOME/kent/src/hg/lib/contigAcc.sql echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \ hgsql rn3 hgsql rn3 -e "SELECT COUNT(*) FROM contigAcc" # 137910 # SWAP BLASTZ MOUSE-RAT TO RAT-MOUSE (MM5) (DONE 9/1/04 Fan) ssh eieio mkdir /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01 ln -s blastz.mm5.swap.2004-09-01 /cluster/data/rn3/bed/blastz.mm5 cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01 set aliDir = /cluster/data/mm5/bed/blastz.rn3 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end # This took about 2 hours. du -sh $aliDir/axtChrom unsorted axtChrom rm -r unsorted # CHAIN MOUSE BLASTZ (DONE 9/1/04 Fan) # Run axtChain on little cluster ssh kki cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/rn3/bothMaskedNibs \ /iscratch/i/mus/mm5/softNib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
# Completed: 44 of 44 jobs # CPU time in finished jobs: 3611s 60.18m 1.00h 0.04d 0.000 y # IO & Wait Time: 1090s 18.17m 0.30h 0.01d 0.000 y # Average job time: 107s 1.78m 0.03h 0.00d # Longest job: 301s 5.02m 0.08h 0.00d # Submission to last job: 484s 8.07m 0.13h 0.01d # now on the cluster server, sort chains ssh eieio cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain limit vmemoryuse 5000000 chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=5000 /tmp/score.$f:t:r echo "" end # Lots of chaff with scores in the 3000's. Many very-high-scoring # chains. So filter the chain down somewhat... mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered # Load chains into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain/chain foreach i (*.chain) set c = $i:r nice hgLoadChain rn3 ${c}_chainMm5 $i end # Similar to human-mouse coverage -- not nearly as high as rat-human. featureBits -chrom=chr1 rn3 chainMm5Link # 158815202 bases of 242966425 (65.365%) in intersection featureBits -chrom=chr2 rn3 chainMm5Link # 145927900 bases of 235737501 (61.903%) in intersection # NET MOUSE BLASTZ (DONE 9/1/04 Fan) ssh eieio cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len rat.net mouse.net limit vmemoryuse 8000000 netSyntenic rat.net hNoClass.net # memory usage 1499435008, utime 11055 s/100, stime 1385 # The netClass operations requires an "ancientRepeat" table to exist # in either rn3 or mm5. 
So, create the table: ssh hgwdev mkdir -p /cluster/data/rn3/bed/ancientRepeat cd /cluster/data/rn3/bed/ancientRepeat # mysqldump needs write permission to this directory # and you need to use your read/write enabled user with password chmod 777 . hgsqldump --all --tab=. mm4 ancientRepeat chmod 775 . hgsql rn3 < ancientRepeat.sql hgsql mm5 -e 'load data local infile "ancientRepeat.txt" into table ancientRepeat' ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain time netClass hNoClass.net rn3 mm5 mouse.net \ -tNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse \ -qNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInRat # 386.820u 84.400s 20:15.00 38.7% 0+0k 0+0io 31184pf+0w # Make a 'syntenic' subset: ssh eieio cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain rm hNoClass.net # Make a 'syntenic' subset of these with netFilter -syn mouse.net > mouseSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain nice netFilter -minGap=10 mouse.net | hgLoadNet rn3 netMm5 stdin nice netFilter -minGap=10 mouseSyn.net | hgLoadNet rn3 netSyntenyMm5 stdin # Add entries for chainMm5, netMm5, syntenyMm5 to rat/rn3 trackDb # GENERATE MM5 MAF FOR MULTIZ FROM NET (DONE 9/1/04 Fan) ssh eieio cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01/axtChain netSplit mouse.net net cd /cluster/data/rn3/bed/blastz.mm5.swap.2004-09-01 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/rn3/nib \ /cluster/data/mm5/nib stdout \ | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/rn3/chrom.sizes /cluster/data/mm5/chrom.sizes \ $maf -tPrefix=rn3. -qPrefix=mm5. 
end # MAKE VSMM5 DOWNLOADABLES (DONE 9/2/04 Fan) ssh eieio cd /cluster/data/rn3/bed/blastz.mm5/axtChain ln all.chain mouse.chain foreach f (mouse.chain mouse.net) gzip -c $f > $f.gz end rm mouse.chain # Make chain-format of raw alignments cd /cluster/data/rn3/bed/blastz.mm5 mkdir blastzECF foreach f (axtChrom/chr*.axt) set chr = $f:t:r axtToChain $f S1.len S2.len stdout \ | gzip -c - > blastzECF/$chr.ecf.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/rn3/vsMm5 cd /usr/local/apache/htdocs/goldenPath/rn3/vsMm5 mv /cluster/data/rn3/bed/blastz.mm5/axtChain/mouse*.gz . md5sum *.gz > md5sum.txt cd /cluster/data/rn3/bed/blastz.mm5/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/rn3/vsmm5/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/rn3/vsmm5/axtNet cd /usr/local/apache/htdocs/goldenPath/rn3/vsmm5/axtNet md5sum * > md5sum.txt gzip *.axt # Copy over & edit README.txt w/pointers to chain, net formats. # Not for pushing -- handle separately. mv /cluster/data/rn3/bed/blastz.mm5/blastzECF . 
cd blastzECF md5sum *.gz > md5sum.txt # CONSERVATION TRACK - MULTIZ AND PHASTCONS (WORKING 2004-09-21 kate) ssh eieio set multizDir = multiz.2004-09-21 set workingDir = /cluster/bluearc/rn3/$multizDir ln -s $workingDir /cluster/bluearc/rn3/multiz mkdir -p $workingDir mkdir -p /cluster/data/rn3/bed/$multizDir cd /cluster/data/rn3/bed ln -s multiz.2004-09-21 multiz cd multiz cut -f 1 /cluster/data/rn3/chrom.sizes > chrom.lst # copy mafs to bluearc -- mouse set workingDir = /cluster/bluearc/rn3/$multizDir mkdir $workingDir/mm5 cp /cluster/data/rn3/bed/blastz.mm5/mafNet/chr*.maf $workingDir/mm5 # copy mafs to bluearc -- human mkdir $workingDir/hg17 cp /cluster/data/rn3/bed/blastz.hg17/mafNet/chr*.maf $workingDir/hg17 # multiz - add in human to mouse/rat # ssh kki set multizDir = multiz.2004-09-21 set workingDir = /cluster/bluearc/rn3/$multizDir cd /cluster/data/rn3/bed/$multizDir # wrapper script for multiz # NOTE: first arg is pairwise, 2nd arg is multiple (to add to) cat << 'EOF' > doMultiz.csh #!/bin/csh -fe mkdir -p $3:h /cluster/bin/penn/multiz $1 $2 - > $3 'EOF' # << for emacs cat << 'EOF' > gsub #LOOP ../doMultiz.csh {check in line /cluster/bluearc/rn3/multiz.2004-09-21/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/rn3/multiz.2004-09-21/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/rn3/multiz.2004-09-21/$(root1)$(dir1)/$(root2).maf} #ENDLOOP 'EOF' # << for emacs chmod +x doMultiz.csh # multiz - add in human to mouse/rat mkdir run.hg17 cd run.hg17 echo "hg17/mm5" > species.lst gensub2 species.lst ../chrom.lst ../gsub jobList para create jobList # 44 jobs para try, check, push, check cd .. 
# copy 3-way mafs to build directory ssh eieio set multizDir = multiz.2004-09-21 set workingDir = /cluster/bluearc/rn3/$multizDir ln -s $workingDir /cluster/bluearc/rn3/multiz ln -s $workingDir/mm5hg17 $workingDir/maf cd /cluster/data/rn3/bed/multiz.2004-09-21 mkdir maf cp $workingDir/maf/*.maf maf # CREATE kgSpAlias TABLE FOR PB (Done 10/20/04) hgsql rn3 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql rn3 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >rn3.kgSpAlias.tab rm j.tmp hgsql rn3 -e 'drop table kgSpAlias'; hgsql rn3 < ~/src/hg/lib/kgSpAlias.sql hgsql rn3 -e 'load data local infile "rn3.kgSpAlias.tab" into table kgSpAlias' # ECGENE TRACK - RELOAD NEW DATA (DONE, 2004-10-29, hartera) ssh eieio rm -f /cluster/data/rn3/bed/ECgene mkdir -p /cluster/data/rn3/bed/ECgene.2004-10-29 ln -s /cluster/data/rn3/bed/ECgene.2004-10-29 \ /cluster/data/rn3/bed/ECgene cd /cluster/data/rn3/bed/ECgene wget \ "http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_rn3_low_gene.txt.gz" wget \ "http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_rn3_low_pep.txt.gz" gunzip *.gz # load database ssh hgwdev cd /cluster/data/rn3/bed/ECgene ldHgGene -predTab rn3 ECgene v1.2_rn3_low_gene.txt # 119846 gene predictions # added peptide table 11/15/04 hgPepPred rn3 tab ECgenePep v1.2_rn3_low_pep.txt rm *.tab nice gzip *.txt # ENABLE AUTO BUILD MGC GENES TRACK (DONE markd) - add the following lines to rn3 entry in /cluster/data/genbank/etc/genbank.conf rn3.mgcTables.default = full rn3.mgcTables.mgc = all - wait one day for nightly build to align and load them into the db - rebuild trackDb # Create rn3GeneList.html (to be used by Google). # This step was done 12/08/04. 
cd /cluster/data/rn3/bed mkdir geneList cd geneList wget -O rn3GeneList.html "http://hgwdev-fanhsu.cse.ucsc.edu/cgi-bin/hgGeneList?db=rn3" cp -p rn3GeneList.html /usr/local/apache/htdocs/goldenPath # Check this html file into CVS. # UPDATE TIGR GENE INDEX (DONE 2004-12-13 Fan) cd /cluster/data/rn3/bed mv tigr tigr-2004-05020 mkdir tigr-2004-12-13 ln -s tigr-2004-12-13 tigr cd tigr wget --timestamp ftp://ftp.tigr.org/pub/data/tgi/Rattus_norvegicus/TGI_track_RatGenome_rn3_12-2004.tgz tar -xvzf TGI*.tgz foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (mouse cow human pig rat) echo $o setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ssh hgwdev cd /cluster/data/rn3/bed/tigr hgsql rn3 -e "drop table tigrGeneIndex" hgsql rn3 < ~/kent/src/hg/lib/tigrGeneIndex.sql foreach f (*.gff) echo Processing $f ... /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC rn3 tigrGeneIndex $f hgsql rn3 -e "select count(*) from tigrGeneIndex" end # Total of 367787 entries created in tigrGeneIndex table. hgsql rn3 -e "update tigrGeneIndex set cdsStart = txStart;" hgsql rn3 -e "update tigrGeneIndex set cdsEnd = txEnd;" checkTableCoords rn3 tigrGeneIndex gzip *.gff *TCs # UPDATE TIGR GENE INDEX (RE-DONE 2004-12-21 Fan) # This track is re-done due to an error (missing strand info) in the original files from TIGR. 
cd /cluster/data/rn3/bed mv tigr tigr_old_wrong mkdir tigr-2004-12-21 ln -s tigr-2004-12-21 tigr cd tigr wget --timestamp ftp://ftp.tigr.org/pub/data/tgi/Rattus_norvegicus/TGI_track_RatGenome_rn3_12-2004.tgz tar -xvzf TGI*.tgz foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (mouse cow human pig rat) echo $o setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ssh hgwdev cd /cluster/data/rn3/bed/tigr hgsql rn3 -e "drop table tigrGeneIndex" hgsql rn3 < ~/kent/src/hg/lib/tigrGeneIndex.sql foreach f (*.gff) echo Processing $f ... /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC rn3 tigrGeneIndex $f hgsql rn3 -e "select count(*) from tigrGeneIndex" end # Total of 367787 entries created in tigrGeneIndex table. hgsql rn3 -e "update tigrGeneIndex set cdsStart = txStart;" hgsql rn3 -e "update tigrGeneIndex set cdsEnd = txEnd;" checkTableCoords rn3 tigrGeneIndex gzip *.gff *TCs # Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram) ssh hgwdev cd /tmp wget \ 'http://www.genome.gov/Pages/News/Photos/Science/rat_image.jpg' -O rat_image.jpg # view that image in 'display' to determine crop edges, then: convert -crop 1536x1040+0+560 -quality 80 -sharpen 0 \ -normalize -gamma 1.7 rat_image.jpg rrn.jpg convert -geometry 300x300 -quality 80 rrn.jpg Rattus_norvegicus.jpg rm -f rrn.jpg rat_image.jpg cp -p Rattus_norvegicus.jpg /usr/local/apache/htdocs/images # add links to this image in the description.html page, request push # MAKE RN3-HG17 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie) ssh kolossus set chainDir = /cluster/data/rn3/bed/blastz.hg17/axtChain mkdir -p /cluster/data/rn3/bed/bedOver netChainSubset $chainDir/human.net $chainDir/all.chain \ /cluster/data/rn3/bed/bedOver/rn3ToHg17.over.chain # MAKE RN3-MM5 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie) ssh kolossus set chainDir = /cluster/data/rn3/bed/blastz.mm5/axtChain mkdir -p /cluster/data/rn3/bed/bedOver 
netChainSubset $chainDir/mouse.net $chainDir/all.chain \ /cluster/data/rn3/bed/bedOver/rn3ToMm5.over.chain # MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN (DONE 1/26/05 angie) # In an email 2/13/04, Arian said we could treat all human repeats as # lineage-specific for human-chicken blastz. Do the same for rat. # Scripts expect *.out.spec filenames, so set that up: ssh kkr1u00 cd /cluster/data/rn3 mkdir /iscratch/i/rn3/linSpecRep.notInChicken foreach f (/cluster/data/rn3/?{,?}/chr*.fa.out) cp -p $f /iscratch/i/rn3/linSpecRep.notInChicken/$f:t:r:r.out.spec end iSync # Use these the next time we run human-chicken blastz. # UPDATE kgSpAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan) # Add new rn3 protein display IDs to the alias table to support user search ssh hgwdev mkdir -p /cluster/data/rn3/bed/pb/newDisplayId cd /cluster/data/rn3/bed/pb/newDisplayId hgsql proteome -e 'select rn3.kgSpAlias.kgID, rn3.kgSpAlias.SpID, spOldNew.newDisplayId from spOldNew, rn3.kgSpAlias where spOldNew.acc=rn3.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >rn3.tab # get rid of the header line at the end of the file vi rn3.tab hgsql rn3 -e 'load data local infile "rn3.tab" into table rn3.kgSpAlias' # UPDATE kgProtAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan) # Add new rn3 protein display IDs to the alias table to support user search ssh hgwdev cd /cluster/data/rn3/bed/pb/newDisplayId hgsql proteome -e 'select rn3.kgSpAlias.kgID,spOldNew.oldDisplayId,spOldNew.newDisplayId from spOldNew, rn3.kgSpAlias where spOldNew.acc=rn3.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >rn3.kgProtAlias.tab # get rid of the header line at the end of the file vi rn3.kgProtAlias.tab hgsql rn3 -e 'load data local infile "rn3.kgProtAlias.tab" into table rn3.kgProtAlias' ######################################################################### # MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram) # CREATE FULL TEXT INDEX FOR KNOWN GENES 
(DONE 1/19/2006 JK) # This depends on the go and uniProt databases as well as # the kgAlias and kgProAlias tables. The hgKgGetText takes # about 5 minutes when the database is not too busy. The rest # is real quick. ssh hgwdev cd /cluster/data/rn3/bed/knownGene mkdir index cd index hgKgGetText rn3 knownGene.text ixIxx knownGene.text knownGene.ix knownGene.ixx ln -s /cluster/data/rn3/bed/knownGene/index/knownGene.ix /gbdb/rn3/knownGene.ix ln -s /cluster/data/rn3/bed/knownGene/index/knownGene.ixx /gbdb/rn3/knownGene.ixx # BUILD KNOWN GENE LIST FOR GOOGLE. DONE 6/27/05 Fan. cd /cluster/data/rn3/bed rm -rf knownGeneList/rn3 # Run hgKnownGeneList to generate the tree of HTML pages # under ./knownGeneList/rn3 hgKnownGeneList rn3 # copy over to /usr/local/apache/htdocs rm -rf /usr/local/apache/htdocs/knownGeneList/rn3 mkdir -p /usr/local/apache/htdocs/knownGeneList/rn3 cp -Rfp knownGeneList/rn3/* /usr/local/apache/htdocs/knownGeneList/rn3 # Build kgReactome table for KG to Reactome xref. Done 6/28/05 Fan. # First, make sure the reactome DB is built. See makeHg17.doc for details. ssh hgwdev mkdir -p /cluster/data/rn3/bed/reactome cd /cluster/data/rn3/bed/reactome hgsql reactome -N -e 'select kgId, spID, DB_ID from ReferenceEntity, rn3.kgXref where identifier=spID' >kgReactome.tab; hgsql rn3 -e 'drop table kgReactome' hgsql rn3 < ~/src/hg/lib/kgReactome.sql hgsql rn3 -e 'load data local infile "kgReactome.tab" into table kgReactome' # ADD LINK TO GENENETWORK (Done. 9/6/05 Fan). # Received the file, rat.RefSeqId, list of RefSeq IDs from GeneNetwork. # remove extra CR (or LF?) at end of the line. 
rmLf rat.RefSeqId >rn3.geneNetworkId.tab hgsql rn3 -e 'drop table geneNetworkId' hgsql rn3 < ~/src/hg/lib/geneNetworkId.sql hgsql rn3 -e 'load data local infile "rn3.geneNetworkId.tab" into table geneNetworkId' # EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 8/12/05 angie) ssh kolossus cd /panasas/store/rn3/rmsk # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. foreach outfl ( *.out ) echo "$outfl" /cluster/bluearc/RepeatMasker/DateRepeats \ ${outfl} -query rat -comp dog end # Now extract dog (extra column 1): cd .. mkdir linSpecRep.notInDog foreach f (rmsk/*.out_canis-familiaris) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractRepeats 1 $f > \ linSpecRep.notInDog/$base.out.spec end # Clean up. rm rmsk/*.out_canis* # BLASTZ/CHAIN/NET CANFAM2 (DONE 8/15/05 angie) ssh kkstore01 mkdir /cluster/data/rn3/bed/blastz.canFam2.2005-08-12 cd /cluster/data/rn3/bed/blastz.canFam2.2005-08-12 cat << '_EOF_' > DEF # rat vs. 
dog # TARGET: Rat SEQ1_DIR=/panasas/store/rn3/nib SEQ1_RMSK=/panasas/store/rn3/rmsk SEQ1_SMSK=/panasas/store/rn3/linSpecRep.notInDog SEQ1_LEN=/cluster/data/rn3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog SEQ2_DIR=/scratch/hg/canFam2/nib #SEQ2_DIR=/iscratch/i/canFam2/nib SEQ2_RMSK=/panasas/store/canFam2/rmsk SEQ2_SMSK=/panasas/store/canFam2/linSpecRep.notInRat SEQ2_LEN=/cluster/data/canFam2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/rn3/bed/blastz.canFam2.2005-08-12 '_EOF_' # << for emacs doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/blastzRn3CanFam2Out >& do.log & # Bummer -- small cluster can't see /scratch/hg/canFam2 so edit # DEF to use SEQ2_DIR=/iscratch/i/canFam2/nib, then continue: doBlastzChainNet.pl DEF -continue=chainRun \ >>& do.log & ln -s blastz.canFam2.2005-08-12 /cluster/data/rn3/bed/blastz.canFam2 # MYTOUCH FIX - jen - 2006-01-24 sudo mytouch rn3 geneidPep 0404031400.00 sudo mytouch rn3 twinscanPep 0404021800.00 sudo mytouch rn3 ensGeneXref 0404301100.00 sudo mytouch rn3 ensGtp 0404301100.00 sudo mytouch rn3 ensPep 0404301100.00 sudo mytouch rn3 ensTranscript 0404301100.00 sudo mytouch rn3 superfamily 0404301300.00 # LIFTOVER TO RN4 (DONE 2/9/06 angie) ssh kkr1u00 $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \ rn4 /cluster/data/rn4/nib # Do what its output says to do next: ssh kk (or kk9) $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \ rn3 /iscratch/i/rn3/bothMaskedNibs rn4 /iscratch/i/rn4/split10k \ /scratch/hg/h/rat11.ooc # Do what its output says to do next: cd /cluster/data/rn3/bed/blat.rn4.2006-02-07/run para make spec para time #Completed: 1980 of 1980 jobs #CPU time in finished jobs: 4111620s 68526.99m 1142.12h 47.59d 0.130 y #IO & Wait Time: 32154s 535.91m 8.93h 0.37d 0.001 y #Average job time: 2093s 34.88m 0.58h 0.02d #Longest finished job: 24496s 408.27m 6.80h 0.28d #Submission to last job: 24496s 408.27m 6.80h 0.28d ssh eieio 
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh rn3 rn4 \ /iscratch/i/rn4/split10k # Do what its output says to do next: ssh kki $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \ rn3 /iscratch/i/rn3/bothMaskedNibs rn4 /cluster/bluearc/scratch/hg/rn4/nib # Do what its output says to do next: cd /cluster/data/rn3/bed/blat.rn4.2006-02-07/chainRun para make spec #Completed: 45 of 45 jobs #CPU time in finished jobs: 6512s 108.53m 1.81h 0.08d 0.000 y #IO & Wait Time: 3369s 56.15m 0.94h 0.04d 0.000 y #Average job time: 220s 3.66m 0.06h 0.00d #Longest finished job: 919s 15.32m 0.26h 0.01d #Submission to last job: 14449s 240.82m 4.01h 0.17d ssh eieio $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh rn3 rn4 # Do what its output says to do next: ssh hgwdev $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh rn3 rn4 # Hmmm.. the scripts should be updated to follow the new convention # for placement of files, but I didn't realize that until this was # done. We use bed/liftOver/ now instead of bed/bedOver; the files # should be compressed everywhere; and goldenPath should have a # link not a real file. 
This was sufficient for now, though: # on eieio (this should be worked into the previous scripts): rm /cluster/data/rn3/bed/bedOver/rn3ToRn4.over.chain nice gzip /cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain # on hgwdev (this should be worked into -load): rm /gbdb/rn3/liftOver/rn3ToRn4.over.chain ln -s /cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain.gz \ /gbdb/rn3/liftOver/ hgAddLiftOverChain rn3 rn4 hgcentraltest \ /gbdb/rn3/liftOver/rn3ToRn4.over.chain.gz \ -path=/gbdb/rn3/liftOver/rn3ToRn4.over.chain.gz ########################################################################## # NSCAN track - (2006-05-29 markd) cd /cluster/data/rn3/bed/nscan/ # obtained NSCAN predictions from michael brent's group # at WUSTL wget -nv http://genes.cs.wustl.edu/jeltje/rn3/rn3.nscan.gtf gzip rn3.nscan.gtf wget -nv -r -np http://genes.cs.wustl.edu/jeltje/rn3/chr_ptx/ cat genes.cs.wustl.edu/jeltje/rn3/chr_ptx/*.fa >rn3.nscan.pep.fa gzip rn3.nscan.pep.fa rm -rf genes.cs.wustl.edu chmod a-w *.gz # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. ldHgGene -bin -gtf -genePredExt rn3 nscanGene rn3.nscan.gtf.gz # add .a suffix to match transcript id WRONG, see below: hgPepPred -suffix=.a rn3 generic nscanPep rn3.nscan.pep.fa.gz rm *.tab # update trackDb; need a rn3-specific page to describe informants rat/rn3/nscanGene.html (from CanFam1_hg17_nscan_10_18_2005/description.html ) rat/rn3/trackDb.ra # replace twinScan track with n-scan when pushed. 
drop table twinscan; drop table twinscanPep; # 2006-07-24 markd: reloaded without -suffix hgPepPred rn3 generic nscanPep rn3.nscan.pep.fa.gz ##################################################################### # SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie) # File emailed from Xinwei She mkdir /cluster/data/rn3/bed/genomicSuperDups cd /cluster/data/rn3/bed/genomicSuperDups sed -e 's/\t_\t/\t-\t/' rn3_genomicSuperDup.tab \ | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \ | hgLoadBed rn3 genomicSuperDups stdin \ -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql ########################################################################## # GenBank gbMiscDiff table (markd 2007-01-10) # Supports `NCBI Clone Validation' section of mgcGenes details page # genbank release 157.0 now contains misc_diff fields for MGC clones # reloading mRNAs results in gbMiscDiff table being created. ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna rn3 ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: rn3.upstreamGeneTbl = refGene