# This file describes how we made the browser database on the Rattus # Norvegicus genome, January 2003 update. DOWNLOAD SEQUENCE (DONE 02/05/03) ssh eieio mkdir /cluster/store4/rn2 cd /cluster/store4/rn2 wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/README wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/conditions_for_use # Get BCM's chrom assemblies -- we will assemble our own chr*.fa from # contig fa + agp, and cross-check against this. foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 X Un) mkdir $c wget -O $c/chr$c.fa.bcm.gz \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/chromosome/chr$c.fa.gz wget -O $c/chr${c}_random.fa.bcm.gz \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/chromosome/chr$c.random.fa.gz end # Get BCM's contig fa + agp. We will split into our own conveniently-sized # pseudo-contigs, and assemble chrom fa. wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/bacfile2-1.gz wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/record.dat.gz foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 X Un) wget -O $c/chr$c.agp.gz \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.agp.gz wget -O $c/chr$c.contig.fa.gz \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.contig.fa.gz wget -O $c/chr${c}_random.agp.gz \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.random.agp.gz wget -O $c/chr${c}_random.contig.fa.gz \ ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.random.contig.fa.gz gunzip $c/chr$c.agp.gz gunzip $c/chr${c}_random.agp.gz end BUILD AND CHECK CHROM-LEVEL SEQUENCE (DONE 02/05/03) # Make chrom fa: foreach c (?{,?}) gunzip $c/chr$c.contig.fa.gz agpToFa -simpleMulti $c/chr$c.agp chr$c $c/chr$c.fa $c/chr$c.contig.fa if (-e $c/chr${c}_random.agp) then gunzip $c/chr${c}_random.contig.fa.gz agpToFa -simpleMulti $c/chr${c}_random.agp chr${c}_random \ $c/chr${c}_random.fa $c/chr${c}_random.contig.fa endif end # Check that the size of each 
chromosome .fa file is equal to the # last coord of the .agp: foreach f ( */*.agp ) set agpLen = `tail -1 $f | awk '{print $3;}'` set g = $f:r set faLen = `faSize $g.fa | awk '{print $1;}'` if ($agpLen == $faLen) then echo $f length = $g length = $faLen else echo Error\!\!\! $f length = $agpLen, but $g length = $faLen endif end # Check that our assembled chrom fa jive with the BCM chrom fa foreach c ( ?{,?} ) set ucscLen = `faSize $c/chr$c.fa | awk '{print $1;}'` set bcmLen = `gunzip -c $c/chr$c.fa.bcm.gz | faSize stdin \ | awk '{print $1;}'` if ($ucscLen == $bcmLen) then echo chr$c.fa length = chr$c.fa.bcm length = $bcmLen else echo Error\!\!\! chr$c.fa length = $ucscLen, but chr$c.fa.bcm length = $bcmLen endif if (-e $c/chr${c}_random.fa) then set ucscLen = `faSize $c/chr${c}_random.fa | awk '{print $1;}'` set bcmLen = `gunzip -c $c/chr${c}_random.fa.bcm.gz | faSize stdin \ | awk '{print $1;}'` if ($ucscLen == $bcmLen) then echo chr${c}_random.fa length = chr${c}_random.fa.bcm length = $bcmLen else echo Error\!\!\! chr${c}_random.fa length = $ucscLen, but chr${c}_random.fa.bcm length = $bcmLen endif endif end BREAK UP SEQUENCE INTO 5 MB CHUNKS AT NON_BRIDGED CONTIGS (DONE 02/05/03) ssh hgwdev cd into your CVS source tree under kent/src/hg/splitFaIntoContigs make # This will split the rat sequence into approx. 5 Mbase # supercontigs between non-bridged clone contigs and drop the # resulting dir structure in /cluster/store4/rn2. The resulting # dir structure will include 1 dir for each chromosome, each of # which has a set of subdirectories, one subdir per supercontig. ssh eieio cd /cluster/store4/rn2 foreach c (?{,?}) cp -p $c/chr$c.agp $c/chr$c.agp.bak cp -p $c/chr$c.fa $c/chr$c.fa.bak splitFaIntoContigs $c/chr$c.agp $c/chr$c.fa . -nSize=5000000 if (-e $c/chr${c}_random.fa) then cp -p $c/chr${c}_random.agp $c/chr${c}_random.agp.bak cp -p $c/chr${c}_random.fa $c/chr${c}_random.fa.bak splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . 
\ -nSize=5000000 mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst mv ${c}_random/lift/ordered.lft $c/lift/random.lft mv ${c}_random/lift/ordered.lst $c/lift/random.lst rmdir ${c}_random/lift rm ${c}_random/chr${c}_random.{agp,fa} mv ${c}_random/* $c rmdir ${c}_random endif end # Make sure the reconstructed .fa jives with the original: foreach f ( */*.fa.bak ) echo $f:r diff $f $f:r | wc -l end # The .agp goes through a slight format change, but make sure it # at least ends up with the same number of lines: foreach f ( */*.agp.bak ) set l1 = `wc -l $f | awk '{print $1;}'` set l2 = `wc -l $f:r | awk '{print $1;}'` if ($l1 == $l2) then echo "$f and $f:r have the same #lines" else echo Error\!\!\! $f has $l1 lines, but $f:r has $l2 endif end # Save some space foreach c (?{,?}) echo $c gzip $c/chr*.contig.fa end rm */*.bak COPY OVER JKSTUFF SCRIPTS DIRECTORY (DONE 02/05/03) ssh eieio ln -s /cluster/store4/rn2 ~/rn2 rm -f ~/lastRn ln -s /cluster/store4/rn1 ~/lastRn cd ~/rn2 cp -Rp ~/lastRn/jkStuff . rm jkStuff/*.{out,lst,lft} jkStuff/*~ CREATING DATABASE (DONE 02/06/03) # Create the database. ssh hgwdev # Enter mysql via: hgsql rn1 # At mysql prompt type: create database rn2; quit # make a semi-permanent read-only alias: alias rn2 "mysql -u hguser -phguserstuff -A rn2" # Use df to make sure there is at least 5 gig free on # hgwdev:/var/lib/mysql CREATING GRP TABLE FOR TRACK GROUPING (DONE 02/11/03) ssh hgwdev echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \ | hgsql rn2 REPEAT MASKING (DONE 03/06/03) Split contigs, run RepeatMasker, lift results Notes: * If there is a new version of RepeatMasker, build it and ask the admins to binrsync it (kkstore:/scratch/hg/RepeatMasker/*). * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make RepeatMasker runs manageable on the cluster ==> results need lifting.
* For the NCBI assembly we repeat mask on the sensitive mode setting (RepeatMasker -m -s) #- Split contigs into 500kb chunks: ssh eieio cd ~/rn2 foreach d ( */chr*_?{,?} ) cd $d set contig = $d:t faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -maxN=500000 cd ../.. end #- Make the run directory and job list: cd ~/rn2 mkdir RMRun rm -f RMRun/RMJobs touch RMRun/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/bin/scripts/RMRat \ /cluster/store4/rn2/$d $f \ '{'check out line+ /cluster/store4/rn2/$d/$f.out'}' \ >> RMRun/RMJobs end end #- Do the run ssh kk cd ~/rn2/RMRun para create RMJobs para try, para check, para check, para push, para check,... #- Lift up the split-contig .out's to contig-level .out's ssh eieio cd ~/rn2 foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. end #- Lift up the contig-level .out's to chr-level cd ~/rn2 ./jkStuff/liftOut5.sh # soft-mask contig .fa's with .out's foreach i (?{,?}) foreach j ($i/chr${i}_?{,?}/chr${i}_?{,?}.fa \ $i/chr${i}_random_?{,?}/chr${i}_random_?{,?}.fa) maskOutFa $j $j.out $j -soft end echo done $i end #- Load the .out files into the database with: ssh hgwdev cd ~/rn2 hgLoadOut rn2 ?{,?}/*.fa.out MAKE LIFTALL.LFT (DONE 02/05/03) cd ~/rn2 cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft VERIFY REPEATMASKER RESULTS (DONE 03/06/03) # Run featureBits on rn2 and on a comparable genome build, and compare: ssh hgwdev featureBits rn2 rmsk # --> 1100534407 bases of 2764911379 (39.804%) in intersection # --> (orig run, July libs) 1058156286 bases of 2764911379 (38.271%) in intersection featureBits rn1 rmsk # --> 1081814344 bases of 2852382926 (37.927%) in intersection STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE 02/06/03) # Make (unmasked) nibs ssh eieio cd ~/rn2 mkdir nib foreach f (?{,?}/chr*.fa) faToNib $f nib/$f:t:r.nib end # Make symbolic links from 
/gbdb/rn2/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/rn2/nib foreach f (/cluster/store4/rn2/nib/chr*.nib) ln -s $f /gbdb/rn2/nib end # Load /gbdb/rn2/nib paths into database and save size info. ssh hgwdev hgsql rn2 < ~/src/hg/lib/chromInfo.sql cd ~/rn2 hgNibSeq -preMadeNib rn2 /gbdb/rn2/nib ?{,?}/chr?{,?}{,_random}.fa echo "select chrom,size from chromInfo" | hgsql -N rn2 > chrom.sizes GOLD AND GAP TRACKS (DONE 02/06/03) ssh hgwdev cd ~/rn2 hgGoldGapGl -noGl rn2 /cluster/store4/rn2 . MAKE GCPERCENT (DONE 02/06/03) ssh hgwdev mkdir -p /cluster/store4/rn2/bed/gcPercent cd /cluster/store4/rn2/bed/gcPercent hgsql rn2 < ~/src/hg/lib/gcPercent.sql hgGcPercent rn2 ../../nib MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR RN2 (DONE 02/06/03) # Enter rn2 into hgcentraltest.dbDb so test browser knows about it: mysql -h genome-testdb -u root -pbigSecret -A hgcentraltest insert into dbDb values("rn2", "Rat Jan. 2003", "/gbdb/rn2/nib", "Rat", "Napa", 1, 20, "Rat"); quit # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add rn2 in all the right places and do make update make alpha cvs commit makefile MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR RN2 (DONE 02/13/03) ssh hgwdev echo 'insert into blatServers values("rn2", "blat10", "17778", "1"); \ insert into blatServers values("rn2", "blat10", "17779", "0");' \ | hgsql -h genome-testdb hgcentraltest SIMPLE REPEAT TRACK (DONE 02/07/03) # TRF runs pretty quickly now... 
it takes a few hours total runtime, # so instead of binrsyncing and para-running, just do this on eieio: ssh eieio mkdir ~/rn2/bed/simpleRepeat cd ~/rn2/bed/simpleRepeat mkdir trf rm -f jobs.csh touch jobs.csh foreach f (/cluster/store4/rn2/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa) set fout = $f:t:r.bed echo "/cluster/home/kent/bin/i386/trfBig -trf=/cluster/home/kent/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end tcsh jobs.csh |& tee jobs.log wc -l jobs.csh ls -1 trf | wc -l # When job is done do: liftUp simpleRepeat.bed ~/rn2/jkStuff/liftAll.lft warn trf/*.bed # Load this into the database as so ssh hgwdev cd ~/rn2/bed/simpleRepeat hgLoadBed rn2 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql PROCESS SIMPLE REPEATS INTO MASK (DONE 02/07/03) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh eieio cd ~/rn2/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd ~/rn2 mkdir -p bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst endif liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` if (-e $c/lift/rTrf.lst) then liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 03/06/03) # This used to be done right after RepeatMasking. Now, we mask with # TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above. 
ssh eieio cd ~/rn2 #- Soft-mask (lower-case) the contig and chr .fa's ./jkStuff/makeFaMasked.sh #- Make hard-masked .fa.masked files as well: ./jkStuff/makeHardMasked.sh #- Rebuild the nib, mixedNib, maskedNib files: ./jkStuff/makeNib.sh # Copy the masked contig fa to /scratch: ssh kkstore rm -rf /scratch/hg/rn2/trfFa mkdir -p /scratch/hg/rn2/trfFa cp -p ~/rn2/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa /scratch/hg/rn2/trfFa MAKE DOWNLOADABLE SEQUENCE FILES (DONE 03/06/03) ssh eieio cd ~/rn2 #- Build the .zip files ./jkStuff/zipAll.sh |& tee zipAll.log mkdir zip mv *.zip* zip cd zip #- Look at zipAll.log to make sure all file lists look reasonable. #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test #- Copy the .zip files to hgwdev:/usr/local/apache/... ssh hgwdev cd ~/rn2/zip ../jkStuff/cpToWeb.sh cd /usr/local/apache/htdocs/goldenPath/rnJan2003 #- Take a look at bigZips/* and chromosomes/*, update their README.txt's # Then make the upstream sequence files. cd bigZips featureBits rn2 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa rm upstream1000.fa featureBits rn2 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa rm upstream2000.fa featureBits rn2 refGene:upstream:5000 -fa=upstream5000.fa zip upstream5000.zip upstream5000.fa rm upstream5000.fa PREPARE CLUSTER FOR BLASTZ RUN (DONE 03/06/03) # This needs to be done after trf-masking and nib generation. ssh kkstore # Extract lineage-specific repeats using Arian Smit's script: mkdir -p ~/rn2/bed/linSpecRep cd ~/rn2/bed/linSpecRep foreach f (~/rn2/*/chr*.out) ln -sf $f . end /cluster/bin/scripts/rodentSpecificRepeats.pl *.out /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec rm *.out cd .. 
rm -rf /scratch/hg/rn2/linSpecRep mkdir -p /scratch/hg/rn2 cp -Rp linSpecRep /scratch/hg/rn2 # RepeatMasker .out: cd ~/rn2 rm -rf /scratch/hg/rn2/rmsk mkdir -p /scratch/hg/rn2/rmsk cp -p ?{,?}/chr?{,?}{,_random}.fa.out /scratch/hg/rn2/rmsk # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /scratch/hg/rn2/chromTrfMixedNib mkdir -p /scratch/hg/rn2/chromTrfMixedNib cp -p mixedNib/chr*.nib /scratch/hg/rn2/chromTrfMixedNib # Ask cluster-admin@cse.ucsc.edu to binrsync /scratch/hg to clusters # Jim's comments Feb 12 '03 about the order in which to run blastz: # In general we should do # 1) hg/mm # 2) mm/rn # 3) rn/hg # 4) hg/hg # 5) mm/mm # 6) rn/rn # There is now an 'axtSwap' program that might let us # get out of having to run the inverse of 1,2 & 3, though # 2 in particular is so fast perhaps it's just as well to # do the inverse explicitly. MAKING AND STORING mRNA AND EST ALIGNMENTS (DONE 02/09/03) # Load up the local disks of the cluster with refSeq.fa, mrna.fa and est.fa # from /cluster/store2/mrna.133 into /scratch/hg/mrna.133 # Make sure that /scratch/hg/rn2/trfFa is loaded with chr*_*.fa and pushed # to the cluster nodes. ssh kk cd ~/rn2/bed foreach i (refSeq mrna est) mkdir -p $i cd $i ls -1S /scratch/hg/rn2/trfFa/* > genome.lst ls -1 /mnt/scratch/hg/mrna.133/Rattus_norvegicus/$i.fa > mrna.lst cp ~/lastRn/bed/$i/gsub . mkdir psl gensub2 genome.lst mrna.lst gsub spec para create spec cd .. end # In each dir: para try, para check, para push, para check.... # para time > time # Process refSeq, mRNA, and EST alignments into near best in genome. ssh eieio cd ~/rn2/bed cd refSeq pslSort dirs raw.psl /cluster/store2/temp psl pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl \ contig.psl /dev/null liftUp -nohead all_refSeq.psl ../../jkStuff/liftAll.lft warn contig.psl pslSortAcc nohead chrom /cluster/store2/temp all_refSeq.psl cd .. 
cd mrna pslSort dirs raw.psl /cluster/store2/temp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \ /dev/null liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft warn contig.psl pslSortAcc nohead chrom /cluster/store2/temp all_mrna.psl cd .. cd est pslSort dirs raw.psl /cluster/store2/temp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \ /dev/null liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft warn contig.psl pslSortAcc nohead chrom /cluster/store2/temp all_est.psl cd .. # Load mRNA alignments into database. ssh hgwdev cd ~/rn2/bed/mrna/chrom foreach i (chr?{,?}{,_random}.psl) mv $i $i:r_mrna.psl end hgLoadPsl rn2 *.psl cd .. hgLoadPsl rn2 all_mrna.psl -nobin # Load EST alignments into database. ssh hgwdev cd ~/rn2/bed/est/chrom foreach i (chr?{,?}{,_random}.psl) echo mv $i $i:r_est.psl end hgLoadPsl rn2 *.psl cd .. hgLoadPsl rn2 all_est.psl -nobin # Create subset of ESTs with introns and load into database. ssh eieio cd ~/rn2 tcsh jkStuff/makeIntronEst.sh ssh hgwdev cd ~/rn2/bed/est/intronEst hgLoadPsl rn2 *.psl # Load refSeq alignments into database ssh hgwdev cd ~/rn2/bed/refSeq pslCat -dir chrom > refSeqAli.psl hgLoadPsl rn2 -tNameIx refSeqAli.psl CREATE REFSEQ GENES TRACK (DONE 02/09/03) # Load the refSeq mRNA ssh hgwdev mkdir -p /gbdb/rn2/mrna.133 ln -s /cluster/store2/mrna.133/refSeq/org/Rattus_norvegicus/refSeq.fa \ /gbdb/rn2/mrna.133 hgLoadRna new rn2 hgLoadRna add -type=refSeq rn2 /gbdb/rn2/mrna.133/refSeq.fa \ /cluster/store2/mrna.133/refSeq/org/Rattus_norvegicus/refSeq.ra # Produce refGene, refPep, refMrna, and refLink tables as so: # Get the proteins: ssh eieio cd ~/rn2/bed/refSeq wget ftp://ftp.ncbi.nih.gov/refseq/R_norvegicus/mRNA_Prot/rat.faa.gz wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc gunzip rat.faa.gz ssh hgwdev cd ~/rn2/bed/refSeq hgRefSeqMrna rn2 \ /gbdb/rn2/mrna.133/refSeq.fa \ 
/cluster/store2/mrna.133/refSeq/org/Rattus_norvegicus/refSeq.ra \ all_refSeq.psl loc2ref rat.faa mim2loc # Don't worry about the "No gene name" errors # Add RefSeq status info hgRefSeqStatus -rat rn2 loc2ref REFFLAT (DONE 02/09/03) # create precomputed join of refFlat and refGene: echo 'CREATE TABLE refFlat (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) SELECT refLink.name as geneName, refGene.* FROM refLink,refGene WHERE refLink.mrnaAcc = refGene.name' | hgsql rn2 LOAD MRNA DATA (DONE 02/09/03) ssh hgwdev ln -s /cluster/store2/mrna.133/org/Rattus_norvegicus/mrna.fa /gbdb/rn2/mrna.133 ln -s /cluster/store2/mrna.133/org/Rattus_norvegicus/est.fa /gbdb/rn2/mrna.133 hgLoadRna add -type=mRNA rn2 /gbdb/rn2/mrna.133/mrna.fa \ /cluster/store2/mrna.133/org/Rattus_norvegicus/mrna.ra hgLoadRna add -type=EST rn2 /gbdb/rn2/mrna.133/est.fa \ /cluster/store2/mrna.133/org/Rattus_norvegicus/est.ra PRODUCING ESTORIENTINFO TABLE (DONE 03/06/03) This table is needed for proper orientation of ESTs in the browser. Many will appear on the wrong strand without it. This involves a cluster run. First load the EST psl files as so: ssh eieio cd ~/rn2/bed/est pslSortAcc nohead contigs /cluster/store2/temp contig.psl ssh kkstore mkdir /mnt/scratch/hg/rn2/est cd ~/rn2/bed/est cp -r contigs /mnt/scratch/hg/rn2/est Wait for these to finish. mkdir -p ~/rn2/bed/estOrientInfo cd ~/rn2/bed/estOrientInfo mkdir ei ls -1S /mnt/scratch/hg/rn2/est/contigs/* > psl.lst echo placeholder > single cp ~/rn1/bed/estOrientInfo/gsub . Update gsub to refer to rat contig sequence currently on /mnt//scratch/hg/rn2/trfFa, and rat ESTs on /mnt/scratch/hg/rn2/est/contigs and the rat est in /scratch/hg/mrna.133/Rattus_norvegicus/est.fa. 
gensub2 psl.lst single gsub spec ssh kk para create spec Then run the job on the cluster cd ~/rn2/bed/estOrientInfo para try sleep 60 para check If things look good para push Wait for this to finish then liftUp estOrientInfo.bed ../../jkStuff/liftAll.lft warn ei/*.tab Load them into database as so: ssh hgwdev cd ~/rn2/bed/estOrientInfo hgLoadBed rn2 estOrientInfo estOrientInfo.bed \ -sqlTable=/cluster/home/kent/src/hg/lib/estOrientInfo.sql PRODUCING MRNAORIENTINFO TABLE (DONE 03/06/03) ssh eieio cd ~/rn2/bed/mrna pslSortAcc nohead contig /cluster/store2/temp contig.psl ssh kkstore mkdir /mnt/scratch/hg/rn2/mrna cp -r ~/rn2/bed/mrna/contig /mnt/scratch/hg/rn2/mrna mkdir -p ~/rn2/bed/mrnaOrientInfo/oi cd ~/rn2/bed/mrnaOrientInfo ls -1S /mnt/scratch/hg/rn2/mrna/contig/* > psl.lst cp ~/lastRn/bed/mrnaOrientInfo/gsub . echo placeholder > single gensub2 psl.lst single gsub spec ssh kk cd ~/rn2/bed/mrnaOrientInfo para create spec para try, para check, para push, para check,... liftUp mrnaOrientInfo.bed ../../jkStuff/liftAll.lft warn oi/*.tab ssh hgwdev cd ~/rn2/bed/mrnaOrientInfo hgLoadBed rn2 mrnaOrientInfo mrnaOrientInfo.bed \ -sqlTable=/cluster/home/kent/src/hg/lib/mrnaOrientInfo.sql CREATE RNACLUSTER TABLE (DONE 03/06/03) # Make sure that refSeqAli, estOrientInfo and mrnaOrientInfo tables are # made already (see above). ssh hgwdev mkdir -p ~/rn2/bed/rnaCluster/chrom cd ~/rn2/bed/rnaCluster foreach i (~/rn2/?{,?}) foreach f ($i/chr*.fa) set c = $f:t:r clusterRna rn2 /dev/null chrom/$c.bed -chrom=$c echo done $c end end hgLoadBed rn2 rnaCluster chrom/*.bed PRODUCING GENSCAN PREDICTIONS (TODO - REDO) # Log into kkr1u00 (not kk!). kkr1u00 is the driver node for the small # cluster (kkr2u00 - kkr8u00). (genscan has problems running on the # big cluster, due to limitation of memory and swap space on each # processing node).
ssh kkr1u00 mkdir -p ~/rn2/bed/genscan cd ~/rn2/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f genome.list touch genome.list foreach f ( `ls -1S /cluster/store4/rn2/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end # Create template file, gsub, for gensub2. For example (3-line file): #LOOP /cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP echo "" > dummy.list gensub2 genome.list dummy.list gsub jobList para create jobList para try para check para push # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". # chr14_21, chr16_4 # Convert these to chromosome level files as so: ssh eieio cd ~/rn2/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd ~/rn2/bed/genscan ldHgGene rn2 genscan genscan.gtf hgPepPred rn2 generic genscanPep genscan.pep hgLoadBed rn2 genscanSubopt genscanSubopt.bed SWAPPING HUMAN-RAT BLASTZ ALIGNMENTS TO RAT-HUMAN: (DONE 03/15/03) ssh eieio # Human-rat alignments were already run and processed into axt. # Swap target and query to get rat-human alignments. 
set aliDir = "/cluster/store4/gs.14/build31/bed/blastz.rn2.2003-03-13-ASH" set revAliDir = "/cluster/store4/rn2/bed/blastz.hg13.2003-03-13-SWAP" mkdir $revAliDir cd $revAliDir # axtBest will need .len files - copy those, swap S1<->S2 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom # Swap target and query coords, then re-apportion alignments so that # unsorted/chrN.axt has all the alignments with chrN as target. cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end rm -r unsorted # Don't bother creating psl for these unfiltered alignments -- but # tell Jim so he can do chaining/netting. MAKING THE BLASTZBESTHUMAN TRACK FROM PENN STATE RN2 AXT FILES (DONE 03/15/03) # Consolidate AXT files to chrom level, sort, pick best, make psl. ssh eieio set base="/cluster/store4/rn2/bed/blastz.hg13.2003-03-13-SWAP" set seq1_dir="/cluster/store4/rn2/mixedNib/" set seq2_dir="/cluster/store4/gs.14/build31/mixedNib/" set tbl="blastzBestHg13" cd $base mkdir -p axtBest pslBest foreach f (axtChrom/chr*.axt) set chr=$f:t:r echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 echo translating axtBest to psl for $chr axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # If some chromosome's alignments were too big and caused axtSort to # run out of memory, split it in half (by 4-line axt records) and # run axtBest just on the halves.
foreach chr (chr1) echo two-pass axtBesting $chr set len = `wc -l < axtChrom/$chr.axt` set numRec = `expr $len / 4` if (($numRec * 4) != $len) then echo "Uh-oh: length of axtChrom/$chr.axt is $len, not a multiple of 4" break endif set halfRec = `expr $numRec / 2` set halfLen = `expr $halfRec \* 4` set halfLenp1 = `expr $halfLen + 1` head -$halfLen axtChrom/$chr.axt > axtChrom/$chr.h0.axt tail +$halfLenp1 axtChrom/$chr.axt > axtChrom/$chr.h1.axt axtBest axtChrom/$chr.h0.axt $chr axtChrom/$chr.h0.axtBest -minScore=300 axtBest axtChrom/$chr.h1.axt $chr axtChrom/$chr.h1.axtBest -minScore=300 cat axtChrom/$chr.h{0,1}.axtBest > axtBest/$chr.axt axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl rm axtChrom/$chr.h* end # Load tables ssh hgwdev set base="/cluster/store4/rn2/bed/blastz.hg13.2003-03-13-SWAP" set tbl="blastzBestHg13" cd $base/pslBest hgLoadPsl rn2 chr*_${tbl}.psl # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/rn2/axtBestHg13 cd /gbdb/rn2/axtBestHg13 foreach f ($base/axtBest/chr*.axt) ln -s $f . 
end cd $base/axtBest rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/rn2/axtBestHg13/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo VALUES ('hg13','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql rn2 < ~/kent/src/hg/lib/axtInfo.sql hgsql rn2 < axtInfoInserts.sql MAKING THE HUMAN AXTTIGHT FROM AXTBEST (DONE 03/15/03) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh eieio cd ~/rn2/bed/blastz.hg13.2003-03-13-SWAP/axtBest mkdir -p ../axtTight foreach i (*.axt) subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir -p ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHg13.psl end # Load tables into database ssh hgwdev cd ~/rn2/bed/blastz.hg13.2003-03-13-SWAP/pslTight hgLoadPsl rn2 chr*_blastzTightHg13.psl TWINSCAN GENE PREDICTIONS (DONE 03/26/03) mkdir -p ~/rn2/bed/twinscan cd ~/rn2/bed/twinscan wget http://genome.cse.wustl.edu/~bio/rat/Jan03/rat_Jan03_03-26-03.tgz gunzip -c *.tgz | tar xvf - rm -r chr_tx # clean up chrom field of GTF files foreach f (chr_gtf/chr*.gtf) set chr = $f:t:r sed -e "s/^[a-zA-Z0-9]*/$chr/" $f > chr_gtf/$chr-fixed.gtf end # pare down protein FASTA header to id and add missing .a: foreach f (chr_ptx/chr*.ptx) set chr = $f:t:r perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+).*$/\>$1.a/;' < \ chr_ptx/$chr.ptx > chr_ptx/$chr-fixed.fa end ldHgGene rn2 twinscan chr_gtf/chr*-fixed.gtf -exon=CDS hgPepPred rn2 generic twinscanPep chr_ptx/chr*-fixed.fa PRODUCING CROSS_SPECIES mRNA ALIGNMENTS (DONE 03/11/03) # Here you align non-rat mRNAs against the masked genome on the # cluster you set up during the previous step. # Make sure that gbpri, gbmam, gbrod, and gbvert are downloaded from # Genbank into /cluster/store2/genbank.133 and unpacked by organism into # /cluster/store2/mrna.133/org.
# Set up cluster run more or less as so: ssh kk cd ~/rn2/bed mkdir xenoMrna cd xenoMrna ls -1S /scratch/hg/rn2/trfFa/* > genome.lst cp -R /cluster/store2/mrna.133/org /mnt/scratch/hg/mrna.133 # The below ls command fails when you have too many files so skip it and # instead run the find command after it. # ls -1S /mnt/scratch/hg/mrna.133/org/*/mrna.fa > allMrna.lst find /mnt/scratch/hg/mrna.133/org -name mrna.fa -ls \ | awk '{print $7,$11}' | grep -v /Rattus_norvegicus/ \ | sort -gr | awk '{print $2}' \ > allMrna.lst # Put the first line of allMrna.lst into 1.org, the second line into # 2.org, and so forth: foreach n (1 2 3 4 5 6) head -$n allMrna.lst | tail -1 > $n.org end # After the 6th line just leave the rest in 7.org. tail +7 allMrna.lst > 7.org # Then ls -1 *.org > mrna.lst cp ~/lastRn/bed/xenoMrna/gsub . mkdir psl gensub2 genome.lst mrna.lst gsub spec para create spec para try para check # If all looks well do para push # Sort xeno mRNA alignments as so: ssh eieio cd ~/rn2/bed/xenoMrna pslSort dirs raw.psl /cluster/store2/temp psl pslReps raw.psl cooked.psl /dev/null -minAli=0.25 liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl pslSortAcc nohead chrom /cluster/store2/temp chrom.psl pslCat -dir chrom > xenoMrna.psl rm -r chrom raw.psl cooked.psl chrom.psl # Load into database as so: ssh hgwdev cd ~/rn2/bed/xenoMrna hgLoadPsl rn2 xenoMrna.psl -tNameIx # Make the xenoRna file # Make a /gbdb symlink for the .fa (not .ra) cd /gbdb/rn2/mrna.133 ln -s /cluster/store2/mrna.133/ratXenoRna.fa ratXenoRna.fa hgLoadRna add -type=xenoRna rn2 /gbdb/rn2/mrna.133/ratXenoRna.fa \ /cluster/store2/mrna.133/ratXenoRna.ra PRODUCING TETRAODON FISH ALIGNMENTS (TODO) o - Download sequence from ... and put it on the cluster local disk at /scratch/hg/fish o - Do fish/rat alignments. ssh kk cd ~/rn2/bed mkdir blatFish cd blatFish mkdir psl ls -1S /scratch/hg/fish/* > fish.lst ls -1S /scratch/hg/rn2/trfFa/* > rat.lst cp ~/lastRn/blatFish/gsub . 
gensub2 rat.lst fish.lst gsub spec para create spec para try Make sure jobs are going ok with para check. Then para push wait about 2 hours and do another para push do para checks and if necessary para pushes until done or use para shove. o - Sort alignments as so pslCat -dir psl | liftUp -type=.psl stdout ~/rn2/jkStuff/liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/store2/temp stdin o - Copy to hgwdev:/scratch. Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/rn2/bed/blatFish/chrom foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_blatFish.psl end hgLoadPsl rn2 *.psl hgLoadRna addSeq rn2 /cluster/store2/fish/seq15jun2001/*.fa # PRODUCING SQUIRT ALIGNMENTS (DONE 2003-06-04 - braney) ssh kkstore mkdir -p ~/rn2/bed/blatCi1 cd ~/rn2/bed/blatCi1 ls -1S /iscratch/i/squirt/ci1/queryFa/*.fa > squirt.lst ls -1S /scratch/hg/rn2/trfFa/* > rat.lst rm -rf psl foreach ctg (`cat rat.lst`) mkdir -p psl/$ctg:t:r end # get gsub2D from someplace gensub2 rat.lst squirt.lst gsub2D spec ssh kk cd ~/rn2/bed/blatCi1 para create spec .... # When cluster run is done, sort alignments: ssh eieio cd ~/rn2/bed/blatCi1 mkdir /tmp/$LOGNAME pslSort dirs raw.psl /tmp/$LOGNAME psl/* pslReps raw.psl cooked.psl /dev/null -minAli=0.05 liftUp -nohead lifted.psl ../../jkStuff/liftAll.lft warn cooked.psl pslSortAcc nohead chrom /tmp/$LOGNAME lifted.psl # Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/rn2/bed/blatCi1/chrom rm -f chr*_blatCi1.psl foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_blatCi1.psl end hgLoadPsl rn2 *.psl # Make squirt /gbdb/ symlink mkdir /gbdb/rn2/squirtSeq cd /gbdb/rn2/squirtSeq ln -s /cluster/store5/squirt/ci1/ciona.rm.fasta PRODUCING FUGU FISH ALIGNMENTS (DONE 03/13/03) # (Already done, for mm2:) # Download sequence to /cluster/store3/fuguSeq from ... and put it on the # cluster local disk at /scratch/hg/fugu on kkstore. 
# Sequence was downloaded from: # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_mask.fasta.Z # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_prot.fasta.Z # mkdir split2.5Mb; cd split2.5Mb; # faSplit about ../fugu_v3_mask.fasta 2500000 fuguSplit ssh kkr1u00 rm -rf /iscratch/i/fugu mkdir /iscratch/i/fugu cp -p /cluster/store3/fuguSeq/split2.5Mb/*.fa /iscratch/i/fugu ~kent/bin/iSync ssh kk mkdir ~/rn2/bed/blatFugu cd ~/rn2/bed/blatFugu ls -1S /iscratch/i/fugu/* > fugu.lst ls -1S /scratch/hg/rn2/trfFa/* > rat.lst cp ~/lastRn/bed/blatFugu/gsub . mkdir psl foreach f (~/rn2/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa) set c=$f:t:r mkdir psl/$c end gensub2 rat.lst fugu.lst gsub spec para create spec para try para check para push para check # Sort alignments: ssh eieio cd ~/rn2/bed/blatFugu pslCat -dir psl/* \ | liftUp -type=.psl stdout ~/rn2/jkStuff/liftAll.lft warn stdin \ | pslSortAcc nohead chrom /cluster/store2/temp stdin # load into database: ssh hgwdev cd ~/rn2/bed/blatFugu/chrom foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_blatFugu.psl end hgLoadPsl rn2 *.psl mkdir -p /gbdb/rn2/fuguSeq cd /gbdb/rn2/fuguSeq ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta cd /cluster/store2/temp hgLoadRna addSeq rn2 /gbdb/rn2/fuguSeq/fugu_v3_mask.fasta MAKE LIFT FILE FOR AGPS (DONE 02/05/03) ssh eieio cd ~/rn2/jkStuff ./jkStuff/agpToLift.pl chrom.sizes ?{,?}/chr?{,?}{,_random}.agp \ > jkStuff/liftRNOR.lft LOAD BACTIG POSITIONS (DONE 02/18/03) ssh hgwdev mkdir -p ~/rn2/bed/bactigPos cd ~/rn2/bed/bactigPos # Paul Havlak havlak@swan.hgsc.bcm.tmc.edu sent us a BED 4+ email # attachment. 
# Save the attachment as ~/rn2/bed/bactigPos/Rnor2-1.extreme.fix # Fix the 1-based starts to 0-based: awk "-F\t" '{printf "%s\t%d\t%s\t%s\t%s\t%s\n", $1, $2-1, $3, $4, $5, $6;}' < Rnor2-1.extreme.fix > bactigPos.bed hgLoadBed rn2 bactigPos bactigPos.bed \ -noBin -sqlTable=$HOME/kent/src/hg/lib/bactigPos.sql LOAD CPGISLANDS (DONE 03/06/03) ssh eieio mkdir -p ~/rn2/bed/cpgIsland cd ~/rn2/bed/cpgIsland # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu) # copy the tar file to the current directory cp ~/lastRn/bed/cpgIsland/cpg_dist.tar . tar xvf cpg_dist.tar cd cpg_dist gcc readseq.c cpg_lh.c -o cpglh.exe cd .. foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked) set fout=$f:t:r:r.cpg echo running cpglh on $f to $fout ./cpg_dist/cpglh.exe $f > $fout.cpg end # copy filter.awk from a previous release cp ~/lastRn/bed/cpgIsland/filter.awk . awk -f filter.awk chr*.cpg > cpgIsland.bed # load into database: ssh hgwdev cd ~/rn2/bed/cpgIsland hgLoadBed rn2 cpgIsland -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed LOAD SOFTBERRY GENES (DONE 02/04/03) cd /cluster/store4/rn2/bed mkdir softberry cd softberry wget ftp://www.softberry.com/pub/SC_RAT_JAN03/Softb_rat_gff_j03.tar.gz gunzip -c Softb_rat_gff_j03.tar.gz | tar xvf - ldHgGene rn2 softberryGene chr*.gff hgPepPred rn2 softberry *.protein hgSoftberryHom rn2 *.protein LOAD GENEID GENES (DONE 04/01/03) mkdir -p ~/rn2/bed/geneid/download cd ~/rn2/bed/geneid/download foreach f (~/rn2/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget http://genome.imim.es/genepredictions/R.norvegicus/rnJan2003/geneid_v1.1/$chr.gtf wget http://genome.imim.es/genepredictions/R.norvegicus/rnJan2003/geneid_v1.1/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd ..
ldHgGene rn2 geneid download/*.gtf -exon=CDS hgPepPred rn2 generic geneidPep download/*-fixed.prot SGP GENE PREDICTIONS (DONE 2003-05-19 - Hiram) (RELOADED 10/28/03 angie) mkdir -p ~/rn2/bed/sgp/download cd ~/rn2/bed/sgp/download foreach f (~/rn2/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget http://genome.imim.es/genepredictions/R.norvegicus/rnJan2003/SGP/humangp20021114/$chr.gtf wget http://genome.imim.es/genepredictions/R.norvegicus/rnJan2003/SGP/humangp20021114/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. ldHgGene rn2 sgpGene download/*.gtf -exon=CDS hgPepPred rn2 generic sgpPep download/*-fixed.prot SGP GENES (UPDATE 1/18/2006) sgpPep table dropped, replaced by hgc generated protein seq in browser TIGR GENE INDEX (TODO) mkdir -p ~/rn2/bed/tigr cd ~/rn2/bed/tigr wget ftp://ftp.tigr.org/private/NHGI_mgi_jiashu/TGI_track_RatGenome_Feb2002.tgz gunzip -c TGI_track_RatGenome_Feb2002.tgz | tar xvf - foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (rat cow human pig rat) setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ldHgGene -exon=TC rn2 tigrGeneIndex *.gff LOAD STS MAP (todo) - login to hgwdev cd ~/rn2/bed rn2 < ~/src/hg/lib/stsMap.sql mkdir stsMap cd stsMap bedSort /projects/cc/hg/mapplots/data/tracks/build28/stsMap.bed stsMap.bed - Enter database with "rn2" command. - At mysql> prompt type in: load data local infile 'stsMap.bed' into table stsMap; - At mysql> prompt type quit LOAD MGI IDs (TODO) - The Locuslink ID to MGI IDs conversion data file, LL2MGI.txt, from Jackson Lab should be found under ~/rn2/bed/refSeq - login to hgwdev cd ~/rn2/bed/refSeq rn2 < ~/src/hg/lib/mgiID.sql - Enter database with "rn2" command.
- At mysql> prompt type in: load data local infile 'LL2MGI.txt' into table MGIid; - At mysql> prompt type quit LOAD CHROMOSOME BANDS (todo) - login to hgwdev cd /cluster/store4/rn2/bed mkdir cytoBands cp /projects/cc/hg/mapplots/data/tracks/build28/cytobands.bed cytoBands rn2 < ~/src/hg/lib/cytoBand.sql Enter database with "rn2" command. - At mysql> prompt type in: load data local infile 'cytobands.bed' into table cytoBand; - At mysql> prompt type quit LOAD RATREF TRACK (todo) First copy in data from kkstore to ~/rn2/bed/ratRef. Then substitute 'genome' for the appropriate chromosome in each of the alignment files. Finally do: hgRefAlign webb rn2 ratRef *.alignments LOAD AVID RAT TRACK (todo) ssh cc98 cd ~/rn2/bed mkdir avidRat cd avidRat wget http://pipeline.lbl.gov/tableCS-LBNL.txt hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed hgLoadBed avidRepeat avidRepeat.bed hgLoadBed avidUnique avidUnique.bed LOAD SNPS (TODO) - ssh hgwdev - cd ~/rn2/bed - mkdir snp - cd snp - Download SNPs from ftp://ftp.ncbi.nlm.nih.gov/pub/sherry/rat.b27.out.gz - Unpack. createBed < rat.b27.out > snpNih.bed hgLoadBed rn2 snpNih snpNih.bed LOAD ENSEMBL ESTs (TODO) ln -s /cluster/store4/rn2 ~/rn2 mkdir -p ~/rn2/bed/ensembl cd ~/rn2/bed/ensembl wget http://www.ebi.ac.uk/~stabenau/rat-est.gz wget http://www.ebi.ac.uk/~stabenau/rat-est.pep.gz gunzip -c rat-est.gz | \ perl -w -p -e 's/^(\w)/chr$1/' > rat-est-fixed.gtf ldHgGene rn2 ensEst rat-est-fixed.gtf > The id behind '>' is internal and was not in our gtf dump, so > you have to do some more parsing. 
# pick out the transcript= attribute -- that's the id to use: # also remove the first line: gunzip -c rat-est.pep.gz | tail +2 | \ perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \ rat-est-fixed.pep hgPepPred rn2 generic ensEstPep rat-est-fixed.pep LOAD ENSEMBL GENES (TODO) mkdir -p ~/rn2/bed/ensembl cd ~/rn2/bed/ensembl wget http://www.ebi.ac.uk/~stabenau/rat-ensembl.gz wget http://www.ebi.ac.uk/~stabenau/rat-ensembl.pep.gz gunzip -c rat-ensembl.gz | \ perl -w -p -e 's/^(\w)/chr$1/' > rat-ensembl-fixed.gtf ldHgGene rn2 ensGene rat-ensembl-fixed.gtf > rat-ensembl contains stopcodons, due to some glitches in our > genebuild. The id behind '>' is internal and was not in our gtf dump, so > you have to do some more parsing. # pick out the transcript= attribute -- that's the id to use: # also remove the first line: tail +2 rat-ensembl.pep | \ perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \ rat-ensembl-fixed.pep hgPepPred rn2 generic ensPep rat-ensembl-fixed.pep LOAD RNAGENES (todo) - login to hgwdev - cd ~kent/src/hg/lib - rn2 < rnaGene.sql - cd /cluster/store4/rn2/bed - mkdir rnaGene - cd rnaGene - download data from ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-oo27.gff.gz - gunzip *.gz - liftUp chrom.gff ../../jkStuff/liftAll.lft carry ncrna-oo27.gff - hgRnaGenes rn2 chrom.gff LOAD EXOFISH (todo) - login to hgwdev - cd /cluster/store4/rn2/bed - mkdir exoFish - cd exoFish - rn2 < ~kent/src/hg/lib/exoFish.sql - Put email attachment from Olivier Jaillon (ojaaillon@genoscope.cns.fr) into /cluster/store4/rn2/bed/exoFish/all_maping_ecore - awk -f filter.awk all_maping_ecore > exoFish.bed - hgLoadBed rn2 exoFish exoFish.bed LOAD GENIE (TODO) mkdir -p ~/rn2/bed/genieAlt cd ~/rn2/bed/genieAlt wget http://www.neomorphic.com/mgap/mgscv3/gtf/mgscv3.genie.gtf.tgz gunzip -c mgscv3.genie.gtf.tgz | tar xvf - ldHgGene rn2 genieAlt mgscv3.genie.gtf/chr*.gtf wget http://www.neomorphic.com/mgap/mgscv3/fa/mgscv3.aa.tgz gunzip -c mgscv3.aa.tgz | tar xvf -
hgPepPred rn2 genie geniePep chr*.aa.fa LOAD GENIE CLONE BOUNDS (TODO) mkdir -p ~/rn2/bed/genieBounds cd ~/rn2/bed/genieBounds wget http://www.neomorphic.com/mgap/mgscv3/cb.bed/mgscv3_cb.bed.tgz gunzip -c mgscv3_cb.bed.tgz | tar xvf - - Trim the track definition from each file (these are actually custom track files): foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un) tail +2 chr${c}_cb.bed > chr${c}_cb-fixed.bed end hgLoadBed rn2 genieBounds *-fixed.bed LOAD SOFTBERRY GENES (todo) - ln -s /cluster/store4/rn2 ~/rn2 - cd ~/rn2/bed - mkdir softberry - cd softberry - get ftp://www.softberry.com/pub/SC_MOU_NOV01/softb_mou_genes_nov01.tar.gz ldHgGene rn2 softberryGene chr*.gff hgPepPred rn2 softberry *.protein hgSoftberryHom rn2 *.protein LOAD GENOMIC DUPES (todo) o - Load genomic dupes ssh hgwdev cd ~/rn2/bed mkdir genomicDups cd genomicDups wget http://codon/jab/web/takeoff/oo33_dups_for_kent.zip unzip *.zip awk -f filter.awk oo33_dups_for_kent > genomicDups.bed hgsql rn2 < ~/src/hg/lib/genomicDups.sql hgLoadBed rn2 -oldTable genomicDups genomicDups.bed LOAD RGD CURATED GENES TRACK - cd rn2 - cd bed - mkdir rgdGene - Browse to http://zephyrus.brc.mcw.edu/cgi-bin/pub/viewcvs.cgi/pub_gbrowse/gff_files/RGD_curated_genes.gff This is a web-based CVS page. Click the download link and save the file to ~/rn2/bed/RGD_curated_genes.gff - Now massage the data format using: rn2/bed/rgdGene/massage.pl - Load the data: ldHgGene rn2 rgdGene Fixed_RGD_Curated_genes.gff - Create the link table for searching In mysql for the rn2 database do: create table rgdLink (id int primary key, name varchar(32) not null); LOAD DATA LOCAL INFILE 'RGD.links' into table rgdLink; FAKING DATA FROM PREVIOUS VERSION (This is just a stopgap until the proper track arrives. Rescues about 97% of data. Just an experiment, not really followed through on).
o - Rescuing STS track: - log onto hgwdev - mkdir ~/rn2/rescue - cd !$ - mkdir sts - cd sts - bedDown hg3 mapGenethon sts.fa sts.tab - echo ~/rn2/sts.fa > fa.lst - pslOoJobs ~/rn2 ~/rn2/rescue/sts/fa.lst ~/rn2/rescue/sts g2g - log onto cc01 - cd ~/rn2/rescue/sts - split all.con into 3 parts and condor_submit each part - wait for assembly to finish - cd psl - mkdir all - ln ?/*.psl ??/*.psl *.psl all - pslSort dirs raw.psl temp all - pslReps raw.psl contig.psl /dev/null - rm raw.psl - liftUp chrom.psl ../../../jkStuff/liftAll.lft carry contig.psl - rm contig.psl - mv chrom.psl ../convert.psl # CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK) # This depends on the go and uniProt databases as well as # the kgAlias and kgProtAlias tables. The hgKgGetText takes # about 5 minutes when the database is not too busy. The rest # is real quick. ssh hgwdev cd /cluster/data/rn2/bed mkdir -p knownGene/index cd -p knownGene/index hgKgGetText rn2 knownGene.text ixIxx knownGene.text knownGene.ix knownGene.ixx ln -s /cluster/data/rn2/bed/knownGene/index/knownGene.ix /gbdb/rn2/knownGene.ix ln -s /cluster/data/rn2/bed/knownGene/index/knownGene.ixx /gbdb/rn2/knownGene.ixx # MYTOUCH FIX - jen - 2006-01-24 sudo mytouch rn2 geneidPep 0403251000.00 sudo mytouch rn2 twinscanPep 0403251000.00 sudo mytouch rn2 dupSpMrna 0403251000.00 sudo mytouch rn2 keggPathway 0403251000.00 sudo mytouch rn2 kgAlias 0403251000.00 sudo mytouch rn2 kgProtAlias 0403251000.00 sudo mytouch rn2 kgXref 0403251000.00 sudo mytouch rn2 geneidPep 0404031400.00 sudo mytouch rn2 twinscanPep 0404031400.00 Other fixes at same time: Adjusted all.joiner rule to remove false error: added !rn to ensemblTranscriptId $kgDb,!rn2.knownToEnsembl.value chopAfter=. The check was comparing an empty table to a track (ensGene) that does not exist for this database.