This file describes how we made the browser database on the Rattus Norvegicus genome, November 2002 build. CREATING DATABASE (DONE 11/25/02) # Create the database. ssh hgwdev # Enter mysql via: mysql -u hgcat -pbigsecret # At mysql prompt type: create database rn1; quit # make a semi-permanent read-only alias: alias rn1 "mysql -u hguser -phguserstuff -A rn1" # Use df to make sure there is at least 5 gig free on # hgwdev:/var/lib/mysql COPY OVER JKSTUFF SCRIPTS DIRECTORY (DONE 11/25/02) ssh eieio ln -s /cluster/store4/rn1 ~/rn1 # This is the first rat, so use mm2 as the previous "rat" build: ln -s /cluster/store2/mm.2002.02/mm2 ~/lastRn cd ~/rn1 cp -Rp ~/lastRn/jkStuff . rm jkStuff/*.{out,lst,lft} jkStuff/*~ BREAK UP SEQUENCE INTO 5 MB CHUNKS AT NON_BRIDGED CONTIGS (DONE 11/25/02) # This version of the rat sequence data is in /cluster/store4/rn1. # The version of the sequence downloaded 11/25/02 is multi-record FA # with AGP. Use agpToFA to make chromosome .fa files, then use # splitFaIntoContigs to make contig chromosome files. ssh eieio cd ~/rn1 foreach c (?{,?}) agpToFa -simpleMulti $c/chr$c.agp chr$c $c/chr$c.fa $c/chr$c.contig.fa if (-e $c/chr${c}.random.agp) then agpToFa -simpleMulti $c/chr$c.random.agp chr${c}_random \ $c/chr${c}_random.fa $c/chr$c.random.contig.fa endif end # Check that the size of each chromosome .fa file is equal to the # last coord of the .agp: foreach f ( */*.agp ) set agpLen = `tail -1 $f | awk '{print $3;}'` set g = `echo $f:r | sed -e 's/\.random/_random/'` set faLen = `faSize $g.fa | awk '{print $1;}'` if ($agpLen == $faLen) then echo $f length = $g length = $faLen else echo Error\!\!\! $f length = $agpLen, but $g length = $faLen endif end ssh hgwdev cd into your CVS source tree under kent/src/hg/splitFaIntoContigs make # This will split the rat sequence into approx. 5 Mbase # supercontigs between non-bridged clone contigs and drop the # resulting dir structure in /cluster/store4/rn1.
The resulting # dir structure will include 1 dir for each chromosome, each of # which has a set of subdirectories, one subdir per supercontig. ssh eieio cd ~/rn1 foreach c (?{,?}) cp -p $c/chr$c.agp $c/chr$c.agp.bak cp -p $c/chr$c.fa $c/chr$c.fa.bak splitFaIntoContigs $c/chr$c.agp $c/chr$c.fa . -nSize=5000000 if (-e $c/chr${c}_random.fa) then cp -p $c/chr$c.random.agp $c/chr$c.random.agp.bak cp -p $c/chr${c}_random.fa $c/chr${c}_random.fa.bak splitFaIntoContigs $c/chr$c.random.agp $c/chr${c}_random.fa . \ -nSize=5000000 mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst mv ${c}_random/lift/ordered.lft $c/lift/random.lft mv ${c}_random/lift/ordered.lst $c/lift/random.lst rmdir ${c}_random/lift rm ${c}_random/chr${c}_random.{agp,fa} mv ${c}_random/* $c rmdir ${c}_random endif end # Make sure the reconstructed .fa jives with the original: foreach f ( */*.fa.bak ) echo $f:r diff $f $f:r | wc -l end # The .agp goes through a slight format change, but make sure it # at least ends up with the same number of lines: foreach f ( */*.agp.bak ) set l1 = `wc -l $f | awk '{print $1;}'` set l2 = `wc -l $f:r | awk '{print $1;}'` if ($l1 == $l2) then echo "$f and $f:r have the same #lines" else echo Error\!\!\! $f has $l1 lines, but $f:r has $l2 endif end REPEAT MASKING (DONE 11/26/02) Split contigs, run RepeatMasker, lift results Notes: * If there is a new version of RepeatMasker, build it and ask the admins to binrsync it (kkstore:/scratch/hg/RepeatMasker/*). * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make RepeatMasker runs manageable on the cluster ==> results need lifting. * For the NCBI assembly we repeat mask on the sensitive mode setting (RepeatMasker -m -s) #- Split contigs into 500kb chunks: ssh eieio cd ~/rn1 foreach d ( */chr*_?{,?} ) cd $d set contig = $d:t faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -maxN=500000 cd ../.. 
end #- Make the run directory and job list: cd ~/rn1 mkdir RMRun rm -f RMRun/RMJobs touch RMRun/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/bin/scripts/RMRat \ /cluster/store4/rn1/$d $f \ '{'check out line+ /cluster/store4/rn1/$d/$f.out'}' \ >> RMRun/RMJobs end end #- Do the run ssh kk cd ~/rn1/RMRun para create RMJobs para try, para check, para check, para push, para check,... #- Lift up the split-contig .out's to contig-level .out's ssh eieio cd ~/rn1 foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. end #- Lift up the contig-level .out's to chr-level cd ~/rn1 ./jkStuff/liftOut5.sh # soft-mask contig .fa's with .out's foreach i (?{,?}) foreach j ($i/chr${i}_?{,?}/chr${i}_?{,?}.fa \ $i/chr${i}_random_?{,?}/chr${i}_random_?{,?}.fa) maskOutFa $j $j.out $j -soft end echo done $i end #- copy the contig .fa's to the appropriate place on /scratch ssh kkstore mkdir -p /scratch/hg/rn1/contigs.1126 cp -p ~/rn1/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa \ /scratch/hg/rn1/contigs.1126 #- Load the .out files into the database with: ssh hgwdev cd ~/rn1 hgLoadOut rn1 ?{,?}/*.fa.out VERIFY REPEATMASKER RESULTS (DONE 11/26/02) # Run featureBits on rn1 and on a comparable genome build, and compare: ssh hgwdev featureBits rn1 rmsk # --> 1081814344 bases of 2852382926 (37.927%) in intersection featureBits mm2 rmsk # --> 1037964664 bases of 2726995854 (38.063%) in intersection # Run RepeatMasker manually on a couple mini-contigs, verify that # they appear in the chrom .out and are masked in the sequence file. # Look at the DNA sequence of some RepeatMasker elements in the # genome browser; visually compare them to the RepeatMasker library # elements and make sure an alignment is plausible. 
MAKE LIFTALL.LFT (DONE 11/26/02) cd ~/rn1 cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE 11/26/02) # Load chromosome sequence info into database and save size info. ssh hgwdev hgsql rn1 < ~/src/hg/lib/chromInfo.sql cd ~/rn1 hgNibSeq -preMadeNib rn1 /cluster/store4/rn1/nib \ ?{,?}/chr?{,?}{,_random}.fa echo "select chrom,size from chromInfo" | hgsql rn1 > chrom.sizes # Make /gbdb/rn1/nib directory mkdir -p /gbdb/rn1/nib cd /gbdb/rn1/nib foreach f ( /cluster/store4/rn1/nib/chr*.nib ) ln -s $f end MAKE GCPERCENT (DONE 11/26/02) ssh hgwdev cd /cluster/store4/rn1/bed mkdir gcPercent cd gcPercent hgsql rn1 < ~/src/hg/lib/gcPercent.sql hgGcPercent rn1 ../../nib MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR RN1 (DONE 11/26/02) # Enter rn1 into hgcentraltest.dbDb so test browser knows about it: ssh genome-testdb mysql -u root -pbigSecret hgcentral insert into dbDb values("rn1", "Rat Nov. 2002", "/cluster/store4/rn1/nib", "Rat", "chr1:15647901-18981566", 1, 10, "Rat"); quit # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/src/hg/makeDb/hgTrackDb cvs up -d -P # Edit that makefile to add rn1 in all the right places and do make make update cvs commit makefile MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR RN1 (DONE 12/4/02) ssh hgwdev mysql -h genome-testdb -u hgcat -pBIGSECRET -A hgcentraltest insert into blatServers values("rn1", "blat7", "17778", "1"); insert into blatServers values("rn1", "blat7", "17779", "0"); quit COPY GENBANK MRNA DATA TO THE CLUSTER (DONE for mrna.132) ssh kkstore # Copy the rna data to the cluster if it isn't there already: mkdir /scratch/hg/mrna.132 mkdir /scratch/hg/mrna.132/Rattus_norvegicus cp /cluster/store2/mrna.132/org/Rattus_norvegicus/*.fa /scratch/hg/mrna.132/Rattus_norvegicus # Distribute this data to the local nodes: ask sysadmins, or run: sudo /cluster/install/utilities/updateLocal GOLD AND GAP TRACKS (DONE 11/26/02) ssh hgwdev cd ~/rn1 hgGoldGapGl -noGl 
rn1 /cluster/store4/rn1 . CREATE RELATIONAL MRNA DATABASE (DONE 11/27/02) ssh hgwdev cd /cluster/store2/mrna.132 Make the anyRna.fil file using this text: restrict mol=mRNA hide mol # Make the RNAs gunzip -c /cluster/store2/genbank.132/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \ | gbToFaRa -byOrganism=org anyRna.fil mrna.fa mrna.ra mrna.ta stdin # Make the ESTs gunzip -c /cluster/store2/genbank.132/gbest*.gz | \ gbToFaRa anyRna.fil est.fa est.ra est.ta stdin -byOrganism=org # Make symbolic links from hgwdev's /gbdb to the real .fa locations: mkdir -p /gbdb/rn1/mrna.132 cd /gbdb/rn1/mrna.132 ln -s /cluster/store2/mrna.132/org/Rattus_norvegicus/mrna.fa ln -s /cluster/store2/mrna.132/org/Rattus_norvegicus/mrna.ra ln -s /cluster/store2/mrna.132/org/Rattus_norvegicus/est.fa ln -s /cluster/store2/mrna.132/org/Rattus_norvegicus/est.ra ln -s /cluster/store1/refSeqCum/112602/org/Rattus_norvegicus/refSeq.fa ln -s /cluster/store1/refSeqCum/112602/org/Rattus_norvegicus/refSeq.ra # Call hgLoadRna with the /gbdb locations: cd /cluster/store2/temp hgLoadRna new rn1 hgLoadRna add -type=mRNA rn1 /gbdb/rn1/mrna.132/mrna.fa /gbdb/rn1/mrna.132/mrna.ra hgLoadRna add -type=EST rn1 /gbdb/rn1/mrna.132/est.fa /gbdb/rn1/mrna.132/est.ra MAKING AND STORING mRNA AND EST ALIGNMENTS (DONE 11/28/02) # Use BLAT to generate refSeq, mRNA and EST alignments as so: # Make sure that /scratch/hg/rn1/contigs is loaded # with chr*_*.fa and pushed to the cluster nodes. ssh kk cd ~/rn1/bed foreach i (refSeq mrna est) mkdir -p $i cd $i ls -1S /scratch/hg/rn1/contigs.1126/* > genome.lst ls -1 /scratch/hg/mrna.132/Rattus_norvegicus/$i.fa > mrna.lst cp ~/mm2/bed/$i/gsub . mkdir psl gensub2 genome.lst mrna.lst gsub spec cd .. end # check on progress with para check in mrna, est, and refSeq directories. # report on runtime when finished: para create spec para try para push para time > time # Process refSeq, mRNA, and EST alignments into near best in genome. 
ssh eieio cd ~/rn1/bed cd refSeq pslSort dirs raw.psl /cluster/store2/temp psl pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl \ contig.psl /dev/null liftUp -nohead all_refSeq.psl ../../jkStuff/liftAll.lft warn contig.psl pslSortAcc nohead chrom /cluster/store2/temp all_refSeq.psl cd .. cd mrna pslSort dirs raw.psl /cluster/store2/temp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \ /dev/null liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft warn contig.psl pslSortAcc nohead chrom /cluster/store2/temp all_mrna.psl cd .. cd est pslSort dirs raw.psl /cluster/store2/temp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \ /dev/null liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft warn contig.psl pslSortAcc nohead chrom /cluster/store2/temp all_est.psl cd .. # Load mRNA alignments into database. ssh hgwdev cd /cluster/store4/rn1/bed/mrna/chrom foreach i (chr?{,?}{,_random}.psl) mv $i $i:r_mrna.psl end hgLoadPsl rn1 *.psl cd .. hgLoadPsl rn1 all_mrna.psl -nobin # Load EST alignments into database. ssh hgwdev cd /cluster/store4/rn1/bed/est/chrom foreach i (chr?{,?}{,_random}.psl) mv $i $i:r_est.psl end hgLoadPsl rn1 *.psl cd .. hgLoadPsl rn1 all_est.psl -nobin # Create subset of ESTs with introns and load into database. ssh eieio cd ~/rn1 tcsh jkStuff/makeIntronEst.sh ssh hgwdev cd ~/rn1/bed/est/intronEst hgLoadPsl rn1 *.psl # Load refSeq alignments into database ssh hgwdev cd ~/rn1/bed/refSeq pslCat -dir chrom > refSeqAli.psl hgLoadPsl rn1 -tNameIx refSeqAli.psl PRODUCING ESTORIENTINFO TABLE (DONE 12/2/02 - MATT) This table is needed for proper orientation of ESTs in the browser. Many will appear on the wrong strand without it. This involves a cluster run. First load the EST psl files as so: ssh eieio cd ~/rn1/bed/est pslSortAcc nohead contigs /cluster/store2/temp contig.psl ssh kkstore mkdir /scratch/hg/rn1/est cd ~/rn1/bed/est cp -r contigs /scratch/hg/rn1/est Wait for these to finish. 
cd .. mkdir estOrientInfo cd estOrientInfo mkdir ei ls -1S /scratch/hg/rn1/est/contigs/* > psl.lst cp ~/mm2/bed/estOrientInfo/gsub . Update gsub to refer to rat contig sequence currently on /scratch/hg/rn1/contigs.1126, and rat ESTs on /scratch/hg/rn1/est/contigs and the rat est in /scratch/hg/mrna.132/Rattus_norvegicus/est.fa. gensub2 psl.lst single gsub spec ssh kk para create spec Then run the job on the cluster cd ~/rn1/bed/estOrientInfo para try sleep 60 para check If things look good para push Wait for this to finish then liftUp estOrientInfo.bed ../../jkStuff/liftAll.lft warn ei/*.tab Load them into database as so: ssh hgwdev cd ~/rn1/bed/estOrientInfo hgLoadBed rn1 estOrientInfo estOrientInfo.bed -sqlTable=/cluster/home/kent/src/hg/lib/estOrientInfo.sql CREATE RNACLUSTER TABLE (DONE 12/2/02 - MATT) Make sure that refSeqAli and estOrientInfo tables are made already (see above). ssh hgwdev cd ~/rn1/bed mkdir rnaCluster cd rnaCluster mkdir rna est foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un) clusterRna rn1 rna/chr$i.bed est/chr$i.bed -chrom=chr$i echo done $i end hgLoadBed rn1 rnaCluster est/*.bed PRODUCING KNOWN GENES (DONE 11/26/02 - MATT) o - Download everything from ftp://ftp.ncbi.nih.gov/refseq/cumulative into /cluster/store1/refSeqCum/112602 cd /cluster/store1/refSeqCum/112602 uncompress rscu.gbff.Z Put the following lines into anyRna.fil restrict mol=mRNA hide mol gbToFaRa anyRna.fil refSeq.fa refSeq.ra refSeq.ta -byOrganism=org This will create fasta and annotations for the rat in refSeq/org in refSeq.fa and refSeq.ra under the org/Rattus_norvegicus directory o - Get extra info from NCBI and produce refGene table as so: wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc o - hgLoadRna add -type=refSeq rn1 /gbdb/rn1/mrna.132/refSeq.fa /gbdb/rn1/mrna.132/refSeq.ra o - Produce refGene, refPep, refMrna, and refLink tables as so: Get the rat proteins: wget
ftp://ftp.ncbi.nih.gov/refseq/R_norvegicus/mRNA_Prot/rat.faa.gz gunzip rat.faa.gz cd ~/rn1/bed/refSeq hgRefSeqMrna rn1 /cluster/store1/refSeqCum/112602/org/Rattus_norvegicus/refSeq.fa \ /cluster/store1/refSeqCum/112602/org/Rattus_norvegicus/refSeq.ra \ all_refSeq.psl /cluster/store1/refSeqCum/112602/loc2ref \ /cluster/store1/refSeqCum/112602/rat.faa \ /cluster/store1/refSeqCum/112602/mim2loc o - Add RefSeq status info hgRefSeqStatus rn1 /cluster/store1/refSeqCum/112602/loc2refE o - create precomputed join of refFlat and refGene: echo 'CREATE TABLE refFlat (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) SELECT refLink.name as geneName, refGene.* FROM refLink,refGene WHERE refLink.mrnaAcc = refGene.name' | hgsql rn1 SIMPLE REPEAT TRACK (DONE* 11/26/02) * 18 of the 590 contigs were re-run with TRF version 2 (trf.2) due to crash/hang problems with TRF v3.01 (trf). ssh kk mkdir ~/rn1/bed/simpleRepeat cd ~/rn1/bed/simpleRepeat cp ~/lastRn/bed/simpleRepeat/gsub . mkdir trf ls -1 /scratch/hg/rn1/contigs.1126/*.fa > genome.lst touch single gensub2 genome.lst single gsub spec para create spec para try para push # When job is done do: liftUp simpleRepeat.bed ~/rn1/jkStuff/liftAll.lft warn trf/*.bed # Load this into the database as so ssh hgwdev cd ~/rn1/bed/simpleRepeat hgLoadBed rn1 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql PREPARING SEQUENCE FOR CROSS SPECIES ALIGNMENTS (DONE 11/27/02) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh eieio cd ~/rn1/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd ~/rn1 mkdir -p bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' 
\ $c/lift/random.lst > $c/lift/rTrf.lst endif liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` if (-e $c/lift/rTrf.lst) then liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # Make a trfMasked version of the contigs: cd ~/rn1 mkdir trfFa foreach f (?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa) set c = $f:t:r maskOutFa -softAdd $f bed/simpleRepeat/trfMask/$c.bed trfFa/$c.fa.trf end # Then make sure there is enough space available on /scratch and do ssh kkstore cp -Rp ~/rn1/trfFa /scratch/hg/rn1/trfFa.1127 # ask for binrsync, or sudo /cluster/install/utilities/updateLocal MAKE DOWNLOADABLE SEQUENCE FILES (DONE 11/27/02) # This used to be done right after RepeatMasking. Now, we mask with # TRF as well, so do this after the "PREPARING SEQUENCE" step above. ssh eieio cd ~/rn1 #- Soft-mask (lower-case) the contig and chr .fa's ./jkStuff/makeFaMasked.sh #- Make hard-masked .fa.masked files as well: ./jkStuff/makeHardMasked.sh #- Rebuild the nib, mixedNib, maskedNib files: ./jkStuff/makeNib.sh #- Rebuild the .zip files ./jkStuff/zipAll.sh |& tee zipAll.log #- Look at zipAll.log to make sure all file lists look reasonable. #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test #- Copy the .zip files to hgwdev:/usr/local/apache/... ssh hgwdev cd ~/rn1 ./jkStuff/cpToWeb.sh cd /usr/local/apache/htdocs/goldenPath/rnNov2002 #- Take a look at bigZips/* and chromosomes/*, update their README.txt's PRODUCING GENSCAN PREDICTIONS (DONE 12/3/02) # Log into kkr1u00 (not kk!). kkr1u00 is the driver node for the small # cluster (kkr2u00 -kkr8u00. (genscan has problem running on the # big cluster, due to limitation of memory and swap space on each # processing node). 
ssh kkr1u00 mkdir -p ~/rn1/bed/genscan cd ~/rn1/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f genome.list touch genome.list foreach f ( `ls -1S /cluster/store4/rn1/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end # Create template file, gsub, for gensub2. For example (3-line file): #LOOP /cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP echo "" > dummy.list gensub2 genome.list dummy.list gsub jobList para create jobList para try para check para push # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". # Convert these to chromosome level files as so: liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd ~/rn1/bed/genscan ldHgGene rn1 genscan genscan.gtf hgPepPred rn1 generic genscanPep genscan.pep hgLoadBed rn1 genscanSubopt genscanSubopt.bed PREPARE CLUSTER FOR BLASTZ RUN (DONE 12/3/02) ssh eieio # Extract lineage-specific repeats using Arian Smit's script: mkdir -p ~/rn1/bed/linSpecRep cd ~/rn1/bed/linSpecRep foreach f (~/rn1/*/*.out) ln -sf $f .
end /cluster/bin/scripts/rodentSpecificRepeats.pl *.out /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec rm *.out # Copy files to the kkstore:/scratch ssh kkstore # lineage-specific repeats: cd ~/rn1/bed rm -rf /scratch/hg/rn1/linSpecRep cp -Rp linSpecRep /scratch/hg/rn1 # RepeatMasker .out: cd ~/rn1 rm -rf /scratch/hg/rn1/rmsk mkdir -p /scratch/hg/rn1/rmsk cp -p ?{,?}/chr?{,?}{,_random}.fa.out /scratch/hg/rn1/rmsk # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /scratch/hg/rn1/chromTrfMixedNib mkdir -p /scratch/hg/rn1/chromTrfMixedNib cp -p mixedNib/chr*.nib /scratch/hg/rn1/chromTrfMixedNib # Ask cluster-admin@cse.ucsc.edu to binrsync /scratch/hg to clusters MAKING THE BLASTZBESTRAT TRACK FROM PENN STATE AXT FILES (TODO) # When Scott Schwartz is done generating .axt's for the blastz # alignments... # Create tSizes (human chrom size) and qSizes (rat) for axtToPsl. ssh hgwdev cd /cluster/store1/gs.13/build30/blastz.rn1gs13.2002-08-30 In mysql: use rn1 select chrom,size from chromInfo; use hg12 select chrom,size from chromInfo; Edit the results of the first select into a tab-separated tSizes, edit the results of the second select into a tab-separated qSizes. # Consolidate AXT files to chrom level, sort, pick best, make psl. ssh eieio set base="/cluster/store1/gs.13/build30/blastz.rn1gs13.2002-08-30" set tbl="blastzBestHuman_08_30" cd $base mkdir -p axtBest pslBest foreach f (run.2/chr*.axt.gz) set chr=$f:t:r:r gunzip $f axtBest -minScore=300 $f:r $chr axtBest/$chr.axt axtToPsl axtBest/$chr.axt tSizes qSizes pslBest/${chr}_${tbl}.psl gzip $f:r echo Done with $chr. end # If axtBest runs out of memory, split those chromosome files into # smaller pieces and run axtBest in 2 passes. foreach f (run.2/chr{7,Un}.axt.gz) set chr=$f:t:r:r rm -f xx* gunzip -c $f \ | perl -we '$/=""; $seq=0; \ while (<>) { \ if (($. % 1000) == 1) { \ close(OUT) if ($. 
> 1000); \ $seq++; \ open(OUT, ">xx$seq"); \ } \ print OUT; \ } \ close(OUT);' foreach s (xx*) axtBest -minScore=300 $s $chr $s.axtBest end cat xx?*.axtBest > xx.axtBest axtBest -minScore=300 xx.axtBest $chr axtBest/$chr.axt rm xx* axtToPsl axtBest/$chr.axt tSizes qSizes \ pslBest/${chr}_${tbl}.psl echo Done with $chr. end # Load tables ssh hgwdev set base="/cluster/store1/gs.13/build30/blastz.rn1gs13.2002-08-30" cd $base/pslBest hgLoadPsl rn1 *.psl rm -f ~/kent/src/hg/lib/axtInfoFillRn1.sql touch ~/kent/src/hg/lib/axtInfoFillRn1.sql foreach f ($base/axtBest/chr*.axt) set chr=$f:t:r echo 'insert into axtInfo values ("hg12","Blastz Best in Genome","'$chr'","'$f'");' \ >> ~/kent/src/hg/lib/axtInfoFillRn1.sql end hgsql rn1 < ~/kent/src/hg/lib/axtInfo.sql hgsql rn1 < ~/kent/src/hg/lib/axtInfoFillRn1.sql cvs add ~/kent/src/hg/lib/axtInfoFillRn1.sql cvs ci ~/kent/src/hg/lib/axtInfoFillRn1.sql TWINSCAN GENE PREDICTIONS (DONE 01/27/03) mkdir -p ~/rn1/bed/twinscan cd ~/rn1/bed/twinscan wget http://genome.cse.wustl.edu/~bio/rat/Nov02/1-22-03/rat_Nov02_1-22-03.tgz gunzip -c *.tgz | tar xvf - rm -r chr_tx # clean up chrom field of GTF files foreach f (chr_gtf/chr*.gtf) set chr = $f:t:r sed -e "s/^[0-9]*/$chr/" $f > chr_gtf/$chr-fixed.gtf end # pare down protein FASTA header to id and add missing .a: foreach f (chr_ptx/chr*.ptx) set chr = $f:t:r perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+).*$/\>$1.a/;' < \ chr_ptx/$chr.ptx > chr_ptx/$chr-fixed.fa end ldHgGene rn1 twinscan chr_gtf/chr*-fixed.gtf -exon=CDS hgPepPred rn1 generic twinscanPep chr_ptx/chr*-fixed.fa NCBI GENE MODELS (TODO) mkdir -p ~/rn1/bed/ncbiGenes cd ~/rn1/bed/ncbiGenes wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/maps/chr_genes.gtf.gz wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/protein/protein.fa.gz gunzip chr_genes.gtf.gz gunzip protein.fa.gz - Process the .gtf and .fa together to join IDs ../../jkStuff/mungeNCBIids chr_genes.gtf protein.fa |& uniq ldHgGene rn1 ncbiGenes
chr_genes-fixed.gtf hgPepPred rn1 generic ncbiPep protein-fixed.fa NCBI GENOMESCAN MODELS (TODO) mkdir -p ~/rn1/bed/genomeScan cd ~/rn1/bed/genomeScan wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/maps/chr_GenomeScan.gtf.gz # Remove the ".1" at the end of transcript_id's: gunzip -c chr_GenomeScan.gtf.gz | \ perl -wpe 's/transcript_id "([^\"]+)\.1"/transcript_id "$1"/' > \ chr_GenomeScan-fixed.gtf ldHgGene rn1 genomeScan chr_GenomeScan-fixed.gtf wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/protein/GS_prot.fsa.gz hgPepPred rn1 generic genomeScanPep GS_prot.fsa DOING HUMAN/RAT ALIGMENTS (todo) o - Download the lower-case-masked assembly and put it in eieio:/cluster/store1/a2ms. o - Download the assembled rat genome in lower-case masked form to /cluster/store1/arachne.3/whole. Execute the script splitAndCopy.csh to chop it into roughly 50M pieces in arachne.3/parts o - Set up the para job to do the alignment as so: ssh eieio cd /cluster/store4/rn1 mkdir blatRat.phusion cd blatRat.phusion ls -1S /scratch/hg/gs.13/build30/contigTrf/* > human.lst ls -1 /scratch/hg/rn1/contigs.1126/* > rat.lst Make a file 'gsub' with the following three lines in it #LOOP /cluster/home/kent/bin/i386/blat -q=dnax -t=dnax {check in line+ $(path2)} {check in line+ $(path1)} {check out line+ psl/$(root2)_$(root1).psl} -minScore=20 -minIdentity=20 -tileSize=4 -minMatch=2 -oneOff=0 -ooc={check in exists /scratch/hg/h/4.pooc} -qMask=lower -mask=lower #ENDLOOP Process this into a para file and launch the first set of jobs (10,000 out of 70,000) as so: gensub2 rat.lst human.lst gsub spec para create spec para push Do a 'para check' after a few minutes and make sure everything is right. 
para time > time o - Sort alignments as so ssh eieio cd /cluster/store4/rn1/blatRat pslCat -dir -check psl | liftUp -type=.psl stdout ../liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/store2/temp stdin o - Get rid of big pile-ups due to contamination as so: cd chrom foreach i (*.psl) echo $i mv $i xxx pslUnpile -maxPile=600 xxx $i rm xxx end o - Remove long redundant bits from read names by making a file called subs.in with the following line: gnl|ti^ti contig_^tig_ and running the commands cd ~/rat/vsOo33/blatRat.phusion/chrom subs -e -c ^ *.psl > /dev/null o - Copy over to network where database is: ssh kks00 cd ~/rn1/bed mkdir blatRat mkdir blatRat/ph.chrom600 cd !$ cp /cluster/store4/rn1/blatRat.phusion/chrom/*.psl . o - Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/rn1/bed/blatRat/ph.chrom600 foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_blatRat.psl end hgLoadPsl rn1 *.psl o - load sequence into database as so: ssh kks00 faSplit about /projects/hg3/rat/arachne.3/whole/Unplaced.mfa 1200000000 /projects/hg3/rat/arachne.3/whole/unplaced ssh hgwdev hgLoadRna addSeq '-abbr=gnl|' rn1 /projects/hg3/rat/arachne.3/whole/unpla*.fa hgLoadRna addSeq '-abbr=con' rn1 /projects/hg3/rat/arachne.3/whole/SET*.mfa This will take quite some time. Perhaps an hour . o - Produce 'best in genome' filtered version: ssh kks00 cd ~/rat/vsOo33 pslSort dirs blatRatAll.psl temp blatRat pslReps blatRatAll.psl bestRatAll.psl /dev/null -singleHit -minCover=0.3 -minIdentity=0.1 pslSortAcc nohead bestRat temp bestRatAll.psl cd bestRat foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_bestRat.psl end o - Load best in genome into database as so: ssh hgwdev cd ~/rat/vsOo33/bestRat hgLoadPsl rn1 *.psl PRODUCING CROSS_SPECIES mRNA ALIGMENTS (DONE 12/03/02 - MATT) Here you align vertebrate mRNAs against the masked genome on the cluster you set up during the previous step. 
Make sure that gbpri, gbmam, gbrod, and gbvert are downloaded from Genbank into /cluster/store2/genbank.132 and unpacked by organism into /cluster/store2/mrna.132/org. Set up cluster run more or less as so: ssh kk cd ~/rn1/bed mkdir xenoMrna cd xenoMrna mkdir psl ls -1S /scratch/hg/rn1/contigs.1126/* > genome.lst # If it hasn't been done already do: # cp -r /cluster/store2/mrna.132/orgFa /scratch/hg/mrna.132/org find /scratch/hg/mrna.132/org/ -name mrna.fa | xargs ls -lS | awk {'print $5 " " $9'} \ | sort -nr | awk {'print $2'} > xenoRna.lst This produces a size-sorted file of all organisms mrna. Then edit xenoRna.lst removing the Rattus_norvegicus line, and writing the first line into 1.org, the second line into 2.org, and so forth. After the 6th line just leave the rest in 7.org. Then ls -1 *.org > ratXenoRna.lst cp ~/rn1/bed/xenoMrna/gsub . gensub2 genome.lst ratXenoRna.lst gsub spec para create spec para try para check If all looks well do para push. Sort xeno mRNA alignments as so: ssh eieio cd ~/rn1/bed/xenoMrna pslSort dirs raw.psl /cluster/store2/temp psl pslReps raw.psl cooked.psl /dev/null -minAli=0.25 liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl pslSortAcc nohead chrom /cluster/store2/temp chrom.psl pslCat -dir chrom > xenoMrna.psl rm -r chrom raw.psl cooked.psl chrom.psl Load into database as so: ssh hgwdev cd ~/rn1/bed/xenoMrna hgLoadPsl rn1 xenoMrna.psl -tNameIx Make the xenoRna file cd /cluster/store2/mrna.132 gunzip -c /cluster/store2/genbank.132/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \ | gbToFaRa ratXenoRna.fil ratXenoRna.fa ratXenoRna.ra ratXenoRna.ta stdin Make a /gbdb symlink cd /gbdb/rn1/mrna.132 ln -s /cluster/store2/mrna.132/ratXenoRna.fa ratXenoRna.fa ln -s /cluster/store2/mrna.132/ratXenoRna.ra ratXenoRna.ra hgLoadRna add -type=xenoRna rn1 /gbdb/rn1/mrna.132/ratXenoRna.fa /gbdb/rn1/mrna.132/ratXenoRna.ra PRODUCING TETRAODON FISH ALIGNMENTS (TODO) o - Download sequence from ...
and put it on the cluster local disk at /scratch/hg/fish o - Do fish/rat alignments. ssh kk cd ~/rn1/bed mkdir blatFish cd blatFish mkdir psl ls -1S /scratch/hg/fish/* > fish.lst ls -1S /scratch/hg/rn1/trfFa/* > rat.lst cp ~/lastRn/blatFish/gsub . gensub2 rat.lst fish.lst gsub spec para create spec para try Make sure jobs are going ok with para check. Then para push wait about 2 hours and do another para push do para checks and if necessary para pushes until done or use para shove. o - Sort alignments as so pslCat -dir psl | liftUp -type=.psl stdout ~/rn1/jkStuff/liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/store2/temp stdin o - Copy to hgwdev:/scratch. Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/rn1/bed/blatFish/chrom foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_blatFish.psl end hgLoadPsl rn1 *.psl hgLoadRna addSeq rn1 /cluster/store2/fish/seq15jun2001/*.fa PRODUCING FUGU FISH ALIGNMENTS (DONE 11/27/02) # (Already done, for mm2:) # Download sequence to /cluster/store3/fuguSeq from ... and put it on the # cluster local disk at /scratch/hg/fugu on kkstore. # Sequence was downloaded from: # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_mask.fasta.Z # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_prot.fasta.Z # # faSplit sequence ../fugu_v3_mask.fasta 1000 fuguSplit ssh kk cd ~/rn1/bed mkdir blatFugu cd blatFugu ls -1S /scratch/hg/fugu/* > fugu.lst ls -1S /scratch/hg/rn1/trfFa.1127/* > rat.lst cp ~/lastRn/bed/blatFugu/gsub . mkdir psl foreach f (~/rn1/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa) set c=$f:t mkdir psl/$c end gensub2 rat.lst fugu.lst gsub spec para create spec para try para check para push # do another para check in about 2 hours; if it looks good: para push -maxPush=500000 # do para checks and if necessary para pushes until done or use para shove. 
# Sort alignments: ssh eieio cd ~/rn1/bed/blatFugu pslCat -dir psl/* \ | liftUp -type=.psl stdout ~/rn1/jkStuff/liftAll.lft warn stdin \ | pslSortAcc nohead chrom /cluster/store2/temp stdin # load into database: ssh hgwdev cd ~/rn1/bed/blatFugu/chrom foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_blatFugu.psl end hgLoadPsl rn1 *.psl mkdir -p /gbdb/rn1/fuguSeq cd /gbdb/rn1/fuguSeq ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta cd /cluster/store2/temp hgLoadRna addSeq rn1 /gbdb/rn1/fuguSeq/fugu_v3_mask.fasta MAKE LIFT FILE FOR AGPS (DONE 12/02/02) ssh eieio cd ~/rn1 ./jkStuff/agpToLift.pl chrom.sizes ?{,?}/chr?{,?}{,.random}.agp \ > jkStuff/liftRNOR.lft CONTIG POSITIONS (DONE 12/02/02) # hgCtgPos creates a table of our macro-contigs (chr*_*), which are # not the contigs that people will be searching for in hgFind (RNOR*). # So create ctgPos with RNOR coords from jkStuff/liftRNOR.lft. ssh hgwdev mkdir -p ~/rn1/bed/ctgPos cd ~/rn1/bed/ctgPos perl -wpe 'chop; @words = split(/\s/); \ $start = $words[0]; \ $ctg = $words[1]; \ $size = $words[2]; \ $chr = $words[3]; \ $end = $start + $size; \ $_ = join("\t", $ctg, $size, $chr, $start, $end) . "\t\n"; \ ' ../../jkStuff/liftRNOR.lft \ > ctgPos.tab # Drop just in case it's already there (this might give error if not): echo 'drop table ctgPos' | hgsql rn1 hgsql rn1 < $HOME/kent/src/hg/lib/ctgPos.sql echo 'LOAD data local infile "ctgPos.tab" into table ctgPos' | hgsql rn1 LOAD BACTIG POSITIONS (DONE 12/13/02) ssh hgwdev mkdir -p ~/rn1/bed/bactigPos cd ~/rn1/bed/bactigPos # Paul Havlak havlak@swan.hgsc.bcm.tmc.edu sent us a BED 4+ email # attachment.
# Save the attachment as ~/rn1/bed/bactigPos/bactigPos.orig.bed # Fix the 1-based starts to 0-based: awk "-F\t" '{printf "%s\t%d\t%s\t%s\t%s\t%s\n", $1, $2-1, $3, $4, $5, $6;}' < bactigPos.orig.bed > bactigPos.bed hgLoadBed rn1 bactigPos bactigPos.bed \ -noBin -sqlTable=$HOME/kent/src/hg/lib/bactigPos.sql LOAD CPGISLANDS (DONE 12/5/02) ssh eieio mkdir -p ~/rn1/bed/cpgIsland cd ~/rn1/bed/cpgIsland # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu) # copy the tar file to the current directory tar xvf cpg_dist.tar cd cpg_dist gcc readseq.c cpg_lh.c -o cpglh.exe cd .. foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked) set fout=$f:t:r:r.cpg echo running cpglh on $f to $fout ./cpg_dist/cpglh.exe $f > $fout end # copy filter.awk from a previous release cp ~/lastRn/bed/cpgIsland/filter.awk . awk -f filter.awk chr*.cpg > cpgIsland.bed # load into database: ssh hgwdev cd ~/rn1/bed/cpgIsland hgLoadBed rn1 cpgIsland -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed LOAD ENSEMBL PHUSIONBLAST RAT-MOUSE HOMOLOGY (DONE 12/03/02) ssh eieio mkdir -p ~/rn1/bed/ensRatMusHom cd ~/rn1/bed/ensRatMusHom wget ftp://ftp.ensembl.org/pub/repository/rat/PBratMouse.gz gunzip -c PBratMouse.gz | tail +2 \ | perl -wpe 'chop; @words = split(/\s/); \ if ($words[1] =~ /chr\S+\.(RNOR\d+):(\d+)-(\d+)/) { \ $rCtg = $1; $rStart = $2; $rEnd = $3; \ } else { die "Bad format for second word: $words[1]"; } \ $mChr = "mm2.chr" . $words[4]; \ $mStart = $words[5]; \ $mEnd = $words[6]; \ $strand = $words[7]; \ $score = $words[9]; \ $_ = join("\t", $rCtg, $rStart-1, $rEnd, $mChr, $score, \ $strand, $mStart-1, $mEnd) .
"\n"; \ ' \ | liftUp ensRatMusHom.bed ../../jkStuff/liftRNOR.lft warn stdin sed -e 's/ensPhusionBlast/ensRatMusHom/g' \ $HOME/kent/src/hg/lib/ensPhusionBlast.sql \ > ensRatMusHom.sql ssh hgwdev cd ~/rn1/bed/ensRatMusHom hgLoadBed rn1 ensRatMusHom ensRatMusHom.bed -sqlTable=ensRatMusHom.sql LOAD GENEID GENES (DONE 01/28/03) mkdir -p ~/rn1/bed/geneid/download cd ~/rn1/bed/geneid/download foreach f (~/rn1/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget http://genome.imim.es/genepredictions/R.norvegicus/rnNov2002/geneid_v1.1/$chr.gtf wget http://genome.imim.es/genepredictions/R.norvegicus/rnNov2002/geneid_v1.1/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. ldHgGene rn1 geneid download/*.gtf -exon=CDS hgPepPred rn1 generic geneidPep download/*-fixed.prot SGP GENE PREDICTIONS (DONE 02/10/03) mkdir -p ~/rn1/bed/sgp/download cd ~/rn1/bed/sgp/download foreach f (~/rn1/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget http://genome.imim.es/genepredictions/R.norvegicus/rnNov2002/SGP/humangp20020628/$chr.gtf wget http://genome.imim.es/genepredictions/R.norvegicus/rnNov2002/SGP/humangp20020628/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. 
ldHgGene rn1 sgpGene download/*.gtf -exon=CDS hgPepPred rn1 generic sgpPep download/*-fixed.prot TIGR GENE INDEX (DONE 2/26/03) o mkdir -p ~/rn1/bed/tigr cd ~/rn1/bed/tigr wget ftp://ftp.tigr.org/private/NHGI_rgi_dashu/TGI_track_RatGenome_Feb2003.tgz tar xvzf TGI*.tgz foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (mouse cow human pig rat) setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ldHgGene -exon=TC rn1 tigrGeneIndex *.gff LOAD STS MAP (todo) - login to hgwdev cd ~/rn1/bed rn1 < ~/src/hg/lib/stsMap.sql mkdir stsMap cd stsMap bedSort /projects/cc/hg/mapplots/data/tracks/build28/stsMap.bed stsMap.bed - Enter database with "rn1" command. - At mysql> prompt type in: load data local infile 'stsMap.bed' into table stsMap; - At mysql> prompt type quit LOAD MGI IDs (TODO) - The Locuslink ID to MGI IDs conversion data file, LL2MGI.txt, from Jackson Lab should be found under ~/rn1/bed/refSeq - login to hgwdev cd ~/rn1/bed/refSeq rn1 < ~/src/hg/lib/mgiID.sql - Enter database with "rn1" command. - At mysql> prompt type in: load data local infile 'LL2MGI.txt' into table MGIid; - At mysql> prompt type quit LOAD CHROMOSOME BANDS (todo) - login to hgwdev cd /cluster/store4/rn1/bed mkdir cytoBands cp /projects/cc/hg/mapplots/data/tracks/build28/cytobands.bed cytoBands rn1 < ~/src/hg/lib/cytoBand.sql Enter database with "rn1" command. - At mysql> prompt type in: load data local infile 'cytobands.bed' into table cytoBand; - At mysql> prompt type quit LOAD RATREF TRACK (todo) First copy in data from kkstore to ~/rn1/bed/ratRef. Then substitute 'genome' for the appropriate chromosome in each of the alignment files.
Finally do: hgRefAlign webb rn1 ratRef *.alignments LOAD AVID RAT TRACK (todo) ssh cc98 cd ~/rn1/bed mkdir avidRat cd avidRat wget http://pipeline.lbl.gov/tableCS-LBNL.txt hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed hgLoadBed rn1 avidRepeat avidRepeat.bed hgLoadBed rn1 avidUnique avidUnique.bed LOAD SNPS (TODO) - ssh hgwdev - cd ~/rn1/bed - mkdir snp - cd snp - Download SNPs from ftp://ftp.ncbi.nlm.nih.gov/pub/sherry/rat.b27.out.gz - Unpack. createBed < rat.b27.out > snpNih.bed hgLoadBed rn1 snpNih snpNih.bed LOAD ENSEMBL ESTs (TODO) ln -s /cluster/store4/rn1 ~/rn1 mkdir -p ~/rn1/bed/ensembl cd ~/rn1/bed/ensembl wget http://www.ebi.ac.uk/~stabenau/rat-est.gz wget http://www.ebi.ac.uk/~stabenau/rat-est.pep.gz gunzip -c rat-est.gz | \ perl -w -p -e 's/^(\w)/chr$1/' > rat-est-fixed.gtf ldHgGene rn1 ensEst rat-est-fixed.gtf > The id behind '>' is internal and was not in our gtf dump, so > you have to do some more parsing. # pick out the transcript= attribute -- that's the id to use: # also remove the first line: gunzip -c rat-est.pep.gz | tail +2 | \ perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \ rat-est-fixed.pep hgPepPred rn1 generic ensEstPep rat-est-fixed.pep LOAD ENSEMBL GENES (TODO) mkdir -p ~/rn1/bed/ensembl cd ~/rn1/bed/ensembl wget http://www.ebi.ac.uk/~stabenau/rat-ensembl.gz wget http://www.ebi.ac.uk/~stabenau/rat-ensembl.pep.gz gunzip -c rat-ensembl.gz | \ perl -w -p -e 's/^(\w)/chr$1/' > rat-ensembl-fixed.gtf ldHgGene rn1 ensGene rat-ensembl-fixed.gtf > rat-ensembl contains stopcodons, due to some glitches in our > genebuild. The id behind '>' is internal and was not in our gtf dump, so > you have to do some more parsing.
# pick out the transcript= attribute -- that's the id to use: # also remove the first line: tail +2 rat-ensembl.pep | \ perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \ rat-ensembl-fixed.pep hgPepPred rn1 generic ensPep rat-ensembl-fixed.pep LOAD RNAGENES (todo) - login to hgwdev - cd ~kent/src/hg/lib - rn1 < rnaGene.sql - cd /cluster/store4/rn1/bed - mkdir rnaGene - cd rnaGene - download data from ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-oo27.gff.gz - gunzip *.gz - liftUp chrom.gff ../../jkStuff/liftAll.lft carry ncrna-oo27.gff - hgRnaGenes rn1 chrom.gff LOAD EXOFISH (todo) - login to hgwdev - cd /cluster/store4/rn1/bed - mkdir exoFish - cd exoFish - rn1 < ~kent/src/hg/lib/exoFish.sql - Put email attatchment from Olivier Jaillon (ojaaillon@genoscope.cns.fr) into /cluster/store4/rn1/bed/exoFish/all_maping_ecore - awk -f filter.awk all_maping_ecore > exoFish.bed - hgLoadBed rn1 exoFish exoFish.bed LOAD GENIE (TODO) mkdir -p ~/rn1/bed/genieAlt cd ~/rn1/bed/genieAlt wget http://www.neomorphic.com/mgap/mgscv3/gtf/mgscv3.genie.gtf.tgz gunzip -c mgscv3.genie.gtf.tgz | tar xvf - ldHgGene rn1 genieAlt mgscv3.genie.gtf/chr*.gtf wget http://www.neomorphic.com/mgap/mgscv3/fa/mgscv3.aa.tgz gunzip -c mgscv3.aa.tgz | tar xvf - hgPepPred rn1 genie geniePep chr*.aa.fa LOAD GENIE CLONE BOUNDS (TODO) mkdir -p ~/rn1/bed/genieBounds cd ~/rn1/bed/genieBounds wget http://www.neomorphic.com/mgap/mgscv3/cb.bed/mgscv3_cb.bed.tgz gunzip -c mgscv3_cb.bed.tgz | tar xvf - - Trim the track definition from each file (these are actually custom track files): foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un) tail +2 chr${c}_cb.bed > chr${c}_cb-fixed.bed end hgLoadBed rn1 genieBounds *-fixed.bed LOAD JACKSON LABS QTL (TODO) Carol Bult at Jackson Labs sent us tab-separated QTL info -- See RT289. - Reformat QTL info into bed 6 + (really bed 5, no strand). 
There will be some complaints about lines that are missing chromStart, chromEnd (they're on MIT Markers that we don't have in our stsMapRat table...?) cd ~/rn1/bed/qtl ./fixit.pl mgi_qtl.rpt > mgi_qtl.bed hgLoadBed -noBin -tab -sqlTable=$HOME/kent/src/hg/lib/jaxQTL.sql \ rn1 jaxQTL mgi_qtl.bed LOAD SOFTBERRY GENES (todo) - ln -s /cluster/store4/rn1 ~/rn1 - cd ~/rn1/bed - mkdir softberry - cd softberry - get ftp://www.softberry.com/pub/SC_MOU_NOV01/softb_mou_genes_nov01.tar.gz ldHgGene rn1 softberryGene chr*.gff hgPepPred rn1 softberry *.protein hgSoftberryHom rn1 *.protein LOAD GENOMIC DUPES (todo) o - Load genomic dupes ssh hgwdev cd ~/rn1/bed mkdir genomicDups cd genomicDups wget http://codon/jab/web/takeoff/oo33_dups_for_kent.zip unzip *.zip awk -f filter.awk oo33_dups_for_kent > genomicDups.bed hgsql rn1 < ~/src/hg/lib/genomicDups.sql hgLoadBed rn1 -oldTable genomicDups genomicDups.bed LOAD RGD CURATED GENES TRACK - cd rn1 - cd bed - mkdir rgdGene - Browse to http://zephyrus.brc.mcw.edu/cgi-bin/pub/viewcvs.cgi/pub_gbrowse/gff_files/RGD_curated_genes.gff This is a web-based CVS page. Click the download link and save the file to ~/rn1/bed/RGD_curated_genes.gff - Now massage the data format using: rn1/bed/rgdGene/massage.pl - Load the data: ldHgGene rn1 rgdGene Fixed_RGD_Curated_genes.gff - Create the link table for searching In mysql for the rn1 database do: create table rgdLink (id int primary key, name varchar(32) not null); LOAD DATA LOCAL INFILE 'RGD.links' into table rgdLink; FAKING DATA FROM PREVIOUS VERSION (This is just for until proper track arrives. Rescues about 97% of data Just an experiment, not really followed through on).
o - Rescuing STS track: - log onto hgwdev - mkdir ~/rn1/rescue - cd !$ - mkdir sts - cd sts - bedDown hg3 mapGenethon sts.fa sts.tab - echo ~/rn1/sts.fa > fa.lst - pslOoJobs ~/rn1 ~/rn1/rescue/sts/fa.lst ~/rn1/rescue/sts g2g - log onto cc01 - cc ~/rn1/rescue/sts - split all.con into 3 parts and condor_submit each part - wait for assembly to finish - cd psl - mkdir all - ln ?/*.psl ??/*.psl *.psl all - pslSort dirs raw.psl temp all - pslReps raw.psl contig.psl /dev/null - rm raw.psl - liftUp chrom.psl ../../../jkStuff/liftAll.lft carry contig.psl - rm contig.psl - mv chrom.psl ../convert.psl LOAD SLAM GENES (hg12) cd /cluster/store3/gs.13/build30/bed/slam wget http://bio.math.berkeley.edu/slam/rat/gff/UCSC/rnCDS.gff.gz wget http://bio.math.berkeley.edu/slam/rat/gff/UCSC/rnCNS.gff.gz gunzip * ldHgGene -exon=CDS hg12 slam rnCDS.gff mv genePred.tab genePred.rn1 sed -e 's/;//g' -e 's/"//g' rnCNS.gff > rnCNS.bed.2 sort -n -k 5,5 rnCNS.bed.2 > rnCNS.bed.sort examine head and tail of sorted file for range of scores rm rnCNS.bed.sort size.pl < rnCNS.bed.2 > rnCNS.bed.2.size sort -n -k 2,2 rnCNS.bed.2.size > rnCNS.bed.2.size.sort examine head and tail of sorted file for range of sizes rm rnCNS.bed.2.size.sort expand.pl < rnCNS.bed.2 > rnCNS.bed.2.expand SLAM (hg13) cd /cluster/store4/rn1/bed/slam mkdir hg13 cd hg13 wget http://baboon.math.berkeley.edu/~cdewey/slam/hs_31_Nov2002_rn_2_Nov2002/gff/rnCDS.gff.gz gunzip rnCDS.gff.gz mv rnCDS.gff ratFromHumanCDS.gff ldHgGene -exon=CDS rn1 slamHuman ratFromHumanCDS.gff wget http://baboon.math.berkeley.edu/~scawley/slam/hs_31_Nov2002_rn_2_Nov2002/gff/rnCNS.bed.gz gunzip rnCNS.bed.gz mv rnCNS.bed ratFromHumanCNS.bed expand.pl < ratFromHumanCNS.bed > ratFromHumanCNS.bed.expand hgLoadBed -tab rn1 slamNonCodingHuman ratFromHumanCNS.bed.expand