# This file describes how we made the browser database on the mouse genome, February 2002 build. BREAK UP THE MOUSE SEQUENCE INTO 2 MB CHUNKS AT NON_BRIDGED CONTIGS (done) o - This version of the mouse sequence data is in /cluster/store2/mm.2002.02/mm2/assembly o - cd into your CVS source tree under kent/src/hg/splitFaIntoContigs - Type make - Run gunzip -c /cluster/store2/mm.2002.02/mm2/assembly/*.fasta.gz | splitFaIntoContigs /cluster/store2/mm.2002.02/mm2/assembly/*.agp stdin /cluster/store2/mm.2002.02/mm2 2000000 - This will split the mouse sequence into approx. 2 Mbase supercontigs between non-bridged clone contigs and drop the resulting dir structure in /cluster/store2/mm.2002.02/mm2. - The resulting dir structure will include 1 dir for each chromosome, each of which has a set of subdirectories, one subdir per supercontig. COPY THE MOUSE SEQUENCE DATA TO THE CLUSTER (done) o - ssh kkstore o - Copy the rna data to the cluster if it isn't there already: mkdir /scratch/hg/mrna.128 cp -r /cluster/store1/mrna.128/org /scratch/hg/mrna.128 o - Copy the mouse sequence supercontigs to the cluster mkdir /scratch/hg/mm2/ mkdir /scratch/hg/mm2/contigs cp /cluster/store2/mm.2002.02/mm2/*/chr*/chr*.fa /scratch/hg/mm2/contigs o - Distribute this data to the local nodes: sudo /cluster/install/utilities/localUpdate REPEAT MASKING (DONE 07/30/02) Split contigs, run RepeatMasker, lift results Notes: * If there is a new version of RepeatMasker, build it and ask the admins to binrsync it (kkstore:/scratch/hg/RepeatMasker/*). * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make RepeatMasker runs manageable on the cluster ==> results need lifting. * For the NCBI assembly we repeat mask on the sensitive mode setting (RepeatMasker -m -s) #- Split contigs into 500kb chunks: cd ~/mm2 foreach d ( */chr*_?{,?} ) cd $d set contig = $d:t faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -maxN=500000 cd ../.. 
end #- Make the run directory and job list: cd ~/mm2 mkdir RMRun rm -f RMRun/RMJobs touch RMRun/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) foreach f ( $d/chr*_*_*.fa ) set f = $f:t echo /cluster/bin/scripts/RMMouse \ /cluster/store2/mm.2002.02/mm2/$d $f \ '{'check out line+ /cluster/store2/mm.2002.02/mm2/$d/$f.out'}' \ >> RMRun/RMJobs end end #- Do the run ssh kk cd ~/mm2/RMRun para create RMJobs para try, para check, para check, para push, para check,... #- Lift up the split-contig .out's to contig-level .out's cd ~/mm2 foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. end #- Lift up the contig-level .out's to chr-level cd ~/mm2 ./jkStuff/liftOut5.sh #- Load the .out files into the database with: ssh hgwdev cd ~/mm2 hgLoadOut mm2 ?/*.fa.out ??/*.fa.out ssh kkstore cd ~/mm2 #- Soft-mask (lower-case) the contig and chr .fa's tcsh jkStuff/makeFaMasked.sh #- Make hard-masked .fa.masked files as well: tcsh jkStuff/makeHardMasked.sh #- Rebuild the nib, mixedNib, maskedNib files: tcsh jkStuff/makeNib.sh #- Rebuild the .zip files tcsh jkStuff/zipAll.sh #- copy the contig .fa's to the appropriate place on /scratch cp -p ?{,?}/chr*/chr?{,?}_?{,?}.fa /scratch/hg/mm2/contigs.0730 #- Copy the .zip files to hgwdev:/usr/local/apache/... ssh hgwdev cd ~/mm2 tcsh jkStuff/cpToWeb.sh EXTRACT LINEAGE-SPECIFIC REPEATS (ARIAN SMIT''s scripts) (DONE 11/4/02) ssh kkstore mkdir -p ~/mm2/bed/linSpecRep cd ~/mm2/bed/linSpecRep foreach f (~/mm2/*/*.out) ln -sf $f . end /cluster/bin/scripts/rodentSpecificRepeats.pl *.out /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec rm *.out rm -rf /scratch/hg/mm2/linSpecRep cd .. cp -R linSpecRep /scratch/hg/mm2 # Ask cluster-admin@cse.ucsc.edu to binrsync /scratch/hg to clusters CREATING DATABASE AND STORING mRNA/EST SEQUENCE AND AUXILIARY INFO o - Create the database. 
- ssh hgwdev - Enter mysql via: mysql -u hgcat -pbigsecret - At mysql prompt type: create database mm1; quit - make a semi-permanent read-only alias: alias mm2 "mysql -u hguser -phguserstuff -A mm2" o - Use df to make sure there is at least 5 gig free on hgwdev:/usr/local/mysql o - Store the mRNA (non-alignment) info in database. (Matt - please update this section... ) STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (done) Create packed chromosome sequence files ssh kkstore cd ~/mm tcsh jkStuff/makeNib.sh Load chromosome sequence info into database and save size info. ssh hgwdev hgsql mm2 < ~/src/hg/lib/chromInfo.sql cd ~/mm hgNibSeq -preMadeNib mm2 /cluster/store2/mm.2002.02/mm2/nib ?/chr*.fa ??/chr*.fa mysql -u hguser -phguserstuff -N -e "select chrom,size from chromInfo" hg12 > chrom.sizes Store o+o info in database. cd /cluster/store2/mm.2002.02/mm2 hgGoldGapGl mm2 /cluster/store2/mm.2002.02 mm2 -noGl Make and load GC percent table ssh hgwdev cd /cluster/store2/mm.2002.02/mm2/bed mkdir gcPercent cd gcPercent mysql -A -u hgcat -pbigsecret mm2 < ~/src/hg/lib/gcPercent.sql hgGcPercent mm2 ../../nib MAKING AND STORING mRNA AND EST ALIGNMENTS (done) o - Load up the local disks of the cluster with refSeq.fa, mrna.fa and est.fa from /cluster/store1/mrna.127 into /var/tmp/hg/h/mrna o - Use BLAT to generate refSeq, mRNA and EST alignments as so: Make sure that /scratch/hg/mm2/contigs is loaded with chr*_*.fa and pushed to the cluster nodes. The following cshell script needs updating. cd ~/mm/bed foreach i (refSeq mrna est) mkdir $i cd $i echo /scratch/hg/gs.11/build28/contigs | wordLine stdin > genome.lst ls -1 /scratch/hg/mrna.127/$i.fa > mrna.lst mkdir psl gensub2 genome.lst mrna.lst gsub spec jabba make hut spec jabba push hut end check on progress with jabba check hut in mrna, est, and refSeq directories. o - Process refSeq mRNA and EST alignments into near best in genome.
cd ~/mm/bed cd refSeq pslSort dirs raw.psl /cluster/fast1/temp psl pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl contig.psl /dev/null liftUp -nohead all_refSeq.psl ../../jkStuff/liftAll.lft warn contig.psl pslSortAcc nohead chrom /cluster/fast1/temp all_refSeq.psl cd .. cd mrna pslSort dirs raw.psl /cluster/fast1/temp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl /dev/null liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft warn contig.psl pslSortAcc nohead chrom /cluster/fast1/temp all_mrna.psl cd .. cd est pslSort dirs raw.psl /cluster/fast1/temp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl /dev/null liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft warn contig.psl pslSortAcc nohead chrom /cluster/fast1/temp all_est.psl cd .. o - Load mRNA alignments into database. ssh hgwdev cd /cluster/store2/mm.2002.02/mm2/bed/mrna/chrom foreach i (*.psl) mv $i $i:r_mrna.psl end hgLoadPsl mm2 *.psl cd .. hgLoadPsl mm2 all_mrna.psl -nobin o - Load EST alignments into database. ssh hgwdev cd /cluster/store2/mm.2002.02/mm2/bed/est/chrom foreach i (*.psl) mv $i $i:r_est.psl end hgLoadPsl mm2 *.psl cd .. hgLoadPsl mm2 all_est.psl -nobin o - Create subset of ESTs with introns and load into database. - ssh kkstore cd ~/mm tcsh jkStuff/makeIntronEst.sh - ssh hgwdev cd ~/mm/bed/est/intronEst hgLoadPsl mm2 *.psl o - Load refSeq alignments into database ssh hgwdev cd ~/mm/bed/refSeq pslCat -dir chrom > refSeqAli.psl hgLoadPsl hg10 -tNameIx refSeqAli.psl PRODUCING ESTORIENTINFO TABLE This table is needed for proper orientation of ESTs in the browser. Many will appear on the wrong strand without it. This involves a cluster run. First load the EST psl files as so: ssh kkstore cd ~/mm/bed/est pslSortAcc nohead contig /cluster/fast1/temp contig.psl mkdir /scratch/hg/mm2/est cp -r contig /scratch/hg/mm2/est sudo /cluster/install/utilities/updateLocal Wait for these to finish. cd .. 
mkdir estOrientInfo cd estOrientInfo mkdir ei ls -1S /scratch/hg/mm2/est/contig > psl.lst cp ~/lastMm/bed/estOrientInfo/gsub . Update gsub to refer to mouse contig sequence currently on /scratch, and mouse ESTs on /scratch. gensub2 psl.lst single gsub spec para create spec Then run the job on the cluster ssh kk cd ~/mm/bed/estOrientInfo para try sleep 60 para check If things look good para push Wait for this to finish then liftUp estOrientInfo.bed ../../jkStuff/liftAll.lft warn ei/*.tab Load them into database as so: ssh hgwdev cd ~/mm/bed/estOrientInfo hgLoadBed mm2 estOrientInfo estOrientInfo.bed -sqlTable=/cluster/home/kent/src/hg/lib/estOrientInfo.sql CREATE RNACLUSTER TABLE (done) Make sure that refSeqAli and estOrientInfo tables are made already (see above). ssh hgwdev cd ~/mm/bed mkdir rnaCluster cd rnaCluster mkdir rna est foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un) clusterRna mm2 rna/chr$i.bed est/chr$i.bed -chrom=chr$i echo done $i end hgLoadBed mm2 rnaCluster est/*.bed PRODUCING KNOWN GENES (done) o - Download everything from ftp://ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/ into ~/mm/bed/refSeq o - Unpack this into fa files and get extra info with: cd ~/mm/bed/refSeq gunzip mouse.faa.gz gunzip mouse.gbff.gz gbToFaRa ~/hg/h/allRna.fil refSeq.fa refSeq.ra refSeq.ta mouse.gbff o - Get extra info from NCBI and produce refGene table as so: wget ftp://ncbi.nlm.nih.gov/refseq/LocusLink/loc2ref wget ftp://ncbi.nlm.nih.gov/refseq/LocusLink/mim2loc o - Produce refGene, refPep, refMrna, and refLink tables as so: hgRefSeqMrna mm2 refSeq.fa refSeq.ra all_refSeq.psl loc2ref mouse.faa mim2loc o - Add RefSeq status info (done 6/19/02) hgRefSeqStatus mm2 loc2ref REFFLAT o - create refFlat as a precomputed join of refLink and refGene: echo 'CREATE TABLE refFlat (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) SELECT refLink.name as geneName, refGene.* FROM refLink,refGene WHERE refLink.mrnaAcc = refGene.name' | hgsql mm2 SIMPLE REPEAT TRACK (done) o
- Create cluster parasol job like so: ssh kk cd ~/mm/bed mkdir simpleRepeat cd simpleRepeat cp ~/lastOo/bed/simpleRepeat/gsub mkdir trf ls -1 /scratch/hg/mm2/contigs/*.fa > genome.lst gensub2 genome.lst single gsub spec para make spec para push When job is done do: liftUp simpleRepeat.bed ~/mm/jkStuff/liftAll.lft warn trf/*.bed o - Load this into the database as so ssh hgwdev cd ~/mm/bed/simpleRepeat hgLoadBed mm2 simpleRepeat simpleRepeat.bed -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql LOADING MOUSE MM2 HUMAN BLASTZ ALIGNMENTS FROM PENN STATE: (IN PROGRESS: generated 12/7/02, not loaded into db) # Translate Penn State .lav files into sorted axt: ssh kkstore set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.gs14.2002-12-6-ASH" set seq1_dir="/cluster/store2/mm.2002.02/mm2/trfMixedNib/" set seq2_dir="/cluster/store4/gs.14/build31/mixedNib/" set tbl="blastzHg13" cd $base mkdir -p axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=$base/axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin $seq1_dir $seq2_dir stdout \ | axtSort stdin $out popd end # Translate the sorted axt files into psl: cd $base mkdir -p pslChrom foreach f (axtChrom/chr*.axt) set c=$f:t:r axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.gs14.2002-12-6-ASH" set tbl="blastzHg13" cd $base/pslChrom hgLoadPsl mm2 chr*_${tbl}.psl MAKING THE BLASTZBESTHUMAN TRACK FROM PENN STATE MM2 AXT FILES (IN PROGRESS: generated 12/7/02, not loaded into db) # Consolidate AXT files to chrom level, sort, pick best, make psl. 
ssh kkstore set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.gs14.2002-12-6-ASH" set seq1_dir="/cluster/store2/mm.2002.02/mm2/trfMixedNib/" set seq2_dir="/cluster/store4/gs.14/build31/mixedNib/" set tbl="blastzBestHuman" cd $base mkdir -p axtBest pslBest foreach chrdir (lav/chr*) set chr=$chrdir:t echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 echo translating axtBest to psl for $chr axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # If a chromosome has so many alignments that axtBest runs out of mem, # run axtBest in 2 passes to reduce size of the input to final axtBest: foreach chrdir (lav/chr7) set chr=$chrdir:t echo two-pass axtBesting $chr foreach d ($chrdir/*.lav) set smallout=$d.axt lavToAxt $d $seq1_dir $seq2_dir stdout \ | axtSort stdin $smallout end foreach a ($chrdir/*.axt) axtBest $a $chr $a:r.axtBest end cat `ls -1 $chrdir/*.axtBest | sort -g` \ > $chrdir/$chr.axtBestPieces axtBest $chrdir/$chr.axtBestPieces $chr axtBest/$chr.axt axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.gs14.2002-12-6-ASH" set tbl="blastzBestHuman" cd $base/pslBest hgLoadPsl mm2 chr*_${tbl}.psl # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm2/axtBestHg13 cd /gbdb/mm2/axtBestHg13 foreach f ($base/axtBest/chr*.axt) ln -s $f . 
end cd $base/axtBest rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/mm2/axtBestHg13/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo VALUES ('hg13','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql mm2 < ~/kent/src/hg/lib/axtInfo.sql hgsql mm2 < axtInfoInserts.sql MAKING THE HUMAN AXTTIGHT FROM AXTBEST (IN PROGRESS: generated 12/7/02, not loaded into db) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh kkstore cd ~/mm2/bed/blastz.gs14.2002-12-6-ASH/axtBest mkdir -p ../axtTight foreach i (*.axt) subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir -p ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHuman.psl end # Load tables into database ssh hgwdev cd ~/mm2/bed/blastz.gs14.2002-12-6-ASH/pslTight hgLoadPsl mm2 chr*_blastzTightHuman.psl LOADING MOUSE MM2 RAT BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 1/9/03) # Translate Penn State .lav files into sorted axt: ssh kkstore set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.rn1.2003-01-09-ASH" set seq1_dir="/cluster/store2/mm.2002.02/mm2/trfMixedNib/" set seq2_dir="/cluster/store4/rn1/mixedNib/" set tbl="blastzRn1" cd $base mkdir -p axtChrom # Some chromosomes have so many alignments that axtSort runs out of mem, # so generate a sorted .axt for each small .lav chunk, then cat a sorted # list of chunk .axt files together to make the chrom .axt: foreach c (lav/chr*) pushd $c set chr=$c:t set out=$base/axtChrom/$chr.axt echo two-pass lavToAxting $chr foreach d (*.lav) set smallout=$d.axt lavToAxt $d $seq1_dir $seq2_dir stdout \ | axtSort stdin $smallout end cat `ls -1 *.lav.axt | sort -g` \ > $out popd end # Mouse-rat alignments are quite large, and the unfiltered .axt's are # not used as often (or by the browser) as the axtBest .axt's... 
so # compress them to save disk space: cd $base/axtChrom gzip chr*.axt # Translate the sorted axt files into psl: cd $base mkdir -p pslChrom foreach f (axtChrom/chr*.axt) set c=$f:t:r:r axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.rn1.2003-01-09-ASH" set tbl="blastzRn1" cd $base/pslChrom hgLoadPsl mm2 chr*_${tbl}.psl MAKING THE BLASTZBESTRAT TRACK FROM PENN STATE MM2 AXT FILES (DONE 1/9/03) # Consolidate AXT files to chrom level, sort, pick best, make psl. ssh kkstore set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.rn1.2003-01-09-ASH" set seq1_dir="/cluster/store2/mm.2002.02/mm2/trfMixedNib/" set seq2_dir="/cluster/store4/rn1/mixedNib/" set tbl="blastzBestRat" cd $base mkdir -p axtBest pslBest # Again, run in 2 passes (axtBest on small chunks, then axtBest on # those results to resolve overlaps) to avoid running axtBest out of mem. foreach chrdir (lav/chr*) set chr=$chrdir:t echo two-pass axtBesting $chr foreach a ($chrdir/*.lav.axt) axtBest $a $chr $a:r.axtBest end cat `ls -1 $chrdir/*.axtBest | sort -g` | \ axtBest stdin $chr axtBest/$chr.axt axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # Now clean up chunk .axt files to save disk space: cd $base rm lav/chr*/*.lav.axt* # Load tables ssh hgwdev set base="/cluster/store2/mm.2002.02/mm2/bed/blastz.rn1.2003-01-09-ASH" set tbl="blastzBestRat" cd $base/pslBest hgLoadPsl mm2 chr*_${tbl}.psl # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm2/axtBestRn1 cd /gbdb/mm2/axtBestRn1 rm -f * foreach f ($base/axtBest/chr*.axt) ln -s $f . 
end cd $base/axtBest rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/mm2/axtBestRn1/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo VALUES ('rn1','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql mm2 < ~/kent/src/hg/lib/axtInfo.sql hgsql mm2 < axtInfoInserts.sql MAKING THE RAT AXTTIGHT FROM AXTBEST (DONE 1/9/03) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh kkstore cd ~/mm2/bed/blastz.rn1.2003-01-09-ASH/axtBest mkdir -p ../axtTight foreach i (*.axt) subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir -p ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRat.psl end # Load tables into database ssh hgwdev cd ~/mm2/bed/blastz.rn1.2003-01-09-ASH/pslTight hgLoadPsl mm2 chr*_blastzTightRat.psl PRODUCING GENSCAN PREDICTIONS (done) o - Produce contig genscan.gtf genscan.pep and genscanExtra.bed files like so: First make sure you have appropriate set up, permissions, etc. and you have tried using Parasol to submit and finished a set of jobs successfully. Load up the cluster with hard-masked contigs in /scratch/hg/mm2/mContigs Log into kkr1u00 (not kk!). kkr1u00 is the driver node for the small cluster (kkr2u00 - kkr8u00). (genscan has problems running on the big cluster, due to limitations of memory and swap space on each processing node). cd ~/mm cd bed/genscan Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt Generate a list file, genome.list, of all the contigs ls -1S /cluster/store2/mm.2002.02/mm2/mContigs/* >genome.list Edit genome.list to remove jobs on all 20 files of chr??_1.fa.masked. Those files have pure Ns due to heterochromatin (unsequenceable stuff) and will cause genscan to run forever. Create template file, gsub, for gensub2.
For example (3 lines file): #LOOP /cluster/home/fanhsu/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP Create a file containing a single line. echo single > single Generate job list file, jobList, for Parasol gensub2 genome.list single gsub jobList Edit jobList to find the line containing "chr12_5.fa.masked" and change "-window=2400000" into "-window=1200000", because genscan has "not enough memory" problem with this particular file. First issue the following Parasol command: para create jobList Run the following command, which will try first 10 jobs from jobList para try Check if these 10 jobs run OK by para check If they have problems, debug and fix your program, template file, commands, etc. and try again. If they are OK, then issue the following command, which will ask Parasol to start all the remaining jobs (around ~252 jobs). para push Issue either one of the following two commands to check the status of the cluster and your jobs, until they are done. parasol status para check If any job fails to complete, study the problem and ask Jim to help if necessary. 
o - Convert these to chromosome level files as so: cd ~/mm cd bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep o - Load into the database as so: ssh hgwdev cd ~/mm/bed/genscan ldHgGene mm2 genscan genscan.gtf hgPepPred mm2 generic genscanPep genscan.pep hgLoadBed mm2 genscanSubopt genscanSubopt.bed TWINSCAN GENE PREDICTIONS (done 6/10/02; reloaded 12/3/02) mkdir -p ~/mm2/bed/twinscan cd ~/mm2/bed/twinscan mv Gtf.tgz Gtf.020610.tgz mv Ptx.tgz Ptx.020610.tgz rm chr*.gtf chr*.ptx chr*.fa *.tab foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X) wget http://genes.cs.wustl.edu/mouse/12-3-02/gtf/chr$c.gtf wget http://genes.cs.wustl.edu/mouse/12-3-02/ptx/chr$c.ptx end ldHgGene mm2 twinscan chr*.gtf -exon=CDS - pare down to id: foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X) perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+).*$/\>$1/;' < \ chr$c.ptx > chr$c-fixed.fa end hgPepPred mm2 generic twinscanPep chr*-fixed.fa NCBI GENE MODELS (done 05/31/02) mkdir -p ~/mm2/bed/ncbiGenes cd ~/mm2/bed/ncbiGenes wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/maps/chr_genes.gtf.gz wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/protein/protein.fa.gz gunzip chr_genes.gtf.gz gunzip protein.fa.gz - Process the .gtf and .fa together to join IDs ../../jkStuff/mungeNCBIids chr_genes.gtf protein.fa |& uniq ldHgGene mm2 ncbiGenes chr_genes-fixed.gtf hgPepPred mm2 generic ncbiPep protein-fixed.fa NCBI GENOMESCAN MODELS (done 05/31/02) mkdir -p ~/mm2/bed/genomeScan cd ~/mm2/bed/genomeScan wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/maps/chr_GenomeScan.gtf.gz - Remove the ".1" at the end of transcript_id's: gunzip -c chr_GenomeScan.gtf.gz | \ perl -wpe 's/transcript_id "([^\"]+)\.1"/transcript_id "$1"/' > \ chr_GenomeScan-fixed.gtf ldHgGene mm2 genomeScan chr_GenomeScan-fixed.gtf wget 
ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/protein/GS_prot.fsa.gz hgPepPred mm2 generic genomeScanPep GS_prot.fsa PREPARING SEQUENCE FOR CROSS SPECIES ALIGNMENTS (DONE 8/02/02) Make sure that the contig files are lower-case repeat masked then do ssh kkstore cd ~/mm source jkStuff/makeTrfFa.sh Then make sure there is enough space available on /scratch and do cp -Rp ~/mm/trfFa /scratch/hg/mm2/trfFa.0802 # sudo /cluster/install/utilities/updateLocal PREPARING POST-TRF CHROM-LEVEL MIXED NIBs for blastz (DONE 11/6/02) # lift trfMask output to chrom-level... this is a pain because all # trf output was put in the same dir. maybe next time around, we # can preserve chrom dir structure... ssh kkstore cd ~/mm2 foreach c (?{,?}) if (-e $c/lift/ordered.lst) then set ntlist = () foreach n (`cat $c/lift/ordered.lst`) set ntlist = ($ntlist bed/simpleRepeat/trf/$n.bed) end liftUp $c/chr$c.trf.bed jkStuff/liftAll.lft warn $ntlist endif end # make trf-masked chrom-level .fa foreach c (?{,?}) cd $c if (-e chr$c.trf.bed) then echo masking $c... cp chr$c.fa chr$c.trf.fa maskOutFa -softAdd chr$c.trf.fa chr$c.trf.bed chr$c.trf.fa endif cd .. end # make nib mkdir trfMixedNib foreach c (?{,?}) if (-e $c/chr$c.trf.fa) then faToNib -softMask $c/chr$c.trf.fa trfMixedNib/chr$c.nib endif end rm -rf /scratch/hg/mm2/chromTrfMixedNib cp -pR trfMixedNib /scratch/hg/mm2/chromTrfMixedNib DOING HUMAN/MOUSE ALIGNMENTS (todo) o - Download the lower-case-masked assembly and put it in kkstore:/cluster/store1/a2ms. o - Download the assembled mouse genome in lower-case masked form to /cluster/store1/arachne.3/whole.
Execute the script splitAndCopy.csh to chop it into roughly 50M pieces in arachne.3/parts o - Set up the jabba job to do the alignment as so: ssh kkstore cd /cluster/store2/mm.2002.02/mm2 mkdir blatMouse.phusion cd blatMouse.phusion ls -1S /scratch/hg/gs.3/build28/contigTrf/* > human.lst ls -1 /cluster/store1/arachne.3/parts/* > mouse.lst Make a file 'gsub' with the following three lines in it #LOOP /cluster/home/kent/bin/i386/blat -q=dnax -t=dnax {check in line+ $(path2)} {check in line+ $(path1)} {check out line+ psl/$(root2)_$(root1).psl} -minScore=20 -minIdentity=20 -tileSize=4 -minMatch=2 -oneOff=0 -ooc={check in exists /scratch/hg/h/4.pooc} -qMask=lower -mask=lower #ENDLOOP Process this into a jabba file and launch the first set of jobs (10,000 out of 70,000) as so: gensub2 mouse.lst human.lst gsub spec jabba make hut spec jabba push hut Do a 'jabba check hut' after about 20 minutes and make sure everything is right. After that make a little script that does a "jabba push hut" followed by a "sleep 30" about 50 times. Interrupt script when you see jabba push say it's not pushing anything. o - Sort alignments as so ssh kkstore cd /cluster/store2/mm.2002.02/mm2/blatMouse pslCat -dir -check psl | liftUp -type=.psl stdout ../liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/store2/temp stdin o - Get rid of big pile-ups due to contamination as so: cd chrom foreach i (*.psl) echo $i mv $i xxx pslUnpile -maxPile=600 xxx $i rm xxx end o - Remove long redundant bits from read names by making a file called subs.in with the following line: gnl|ti^ti contig_^tig_ and running the commands cd ~/mouse/vsOo33/blatMouse.phusion/chrom subs -e -c ^ *.psl > /dev/null o - Copy over to network where database is: ssh kks00 cd ~/mm/bed mkdir blatMouse mkdir blatMouse/ph.chrom600 cd !$ cp /cluster/store2/mm.2002.02/mm2/blatMouse.phusion/chrom/*.psl . 
o - Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/mm/bed/blatMouse/ph.chrom600 foreach i (*.psl) set r = $i:r mv $i ${r}_blatMouse.psl end hgLoadPsl mm2 *.psl o - load sequence into database as so: ssh kks00 faSplit about /projects/hg3/mouse/arachne.3/whole/Unplaced.mfa 1200000000 /projects/hg3/mouse/arachne.3/whole/unplaced ssh hgwdev hgLoadRna addSeq '-abbr=gnl|' mm2 /projects/hg3/mouse/arachne.3/whole/unpla*.fa hgLoadRna addSeq '-abbr=con' mm2 /projects/hg3/mouse/arachne.3/whole/SET*.mfa This will take quite some time. Perhaps an hour. o - Produce 'best in genome' filtered version: ssh kks00 cd ~/mouse/vsOo33 pslSort dirs blatMouseAll.psl temp blatMouse pslReps blatMouseAll.psl bestMouseAll.psl /dev/null -singleHit -minCover=0.3 -minIdentity=0.1 pslSortAcc nohead bestMouse temp bestMouseAll.psl cd bestMouse foreach i (*.psl) set r = $i:r mv $i ${r}_bestMouse.psl end o - Load best in genome into database as so: ssh hgwdev cd ~/mouse/vsOo33/bestMouse hgLoadPsl mm2 *.psl PRODUCING CROSS_SPECIES mRNA ALIGNMENTS (done) Here you align vertebrate mRNAs against the masked genome on the cluster you set up during the previous step. Make sure that gbpri, gbmam, gbrod, and gbvert are downloaded from Genbank into /cluster/store1/genbank.128 and unpacked by organism into /cluster/store1/mrna.128/org. Set up cluster run more or less as so: ssh kk cd ~/mm/bed mkdir xenoMrna cd xenoMrna ls -1S /scratch/hg/mm2/mContigs/* > genome.lst ls -1S /scratch/hg/mrna.128/org/*/mrna.fa > allMrna Then edit allMrna removing the Mus.musculus line, and writing the first line into 1.org, the second line into 2.org, and so forth. After the 6th line just leave the rest in 7.org. Then ls -1 *.org > rna.lst cp ~/mm/bed/xenoMrna/gsub . gensub2 genome.lst rna.lst gsub spec para create para try para check If all looks well do para push.
Sort xeno mRNA alignments as so: ssh kkstore cd ~/mm/bed/xenoMrna pslSort dirs raw.psl /cluster/store2/temp psl pslReps raw.psl cooked.psl /dev/null -minAli=0.25 liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl pslSortAcc nohead chrom /cluster/store2/temp chrom.psl pslCat -dir chrom > xenoMrna.psl rm -r chrom raw.psl cooked.psl chrom.psl Load into database as so: ssh hgwdev cd ~/mm/bed/xenoMrna hgLoadPsl mm2 xenoMrna.psl -tNameIx Load other RNA into database as so: cd /cluster/store1/mrna.128/topOrg Note - need to describe how topOrg was made. See topOrg/README... foreach i (*/mrna.fa) hgLoadRna add mm2 /cluster/store1/mrna.128/org/$i $i:r.ra -type=$i:r echo done $i end PRODUCING TETRAODON FISH ALIGNMENTS (done) o - Download sequence from ... and put it on the cluster local disk at /scratch/hg/fish o - Do fish/mouse alignments. ssh kk cd ~/mm/bed mkdir blatFish cd blatFish mkdir psl ls -1S /scratch/hg/fish/* > fish.lst ls -1S /scratch/hg/mm2/trfFa/* > mouse.lst cp ~/lastMm/blatFish/gsub . gensub2 mouse.lst fish.lst gsub spec para create spec para try Make sure jobs are going ok with para check. Then para push wait about 2 hours and do another para push do para checks and if necessary para pushes until done or use para shove. o - Sort alignments as so pslCat -dir psl | liftUp -type=.psl stdout ~/mm/jkStuff/liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/fast1/temp stdin o - Copy to hgwdev:/scratch. Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/mm/bed/blatFish/chrom foreach i (*.psl) set r = $i:r mv $i ${r}_blatFish.psl end hgLoadPsl mm2 *.psl hgLoadRna addSeq mm2 /cluster/store2/fish/seq15jun2001/*.fa PRODUCING FUGU FISH ALIGNMENTS (Done 10/21/02 by Matt) o - Download sequence to /cluster/store3/fuguSeq from ... and put it on the cluster local disk at /scratch/hg/fugu on kkstore. 
Sequence was downloaded from: ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_mask.fasta.Z ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_prot.fasta.Z faSplit sequence ../fugu_v3_mask.fasta 1000 fuguSplit o - Do fish/mouse alignments. ssh kk cd ~/mm/bed mkdir blatFugu cd blatFugu mkdir psl ls -1S /scratch/hg/fugu/* > fugu.lst ls -1S /scratch/hg/mm2/trfFa.0802/* > mouse.lst # Run mkdirs.sh # Edit gsub to fit the dir structure gensub2 mouse.lst fugu.lst gsub spec para create spec para try Make sure jobs are going ok with para check. Then para push wait about 2 hours and do another para push do para checks and if necessary para pushes until done or use para shove. o - Sort alignments as so pslCat -dir psl/* | liftUp -type=.psl stdout ~/mm2/jkStuff/liftAll.lft warn stdin | pslSortAcc nohead chrom /oldscratch stdin o - ssh hgwdev load into database: ssh hgwdev cd ~/mm2/bed/blatFugu/chrom foreach i (*.psl) set r = $i:r mv $i ${r}_blatFugu.psl end hgLoadPsl mm2 *.psl hgLoadRna addSeq mm2 /cluster/store3/fuguSeq/fugu_v3_mask.fasta LOAD GENEID GENES (done) cd ~/mm/bed mkdir geneid cd geneid mkdir download cd download Now download *.gtf and *.prot from http://www1.imim.es/genepredictions/M.musculus/mmFeb2002/geneid_v1.1 Get rid of the extra .N in the transcripts with subs. cd .. cp ~/lastMm/bed/geneid/subs . subs -e download/*.gtf > /dev/null ldHgGene mm2 geneid download/*.gtf -exon=CDS hgPepPred mm2 generic geneidPep download/*.prot SGP GENE PREDICTIONS (DONE 01/29/03) mkdir -p ~/mm2/bed/sgp/download cd ~/mm2/bed/sgp/download foreach f (~/mm2/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget http://genome.imim.es/genepredictions/M.musculus/mmFeb2002/SGP/humangp20021114/$chr.gtf wget http://genome.imim.es/genepredictions/M.musculus/mmFeb2002/SGP/humangp20021114/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd ..
ldHgGene mm2 sgpGene download/*.gtf -exon=CDS hgPepPred mm2 generic sgpPep download/*-fixed.prot TIGR GENE INDEX (REDONE 3/31/03) o mkdir -p ~/mm2/bed/tigr cd ~/mm2/bed/tigr wget ftp://ftp.tigr.org/private/NHGI_mgi_jiashu/TGI_track_MouseGenome_Feb2003.tgz tar xvzf TGI*.tgz foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (mouse cow human pig rat) setenv O $o foreach f ([Cc]hr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ldHgGene -exon=TC mm2 tigrGeneIndex *.gff LOAD STS MAP (todo) - login to hgwdev cd ~/mm/bed mm2 < ~/src/hg/lib/stsMap.sql mkdir stsMap cd stsMap bedSort /projects/cc/hg/mapplots/data/tracks/build28/stsMap.bed stsMap.bed - Enter database with "mm2" command. - At mysql> prompt type in: load data local infile 'stsMap.bed' into table stsMap; - At mysql> prompt type quit LOAD MGI IDs (done) - The Locuslink ID to MGI IDs conversion data file, LL2MGI.txt, from Jackson Lab should be found under ~/mm/bed/refSeq - login to hgwdev cd ~/mm/bed/refSeq mm2 < ~/src/hg/lib/mgiID.sql - Enter database with "mm2" command. - At mysql> prompt type in: load data local infile 'LL2MGI.txt' into table MGIid; - At mysql> prompt type quit LOAD CHROMOSOME BANDS (todo) - login to hgwdev cd /cluster/store2/mm.2002.02/mm2/bed mkdir cytoBands cp /projects/cc/hg/mapplots/data/tracks/build28/cytobands.bed cytoBands mm2 < ~/src/hg/lib/cytoBand.sql Enter database with "mm2" command. - At mysql> prompt type in: load data local infile 'cytobands.bed' into table cytoBand; - At mysql> prompt type quit LOAD MOUSEREF TRACK (todo) First copy in data from kkstore to ~/mm/bed/mouseRef. Then substitute 'genome' for the appropriate chromosome in each of the alignment files.
Finally do: hgRefAlign webb mm2 mouseRef *.alignments LOAD AVID MOUSE TRACK (todo) ssh cc98 cd ~/mm/bed mkdir avidMouse cd avidMouse wget http://pipeline.lbl.gov/tableCS-LBNL.txt hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed hgLoadBed avidRepeat avidRepeat.bed hgLoadBed avidUnique avidUnique.bed LOAD SNPS (Done. Daryl Thomas August 16, 2002) - ssh hgwdev - cd ~/mm/bed - mkdir snp - cd snp - Download SNPs from ftp://ftp.ncbi.nlm.nih.gov/pub/sherry/mouse.b27.out.gz - Unpack. createBed < mouse.b27.out > snpNih.bed hgLoadBed mm2 snpNih snpNih.bed LOAD CPGISLANDS (todo) login to hgwdev cd /cluster/store2/mm.2002.02/mm2/bed mkdir cpgIsland cd cpgIsland Get cpgisland tarball out of email from Asif (achinwal@watson.wustl.edu) and unpack it. awk -f filter.awk */ctg*/*.cpg > cpgIsland.bed mysql -u hgcat -pBIGSECRET -A mm2 < ~/src/hg/lib/cpgIsland.sql mysql -u hgcat -pBIGSECRET -A mm2 At mysql> prompt type in: load data local infile 'cpgIsland.bed' into table cpgIsland LOAD ENSEMBL ESTs (done 05/28/02, reloaded w/new data 08/05/02) ln -s /cluster/store2/mm.2002.02/mm2 ~/mm2 mkdir -p ~/mm2/bed/ensembl cd ~/mm2/bed/ensembl wget http://www.ebi.ac.uk/~stabenau/mouse-est.gz wget http://www.ebi.ac.uk/~stabenau/mouse-est.pep.gz gunzip -c mouse-est.gz | \ perl -w -p -e 's/^(\w)/chr$1/' > mouse-est-fixed.gtf ldHgGene mm2 ensEst mouse-est-fixed.gtf > The id behind '>' is internal and was not in our gtf dump, so > you have to do some more parsing.
# pick out the transcript= attribute -- that's the id to use: # also remove the first line: gunzip -c mouse-est.pep.gz | tail +2 | \ perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \ mouse-est-fixed.pep hgPepPred mm2 generic ensEstPep mouse-est-fixed.pep LOAD ENSEMBL GENES (done 05/28/02, reloaded w/new data 08/05/02) mkdir -p ~/mm2/bed/ensembl cd ~/mm2/bed/ensembl wget http://www.ebi.ac.uk/~stabenau/mouse-ensembl.gz wget http://www.ebi.ac.uk/~stabenau/mouse-ensembl.pep.gz gunzip -c mouse-ensembl.gz | \ perl -w -p -e 's/^(\w)/chr$1/' > mouse-ensembl-fixed.gtf ldHgGene mm2 ensGene mouse-ensembl-fixed.gtf > mouse-ensembl contains stopcodons, due to some glitches in our > genebuild. The id behind '>' is internal and was not in our gtf dump, so > you have to do some more parsing. # pick out the transcript= attribute -- that's the id to use: # also remove the first line: tail +2 mouse-ensembl.pep | \ perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \ mouse-ensembl-fixed.pep hgPepPred mm2 generic ensPep mouse-ensembl-fixed.pep LOAD ENSEMBL "Merge" TRACKs - SECRET! (done 6/25/02) - Use mgsc database, not mm2. Only MGSC members should be able to access this track, and only by password protection. mkdir -p ~/mm2/bed/ensembl cd ~/mm2/bed/ensembl foreach tier (b c d) GET http://www.ebi.ac.uk/~stabenau/tier$tier.gtf.gz > tier$tier.gtf.gz GET http://www.ebi.ac.uk/~stabenau/mouse_tier$tier.fa.gz > tier$tier.fa.gz gunzip -c tier$tier.gtf.gz | \ perl -w -p -e 's/^(\w)/chr$1/' > tier$tier-fixed.gtf gunzip -c tier$tier.fa.gz | \ perl -w -p -e 's/^\>.*source_id=(\S+)\s+.*$/\>$1/' > \ tier$tier-fixed.pep set Tier = `echo $tier | tr 'a-z' 'A-Z'` ldHgGene mgsc ensMergeTier$Tier tier$tier-fixed.gtf hgPepPred mgsc generic ensMergeTier${Tier}Pep tier$tier-fixed.pep end NOTE: because this track contains ensRiken transcripts, it had to be made secret - see NOTE's below & revision history comments for hgTracks.c 1.277. LOAD ENSEMBL/RIKEN - SECRET!
(05/31/02 - pep todo, reloaded w/new data 08/05/02) - Use mgsc database, not mm2. Only MGSC members should be able to access this track, and only by password protection. mkdir -p ~/mm2/bed/ensRiken cd ~/mm2/bed/ensRiken wget http://www.ebi.ac.uk/~stabenau/mouse-riken.gz wget http://www.ebi.ac.uk/~stabenau/??? gunzip -c mouse-riken.gz | \ perl -w -p -e 's/^(\w)/chr$1/' > mouse-riken-fixed.gtf ldHgGene mgsc ensRiken mouse-riken-fixed.gtf > The id behind '>' is internal and was not in our gtf dump, so > you have to do some more parsing. # pick out the transcript= attribute -- that's the id to use: # also remove the first line: gunzip -c mouse-riken.pep.gz | tail +2 | \ perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \ mouse-riken-fixed.pep hgPepPred mgsc generic ensRikenPep mouse-riken-fixed.pep - NOTE: hooks had to be added to hgTracks.c to enable/disable this track together with the main riken track. see revision history comments for hgTracks.c version 1.262 . - NOTE: had to create empty ensRiken table in mm2 in order for hdb.c to believe the track exists. 
I used this sql command: CREATE TABLE ensRiken ( name varchar(255) not null, chrom varchar(255) not null, strand char(1) not null, txStart int(10) unsigned not null, txEnd int(10) unsigned not null, cdsStart int(10) unsigned not null, cdsEnd int(10) unsigned not null, exonCount int(10) unsigned not null, exonStarts longblob not null, exonEnds longblob not null ); LOAD SANGER22 GENES (todo) - cd ~/mm/bed - mkdir sanger22 - cd sanger22 - not sure where these files were downloaded from - grep -v Pseudogene Chr22*.genes.gff | hgSanger22 mm2 stdin Chr22*.cds.gff *.genes.dna *.cds.pep 0 | ldHgGene mm2 sanger22pseudo stdin - Note: this creates sanger22extras, but doesn't currently create a correct sanger22 table, which are replaced in the next steps - sanger22-gff-doctor Chr22.3.1x.cds.gff Chr22.3.1x.genes.gff \ | ldHgGene mm2 sanger22 stdin - sanger22-gff-doctor -pseudogenes Chr22.3.1x.cds.gff Chr22.3.1x.genes.gff \ | ldHgGene mm2 sanger22pseudo stdin - hgPepPred mm2 generic sanger22pep *.pep LOAD SANGER 20 GENES (todo) First download files from James Gilbert's email to ~/mm/bed/sanger20 and go to that directory while logged onto hgwdev. Then: grep -v Pseudogene chr_20*.gtf | ldHgGene mm2 sanger20 stdin hgSanger20 mm2 *.gtf *.info LOAD RNAGENES (todo) - login to hgwdev - cd ~kent/src/hg/lib - mm2 < rnaGene.sql - cd /cluster/store2/mm.2002.02/mm2/bed - mkdir rnaGene - cd rnaGene - download data from ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-oo27.gff.gz - gunzip *.gz - liftUp chrom.gff ../../jkStuff/liftAll.lft carry ncrna-oo27.gff - hgRnaGenes mm2 chrom.gff LOAD EXOFISH (todo) - login to hgwdev - cd /cluster/store2/mm.2002.02/mm2/bed - mkdir exoFish - cd exoFish - mm2 < ~kent/src/hg/lib/exoFish.sql - Put email attachment from Olivier Jaillon (ojaaillon@genoscope.cns.fr) into /cluster/store2/mm.2002.02/mm2/bed/exoFish/all_maping_ecore - awk -f filter.awk all_maping_ecore > exoFish.bed - hgLoadBed mm2 exoFish exoFish.bed LOAD MOUSE SYNTENY (todo) - login to hgwdev.
- cd ~/kent/src/hg/lib - mm2 < mouseSyn.sql - mkdir ~/mm/bed/mouseSyn - cd ~/mm/bed/mouseSyn - Put Deanna Church's (church@ncbi.nlm.nih.gov) email attachment as mouseSyn.txt - awk -f format.awk *.txt > mouseSyn.bed - delete first line of mouseSyn.bed - Enter database with "mm2" command. - At mysql> prompt type in: load data local infile 'mouseSyn.bed' into table mouseSyn LOAD GENIE (done 05/30/02) mkdir -p ~/mm2/bed/genieAlt cd ~/mm2/bed/genieAlt wget http://www.neomorphic.com/mgap/mgscv3/gtf/mgscv3.genie.gtf.tgz gunzip -c mgscv3.genie.gtf.tgz | tar xvf - ldHgGene mm2 genieAlt mgscv3.genie.gtf/chr*.gtf wget http://www.neomorphic.com/mgap/mgscv3/fa/mgscv3.aa.tgz gunzip -c mgscv3.aa.tgz | tar xvf - hgPepPred mm2 genie geniePep chr*.aa.fa LOAD GENIE CLONE BOUNDS (done 6/3/02) mkdir -p ~/mm2/bed/genieBounds cd ~/mm2/bed/genieBounds wget http://www.neomorphic.com/mgap/mgscv3/cb.bed/mgscv3_cb.bed.tgz gunzip -c mgscv3_cb.bed.tgz | tar xvf - - Trim the track definition from each file (these are actually custom track files): foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un) tail +2 chr${c}_cb.bed > chr${c}_cb-fixed.bed end hgLoadBed mm2 genieBounds *-fixed.bed LOAD JACKSON LABS QTL (DONE 03/13/03) mkdir ~/mm2/bed/jaxQTL2 # Save the email attachment from Sridhar Ramachandran at Jackson Labs # (bed 8+, jaxQTL2 format). # Strip the column headers and load into the database.
tail +2 QTLBedFormat.txt > jaxQTL2.bed hgLoadBed -noBin -tab -sqlTable=$HOME/kent/src/hg/lib/jaxQTL2.sql \ mm2 jaxQTL2 jaxQTL2.bed LOAD SOFTBERRY GENES (todo) - ln -s /cluster/store2/mm.2002.02/mm2 ~/mm - cd ~/mm/bed - mkdir softberry - cd softberry - get ftp://www.softberry.com/pub/SC_MOU_NOV01/softb_mou_genes_nov01.tar.gz ldHgGene mm2 softberryGene chr*.gff hgPepPred mm2 softberry *.protein hgSoftberryHom mm2 *.protein LOAD ACEMBLY (todo) - Get acembly*gene.gff from Jean and Danielle Thierry-Mieg and place in ~/mm/bed/acembly - Replace c_chr with chr in acembly*.gff - Replace G_t1_chr with chr and likewise G_t2_chr with chr, etc. - cd ~/mm/bed/acembly - # The step directly below is not necessary since the files were already lifted # liftUp ./aceChrom.gff /cluster/store2/mm.2002.02/mm2/jkStuff/liftHs.lft warn acemblygenes*.gff - Use /cluster/store2/mm.2002.02/mm2/mattStuff/filterFiles.pl to prepend "chr" to the start of every line in the gene.gff files and to concat them into the aceChrom.gff file. Read the script to see what it does. It's tiny and simple. - Concatenate all the protein.fasta files into a single acembly.pep file - Load into database as so: ldHgGene mm2 acembly aceChrom.gff hgPepPred mm2 generic acemblyPep acembly.pep LOAD GENOMIC DUPES (todo) o - Load genomic dupes ssh hgwdev cd ~/mm/bed mkdir genomicDups cd genomicDups wget http://codon/jab/web/takeoff/oo33_dups_for_kent.zip unzip *.zip awk -f filter.awk oo33_dups_for_kent > genomicDups.bed mysql -u hgcat -pbigSECRET mm2 < ~/src/hg/lib/genomicDups.sql hgLoadBed mm2 -oldTable genomicDups genomicDups.bed FAKING DATA FROM PREVIOUS VERSION (This is just for until proper track arrives. Rescues about 97% of data Just an experiment, not really followed through on).
o - Rescuing STS track: - log onto hgwdev - mkdir ~/mm/rescue - cd !$ - mkdir sts - cd sts - bedDown hg3 mapGenethon sts.fa sts.tab - echo ~/mm/sts.fa > fa.lst - pslOoJobs ~/mm ~/mm/rescue/sts/fa.lst ~/mm/rescue/sts g2g - log onto cc01 - cc ~/mm/rescue/sts - split all.con into 3 parts and condor_submit each part - wait for assembly to finish - cd psl - mkdir all - ln ?/*.psl ??/*.psl *.psl all - pslSort dirs raw.psl temp all - pslReps raw.psl contig.psl /dev/null - rm raw.psl - liftUp chrom.psl ../../../jkStuff/liftAll.lft carry contig.psl - rm contig.psl - mv chrom.psl ../convert.psl LOAD SLAM GENES (hg12) cd /cluster/store3/gs.13/build30/bed mkdir slam cd slam wget http://bio.math.berkeley.edu/slam/mouse/gff/UCSC/mmCDS.gff.gz wget http://bio.math.berkeley.edu/slam/mouse/gff/UCSC/mmCNS.gff.gz gunzip * ldHgGene -exon=CDS mm2 slam mmCDS.gff mv genePred.tab genePred.mm2 awk '{print $1,$4,$5,$10,$12}' mmCNS.gff > mmCNS.bed sed -e 's/;//g' -e 's/"//g' mmCNS.bed > mmCNS.bed.2 sort -n -k 5,5 mmCNS.bed.2 > mmCNS.bed.sort examine head and tail of sorted file for range of scores rm mmCNS.bed.sort size.pl < mmCNS.bed.2 > mmCNS.bed.2.size sort -n -k 2,2 mmCNS.bed.2.size > mmCNS.bed.2.size.sort examine head and tail of sorted file for range of sizes rm mmCNS.bed.2.size.sort expand.pl < mmCNS.bed.2 > mmCNS.bed.2.expand SLAM (hg13) cd /cluster/store2/mm.2002.02/mm2/bed/slam mkdir hg13 cd hg13 wget http://baboon.math.berkeley.edu/~cdewey/slam/hs_31_Nov2002_mm_3_Feb2002/gff/mmCDS.gff.gz gunzip mmCDS.gff.gz mv mmCDS.gff mouseFromHumanCDS.gff ldHgGene -exon=CDS mm2 slamHuman wget http://baboon.math.berkeley.edu/~scawley/slam/hs_31_Nov2002_mm_3_Feb2002/gff/mmCNS.bed.gz gunzip mmCNS.bed.gz mv mmCNS.bed mouseFromHumanCNS.bed expand.pl < humanFromMouseCNS.bed > humanFromMouseCNS.bed.expand hgLoadBed -tab hg13 slamNonCodingMouse humanFromMouseCNS.bed.expand CREATING THE musHumL SAMPLE TRACK (a.k.a WIGGLE TRACK) ------------------------------------------------------ o - refer to the script
at src/hg/sampleTracks/makeMm2Hg12.doc #################################################################### ## NIA Mouse Gene Index - (WORKING - 2004-01-07 - Hiram) # requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov # pick up data (available only this one time) ssh hgwdev mkdir /cluster/data/mm2/bed/NIAGene cd /cluster/data/mm2/bed/NIAGene wget --timestamping \ http://lgsun.grc.nia.nih.gov/geneindex/blatNAP-genome.txt wget --timestamping \ "http://lgsun.grc.nia.nih.gov/Supplemental-Information/NAP.fasta" # This file seems to have an extra field at the end ? # It is always a 1 except for two entries that do not have # this extra field. awk '{for (i=1; i<21; ++i) { printf "%s\t",$i; } printf "%s\n",$21;}' \ blatNAP-genome.txt > blatNAP-genome.psl hgLoadPsl mm2 -table=NIAGene blatNAP-genome.psl # load of NIAGene did not go as planned: 114560 record(s), 0 row(s) skipped, # 12 warning(s) loading psl.tab mkdir /gbdb/mm2/NIAGene ln -s /cluster/data/mm2/bed/NIAGene/NAP.fasta \ /gbdb/mm2/NIAGene/NAP.fa hgLoadSeq mm2 /gbdb/mm2/NIAGene/NAP.fa