# for emacs: -*- mode: sh; -*- # This file describes how we made the browser database on the mouse # genome, February 2003 build. DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE 02/06/03) mkdir -p /cluster/store2/mm.2003.02/ncbi cd /cluster/store2/mm.2003.02/ncbi mkdir chrfasta contigfasta ftp ftp.ncbi.nih.gov # user hgpguest, password from /cse/guests/kent/buildHg6.doc cd mouse_30 prompt bin mget * quit gunzip *.agp.gz GET UPDATED MOUSE ASSEMBLY FILES FROM NCBI (DONE 02/19/03) # Deanna regenerated allrefcontig.chr.agp for us with _random's. # While downloading that, I noticed some other new files. I saved # the original copies of those to *.orig. cd /cluster/store2/mm.2003.02/ncbi ftp ftp.ncbi.nih.gov # user hgpguest, password from /cse/guests/kent/buildHg6.doc cd mouse_30 prompt bin mget allrefcontig.chr.agp.gz contig.idmap ctg_coords seq_contig.md quit BREAK UP SEQUENCE INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS (DONE 02/06/03) # This version of the mouse sequence data is in # /cluster/store2/mm.2003.02/mm3/assembly # This will split the mouse sequence into approx. 5 Mbase supercontigs # between non-bridged clone contigs and drop the resulting dir structure # in /cluster/store2/mm.2003.02/mm3. # The resulting dir structure will include 1 dir for each chromosome, # each of which has a set of subdirectories, one subdir per supercontig. ssh hgwdev # cd into your CVS source tree under kent/src/hg/splitFaIntoContigs make ssh kkstore mkdir /cluster/store2/mm.2003.02/mm3 cd /cluster/store2/mm.2003.02/mm3 # splitFaIntoContigs doesn't do right with agp lines arriving in a # different order than fasta chrom sequences. so split up the agp # into one per chrom. 
foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y Un ) mkdir $c perl -we "while(<>){if (/^chr$c\t/) {print;}}" \ ../ncbi/allrefcontig.chr.agp \ > $c/chr$c.agp gunzip -c /cluster/store2/mm.2003.02/ncbi/chrfasta/chr$c.fa.gz \ | perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' \ | splitFaIntoContigs $c/chr$c.agp \ stdin /cluster/store2/mm.2003.02/mm3 -nSize=5000000 end CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 02/20/03) ssh kkstore cd /cluster/store2/mm.2003.02/ncbi ../mm3/jkStuff/ncbiToRandomAgps seq_contig.md allrefcontig.chr.agp \ contig.idmap ../mm3 cd /cluster/store2/mm.2003.02/mm3 foreach c (?{,?}) if (-e $c/chr${c}_random.ctg.agp) then echo building $c/chr${c}_random.fa gunzip -c ../ncbi/contigfasta/chr$c.fa.gz \ | perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' \ > ./tmp.fa agpToFa -simpleMulti $c/chr${c}_random.ctg.agp chr${c}_random \ $c/chr${c}_random.fa ./tmp.fa rm tmp.fa endif end # At 37402 sequences with 50000-long gaps in between, chrUn is just # too damn big to fit in memory on any of our machines!! So use a # gap size of 1000 (OK with Jim and Terry) just for chrUn. cd /cluster/store2/mm.2003.02/ncbi ../mm3/jkStuff/ncbiToRandomAgps -gapLen 1000 -chrom Un \ seq_contig.md allrefcontig.chr.agp contig.idmap ../mm3 cd /cluster/store2/mm.2003.02/mm3 set c=Un echo building $c/chr${c}_random.fa gunzip -c ../ncbi/contigfasta/chr$c.fa.gz \ | perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' \ > ./tmp.fa agpToFa -simpleMulti $c/chr${c}_random.ctg.agp chr${c}_random \ $c/chr${c}_random.fa ./tmp.fa rm tmp.fa # Clean these up to avoid confusion later... they're easily rebuilt. rm ?{,?}/*.ctg.agp BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS (DONE 02/20/03) ssh kkstore cd /cluster/store2/mm.2003.02/mm3 foreach c (?{,?}) if (-e $c/chr${c}_random.agp) then splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . 
\ -nSize=5000000 mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst mv ${c}_random/lift/ordered.lft $c/lift/random.lft mv ${c}_random/lift/ordered.lst $c/lift/random.lst rmdir ${c}_random/lift rm ${c}_random/chr${c}_random.{agp,fa} mv ${c}_random/* $c rmdir ${c}_random endif end CREATING DATABASE (DONE 02/06/03) o - Create the database. - ssh hgwdev - Enter mysql via: mysql -u hgcat -pbigsecret - At mysql prompt type: create database mm3; quit - make a semi-permanent read-only alias: alias mm3 "mysql -u hguser -phguserstuff -A mm3" o - Use df to make sure there is at least 5 gig free on hgwdev:/var/lib/mysql CREATING GRP TABLE FOR TRACK GROUPING (DONE 02/11/03) ssh hgwdev echo "create table grp (PRIMARY KEY(NAME)) select * from mm2.grp" \ | hgsql mm3 STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE 02/20/03) # Create (unmasked) nib files ssh kkstore cd ~/mm mkdir -p nib foreach f (?{,?}/chr*.fa) echo $f:t:r faToNib $f nib/$f:t:r.nib end # Create symbolic links from /gbdb/mm3/nib to real nib files ssh hgwdev mkdir -p /gbdb/mm3/nib foreach f (/cluster/store2/mm.2003.02/mm3/nib/chr*.nib) ln -s $f /gbdb/mm3/nib end # Load /gbdb nib paths into database and save size info. ssh hgwdev hgsql mm3 < ~/src/hg/lib/chromInfo.sql cd ~/mm hgNibSeq -preMadeNib mm3 /gbdb/mm3/nib ?/chr*.fa ??/chr*.fa echo "select chrom,size from chromInfo" | hgsql -N mm3 > chrom.sizes Store o+o info in database. cd /cluster/store2/mm.2003.02/mm3 hgGoldGapGl mm3 /cluster/store2/mm.2003.02 mm3 -noGl Make and load GC percent table ssh hgwdev mkdir -p /cluster/store2/mm.2003.02/mm3/bed/gcPercent cd /cluster/store2/mm.2003.02/mm3/bed/gcPercent hgsql mm3 < ~/src/hg/lib/gcPercent.sql hgGcPercent mm3 ../../nib ADD MAP CONTIGS TRACK (DONE 02/28/03) ssh hgwdev mkdir -p ~/mm3/bed/ctgPos cd ~/mm3/bed/ctgPos # hgCtgPos uses the lift files... but mouse lift files are for the # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs # from the assembly. (In the future, we should go with the NT's!) # So...
just for this release, go straight from the seq_contig.md # to the table def'n: contig, size, chrom, chromStart, chromEnd perl -we \ 'while (<>) { \ if (/^\d+\s+(\w+)\s+(\d+)\s+(\d+)\s+\S+\s+(NT_\d+)\s+.*ref_strain/) { \ $chr=$1; $start=$2; $start -= 1; $end=$3; $ctg=$4; \ print "$ctg\t" . ($end-$start) . "\tchr$chr\t$start\t$end\n"; \ } \ }' /cluster/store2/mm.2003.02/ncbi/seq_contig.md \ > ctgPos.tab hgsql mm3 < ~/kent/src/hg/lib/ctgPos.sql echo "load data local infile 'ctgPos.tab' into table ctgPos" | hgsql mm3 # Note: the info is there in seq_contig.md to also do the _random's, # but we'd have to do some more work: duplicate the gaps of 50000 between # contigs for all _random's except chrUn_random (1000 between). MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR MM3 (DONE 02/06/03) # Enter mm3 into hgcentraltest.dbDb so test browser knows about it: echo 'insert into dbDb values("mm3", "Mouse Feb. 2003", "/gbdb/mm3/nib", "Mouse", "USP18", 1, 30, "Mouse");' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add mm3 in all the right places and do make update make alpha cvs commit makefile MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR MM3 (DONE 02/13/03) ssh hgwdev echo 'insert into blatServers values("mm3", "blat9", "17778", "1"); \ insert into blatServers values("mm3", "blat9", "17779", "0");' \ | hgsql -h genome-testdb hgcentraltest REPEAT MASKING (DONE 03/05/03) Split contigs, run RepeatMasker, lift results Notes: * If there is a new version of RepeatMasker, build it and ask the admins to binrsync it (kkstore:/scratch/hg/RepeatMasker/*). * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make RepeatMasker runs manageable on the cluster ==> results need lifting. 
* For the NCBI assembly we repeat mask on the sensitive mode setting (RepeatMasker -m -s) #- Split contigs into 500kb chunks: cd ~/mm3 foreach d ( */chr*_?{,?} ) cd $d set contig = $d:t faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -maxN=500000 cd ../.. end #- Make the run directory and job list: cd ~/mm3 mkdir RMRun rm -f RMRun/RMJobs touch RMRun/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) foreach f ( $d/chr*_*_*.fa ) set f = $f:t echo /cluster/bin/scripts/RMMouse \ /cluster/store2/mm.2003.02/mm3/$d $f \ '{'check out line+ /cluster/store2/mm.2003.02/mm3/$d/$f.out'}' \ >> RMRun/RMJobs end end #- Do the run ssh kk cd ~/mm3/RMRun para create RMJobs para try, para check, para check, para push, para check,... #- Lift up the split-contig .out's to contig-level .out's ssh kkstore cd ~/mm3 foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. end #- Lift up the contig-level .out's to chr-level cd ~/mm3 ./jkStuff/liftOut5.sh #- Load the .out files into the database with: ssh hgwdev cd ~/mm3 hgLoadOut mm3 ?/*.fa.out ??/*.fa.out VERIFY REPEATMASKER RESULTS (IN PROGRESS) # Run featureBits on mm3 and on a comparable genome build, and compare: ssh hgwdev featureBits mm3 rmsk # --> 1079906607 bases of 2708220133 (39.875%) in intersection # --> (orig run, July libs:) 1001999794 bases of 2577261074 (38.878%) in intersection featureBits mm2 rmsk # --> 1037964664 bases of 2726995854 (38.063%) in intersection MAKE LIFTALL.LFT (DONE 02/20/03) cd ~/mm3 cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft SIMPLE REPEAT TRACK (DONE 02/07/03; _randoms DONE 03/05/03) # TRF runs pretty quickly now... it takes a few hours total runtime. # Also, it ignores masking of input sequence, so this can be run in # parallel with RepeatMasker! 
# So instead of binrsyncing and para-running, just do this on kkstore: ssh kkstore mkdir ~/mm3/bed/simpleRepeat cd ~/mm3/bed/simpleRepeat mkdir trf rm -f jobs.csh touch jobs.csh foreach f (/cluster/store2/mm.2003.02/mm3/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa) set fout = $f:t:r.bed echo "/cluster/home/kent/bin/i386/trfBig -trf=/cluster/home/kent/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end tcsh jobs.csh |& tee jobs.log wc -l jobs.csh ls -1 trf | wc -l # When job is done do: liftUp simpleRepeat.bed ~/mm3/jkStuff/liftAll.lft warn trf/*.bed # Load this into the database as so ssh hgwdev cd ~/mm3/bed/simpleRepeat hgLoadBed mm3 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql PROCESS SIMPLE REPEATS INTO MASK (DONE 02/07/03; _randoms DONE 03/05/03) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kkstore cd ~/mm3/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd ~/mm3 mkdir -p bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 03/05/03) ssh kkstore cd ~/mm3 #- Soft-mask (lower-case) the contig and chr .fa's tcsh jkStuff/makeFaMasked.sh #- Make hard-masked .fa.masked files as well: tcsh jkStuff/makeHardMasked.sh #- Rebuild the nib, mixedNib, maskedNib files: tcsh jkStuff/makeNib.sh # Copy the masked contig fa to /scratch: rm -f 
/scratch/hg/mm3/trfFa mkdir -p /scratch/hg/mm3/trfFa cp -p ~/mm3/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa /scratch/hg/mm3/trfFa MAKE DOWNLOADABLE SEQUENCE FILES (DONE 03/05/03) ssh kkstore cd ~/mm3 #- Build the .zip files jkStuff/zipAll.sh |& tee zipAll.log mkdir zip mv *.zip zip cd zip #- Look at zipAll.log to make sure all file lists look reasonable. #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test #- Copy the .zip files to hgwdev:/usr/local/apache/... ssh hgwdev cd ~/mm3/zip ../jkStuff/cpToWeb.sh cd /usr/local/apache/htdocs/goldenPath/mmFeb2003 #- Take a look at bigZips/* and chromosomes/*, update their README.txt's # Then make the upstream sequence files. cd bigZips featureBits mm3 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa rm upstream1000.fa featureBits mm3 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa rm upstream2000.fa featureBits mm3 refGene:upstream:5000 -fa=upstream5000.fa zip upstream5000.zip upstream5000.fa rm upstream5000.fa CREATE CYTOBAND TRACK # Can be done at any time ssh hgwdev cd /cluster/data/mm3 mkdir cytoBand cd cytoBand # Get file from NCBI wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.30/Mouse400.dat.gz gunzip Mouse400.dat.gz # Create bed file /cluster/bin/scripts/createNcbiCytoBand Mouse400.dat # Load the bed file hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm3 cytoBand cytoBand.bed PREPARE CLUSTER FOR BLASTZ RUN (DONE 03/05/03) # This needs to be done after trf-masking and nib generation. ssh kkstore # Extract lineage-specific repeats using Arian Smit's script: mkdir -p ~/mm3/bed/linSpecRep cd ~/mm3/bed/linSpecRep foreach f (~/mm3/*/*.out) ln -sf $f . end /cluster/bin/scripts/rodentSpecificRepeats.pl *.out /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec rm *.out cd .. 
rm -rf /scratch/hg/mm3/linSpecRep mkdir -p /scratch/hg/mm3 cp -Rp linSpecRep /scratch/hg/mm3 # RepeatMasker .out: cd ~/mm3 rm -rf /scratch/hg/mm3/rmsk mkdir -p /scratch/hg/mm3/rmsk cp -p ?{,?}/chr?{,?}{,_random}.fa.out /scratch/hg/mm3/rmsk # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /scratch/hg/mm3/chromTrfMixedNib mkdir -p /scratch/hg/mm3/chromTrfMixedNib cp -p mixedNib/chr*.nib /scratch/hg/mm3/chromTrfMixedNib # Ask cluster-admin@cse.ucsc.edu to binrsync /scratch/hg to clusters # Jim's comments Feb 12 '03 about the order in which to run blastz: # In general we should do # 1) hg/mm # 2) mm/rn # 3) rn/hg # 4) hg/hg # 5) mm/mm # 6) rn/rn # There is now an 'axtSwap' program that might let us # get out of having to run the inverse of 1,2 & 3, though # 2 in particular is so fast perhaps it's just as well to # do the inverse explicitly. MAKING AND STORING mRNA AND EST ALIGNMENTS (DONE 02/09/03) # Load up the local disks of the cluster with refSeq.fa, mrna.fa and est.fa # from /cluster/store2/mrna.133 into /scratch/hg/mrna.133 # Make sure that /scratch/hg/mm3/trfFa is loaded with chr*_*.fa and pushed # to the cluster nodes. ssh kk cd ~/mm/bed foreach i (refSeq mrna est) mkdir $i cd $i ls -1S /mnt/scratch/hg/mm3/trfFa/*.fa > genome.lst ls -1 /mnt/scratch/hg/mrna.133/Mus_musculus/$i.fa > mrna.lst cp -p ~/lastMm/bed/$i/gsub . mkdir psl gensub2 genome.lst mrna.lst gsub spec para create spec cd .. end # In each dir: para try, para check, para push, para check.... # Process refSeq mRNA and EST alignments into near best in genome. ssh kkstore cd ~/mm/bed cd refSeq pslSort dirs raw.psl /cluster/store2/tmp psl pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl contig.psl /dev/null liftUp -nohead all_refSeq.psl ../../jkStuff/liftAll.lft warn contig.psl pslSortAcc nohead chrom /cluster/store2/tmp all_refSeq.psl pslCat -dir chrom > refSeqAli.psl cd .. 
cd mrna pslSort dirs raw.psl /cluster/store2/tmp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl /dev/null liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft warn contig.psl pslSortAcc nohead chrom /cluster/store2/tmp all_mrna.psl cd .. cd est pslSort dirs raw.psl /cluster/store2/tmp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl /dev/null liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft warn contig.psl pslSortAcc nohead chrom /cluster/store2/tmp all_est.psl cd .. # Load refSeq alignments into database ssh hgwdev cd ~/mm/bed/refSeq hgLoadPsl mm3 -tNameIx refSeqAli.psl # Load mRNA alignments into database. ssh hgwdev cd ~/mm/bed/mrna/chrom foreach i (*.psl) mv $i $i:r_mrna.psl end hgLoadPsl mm3 *.psl cd .. hgLoadPsl mm3 all_mrna.psl -nobin # Load EST alignments into database. ssh hgwdev cd ~/mm/bed/est/chrom foreach i (*.psl) mv $i $i:r_est.psl end hgLoadPsl mm3 *.psl cd .. hgLoadPsl mm3 all_est.psl -nobin # Create subset of ESTs with introns and load into database. - ssh kkstore cd ~/mm tcsh jkStuff/makeIntronEst.sh - ssh hgwdev cd ~/mm/bed/est/intronEst hgLoadPsl mm3 *.psl ADD REFSEQ/MRNA/EST ALIGNMENTS TO _RANDOMS (DONE 03/06/03) # Note: in future builds, we should get the _randoms in the beginning, # so this should not be necessary! ssh kk cd ~/mm/bed # refSeq, mrna done -- est too large for pslSort when run on unmasked, # so rerun est on masked. foreach i (refSeq mrna est) mkdir ${i}_random cd ${i}_random ls -1S /mnt/scratch/hg/mm3/contigs/chr*_random_*.fa > genome.lst ls -1 /mnt/scratch/hg/mrna.133/Mus_musculus/$i.fa > mrna.lst cp -p ~/lastMm/bed/$i/gsub . #-- Edit the gsub so it doesn't say -mask=lower!!! # _randoms are all lower right now -- I'm running this before masking. mkdir psl gensub2 genome.lst mrna.lst gsub spec para create spec cd .. end # para try,check,push,check... in each ${i}_random directory. # Now run the "Process..." and "Load..." 
steps above, but for *_random CREATE REFSEQ GENES TRACK (DONE 02/09/03) # Load the refSeq mRNA ssh hgwdev mkdir -p /gbdb/mm3/mrna.133 ln -s /cluster/store2/mrna.133/refSeq/org/Mus_musculus/refSeq.fa \ /gbdb/mm3/mrna.133 hgLoadRna new mm3 hgLoadRna add -type=refSeq mm3 /gbdb/mm3/mrna.133/refSeq.fa \ /cluster/store2/mrna.133/refSeq/org/Mus_musculus/refSeq.ra # Produce refGene, refPep, refMrna, and refLink tables as so: # Get the proteins: ssh kkstore cd ~/mm/bed/refSeq wget ftp://ftp.ncbi.nih.gov/refseq/M_musculus/mRNA_Prot/mouse.faa.gz wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc gunzip mouse.faa.gz ssh hgwdev cd ~/mm/bed/refSeq hgRefSeqMrna mm3 \ /gbdb/mm3/mrna.133/refSeq.fa \ /cluster/store2/mrna.133/refSeq/org/Mus_musculus/refSeq.ra \ all_refSeq.psl loc2ref mouse.faa mim2loc # Don't worry about the "No gene name" errors # Add RefSeq status info hgRefSeqStatus -mouse mm3 loc2ref REFFLAT (DONE 02/25/03) # create precomputed join of refFlat and refGene: echo 'CREATE TABLE refFlat (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) SELECT refLink.name as geneName, refGene.* FROM refLink,refGene WHERE refLink.mrnaAcc = refGene.name' | hgsql mm3 LOAD MRNA DATA (DONE 02/09/03) ssh hgwdev ln -s /cluster/store2/mrna.133/org/Mus_musculus/mrna.fa /gbdb/mm3/mrna.133 ln -s /cluster/store2/mrna.133/org/Mus_musculus/est.fa /gbdb/mm3/mrna.133 hgLoadRna add -type=mRNA mm3 /gbdb/mm3/mrna.133/mrna.fa \ /cluster/store2/mrna.133/org/Mus_musculus/mrna.ra hgLoadRna add -type=EST mm3 /gbdb/mm3/mrna.133/est.fa \ /cluster/store2/mrna.133/org/Mus_musculus/est.ra LOADING MOUSE MM3 HUMAN BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 03/07/03) # Translate Penn State .lav files into sorted axt: ssh kkstore set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH" set seq1_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/" set seq2_dir="/cluster/store4/gs.14/build31/mixedNib/" set tbl="blastzHuman" cd $base mkdir -p 
axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=$base/axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin $seq1_dir $seq2_dir stdout \ | axtSort stdin $out popd end # Translate the sorted axt files into psl: cd $base mkdir -p pslChrom foreach f (axtChrom/chr*.axt) set c=$f:t:r axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH" set tbl="blastzHuman" cd $base/pslChrom hgLoadPsl mm3 chr*_${tbl}.psl MAKING THE BLASTZBESTHUMAN TRACK FROM PENN STATE MM3 AXT FILES (DONE 03/07/03) # Consolidate AXT files to chrom level, sort, pick best, make psl. ssh kkstore set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH" set seq1_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/" set seq2_dir="/cluster/store4/gs.14/build31/mixedNib/" set tbl="blastzBestHuman" cd $base mkdir -p axtBest pslBest foreach chrdir (lav/chr*) set chr=$chrdir:t echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 echo translating axtBest to psl for $chr axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH" set tbl="blastzBestHuman" cd $base/pslBest hgLoadPsl mm3 chr*_${tbl}.psl # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm3/axtBestHg13 cd /gbdb/mm3/axtBestHg13 foreach f ($base/axtBest/chr*.axt) ln -s $f . 
end cd $base/axtBest rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/mm3/axtBestHg13/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo VALUES ('hg13','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql mm3 < ~/kent/src/hg/lib/axtInfo.sql hgsql mm3 < axtInfoInserts.sql MAKING THE HUMAN AXTTIGHT FROM AXTBEST (DONE 03/07/03) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh kkstore cd ~/mm3/bed/blastz.hg13.2003-03-06-ASH/axtBest mkdir -p ../axtTight foreach i (*.axt) subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir -p ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHuman.psl end # Load tables into database ssh hgwdev cd ~/mm3/bed/blastz.hg13.2003-03-06-ASH/pslTight hgLoadPsl mm3 chr*_blastzTightHuman.psl RELOADING HUMAN BEST AND ALL TRACKS WITH HG15 (WERE HG13) (DONE 2003-09-22 kate) # NOTE: these are from swapped alignments from human browser # generated by kent & matt. Tight has already been done. ssh hgwdev cd /cluster/data/mm3/bed/blastz.hg15.swap/axtBest cat << 'EOF' > makePsl.csh mkdir -p ../pslBest foreach f (chr*.axt.gz) set c = $f:r:r echo $c gunzip -c $f | axtToPsl stdin \ ../S1.len ../S2.len ../pslBest/${c}_blastzBestHuman.psl end 'EOF' csh makePsl.csh >&! makePsl.log & tail -100f makePsl.log cd ../pslBest hgLoadPsl mm3 chr*_blastzBestHuman.psl cd ../axtChrom cat << 'EOF' > makePsl.csh mkdir -p ../pslChrom foreach f (chr*.axt.gz) set c = $f:r:r echo $c gunzip -c $f | axtToPsl stdin \ ../S1.len ../S2.len ../pslChrom/${c}_blastzHuman.psl end 'EOF' csh makePsl.csh >&! makePsl.log & tail -100f makePsl.log cd ../pslChrom hgLoadPsl mm3 chr*_blastzHuman.psl CREATING HUMAN/MOUSE CHAINS AND NET BASED ON BLASTZ ALIGNMENTS MM3 vs HG12 # Do small cluster run on kkr1u00 # Make sure that ~/oo points to the latest human and ~/mm to the # latest mouse. 
# The little cluster run will probably take about 1 hour. ssh kkr1u00 cd /cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH mkdir axtChain cd axtChain mkdir run1 cd run1 ls -1S ../../axtChrom/*.axt.gz > input.lst echo '#LOOP' > gsub echo 'doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}' >> gsub echo '#ENDLOOP' >> gsub gensub2 input.lst single gsub spec para create spec echo '#!/bin/csh' > doChain echo 'gunzip -c $1 | axtChain stdin ~/mm/mixedNib ~/oo/mixedNib $2 > $3' >> doChain chmod a+x doChain para shove # Do some sorting on the little cluster job on the file server # This will take about 20 minutes. This also ends up assigning # a unique id to each member of the chain. ssh kkstore cd /cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # Load the chains into the database as so. This will take about # 45 minutes. ssh hgwdev cd /cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH/axtChain cd chain foreach i (*.chain) set c = $i:r hgLoadChain mm3 ${c}_humanChain $i echo done $c end # Create the nets. You can do this while the database is loading # This is fastest done on the file server. All told it takes about # 40 minutes. ssh kkstore cd /cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH/axtChain # First do a crude filter that eliminates many chains so the # real chainer has less work to do. mkdir preNet cd chain foreach i (*.chain) echo preNetting $i chainPreNet $i ~/mm/chrom.sizes ~/oo/chrom.sizes ../preNet/$i end cd .. # Run the main netter, putting the results in n1. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i chainNet $i -minSpace=1 ~/mm/chrom.sizes ~/oo/chrom.sizes ../n1/$n /dev/null end cd .. # Classify parts of net as syntenic, nonsyntenic etc.
cat n1/*.net | netSyntenic stdin hNoClass.net # The final step of net creation needs the database. # Best to wait for the database load to finish if it # hasn't already. ssh hgwdev cd /cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH/axtChain netClass hNoClass.net mm3 hg13 mouse.net -tNewR=$HOME/mm/bed/linSpecRep -qNewR=$HOME/oo/bed/linSpecRep rm -r n1 hNoClass.net # Load the net into the database as so: netFilter -minGap=10 mouse.net | hgLoadNet mm3 humanNet stdin # Move back to the file server to create axt files corresponding # to the net. ssh kkstore cd /cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH/axtChain mkdir ../axtNet netSplit mouse.net mouseNet cd mouseNet foreach i (*.net) set c = $i:r netToAxt $i ../chain/$c.chain ~/mm/mixedNib ~/oo/mixedNib ../../axtNet/$c.axt echo done ../axt/$c.axt end cd .. rm -r mouseNet # At this point there is a blastz..../axtNet directory that should # be used in place of the axtBest for making the human/mouse # conservation track and for loading up the downloads page. 
LOADING MOUSE MM3 RAT BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 03/07/03) # Translate Penn State .lav files into sorted axt: ssh kkstore set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.rn2.2003-03-07-ASH" set seq1_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/" set seq2_dir="/cluster/store4/rn2/mixedNib/" set tbl="blastzRat" cd $base mkdir -p axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=$base/axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin $seq1_dir $seq2_dir stdout \ | axtSort stdin $out popd end # Translate the sorted axt files into psl: cd $base mkdir -p pslChrom foreach f (axtChrom/chr*.axt) set c=$f:t:r:r axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.rn2.2003-03-07-ASH" set tbl="blastzRat" cd $base/pslChrom hgLoadPsl mm3 chr*_${tbl}.psl MAKING THE BLASTZBESTRAT TRACK FROM PENN STATE MM3 AXT FILES (DONE 03/07/03) # Consolidate AXT files to chrom level, sort, pick best, make psl. ssh kkstore set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.rn2.2003-03-07-ASH" set seq1_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/" set seq2_dir="/cluster/store4/rn2/mixedNib/" set tbl="blastzBestRat" cd $base mkdir -p axtBest pslBest foreach chrdir (lav/chr*) set chr=$chrdir:t echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 echo translating axtBest to psl for $chr axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.rn2.2003-03-07-ASH" set tbl="blastzBestRat" cd $base/pslBest hgLoadPsl mm3 chr*_${tbl}.psl # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm3/axtBestRn2 cd /gbdb/mm3/axtBestRn2 rm -f * foreach f ($base/axtBest/chr*.axt) ln -s $f . 
end cd $base/axtBest rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/mm3/axtBestRn2/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo VALUES ('rn2','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql mm3 < ~/kent/src/hg/lib/axtInfo.sql hgsql mm3 < axtInfoInserts.sql MAKING THE RAT AXTTIGHT FROM AXTBEST (DONE 03/07/03) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh kkstore cd ~/mm3/bed/blastz.rn2.2003-03-07-ASH/axtBest mkdir -p ../axtTight foreach i (*.axt) subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir -p ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRat.psl end # Load tables into database ssh hgwdev cd ~/mm3/bed/blastz.rn2.2003-03-07-ASH/pslTight hgLoadPsl mm3 chr*_blastzTightRat.psl CHAIN&NET MM3-RN3 AXT (SWAPPED FROM RN3-MM3 BLASTZ RUN): (DONE 9/22/03 angie) ssh kkr1u00 cd /cluster/data/mm3/bed/blastz.rn3.swp mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/mm3/bed/blastz.rn3.swp/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ=chrUn_random $1 \ | axtChain stdin /iscratch/i/mm3/mixedNib /iscratch/i/rn3/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... 
#Completed: 37 of 37 jobs #CPU time in finished jobs: 10572s 176.20m 2.94h 0.12d 0.000 y #IO & Wait Time: 2786s 46.43m 0.77h 0.03d 0.000 y #Average job time: 361s 6.02m 0.10h 0.00d #Longest job: 609s 10.15m 0.17h 0.01d #Submission to last job: 1098s 18.30m 0.30h 0.01d # now on the cluster server, sort chains ssh kkstore cd /cluster/data/mm3/bed/blastz.rn3.swp/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Create the nets. You can do this while the database is loading # This is fastest done on the file server. All told it takes about # 40 minutes. ssh kkstore cd /cluster/data/mm3/bed/blastz.rn3.swp/axtChain # First do a crude filter that eliminates many chains so the # real chainer has less work to do. mkdir preNet cd chain foreach i (*.chain) echo preNetting $i chainPreNet $i /cluster/data/mm3/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../preNet/$i end cd .. # Run the main netter, putting the results in n1. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i chainNet $i -minSpace=1 /cluster/data/mm3/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../n1/$n /dev/null end cd .. # Classify parts of net as syntenic, nonsyntenic etc. cat n1/*.net | netSyntenic stdin hNoClass.net # The final step of net creation needs the database. # Best to wait for the database load to finish if it # hasn't already. ssh hgwdev cd /cluster/data/mm3/bed/blastz.rn3.swp/axtChain netClass hNoClass.net mm3 rn3 mouse.net \ -tNewR=/cluster/data/mm3/bed/linSpecRep \ -qNewR=/cluster/data/rn3/bed/linSpecRep rm -r n1 hNoClass.net # Move back to the file server to create axt files corresponding # to the net. Use netToAxt -maxGap=300 so we can pass nets to multiz. 
ssh kkstore cd /cluster/data/mm3/bed/blastz.rn3.swp/axtChain mkdir ../axtNet300 netSplit mouse.net mouseNet cd mouseNet foreach i (*.net) set c = $i:r netToAxt -maxGap=300 $i ../chain/$c.chain /cluster/data/mm3/mixedNib \ /cluster/data/rn3/softNib ../../axtNet300/$c.axt echo done ../axt/$c.axt end cd .. rm -r mouseNet # Make maf's for multiz cd /cluster/data/mm3/bed/blastz.rn3.swp mkdir maf300 foreach i (axtNet300/chr*.axt) set maf = maf300/$i:t:r.mr.maf.unfixed axtToMaf $i \ /cluster/data/mm3/chrom.sizes /cluster/data/rn3/chrom.sizes \ $maf -tPrefix=mm3. -qPrefix=rn3. /cluster/bin/scripts/fixmaf.pl < $maf > $maf:r end rm maf300/*.unfixed REGENERATE MAF FROM MOUSE-RAT NET FOR MULTIZ (DONE 10/27/03 angie) # Penn State complained about the drop in coverage -- so omit the # -maxGap=300 in netToAxt and try again. ssh kkstore cd /cluster/data/mm3/bed/blastz.rn3.swp/axtChain mkdir ../axtNet netSplit mouse.net mouseNet cd mouseNet foreach i (*.net) set c = $i:r netToAxt $i ../chain/$c.chain /cluster/data/mm3/mixedNib \ /cluster/data/rn3/softNib ../../axtNet/$c.axt echo done ../axt/$c.axt end cd .. rm -r mouseNet # Make maf's for multiz cd /cluster/data/mm3/bed/blastz.rn3.swp mkdir maf foreach i (axtNet/chr*.axt) set maf = maf/$i:t:r.mr.maf axtToMaf $i \ /cluster/data/mm3/chrom.sizes /cluster/data/rn3/chrom.sizes \ $maf -tPrefix=mm3. -qPrefix=rn3. end REGENERATE MAF FROM MOUSE-RAT AXTBEST FOR MULTIZ (DONE 10/29/03 angie) # Sounds like Penn State's desired coverage needs axtBest not net. ssh kkstore cd /cluster/data/mm3/bed/blastz.rn3.swp mkdir mafBest foreach i (axtBest/chr*.axt) set maf = mafBest/$i:t:r.mr.maf axtToMaf $i \ /cluster/data/mm3/chrom.sizes /cluster/data/rn3/chrom.sizes \ $maf -tPrefix=mm3. -qPrefix=rn3. end CREATE MAF FROM MOUSE-HUMAN NET FOR MULTIZ (DONE 9/22/03 angie) # Use netToAxt -maxGap=300 so we can pass nets to multiz. 
ssh kkstore cd /cluster/data/mm3/bed/blastz.hg15.swap/axtChain mkdir ../axtNet300 cd net foreach i (chr*.net) set c = $i:r netToAxt -maxGap=300 $i ../chain/$c.chain /cluster/data/mm3/mixedNib \ /cluster/data/hg15/nib ../../axtNet300/$c.axt echo done ../axt/$c.axt end cd .. # Make maf's for multiz cd /cluster/data/mm3/bed/blastz.hg15.swap mkdir maf300 foreach i (axtNet300/chr*.axt) set maf = maf300/$i:t:r.mh.maf.unfixed axtToMaf $i \ /cluster/data/mm3/chrom.sizes /cluster/data/hg15/chrom.sizes \ $maf -tPrefix=mm3. -qPrefix=hg15. /cluster/bin/scripts/fixmaf.pl < $maf > $maf:r end rm maf300/*.unfixed REGENERATE MAF FROM MOUSE-HUMAN NET FOR MULTIZ (DONE 10/28/03 angie) # Penn State complained about the drop in coverage -- so omit the # -maxGap=300 in netToAxt and try again. ssh kkstore cd /cluster/data/mm3/bed/blastz.hg15.swap/axtChain mkdir ../axtNet cd net foreach i (chr*.net) set c = $i:r netToAxt $i ../chain/$c.chain /cluster/data/mm3/mixedNib \ /cluster/data/hg15/nib ../../axtNet/$c.axt echo done ../axtNet/$c.axt end cd .. # Make maf's for multiz cd /cluster/data/mm3/bed/blastz.hg15.swap mkdir maf foreach i (axtNet/chr*.axt) set maf = maf/$i:t:r.mh.maf axtToMaf $i \ /cluster/data/mm3/chrom.sizes /cluster/data/hg15/chrom.sizes \ $maf -tPrefix=mm3. -qPrefix=hg15. end REGENERATE MAF FROM MOUSE-HUMAN BEST FOR MULTIZ (DONE 10/29/03 angie) # Sounds like Penn State's desired coverage needs axtBest not net. ssh kkstore cd /cluster/data/mm3/bed/blastz.hg15.swap mkdir mafBest foreach i (axtBest/chr*.axt.gz) set maf = mafBest/$i:t:r:r.mh.maf gunzip -c $i | axtToMaf stdin \ /cluster/data/mm3/chrom.sizes /cluster/data/hg15/chrom.sizes \ $maf -tPrefix=mm3. -qPrefix=hg15. 
end RUN 3-WAY ALIGNMENTS MOUSE/HUMAN/RAT USING MULTIZ (DONE 9/23/03 angie) ssh kkstore mkdir /cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-09-23 cd /cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-09-23 cat << "EOF" > ./doMultiz.csh foreach mh (/cluster/data/mm3/bed/blastz.hg15.swap/maf300/chr*.maf) set c = $mh:t:r:r set mr = /cluster/data/mm3/bed/blastz.rn3.swp/maf300/$c.mr.maf echo multiz-ing $c /cluster/bin/penn/multiz.7.23 $mh $mr - > $c.mhr.maf end "EOF" # << this line makes emacs coloring happy csh -ef ./doMultiz.csh >& doMultiz.log & tail -f doMultiz.log # setup up external files for database to reference ssh hgwdev mkdir /gbdb/mm3/multizHg15Rn3 cd /gbdb/mm3/multizHg15Rn3 foreach f (/cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-09-23/*.maf) ln -s $f . end # load into database /cluster/bin/i386/hgLoadMaf -warn mm3 multizHg15Rn3 # get featureBits coverage: featureBits mm3 multizHg15Rn3 # 1738562766 bases of 2505900260 (69.379%) in intersection RE-RUN 3-WAY ALIGNMENTS MOUSE/HUMAN/RAT USING MULTIZ (DONE 10/28/03 angie) ssh kkstore mkdir /cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-10-28 cd /cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-10-28 cat << "EOF" > ./doMultiz.csh foreach mh (/cluster/data/mm3/bed/blastz.hg15.swap/maf/chr*.maf) set c = $mh:t:r:r set mr = /cluster/data/mm3/bed/blastz.rn3.swp/maf/$c.mr.maf echo multiz-ing $c /cluster/bin/penn/multiz.7.23 $mh $mr - > $c.mhr.maf end "EOF" # << this line makes emacs coloring happy csh -ef ./doMultiz.csh >& doMultiz.log & tail -f doMultiz.log NOT DONE: # setup up external files for database to reference ssh hgwdev mkdir /gbdb/mm3/multizHg15Rn3NoMaxGap cd /gbdb/mm3/multizHg15Rn3NoMaxGap foreach f (/cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-10-28/*.maf) ln -s $f . 
# set up external files for database to reference
/cluster/data/mm3/chrom.sizes \ /cluster/data/mm3/chrom.sizes stdout \ | axtSort stdin $smallout end cat `ls -1 $chrdir/*.axt | sort -g` \ > axtChrom/$chr.axt end rm lav/chr*/*.axt* # Did not load tables for this one -- they're very big, and low demand. TODO # Translate the sorted axt files into psl: cd $base mkdir -p pslChrom foreach f (axtChrom/chr*.axt) set c=$f:t:r axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.mm3.2003-03-06-ASH" set tbl="blastzMouse" cd $base/pslChrom hgLoadPsl mm3 chr*_${tbl}.psl MAKING THE BLASTZBESTMOUSE TRACK FROM PENN STATE MM3 AXT FILES (DONE 03/10/03) # Consolidate AXT files to chrom level, sort, pick best, make psl. ssh kkstore set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.mm3.2003-03-06-ASH" set seq1_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/" set seq2_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/" set tbl="blastzBestMouse" cd $base mkdir -p axtBest pslBest foreach chrdir (lav/chr*) set chr=$chrdir:t echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 echo translating axtBest to psl for $chr axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # If a chromosome has so many alignments that axtBest runs out of mem, # run axtBest in 2 passes to reduce size of the input to final axtBest: foreach chrdir (lav/chr{1,2,3,4,6,7,X}) set chr=$chrdir:t echo two-pass axtBesting $chr foreach d ($chrdir/*.lav) set smallout=$d.axt lavToAxt $d $seq1_dir $seq2_dir stdout \ | axtDropSelf stdin stdout \ | axtSort stdin $smallout end foreach a ($chrdir/*.axt) axtBest $a $chr $a:r.axtBest end cat `ls -1 $chrdir/*.axtBest | sort -g` \ > $chrdir/$chr.axtBestPieces axtBest $chrdir/$chr.axtBestPieces $chr axtBest/$chr.axt axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end rm lav/chr*/*.axt* # Load tables ssh hgwdev set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.mm3.2003-03-06-ASH" set 
Genscan has problems running
#LOOP /cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=1800000 #ENDLOOP echo "" > dummy.list gensub2 genome.list dummy.list gsub jobList para create jobList para try para check para push # Issue either one of the following two commands to check the # status of the cluster and your jobs, until they are done. parasol status para check # If there were out-of-memory problems (run "para problems"), then # re-run those jobs by hand but change the -window arg from 2400000 # to 1200000. # chr8_24 # Convert these to chromosome level files as so: ssh kkstore cd ~/mm3/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/chr*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/chr*.bed > \ /dev/null cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd ~/mm3/bed/genscan ldHgGene mm3 genscan genscan.gtf hgPepPred mm3 generic genscanPep genscan.pep hgLoadBed mm3 genscanSubopt genscanSubopt.bed > /dev/null TWINSCAN GENE PREDICTIONS (Done 2003-05-20 RELOADED -gtf 2004-04-02 kate) mkdir -p ~/mm3/bed/twinscan cd ~/mm3/bed/twinscan mkdir -p ~/mm3/bed/twinscan/gtf mkdir -p ~/mm3/bed/twinscan/ptx foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X) cd ~/mm3/bed/twinscan/gtf wget http://genes.cs.wustl.edu/mouse/5-01-03/gtf/chr$c.gtf cd ~/mm3/bed/twinscan/ptx wget http://genes.cs.wustl.edu/mouse/5-01-03/ptx/chr$c.ptx end cd ~/mm3/bed/twinscan/gtf foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X) f=chr$c cat chr$c.gtf | sed "s/^[0-9]* //" | sed "s/^/$f /" > foo mv foo chr$c.gtf end ldHgGene mm3 twinscan chr*.gtf -gtf # 24282 gene predictions cd ~/mm3/bed/twinscan/ptx foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X) perl 
It's good to at least keep one previous release though.
PRODUCING CROSS_SPECIES mRNA ALIGNMENTS (DONE 03/06/03)
# Make sure that gbpri, gbmam, gbrod, and gbvert are downloaded from # Genbank into /cluster/store2/genbank.133 and unpacked by organism into # /cluster/store2/mrna.133/org. # Set up cluster run more or less as so: ssh kk cd ~/mm3/bed mkdir xenoMrna cd xenoMrna ls -1S /scratch/hg/mm3/trfFa/* > genome.lst cp -R /cluster/store2/mrna.133/org /mnt/scratch/hg/mrna.133 # The below ls command fails when you have too many files so skip it and # instead run the find command after it. # ls -1S /mnt/scratch/hg/mrna.133/org/*/mrna.fa > allMrna.lst find /mnt/scratch/hg/mrna.133/org -name mrna.fa -ls \ | awk '{print $7,$11}' | grep -v /Mus_musculus/ \ | sort -gr | awk '{print $2}' \ > allMrna.lst # Put the first line of allMrna.lst into 1.org, the second line into # 2.org, and so forth: foreach n (1 2 3 4 5 6) head -$n allMrna.lst | tail -1 > $n.org end # After the 6th line just leave the rest in 7.org. tail +7 allMrna.lst > 7.org # Then ls -1 *.org > mrna.lst cp ~/lastMm/bed/xenoMrna/gsub . mkdir psl gensub2 genome.lst mrna.lst gsub spec para create spec para try para check # If all looks well do para push # Sort xeno mRNA alignments as so: ssh kkstore cd ~/mm3/bed/xenoMrna pslSort dirs raw.psl /cluster/store2/temp psl pslReps raw.psl cooked.psl /dev/null -minAli=0.25 liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl pslSortAcc nohead chrom /cluster/store2/temp chrom.psl pslCat -dir chrom > xenoMrna.psl rm -r chrom raw.psl cooked.psl chrom.psl # Load into database as so: ssh hgwdev cd ~/mm3/bed/xenoMrna hgLoadPsl mm3 xenoMrna.psl -tNameIx # Make the xenoRna file # Make a /gbdb symlink for the .fa (not .ra) cd /gbdb/mm3/mrna.133 ln -s /cluster/store2/mrna.133/mouseXenoRna.fa mouseXenoRna.fa hgLoadRna add -type=xenoRna mm3 /gbdb/mm3/mrna.133/mouseXenoRna.fa \ /cluster/store2/mrna.133/mouseXenoRna.ra PRODUCING ESTORIENTINFO TABLE (DONE - 2/19/03 MATT; _randoms DONE 03/06/03) This table is needed for proper orientation of ESTs in the browser. 
Many will appear on the wrong strand without it. This involves a cluster run. First load the EST psl files as so: ssh kkstore cd ~/mm3/bed/est pslSortAcc nohead contig /cluster/store2/temp contig.psl mkdir /mnt/scratch/hg/mm3/est cp -r contig /mnt/scratch/hg/mm3/est Wait for these to finish. cd .. mkdir estOrientInfo cd estOrientInfo mkdir ei ls -1S /mnt/scratch/hg/mm3/est/contig/* > psl.lst cp ~/lastMm/bed/estOrientInfo/gsub . Update gsub to refer to mouse contig sequence currently on /scratch, and mouse ESTs on /scratch. gensub2 psl.lst single gsub spec para create spec Then run the job on the cluster ssh kk cd ~/mm3/bed/estOrientInfo para try sleep 60 para check If things look good para push Wait for this to finish then liftUp estOrientInfo.bed ../../jkStuff/liftAll.lft warn ei/*.tab Load them into database as so: ssh hgwdev cd ~/mm3/bed/estOrientInfo hgLoadBed mm3 estOrientInfo estOrientInfo.bed \ -sqlTable=/cluster/home/kent/src/hg/lib/estOrientInfo.sql PRODUCING MRNAORIENTINFO TABLE (DONE 03/06/03) ssh kkstore cd ~/mm3/bed/mrna pslSortAcc nohead contig /cluster/store2/temp contig.psl mkdir /mnt/scratch/hg/mm3/mrna cp -r contig /mnt/scratch/hg/mm3/mrna mkdir -p ~/mm3/bed/mrnaOrientInfo/oi cd ~/mm3/bed/mrnaOrientInfo ls -1S /mnt/scratch/hg/mm3/mrna/contig/* > psl.lst cp ~/lastMm/bed/mrnaOrientInfo/gsub . echo placeholder > single gensub2 psl.lst single gsub spec ssh kk cd ~/mm3/bed/mrnaOrientInfo para create spec para try, para check, para push, para check,... liftUp mrnaOrientInfo.bed ../../jkStuff/liftAll.lft warn oi/*.tab ssh hgwdev cd ~/mm3/bed/mrnaOrientInfo hgLoadBed mm3 mrnaOrientInfo mrnaOrientInfo.bed \ -sqlTable=/cluster/home/kent/src/hg/lib/mrnaOrientInfo.sql CREATE RNACLUSTER TABLE (DONE 03/06/03) # Make sure that refSeqAli, estOrientInfo and mrnaOrientInfo tables are # made already (see above). 
ssh hgwdev mkdir -p ~/mm3/bed/rnaCluster/chrom cd ~/mm3/bed/rnaCluster foreach i (~/mm3/?{,?}) foreach f ($i/chr*.fa) set c = $f:t:r clusterRna mm3 /dev/null chrom/$c.bed -chrom=$c echo done $c end end hgLoadBed mm3 rnaCluster chrom/*.bed PRODUCING TETRAODON FISH ALIGNMENTS (TODO) o - Download sequence from ... and put it on the cluster local disk at /scratch/hg/fish o - Do fish/mouse alignments. ssh kk cd ~/mm/bed mkdir blatFish cd blatFish mkdir psl ls -1S /scratch/hg/fish/* > fish.lst ls -1S /scratch/hg/mm3/trfFa/* > mouse.lst cp ~/lastMm/blatFish/gsub . gensub2 mouse.lst fish.lst gsub spec para create spec para try Make sure jobs are going ok with para check. Then para push wait about 2 hours and do another para push do para checks and if necessary para pushes until done or use para shove. o - Sort alignments as so pslCat -dir psl | liftUp -type=.psl stdout ~/mm/jkStuff/liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/store2/tmp stdin o - Copy to hgwdev:/scratch. Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/mm/bed/blatFish/chrom foreach i (*.psl) set r = $i:r mv $i ${r}_blatFish.psl end hgLoadPsl mm3 *.psl #*** Make /gbdb/mm3/fish_seq15jun2001 links hgLoadRna addSeq mm3 /gbdb/mm3/fish_seq15jun2001/*.fa # PRODUCING SQUIRT ALIGNMENTS ssh kkstore mkdir -p ~/mm3/bed/blatCi1 cd ~/mm3/bed/blatCi1 ls -1S /iscratch/i/squirt/ci1/queryFa/*.fa > squirt.lst ls -1S /scratch/hg/mm3/trfFa/* > mouse.lst rm -rf psl foreach ctg (`cat mouse.lst`) mkdir -p psl/$ctg:t:r end # get gsub2D from someplace gensub2 mouse.lst squirt.lst gsub2D spec ssh kk cd ~/mm3/bed/blatCi1 para create spec .... 
# When cluster run is done, sort alignments: ssh eieio cd ~/mm3/bed/blatCi1 rm -rf /tmp/$LOGNAME mkdir /tmp/$LOGNAME pslSort dirs raw.psl /tmp/$LOGNAME psl/* pslReps raw.psl cooked.psl /dev/null -minAli=0.05 liftUp -nohead lifted.psl ../../jkStuff/liftAll.lft warn cooked.psl pslSortAcc nohead chrom /tmp/$LOGNAME lifted.psl # Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/mm3/bed/blatCi1/chrom rm -f chr*_blatCi1.psl foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_blatCi1.psl end hgLoadPsl mm3 *.psl # Make squirt /gbdb/ symlink mkdir /gbdb/mm3/squirtSeq cd /gbdb/mm3/squirtSeq ln -s /cluster/store5/squirt/ci1/ciona.rm.fasta PRODUCING FUGU FISH ALIGNMENTS (DONE 03/13/03) # sequence was previously downloaded to /cluster/store3/fuguSeq/ from # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_mask.fasta.Z # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_prot.fasta.Z # Load it up on /mnt/scratch: ssh kkstore mkdir -p /mnt/scratch/hg/fugu # Next time, use /cluster/store3/fuguSeq/split2.5Mb !!! cp -p /cluster/store3/fuguSeq/split/*.fa /mnt/scratch/hg/fugu/ ssh kk mkdir ~/mm/bed/blatFugu cd ~/mm/bed/blatFugu ls -1S /mnt/scratch/hg/fugu/* > fugu.lst ls -1S /scratch/hg/mm3/trfFa/* > mouse.lst # Create dir structure for run mkdir psl foreach ctg (`ls -1 /scratch/hg/mm3/trfFa/`) mkdir psl/$ctg:t:r end cp ~/lastMm/bed/blatFugu/gsub . gensub2 mouse.lst fugu.lst gsub spec para create spec para try # Make sure jobs are going ok with para check. Then para push # wait about 2 hours and do another para push # do para checks and if necessary para pushes until done or use para shove. 
ssh kkstore cd ~/mm/bed/blatFugu pslCat -dir psl/* \ | liftUp -type=.psl stdout ~/mm3/jkStuff/liftAll.lft warn stdin \ | pslSortAcc nohead chrom /cluster/store2/tmp stdin # load into database: ssh hgwdev cd ~/mm3/bed/blatFugu/chrom foreach i (*.psl) set r = $i:r mv $i ${r}_blatFugu.psl end hgLoadPsl mm3 *.psl mkdir -p /gbdb/mm3/fuguSeq ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta /gbdb/mm3/fuguSeq/ hgLoadRna addSeq mm3 /gbdb/mm3/fuguSeq/fugu_v3_mask.fasta LOAD SOFTBERRY GENES (DONE 02/04/03) cd /cluster/store2/mm.2003.02/mm3/bed mkdir softberry cd softberry wget ftp://www.softberry.com/pub/SC_MOU_FEB03/Softb_mouse_gff_f03.tar.gz gunzip -c Softb_mouse_gff_f03.tar.gz | tar xvf - ldHgGene mm3 softberryGene chr*.gff hgPepPred mm3 softberry *.protein hgSoftberryHom mm3 *.protein LOAD GENEID GENES (DONE 03/18/03 RELOADED -gtf 2004-04-02 kate) mkdir -p ~/mm3/bed/geneid/download cd ~/mm3/bed/geneid/download set webroot = \ http://genome.imim.es/genepredictions/M.musculus/mmFeb2003/geneid_v1.1 foreach f (~/mm3/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget $webroot/$chr.gtf wget $webroot/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. ldHgGene mm3 geneid download/*.gtf -gtf # 34050 gene predictions runGeneCheck . # 34 noStart # 33 badFrame # 13 noStop # 11 gap hgPepPred mm3 generic geneidPep download/*-fixed.prot SGP GENE PREDICTIONS (4/17/03 KRR) mkdir -p ~/mm3/bed/sgp/download cd ~/mm3/bed/sgp/download #foreach f (~/mm3/?{,?}/chr?{,?}{,_random}.fa) # Doesn't have chr_random entries KRR foreach f (~/mm3/?{,?}/chr?{,?}.fa) set chr = $f:t:r wget http://genome.imim.es/genepredictions/M.musculus/mmFeb2003/SGP/humangp20021114/$chr.gtf wget http://genome.imim.es/genepredictions/M.musculus/mmFeb2003/SGP/humangp20021114/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. 
conversion data file
Choose GTF as the output, choose gzip compression and then hit Export. Name the downloaded file mm3EnsGene.gtf.gz and gunzip it.
peptide Only as the output
# Save file as ensGtp.txt.gz gunzip ensGtp.txt.gz hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16 LOAD ENSEMBL ESTS (DONE June 6, 03) cd ~/mm3/bed mkdir ensemblEst cd ensemblEst Get the ensembl EST data from http://www.ensembl.org/ Go to the Martview link Choose Mus musculus as the organism Follow this sequence through the pages: Page 1) Choose the Ensembl ESTs choice. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. Page 3) Choose the "Structures" box. Page 4) Choose GTF as the ouput, choose gzip compression and then hit Export.gunzip the file and name to mm3EnsEst.gtf.gz. Unzip it. # Ensembl handles random chromosomes differently than us, so we # strip this data. Fortunately it just loses a couple of genes. grep -v ^6_DR51 mm3EnsEst.gtf | grep -v _NT_ > unrandom.gtf # Add "chr" to front of each line in the gene data gtf file to make # it compatible with ldHgGene ~matt/bin/addchr.pl unrandom.gtf ensEst.gtf ~matt/bin/fixEns.pl ensEst.gtf ensFixed.gtf ldHgGene mm3 ensEst ensEst.gtf o - Load Ensembl peptides: Get the ensembl protein data from http://www.ensembl.org/ Go to the Martview link Choose Mus musculus as the organism Follow this sequence through the pages: Page 1) Choose the Ensembl ESTs choice. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. Page 3) Choose the "Sequences" box. Page 4) Choose Transcripts/Proteins and Gene sequence Only as the ouput, choose text/fasta and gzip compression and then hit export. 
Name to mm3EnsEst.pep.gz If needed, substitute ENST for ENSP in ensPep with the program called subs edit subs.in to read: ENSP|ENST subs -e mm3EnsEst.pep > /dev/null ~matt/bin/fixPep.pl mm3EnsEst.pep ensembl.pep hgPepPred mm3 generic ensEstPep ensembl.pep LOAD RNAGENES (TODO) - login to hgwdev - cd ~kent/src/hg/lib - mm3 < rnaGene.sql - cd /cluster/store2/mm.2003.02/mm3/bed - mkdir rnaGene - cd rnaGene - download data from ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-oo27.gff.gz - gunzip *.gz - liftUp chrom.gff ../../jkStuff/liftAll.lft carry ncrna-oo27.gff - hgRnaGenes mm3 chrom.gff LOAD EXOFISH (TODO) - login to hgwdev - cd /cluster/store2/mm.2003.02/mm3/bed - mkdir exoFish - cd exoFish - mm3 < ~kent/src/hg/lib/exoFish.sql - Put email attatchment from Olivier Jaillon (ojaaillon@genoscope.cns.fr) into /cluster/store2/mm.2003.02/mm3/bed/exoFish/all_maping_ecore - awk -f filter.awk all_maping_ecore > exoFish.bed - hgLoadBed mm3 exoFish exoFish.bed LOAD MOUSE SYNTENY (TODO) - login to hgwdev. - cd ~/kent/src/hg/lib - mm3 < mouseSyn.sql - mkdir ~/mm/bed/mouseSyn - cd ~/mm/bed/mouseSyn # Put Deanna Church's (church@ncbi.nlm.nih.gov) email attatchment as # mouseSyn.txt - awk -f format.awk *.txt > mouseSyn.bed - delete first line of mouseSyn.bed - Enter database with "mm3" command. 
- At mysql> prompt type in: load data local infile 'mouseSyn.bed' into table mouseSyn LOAD GENIE (TODO) mkdir -p ~/mm3/bed/genieAlt cd ~/mm3/bed/genieAlt wget http://www.neomorphic.com/mgap/mgscv3/gtf/mgscv3.genie.gtf.tgz gunzip -c mgscv3.genie.gtf.tgz | tar xvf - ldHgGene mm3 genieAlt mgscv3.genie.gtf/chr*.gtf wget http://www.neomorphic.com/mgap/mgscv3/fa/mgscv3.aa.tgz gunzip -c mgscv3.aa.tgz | tar xvf - hgPepPred mm3 genie geniePep chr*.aa.fa LOAD GENIE CLONE BOUNDS (TODO) mkdir -p ~/mm3/bed/genieBounds cd ~/mm3/bed/genieBounds wget http://www.neomorphic.com/mgap/mgscv3/cb.bed/mgscv3_cb.bed.tgz gunzip -c mgscv3_cb.bed.tgz | tar xvf - - Trim the track definition from each file (these are actually custom track files): foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un) tail +2 chr${c}_cb.bed > chr${c}_cb-fixed.bed end hgLoadBed mm3 genieBounds *-fixed.bed LOAD JACKSON LABS QTL (TODO) ssh hgwdev mkdir ~/mm3/bed/jaxQTL2 # Save the email attachment from Sridhar Ramachandran at Jackson Labs # (bed 8+, jaxQTL2 format). # Strip the column headers and load into the database. 
tail +2 QTLBedFormat.txt > jaxQTL2.bed hgLoadBed -noBin -tab -sqlTable=$HOME/kent/src/hg/lib/jaxQTL2.sql \ mm3 jaxQTL2 jaxQTL2.bed MAKING MOUSE AND RAT SYNTENY # syntenicBest.pl -db=mm3 -table=blastzBestHuman smooth.pl joinsmallgaps.pl fillgap.pl -db=mm3 -table=blastzBestHuman synteny2bed.pl hgLoadBed mm3 syntenyHuman ucsc100k.bed syntenicBest.pl -db=mm3 -table=blastzBestRat smooth.pl joinsmallgaps.pl fillgap.pl -db=mm3 -table=blastzBestRat synteny2bed.pl hgLoadBed mm3 syntenyRat ucsc100k.bed CREATING THE mm3Rn2L SAMPLE TRACK (a.k.a WIGGLE TRACK) ------------------------------------------------------ o - refer to the script at src/hg/sampleTracks/makeMm3Rn2.doc # create MGC Genes track (markd 13 may 2003) # this will be automated in the incremental genbank update /cluster/store5/genbank/bin/i386/mgcDbLoad mm3 /cluster/store5/genbank/data/download/mgc/2003.05.06/mgcFullStatus.tab.gz # SWAPPING RAT RN3 BLASTZ ALIGNMENTS TO MOUSE-RAT (IN PROGRESS 2003-08-06 kate) ssh kkstore # Rat-mouse alignments were already run and processed into axt. # Swap target and query to get mouse-rat alignments. set aliDir = "/cluster/data/rn3/bed/blastz.mm3" set revAliDir = "/cluster/data/mm3/bed/blastz.rn3.swp" mkdir $revAliDir cd $revAliDir # axtBest will need .len files - copy those, swap S1<->S2 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom # Swap target and query coords, then re-apportion alignments so that # unsorted/chrN.axt has all the alignments with chrN as target. cat $aliDir/axtChrom/chr*.axt \ | /cluster/bin/i386/axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | /cluster/bin/i386/axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. 
foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end rm -r unsorted # Consolidate AXT files to chrom level, sort, pick best ssh kkstore set base="/cluster/data/mm3/bed/blastz.rn3.swp" set seq1_dir="/cluster/data/mm3.RM030619/mixedNib/" set seq2_dir="/cluster/data/rn3/softNib/" cd $base mkdir -p axtBest pslBest foreach f (axtChrom/chr*.axt) set chr=$f:t:r echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 end ======= MAKE THE affyU74 TRACK # Recalculate alignments and load data using Affy consensus sequences # for Affy mouse U74 chips instead of target sequences #[DONE, hartera, Feb 5, 2004] ---------------------------------- # Load up semi-local disk with target sequences for Affy mouse U74 chips. ssh kkr1u00 mkdir -p /iscratch/i/affy # This /projects filesystem is not available on kkr1u00 # but it is on kk ssh kk cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy ssh kkr1u00 iSync # Run cluster job to do alignments ssh kk cd /cluster/data/mm3/bed mkdir affyU74.2004-02-05/ cd affyU74.2004-02-05 mkdir run cd run mkdir psl echo /scratch/hg/mm3/trfFa/*.fa | wordLine stdin > genome.lst ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub gensub2 genome.lst affy.lst gsub spe para create spec para try # Then do para check/para push etc. until job is done, or para make. # para time 2004-02-05 # Completed: 1662 of 1662 jobs # CPU time in finished jobs: 12092s 201.53m 3.36h 0.14d 0.000 y # IO & Wait Time: 4374s 72.90m 1.21h 0.05d 0.000 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest job: 37s 0.62m 0.01h 0.00d # Submission to last job: 1801s 30.02m 0.50h 0.02d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU74.psl. 
pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least minAli = 0.95. # minAli = 0.97 too high. low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp ../all_affyU74.psl ../../../jkStuff/liftAll.lft warn contig.psl cd .. # Sort by chromosome and load into database. ssh hgwdev cd /cluster/data/mm3/bed/affyU74.2004-02-05 pslSortAcc nohead chrom temp all_affyU74.psl cat chrom/*.psl > affyU74.psl # Feb 24, 2004, shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table, this is so qName matches those in interdependent # tables in hgFixed hgLoadPsl mm3 affyU74.psl -tNameIx rm -r chrom temp run MAKE THE affyGnfU74 TRACKs # Make bed files and load consensus sequences for Affy U74 chip set. # [DONE, hartera, Feb 5, 2004] ---------------------------------- #This needs to be done after affyU74 is already made. # Added path for latest version of affyPslAndAtlasToBed ssh hgwdev mkdir -p /cluster/data/mm3/bed/affyGnf.2004-02-05 cd /cluster/data/mm3/bed/affyGnf.2004-02-05 affyPslAndAtlasToBed ../affyU74.2004-02-05/affyU74.psl /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2 affyPslAndAtlasToBed ../affyU74.2004-02-05/affyU74.psl /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2 affyPslAndAtlasToBed ../affyU74.2004-02-05/affyU74.psl /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2 # Feb 24, 2004, shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table hgLoadBed mm3 affyGnfU74A affyGnfU74A.bed hgLoadBed mm3 affyGnfU74B affyGnfU74B.bed hgLoadBed mm3 affyGnfU74C affyGnfU74C.bed # Add in sequence data for U74 tracks. 
# Copy consensus sequence to /gbdb if it isn't already # [DONE, hartera, Feb 6, 2004] mkdir -p /gbdb/hgFixed/affyProbes cd /gbdb/hgFixed/affyProbes ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa . # use perl -pi.bak -e 's/;/ /' to remove ";" after probe name # in sequence files. # reload sequences with "U74Xv2" prefix removed so acc matches name used # in other dependent tables. hgLoadSeq -abbr=U74Av2: mm3 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa hgLoadSeq -abbr=U74Bv2: mm3 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa hgLoadSeq -abbr=U74Cv2: mm3 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa MAKE ALIAS TABLES FOR hgFind ---------------------------------- o Create alias tables to facilitate hgFind. First create tables of kgXref, kgAlias and kgProtAlias in mm3, using kgXref.sql kgAlias.sql kgProtAlias.sql o Build kgXref table Generate xref .tab file for KG kgXref mm3 proteins0305 Load it into mySQL load data local infile "kgXref.tab" into table mm3.kgXref; o Build gene aliases Generate aliases from hugo, etc kgAliasM mm3 proteins0305 Generate gene aliases from SWISS-PROT data kgAliasP mm3 /cluster/store5/swissprot/0305/sprot.dat sp.lis kgAliasP mm3 /cluster/store5/swissprot/0305/trembl.dat tr.lis kgAliasP mm3 /cluster/store5/swissprot/0305/trembl_new.dat new.lis cat sp.lis tr.lis new.lis |sort|uniq >kgAliasP.tab rm sp.lis tr.lis new.lis Generate gene aliases from RefSeq data kgAliasRefseq mm3 Concatenate all 3 files cat kgAliasM.tab kgAliasRefseq.tab kgAliasP.tab|sort|uniq > kgAlias.tab Load it into mySQL table load data local infile "kgAlias.tab" into table mm3.kgAlias; o Build protein aliases Generate protein aliases kgProtAlias mm3 proteins0305 Generate protein aliases from NCBI data kgProtAliasNCBI mm3 Concatenate both files cat kgProtAliasNCBI.tab 
kgProtAlias.tab|sort|uniq > kgProtAliasBoth.tab Load it into mySQL tables load data local infile "kgProtAliasBoth.tab" into table mm3.kgProtAlias; # CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK) # This depends on the go and uniProt databases as well as # the kgAlias and kgProAlias tables. The hgKgGetText takes # about 5 minutes when the database is not too busy. The rest # is real quick. ssh hgwdev cd /cluster/data/mm3/bed/ mkdir -p kgMm3/index cd kgMm3/index hgKgGetText mm3 knownGene.text ixIxx knownGene.text knownGene.ix knownGene.ixx ln -s /cluster/data/mm3/bed/kgMm3/index/knownGene.ix /gbdb/mm3/knownGene.ix ln -s /cluster/data/mm3/bed/kgMm3/index/knownGene.ixx /gbdb/mm3/knownGene.ixx ######## MAKING FOLDUTR TABLES ####### # # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev mkdir -p /cluster/data/mm3/bed/rnaStruct cd /cluster/data/mm3/bed/rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa mm3 knownGene utr3 utr3/utr.fa utrFa mm3 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. ssh kk cd /cluster/data/mm3/bed/rnaStruct faSplit sequence utr3/utr.fa 50000 utr3/split/s >/dev/null faSplit sequence utr5/utr.fa 50000 utr5/split/s >/dev/null ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in ~10 minutes if the cluster is free. #Completed: 7733 of 7733 jobs #CPU time in finished jobs: 41604s 693.39m 11.56h 0.48d 0.001 y #IO & Wait Time: 20190s 336.51m 5.61h 0.23d 0.001 y #Average job time: 8s 0.13m 0.00h 0.00d #Longest job: 35s 0.58m 0.01h 0.00d #Submission to last job: 690s 11.50m 0.19h 0.01d # Load into database. This takes about an hour. 
ssh hgwdev cd /cluster/data/mm3/bed/blastp/self/run/out hgLoadBlastTab mm3 knownBlastTab *.tab # Create table that maps between known genes and RefSeq hgMapToGene mm3 refGene knownGene knownToRefSeq # Create table that maps between known genes and LocusLink (DONE 5/5/04 Fan) hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm3 \ > refToLl.txt hgMapToGene mm3 refGene knownGene knownToLocusLink -lookup=refToLl.txt # row count is 30116 # Create a table that maps between known genes and # the nice affy expression data. hgMapToGene mm3 affyU74 knownGene knownToU74 # Format and load the GNF data cd /cluster/data/mm3/bed mkdir affyGnf95 cd affyGnf95 affyPslAndAtlasToBed -newType ../affyU95.psl /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 affyGnfU95.tab affyGnfU95Exps.tab -shortOut hgsql mm3 < ~/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. hgExpDistance mm3 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74 hgExpDistance mm3 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74 hgExpDistance mm3 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74 # Make sure that GO database is up to date. See README in /cluster/store1/geneOntology. # Create knownToEnsembl column hgMapToGene mm3 ensGene knownGene knownToEnsembl # Make knownToCdsSnp column. This is a little complicated by # having to merge data form the snpTsc and the snpNih tracks. hgMapToGene mm3 snpNih knownGene knownToCdsSnp -all -cds # Make C. elegans ortholog column using blastp on wormpep. # First make C. 
elegans protein database and copy it to cluster/bluearc # if it doesn't exist already cd /cluster/data/ce1/bed mkdir blastp cd blastp wget ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep mv wormpep wormPep.faa formatdb -i wormPep.faa -t wormPep -n wormPep ssh kkr1u00 if (-e /cluster/bluearc/ce1/blastp) then rm -r /cluster/bluearc/ce1/blastp endif mkdir -p /cluster/bluearc/ce1/blastp cp /cluster/data/ce1/bed/blastp/wormPep.p?? /cluster/bluearc/ce1/blastp # Make parasol run directory ssh kk mkdir -p /cluster/data/mm3/bed/blastp/ce1 cd /cluster/data/mm3/bed/blastp/ce1 mkdir run cd run mkdir out # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in ~10 minutes if the cluster is free. # Here's the para time results #Completed: 7733 of 7733 jobs #CPU time in finished jobs: 22954s 382.56m 6.38h 0.27d 0.001 y #IO & Wait Time: 20560s 342.67m 5.71h 0.24d 0.001 y #Average job time: 6s 0.09m 0.00h 0.00d #Longest job: 16s 0.27m 0.00h 0.00d #Submission to last job: 618s 10.30m 0.17h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm3/bed/blastp/ce1/run/out hgLoadBlastTab mm3 ceBlastTab -maxPer=1 *.tab # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to cluster/bluearc # if it doesn't exist already cd /cluster/data/hg16/bed mkdir blastp cd blastp pepPredToFa hg16 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /cluster/bluearc/hg16/blastp) then rm -r /cluster/bluearc/hg16/blastp endif mkdir -p /cluster/bluearc/hg16/blastp cp /cluster/data/hg16/bed/blastp/known.p?? 
/cluster/bluearc/hg16/blastp # Make parasol run directory ssh kk9 mkdir -p /cluster/data/mm3/bed/blastp/hg16 cd /cluster/data/mm3/bed/blastp/hg16 mkdir run cd run mkdir out # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Load into database. ssh hgwdev cd /cluster/data/mm3/bed/blastp/hg16/run/out hgLoadBlastTab mm3 hgBlastTab -maxPer=1 *.tab # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. # First make protein database and copy it to cluster/bluearc # if it doesn't exist already cd /cluster/data/dr1/bed mkdir blastp cd blastp wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH2.pep.fa.gz zcat Dan*.pep.fa.gz > ensembl.faa echo "Translation:" > subs.in subs -e ensembl.faa > /dev/null formatdb -i ensembl.faa -t ensembl -n ensembl if (-e /cluster/bluearc/dr1/blastp) then rm -r /cluster/bluearc/dr1/blastp endif mkdir -p /cluster/bluearc/dr1/blastp cp /cluster/data/dr1/bed/blastp/ensembl.p?? /cluster/bluearc/dr1/blastp # Make parasol run directory ssh kk9 mkdir -p /cluster/data/mm3/bed/blastp/dr1 cd /cluster/data/mm3/bed/blastp/dr1 mkdir run cd run mkdir out # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Load into database. ssh hgwdev cd /cluster/data/mm3/bed/blastp/dr1/run/out hgLoadBlastTab mm3 drBlastTab -maxPer=1 *.tab # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already cd /cluster/data/sc1/bed mkdir blastp cd blastp wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz zcat orf_trans.fasta.gz > sgd.faa echo "ORFP:|" > subs.in subs -e sgd.faa > /dev/null formatdb -i sgd.faa -t sgd -n sgd if (-e /cluster/bluearc/sc1/blastp) then rm -r /cluster/bluearc/sc1/blastp endif mkdir -p /cluster/bluearc/sc1/blastp cp /cluster/data/sc1/bed/blastp/sgd.p?? /cluster/bluearc/sc1/blastp # Make parasol run directory ssh kk9 mkdir -p /cluster/data/mm3/bed/blastp/sc1 cd /cluster/data/mm3/bed/blastp/sc1 mkdir run cd run mkdir out # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Load into database. ssh hgwdev cd /cluster/data/mm3/bed/blastp/sc1/run/out hgLoadBlastTab mm3 scBlastTab -maxPer=1 *.tab # Make Drosophila melanagaster ortholog column using blastp on FlyBase. # First make SwissProt protein database and copy it to cluster/bluearc # if it doesn't exist already cd /cluster/data/dm1/bed mkdir blastp cd blastp wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_translation_dmel_RELEASE3-1.FASTA.gz zcat whole_ge*.gz | faFlyBaseToUcsc stdin flyBase.faa formatdb -i flyBase.faa -t flyBase -n flyBase if (-e /cluster/bluearc/dm1/blastp) then rm -r /cluster/bluearc/dm1/blastp endif mkdir -p /cluster/bluearc/dm1/blastp cp /cluster/data/dm1/bed/blastp/flyBase.p?? 
/cluster/bluearc/dm1/blastp # Make parasol run directory ssh kk mkdir -p /cluster/data/mm3/bed/blastp/dm1 cd /cluster/data/mm3/bed/blastp/dm1 mkdir run cd run mkdir out # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push #Results of this: #Completed: 5814 of 5814 jobs #CPU time in finished jobs: 62285s 1038.09m 17.30h 0.72d 0.002 y #IO & Wait Time: 23056s 384.26m 6.40h 0.27d 0.001 y #Average job time: 15s 0.24m 0.00h 0.00d #Longest job: 152s 2.53m 0.04h 0.00d #Submission to last job: 290s 4.83m 0.08h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm3/bed/blastp/dm1/run/out hgLoadBlastTab mm3 dmBlastTab -maxPer=1 *.tab # CREATING KNOWNtOsUPER (which enables superFamily stuff in hgNear/hgGene) # First see if need to update superfamily data from # ftp server at supfam.mrc-lmb.cam.ac.uk following instructions # in /cluster/store1/superFamily/genomes/README.ucsc. Then # make sure that knownToEnsembl and ensGtp tables are created, then: zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper mm3 mm stdin # LOAD ECgene tables (re-done with new ldHgGene (braney 2004-01-30)) cd /cluster/data/mm3/bed rm -f ECgene mkdir ECgene.2003-12-19 ln -s ECgene.2003-12-19 ECgene cd ECgene wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_mm3_v1.1_25oct2003_genes.txt.gz" wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_mm3_v1.1_25oct2003_genepep.txt.gz" gunzip *.gz ldHgGene -predTab mm3 ECgene ECgene_mm3_v1.1_25oct2003_genes.txt hgPepPred mm3 tab ECgenePep ECgene_mm3_v1.1_25oct2003_genepep.txt rm genePred.tab gzip * ######## FAMILY BROWSER UPDATE ####### (WORKING - 2004-02-25 - Hiram) # These are instructions for building the # neighborhood browser. Don't start these until # there is a knownGene track. and the affy tracks # Cluster together various alt-splicing isoforms. 
# Creates the knownIsoforms and knownCanonical tables ssh hgwdev mkdir /cluster/data/mm3/bed/famBro.2004-02-25 ln -s /cluster/data/mm3/bed/famBro.2004-02-25 /cluster/data/mm3/bed/famBro cd /cluster/data/mm3/bed/famBro hgClusterGenes mm3 knownGene knownIsoforms knownCanonical # Got 22306 clusters, from 38639 genes in 37 chromosomes # row count on knownIsoforms went from 35639 to 38631 # row count on knownCanonical went from 21381 to 22306 # Extract peptides from knownGenes into fasta file # and create a blast database out of them. mkdir /cluster/data/mm3/bed/famBro/blastp cd /cluster/data/mm3/bed/famBro/blastp pepPredToFa mm3 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa # # expect to find /scratch/blast loaded with all required blast files /scratch/blast/formatdb -i known.faa -t known -n known # Copy over database to bluearc rm -fr /cluster/bluearc/mm3/blastp mkdir /cluster/bluearc/mm3/blastp cp -p /cluster/data/mm3/bed/famBro/blastp/known.* /cluster/bluearc/mm3/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm3/bed/famBro/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/mm3/bed/famBro/blastp/self cd /cluster/data/mm3/bed/famBro/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \ -d /cluster/bluearc/mm3/blastp/known -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # This should finish in ~5 minutes # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 104406s 1740.10m 29.00h 1.21d 0.003 y # 
IO & Wait Time: 24064s 401.06m 6.68h 0.28d 0.001 y # Average job time: 17s 0.28m 0.00h 0.00d # Longest job: 114s 1.90m 0.03h 0.00d # Submission to last job: 395s 6.58m 0.11h 0.00d # Load into database. This takes about 15 minutes. ssh hgwdev cd /cluster/data/mm3/bed/famBro/blastp/self/run/out time hgLoadBlastTab mm3 knownBlastTab *.tab # Scanning through 7737 files # Loading database with 7621524 rows # real 15m43.875s # user 3m0.350s # sys 0m22.970s # Create table that maps between known genes and RefSeq hgMapToGene mm3 refGene knownGene knownToRefSeq # row count goes from 27744 to 30116 # Create a table that maps between known genes and # the nice affy expression data. # fixup problem with names cd /cluster/data/mm3/bed/famBro hgMapToGene mm3 affyU74 knownGene knownToU74 # row count went from 17392 to 25733 # I believe this GNF business is taken care of elsewhere, or at # least it only needs to be done once. # Format and load the GNF data # mkdir /cluster/data/mm3/bed/affyGnf95 # cd /cluster/data/mm3/bed/affyGnf95 # affyPslAndAtlasToBed -newType ../affyU95.psl \ # /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ # affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceeding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. # Everything below this seems to create tables OK. # hgsql mm3 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. 
The first one takes about 20 minutes, the other two 5 and 1 # The affyGnfU74?Exps arguments appear to be unused in hgExpDistance # runs most efficiently in /tmp since it makes large temporary files cd /tmp hgExpDistance mm3 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \ -lookup=knownToU74 # Got 13497 unique elements in affyGnfU74A # row count went from 8256000 to 13497000 hgExpDistance mm3 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \ -lookup=knownToU74 # Got 8380 unique elements in affyGnfU74B # row count went from 6236000 to 8380000 hgExpDistance mm3 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \ -lookup=knownToU74 # Got 2286 unique elements in affyGnfU74C # row count went from 2003000 to 2286000 # Make sure that GO database is up to date. See README in /cluster/store1/geneOntology. # Create knownToEnsembl column cd /cluster/data/mm3/bed/famBro hgMapToGene mm3 ensGene knownGene knownToEnsembl # row count went from 31216 to 33504 # Make knownToCdsSnp column. hgMapToGene mm3 snpNih knownGene knownToCdsSnp -all -cds # row count went from 5007 to 8368 # Make C. elegans ortholog column using blastp on wormpep. # First make C. 
elegans protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/ce1/blastp should have data # Create the ceBlastTab ssh kk mkdir /cluster/data/mm3/bed/famBro/blastp/ce1 cd /cluster/data/mm3/bed/famBro/blastp/ce1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/ce1/blastp/wormPep \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 51967s 866.12m 14.44h 0.60d 0.002 y # IO & Wait Time: 37548s 625.80m 10.43h 0.43d 0.001 y # Average job time: 12s 0.19m 0.00h 0.00d # Longest job: 39s 0.65m 0.01h 0.00d # Submission to last job: 720s 12.00m 0.20h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm3/bed/famBro/blastp/ce1/run/out hgLoadBlastTab mm3 ceBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 24847 rows # row count went from 23017 to 24847 # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. 
See makeMm3.doc for procedure # the directory: /cluster/bluearc/hg16/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm3/bed/famBro/blastp/hg16 cd /cluster/data/mm3/bed/famBro/blastp/hg16 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/hg16/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 119088s 1984.80m 33.08h 1.38d 0.004 y # IO & Wait Time: 39644s 660.73m 11.01h 0.46d 0.001 y # Average job time: 21s 0.34m 0.01h 0.00d # Longest job: 128s 2.13m 0.04h 0.00d # Submission to last job: 1180s 19.67m 0.33h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm3/bed/famBro/blastp/hg16/run/out hgLoadBlastTab mm3 hgBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 32987 rows # row count went from 30452 to 32987 # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dr1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm3/bed/famBro/blastp/dr1 cd /cluster/data/mm3/bed/famBro/blastp/dr1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/dr1/blastp/ensembl \ -i $1 -o $2 -e 0.005 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 72923s 1215.38m 20.26h 0.84d 0.002 y # IO & Wait Time: 36576s 609.61m 10.16h 0.42d 0.001 y # Average job time: 14s 0.24m 0.00h 0.00d # Longest job: 74s 1.23m 0.02h 0.00d # Submission to last job: 1015s 16.92m 0.28h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm3/bed/famBro/blastp/dr1/run/out hgLoadBlastTab mm3 drBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 29438 rows # row count went from 27260 to 29438 # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/sc1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm3/bed/famBro/blastp/sc1 cd /cluster/data/mm3/bed/famBro/blastp/sc1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/sc1/blastp/sgd \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 15329s 255.48m 4.26h 0.18d 0.000 y # IO & Wait Time: 26523s 442.05m 7.37h 0.31d 0.001 y # Average job time: 5s 0.09m 0.00h 0.00d # Longest job: 25s 0.42m 0.01h 0.00d # Submission to last job: 1025s 17.08m 0.28h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm3/bed/famBro/blastp/sc1/run/out hgLoadBlastTab mm3 scBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 15757 rows # row count went from 14466 to 15757 # Make Drosophila melanagaster ortholog column using blastp on FlyBase. 
# First make SwissProt protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm3/bed/famBro/blastp/dm1 cd /cluster/data/mm3/bed/famBro/blastp/dm1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/dm1/blastp/flyBase \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 59446s 990.77m 16.51h 0.69d 0.002 y # IO & Wait Time: 33702s 561.70m 9.36h 0.39d 0.001 y # Average job time: 12s 0.20m 0.00h 0.00d # Longest job: 41s 0.68m 0.01h 0.00d # Submission to last job: 880s 14.67m 0.24h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm3/bed/famBro/blastp/dm1/run/out hgLoadBlastTab mm3 dmBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 26299 rows # row count went from 24352 to 26299 # CREATING KNOWNTOSUPER (which enables superFamily stuff in hgNear/hgGene) # First see if need to update superfamily data from # ftp server at supfam.mrc-lmb.cam.ac.uk following instructions # in /cluster/store1/superFamily/genomes/README.ucsc.
Then # make sure that knownToEnsembl and ensGtp tables are created, then: ssh hgwdev zcat /cluster/store1/superFamily/genomes/ass_11-Jan-2004.tab.gz | \ hgKnownToSuper mm3 mm stdin # row count went from 28955 to 29826 # Update the proteins link into gdbPdb.hgcentraltest: hgsql \ -e 'update gdbPdb set proteomeDb = "proteins040115" where genomeDb = "mm3";' \ -h genome-testdb hgcentraltest # ECORES FROM GENOSCOPE [DONE, hartera, 2004-03-31] # download data from http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecores # ecotigMF - ecores on Mouse, genome conserved with Fugu # ecotigMT - ecores on Mouse, genome conserved with Tetraodon ssh hgwdev mkdir /cluster/data/mm3/bed/ecores/ # add parse_ecotig.pl to this directory # FUGU mkdir /cluster/data/mm3/bed/ecores/fr1 cd /cluster/data/mm3/bed/ecores/fr1/ # download data for ecotigMF to this directory # parse ecotig files to produce a bed format file perl ../parse_ecotig.pl < ecotigMF > ecotigMF.bed # change from upper to lower case for "CHR" perl -pi.bak -e 's/CHR/chr/g' ecotigMF.bed hgLoadBed -tab mm3 ecoresFr1 ecotigMF.bed # clean up rm *.bak # TETRAODON mkdir /cluster/data/mm3/bed/ecores/tetraodon cd /cluster/data/mm3/bed/ecores/tetraodon/ # download data for ecotigMT to this directory # parse ecotig files to produce a bed format file perl ../parse_ecotig.pl < ecotigMT > ecotigMT.bed # change from upper to lower case for "CHR" perl -pi.bak -e 's/CHR/chr/g' ecotigMT.bed hgLoadBed -tab mm3 ecoresTetraodon ecotigMT.bed # clean up rm *.bak # add entries in kent/src/hg/makeDb/trackDb/mouse/mm3/trackDb.ra # add html for details pages to this directory: # ecoresFr1.html and ecoresTetraodon.html # gc5Base wiggle TRACK (DONE - 2004-04-07 - Hiram) # Perform a gc count with a 5 base # window. Also compute a "zoomed" view for display efficiency. 
mkdir /cluster/data/mm3/bed/gc5Base cd /cluster/data/mm3/bed/gc5Base # in the script below, the 'grep -w GC' selects the lines of # output from hgGcPercent that are real data and not just some # information from hgGcPercent. The awk computes the number # of bases that hgGcPercent claimed it measured, which is not # necessarily always 5 if it ran into gaps, and then the division # by 10.0 scales down the numbers from hgGcPercent to the range # [0-100]. Two columns come out of the awk print statement: # <position> and <value> which are fed into wigAsciiToBinary through # the pipe. It is set at a dataSpan of 5 because each value # represents the measurement over five bases beginning with # <position>. The result files end up in ./wigData5. cat << '_EOF_' > runGcPercent.sh #!/bin/sh mkdir -p wigData5 mkdir -p dataLimits5 for n in ../../nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 mm3 ../../nib | grep -w GC | \ awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \ wigAsciiToBinary \ -dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \ -name=${C} stdin 2> dataLimits5/${c} echo "done" done '_EOF_' chmod +x runGcPercent.sh # This is going to take perhaps two hours to run. It is a lot of # data. make sure you do it on the fileserver: ssh eieio cd /cluster/data/mm3/bed/gc5Base time ./runGcPercent.sh > run.out 2>&1 # real 214m1.996s # user 286m36.810s # sys 10m46.500s # load the .wig files back on hgwdev: ssh hgwdev cd /cluster/data/mm3/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/mm3/wib/gc5Base mm3 gc5Base wigData5/*.wig # and symlink the .wib files into /gbdb mkdir /gbdb/mm3/wib/gc5Base ln -s `pwd`/wigData5/*.wib /gbdb/mm3/wib/gc5Base # to speed up display for whole chromosome views, compute a "zoomed" # view and load that on top of the existing table. The savings # comes from the number of data table rows the browser needs to load # for a full chromosome view.
Without the zoomed view there are # over 43,000 data rows for chrom 1. With the zoomed view there are # only 222 rows needed for the display. If your original data was # at 1 value per base the savings would be even greater. # Pretty much the same data calculation # situation as above, although this time note the use of the # 'wigZoom -dataSpan=1000 stdin' in the pipeline. This will average # together the data points coming out of the awk print statement over # a span of 1000 bases. Thus each <value> coming out of wigZoom # will represent the measurement of GC in the next 1000 bases. Note # the use of -dataSpan=1000 on the wigAsciiToBinary to account for # this type of data. You want your dataSpan here to be an exact # multiple of your original dataSpan (5*200=1000) and on the order # of at least 1000, doesn't need to go too high. For data that is # originally at 1 base per value, a convenient span is: -dataSpan=1024 # A new set of result files ends up in ./wigData5_1K/*.wi[gb] cat << '_EOF_' > runZoom.sh #!/bin/sh mkdir -p wigData5_1K mkdir -p dataLimits5_1K for n in ../../nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... 
" hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 mm3 ../../nib | grep -w GC | \ awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \ wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \ -dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \ -name=${C} stdin 2> dataLimits5_1K/${c} echo "done" done '_EOF_' chmod +x runZoom.sh # This is going to take just as long as above, certainly do this # on the fileserver ssh eieio time ./runZoom.sh > zoom.out 2>&1 # real 210m41.651s # user 283m40.720s # sys 9m35.820s # Then load these .wig files into the same database as above ssh hgwdev hgLoadWiggle -pathPrefix=/gbdb/mm3/wib/gc5Base -oldTable mm3 gc5Base \ wigData5_1K/*.wig # and symlink these .wib files into /gbdb ln -s `pwd`/wigData5_1K/*.wib /gbdb/mm3/wib/gc5Base # ANDY LAW CPGISSLANDS (DONE 6/17/04 angie) # See notes about this in makeGalGal2.doc. ssh kkstore mkdir /cluster/data/mm3/bed/cpgIslandGgfAndy cd /cluster/data/mm3/bed/cpgIslandGgfAndy cp /dev/null cpgIslandAndy.bed cp /dev/null cpgIslandGgfAndy.bed foreach f (../../?{,?}/chr*.fa) set chr = $f:t:r echo preproc $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.preproc echo running original on $chr awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.preproc \ | /cluster/home/angie/andy-cpg-island.pl \ | perl -wpe '$i=0 if (not defined $i); \ chomp; ($s,$e) = split("\t"); $s--; \ $_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \ >> cpgIslandAndy.bed echo running modified on $chr /cluster/home/angie/ggf-andy-cpg-island.pl $chr.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndy.bed end # load into database: ssh hgwdev cd /cluster/data/mm3/bed/cpgIslandGgfAndy # this one is a bed 4: hgLoadBed mm3 cpgIAndy -tab -noBin cpgIslandAndy.bed # this one is a cpgIslandExt but with a different table name: sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql hgLoadBed mm3 cpgIslandGgfAndy -tab -noBin \ -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed # WOW, even masking out repeat bases from the results, there's a huge # increase in reported islands!! featureBits mm3 cpgIsland #10102968 bases of 2505900260 (0.403%) in intersection featureBits mm3 cpgIslandGgfAndy #61612798 bases of 2505900260 (2.459%) in intersection featureBits mm3 cpgIslandGgfAndy \!rmsk #43063885 bases of 2505900260 (1.718%) in intersection wc -l ../cpgIsland/cpgIsland.bed *bed # 15957 ../cpgIsland/cpgIsland.bed # 152344 cpgIslandAndy.bed # 114386 cpgIslandGgfAndy.bed # LaDeana asked for a run on masked seq for comparison. cp /dev/null cpgIslandAndyMasked.bed cp /dev/null cpgIslandGgfAndyMasked.bed foreach f (../../?{,?}/chr*.fa.masked) set chr = $f:t:r:r echo preproc masked $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.masked.preproc echo running original on $chr masked awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.masked.preproc \ | /cluster/home/angie/andy-cpg-island.pl \ | perl -wpe '$i=0 if (not defined $i); \ chomp; ($s,$e) = split("\t"); $s--; \ $_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \ >> cpgIslandAndyMasked.bed echo running modified on $chr masked /cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyMasked.bed end # load into database: ssh hgwdev cd /cluster/data/mm3/bed/cpgIslandGgfAndy # this one is a bed 4: hgLoadBed mm3 cpgIAndyMasked -tab -noBin cpgIslandAndyMasked.bed # this one is a cpgIslandExt but with a different table name: sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql hgLoadBed mm3 cpgIslandGgfAndyMasked -tab -noBin \ -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed featureBits mm3 cpgIAndyMasked #52735386 bases of 2505900260 (2.104%) in intersection featureBits mm3 cpgIslandGgfAndyMasked #37169690 bases of 2505900260 (1.483%) in intersection wc -l cpgIslandAndyMasked.bed cpgIslandGgfAndyMasked.bed # 94517 cpgIslandAndyMasked.bed # 66504 cpgIslandGgfAndyMasked.bed # 6/28/04 -- masking simpleRepeats, and even repeats other than Alu's, # might not be the right thing to do (?). Give it a try with less-masked # sequence. ssh kkstore cd /cluster/data/mm3/bed/cpgIslandGgfAndy cp /dev/null cpgIslandGgfAndyOnlyRM.bed cp /dev/null cpgIslandGgfAndyOnlyRMAlu.bed foreach f (../../?{,?}/chr*.fa) set chr = $f:t:r echo preproc, ggf-andy $chr onlyRM maskOutFa $f $f.out stdout \ | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyOnlyRM.bed echo preproc, ggf-andy $chr onlyRMAlu head -3 $f.out > /tmp/tmp2.fa.out awk '$11 == "SINE/Alu" {print;}' $f.out >> /tmp/tmp2.fa.out maskOutFa $f /tmp/tmp2.fa.out stdout \ | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyOnlyRMAlu.bed end # 110807 cpgIslandGgfAndyOnlyRMAlu.bed # 66614 cpgIslandGgfAndyOnlyRM.bed ssh hgwdev cd /cluster/data/mm3/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRM/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql hgLoadBed mm3 cpgIslandGgfAndyOnlyRM -tab -noBin -sqlTable=/tmp/c.sql \ cpgIslandGgfAndyOnlyRM.bed sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRMAlu/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql hgLoadBed mm3 cpgIslandGgfAndyOnlyRMAlu -tab -noBin -sqlTable=/tmp/c.sql \ cpgIslandGgfAndyOnlyRMAlu.bed featureBits mm3 cpgIslandGgfAndyOnlyRM #37231991 bases of 2505900260 (1.486%) in intersection featureBits mm3 cpgIslandGgfAndyOnlyRMAlu #58794013 bases of 2505900260 (2.346%) in intersection # MAP ENCODE ORTHOLOG REGIONS (2004-07-09 kate) ssh kkstore cd /cluster/data/mm3/bed mkdir encodeRegions cd encodeRegions # Of 44 regions: # -minMatch=.20 -> 37 mapped liftOver -minMatch=.20 \ /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \ /cluster/data/hg16/bed/liftOver/hg16Tomm3.chain \ encodeRegions.bed encodeRegions.unmapped ssh hgwdev cd /cluster/data/mm3/bed/encodeRegions hgLoadBed mm3 encodeRegions encodeRegions.bed -noBin ssh kkstore cd /cluster/data/mm3/bed/encodeRegions mkdir minMatch10 cd minMatch10 liftOver -minMatch=.10 \ /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \ /cluster/data/hg16/bed/liftOver/hg16Tomm3.chain \ 
encodeRegions.bed encodeRegions.unmapped wc -l * # 42 encodeRegions.bed # 4 encodeRegions.unmapped ssh hgwdev cd /cluster/data/mm3/bed/encodeRegions/minMatch10 hgLoadBed mm3 encodeRegionsNew encodeRegions.bed -noBin # NOTE: need to create this table with the schema # to preserve the Notes field # LIFTOVER CHAINS TO MM6 (2005-09-25 kate) # NOTE: split mm6 is doc'ed in makeMm5.doc ssh kkstore01 cp -r /cluster/data/mm6/liftSplits/ /san/sanvol1/scratch/mm6 ssh pk cd /cluster/data/mm3 makeLoChain-align mm3 /scratch/hg/mm3/chromTrfMixedNib \ mm6 /san/sanvol1/scratch/mm6/liftSplits/split /scratch/hg/h/mouse11.ooc #Setting up blat in /cluster/data/mm3/bed/blat.mm6.2005-09-24/run # 1480 jobs cd bed rm -f blat.mm6 ln -s blat.mm6.2005-09-24 blat.mm6 cd blat.mm6/run para try para check # jobs tie up pk cluster (4G) -- the chrom sequence files should be split # up to create more, smaller cluster jobs # stopped with 1 hippo and 1 crashed. Resubmit # CPU time in finished jobs: 1586754s 26445.90m 440.77h 18.37d 0.050 y # IO & Wait Time: 86144s 1435.73m 23.93h 1.00d 0.003 y # Time in running jobs: 111814s 1863.57m 31.06h 1.29d 0.004 y # Average job time: 1132s 18.86m 0.31h 0.01d # Longest running job: 111814s 1863.57m 31.06h 1.29d # Longest finished job: 22685s 378.08m 6.30h 0.26d # Submission to last job: 29164s 486.07m 8.10h 0.34d # retry chrUn vs chrUn (hippo) on kolossus # NOTE: this took _days_. Needs to be split up. ssh kolossus blat /scratch/hg/mm3/chromTrfMixedNib/chrUn_random.nib /san/sanvol1/scratch/mm6/liftSplits/split/chrUn_random.fa ../raw/chrUn_random_chrUn_random.psl -tileSize=11 -ooc=/scratch/hg/h/mouse11.ooc -minScore=100 -minIdentity=98 -fastMap # lift results # this expects data in bed/blat.mm6, so symlink must be there # use kolossus for speed ssh kolossus cd /cluster/data/mm3/bed/blat.mm6 # NOTE: had to make a local copy of makeLoChain-lift and edit # to remove dependence on date of previous run. 
makeLoChain-lift mm3 mm6 /san/sanvol1/scratch/mm6/liftSplits/lift >&! lift.log & tail -100f lift.log # chain alignments ssh kki cd /cluster/data/mm3/bed/blat.mm6 makeLoChain-chain mm3 /cluster/data/mm3/nib mm6 /cluster/data/mm6/nib # Created parasol job in /cluster/data/mm3/bed/blat.mm6/chainRun cd /cluster/data/mm3/bed/blat.mm6/chainRun para try # 46 jobs para check para push ssh kkstore01 cd /cluster/data/mm3/bed/blat.mm6 makeLoChain-net mm3 mm6 # GOT HERE # Can't load into liftOverChain table -- format has changed # LIFTOVER CHAINS TO MM5 (WORKING 2004-09-29 kate) # run alignment # NOTE: split mm5 to /iscratch/i is doc'ed in makeHg17.doc ssh kk9 cd /cluster/data/mm3 makeLoChain-align mm3 /scratch/hg/mm3/chromTrfMixedNib \ mm5 /iscratch/i/mm5/liftOver/liftSplit/split /scratch/hg/h/mouse11.ooc #Setting up blat in /cluster/data/mm3/bed/blat.mm5.2004-09-29 # 1591 jobs cd bed rm -f blat.mm5 ln -s blat.mm5.2004-09-29 blat.mm5 cd blat.mm5/run para try para check para push # GOT HERE # lift results # the lift directory was defined in makeMm5.doc when split was performed # this expects data in bed/blat.mm5, so symlink must be there # use kolossus for speed ssh kkstore cd /cluster/data/mm3/bed/blat.mm5 makeLoChain-lift mm3 mm5 /iscratch/i/mm5/liftOver/liftSplit/lift \ >&! 
lift.log & tail -100f lift.log # 25 minutes # GOT HERE # HUMAN HG13 BLASTZ CLEANUP (DONE 2004-09-20 kate) ssh kkstore cd /cluster/data/mm3/bed/blastz.hg13 nice rm -rf raw lav nice rm axtChain/run1/chain/* axtChain/preNet/*.chain axtChain/hNoClass.net nice gzip axtChrom/*.axt axtChain/{all.chain,*.net} axtChain/*Net/*.net nice gzip axtBest/*.axt pslChrom/*.psl # HUMAN HG15 BLASTZ CLEANUP (DONE 2004-09-21 kate) ssh kkstore cd /cluster/data/mm3/bed/blastz.hg15.swap nice rm -rf raw lav nice rm axtChain/run1/chain/* axtChain/preNet/*.chain axtChain/hNoClass.net nice gzip axtChain/{all.chain,*.net} axtChain/*[nN]et/*.net nice gzip axtChain/chain2/*.chain nice gzip axt*/*.axt psl*/*.psl # RAT RN2 BLASTZ CLEANUP (DONE 2004-09-21 kate) ssh kkstore cd /cluster/data/mm3/bed/blastz.rn2 nice rm axtChain/run1/chain/* axtChain/preNet/*.chain axtChain/hNoClass.net nice gzip axtChain/{all.chain,*.net} axtChain/*[nN]et/*.net nice gzip axt*/*.axt psl*/*.psl rm psl*/psl.tab # MOUSE MM3 BLASTZ CLEANUP (DONE 2004-09-21 kate) ssh kkstore cd /cluster/data/mm3/bed/blastz.mm3.2003-03-06-ASH nice gzip axt*/*.axt psl*/*.psl