# This file describes how we made the browser database on the mouse # genome, October 2003 build. - Mm4 # # # NOTE: There is a new chrMT sequence in the build 32 # >gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion # # Will have to beware of this NC_ contig in the processing since # all previous builds had only NT_ contigs # # NOTE: The README_PREBUILD file for this assembly mentions several # differences from the previous release (build 30): # 1. seq_contig.md - new first line is a comment containing column name # Also, last two columns (group label and weight, have been swapped) # Also, some lines have id with CONTIG: prepended, and upper-case # feature type (CONTIG) # 2. contig.idmap - has an additional column "contig label" # This required changing the jkStuff ncbi* utilities (7/1/03 KRR) # # DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2003-10-16 - Hiram) ssh kksilo mkdir -p /cluster/store6/mm4/ncbi ln -s /cluster/store6/mm4 /cluster/data cd /cluster/data/mm4/ncbi mkdir chrfasta contigfasta ftp ftp.ncbi.nih.gov # user hgpguest, password from /cse/guests/kent/buildHg6.doc cd mouse_32 prompt bin mget * quit gunzip *.agp.gz # Check chromosome files (DONE - 2003-10-19 - Hiram) cd chrfasta foreach f (*.fa.gz) echo $f:r >> faSize.out gunzip $f /cluster/bin/i386/faSize $f:r >> faSize.out echo $f:r done end /cluster/bin/i386/faSize *.fa >> faSize.out grep "^>" *.fa > ../chrfasta.all.fa.headers gzip *.fa cd ../contigfasta gunzip *.fa.gz grep "^>" *.fa > ../contigfasta.all.fa.headers gzip *.fa # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS # (DONE - 2003-10-16 - Hiram) ssh kksilo cd /cluster/data/mm4 gunzip ncbi/allrefcontig.chr.agp.gz # splitFaIntoContigs doesn't do right with agp lines arriving in a # different order than fasta chrom sequences. so split up the agp # into one per chrom. 
foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y MT Un) mkdir $c perl -we "while(<>){if (/^chr$c\t/) {print;}}" \ ./ncbi/allrefcontig.chr.agp \ > $c/chr$c.agp gunzip -c ./ncbi/chrfasta/chr$c.fa.gz \ | perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' \ | splitFaIntoContigs $c/chr$c.agp \ stdin /cluster/data/mm4 -nSize=5000000 end gzip ncbi/chrfasta/chr*.fa # CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2003-10-17 - Hiram) ssh kksilo cd /cluster/data/mm4/ncbi gunzip seq_contig.md.gz # reorder random contigs in allrefcontig agp file to match seq_contig.md # this is required by the ncbiToRandomAgps scripts # had to fixup ncbiToRandomAgps from previous use to match the # lines better, and to do the MT/NC_ mitochondrion thing ../jkStuff/ncbiFixAgp allrefcontig.chr.agp > \ allrefcontig.chr.ordered.agp ../jkStuff/ncbiToRandomAgps seq_contig.md allrefcontig.chr.ordered.agp \ contig.idmap .. # creating ../mm4/1/chr1_random.agp... # ... creating ../mm4/Un/chrUn_random.agp... # The chrUn_random.agp created by this is too large with the 5000 # gaps. it will work with 1000 gaps, so fixup the chrUn_random agp: ../jkStuff/ncbiToRandomAgps -gapLen 1000 -chrom Un \ seq_contig.md allrefcontig.chr.ordered.agp contig.idmap .. ssh kksilo cd /cluster/data/mm4 foreach c (?{,?}) if (-e $c/chr${c}_random.ctg.agp) then echo building $c/chr${c}_random.fa gunzip -c ./ncbi/contigfasta/chr$c.fa.gz \ | perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' \ > ./tmp.fa agpToFa -simpleMulti $c/chr${c}_random.ctg.agp chr${c}_random \ $c/chr${c}_random.fa ./tmp.fa rm tmp.fa endif end # building 1/chr1_random.fa # ... etc ... # building Un/chrUn_random.fa # Writing 85112544 bases to Un/chrUn_random.fa # Clean these up to avoid confusion later... 
they're easily rebuilt # with the ncbiToRandomAgps script above rm ?/*.ctg.agp ??/*.ctg.agp # BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS (2003-10-20 - Hiram) ssh kksilo cd /cluster/data/mm4 foreach c (?{,?}) if (-e $c/chr${c}_random.agp) then splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . \ -nSize=5000000 mkdir -p $c/lift mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst mv ${c}_random/lift/ordered.lft $c/lift/random.lft mv ${c}_random/lift/ordered.lst $c/lift/random.lst rmdir ${c}_random/lift rm ${c}_random/chr${c}_random.{agp,fa} mv ${c}_random/* $c rmdir ${c}_random endif end # This has a lot of output. It is difficult to see if anything # goes wrong. # Fixup chrMT name to be chrM (DONE - 2003-10-21 - Hiram) ssh kksilo cd /cluster/data/mm4 mv MT MT.ncbi mkdir M mkdir M/chrM_1 mkdir M/lift cd MT.ncbi find . -type f | while read FN do NF=`echo $FN | sed -e "s/MT/M/g"` sed -e "s/chrMT/chrM/g" $FN > ../M/$NF done # The RepeatMasker for this was re-run separately since RM was # already done by the time this was renamed. # MAKE LIFTALL.LFT (DONE - 2003-10-20 - Hiram) cd /cluster/data/mm4 cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft # CREATING DATABASE (DONE - 2003-10-20 - Hiram) o - Create the database. ssh hgwdev hgsql -e 'create database mm4;' '' # if you need to delete this database: !!! WILL DELETE EVERYTHING !!! 
# hgsql -e "drop database mm4;" mm4 o - Use df to make sure there is at least 5 gig free on hgwdev:/var/lib/mysql # [hiram@hgwdev /] df -h /var/lib/mysql # Filesystem Size Used Avail Use% Mounted on # /dev/sda1 472G 414G 34G 93% /var/lib/mysql # CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2003-10-20 - Hiram) # Use any of the newest databases to ensure that the organization # of the grp table is up to date ssh hgwdev hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" mm4 # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE - 2003-10-20 - Hiram) # Create (unmasked) nib files ssh kksilo cd /cluster/data/mm4 mkdir -p unmaskedNib foreach f (?{,?}/chr?{,?}{,_random}.fa) echo $f:t:r faToNib $f unmaskedNib/$f:t:r.nib end # Create symbolic links from /gbdb/mm4/nib to real nib files # These unmasked Nib files are temporary just to get the browser # up an running immediately. After the masking is done and masked # sequence is created, these nibs will be replaced with the masked # nibs ssh hgwdev mkdir -p /gbdb/mm4/nib cd /gbdb/mm4/nib ln -s /cluster/data/mm4/unmaskedNib/chr*.nib . # Load /gbdb nib paths into database and save size info. ssh hgwdev cd /cluster/data/mm4 hgsql mm4 < ~/kent/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib mm4 /gbdb/mm4/nib ?{,?}/chr?{,?}{,_random}.fa # 2952612207 total bases # NOTE: mm3 was 2708220133, an increase of 244 Mb (~9%) hgsql -N -e "select chrom,size from chromInfo;" mm4 > chrom.sizes # check the resulting file chrom.sizes # Store o+o info in database. cd /cluster/data/mm4/ncbi gunzip sequence.inf cd /cluster/data/mm4 ln -s ncbi fft # remove so as not to confuse hgGoldGap -- they are easily regenerated rm */chr*.ctg.agp # to undo/redo: # jkStuff/dropSplitTable.csh gap # jkStuff/dropSplitTable.csh gold /cluster/bin/i386/hgGoldGapGl mm4 /cluster/data/mm4 . 
featureBits mm4 gold # 2627444668 bases of 2627444668 (100.000%) in intersection featureBits mm3 gold # 2505900260 bases of 2505900260 (100.000%) in intersection featureBits mm4 gap # 325167539 bases of 2627444668 (12.376%) in intersection featureBits mm3 gap # 202319873 bases of 2505900260 (8.074%) in intersection # Make and load GC percent table (DONE - 2003-10-20 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm4/bed/gcPercent cd /cluster/data/mm4/bed/gcPercent hgsql mm4 < ~/kent/src/hg/lib/gcPercent.sql hgGcPercent mm4 ../../unmaskedNib # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR MM4 (DONE - 2003-10-20 - Hiram) # using the Mm3 position blatted onto Mm4: # Enter mm4 into hgcentraltest.dbDb so test browser knows about it: hgsql -e 'INSERT INTO dbDb \ (name, description, nibPath, organism, defaultPos, \ active, orderKey, genome, scientificName, htmlPath, hgNearOK) \ VALUES( "mm4", "Oct. 2003", "/gbdb/mm4/nib", "Mouse", \ "chr6:122959016-122974672", \ 1, 20, "Mouse", "Mus musculus", "/gbdb/mm4/html/description.html", 0);' \ -h genome-testdb hgcentraltest # If you need to delete that entry: hgsql -e 'delete from dbDb where name="mm4";' \ -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add mm4 in all the right places and do make update make alpha cvs commit makefile # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR MM4 (DONE - 2003-07-13 kate) ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans) \ VALUES ("mm4", "blat10", "17778", "1"); \ INSERT INTO blatServers (db, host, port, isTrans) \ VALUES ("mm4", "blat10", "17779", "0");' \ -h genome-testdb hgcentraltest # REPEAT MASKING (DONE 2003-10-21 Hiram) # TRF simpleRepeat below can be run at the same time # Split contigs, run RepeatMasker, lift results # * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make # RepeatMasker runs manageable on the cluster ==> results need 
lifting. # * For the NCBI assembly we repeat mask on the sensitive mode setting # (RepeatMasker -m -s -ali) #- Split contigs into 500kb chunks: ssh kksilo cd /cluster/data/mm4 foreach d ( */chr?{,?}{,_random}_?{,?} ) cd $d set contig = $d:t faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -maxN=500000 cd ../.. end # ... # 11 pieces of 11 written # 1 pieces of 1 written # ... #- Make the run directory and job list: cd /cluster/data/mm4 cat << '_EOF_' > jkStuff/RMMouse #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/mm4/$2 /bin/cp $2 /tmp/mm4/$2 cd /tmp/mm4/$2 /scratch/hg/RepeatMasker/RepeatMasker -ali -s -species mus $2 popd /bin/cp /tmp/mm4/$2/$2.out ./ if (-e /tmp/mm4/$2/$2.align) /bin/cp /tmp/mm4/$2/$2.align ./ if (-e /tmp/mm4/$2/$2.tbl) /bin/cp /tmp/mm4/$2/$2.tbl ./ if (-e /tmp/mm4/$2/$2.cat) /bin/cp /tmp/mm4/$2/$2.cat ./ /bin/rm -fr /tmp/mm4/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/mm4/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/mm4 '_EOF_' chmod +x jkStuff/RMMouse mkdir -p RMRun rm -f RMRun/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) foreach f ( $d/chr*_?{,?}_?{,?}.fa ) set f = $f:t echo /cluster/data/mm4/jkStuff/RMMouse \ /cluster/data/mm4/$d $f \ '{'check out line+ /cluster/data/mm4/$d/$f.out'}' \ >> RMRun/RMJobs end end #- Do the run ssh kk cd /cluster/data/mm4/RMRun para create RMJobs para try, para check, para check, para push, para check,... # Completed: 6429 of 6429 jobs # CPU time in finished jobs: 39351255s 655854.25m 10930.90h 455.45d 1.248 y # IO & Wait Time: 185895s 3098.25m 51.64h 2.15d 0.006 y # Average job time: 6150s 102.50m 1.71h 0.07d # Longest job: 18075s 301.25m 5.02h 0.21d # Submission to last job: 56648s 944.13m 15.74h 0.66d #- Lift up the split-contig .out's to contig-level .out's ssh kksilo cd /cluster/data/mm4 foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. 
end #- Lift up the contig-level .out's to chr-level ssh kksilo cd /cluster/data/mm4 ./jkStuff/liftOut5.csh # This one error is OK # Can not find Un/lift/ordered.lft . #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/mm4 # to redo: # ./jkStuff/dropSplitTable.csh rmsk # make sure there's no chrUn -- rm Un/chrUn.fa.out hgLoadOut mm4 ?/*.fa.out ??/*.fa.out # VERIFY REPEATMASKER RESULTS (DONE - 2003-10-21 Hiram) # Run featureBits on mm4 and on a comparable genome build, and compare: ssh hgwdev featureBits mm4 rmsk # 1130883581 bases of 2627444668 (43.041%) in intersection featureBits mm3 rmsk # 1080265553 bases of 2505900260 (43.109%) in intersection # SIMPLE REPEAT TRACK (DONE - 2003-10-20 Hiram) # TRF can be run in parallel with RepeatMasker on the file server # since it doesn't require masked input sequence. ssh kksilo mkdir /cluster/data/mm4/bed/simpleRepeat cd /cluster/data/mm4/bed/simpleRepeat mkdir trf rm -f jobs.csh echo '#\!/bin/csh -fe' > jobs.csh # create job list of 5MB chunks foreach f \ (/cluster/data/mm4/?{,?}/chr?{,?}_[0-9]*/chr?{,?}_?{,?}.fa \ /cluster/data/mm4/?{,?}/chr*_random_?{,?}/chr*_random_?{,?}.fa) set fout = $f:t:r.bed echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end chmod +x jobs.csh wc jobs.csh # 600 3596 90543 jobs.csh ./jobs.csh >&! 
jobs.log & # in bash: ./jobs.csh > jobs.log 2>&1 & tail -f jobs.log # When job is done lift output files liftUp simpleRepeat.bed /cluster/data/mm4/jkStuff/liftAll.lft warn trf/*.bed # Load into the database ssh hgwdev cd /cluster/data/mm4/bed/simpleRepeat hgLoadBed mm4 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # Loaded 1164675 elements of size 16 featureBits mm4 simpleRepeat # 82600648 bases of 2627444668 (3.144%) in intersection featureBits mm3 simpleRepeat # 75457193 bases of 2505900260 (3.011%) in intersection # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2003-10-21 - Hiram) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kksilo cd /cluster/data/mm4/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords cd /cluster/data/mm4 mkdir -p bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` else echo "WARNING NO FILE: $c/lift/ordered.lst" endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # NOTE: ignore warning about non-existent Un/Lift/ordered.lift # since there is no chrUn # MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF # (DONE - 2003-10-21 Hiram) ssh kksilo cd /cluster/data/mm4 #- Soft-mask (lower-case) the contig and chr .fa's ./jkStuff/makeFaMasked.csh >&! 
maskFa.out & # bash: ./jkStuff/makeFaMasked.csh > maskFa.out 2>&1 & tail -100f maskFa.out #- Make hard-masked .fa.masked files as well: ./jkStuff/makeHardMasked.csh #- Rebuild the nib, mixedNib, maskedNib files: ./jkStuff/makeNib.csh # ignore complaints about missing chrUn # Redo symbolic links from /gbdb/mm4/nib to # mixed (RM and TRF) soft-masked nib files ssh hgwdev rm -fr /gbdb/mm4/nib/* ln -s /cluster/data/mm4/mixedNib/chr*.nib /gbdb/mm4/nib # Copy data to /cluster/bluearc for cluster runs ssh kksilo # masked contigs rm -fr /cluster/bluearc/mm4/trfFa mkdir -p /cluster/bluearc/mm4/trfFa cp -p /cluster/data/mm4/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa \ /cluster/bluearc/mm4/trfFa # masked chrom nibs cd /cluster/data/mm4 rm -fr /cluster/bluearc/mm4/softNib mkdir -p /cluster/bluearc/mm4/softNib cp -p mixedNib/chr*.nib /cluster/bluearc/mm4/softNib rm -fr /cluster/bluearc/mm4/hardNib mkdir -p /cluster/bluearc/mm4/hardNib cp -p maskedNib/chr*.nib /cluster/bluearc/mm4/hardNib # fasta files rm -fr /cluster/bluearc/mm4/fasta mkdir -p /cluster/bluearc/mm4/fasta cp -p ?/*.fa ??/*.fa /cluster/bluearc/mm4/fasta # RepeatMasker *.out files rm -rf /cluster/bluearc/mm4/rmsk mkdir -p /cluster/bluearc/mm4/rmsk cp -p ?{,?}/chr?{,?}{,_random}.fa.out /cluster/bluearc/mm4/rmsk # lift file, for mrna processing cp -p jkStuff/liftAll.lft /cluster/bluearc/mm4 # also copy to /scratch ssh kksilo mkdir -p /cluster/bluearc/scratch/mus/mm4 cp -Rp /cluster/bluearc/mm4/fasta /cluster/bluearc/scratch/mus/mm4 cp -Rp /cluster/bluearc/mm4/hardNib /cluster/bluearc/scratch/mus/mm4 cp -Rp /cluster/bluearc/mm4/softNib /cluster/bluearc/scratch/mus/mm4 cp -Rp /cluster/bluearc/mm4/rmsk /cluster/bluearc/scratch/mus/mm4 cp -Rp /cluster/bluearc/mm4/trfFa /cluster/bluearc/scratch/mus/mm4 mkdir -p /cluster/bluearc/scratch/mus/mm4/rmsk.spec cp -Rp /cluster/bluearc/mm4/rmsk.spec/* /cluster/bluearc/scratch/mus/mm4/rmsk.spec # also copy to iservers ssh kkr1u00 cd ~/mm4 cp -p liftAll.lft /iscratch/i/mm4 mkdir -p 
/iscratch/i/mm4/softNib
    # NOTE(review): destination fixed — original said /iscratch/i/softNib,
    # which does not match the /iscratch/i/mm4/softNib dir created above
    cp -p /cluster/bluearc/mm4/softNib/chr*.nib /iscratch/i/mm4/softNib
    mkdir -p /iscratch/i/mm4/trfFa
    # NOTE(review): destination fixed — original copied back to
    # /cluster/bluearc/mm4/trfFa (already populated above); the intent of
    # this "copy to iservers" step is the just-created /iscratch/i/mm4/trfFa
    cp ?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa /iscratch/i/mm4/trfFa
    /cluster/bin/scripts/iSync

# PREPARE CLUSTER FOR BLASTZ RUN  (DONE - 2003-10-21 - Hiram)
    ssh kksilo
    mkdir -p /cluster/bluearc/mm4/rmsk.spec
    cd /cluster/bluearc/mm4/rmsk.spec
    ln -s ../rmsk/*.out .
    # classify repeats by lineage (mouse vs. human and rat) for use as
    # lineage-specific repeats during blastz
    cat << '_EOF_' > runArian.sh
#!/bin/sh
for FN in *.out
do
    echo ${FN}
    /cluster/bluearc/RepeatMasker030619/DateRepsinRMoutput.pl \
	${FN} -query mouse -comp human -comp rat
done
'_EOF_'
    chmod +x runArian.sh
    ./runArian.sh
    cd /cluster/bluearc/mm4
    mkdir linSpecRep.notInHuman
    mkdir linSpecRep.notInRat
    # extractLinSpecReps arg 1 = first -comp species (human),
    # arg 2 = second -comp species (rat)
    foreach f (rmsk.spec/*.out_hum_rat)
	set base = $f:t:r:r
	echo $base.out.spec
	/cluster/bin/scripts/extractLinSpecReps 1 $f > \
		linSpecRep.notInHuman/$base.out.spec
    end
    foreach f (rmsk.spec/*.out_hum_rat)
	set base = $f:t:r:r
	echo $base.out.spec
	/cluster/bin/scripts/extractLinSpecReps 2 $f > \
		linSpecRep.notInRat/$base.out.spec
    end
    cp -Rp /cluster/bluearc/mm4/linSpecRep.notInHuman /cluster/bluearc/scratch/mus/mm4
    cp -Rp /cluster/bluearc/mm4/linSpecRep.notInRat /cluster/bluearc/scratch/mus/mm4
    # Request rsync of /scratch to the KiloKluster

# AUTO UPDATE GENBANK MRNA RUN  (IN PROGRESS - 2003-10-21 - Hiram)
    ssh eieio
    cd /cluster/data/genbank/etc
    # edit genbank.conf to specify
    # # mm4
    # mm4.genome = /scratch/mus/mm4/softNib/*.nib
    # mm4.lift = /cluster/data/mm4/jkStuff/liftAll.lft
    # mm4.downloadDir = mm4
    # mm4.genbank.est.xeno.load = yes
    # mm4.mgcTables.default = full
    # mm4.mgcTables.mgc = all
    cd /cluster/data/genbank
    nice ./bin/gbAlignStep -initial -type=mrna -verbose=1 \
	-clusterRootDir=/cluster/bluearc/genbank -iserver=no mm4
    # this failed in the alignment step due to /scratch problems, to continue:
    ssh kk
    cd /cluster/bluearc/genbank/work/initial.mm4/align
    para recover align.jobs recover_align.jobs
    para create recover_align.jobs
    para try, push, check, etc ...
# then ssh eieio cd /cluster/data/genbank nice ./bin/gbAlignStep -initial -continue=finish -type=mrna -verbose=1 \ -clusterRootDir=/cluster/bluearc/genbank -iserver=no mm4 # to load that: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad mm4 # then, for the big est run ssh eieio cd /cluster/data/genbank nice ./bin/gbAlignStep -initial -type=est -verbose=1 \ -clusterRootDir=/cluster/bluearc/genbank -iserver=no mm4 # Completed: 256925 of 256971 jobs # CPU time in finished jobs: 79788950s 1329815.83m 22163.60h 923.48d 2.530 y # IO & Wait Time: 7185609s 119760.16m 1996.00h 83.17d 0.228 y # Time in running jobs: 290862s 4847.70m 80.80h 3.37d 0.009 y # Average job time: 339s 5.64m 0.09h 0.00d # Longest job: 69850s 1164.17m 19.40h 0.81d # Submission to last job: 141340s 2355.67m 39.26h 1.64d # cluster problems required that to be completed manually: ssh kk cd /cluster/bluearc/genbank/work/initial.mm4/align para recover align.jobs recover.jobs para create recover.jobs para try, push, check, etc ... # Completed: 46 of 46 jobs # CPU time in finished jobs: 11250s 187.50m 3.13h 0.13d 0.000 y # IO & Wait Time: 423s 7.05m 0.12h 0.00d 0.000 y # Average job time: 254s 4.23m 0.07h 0.00d # Longest job: 1532s 25.53m 0.43h 0.02d # Submission to last job: 1532s 25.53m 0.43h 0.02d ssh eieio cd /cluster/data/genbank nice ./bin/gbAlignStep -initial -continue=finish -type=est -verbose=1 \ -clusterRootDir=/cluster/bluearc/genbank -iserver=no mm4 # Then for an efficient load of everything, drop the mrna tables and # load all: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad mm4 # LOAD CPGISSLANDS (DONE 2003-10-21 Hiram) ssh kksilo mkdir -p /cluster/data/mm4/bed/cpgIsland cd /cluster/data/mm4/bed/cpgIsland # cpglh requires hard-masked (N) .fa's. # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. 
foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked) set fout=$f:t:r:r.cpg /cluster/bin/cpglh $f > $fout echo Done with $fout end cat << '_EOF_' > filter.awk /* chr1 1325 3865 754 CpG: 183 64.9 0.7 */ /* Transforms to: */ /* chr1 1325 3865 CpG: 183 754 183 489 64.9 0.7 */ { width = $3-$2; printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\n", $1,$2,$3,$5,$6,width,$6,width*$7*0.01,100.0*2*$6/($3-$2),$7);} '_EOF_' awk -f filter.awk chr*.cpg > cpgIsland.bed # Load into db ssh hgwdev cd /cluster/data/mm4/bed/cpgIsland /cluster/bin/i386/hgLoadBed mm4 cpgIsland -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed # BLASTZ Human (DONE - 2003-10-28 - Hiram) ssh kk mkdir -p /cluster/data/mm4/bed/blastz.hg16 cd /cluster/data/mm4/bed/blastz.hg16 cat << '_EOF_' > DEF # mouse vs. human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Mouse SEQ1_DIR=/scratch/mus/mm4/softNib # RMSK not currently used SEQ1_RMSK=/scratch/mus/mm4/rmsk # FLAG not currently used SEQ1_FLAG=-rodent SEQ1_SMSK=/scratch/mus/mm4/linSpecRep.notInHuman SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Human SEQ2_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs # RMSK not currently used SEQ2_RMSK=/iscratch/i/gs.17/build34/rmsk # FLAG not currently used SEQ2_FLAG=-primate SEQ2_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm4/bed/blastz.hg16 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # prepare first cluster run script source DEF cat << '_EOF_' > ../../jkStuff/BlastZ_run0.sh #!/bin/sh # prepare first cluster run for blastz processing # M=`uname -n` if [ "$M" != "kk" ]; then echo "ERROR: you are on machine: '$M'" echo -e "\tthis script expects machine kk" exit 255 fi 
source DEF mkdir ${RAW} mkdir run.0 echo "running 'make-joblist'" ~angie/hummus/make-joblist $DEF > $BASE/run.0/j if [ ! -f ./xdir.sh ]; then echo "ERROR: no ./xdir.sh file present. Should have been made by make-joblist." exit 255 fi echo "running 'xdir.sh'" sh ./xdir.sh cd run.0 sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > jobList rm j echo "running 'para create'" para create jobList echo "Ready for cluster run. para try, check, push, etc ..." '_EOF_' # << this line keeps emacs coloring happy chmod +x ../../jkStuff/BlastZ_run0.sh # prepare first cluster run ../../jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... # CPU time in finished jobs: 15818152s 263635.86m 4393.93h 183.08d 0.502 y # IO & Wait Time: 6171331s 102855.52m 1714.26h 71.43d 0.196 y # Average job time: 507s 8.45m 0.14h 0.01d # Longest job: 8930s 148.83m 2.48h 0.10d # Submission to last job: 85042s 1417.37m 23.62h 0.98d # Prepare second cluster run script, .out's to .lav's ssh kk cd /cluster/data/mm4/bed/blastz.hg16 cat << '_EOF_' > ../../jkStuff/BlastZ_run1.sh #!/bin/sh # prepare second cluster run for blastz processing # M=`uname -n` if [ "$M" != "kk" ]; then echo "ERROR: you are on machine: '$M'" echo -e "\tthis script expects machine kk" exit 255 fi source DEF mkdir $BASE/run.1 mkdir $BASE/lav echo "running 'make-joblist'" /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList cd run.1 wc -l jobList echo "running 'para create'" para create jobList echo "Ready for cluster run. para try, check, push, etc ..." '_EOF_' chmod +x ../../jkStuff/BlastZ_run1.sh # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kkr1u00 source DEF ../../jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# Completed: 323 of 323 jobs # CPU time in finished jobs: 12410s 206.83m 3.45h 0.14d 0.000 y # IO & Wait Time: 335408s 5590.14m 93.17h 3.88d 0.011 y # Average job time: 1077s 17.95m 0.30h 0.01d # Longest job: 1966s 32.77m 0.55h 0.02d # Submission to last job: 1992s 33.20m 0.55h 0.02d # Prepare third cluster run script to convert lav's to axt's cat << '_EOF_' > ../../jkStuff/BlastZ_run2.sh #!/bin/sh # prepare third cluster run for blastz processing # M=`uname -n` if [ "$M" != "kk" ]; then echo "ERROR: you are on machine: '$M'" echo -e "\tthis script expects machine kk" exit 255 fi source DEF mkdir axtChrom mkdir run.2 cd run.2 # usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir echo '#LOOP' > gsub echo '/cluster/bin/scripts/blastz-contiglav2axt '${BASE}'/lav/$(root1) {check out line+ '${BASE}'/axtChrom/$(root1).axt} '${SEQ1_DIR} ${SEQ2_DIR} >> gsub echo '#ENDLOOP' >> gsub ls -1S ${BASE}/lav > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList echo "running 'para create'" para create jobList echo "Ready for cluster run. para try, check, push, etc ..." '_EOF_' chmod +x ../../jkStuff/BlastZ_run2.sh # Third cluster run to convert lav's to axt's source DEF ../../jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... 
# Completed: 44 of 44 jobs # CPU time in finished jobs: 3981s 66.35m 1.11h 0.05d 0.000 y # IO & Wait Time: 49397s 823.28m 13.72h 0.57d 0.002 y # Average job time: 1213s 20.22m 0.34h 0.01d # Longest job: 7584s 126.40m 2.11h 0.09d # Submission to last job: 7597s 126.62m 2.11h 0.09d # translate sorted axt files into psl ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16 mkdir -p pslChrom set tbl = "blastzHg16" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # That takes about 30 minutes # Load database tables ssh hgwdev set tbl = "blastzHg16" cd /cluster/data/mm4/bed/blastz.hg16/pslChrom /cluster/bin/i386/hgLoadPsl -noTNameIx mm4 chr*_${tbl}.psl # this is a 55 minute job # CHAIN HG16 BLASTZ (DONE - 2003-10-24 - Hiram) # The axtChain is best run on the small kluster, or the kk9 kluster # in this case, it was run on the kk kluster ssh kk mkdir -p /cluster/data/mm4/bed/blastz.hg16/axtChain/run1 cd /cluster/data/mm4/bed/blastz.hg16/axtChain/run1 mkdir out chain ls -1S /cluster/data/mm4/bed/blastz.hg16/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ_random $1 | axtChain stdin \ /scratch/mus/mm4/softNib \ /iscratch/i/gs.17/build34/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain # 41 jobs gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... 
# Completed: 44 of 44 jobs # CPU time in finished jobs: 27109s 451.82m 7.53h 0.31d 0.001 y # IO & Wait Time: 29921s 498.68m 8.31h 0.35d 0.001 y # Average job time: 1296s 21.60m 0.36h 0.02d # Longest job: 12356s 205.93m 3.43h 0.14d # Submission to last job: 12356s 205.93m 3.43h 0.14d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16/axtChain time chainMergeSort run1/chain/*.chain > all.chain # real 8m55.100s # user 6m46.680s # sys 0m58.250s time chainSplit chain all.chain # real 8m36.259s # user 5m46.920s # sys 0m59.700s # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/mm4/bed/blastz.hg16/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm4 ${c}_chainHg16 $i echo done $c end # NET HUMAN BLASTZ (DONE - 2003-10-31 - Hiram) ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/mm4/chrom.sizes \ /cluster/data/hg16/chrom.sizes ../preNet/$i end # real 11m58.018s # user 4m10.390s # sys 2m10.780s cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/mm4/chrom.sizes \ /cluster/data/hg16/chrom.sizes ../n1/$n /dev/null end cd .. 
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 1591906304, utime 6877 s/100, stime 1977 ssh hgwdev cd /cluster/data/mm4/bed/blastz.hg16/axtChain time netClass hNoClass.net mm4 hg16 human.net \ -tNewR=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman \ -qNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse # real 11m35.367s # user 7m20.880s # sys 1m33.360s # manually fixup one line in the linSpecRep.notInHuman/chr3.out.spec file # RepeatMasker or the post-processing script is making an error # Remove the line that is missing the rest of the fields # the line : 2860 6.4 0.0 1 0 0 Expecting 7 words line 156272 of /cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman/chr3.out.spec got 6 2860 6.4 0.0 1.1 chr3 164080732 164081087 (231854) + L1Md_F2 LINE/L1 6208 6561 (21) 1 0 0 2860 6.4 0.0 1 0 0 1420 8.1 0.0 0.0 chr3 164081159 164081355 (231586) C ORR1A0-int LTR/MaLR (1593) 199 3 3 0 0 # If things look good do ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with netFilter -syn human.net > humanSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/mm4/bed/blastz.hg16/axtChain netFilter -minGap=10 human.net | hgLoadNet mm4 netHg16 stdin netFilter -minGap=10 humanSyn.net | hgLoadNet mm4 syntenyNetHg16 stdin # Add entries for net and chain to mouse/mm4 trackDb # make net ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16/axtChain mkdir humanNet time netSplit human.net humanNet # real 6m13.469s # user 3m22.670s # sys 0m44.880s mkdir ../axtNet foreach n (humanNet/chr*.net) set c=$n:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt humanNet/$c.net chain/$c.chain \ /cluster/data/mm4/nib \ /cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \ ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" end ssh hgwdev mkdir -p /cluster/data/mm4/bed/blastz.hg16/axtBest cd /cluster/data/mm4/bed/blastz.hg16/axtBest ln -s ../axtNet/chr*.axt . 
# copy net axt's to download area ssh hgwdev cd /cluster/data/mm4/bed/blastz.hg16/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/mm4/vsHg16/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm4/vsHg16/axtNet cd /usr/local/apache/htdocs/goldenPath/mm4/vsHg16/axtNet gzip *.axt # add README.txt file to dir, if needed # Convert those axt files to psl ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestHg16.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestHg16.psl echo "Done: ${c}_blastzBestHg16.psl" end # Load tables ssh hgwdev cd /cluster/data/mm4/bed/blastz.hg16/pslBest time /cluster/bin/i386/hgLoadPsl -noTNameIx mm4 chr*_blastzBestHg16.psl # real 10m47.853s # user 2m48.700s # sys 0m24.250s # check results # featureBits mm4 blastzBestHg16 # 1030510540 bases of 2627444668 (39.221%) in intersection # featureBits mm3 blastzBestHuman # 1019460260 bases of 2505900260 (40.682%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm4/axtBestHg16 cd /gbdb/mm4/axtBestHg16 ln -s /cluster/data/mm4/bed/blastz.hg16/axtNet/chr*.axt . cd /cluster/data/mm4/bed/blastz.hg16/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/mm4/axtBestHg16/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('hg16','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql mm4 < ~/kent/src/hg/lib/axtInfo.sql # table axtInfo may already exist, ignore create error. 
hgsql mm4 < axtInfoInserts.sql # MAKING THE AXTTIGHT FROM AXTBEST (DONE - 2003-10-31 - Hiram) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh kksilo cd /cluster/data/mm4/bed/blastz.hg16/axtNet mkdir -p ../axtTight tcsh foreach i (*.axt) echo $i subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHg16.psl echo "Done: $i" end # Load tables into database ssh hgwdev cd /cluster/data/mm4/bed/blastz.hg16/pslTight hgLoadPsl -noTNameIx mm4 chr*_blastzTightHg16.psl # copy to axt's to download area cd /cluster/data/mm4/bed/blastz.hg16/axtTight mkdir -p /usr/local/apache/htdocs/goldenPath/mm4/vsHg16/axtTight cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm4/vsHg16/axtTight cd /usr/local/apache/htdocs/goldenPath/mm4/vsHg16/axtTight gzip *.axt # add README.txt file to dir, if needed # BLASTZ Rat (DONE - 2003-10-27 - Hiram) ssh kk mkdir -p /cluster/data/mm4/bed/blastz.rn3.2003-10-21 cd /cluster/data/mm4/bed ln -s blastz.rn3.2003-10-21 blastz.rn3 cd blastz.rn3 cat << '_EOF_' > DEF # mouse vs. 
rat export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Mouse SEQ1_DIR=/scratch/mus/mm4/softNib # RMSK not currently used SEQ1_RMSK=/scratch/mus/mm4/rmsk # FLAG not currently used SEQ1_FLAG=-rodent SEQ1_SMSK=/scratch/mus/mm4/linSpecRep.notInRat SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Rat SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs # RMSK not currently used SEQ2_RMSK=/cluster/bluearc/rat/rn3/rmsk # FLAG not currently used SEQ2_FLAG=-rodent SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm4/bed/blastz.rn3 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # prepare first cluster run ../../jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... # Completed: 39728 of 39729 jobs # Crashed: 1 jobs # CPU time in finished jobs: 16243094s 270718.23m 4511.97h 188.00d 0.515 y # IO & Wait Time: 1164706s 19411.77m 323.53h 13.48d 0.037 y # Average job time: 438s 7.30m 0.12h 0.01d # Longest job: 8391s 139.85m 2.33h 0.10d # Submission to last job: 189809s 3163.48m 52.72h 2.20d # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kkr1u00 cd /cluster/data/mm4/bed/blastz.rn3 source DEF ../../jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# Completed: 323 of 323 jobs # CPU time in finished jobs: 10597s 176.62m 2.94h 0.12d 0.000 y # IO & Wait Time: 207530s 3458.83m 57.65h 2.40d 0.007 y # Average job time: 675s 11.26m 0.19h 0.01d # Longest job: 1613s 26.88m 0.45h 0.02d # Submission to last job: 2456s 40.93m 0.68h 0.03d # Third cluster run to convert lav's to axt's ssh kk cd /cluster/data/mm4/bed/blastz.rn3 source DEF ../../jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... # Completed: 44 of 44 jobs # CPU time in finished jobs: 10649s 177.48m 2.96h 0.12d 0.000 y # IO & Wait Time: 129039s 2150.65m 35.84h 1.49d 0.004 y # Average job time: 3175s 52.91m 0.88h 0.04d # Longest job: 18946s 315.77m 5.26h 0.22d # Submission to last job: 19121s 318.68m 5.31h 0.22d # translate sorted axt files into psl ssh kksilo cd /cluster/data/mm4/bed/blastz.rn3 mkdir -p pslChrom set tbl = "blastzRn3" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # That takes about 20 minutes # Load database tables ssh hgwdev set tbl = "blastzRn3" cd /cluster/data/mm4/bed/blastz.rn3/pslChrom /cluster/bin/i386/hgLoadPsl -noTNameIx mm4 chr*_${tbl}.psl # CHAIN Rn3 BLASTZ (DONE - 2003-10-30 - Hiram) # Experiment, run this on kolossus ssh kolossus cd ~/kent.$MACHTYPE/src/hg/mouseStuff/axtChain make cd ../axtFilter make mkdir -p /cluster/data/mm4/bed/blastz.rn3/axtChain/run1 cd /cluster/data/mm4/bed/blastz.rn3/axtChain/run1 mkdir out chain ls -1S /cluster/data/mm4/bed/blastz.rn3/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP ./doChain $(path1) chain/$(root1).chain out/$(root1).out #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh $HOME/bin/$MACHTYPE/axtFilter -notQ_random $1 | \ $HOME/bin/$MACHTYPE/axtChain stdin \ /cluster/store6/mm4/nib \ /cluster/store3/rn3/softNib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain mkdir out chain gensub2 input.lst single gsub 
jobList # copy the jobList to runJobs.sh echo "#!/bin/sh" > runJobs.sh cat jobList >> runJobs.sh chmod +x runJobs.sh nohup ./runJobs.sh & # Well, maybe that wasn't such a good idea. That shell # script took about 7 hours to run. I'd recommend the small # kluster or the kk9 kluster for this job # now on the cluster server, sort chains ssh kksilo cd /cluster/data/mm4/bed/blastz.rn3/axtChain time chainMergeSort run1/chain/*.chain > all.chain # real 22m15.194s # user 14m9.970s # sys 2m18.350s time chainSplit chain all.chain # real 99m40.570s # user 13m41.590s # sys 29m44.260s # Load chains into database # next machine ssh hgwdev cd /cluster/data/mm4/bed/blastz.rn3/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain mm4 ${c}_chainRn3 $i echo done $c end # problem loading chr13 and chr7 - rerun these two foreach i (chr13.chain chr7.chain) set c = $i:r hgLoadChain mm4 ${c}_chainRn3 $i echo done $c end # NET RAT BLASTZ (DONE - 2003-10-31 - Hiram) ssh kksilo cd /cluster/data/mm4/bed/blastz.rn3/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/mm4/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../preNet/$i end # real 38m28.442s # user 11m24.220s # sys 3m24.980s cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/mm4/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 1855098880, utime 8760 s/100, stime 2368 # The netClass operations requires an "ancientRepeat" table to exist # in either mm4 or rn3. So, create the table: ssh hgwdev mkdir -p /cluster/data/mm4/bed/ancientRepeat cd /cluster/data/mm4/bed/ancientRepeat # mysqldump needs write permission to this directory chmod 777 . hgsqldump --all --tab=. hg16 ancientRepeat chmod 775 . 
hgsql mm4 < ancientRepeat.sql
# load into mm4 (not hg16) -- the dump came from hg16, this copy lives in mm4,
# as confirmed by the import result below
mysqlimport -u <user> -p mm4 ancientRepeat.txt
# mm4.ancientRepeat: Records: 133 Deleted: 0 Skipped: 0 Warnings: 0
# This is a hand curated table obtained from Arian.
ssh hgwdev
cd /cluster/data/mm4/bed/blastz.rn3/axtChain
# NOTE(review): -tNewR points at linSpecRep.notInHuman; for the rat net this
# presumably should be linSpecRep.notInRat (cf. SEQ1_SMSK in the rat DEF) -- verify
time netClass hNoClass.net mm4 rn3 rat.net \
    -tNewR=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman \
    -qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
# real    12m58.603s
# user    8m7.610s
# sys     1m40.280s
# If things look good do
ssh kksilo
cd /cluster/data/mm4/bed/blastz.rn3/axtChain
# n1 is a directory (created above), so a recursive remove is required
rm -fr n1 hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn rat.net > ratSyn.net
# real    5m52.054s
# user    3m24.820s
# sys     0m44.020s
# Load the nets into database
ssh hgwdev
cd /cluster/data/mm4/bed/blastz.rn3/axtChain
time netFilter -minGap=10 rat.net | hgLoadNet mm4 netRn3 stdin
# real    17m0.651s
# user    4m51.440s
# sys     0m36.060s
time netFilter -minGap=10 ratSyn.net | hgLoadNet mm4 syntenyNetRn3 stdin
# real    14m4.567s
# user    4m41.660s
# sys     0m34.420s
# Add entries for net and chain to mouse/mm4 trackDb
# make net
ssh kksilo
cd /cluster/data/mm4/bed/blastz.rn3/axtChain
mkdir ratNet
time netSplit rat.net ratNet
# real    28m29.554s
# user    4m48.770s
# sys     4m3.580s
mkdir ../axtNet
foreach n (ratNet/chr*.net)
    set c=$n:t:r
    echo "netToAxt: $c.net -> $c.axt"
    rm -f ../axtNet/$c.axt
    netToAxt ratNet/$c.net chain/$c.chain \
        /cluster/data/mm4/nib \
        /cluster/data/rn3/nib \
        ../axtNet/$c.axt
    echo "Complete: $c.net -> ../axtNet/$c.axt"
end
ssh hgwdev
mkdir -p /cluster/data/mm4/bed/blastz.rn3/axtBest
cd /cluster/data/mm4/bed/blastz.rn3/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area ssh hgwdev cd /cluster/data/mm4/bed/blastz.rn3/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/mm4/vsRn3/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm4/vsRn3/axtNet cd /usr/local/apache/htdocs/goldenPath/mm4/vsRn3/axtNet gzip *.axt # add README.txt file to dir, if needed # Convert those axt files to psl ssh kksilo cd /cluster/data/mm4/bed/blastz.rn3 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestRn3.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestRn3.psl echo "Done: ${c}_blastzBestRn3.psl" end # Load tables ssh hgwdev cd /cluster/data/mm4/bed/blastz.rn3/pslBest time /cluster/bin/i386/hgLoadPsl -noTNameIx mm4 chr*_blastzBestRn3.psl # real 9m7.118s # user 2m32.780s # sys 0m25.300s # check results # the original axtBest stuff from the axtBest operation: # featureBits mm4 blastzBestRn3 # 1780774716 bases of 2627444668 (67.776%) in intersection # featureBits mm3 blastzBestRat # 1577582189 bases of 2505900260 (62.955%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/mm4/axtBestRn3 cd /gbdb/mm4/axtBestRn3 ln -s /cluster/data/mm4/bed/blastz.rn3/axtNet/chr*.axt . 
cd /cluster/data/mm4/bed/blastz.rn3/axtNet rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/mm4/axtBestRn3/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql mm4 < ~/kent/src/hg/lib/axtInfo.sql hgsql mm4 < axtInfoInserts.sql # MAKING THE AXTTIGHT FROM AXTBEST (DONE - 2003-10-31 - Hiram) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh kksilo cd /cluster/data/mm4/bed/blastz.rn3/axtNet mkdir ../axtTight tcsh foreach i (*.axt) echo $i subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn3.psl echo "Done: $i" end # Load tables into database ssh hgwdev cd /cluster/data/mm4/bed/blastz.rn3/pslTight time hgLoadPsl -noTNameIx mm4 chr*_blastzTightRn3.psl # real 9m53.256s # user 2m2.170s # sys 0m17.370s # copy to axt's to download area cd /cluster/data/mm4/bed/blastz.rn3/axtTight mkdir -p /usr/local/apache/htdocs/goldenPath/mm4/vsRn3/axtTight cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm4/vsRn3/axtTight cd /usr/local/apache/htdocs/goldenPath/mm4/vsRn3/axtTight gzip *.axt # add README.txt file to dir, if needed # MAKING RAT AND HUMAN SYNTENY (DONE - 2003-11-05 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm4/bed/syntenyRn3 cd /cluster/data/mm4/bed/syntenyRn3 # updating the scripts in use here from # /cluster/data/hg16/bed/syntenyMm3 cp -p /cluster/data/hg16/bed/syntenyMm3/*.pl . # fix the syntenicBest script to not try and work on empty # results from its queries. 
Also, set the db and table name
# in the script itself so the arguments are not needed
./syntenicBest.pl
# on the order of 3 to 4 hours to complete syntenicBest
# almost no time, or only a few minutes at most for any of
# the rest
./smooth.pl
./joinsmallgaps.pl
# set db and table name in fillgap.pl
./fillgap.pl
./synteny2bed.pl
hgLoadBed mm4 syntenyRn3 ucsc100k.bed
# same thing for Hg16
mkdir -p /cluster/data/mm4/bed/syntenyHg16
cd /cluster/data/mm4/bed/syntenyHg16
cp ../syntenyRn3/syntenicBest.pl .
# fixup table name to be Hg16
./syntenicBest.pl
../syntenyRn3/smooth.pl
../syntenyRn3/joinsmallgaps.pl
cp ../syntenyRn3/fillgap.pl .
# fixup table name to be Hg16
./fillgap.pl
../syntenyRn3/synteny2bed.pl
hgLoadBed mm4 syntenyHg16 ucsc100k.bed
# check results
# featureBits mm3 syntenyRat
# 2384684351 bases of 2505900260 (95.163%) in intersection
# featureBits mm4 syntenyRn3
# 2397585098 bases of 2627444668 (91.252%) in intersection
# featureBits mm3 syntenyHuman
# 2343924066 bases of 2505900260 (93.536%) in intersection
# featureBits mm4 syntenyHg16
# 2299774191 bases of 2627444668 (87.529%) in intersection
# BLASTZ Self (IN PROGRESS - 2003-10-21 - Hiram)
ssh kk
# Create artificial linSpecRep.notInMouse
# The procedure for lineage spec business with self is to simply
# use the actual repeat masker output for this mouse assembly as
# the lineage specific repeats for itself.  Thus, merely make
# symlinks to the repeat masker out files and name them as expected
# for blastz.  In this case they are called notInMouse but they
# really mean InMouse.  When asked what to use for this,
# Scott Schwartz says:
# It depends on what you want to do with the results.
# If you don't mind never having any RepeatMasker stuff
# align, then it should be fine.  But one could imagine
# other cases.
cd /cluster/bluearc/scratch/mus/mm4 mkdir linSpecRep.notInMouse cd linSpecRep.notInMouse foreach f (../rmsk/*.fa.out) set base = $f:t:r:r echo $base.out.spec ln -s $f $base.out.spec end mkdir -p /cluster/data/mm4/bed/blastzSelf cd /cluster/data/mm4/bed/blastzSelf cat << '_EOF_' > DEF # mouse vs. mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Mouse SEQ1_DIR=/scratch/mus/mm4/softNib # RMSK not currently used SEQ1_RMSK=/scratch/mus/mm4/rmsk # FLAG not currently used SEQ1_FLAG=-rodent SEQ1_SMSK=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInMouse SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Mouse SEQ2_DIR=/iscratch/i/mm4/softNib # RMSK not currently used SEQ2_RMSK=/iscratch/i/mm4/rmsk # FLAG not currently used SEQ2_FLAG=-rodent SEQ2_SMSK=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/mm4/bed/blastzSelf DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # First cluster run cd /cluster/data/mm4/bed/blastzSelf source DEF ../../jkStuff/BlastZ_run0.sh cd run para try, check, push, check, .... # Completed: 41344 of 41344 jobs # CPU time in finished jobs: 21214466s 353574.44m 5892.91h 245.54d 0.673 y # IO & Wait Time: 2233362s 37222.69m 620.38h 25.85d 0.071 y # Average job time: 567s 9.45m 0.16h 0.01d # Longest job: 338309s 5638.48m 93.97h 3.92d # Submission to last job: 774549s 12909.15m 215.15h 8.96d # note, one of those jobs took 94 hours ! # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kkr1u00 cd /cluster/data/mm4/bed/blastzSelf source DEF ../../jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... 
# Completed: 323 of 323 jobs # CPU time in finished jobs: 70707s 1178.45m 19.64h 0.82d 0.002 y # IO & Wait Time: 14609s 243.48m 4.06h 0.17d 0.000 y # Average job time: 264s 4.40m 0.07h 0.00d # Longest job: 5502s 91.70m 1.53h 0.06d # Submission to last job: 10167s 169.45m 2.82h 0.12d # Third cluster run to convert lav's to axt's ssh kk cd /cluster/data/mm4/bed/blastzSelf source DEF ../../jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... # Completed: 37 of 44 jobs # Crashed: 7 jobs # CPU time in finished jobs: 12417s 206.95m 3.45h 0.14d 0.000 y # IO & Wait Time: 186792s 3113.20m 51.89h 2.16d 0.006 y # Average job time: 5384s 89.73m 1.50h 0.06d # Longest job: 33349s 555.82m 9.26h 0.39d # Submission to last job: 33401s 556.68m 9.28h 0.39d # failed: chr17, chr1_random, chr2, chr3, chr5, chr7, chrUn_random # finish these on kolossus cat << '_EOF_' > runFailed.sh #!/bin/sh for c in 17 1_random 2 3 5 7 Un_random do time ../../../jkStuff/x86_64-chromlav2axt \ /cluster/data/mm4/bed/blastzSelf/lav/chr${c} \ /cluster/data/mm4/bed/blastzSelf/axtChrom/chr${c}.axt \ /cluster/data/mm4/nib /cluster/data/mm4/nib done '_EOF_' # chmod +x runFailed.sh ./runFailed.sh > runFailed.out 2>&1 XXXX running 2003-11-03 08:35 # PRODUCING GENSCAN PREDICTIONS (DONE - 2003-10-27 - Hiram) # Must run this on the small cluster since this # process requires more memory than is found in the KK nodes ssh kkr1u00 mkdir -p /cluster/data/mm4/bed/genscan cd /cluster/data/mm4/bed/genscan # Make subdirectories for genscan to put their output files in mkdir gtf pep subopt ls -1S /cluster/data/mm4/?{,?}/chr*_*/chr*_*.fa.masked \ > genome.list # Create template file, gsub, for gensub2. 
cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try para check para push # with only 6 CPUs available: # Completed: 598 of 599 jobs # Crashed: 1 jobs # CPU time in finished jobs: 906601s 15110.02m 251.83h 10.49d 0.029 y # IO & Wait Time: 206027s 3433.78m 57.23h 2.38d 0.007 y # Average job time: 1861s 31.01m 0.52h 0.02d # Longest job: 76738s 1278.97m 21.32h 0.89d # Submission to last job: 196259s 3270.98m 54.52h 2.27d # The failed job, rerun on Itanium: ssh snort cd kent.ia64/src/hg/gsBig make cd /cluster/data/mm4/bed/genscan # It would not run at window=2400000, but it did with 2000000 $HOME/bin/$MACHTYPE/gsBig /cluster/data/mm4/2/chr2_15/chr2_15.fa.masked gtf/chr2_15.fa.gtf -trans=pep/chr2_15.fa.pep -subopt=subopt/chr2_15.fa.bed -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2000000 # Run time 209 seconds : 0.2 s / kb # Got 118 exons # Got 18 genes # Got 216 suboptimal exons # Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/mm4/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/chr*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/chr*.bed > \ /dev/null cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/mm4/bed/genscan ldHgGene mm4 genscan genscan.gtf # Read 46885 transcripts in 331775 lines in 1 files # 46885 groups 44 seqs 1 sources 1 feature types # 46885 gene predictions hgPepPred mm4 generic genscanPep genscan.pep 
hgLoadBed mm4 genscanSubopt genscanSubopt.bed > /dev/null
# MAKE DOWNLOADABLE SEQUENCE FILES (IN PROGRESS 2003-10-21 Hiram)
ssh kksilo
cd /cluster/data/mm4
# Build the .zip files
jkStuff/zipAll.sh >&! zipAll.log &
# bash: ./jkStuff/zipAll.sh > zipAll.log 2>&1 &
tail -f zipAll.log
mkdir zip
mv *.zip zip
cd zip
# Look at zipAll.log to make sure all file lists look reasonable.
# Check zip file integrity:
foreach f (*.zip)
    unzip -t $f > $f.test
    tail -1 $f.test
end
wc -l *.zip.test
# 47 chromAgp.zip.test
# 46 chromFa.zip.test
# 46 chromFaMasked.zip.test
# 46 chromOut.zip.test
# 46 chromTrf.zip.test
# 601 contigAgp.zip.test
# 601 contigFa.zip.test
# 601 contigFaMasked.zip.test
# 601 contigOut.zip.test
# 601 contigTrf.zip.test
#3236 total
ssh hgwdev
cd /cluster/data/mm4/jkStuff
# create generic copy program
cat << '_EOF_' > cpToWeb.sh
#!/bin/sh
if [ $# -ne 1 ]; then
    echo "usage: cpToWeb.sh <db>"
    echo -e "\texample: cpToWeb.sh mm4"
    exit 255
fi
GP=/usr/local/apache/htdocs/goldenPath/$1
mkdir -p ${GP}
mkdir -p ${GP}/chromosomes
for f in ../?/*.fa ../??/*.fa
do
    BN=`basename ${f}`
    zip -j ${GP}/chromosomes/${BN}.zip ${f}
    echo "zipped: ${BN}"
done
mkdir -p ${GP}/bigZips
for Z in *.zip
do
    cp -p ${Z} ${GP}/bigZips
    echo "copied: ${Z}"
done
'_EOF_'
chmod +x cpToWeb.sh
cd /cluster/data/mm4/zip
../jkStuff/cpToWeb.sh mm4
cd /usr/local/apache/htdocs/goldenPath/mm4
# Take a look at bigZips/* and chromosomes/*, update their README.txt's
# Make the upstream sequence files.
# NOTE: must be redone due to bad gap track cd bigZips featureBits mm4 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa rm upstream1000.fa featureBits mm4 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa rm upstream2000.fa featureBits mm4 refGene:upstream:5000 -fa=upstream5000.fa zip upstream5000.zip upstream5000.fa rm upstream5000.fa # mrna zips -- auto dump process takes care of this # CREATE CYTOBAND TRACK # Can be done at any time ssh hgwdev cd /cluster/data/mm4 mkdir cytoBand cd cytoBand # Get file from NCBI - might be Mouse400.dat.gz for future wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.32/ideogram.gz gunzip ideogram # Had to modify chr16 coordinates because chrom size too big. Simply # scaled sizes down to fit in correct chrom size. # Create bed file /cluster/bin/scripts/createNcbiCytoBand ideogram # Load the bed file hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm4 cytoBand cytoBand.bed # Make cytoBandIdeo track for ideogram gif on hgTracks page. # For mouse cytoBandIdeo is just a replicate of the cytoBand track. 
# Make the cytoBand track (above) and then: echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" | hgsql mm4 # UPDATE BACEND SEQUENCES (DONE - 2003-10-27 - Hiram) # Download new files ssh kksilo mkdir /cluster/data/mm4/bed/bacends/ncbi cd /cluster/data/mm4/bed/bacends/ncbi wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/AllBACends.mfa.gz wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/cl_acc_gi_len.gz wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/README gunzip AllBACends.mfa.gz gunzip cl_acc_gi_len.gz # Convert fa file cat << '_EOF_' > convert.pl #!/usr/local/bin/perl -w use strict; while (my $line = <>) { if (substr($line,0,1) ne ">") { print $line; } else { my @fields = split(/\|/, $line); my $printed = 0; for (my $i = 0; $i < $#fields; $i++) { if ($fields[$i] eq "gb") { (my $name, my $vers) = split(/\./,$fields[$i+1]); print ">$name\n"; $i= $#fields; $printed = 1; } } if (!$printed) { die("Failed for $line\n"); } } } '_EOF_' chmod +x convert.pl ./convert.pl < AllBACends.mfa > BACends.fa # Create new pairs files convertBacEndPairInfo cl_acc_gi_len # Split file into pieces and copy to cluster to propagate ssh kksilo /cluster/bin/i386/faSplit sequence BACends.fa 100 BACends cd /cluster/data/mm4/bed/bacends/ncbi rm /cluster/bluearc/scratch/mus/mm4/bacEnds mv BACends???.fa /cluster/bluearc/scratch/mus/mm4/bacEnds cp -p BACends.fa /cluster/bluearc/scratch/mus/mm4/bacEnds # Ask for propagation from sysadmin # Load the sequences (change bacends.# to match correct location) ssh hgwdev mkdir /gbdb/mm4/bacends cd /gbdb/mm4/bacends ln -s /cluster/data/mm4/bed/bacends/ncbi/BACends.fa . cd /tmp hgLoadSeq mm4 /gbdb/mm4/bacends/BACends.fa #Adding /gbdb/mm4/bacends/BACends.fa) #452237 sequences #Updating seq table # One additional step 9/10/04 Fan. # Create a composite index to speed up hgTracks display when BAC Ends track selected. 
hgsql mm4 -e 'create index bacIndex2 on all_bacends(tName, qName(8));' # This will take hours. #All done # BACEND SEQUENCE ALIGNMENTS (DONE - 2003-10-28 - Hiram) # (alignments done without RepeatMasking) # We need an ooc file for this genome ssh kksilo mkdir /cluster/data/mm4/ooc cd /cluster/data/mm4/ooc ls ../unmaskedNib/chr*.nib > nib.list blat -makeOoc=11.ooc -repMatch=1024 nib.list nib.list output.psl # Wrote 26077 overused 11-mers to 11.ooc # Did not end using this. Used an old one instead. # Create full sequence alignments ssh kk cd /cluster/data/mm4/bed/bacends /cluster/bin/scripts/splitContigList -scratch /iscratch/i/mm4/trfFa 1 # allow blat to run politely in /tmp while it writes output, then # copy results to results file: cat << '_EOF_' > runBlat.sh #!/bin/sh path1=$1 path2=$2 root1=$3 root2=$4 result=$5 rm -fr /tmp/${root1}_${root2} mkdir /tmp/${root1}_${root2} pushd /tmp/${root1}_${root2} /cluster/bin/i386/blat ${path1} ${path2} -ooc=/scratch/hg/h/mouse11.ooc \ ${root1}.${root2}.psl popd rm -f ${result} mv /tmp/${root1}_${root2}/${root1}.${root2}.psl ${result} rm -fr /tmp/${root1}_${root2} '_EOF_' # << this line keeps emacs coloring happy chmod +x runBlat.sh cat << '_EOF_' > template #LOOP ./runBlat.sh {check in exists $(path1)} {check in exists $(path2)} $(root1) $(root2) {check out line+ bacEnds.out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' ls -1S /iscratch/i/mm4/bacEnds/BACends???.fa > bacEnds.lst mkdir bacEnds.out # create results directories for each to avoid the all result files in # one directory problem foreach f (`cat bacEnds.lst`) set b = $f:t:r echo $b mkdir bacEnds.out/$b end gensub2 contig.lst bacEnds.lst template jobList para create jobList # 58702 jobs written to batch para try, check, push, etc ... 
# Completed: 58702 of 58702 jobs # CPU time in finished jobs: 3702485s 61708.08m 1028.47h 42.85d 0.117 y # IO & Wait Time: 1321608s 22026.80m 367.11h 15.30d 0.042 y # Average job time: 86s 1.43m 0.02h 0.00d # Longest job: 2767s 46.12m 0.77h 0.03d # Submission to last job: 5781s 96.35m 1.61h 0.07d # Compile alignments and lift the files (takes a while) ssh kksilo cd /cluster/data/mm4/bed/bacends time pslSort dirs raw.psl temp bacEnds.out/* # real 852m20.286s <- over 14 hours ! # user 522m19.890s # sys 65m43.910s ssh kolossus time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons raw.psl bacEnds.psl /dev/null # Processed 540122688 alignments # 5289.06user 306.10system 1:56:57elapsed # 79%CPU (0avgtext+0avgdata 0maxresident)k # 0inputs+0outputs (146major+16134522minor)pagefaults 0swaps # time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons raw.psl bacEnds.psl /dev/null # Processed 540122688 alignments # real 157m1.227s # user 113m57.480s # sys 17m13.940s rmdir temp # You will want to keep this file around until later processing is # proven correct rm raw.psl # 68 Gb ! It takes a while even to remove it. ssh kksilo time /cluster/bin/scripts/lifter -psl -mouse /cluster/data/mm4 bacEnds.psl # 4037.85 # user 743.34 # system 1:48:09elapsed 73%CP # real 28m29.554s == 9 hours # user 4m48.770s # sys 4m3.580s cp -p ~booch/bacends/split.pl . cp -p ~booch/bacends/header . 
time ./split.pl header < bacEnds.psl.lifted # real 169.41 # user 38.91 # sys 43.44 # These are 2 Gb files: # -rw-rw-r-- 1 2275160342 Nov 1 15:15 bacEnds.psl # -rw-rw-r-- 1 2385296298 Nov 1 22:19 bacEnds.psl.lifted # we are up to 8 Gb for these files: # -rw-rw-r-- 1 8476583727 Oct 30 10:49 bacEnds.psl # -rw-rw-r-- 1 8892639725 Oct 30 23:04 bacEnds.psl.lifted cp -p bacEnds.psl.lifted bacEnds.psl.lifted.save time pslSort dirs bacEnds.psl.lifted temp split # real 1135.23 # user 934.31 # sys 112.83 #real 72m23.363s #user 58m48.260s #sys 7m13.090s rmdir temp rm -r split # Copy files to final destination and remove mkdir /cluster/data/mm4/bacends cp -p bacEnds.psl.lifted /cluster/data/mm4/bacends XXX done 2003-11-01 23:50 # BACEND PAIRS TRACK (WORKING - 2003-10-28 - Hiram) ssh kolossus cd /cluster/data/mm4/bacends # two line header file for the cat: cat << '_EOF_' > header bin chr start end clone score strand all features starts sizes names rid rcov lid lcov 10N 10 10N 10N 10 10N 10 10 10N 10 10 10 10N 10N 10N 10N '_EOF_' time ${HOME}/bin/${MACHTYPE}/pslPairs -min=50000 -max=300000 \ -short -long -orphan -mismatch -verbose bacEnds.psl.lifted \ ../bed/bacends/ncbi/bacEndPairs.txt bacEnds XXX running 2003-11-01 23:55 XXXX - running 2003-10-31 14:45 cat header ../bed/bacends/ncbi/bacEndPairs.txt | \ /cluster/bin/scripts/row score ge 300 | \ /cluster/bin/scripts/sorttbl chr start | \ /cluster/bin/scripts/headchg -del > bacEndPairs.bed # edit this Makefile to add this executable path to the scripts: # /projects/compbiousr/booch/booch/scripts/ # Update Makefile with new location of pairs/singles # files, if necessary (DONE) # Make initial file of alignments make bacEnds.rdb # Try to fish out more pairs make bacEndsMiss.psl # Re-make bacEnds.rdb with new info make bacEnds.rdb # Create bacEndPairs track file make bacEndPairs.bed # Create bacEndPairsBad and bacEndPairsLong files make bacEndPairsBad.bed # Create psl file to load make bacEnds.load.psl # Create database tables ssh 
hgwdev cd /projects/hg2/booch/psl/tables hgsql mm4 < all_bacends.sql hgsql mm4 < bacEndPairs.sql hgsql mm4 < bacEndPairsBad.sql hgsql mm4 < bacEndPairsLong.sql # Load the tables cd /cluster/data/mm4/bacends hgsql -e \ 'load data local infile "bacEnds.load.psl" into table all_bacends;' mm4 hgsql -e \ 'load data local infile "bacEndPairs.bed" into table bacEndPairs;' mm4 hgsql -e \ 'load data local infile "bacEndPairsBad.bed" into table bacEndPairsBad;' \ mm4 hgsql -e \ 'load data local infile "bacEndPairsLong.bed" into table bacEndPairsLong;' \ mm4 # featureBits mm4 all_bacends # 243096171 bases of 2627444668 (9.252%) in intersection # featureBits mm3 all_bacends # 220398676 bases of 2505900260 (8.795%) in intersection # featureBits mm4 bacEndPairs # 2549945356 bases of 2627444668 (97.050%) in intersection # featureBits mm3 bacEndPairs # 2508331798 bases of 2505900260 (100.097%) in intersection # featureBits mm4 bacEndPairsBad # 1074505863 bases of 2627444668 (40.895%) in intersection # featureBits mm3 bacEndPairsBad # 547271514 bases of 2505900260 (21.839%) in intersection # featureBits mm4 bacEndPairsLong # 2833125883 bases of 2627444668 (107.828%) in intersection # featureBits mm3 bacEndPairsLong # 2500534101 bases of 2505900260 (99.786%) in intersection XXXX - running 2003-11-04 10:32 # ADD MAP CONTIGS TRACK (DONE - 2003-11-04 - Hiram) ssh hgwdev mkdir -p /cluster/data/mm4/bed/ctgPos cd /cluster/data/mm4/bed/ctgPos # hgCtgPos uses the lift files... but mouse lift files are for the # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs # from the assembly. (In the future, we should go with the NT's!) # So... 
just for this release, go straight from the seq_contig.md # to the table def'n: contig, size, chrom, chromStart, chromEnd cat << '_EOF_' > parseSeqContig.pl #!/usr/local/bin/perl -w use strict; while (<>) { if (/^\d+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(N[TC]_\d+)\s+(\S+)\s+contig\s+\S+\s+\S+\s*$/i) { my $chr=$1; my $start=$2; $start -= 1; my $end=$3; my $ctg=$5; if ($chr !~ /N/ ) { print "$ctg\t" . ($end-$start) . "\tchr$chr\t$start\t$end\n"; } } } '_EOF_' chmod +x parseSeqContig.pl ./parseSeqContig.pl ../../ncbi/seq_contig.md > ctgPos.tab hgsql mm4 < ~/kent/src/hg/lib/ctgPos.sql echo "load data local infile 'ctgPos.tab' into table ctgPos" | hgsql mm4 # Note: the info is there in seq_contig.md to also do the _random's, # but we'd have to do some more work: duplicate the gaps of 50000 between # contigs for all _random's except chrUn_random (1000 between). # featureBits mm4 ctgPos # 2554101163 bases of 2627444668 (97.209%) in intersection # featureBits mm3 ctgPos # 2500661074 bases of 2505900260 (99.791%) in intersection # KNOWN GENES TRACK (DONE - 2003-11-20 - Hiram) # you will probably need to make the programs in kent/src/hg/protein cd ~/kent/src/hg/protein make # The scripts run below will check for programs and let you know # which ones are missing # obtain new SwissProt database (should be done about once a month) # the swiss prot data is currently living on store5, first step is # on the fileserver. This script was used once as it was created, # it may need to be verified and improved as it is used again. See # comments at the top of the script. ssh eieio cd /cluster/data/swissprot ~/kent/src/hg/protein/mkSwissProtDB.sh # that obtains the data and unpacks it, second step is on hgwdev # to create the database ssh hgwdev cd /cluster/data/swissprot ~/kent/src/hg/protein/mkSwissProtDB.sh # Now the proteins database can be created from that. Must be on hgwdev # Again, a script that has been used once upon creation, see # comments in it. 
For example currently it is assumed these two # scripts have been run on the same day. In this case 03112 ssh hgwdev cd /cluster/data/proteins ~/kent/src/hg/protein/mkProteinsDb.sh # with those two databases existing, read for the actual known genes # track build. Must be on hgwdev since it is all mostly database # operations. The {Date} argument is the date stamp created by the # above two scripts. Something of the form YYMMDD, e.g.: 031112 # Again, a script that has been used only once at creation, see # comments at top of script. ssh hgwdev mkdir /cluster/data/mm4/bed/knownGenes cd /cluster/data/mm4/bed/knownGenes DateStamp=031112 ~/kent/src/hg/protein/KGprocess.sh ${DateStamp} # that runs to a point where it prepares data and jobList for a # cluster run. Continue with a cluster run on kk ssh kk cd /cluster/data/mm4/bed/knownGenes/kgBestMrna para create jobList para try para check para push # this is a quick cluster job. Less than five minutes. e.g.: # Completed: 43580 of 43580 jobs # CPU time in finished jobs: 114636s 1910.60m 31.84h 1.33d 0.004 y # IO & Wait Time: 111889s 1864.82m 31.08h 1.30d 0.004 y # Average job time: 5s 0.09m 0.00h 0.00d # Longest job: 9s 0.15m 0.00h 0.00d # Submission to last job: 282s 4.70m 0.08h 0.00d # Continuing back on hgwdev, run the same script again ssh hgwdev cd /cluster/data/mm4/bed/knownGenes DateStamp=031112 ~/kent/src/hg/protein/KGprocess.sh ${DateStamp} # that should run to completion and the known genes track is ready # Add the proteins link into gdbPdb.hgcentral: hgsql -e 'INSERT INTO gdbPdb (genomeDb, proteomeDb) \ VALUES ("mm4","proteins031112");' \ -h genome-testdb hgcentraltest # CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK) # This depends on the go and uniProt databases as well as # the kgAlias and kgProAlias tables. The hgKgGetText takes # about 5 minutes when the database is not too busy. The rest # is real quick. 
ssh hgwdev cd /cluster/data/mm4/bed/ mkdir -p kgMm4/index cd kgMm4/index hgKgGetText mm4 knownGene.text ixIxx knownGene.text knownGene.ix knownGene.ixx ln -s /cluster/data/mm4/bed/kgMm4/index/knownGene.ix /gbdb/mm4/knownGene.ix ln -s /cluster/data/mm4/bed/kgMm4/index/knownGene.ixx /gbdb/mm4/knownGene.ixx # SET defaultDb in hgcentraltest to move from mm3 to mm4 hgsql -e 'update defaultDb set name = "mm4" where name = "mm3";' \ -h genome-testdb hgcentraltest # STS MARKERS TRACK (STARTED - 2003-11-25 - Hiram) # DONE - 2004-01-07 15:14 ! - this wasn't easy .... ssh kksilo mkdir -p /cluster/data/mm4/bed/STSmarkers/downloads cd /cluster/data/mm4/bed/STSmarkers/downloads # these files appear to be new almost every day wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases # these map files appear to be old, 2002 Data wget --timestamping \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/* # Picks up files: # 345184 Feb 20 2002 10090.MGD.txt # 173294 Jun 27 2002 10090.WI_Mouse_Genetic.txt # 240637 Jun 27 2002 10090.WI_Mouse_YAC.txt # 390088 Jun 27 2002 10090.Whitehead-MRC_RH.txt # If these files have not been changing, then no need to worry about # them. We are just picking them up to see if they have changed # since the last time we worked on this. # these reports from jax.org appear to be changing daily wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt wget --timestamping \ ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt # compare them with previous versions. 
Before this these were # in /cluster/store5/mouseMarker/orig # these newly picked up files: sum -r 10090* # 48882 338 10090.MGD.txt # 24176 381 10090.Whitehead-MRC_RH.txt # 62367 170 10090.WI_Mouse_Genetic.txt # 50616 235 10090.WI_Mouse_YAC.txt sum -r *.rpt # 21097 4161 MRK_Dump2.rpt # 15420 3139 MRK_Sequence.rpt # 41562 2233 PRB_PrimerSeq.rpt sum -r UniSTS* # 11981 8850 UniSTS.aliases # 42464 2291 UniSTS_mouse.sts # the previous copies cd /cluster/store5/mouseMarker/orig sum -r 10090* # 48882 338 10090.MGD.txt # 24176 381 10090.Whitehead-MRC_RH.txt # 62367 170 10090.WI_Mouse_Genetic.txt # 50616 235 10090.WI_Mouse_YAC.txt sum -r *.rpt # 36880 4160 MRK_Dump2.rpt # 02447 3132 MRK_Sequence.rpt # 57914 2220 PRB_PrimerSeq.rpt sum -r UniSTS* # 36201 8843 UniSTS.aliases # 58524 970 UniSTS_mouse.alias # 42464 2291 UniSTS_mouse.sts # back to our work area, update the bed file # to do this we need a new UniSTS_mouse.alias file # it is created by a combination of information from several # of the above files ! AND ! the previous stsInfoMouse.bed file # # This process has been captured in the script: # /cluster/data/mm4/bed/STSmarkers/downloads/fetchAllAliases.sh # which uses a couple of perl scripts in that same directory. 
# briefly it is: cd /cluster/data/mm4/bed/STSmarkers/downloads ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0 grep MGI: UniSTS.aliases > MGI.aliases ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \ stsInfoAliases.txt ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \ | sort -n > UniSTS_mouse.alias # with that, we can create a new stsInfoMouse.bed file: cd /cluster/data/mm4/bed/STSmarkers /cluster/store5/mouseMarker/code/updateBed.pl \ /cluster/store5/mouseMarker/stsInfoMouse.bed \ downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \ downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \ downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile /cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed # comparing to Mm3, this file was used there: # /projects/hg2/booch/psl/mm.2003.02/mm3/info/stsInfoMouseNew.bed # a wc of it shows: # 25341 339289 3572880 stsInfoMouseNew.bed # that is about half of what we have here: # 56406 786036 6425721 stsInfoMouse.bed # and from that, create new primer fa, epcr, etc: /cluster/store5/mouseMarker/code/luConvertPrimerToFa \ stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info # the mouseC.fa file will be empty wc mouse?.* # 0 0 0 mouseC.fa # 258307 258245 5815248 mouseP.fa # 29906 149545 1890926 mouseP.info # the equivalent Mm3 versions: # 243750 243721 5544726 musPrimer.fa == mouseP.fa # 25341 126705 1609168 mouse.primers == mouseP.info # copy the primers over to the bluearc for the kluster run cp -p mouseP.fa /cluster/bluearc/scratch/mus/mm4 cp -p mouseP.info /cluster/bluearc/scratch/mus/mm4 # CLUSTER RUN FOR THE STS PRIMERS ssh kk mkdir -p /cluster/data/mm4/bed/STSmarkers/primer mkdir -p /cluster/data/mm4/bed/STSmarkers/ePCR cd /cluster/data/mm4/bed/STSmarkers/primer # the mouseP.fa comes from above echo "/cluster/bluearc/scratch/mus/mm4/mouseP.fa" > primers.lst cat << '_EOF_' > 
template #LOOP /cluster/bin/i386/blat.2 $(path1) $(path2) -ooc=/scratch/hg/h/mouse11.ooc -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl} #ENDLOOP '_EOF_' mkdir primers.out /cluster/bin/scripts/splitContigList -mouse -scratch \ /cluster/bluearc/scratch/mus/mm4/trfFa 1 /cluster/bin/i386/gensub2 contig.lst primers.lst template jobList para create jobList para try para check para push ... etc ... # Completed: 599 of 599 jobs # CPU time in finished jobs: 315430s 5257.17m 87.62h 3.65d 0.010 y # IO & Wait Time: 42867s 714.45m 11.91h 0.50d 0.001 y # Average job time: 598s 9.97m 0.17h 0.01d # Longest job: 709s 11.82m 0.20h 0.01d # Submission to last job: 729s 12.15m 0.20h 0.01d # on the file server ssh kksilo cd /cluster/data/mm4/bed/STSmarkers/primer /cluster/bin/i386/pslSort dirs primers.psl temp primers.out rmdir temp # comparing results to mm3: # /projects/hg2/booch/psl/mm.2003.02/mm3/primers/primers.psl # Mm3 wc primers.psl # 4263983 89543582 440456942 primers.psl # Mm4 wc primers.psl /cluster/data/mm4/bed/STSmarkers/primer/primers.psl # 5745617 120657896 592135728 primers.psl XXX This is quite an increase, I wonder if it is reasonable ? # another kluster run ssh kk cd /cluster/data/mm4/bed/STSmarkers/ePCR ls -1S /cluster/bluearc/scratch/mus/mm4/trfFa > contig.lst mkdir epcr.out cat << '_EOF_' > template #LOOP /cluster/bin/scripts/luRunEpcr $(path1) $(path2) epcr.out/$(num2).epcr #ENDLOOP '_EOF_' # the mouseP.info was created above echo "/cluster/bluearc/scratch/mus/mm4/mouseP.info" > epcr.lst gensub2 epcr.lst contig.lst template jobList para create jobList para try para check para push ... etc ... 
# Completed: 599 of 599 jobs # CPU time in finished jobs: 135807s 2263.45m 37.72h 1.57d 0.004 y # IO & Wait Time: 32256s 537.60m 8.96h 0.37d 0.001 y # Average job time: 281s 4.68m 0.08h 0.00d # Longest job: 353s 5.88m 0.10h 0.00d # Submission to last job: 394s 6.57m 0.11h 0.00d ssh hgwdev # all those results become all.epcr cat epcr.out/*.epcr > all.epcr # comparing results to Mm3: # /projects/hg2/booch/psl/mm.2003.02/mm3/primers/all.epcr # Mm3 wc all.epcr # 24627 98508 1329559 all.epcr # Mm4 wc all.epcr /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr # 74705 298820 3971712 all.epcr XXX This is an unusual increase ? cd /cluster/data/mm4/bed/STSmarkers/primer # create primers.psl.filter.blat and epcr.not.found # must use -rat here, not the -mouse /cluster/bin/scripts/filterPrimers -rat \ ../stsInfoMouse.bed primers.psl ../mouseP.info \ ../ePCR/all.epcr > primers.psl.filter.blat 2> filterPrimers.err # The filterPrimers.err file should show an increasing count: # Reading name info # Reading primer info # Processing file # 100000 # 200000 # 300000 # ... # 5800000 # 5900000 # Determining ePCR not found # # Mm4: wc primers.psl.filter.blat # 32729 687309 3331894 primers.psl.filter.blat # Checking vs. 
Mm3 on kks00:
# /projects/hg2/booch/psl/mm.2003.02/mm3/primers/primers.psl.filter.blat
# wc of that shows:
#	26088  547848 2612720 primers.psl.filter.blat
# create accession_info.rdb (chrM added to Terry's script for mouse)
touch empty_sequence.inf
/cluster/bin/scripts/compileAccInfo -mouse \
	/cluster/data/mm4 empty_sequence.inf
# works with one seeming error:
# cat: /cluster/data/mm4/M/chrM_random.agp: No such file or directory
mv accession_info.rdb accession_info.rdb.tmp
/cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > accession_info.rdb
rm accession_info.rdb.tmp
# comparing results to Mm3:
# Mm3 wc accession_info.rdb
#	173948 1913432 12738822 accession_info.rdb
# Mm4 wc accession_info.rdb
#	86935 956289 6374930 accession_info.rdb
#### This is an interesting difference.  The structure of the .agp files
# for Mm3 and Mm4 show a large difference in line counts:
# wc /cluster/data/mm3/?/*.agp /cluster/data/mm3/??/*.agp
#	339937 2893447 18141029 total
# wc /cluster/data/mm4/?/*.agp /cluster/data/mm4/??/*.agp
#	219652 1885501 11875772 total
# creates epcr.not.found.nomatch and epcr.not.found.psl
/cluster/bin/scripts/epcrToPsl -mouse \
	epcr.not.found ../mouseP.info \
	accession_info.rdb /cluster/data/mm4
# Comparing results to Mm3:
# Mm3 wc epcr*
#	205  820  7585 epcr.not.found
#	1   12   95 epcr.not.found.nomatch
#	204 4284 19817 epcr.not.found.psl
# Mm4 wc epcr*
#	328 1312 12011 epcr.not.found
#	57  684  5474 epcr.not.found.nomatch
#	271 5691 26224 epcr.not.found.psl
# there is a single error being propagated here from the file
# /cluster/store5/mouseMarker/stsInfoMouse.bed which has an error
# at line 53958:
62943 D2J3 91947 D2J3 CAACCAGCTCAC CAACCAGCTCAC 1825, 1025BP 0 MUS MUSCULUS
# The value '1825,' is incorrect.  Should be a small integer here.
# to work around this problem, I'm manually eliminating this problem # from the epcr.not.found.psl file where it has now become two bad # lines: # 24 0 0 0 1 1801 1 1789 + 62943 1825, 0 1825, chr11_16 0 1115413 1117226 2 12,12, 0,1813, 1115413,1117214, # 24 0 0 0 1 1801 1 1789 + 62943 1825, 0 1825, chr11_16 0 1115413 1117226 2 # taking those two lines out, find another two lines: # 24 0 0 0 1 951 1 994 + 64793 975, 0 975, chr5_23 0 3405277 3406295 2 12,12, 0,963, 3405277,3406283, # 24 0 0 0 1 951 1 994 + 64793 975, 0 975, chr5_23 0 3405277 3406295 2 12,12, 0,963, 3405277,3406283, # And one more line that had a 930BP in the wrong place, the last line # in this file. # combine them to one cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter # lift those primers (added chrM to this lifter script for mouse) # creates primers.psl.filter.lifted /cluster/bin/scripts/lifter -mouse -psl \ /cluster/data/mm4 primers.psl.filter # wc primers.psl.filter.lifted # 32995 692895 3488746 primers.psl.filter.lifted # create primers.psl.filter.lifted.initial PATH=/cluster/bin/scripts:$PATH \ /cluster/bin/scripts/extractPslInfo \ primers.psl.filter.lifted # wc primers.psl.filter.lifted.initial # 32983 197898 1758225 primers.psl.filter.lifted.initial # create primers.psl.filter.lifted.initial.acc /cluster/bin/scripts/findAccession -agp \ -mouse primers.psl.filter.lifted.initial /cluster/data/mm4 # wc primers.psl.filter.lifted.initial.acc # 32983 230881 2083248 primers.psl.filter.lifted.initial.acc # this needs to be -rat as that specifies how to scan the # stsInfoMouse.bed file and it does not work if you use -mouse /cluster/bin/scripts/getStsId -rat \ ../stsInfoMouse.bed primers.psl.filter.lifted.initial.acc \ > primers.initial.acc.trans # wc primers.initial.acc.trans # 32983 230881 1771293 primers.initial.acc.trans sort -k 4n primers.initial.acc.trans > primers.final rm primers.psl.filter.lifted.initial.acc primers.initial.acc.trans # comparing results to Mm3: # Mm3 wc 
primers.final # 26292 184044 1253365 primers.final # wc primers.final # 32983 230881 1771293 primers.final cd /cluster/data/mm4/bed/STSmarkers # stsMarkers.final is empty for mouse touch stsMarkers.final dummy PATH=/cluster/bin/scripts:$PATH \ /cluster/bin/scripts/combineSeqPrimerPos \ stsMarkers.final primer/primers.final > stsMarkers_pos.rdb # Comparing results to Mm3 # Mm3 wc stsMarkers_pos.rdb # 25925 181475 1413458 stsMarkers_pos.rdb # Mm4 wc stsMarkers_pos.rdb # 31270 218890 1869417 stsMarkers_pos.rdb /projects/cc/hg/ytlu/bin/script/perl/createStsBed \ stsInfoMouse.bed stsMarkers_pos.rdb 500 > stsMapMouse.bed # wc stsMapMouse.bed # 27976 307388 2198948 stsMapMouse.bed # loading STS markers tables ssh hgwdev cd /cluster/data/mm4/bed/STSmarkers ./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings # wc ucscStsAlias.tab # 114257 342771 2746002 ucscStsAlias.tab hgsql -e "drop table stsAlias;" mm4 hgsql mm4 < ~/kent/src/hg/lib/stsAlias.sql hgsql -e \ 'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm4 hgsql -e "drop table stsMapMouseNew;" mm4 hgsql mm4 < ~/kent/src/hg/lib/stsMapMouseNew.sql hgsql -e \ 'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm4 hgsql -e "drop table stsInfoMouseNew;" mm4 hgsql mm4 < ~/kent/src/hg/lib/stsInfoMouseNew.sql hgsql -e \ 'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm4 hgLoadPsl -nobin -table=all_sts_primer mm4 primer/primers.psl.filter.lifted # load primer sequences mkdir /gbdb/mm4/stsMarker ln -s /cluster/data/mm4/bed/STSmarkers/mouseP.fa \ /gbdb/mm4/stsMarker/mouseP.fa hgLoadSeq mm4 /gbdb/mm4/stsMarker/mouseP.fa # Adding /gbdb/mm4/stsMarker/mouseP.fa # 29906 sequences # DONE - 2004-01-07 15:14 # STS Markers plots - needs to wait until stsMarkers_pos_all.rdb # is created correctly from above, also needs the three maps files ### XXXXX These are obsolete, not necessary /cluster/bin/scripts/combineAllSeqPrimer \ -rat stsInfoMouse.bed \ dummy 
primer/primers.final > stsMarkers_pos_all.rdb # can not find this result for Mm3 # Mm4 wc stsMarkers_pos_all.rdb # 26811 59539 385261 stsMarkers_pos_all.rdb mkdir /cluster/data/mm4/bed/STSmarkers/html cd /cluster/data/mm4/bed/STSmarkers/html mkdir maps plots sts # not sure how to make the three maps/*.map files /projects/compbio/usr/booch/booch/scripts/mergeTPFsts -rat \ ../primer/accession_info.rdb ../stsMarkers_pos.rdb \ maps/wi_rh.id.map 3.0 ../stsMarkers_pos_all.rdb \ /cluster/store5/mouseMarker/stsInfoMouse.bed > rh.rdb mv map.ranges rh.ranges /projects/compbio/usr/booch/booch/scripts/mergeTPFsts -rat \ ../primer/accession_info.rdb ../stsMarkers_pos.rdb \ maps/wi_gen.id.map 3.0 ../stsMarkers_pos_all.rdb \ /cluster/store5/mouseMarker/stsInfoMouse.bed > wi_gen.rdb mv map.ranges wi_gen.ranges /projects/compbio/usr/booch/booch/scripts/mergeTPFsts -rat \ ../primer/accession_info.rdb ../stsMarkers_pos.rdb \ maps/mgd_gen.id.map 3.0 ../stsMarkers_pos_all.rdb \ /cluster/store5/mouseMarker/stsInfoMouse.bed > mgd.rdb mv map.ranges mgd.ranges /projects/compbio/usr/booch/booch/scripts/combineTPFsts \ rh.rdb wi_gen.rdb mgd.rdb > sts.rdb /projects/compbio/usr/booch/booch/scripts/TPFrdb2html \ -mouse maps/wi_rh.id.map -plain ASSEM sts sts.rdb mm4 sts mv *.rdb sts/ mv *.ranges sts/ cd plots STSFLAGS="-dbl -top 0.3 -mouse -m mm4" /projects/compbio/usr/booch/booch/scripts/createUCSCplots \ ${STSFLAGS} /cluster/data/mm4 ../maps ../../stsMarkers_pos.rdb /projects/compbio/usr/booch/booch/scripts/createUCSCplots -x 10000000 \ ${STSFLAGS} /cluster/data/mm4 ../maps ../../stsMarkers_pos.rdb /projects/compbio/usr/booch/booch/scripts/createUCSCplots -x 50000000 \ ${STSFLAGS} /cluster/data/mm4 ../maps ../../stsMarkers_pos.rdb LOAD GENEID GENES (DONE - 2003-11-27 - Hiram RELOADED -gtf 2004-04-02 kate) mkdir -p /cluster/data/mm4/bed/geneid/download cd /cluster/data/mm4/bed/geneid/download foreach f (/cluster/data/mm4/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget \ 
http://genome.imim.es/genepredictions/M.musculus/mmOct2003/geneid_v1.1/$chr.gtf wget \ http://genome.imim.es/genepredictions/M.musculus/mmOct2003/geneid_v1.1/$chr.prot end # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. ldHgGene mm4 geneid download/*.gtf -gtf # 36974 gene predictions hgPepPred mm4 generic geneidPep download/*-fixed.prot # SGP GENE PREDICTIONS (DONE - 2003-12-30 - Hiram) mkdir -p /cluster/data/mm4/bed/sgp/download cd /cluster/data/mm4/bed/sgp/download #http://genome.imim.es/genepredictions/M.musculus/mmOct2003/SGP/humangp200307 foreach f (/cluster/data/mm4/?{,?}/chr?{,?}{,_random}.fa) set chr = $f:t:r wget --timestamping \ http://genome.imim.es/genepredictions/M.musculus/mmOct2003/SGP/humangp200307/$chr.gtf wget --timestamping \ http://genome.imim.es/genepredictions/M.musculus/mmOct2003/SGP/humangp200307/$chr.prot end wget --timestamping \ http://genome.imim.es/genepredictions/M.musculus/mmOct2003/SGP/humangp200307/readme # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. 
ldHgGene mm4 sgpGene download/*.gtf -exon=CDS
# Read 43988 transcripts in 320332 lines in 21 files
# 43988 groups 21 seqs 1 sources 3 feature types
# 43988 gene predictions
hgPepPred mm4 generic sgpPep download/*-fixed.prot
# featureBits mm4 sgpGene
# 40100330 bases of 2627444668 (1.526%) in intersection
# featureBits mm3 sgpGene
# 38897454 bases of 2505900260 (1.552%) in intersection
# TWINSCAN 1.3 GENE PREDICTIONS (2003-12-15 braney)
cd /cluster/data/mm4/bed
rm -fr twinscan
mkdir twinscan.2003-12-15
ln -s twinscan.2003-12-15 twinscan
cd twinscan
tarFile=MGSCv3_v_NCBI34_11-24-03.tgz
wget http://genes.cs.wustl.edu/predictions/mouse/11-24-03/MGSCv3_v_NCBI34_11-24-03.tgz
wget http://genes.cs.wustl.edu/predictions/mouse/11-24-03/md5sum.txt
# check file transferred correctly
md5sum $tarFile | diff - md5sum.txt
tar xvfz $tarFile
unset tarFile
# pare down protein FASTA header to id and add missing .a:
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y )
    echo chr$c
    perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
end
ldHgGene mm4 twinscan chr_gtf/chr*.gtf -gtf
hgPepPred mm4 generic twinscanPep chr_ptx/chr*-fixed.fa
#### LOAD ENSEMBL GENES (DONE - 2004-02-19 - Hiram)
# needed for Gene Sorter procedure below
# Ensembl released Mouse build 32 the week of Feb 16 2004
mkdir /cluster/data/mm4/bed/ensembl
cd /cluster/data/mm4/bed/ensembl
Get the ensembl gene data from http://www.ensembl.org/
Go to the EnsMart link
Choose Mus musculus as the organism
Follow this sequence through the pages:
Page 1) Choose the Ensembl Genes choice. Hit next.
Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
Page 3) Choose the "Structures" tab.
Page 4) Choose GTF as the output, choose gzip compression, name the
output file ensGeneMm4.gtf.gz and then hit Export
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
zcat ensGene.gtf.gz | grep -v ^6_DR51 | grep -v _NT_ > unrandom.gtf # Let's see how much it loses: # zcat ensGene.gtf.gz | wc # 542591 7596274 77247475 # wc unrandom.gtf # 525809 7361326 74690888 unrandom.gtf # Add "chr" to front of each line in the gene data gtf file to make # it compatible with ldHgGene sed -e "s/^/chr/" unrandom.gtf > ensGene.gtf ldHgGene mm4 ensGene ensGene.gtf # Read 30665 transcripts in 525809 lines in 1 files # 30665 groups 21 seqs 1 sources 4 feature types # 30665 gene predictions # save space, gzip them: gzip unrandom.gtf gzip ensGene.gtf o - Load Ensembl peptides: Get the ensembl protein data from http://www.ensembl.org/ Go to the Martview link Choose Mus musculus as the organism Follow this sequence through the pages: Page 1) Choose the Ensembl Genes choice. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. Page 3) Choose the "Sequences" tab. Page 4) Choose Transcripts/Proteins and peptide Only as the output, choose text/fasta and gzip compression, name the file ensGeneMm4.pep.gz and then hit export. If needed, substitute ENST for ENSP in ensPep with the program called subs edit subs.in to read: ENSP|ENST #subs -e mm4EnsGene.pep > /dev/null #delete * at end of each protein zcat ensGeneMm4.pep.gz | sed "s/\*$//" > ensembl.pep ~matt/bin/fixPep.pl ensembl.pep fixPep_ensembl.pep hgPepPred mm4 generic ensPep fixPep_ensembl.pep o - Load ensGtp table. # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" tab. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format, gzip. 
# Result name file as ensGtpMm4.tab.gz gunzip ensGtpMm4.tab.gz hgsql mm4 < ~/kent/src/hg/lib/ensGtp.sql hgsql -N -e 'load data local infile "ensGtpMm4.tab" into table ensGtp;' mm4 ### MAKE THE affyU74 TRACK - needed for the Gene Sorter # (DONE - 2003-01-06 - Hiram) # MAKE THE affyU74 TRACK using Affy consensus sequences instead of # target sequences. Recalculate alignments and load data # [DONE, hartera, 2004-02-06] ---------------------------------- # Load up semi-local disk with target sequences for Affy mouse U74 chips. ssh kkr1u00 mkdir -p /iscratch/i/affy # This /projects filesystem is not available on kkr1u00 # but it is on kk ssh kk cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy ssh kkr1u00 iSync # Run cluster job to do alignments ssh kk mkdir /cluster/data/mm4/bed/affyU74.2004-02-06 cd /cluster/data/mm4/bed/affyU74.2004-02-06 mkdir run cd run mkdir psl echo /scratch/mus/mm4/trfFa/*.fa | wordLine stdin > genome.lst ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 genome.lst affy.lst gsub jobList para create jobList para try # do usual para check/para push etc. until the job is done. # para time: 2004-02-06 # Completed: 1797 of 1797 jobs # CPU time in finished jobs: 13759s 229.31m 3.82h 0.16d 0.000 y # IO & Wait Time: 37615s 626.92m 10.45h 0.44d 0.001 y # Average job time: 29s 0.48m 0.01h 0.00d # Longest job: 73s 1.22m 0.02h 0.00d # Submission to last job: 176s 2.93m 0.05h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU74.psl. ssh kksilo cd /cluster/data/mm4/bed/affyU74/run pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least minAli = 0.95. 
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null # Processed 44630 alignments liftUp ../all_affyU74.psl ../../../jkStuff/liftAll.lft warn contig.psl # Sort by chromosome and load into database. ssh hgwdev cd /cluster/data/mm4/bed/affyU74.2004-02-06 pslSortAcc nohead chrom temp all_affyU74.psl cat chrom/*.psl > affyU74.psl # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table hgLoadPsl mm4 affyU74.psl rm -fr chrom temp run ## MAKE THE affyGnfU74 TRACKs (DONE - 2004-01-06 - Hiram) # Make bed files and load consensus sequences for Affy U74 chip set. # [DONE, hartera, 2004-02-06] ---------------------------------- #This needs to be done after affyU74 is already made. ssh hgwdev mkdir -p /cluster/data/mm4/bed/affyGnf.2004-02-06 cd /cluster/data/mm4/bed/affyGnf.2004-02-06 # may need to build this command in src/hg/affyGnf affyPslAndAtlasToBed ../affyU74.2004-02-06/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \ affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2 affyPslAndAtlasToBed ../affyU74.2004-02-06/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \ affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2 affyPslAndAtlasToBed ../affyU74.2004-02-06/affyU74.psl \ /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \ affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2 # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;" # and reload data into table hgLoadBed mm4 affyGnfU74A affyGnfU74A.bed hgLoadBed mm4 affyGnfU74B affyGnfU74B.bed hgLoadBed mm4 affyGnfU74C affyGnfU74C.bed # Add in sequence data for U74 tracks. 
# Copy consensus sequence to /gbdb if it isn't already # [DONE, hartera, Feb 6, 2004] # Fix broken symlinks to microarray data after directory structure changed # (DONE, 2005-05-03, hartera) mkdir -p /gbdb/hgFixed/affyProbes cd /gbdb/hgFixed/affyProbes # fix broken symlinks after directory structure changed # /projects/compbiodata ----> /projects/compbio/data rm U74* # make correct symlinks (hartera, 2005-05-03) ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa . ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa . # used perl -pi.bak -e 's/;/ /' to remove ";" after probe name # reload sequences with prefix removed so acc matches name used in # other dependent tables hgLoadSeq -abbr=U74Av2: mm4 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa hgLoadSeq -abbr=U74Bv2: mm4 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa hgLoadSeq -abbr=U74Cv2: mm4 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa ### GNF ATLAS 2 [Done jk 3/29/2004] # Align probes from GNF1M chip. 
ssh kk cd /cluster/data/mm4/bed mkdir -p geneAtlas2/run/psl cd geneAtlas2/run mkdir -p /cluster/bluearc/geneAtlas2 cp /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /cluster/bluearc/geneAtlas2 ls -1 /scratch/mus/mm4/trfFa/ > genome.lst ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc /scratch/mus/mm4/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub gensub2 genome.lst mrna.lst gsub spec para create spec para try para check para push para time #Completed: 599 of 599 jobs #CPU time in finished jobs: 55883s 931.39m 15.52h 0.65d 0.002 y #IO & Wait Time: 1869s 31.15m 0.52h 0.02d 0.000 y #Average job time: 96s 1.61m 0.03h 0.00d #Longest job: 174s 2.90m 0.05h 0.00d #Submission to last job: 411s 6.85m 0.11h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create gnf1h.psl. pslSort dirs raw.psl tmp psl pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp ../affyGnf1m.psl ../../../jkStuff/liftAll.lft warn contig.psl rm -r contig.psl raw.psl psl # Load probes and alignments from GNF1H into database. ssh hgwdev cd /cluster/data/mm4/bed/geneAtlas2 ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes hgLoadPsl mm4 affyGnf1m.psl hgLoadSeq mm4 /gbdb/hgFixed/affyProbes/gnf1m.fa # Load up track hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \ affyGnf1m.psl # Note that the unmapped 5000 records are from all-N sequences. hgLoadBed mm4 gnfAtlas2 gnfAtlas2.bed ######## MAKING FAMILY BROWSER TABLES ####### (DONE - 2004-01-06 - Hiram) # These are instructions for building the # neighborhood browser. Don't start these until # there is a knownGene track. and the affy tracks # Cluster together various alt-splicing isoforms. 
# Creates the knownIsoforms and knownCanonical tables ssh hgwdev hgClusterGenes mm4 knownGene knownIsoforms knownCanonical # You may need to build this binary in src/hg/near/hgClusterGenes # Got 24849 clusters, from 40996 genes in 44 chromosomes # featureBits mm4 knownCanonical # 829173598 bases of 2627444668 (31.558%) in intersection # featureBits mm3 knownCanonical # 836749019 bases of 2505900260 (33.391%) in intersection # Extract peptides from knownGenes into fasta file # and create a blast database out of them. mkdir -p /cluster/data/mm4/bed/blastp cd /cluster/data/mm4/bed/blastp pepPredToFa mm4 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa formatdb -i known.faa -t known -n known # This command is in /projects/compbio/bin/$MACH/formatdb # Copy over database to bluearc rm -fr /cluster/bluearc/mm4/blastp mkdir -p /cluster/bluearc/mm4/blastp cp /cluster/data/mm4/bed/blastp/known.* /cluster/bluearc/mm4/blastp # Load up cluster/bluearc with blastp and related files # if necessary if (! 
-e /cluster/bluearc/blast/blastall) then mkdir -p /cluster/bluearc/blast cp /projects/compbio/bin/i686/blastall /cluster/bluearc/blast mkdir -p /cluster/bluearc/blast/data cp /projects/compbio/bin/i686/data/* /cluster/bluearc/blast/data endif # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm4/bed/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk9 mkdir -p /cluster/data/mm4/bed/blastp/self cd /cluster/data/mm4/bed/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data export BLASTMAT /cluster/bluearc/blast/blastall -p blastp \ -d /cluster/bluearc/mm4/blastp/known -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in ~15 minutes if the cluster is free. # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 49565s 826.08m 13.77h 0.57d 0.002 y # IO & Wait Time: 20374s 339.57m 5.66h 0.24d 0.001 y # Average job time: 9s 0.15m 0.00h 0.00d # Longest job: 53s 0.88m 0.01h 0.00d # Submission to last job: 775s 12.92m 0.22h 0.01d # Load into database. This takes about an hour. ssh hgwdev cd /cluster/data/mm4/bed/blastp/self/run/out hgLoadBlastTab mm4 knownBlastTab *.tab # Scanning through 7737 files # Loading database with 7522182 rows # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes an hour.) 
hgMapToGene mm4 affyGnf1m knownGene knownToGnf1m
hgExpDistance mm4 hgFixed.gnfMouseAtlas2MedianRatio \
	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnf1m
# Create table that maps between known genes and RefSeq
hgMapToGene mm4 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene mm4 affyU74 knownGene knownToU74
hgMapToGene mm4 affyMOE430 knownGene knownToMOE430
hgMapToGene mm4 affyMOE430 -prefix=A: knownGene knownToMOE430A
# Format and load Rinn et al sex expression data
mkdir /cluster/data/mm4/bed/rinnSex
cd !$
hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \
	../affyMOE430/affyMOE430.psl
hgLoadBed mm4 rinnSex rinnSex.bed
# Format and load the GNF data
mkdir /cluster/data/mm4/bed/affyGnf95
cd /cluster/data/mm4/bed/affyGnf95
affyPslAndAtlasToBed -newType ../affyU95.psl \
	/projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
	affyGnfU95.tab affyGnfU95Exps.tab -shortOut
# this .sql load was in preceding instructions, but this .sql file
# appears to not exist and it doesn't seem to be needed anyway.
# Everything below this seems to create tables OK.
# hgsql mm4 < ~/kent/src/hg/affyGnf/affyGnfU95.sql
# Create table that gives distance in expression space between
# GNF genes.  These commands take about 15 minutes each
# The affyGnfU74?Exps arguments appear to be unused in hgExpDistance
hgExpDistance mm4 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \
	-lookup=knownToU74
# Got 8900 unique elements in affyGnfU74A
hgExpDistance mm4 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \
	-lookup=knownToU74
# Got 6850 unique elements in affyGnfU74B
hgExpDistance mm4 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \
	-lookup=knownToU74
# Got 2145 unique elements in affyGnfU74C
# Make sure that GO database is up to date.  See README in /cluster/store1/geneOntology.
# Create knownToEnsembl column # XXXX do not have an ensGene table yet ! # XXXX TBD - to be done when they release Mouse build 32 gene data # OK to leave this our for now, will leave a couple of missing # columns in the family browser hgMapToGene mm4 ensGene knownGene knownToEnsembl # Make knownToCdsSnp column. This is a little complicated by # having to merge data form the snpTsc and the snpNih tracks. hgMapToGene mm4 snpNih knownGene knownToCdsSnp -all -cds # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/ce1/blastp should have data # Create the ceBlastTab (the blastall binary only works on kk9 for now ...) ssh kk9 mkdir /cluster/data/mm4/bed/blastp/ce1 cd /cluster/data/mm4/bed/blastp/ce1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \ -p blastp -d /cluster/bluearc/ce1/blastp/wormPep \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in ~10 minutes if the cluster is free. # Here's the para time results # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 24219s 403.64m 6.73h 0.28d 0.001 y # IO & Wait Time: 20227s 337.12m 5.62h 0.23d 0.001 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest job: 20s 0.33m 0.01h 0.00d # Submission to last job: 494s 8.23m 0.14h 0.01d # Load into database. 
ssh hgwdev cd /cluster/data/mm4/bed/blastp/ce1/run/out hgLoadBlastTab mm4 ceBlastTab -maxPer=1 *.tab # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. See makeMm3.doc for procedure # the directory: /cluster/bluearc/hg16/blastp should have data # Make parasol run directory ssh kk9 mkdir /cluster/data/mm4/bed/blastp/hg16 cd /cluster/data/mm4/bed/blastp/hg16 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \ -p blastp -d /cluster/bluearc/hg16/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # (wordLine wouldn't run on kk9: # wordLine: /lib/i686/libc.so.6: version `GLIBC_2.3' not found # run this echo statement on hgwdev # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # takes about 15 minutes: # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 53172s 886.21m 14.77h 0.62d 0.002 y # IO & Wait Time: 20181s 336.34m 5.61h 0.23d 0.001 y # Average job time: 9s 0.16m 0.00h 0.00d # Longest job: 52s 0.87m 0.01h 0.00d # Submission to last job: 809s 13.48m 0.22h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/blastp/hg16/run/out hgLoadBlastTab mm4 hgBlastTab -maxPer=1 *.tab # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dr1/blastp should have data # Make parasol run directory ssh kk9 mkdir /cluster/data/mm4/bed/blastp/dr1 cd /cluster/data/mm4/bed/blastp/dr1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \ -p blastp -d /cluster/bluearc/dr1/blastp/ensembl \ -i $1 -o $2 -e 0.005 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 33916s 565.26m 9.42h 0.39d 0.001 y # IO & Wait Time: 20370s 339.50m 5.66h 0.24d 0.001 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest job: 29s 0.48m 0.01h 0.00d # Submission to last job: 655s 10.92m 0.18h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/blastp/dr1/run/out hgLoadBlastTab mm4 drBlastTab -maxPer=1 *.tab # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/sc1/blastp should have data # Make parasol run directory ssh kk9 mkdir /cluster/data/mm4/bed/blastp/sc1 cd /cluster/data/mm4/bed/blastp/sc1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \ -p blastp -d /cluster/bluearc/sc1/blastp/sgd \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 6732s 112.19m 1.87h 0.08d 0.000 y # IO & Wait Time: 21851s 364.19m 6.07h 0.25d 0.001 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest job: 9s 0.15m 0.00h 0.00d # Submission to last job: 324s 5.40m 0.09h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/blastp/sc1/run/out hgLoadBlastTab mm4 scBlastTab -maxPer=1 *.tab # Make Drosophila melanagaster ortholog column using blastp on FlyBase. 
# First make SwissProt protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # Make parasol run directory ssh kk9 mkdir /cluster/data/mm4/bed/blastp/dm1 cd /cluster/data/mm4/bed/blastp/dm1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \ -p blastp -d /cluster/bluearc/dm1/blastp/flyBase \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 27801s 463.35m 7.72h 0.32d 0.001 y # IO & Wait Time: 19866s 331.10m 5.52h 0.23d 0.001 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest job: 23s 0.38m 0.01h 0.00d # Submission to last job: 526s 8.77m 0.15h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/blastp/dm1/run/out hgLoadBlastTab mm4 dmBlastTab -maxPer=1 *.tab # CREATING KNOWNtOsUPER (which enables superFamily stuff in hgNear/hgGene) # First see if need to update superfamily data from # ftp server at supfam.mrc-lmb.cam.ac.uk following instructions # in /cluster/store1/superFamily/genomes/README.ucsc. Then # make sure that knownToEnsembl and ensGtp tables are created, then: ssh hgwdev # Until we get an Ensmart update for Mm4, we need an empty table # here so kgKnownToSuper will function hgsql mm4 < ~/kent/src/hg/lib/ensGtp.sql zcat /cluster/store1/superFamily/genomes/ass_11-Jan-2004.tab.gz | \ hgKnownToSuper mm4 mm stdin LOAD SNPS (Done. 
Daryl Thomas; February 18, 2004) # SNP processing has been condensed into a single script, # which makes snpNih, snpTsc, and snpMap # ${HOME}/kent/src/hg/snp/locations/processSnpLocations.csh # snpBuild = 119 # Run from directory $oo/bed/snp/build$snpBuild/snpMap mkdir -p $oo/bed/snp/build$snpBuild/snpMap cd $oo/bed/snp/build$snpBuild/snpMap processSnpLocations.csh mm4 mouse 32 119 >& log & # check data: # wc -l snpTsc.bed; mm4 -e "select count(*) from snpTsc; # wc -l snpNih.bed; mm4 -e "select count(*) from snpNih; # wc -l snpMap.bed; mm4 -e "select count(*) from snpMap; # select * from snpNih limit 5; desc snpNih; show indexes from snpNih" # select * from snpTsc limit 5; desc snpTsc; show indexes from snpTsc" # select * from snpMap limit 5; desc snpMap; show indexes from snpMap" # remove temp files # rm mouse* *bed.gz # LOAD SNP DETAILS (Done. Daryl Thomas; February 18, 2004) # SNP processing has been condensed into a single script, # which makes dbSnpRsMm # ${HOME}/kent/src/hg/snp/details/processSnpDetails.csh # snpBuild = 119 # Run from directory $oo/bed/snp/build$snpBuild/snpMap mkdir -p $oo/bed/snp/build$snpBuild/details/Done mkdir -p $oo/bed/snp/build$snpBuild/details/Observed cd $oo/bed/snp/build$snpBuild/details processSnpDetails.csh mm4 mouse 119 >& log & load data local infile "$fileBase.out" into table $database.$table gzip $fileBase.out # check data: # hgFixed -e "select count(*) from dbSnpRsMm; # select * from dbSnpRSMm limit 5; desc dbSnpRsMm; show indexes from dbSnpRSMm" # remove temp files # rm dbSnpRs* # KNOWN GENE UPDATE (DONE - 2004-02-19 - Hiram) # using proteins and swissProt as created for Hg16 recently ssh hgwdev mkdir /cluster/data/kgDB/bed/kgMm4 cd /cluster/data/kgDB/bed/kgMm4 ~/kent/src/hg/protein/KGprocess.sh kgMm4 mm4 040115 # that runs to the first cluster run, then on kk ssh kk cd /cluster/data/kgDB/bed/kgMm4/kgBestMrna para create jobList para try para push ... etc ... 
# Completed: 43352 of 43352 jobs # CPU time in finished jobs: 118572s 1976.21m 32.94h 1.37d 0.004 y # IO & Wait Time: 119951s 1999.18m 33.32h 1.39d 0.004 y # Average job time: 6s 0.09m 0.00h 0.00d # Longest job: 25s 0.42m 0.01h 0.00d # Submission to last job: 307s 5.12m 0.09h 0.00d # Continuing back on hgwdev ssh hgwdev cd /cluster/data/kgDB/bed/kgMm4 ~/kent/src/hg/protein/KGprocess.sh kgMm4 mm4 040115 # that runs until it prepares the second cluster run, then ssh kk cd /cluster/data/kgDB/bed/kgMm4/kgProtMap para create jobList para try para push ... etc # Completed: 4944 of 4944 jobs # CPU time in finished jobs: 1113456s 18557.60m 309.29h 12.89d 0.035 y # IO & Wait Time: 192756s 3212.60m 53.54h 2.23d 0.006 y # Average job time: 264s 4.40m 0.07h 0.00d # Longest job: 1269s 21.15m 0.35h 0.01d # Submission to last job: 1777s 29.62m 0.49h 0.02d # Continuing back on hgwdev ssh hgwdev cd /cluster/data/kgDB/bed/kgMm4 ~/kent/src/hg/protein/KGprocess.sh kgMm4 mm4 040115 # That runs to completion # existing tables were backed up into kgMm4_2003_11: # affyGnfU74ADistance affyGnfU74BDistance affyGnfU74CDistance # ceBlastTab cgapAlias cgapBiocDesc cgapBiocPathway dmBlastTab # drBlastTab dupSpMrna hgBlastTab keggMapDesc keggPathway # kgAlias kgProtAlias kgXref knownBlastTab knownCanonical knownGene # knownGeneLink knownGeneMrna knownGenePep knownIsoforms knownToCdsSnp # knownToRefSeq knownToU74 mrnaRefseq scBlastTab spMrna # until QA authorizes this build, then delete this table # rename the new tables into the mm4 database: cat << '_EOF_' > rename.sh #!/bin/sh for T in cgapAlias cgapBiocDesc cgapBiocPathway \ dupSpMrna keggMapDesc keggPathway kgAlias kgProtAlias \ kgProtMap kgXref knownGene knownGeneLink knownGeneMrna \ knownGenePep mrnaRefseq spMrna do hgsql -e "drop table ${T};" mm4 hgsql -e "rename table kgMm4.${T} to mm4.${T};" mysql echo $T done '_EOF_' chmod +x rename.sh ./rename.sh # continue with the FAMILY BROWSER UPDATE to complete this process ######## FAMILY BROWSER 
UPDATE ####### (DONE - 2004-02-19 - Hiram) # These are instructions for building the # neighborhood browser. Don't start these until # there is a knownGene track and the affy tracks # Cluster together various alt-splicing isoforms. # Creates the knownIsoforms and knownCanonical tables ssh hgwdev mkdir /cluster/data/mm4/bed/famBro.2004-02-19 ln -s /cluster/data/mm4/bed/famBro.2004-02-19 /cluster/data/mm4/bed/famBro cd /cluster/data/mm4/bed/famBro hgClusterGenes mm4 knownGene knownIsoforms knownCanonical # Got 24894 clusters, from 41225 genes in 44 chromosomes # You may need to build this binary in src/hg/near/hgClusterGenes # featureBits mm4 knownCanonical # 830993563 bases of 2627444668 (31.627%) in intersection # previously was # 829173598 bases of 2627444668 (31.558%) in intersection # Extract peptides from knownGenes into fasta file # and create a blast database out of them. mkdir /cluster/data/mm4/bed/famBro/blastp cd /cluster/data/mm4/bed/famBro/blastp pepPredToFa mm4 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa formatdb -i known.faa -t known -n known /scratch/blast/formatdb -i known.faa -t known -n known # Copy over database to bluearc rm -fr /cluster/bluearc/mm4/blastp mkdir /cluster/bluearc/mm4/blastp cp -p /cluster/data/mm4/bed/famBro/blastp/known.* /cluster/bluearc/mm4/blastp # Load up cluster/bluearc with blastp and related files # if necessary if (! 
-e /cluster/bluearc/blast/blastall) then mkdir -p /cluster/bluearc/blast cp /projects/compbio/bin/i686/blastall /cluster/bluearc/blast mkdir -p /cluster/bluearc/blast/data cp /projects/compbio/bin/i686/data/* /cluster/bluearc/blast/data endif # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm4/bed/famBro/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/self cd /cluster/data/mm4/bed/famBro/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data export BLASTMAT /scratch/blast/blastall -p blastp \ -d /cluster/bluearc/mm4/blastp/known -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in ~5 minutes if the cluster is free. # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 104031s 1733.85m 28.90h 1.20d 0.003 y # IO & Wait Time: 74885s 1248.08m 20.80h 0.87d 0.002 y # Average job time: 23s 0.39m 0.01h 0.00d # Longest job: 188s 3.13m 0.05h 0.00d # Submission to last job: 253s 4.22m 0.07h 0.00d # Load into database. This takes about 15 minutes. 
ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/self/run/out time hgLoadBlastTab mm4 knownBlastTab *.tab # Scanning through 7737 files # Loading database with 7707618 rows # real 15m43.413s # user 2m57.190s # sys 0m20.330s # Create table that maps between known genes and RefSeq hgMapToGene mm4 refGene knownGene knownToRefSeq # row count goes from 29675 to 29958 # Create a table that maps between known genes and # the nice affy expression data. # fixup problem with names cd /cluster/data/mm4/bed/famBro mkdir fixupAffyNames cd fixupAffyNames hgsqldump --all -c mm4 affyGnfU74A > affyGnfU74A.sql.orig hgsqldump --all -c mm4 affyGnfU74B > affyGnfU74B.sql.orig hgsqldump --all -c mm4 affyGnfU74C > affyGnfU74C.sql.orig sed -e "s/U74Av2://" affyGnfU74A.sql.orig > affyGnfU74A.sql sed -e "s/U74Bv2://" affyGnfU74B.sql.orig > affyGnfU74B.sql sed -e "s/U74Cv2://" affyGnfU74C.sql.orig > affyGnfU74C.sql hgsql -e "rename table affyGnfU74A to affyGnfU74AlongName;" mm4 hgsql -e "rename table affyGnfU74B to affyGnfU74BlongName;" mm4 hgsql -e "rename table affyGnfU74C to affyGnfU74ClongName;" mm4 hgsql mm4 < affyGnfU74A.sql hgsql mm4 < affyGnfU74B.sql hgsql mm4 < affyGnfU74C.sql hgsqldump --all -c mm4 affyU74 > affyU74.sql.orig sed -e "s/U74Av2://; s/U74Bv2://; s/U74Cv2://; s/;'/'/" affyU74.sql.orig > \ affyU74.sql hgsql -e "rename table affyU74 to affyU74longName;" mm4 hgsql mm4 < affyU74.sql hgMapToGene mm4 affyU74 knownGene knownToU74 # row count went from 18831 to 25613 # I believe this GNF business is taken care of elsewhere, or at # least it only needs to be done once. # Format and load the GNF data # mkdir /cluster/data/mm4/bed/affyGnf95 # cd /cluster/data/mm4/bed/affyGnf95 # affyPslAndAtlasToBed -newType ../affyU95.psl \ # /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ # affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceeding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. 
# Everything below this seems to create tables OK. # hgsql mm4 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. The first one takes about 20 minutes, the other two 5 and 1 # The affyGnfU74?Exps arguments appear to be unused in hgExpDistance # runs most efficiently in /tmp since it makes large temporary files cd /tmp hgExpDistance mm4 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \ -lookup=knownToU74 # Got 13450 unique elements in affyGnfU74A hgExpDistance mm4 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \ -lookup=knownToU74 # Got 8314 unique elements in affyGnfU74B hgExpDistance mm4 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \ -lookup=knownToU74 # Got 2288 unique elements in affyGnfU74C # Make sure that GO database is up to date. See README in /cluster/store1/geneOntology. # Create knownToEnsembl column cd /cluster/data/mm4/bed/famBro hgMapToGene mm4 ensGene knownGene knownToEnsembl # creates 22975 rows in knownToEnsembl # Make knownToCdsSnp column. hgMapToGene mm4 snpNih knownGene knownToCdsSnp -all -cds # row count went from 11832 to 11874 # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/ce1/blastp should have data # Create the ceBlastTab (the blastall binary only works on kk9 for now ...) 
ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/ce1 cd /cluster/data/mm4/bed/famBro/blastp/ce1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/ce1/blastp/wormPep \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in 4 minutes if the cluster is free. # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 51883s 864.71m 14.41h 0.60d 0.002 y # IO & Wait Time: 66670s 1111.17m 18.52h 0.77d 0.002 y # Average job time: 15s 0.26m 0.00h 0.00d # Longest job: 60s 1.00m 0.02h 0.00d # Submission to last job: 217s 3.62m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/ce1/run/out hgLoadBlastTab mm4 ceBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 24776 rows # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. 
See makeMm3.doc for procedure # the directory: /cluster/bluearc/hg16/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/hg16 cd /cluster/data/mm4/bed/famBro/blastp/hg16 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/hg16/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # takes about 10 minutes: # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 118896s 1981.60m 33.03h 1.38d 0.004 y # IO & Wait Time: 47306s 788.43m 13.14h 0.55d 0.002 y # Average job time: 21s 0.36m 0.01h 0.00d # Longest job: 171s 2.85m 0.05h 0.00d # Submission to last job: 520s 8.67m 0.14h 0.01d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/hg16/run/out hgLoadBlastTab mm4 hgBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 32908 rows # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dr1/blastp should have data # Make parasol run directory ssh kk9 mkdir /cluster/data/mm4/bed/famBro/blastp/dr1 cd /cluster/data/mm4/bed/famBro/blastp/dr1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/dr1/blastp/ensembl \ -i $1 -o $2 -e 0.005 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 72793s 1213.21m 20.22h 0.84d 0.002 y # IO & Wait Time: 30073s 501.22m 8.35h 0.35d 0.001 y # Average job time: 13s 0.22m 0.00h 0.00d # Longest job: 82s 1.37m 0.02h 0.00d # Submission to last job: 367s 6.12m 0.10h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/dr1/run/out hgLoadBlastTab mm4 drBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 29369 rows # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/sc1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/sc1 cd /cluster/data/mm4/bed/famBro/blastp/sc1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/sc1/blastp/sgd \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 15306s 255.11m 4.25h 0.18d 0.000 y # IO & Wait Time: 22711s 378.51m 6.31h 0.26d 0.001 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest job: 17s 0.28m 0.00h 0.00d # Submission to last job: 160s 2.67m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/sc1/run/out hgLoadBlastTab mm4 scBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 15709 rows # Make Drosophila melanagaster ortholog column using blastp on FlyBase. 
# First make SwissProt protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/dm1 cd /cluster/data/mm4/bed/famBro/blastp/dm1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/dm1/blastp/flyBase \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # Completed: 7737 of 7737 jobs # CPU time in finished jobs: 59235s 987.24m 16.45h 0.69d 0.002 y # IO & Wait Time: 23669s 394.49m 6.57h 0.27d 0.001 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest job: 53s 0.88m 0.01h 0.00d # Submission to last job: 283s 4.72m 0.08h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/dm1/run/out hgLoadBlastTab mm4 dmBlastTab -maxPer=1 *.tab # Scanning through 7737 files # Loading database with 26213 rows # CREATING KNOWNtOsUPER (which enables superFamily stuff in hgNear/hgGene) # First see if need to update superfamily data from # ftp server at supfam.mrc-lmb.cam.ac.uk following instructions # in /cluster/store1/superFamily/genomes/README.ucsc. 
Then # make sure that knownToEnsembl and ensGtp tables are created, then: ssh hgwdev zcat /cluster/store1/superFamily/genomes/ass_11-Jan-2004.tab.gz | \ hgKnownToSuper mm4 mm stdin # creates 27319 rows # Update the proteins link into gdbPdb.hgcentraltest: hgsql \ -e 'update gdbPdb set proteomeDb = "proteins040115" where genomeDb = "mm4";' \ -h genome-testdb hgcentraltest # miRNA track (UPDATED - 2004-05-04 - Hiram) # (first version done 2004-03-02) # data from: Sam Griffiths-Jones # and Michel.Weber@ibcg.biotoul.fr # notify them if this assembly updates to renew this track # for the update, save the previous version cd /cluster/data/mm4/bed mv miRNA miRNA.2004_03_02 mkdir miRNA cd miRNA wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/mmu_ncbi32.*" grep -v "^track " mmu_ncbi32.bed | sed -e "s/ /\t/g" > mm4.bed # check existing track before update # featureBits mm4 miRNA # 17260 bases of 2627444668 (0.001%) in intersection hgLoadBed mm4 miRNA mm4.bed # entry in trackDb/trackDb.ra already there # and verify similar numbers after: # featureBits mm4 miRNA # 17782 bases of 2627444668 (0.001%) in intersection # gc5Base wiggle TRACK (DONE - 2004-04-07 - Hiram) # Perform a gc count with a 5 base # window. Also compute a "zoomed" view for display efficiency. mkdir /cluster/data/mm4/bed/gc5Base cd /cluster/data/mm4/bed/gc5Base # in the script below, the 'grep -w GC' selects the lines of # output from hgGcPercent that are real data and not just some # information from hgGcPercent. The awk computes the number # of bases that hgGcPercent claimed it measured, which is not # necessarily always 5 if it ran into gaps, and then the division # by 10.0 scales down the numbers from hgGcPercent to the range # [0-100]. Two columns come out of the awk print statement: # and which are fed into wigAsciiToBinary through # the pipe. It is set at a dataSpan of 5 because each value # represents the measurement over five bases beginning with # . 
The result files end up in ./wigData5. cat << '_EOF_' > runGcPercent.sh #!/bin/sh mkdir -p wigData5 mkdir -p dataLimits5 for n in ../../nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 mm4 ../../nib | grep -w GC | \ awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \ wigAsciiToBinary \ -dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \ -name=${C} stdin 2> dataLimits5/${c} echo "done" done '_EOF_' chmod +x runGcPercent.sh # This is going to take perhaps two hours to run. It is a lot of # data. make sure you do it on the fileserver: ssh eieio cd /cluster/data/mm4/bed/gc5Base time ./runGcPercent.sh > run.out 2>&1 # real 105m11.151s # user 153m40.050s # sys 14m57.570s # load the .wig files back on hgwdev: ssh hgwdev cd /cluster/data/mm4/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/mm4/wib/gc5Base mm4 gc5Base wigData5/*.wig # and symlink the .wib files into /gbdb mkdir /gbdb/mm4/wib/gc5Base ln -s `pwd`/wigData5/*.wib /gbdb/mm4/wib/gc5Base # to speed up display for whole chromosome views, compute a "zoomed" # view and load that on top of the existing table. The savings # comes from the number of data table rows the browser needs to load # for a full chromosome view. Without the zoomed view there are # over 43,000 data rows for chrom 1. With the zoomed view there are # only 222 rows needed for the display. If your original data was # at 1 value per base the savings would be even greater. # Pretty much the same data calculation # situation as above, although this time note the use of the # 'wigZoom -dataSpan=1000 stdin' in the pipeline. This will average # together the data points coming out of the awk print statment over # a span of 1000 bases. Thus each coming out of wigZoom # will represent the measurement of GC in the next 1000 bases. Note # the use of -dataSpan=1000 on the wigAsciiToBinary to account for # this type of data. 
You want your dataSpan here to be an exact # multiple of your original dataSpan (5*200=1000) and on the order # of at least 1000, doesn't need to go too high. For data that is # originally at 1 base per value, a convenient span is: -dataSpan=1024 # A new set of result files ends up in ./wigData5_1K/*.wi[gb] cat << '_EOF_' > runZoom.sh #!/bin/sh mkdir -p wigData5_1K mkdir -p dataLimits5_1K for n in ../../nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 mm4 ../../nib | grep -w GC | \ awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \ wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \ -dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \ -name=${C} stdin 2> dataLimits5_1K/${c} echo "done" done '_EOF_' chmod +x runZoom.sh # This is going to take even longer than above, certainly do this # on the fileserver ssh eieio time ./runZoom.sh > zoom.out 2>&1 # real 99m1.959s # user 151m1.660s # sys 11m11.070s # Then load these .wig files into the same database as above ssh hgwdev hgLoadWiggle -pathPrefix=/gbdb/mm4/wib/gc5Base -oldTable mm4 gc5Base \ wigData5_1K/*.wig # and symlink these .wib files into /gbdb ln -s `pwd`/wigData5_1K/*.wib /gbdb/mm4/wib/gc5Base # MOUSE AFFYMETRIX MOE430 TRACK (DONE, 2004-04-19, hartera) mkdir -p /projects/compbio/data/microarray/affyMouse # Download MOE430A and MOE430B consensus sequences from Affymetrix web site # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430 unzip MOE430*_consensus.zip # check for duplicate probes: there are none, all have unique names # check for duplicate probes: 100 from 136745_at to 1367551_a_at # remove "consensus:" and ";" from FASTA headers to shorten probeset # names for database sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa cp 
/projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ /cluster/bluearc/affy/ # Set up cluster job to align MOE430 consensus sequences to mm4 ssh kkr1u00 cd /cluster/data/mm4/bed mkdir -p affyMOE430 cd affyMOE430 mkdir -p /iscratch/i/affy cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy iSync ssh kk cd /cluster/data/mm4/bed/affyMOE430 ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst ls -1 /scratch/mus/mm4/trfFa/ > allctg.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc /scratch/mus/mm4/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 allctg.lst affy.lst template.sub para.spec mkdir psl para create para.spec # Actually do the job with usual para try/check/push/time etc. # para time # Completed: 599 of 599 jobs # CPU time in finished jobs: 25242s 420.71m 7.01h 0.29d 0.001 y # IO & Wait Time: 5969s 99.48m 1.66h 0.07d 0.000 y # Average job time: 52s 0.87m 0.01h 0.00d # Longest job: 102s 1.70m 0.03h 0.00d # Submission to last job: 6379s 106.32m 1.77h 0.07d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyRAE230.psl pslSort dirs raw.psl tmp psl # only use alignments that cover 30% of sequence and have at least # 95% identity in aligned region. # low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp affyMOE430.psl ../../jkStuff/liftAll.lft warn contig.psl # Load alignments and sequences into database ssh hgwdev # shorten names in psl file sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak mv affyMOE430.psl.bak affyMOE430.psl # load track into database hgLoadPsl mm4 affyMOE430.psl # 1 warning on loading: Blat error so that 1449824_at has a # negative entry (-195) in the qBaseInsert field. # Loading into the database forces this to 0. 
# Add consensus sequences for MOE430
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
      /gbdb/hgFixed/affyProbes
hgLoadSeq -abbr=MOE430 mm4 /gbdb/hgFixed/affyProbes/MOE430_all.fa
# Clean up
rm batch.bak contig.psl raw.psl
# add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/
# add affyMOE430.html file and then do make alpha to add to trackDb table
# Add a knownToMOE430 table for purposes of mapping probesets
# to known Genes
hgMapToGene mm4 affyMOE430 knownGene knownToMOE430

# ####### FAMILY BROWSER UPDATE ####### (DONE - 2004-04-01 - Fan)
# This update is to complement the recent KG update based on
# SWISS-PROT data release of 3/15/04.
# These are instructions for updating the
# neighborhood browser. Don't start these until
# there is a knownGene track and the affy tracks

# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/mm4/bed/famBro.2004-04-01
rm /cluster/data/mm4/bed/famBro
ln -s /cluster/data/mm4/bed/famBro.2004-04-01 /cluster/data/mm4/bed/famBro
cd /cluster/data/mm4/bed/famBro
hgClusterGenes mm4 knownGene knownIsoforms knownCanonical
# Got 25004 clusters, from 41693 genes in 44 chromosomes
# row count on knownIsoforms is 41669
# row count on knownCanonical is 25004

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/mm4/bed/famBro/blastp cd /cluster/data/mm4/bed/famBro/blastp pepPredToFa mm4 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa # # expect to find /scratch/blast loaded with all required blast files /scratch/blast/formatdb -i known.faa -t known -n known # Copy over database to bluearc rm -fr /cluster/bluearc/mm4/blastp mkdir /cluster/bluearc/mm4/blastp cp -p /cluster/data/mm4/bed/famBro/blastp/known.* /cluster/bluearc/mm4/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/mm4/bed/famBro/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/self cd /cluster/data/mm4/bed/famBro/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \ -d /cluster/bluearc/mm4/blastp/known -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # This should finish in ~5 minutes # Completed: 7738 of 7738 jobs # CPU time in finished jobs: 108579s 1809.65m 30.16h 1.26d 0.003 y # IO & Wait Time: 76232s 1270.54m 21.18h 0.88d 0.002 y # Average job time: 24s 0.40m 0.01h 0.00d # Longest job: 188s 3.13m 0.05h 0.00d # Submission to last job: 254s 4.23m 0.07h 0.00d # Load into database. This takes about 15 minutes. 
ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/self/run/out time hgLoadBlastTab mm4 knownBlastTab *.tab # Scanning through 7738 files # Loading database with 8019510 rows # 184.360u 28.660s 17:01.40 20.8% 0+0k 0+0io 202pf+0w # Create table that maps between known genes and RefSeq hgMapToGene mm4 refGene knownGene knownToRefSeq # row count goes from 27744 to 30116 # Create a table that maps between known genes and # the nice affy expression data. # fixup problem with names cd /cluster/data/mm4/bed/famBro hgMapToGene mm4 affyU74 knownGene knownToU74 # row count went to 25944 # I believe this GNF business is taken care of elsewhere, or at # least it only needs to be done once. # Format and load the GNF data # mkdir /cluster/data/mm4/bed/affyGnf95 # cd /cluster/data/mm4/bed/affyGnf95 # affyPslAndAtlasToBed -newType ../affyU95.psl \ # /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \ # affyGnfU95.tab affyGnfU95Exps.tab -shortOut # this .sql load was in preceeding instructions, but this .sql file # appears to not exist and it doesn't seem to be needed anyway. # Everything below this seems to create tables OK. # hgsql mm4 < ~/kent/src/hg/affyGnf/affyGnfU95.sql # Create table that gives distance in expression space between # GNF genes. The first one takes about 20 minutes, the other two 5 and 1 # The affyGnfU74?Exps arguments appear to be unused in hgExpDistance # runs most efficiently in /tmp since it makes large temporary files cd /tmp hgExpDistance mm4 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \ -lookup=knownToU74 # Got 13563 unique elements in affyGnfU74A # row count went to 13563000 hgExpDistance mm4 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \ -lookup=knownToU74 # Got 8473 unique elements in affyGnfU74B # row count went to 8473000 hgExpDistance mm4 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \ -lookup=knownToU74 # Got 2338 unique elements in affyGnfU74C # row count went to 2338000 # Make sure that GO database is up to date. 
See README in /cluster/store1/geneOntology. # Create knownToEnsembl column cd /cluster/data/mm4/bed/famBro hgMapToGene mm4 ensGene knownGene knownToEnsembl # row count went to 34357 # Make knownToCdsSnp column. hgMapToGene mm4 snpMap knownGene knownToCdsSnp -all -cds # row count went to 12061 # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm4.doc for procedure # the directory: /cluster/bluearc/ce1/blastp should have data # Create the ceBlastTab ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/ce1 cd /cluster/data/mm4/bed/famBro/blastp/ce1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/ce1/blastp/wormPep \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7738 of 7738 jobs # CPU time in finished jobs: 52727s 878.78m 14.65h 0.61d 0.002 y # IO & Wait Time: 54604s 910.07m 15.17h 0.63d 0.002 y # Average job time: 14s 0.23m 0.00h 0.00d # Longest job: 62s 1.03m 0.02h 0.00d # Submission to last job: 133s 2.22m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/ce1/run/out hgLoadBlastTab mm4 ceBlastTab -maxPer=1 *.tab # Scanning through 7738 files # Loading database with 25102 rows # row count went to 25102 # Make human ortholog column using blastp on human known genes. # First make human protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. 
See makeMm4.doc for procedure # the directory: /cluster/bluearc/hg16/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/hg16 cd /cluster/data/mm4/bed/famBro/blastp/hg16 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/hg16/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7738 of 7738 jobs # CPU time in finished jobs: 121081s 2018.02m 33.63h 1.40d 0.004 y # IO & Wait Time: 85030s 1417.17m 23.62h 0.98d 0.003 y # Average job time: 27s 0.44m 0.01h 0.00d # Longest job: 201s 3.35m 0.06h 0.00d # Submission to last job: 266s 4.43m 0.07h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/hg16/run/out hgLoadBlastTab mm4 hgBlastTab -maxPer=1 *.tab # Scanning through 7738 files # Loading database with 33319 rows # row count went to 33319 # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm4.doc for procedure # the directory: /cluster/bluearc/dr1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/dr1 cd /cluster/data/mm4/bed/famBro/blastp/dr1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/dr1/blastp/ensembl \ -i $1 -o $2 -e 0.005 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7738 of 7738 jobs # CPU time in finished jobs: 74098s 1234.96m 20.58h 0.86d 0.002 y # IO & Wait Time: 65822s 1097.04m 18.28h 0.76d 0.002 y # Average job time: 18s 0.30m 0.01h 0.00d # Longest job: 105s 1.75m 0.03h 0.00d # Submission to last job: 176s 2.93m 0.05h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/dr1/run/out hgLoadBlastTab mm4 drBlastTab -maxPer=1 *.tab Scanning through 7738 files Loading database with 29757 rows # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq. 
# First make protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm4.doc for procedure # the directory: /cluster/bluearc/sc1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/sc1 cd /cluster/data/mm4/bed/famBro/blastp/sc1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/sc1/blastp/sgd \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7738 of 7738 jobs # CPU time in finished jobs: 15489s 258.14m 4.30h 0.18d 0.000 y # IO & Wait Time: 27686s 461.44m 7.69h 0.32d 0.001 y # Average job time: 6s 0.09m 0.00h 0.00d # Longest job: 19s 0.32m 0.01h 0.00d # Submission to last job: 54s 0.90m 0.01h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/sc1/run/out hgLoadBlastTab mm4 scBlastTab -maxPer=1 *.tab # Scanning through 7738 files # Loading database with 15945 rows # Make Drosophila melanagaster ortholog column using blastp on FlyBase. 
# First make SwissProt protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm4.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/mm4/bed/famBro/blastp/dm1 cd /cluster/data/mm4/bed/famBro/blastp/dm1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/scratch/blast/data /scratch/blast/blastall \ -p blastp -d /cluster/bluearc/dm1/blastp/flyBase \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para make jobList # Completed: 7738 of 7738 jobs # CPU time in finished jobs: 60361s 1006.02m 16.77h 0.70d 0.002 y # IO & Wait Time: 52628s 877.13m 14.62h 0.61d 0.002 y # Average job time: 15s 0.24m 0.00h 0.00d # Longest job: 77s 1.28m 0.02h 0.00d # Submission to last job: 142s 2.37m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/mm4/bed/famBro/blastp/dm1/run/out hgLoadBlastTab mm4 dmBlastTab -maxPer=1 *.tab # Scanning through 7738 files # Loading database with 26563 rows # CREATING KNOWNTOSUPER (which enables Superfamily stuff in hgNear/hgGene) # First see if need to update superfamily data from # ftp server at supfam.mrc-lmb.cam.ac.uk following instructions # in /cluster/store1/superFamily/genomes/README.ucsc. 
Then # make sure that knownToEnsembl and ensGtp tables are created, then: ssh hgwdev zcat /cluster/store1/superFamily/040321/ass_28-Mar-2004.tab.gz | \ hgKnownToSuper mm4 mm stdin # 27405 records output # Update the proteins link into gdbPdb.hgcentraltest: hgsql \ -e 'update gdbPdb set proteomeDb = "proteins040315" where genomeDb = "mm4";' \ -h genome-testdb hgcentraltest # Perform preliminary review, using the Genome Browser and Gene Sorter for mm4. # After successful review, notify QA to start formal review. # Create table that maps between known genes and LocusLink (DONE 5/25/04 Fan) hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm4 \ > refToLl.txt hgMapToGene mm4 refGene knownGene knownToLocusLink -lookup=refToLl.txt # row count is 30313 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt mm4 knownGene name proteinID Pfam knownToPfam # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene mm4 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' ######## PROTEOME BROWSER BUILD ####### (REBUILT - 2004-04-19 - Fan) # These are instructions for building or rebuilding tables # needed for the Proteome Browser. 
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table # ARE BUILT (OR REBUILT) o Create the working directory mkdir /cluster/store6/mm4/bed/pb cd /cluster/store6/mm4/bed/pb o save old PB tables (if they exist) to the backup DB, kgMm4Sav3 From mysql prompt: alter table mm4.pepCCntDist rename as kgMm4Sav3.pepCCntDist; alter table mm4.pepExonCntDist rename as kgMm4Sav3.pepExonCntDist; alter table mm4.pepHydroDist rename as kgMm4Sav3.pepHydroDist; alter table mm4.pepIPCntDist rename as kgMm4Sav3.pepIPCntDist; alter table mm4.pepMolWtDist rename as kgMm4Sav3.pepMolWtDist; alter table mm4.pepMwAa rename as kgMm4Sav3.pepMwAa; alter table mm4.pepPi rename as kgMm4Sav3.pepPi; alter table mm4.pepPiDist rename as kgMm4Sav3.pepPiDist; alter table mm4.pepResDist rename as kgMm4Sav3.pepResDist; alter table mm4.pbAaDistA rename as kgMm4Sav3.pbAaDistA; alter table mm4.pbAaDistC rename as kgMm4Sav3.pbAaDistC; alter table mm4.pbAaDistD rename as kgMm4Sav3.pbAaDistD; alter table mm4.pbAaDistE rename as kgMm4Sav3.pbAaDistE; alter table mm4.pbAaDistF rename as kgMm4Sav3.pbAaDistF; alter table mm4.pbAaDistG rename as kgMm4Sav3.pbAaDistG; alter table mm4.pbAaDistH rename as kgMm4Sav3.pbAaDistH; alter table mm4.pbAaDistI rename as kgMm4Sav3.pbAaDistI; alter table mm4.pbAaDistK rename as kgMm4Sav3.pbAaDistK; alter table mm4.pbAaDistL rename as kgMm4Sav3.pbAaDistL; alter table mm4.pbAaDistM rename as kgMm4Sav3.pbAaDistM; alter table mm4.pbAaDistN rename as kgMm4Sav3.pbAaDistN; alter table mm4.pbAaDistP rename as kgMm4Sav3.pbAaDistP; alter table mm4.pbAaDistQ rename as kgMm4Sav3.pbAaDistQ; alter table mm4.pbAaDistR rename as kgMm4Sav3.pbAaDistR; alter table mm4.pbAaDistS rename as kgMm4Sav3.pbAaDistS; alter table mm4.pbAaDistT rename as kgMm4Sav3.pbAaDistT; alter table mm4.pbAaDistV rename as kgMm4Sav3.pbAaDistV; alter table mm4.pbAaDistW rename as kgMm4Sav3.pbAaDistW; alter table mm4.pbAaDistY rename as kgMm4Sav3.pbAaDistY; alter table mm4.pbAnomLimit rename as 
kgMm4Sav3.pbAnomLimit; alter table mm4.pbResAvgStd rename as kgMm4Sav3.pbResAvgStd; alter table mm4.pbStamp rename as kgMm4Sav3.pbStamp; o Define pep* tables in mm4 DB cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql First edit out pepPred table definition, then hgsql mm4 < pepAll.sql o Build the pepMwAa table hgsql proteins040315 -e "select info.acc, molWeight, aaSize from sp040315.info, sp040315.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab hgsql mm4 < protAcc.lis pbCalPi protAcc.lis sp040315 pepPi.tab hgsql mm4 <pbAaAll.sql hgsql mm4 < pbAaAll.sql From mysql prompt: use mm4; load data local infile "pbAaDistW.tab" into table mm4.pbAaDistW; load data local infile "pbAaDistC.tab" into table mm4.pbAaDistC; load data local infile "pbAaDistM.tab" into table mm4.pbAaDistM; load data local infile "pbAaDistH.tab" into table mm4.pbAaDistH; load data local infile "pbAaDistY.tab" into table mm4.pbAaDistY; load data local infile "pbAaDistN.tab" into table mm4.pbAaDistN; load data local infile "pbAaDistF.tab" into table mm4.pbAaDistF; load data local infile "pbAaDistI.tab" into table mm4.pbAaDistI; load data local infile "pbAaDistD.tab" into table mm4.pbAaDistD; load data local infile "pbAaDistQ.tab" into table mm4.pbAaDistQ; load data local infile "pbAaDistK.tab" into table mm4.pbAaDistK; load data local infile "pbAaDistR.tab" into table mm4.pbAaDistR; load data local infile "pbAaDistT.tab" into table mm4.pbAaDistT; load data local infile "pbAaDistV.tab" into table mm4.pbAaDistV; load data local infile "pbAaDistP.tab" into table mm4.pbAaDistP; load data local infile "pbAaDistG.tab" into table mm4.pbAaDistG; load data local infile "pbAaDistE.tab" into table mm4.pbAaDistE; load data local infile "pbAaDistA.tab" into table mm4.pbAaDistA; load data local infile "pbAaDistL.tab" into table mm4.pbAaDistL; load data local infile "pbAaDistS.tab" into table mm4.pbAaDistS; o Create pbAnomLimit and pbResAvgStd tables hgsql mm4 < 
~/src/hg/lib/pbAnomLimit.sql
hgsql mm4 < ~/src/hg/lib/pbResAvgStd.sql
hgsql mm4 -e 'load data local infile "pbResAvgStd.tab" into table mm4.pbResAvgStd;'
hgsql mm4 -e 'load data local infile "pbAnomLimit.tab" into table mm4.pbAnomLimit;'

o Download data files from Ensembl and create ensGeneXref and ensTranscript tables

mkdir /cluster/bluearc/fan/ensembl/19_32aM
cd /cluster/bluearc/fan/ensembl/19_32aM
wget --timestamping \
"ftp://ftp.ensembl.org/pub/mouse-19.32a/data/mysql/mus_musculus_lite_19_32a/gene_xref.txt.table.gz"
wget --timestamping \
"ftp://ftp.ensembl.org/pub/mouse-19.32a/data/mysql/mus_musculus_lite_19_32a/transcript.txt.table.gz"
wget --timestamping \
"ftp://ftp.ensembl.org/pub/mouse-19.32a/data/mysql/mus_musculus_lite_19_32a/mus_musculus_lite_19_32a.sql.gz"
gzip -d *.gz

If mm4.ensGeneXref table already exists
alter table mm4.ensGeneXref rename as kgMm4Sav3.ensGeneXref;

ENSEMBL IS KNOWN FOR CHANGING THEIR TABLE DEFINITION AND CONTENT!!!
So, first check if the table definition of gene_xref in
mus_musculus_lite_19_32a.sql is the same as ~/src/hg/lib/ensGeneXref.sql.
If they are not the same, some programming change to hgTracks.c and possibly
other programs may be needed.

If they are the same, create ensGeneXref table in mm4 by:

hgsql mm4 < ~/src/hg/lib/ensGeneXref.sql
hgsql mm4 -e \
'load data local infile "gene_xref.txt.table" into table mm4.ensGeneXref;'

If mm4.ensTranscript exists
alter table mm4.ensTranscript rename as kgMm4Sav3.ensTranscript;

Again, check if the table definition of transcript in
mus_musculus_lite_19_32a.sql is the same as ~/src/hg/lib/ensTranscript.sql.
If they are not the same, some programming change to hgTracks.c and possibly
other programs may be needed.
If they are the same, create ensTranscript table in mm4 by:
hgsql mm4 < ~/src/hg/lib/ensTranscript.sql
hgsql mm4 -e 'load data local infile "transcript.txt.table" into table mm4.ensTranscript;'

o Download Superfamily data files and build the Superfamily DB

mkdir /cluster/store1/superFamily/040321
cd /cluster/store1/superFamily/040321
ftp over the following two files:
ass_28-Mar-2004.tab.gz
supfam_21-Mar-2004.sql.gz
hgsql mm4 -e "create database superfam040321"
hgsql superfam040321 < supfam_21-Mar-2004.sql
Make sure to add an index on id of the des table of superfam040321.
hgsql superfam040321 < ~/src/hg/lib/sfAssign.sql
hgsql superfam040321 -e 'load data local infile "ass_28-Mar-2004.tab" into table superfam040321.sfAssign;'

o Build or rebuild Superfamily track and create sf tables needed for PB

hgsql mm4 < ~/src/hg/lib/sfAssign.sql
cd /cluster/store1/superFamily/040321
hgsql mm4 -e 'load data local infile "ass_28-Mar-2004.tab" into table mm4.sfAssign;'

If mm4.sfDes already exists,
alter table mm4.sfDes rename as kgMm4Sav3.sfDes;

hgsql superfam040321 -e "select * from des" >sfDes.tab
hgsql mm4 < ~/src/hg/lib/sfDes.sql
hgsql mm4 -e 'load data local infile "sfDes.tab" into table mm4.sfDes ignore 1 lines;'

If mm4.superfamily already exists
alter table mm4.superfamily rename as kgMm4Sav3.superfamily;

hgSuperfam mm4 > sf.log
It is normal that many proteins do not have corresponding Superfamily entries.

If mm4.sfDescription exists,
alter table mm4.sfDescription rename as kgMm4Sav3.sfDescription;

hgsql mm4 < ~/src/hg/lib/sfDescription.sql
hgsql mm4 -e 'LOAD DATA local INFILE "sfDescription.tab" into table mm4.sfDescription;'

Finally, load the superfamily table.
hgLoadBed mm4 superfamily superfamily.tab -tab

o Create pbStamp table for PB
hgsql mm4 < ~/src/hg/lib/pbStamp.sql
cd ~/kent/src/hg/protein/pbTracks
hgsql mm4 -e 'load data local infile "pbStampMouse.tab" into table mm4.pbStamp;'

o Enable Proteome Browser
if ~/kent/src/hg/hgGene/hgGeneData/Mouse/links.ra does not exist:
cd ~/kent/src/hg/hgGene/hgGeneData/Mouse
mkdir mm4
cd mm4
Create links.ra with the following content

# mm4 Proteome Browser link info.
name protBrowser
shortLabel Proteome Browser
tables kgXref
idSql select spDisplayID from kgXref where kgID = '%s'
nameSql select spId from kgXref where kgID = '%s'
url ../cgi-bin/pbTracks?proteinID=%s
priority 12

cd ../..
make alpha

o Adjust drawing parameters for Proteome Browser stamps

Now invoke Proteome Browser and adjust various drawing parameters
(mostly the ymax of each stamp) by updating the pbStampMouse.tab file
and then reload the pbStamp table.

o Perform preliminary review of Proteome Browser for mm4, then notify QA
for formal review.
# LIFTOVER ENCODE REGIONS (DONE 2004-04-16 kate)
# preliminary test
ssh hgwdev
mkdir /cluster/data/mm4/bed/encodeLiftOver
cd /cluster/data/mm4/bed/encodeLiftOver
echo 'select * from hg16.encodeRegions' | hgsql hg16 -N > hg16.eR.bed
# Of 44 regions:
# -minMatch=.95 (default) -> nothing mapped
# -minMatch=.5 -> 6 mapped
# -minMatch=.2 -> 35 mapped
# -minMatch=.1 -> 43 mapped
liftOver -minMatch=.1 hg16.eR.bed \
    /cluster/data/hg16/bed/bedOver/hg16ToMm4.over.chain \
    encodeRegions.bed encodeRegions.unmapped
hgLoadBed mm4 encodeRegions encodeRegions.bed -noBin

# TIGR GENE INDEX (DONE 2004-05-20 Fan)
mkdir -p /cluster/data/mm4/bed/tigr
cd /cluster/data/mm4/bed/tigr
wget ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/TGI_track_MouseGenome_mm4_05-2004.tgz
tar xvzf TGI*.tgz
foreach f (*cattle*)
    set f1 = `echo $f | sed -e 's/cattle/cow/g'`
    mv $f $f1
end
foreach o (mouse cow human pig rat)
    echo $o
    setenv O $o
    foreach f (chr*_$o*s)
        tail +2 $f | perl -wpe 's/THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
    end
end
ssh hgwdev
cd /cluster/data/mm4/bed/tigr
hgsql mm4 -e "drop table tigrGeneIndex"
hgsql mm4 < ~/kent/src/hg/lib/tigrGeneIndex.sql
foreach f (*.gff)
    echo Processing $f ...
    /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC mm4 tigrGeneIndex $f
    hgsql mm4 -e "select count(*) from tigrGeneIndex"
end
# Total of 354491 entries created in tigrGeneIndex table.
hgsql mm4 -e "update tigrGeneIndex set cdsStart = txStart;" hgsql mm4 -e "update tigrGeneIndex set cdsEnd = txEnd;" checkTableCoords mm4 tigrGeneIndex gzip *.gff *TCs # LIFTOVER CHAINS TO MM5 (WORKING 2004-08-02 kate) # run alignment # NOTE: split mm5 to /iscratch/i is doc'ed in makeHg17.doc ssh kk cd /cluster/data/mm4 makeLoChain-align mm4 /iscratch/i/mm4/softNib \ mm5 /iscratch/i/mm5/liftOver/liftSplit/split /scratch/hg/h/mouse11.ooc # Created parasol job in bed/blat.mm5.2004-08-02/run # 1892 jobs cd bed rm blat.mm5 ln -s blat.mm5.2004-08-02 blat.mm5 cd blat.mm5/run para try para check para push # lift results # the lift directory was defined in makeHg17.doc when split was performed # this expects data in bed/blat.mm5, so symlink must be there ssh kksilo cd /cluster/data/mm4/bed/blat.mm5 makeLoChain-lift mm4 mm5 /iscratch/i/mm5/liftOver/liftSplit/lift \ >&! lift.log & tail -100f lift.log # GOT HERE # Problems with lift: # liftUp: psl.c:112: pslLoad: Assertion `sizeOne == ret->blockCount' failed. 
# sizeOne tStarts 4 bs 17
# Abort

# chain alignments
ssh kk
makeLoChain-chain mm4 /cluster/data/mm4/nib mm5 /cluster/data/mm5/nib
# Created parasol job in /cluster/data/mm4/bed/blat.mm5/chainRun
cd /cluster/data/mm4/bed/blat.mm5/chainRun
para try
# 46 jobs
para check
para push

# make alignment net
ssh kolossus
makeLoChain-net mm4 mm5

# load into database and copy to download directory
ssh hgwdev
makeLoChain-load mm4 mm5
# Finished loading mm4ToMm5.over.chain
# Now, add download link for /usr/local/apache/htdocs/goldenPath/mm4/liftOver/mm4ToMm5.over.chain.gz

## NIA Mouse Gene Index - (DONE - 2004-10-08 Fan)
# requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov

# pick up data
ssh hgwdev
mkdir -p /cluster/data/mm4/bed/NIAGene
cd /cluster/data/mm4/bed/NIAGene
wget --timestamping http://lgsun.grc.nia.nih.gov/temp/NIA-Mouse-GeneIndex3-Transcript-to-Genome.psl
wget --timestamping \
    http://lgsun.grc.nia.nih.gov/temp/NIA-Mouse-GeneIndex3-Transcripts.fasta
hgLoadPsl mm4 -table=NIAGene NIA-Mouse-GeneIndex3-Transcript-to-Genome.psl
mkdir /gbdb/mm4/NIAGene
ln -s /cluster/data/mm4/bed/NIAGene/NIA-Mouse-GeneIndex3-Transcripts.fasta \
    /gbdb/mm4/NIAGene/NIA-Mouse-GeneIndex3-Transcripts.fasta
hgLoadSeq mm4 /gbdb/mm4/NIAGene/NIA-Mouse-GeneIndex3-Transcripts.fasta
Added and edited NIAGene.html and trackDb.ra under kent/src/hg/makeDb/trackDb/mouse/mm4

# MM4->MM5 LIFTOVER CHAIN (DONE 2/8/05 Andy)
# I'm starting at the chaining point. Kate believes the alignment is OK, but there are no
# chrA_chrB.psl files in the /cluster/data/mm4/bed/blat.mm5/raw directory, so I can't
# start at the lifting stage.
ssh kk9
makeLoChain-chain mm4 /cluster/data/mm4/nib mm5 /cluster/data/mm5/nib
cd /cluster/data/mm4/bed/blat.mm5.2005-02-08/chainRun
para try
para check
para push
#Completed: 43 of 43 jobs
#CPU time in finished jobs:      21502s     358.36m     5.97h    0.25d  0.001 y
#IO & Wait Time:                 48317s     805.29m    13.42h    0.56d  0.002 y
#Average job time:                1624s      27.06m     0.45h    0.02d
#Longest job:                    21152s     352.53m     5.88h    0.24d
#Submission to last job:         26995s     449.92m     7.50h    0.31d
ssh kk
nice makeLoChain-net mm4 mm5

# LIFTOVER CHAINS TO MM5 (REDONE AGAIN 03-02-2005 TO 03-07-2005 Andy)
# run alignment
# NOTE: split mm5 to /iscratch/i is doc'ed in makeHg17.doc
ssh kk9
cd /cluster/data/mm4
makeLoChain-align mm4 /iscratch/i/mm4/softNib \
    mm5 /iscratch/i/mm5/liftOver/liftSplit/split /scratch/hg/h/mouse11.ooc
# Created parasol job in bed/blat.mm5.2005-03-02/run
cd bed
rm blat.mm5
ln -s blat.mm5.2005-03-02 blat.mm5
cd blat.mm5/run
grep "mm4/softNib/chr[1239].nib" spec | sed 's/=98/=94/' > spec.reduced
grep -v "mm4/softNib/chr[1239].nib" spec > spec.otherChroms
para create spec.reduced
# 172 jobs written to batch
para try
para check
para push

# lift results
# the lift directory was defined in makeHg17.doc when split was performed
# this expects data in bed/blat.mm5, so symlink must be there
ssh kksilo
cd /cluster/data/mm4/bed
ln -s blat.mm5 blat.mm5.2005-03-03
cd blat.mm5
makeLoChain-lift mm4 mm5 /iscratch/i/mm5/liftOver/liftSplit/lift > lift.log 2>&1 &
# That didn't quite work.
I'll do it manually:
cd raw
for chr in `awk '{print $1;}' /cluster/data/mm5/chrom.sizes`; do
    echo $chr
    liftUp -pslQ ../psl/$chr.psl /iscratch/i/mm5/liftOver/liftSplit/lift/$chr.lft warn chr*_$chr.psl
done

# chaining
ssh kki
makeLoChain-chain mm4 /cluster/data/mm4/nib mm5 /cluster/data/mm5/nib
cd /cluster/data/mm4/bed/blat.mm5/chainRun
para try
para check
para push

# netting (this step also creates the liftOver chain)
ssh kksilo
makeLoChain-net mm4 mm5

ssh hgwdev
makeLoChain-load mm4 mm5

# CREATE kgSpAlias TABLE FOR PB (Done 05/04/05)

hgsql mm4 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql mm4 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >mm4.kgSpAlias.tab
rm j.tmp

hgsql mm4 -e 'drop table kgSpAlias;'
hgsql mm4 < ~/src/hg/lib/kgSpAlias.sql
hgsql mm4 -e 'load data local infile "mm4.kgSpAlias.tab" into table kgSpAlias'