# for emacs: -*- mode: sh; -*-

# This file describes how we made the browser database on the
# Dog (Canis familiaris) July 2004 release.

# CREATE BUILD DIRECTORY (DONE 7/1/04 angie)
    ssh kksilo
    mkdir /cluster/store7/canFam1
    ln -s /cluster/store7/canFam1 /cluster/data/canFam1

# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 7/1/04 angie)
    mkdir /cluster/data/canFam1/M
    cd /cluster/data/canFam1/M
    # go to http://www.ncbi.nih.gov/ and search Nucleotide for
    # "canis familiaris mitochondrion genome".  That shows the gi number:
    # 17737322
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=17737322&dopt=FASTA'
    # Edit chrM.fa: make sure the long fancy header line says it's the
    # Canis familiaris mitochondrion complete genome, and then replace the
    # header line with just ">chrM".  (A scripted version of this edit is
    # sketched at the end of the UNPACK CHROM FASTA step below.)

# MAKE JKSTUFF AND BED DIRECTORIES (DONE 7/1/04 angie)
    # This used to hold scripts -- better to keep them inline in the .doc
    # so they're in CVS.  Now it should just hold lift file(s) and
    # temporary scripts made by copy-paste from this file.
    mkdir /cluster/data/canFam1/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/canFam1/bed

# DOWNLOAD AGP, FASTA & QUAL (DONE 7/7/04 angie)
    ssh kksilo
    mkdir /cluster/data/canFam1/broad
    cd /cluster/data/canFam1/broad
    ftp ftp.broad.mit.edu
    # use username and password emailed by David Jaffe 7/3/04.
    prompt
    bin
    mget Dog1.0.agp Dog1.0.agp.chromosome.fasta.gz
    mget Dog1.0.agp.chromosome.qual.gz assembly.format supercontigs
    mget Dog1.0.masked.agp.chromosome.fasta.gz
    mget supercontigs.summary contigs.bases.gz
    bye
    # Loaded the above 7/5; Reloaded new versions on 7/6 of
    #   Dog1.0.agp Dog1.0.agp.chromosome.fasta.gz
    #   Dog1.0.agp.chromosome.qual.gz Dog1.0.masked.agp.chromosome.fasta.gz
    # And on 7/7 of Dog1.0.agp Dog1.0.agp.chromosome.fasta.gz
    #   Dog1.0.agp.chromosome.qual.gz

# UNPACK CHROM FASTA & AGP AND CROSS-CHECK (DONE 7/7/04 angie)
    # For some reason, the chromosome fasta has been split into 5000000-base
    # chunks.  Fix the headers so that there is one record per chromosome,
    # and make one .fa file per chromosome.
    ssh kksilo
    cd /cluster/data/canFam1/broad
    cat > ../jkStuff/fixBroadChromHeaders.pl <<'_EOF_'
#!/usr/bin/perl -w
while (<>) {
  if (/^>(\w+)\.1-\d+/) {
    $chr = "chr$1";
    print ">$chr\n";
  } elsif (/^>\w+\.\d+000001-\d+/) {
    # Discard this extra header line.
  } elsif (/^>/) {
    die "Can't parse line $.:\n$_";
  } else {
    print;
  }
}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x ../jkStuff/fixBroadChromHeaders.pl
    mkdir chroms
    zcat Dog1.0.agp.chromosome.fasta.gz \
    | ../jkStuff/fixBroadChromHeaders.pl \
    | faSplit byname stdin chroms/
    faSize chroms/chr*.fa
#2519779136 bases (159950770 N's 2359828366 real) in 40 sequences in 40 files
#Total size: mean 62994478.4 sd 23674450.6 min 26492144 (chr38) max 126913094 (chrX) median 61172541
#N count: mean 3998769.2 sd 2953878.1
    # Make our usual chrom dir structure and move chr*.fa into it.
    cp /dev/null ../chrom.lst
    foreach f (chroms/chr*.fa)
      set chr = $f:t:r
      set c = `echo $chr | sed -e 's/^chr//;'`
      echo $c >> ../chrom.lst
      mkdir ../$c
      mv $f ../$c
    end
    echo M >> ../chrom.lst
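    # (Sketch, not part of the original build, assuming the downloaded
    # chrM.fa has a single '>' header on its first line -- the hand edit
    # of the chrM header above can be scripted like this:)
    perl -i -wpe 's/^>.*/>chrM/ if $. == 1;' /cluster/data/canFam1/M/chrM.fa
    grep '^>' /cluster/data/canFam1/M/chrM.fa
    # should print exactly ">chrM"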
    # Split AGP file into per-chrom AGP files in usual chrom dir structure.
    cd /cluster/data/canFam1
    foreach c (`cat chrom.lst`)
      grep -w "^chr$c" broad/Dog1.0.agp > $c/chr$c.agp
    end
    rm M/chrM.agp
    wc -l ?{,?}/chr*.agp
    # 119853 total
    wc -l broad/Dog1.0.agp
    # 119853 broad/Dog1.0.agp
    # checkAgpAndFa prints out way too much info -- keep the end/stderr only:
    cd /cluster/data/canFam1
    foreach agp (?{,?}/chr*.agp)
      set fa = $agp:r.fa
      echo checking consistency of $agp and $fa
      checkAgpAndFa $agp $fa | tail -1
    end

# UNPACK CHROM QUAL SCORES (DONE 7/7/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/broad
    # Reuse fixBroadChromHeaders.pl script created in previous step,
    # compress to .qac for use in quality scores step later on.
    zcat Dog1.0.agp.chromosome.qual.gz \
    | ../jkStuff/fixBroadChromHeaders.pl \
    | qaToQac stdin fixedQuals.qac

# DOUBLE-CHECK BY BUILDING ALT CHROM FASTA FROM CONTIGS+AGP (DONE 7/7/04 angie)
    # This is a redundant check, very low chance of finding any problem
    # since checkAgpAndFa passed.  It's also very slow.  So don't put this
    # in the critical path to masked sequence -- run it in parallel.
    ssh kolossus
    cd /cluster/data/canFam1
    mkdir /cluster/bluearc/canFam1
    gunzip -c broad/contigs.bases.gz > /cluster/bluearc/canFam1/contigs.fa
    # The number of "real" bases here should be equal to the number of
    # "real" bases in the chrom fasta -- and it is now on 7/7, whew.
    faSize /cluster/bluearc/canFam1/contigs.fa
#2359828366 bases (0 N's 2359828366 real) in 59927 sequences in 1 files
#Total size: mean 39378.4 sd 70249.8 min 352 (contig_41721) max 1356923 (contig_5971) median 8974
#N count: mean 0.0 sd 0.0
    foreach agp (?{,?}/chr*.agp)
      set fa = $agp:r.fa2
      echo building $fa from $agp
      awk '$5 == "W" {print $6;}' $agp > /tmp/contigs.lst
      faSomeRecords /cluster/bluearc/canFam1/contigs.fa /tmp/contigs.lst \
        /tmp/contigs.fa
      agpToFa -simpleMultiMixed $agp $agp:t:r $fa /tmp/contigs.fa
    end
    faSize */chr*.fa2
#2519779136 bases (159950770 N's 2359828366 real) in 40 sequences in 40 files
#Total size: mean 62994478.4 sd 23674450.6 min 26492144 (chr38) max 126913094 (chrX) median 61172541
#N count: mean 3998769.2 sd 2953878.1
    foreach f (?{,?}/chr*.fa2)
      faCmp $f $f:r.fa
    end
    rm ?{,?}/chr*.fa2

# COMPARE CONTIG & CHROM NON-N BASE COUNTS (DONE 7/7/04 angie)
    # In the 7/6 assembly, faSize showed different "real" base counts
    # for contigs (2359828366) vs. chroms (2359035963): contigs have
    # 792403 more non-N bases than chroms.
    # Here's how I determined that the difference was explained by the
    # omission of some contigs from the AGP.
    # In the 7/7 assembly, all contigs are in the AGP so we're OK now.
    ssh kksilo
    cd /cluster/data/canFam1
    awk '$5 == "W" {print $6;}' broad/Dog1.0.agp | sort > /tmp/1
    uniq /tmp/1 > /tmp/2
    wc -l /tmp/[12]
    # 59927 /tmp/1
    # 59927 /tmp/2
    # 119854 total
    zcat broad/contigs.bases.gz | g '^>' | sed -e 's/^>//' | sort > /tmp/3
    wc -l /tmp/3
    # 59927 /tmp/3
    diff /tmp/1 /tmp/2
    # (no difference ==> each contig appears only once)
    diff /tmp/1 /tmp/3 | egrep -v '^[0-9]+' | wc -l
    # 0
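    # (Sketch, an equivalent spot-check not used at the time: comm prints
    # AGP-only vs. contig-only names in one shot on the two sorted lists;
    # no output expected here.)
    comm -3 /tmp/2 /tmp/3 | head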
    # Excellent!  0 contigs missing.  So we don't have to do this stuff,
    # but here it is in case we need it again in the future:
    set missers = `diff /tmp/1 /tmp/3 | egrep -v '^[0-9]+' | sed -e 's/^> //'`
    zcat broad/contigs.bases.gz | faSize -detailed=on stdin > /tmp/4
    set total = 0
    foreach ctg ($missers)
      set size = `grep -w $ctg /tmp/4 | awk '{print $2;}'`
      echo "$ctg\t$size"
      set total = `expr $total + $size`
    end
    echo "total\t\t$total"
#total           0

# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 7/7/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1
    foreach agp (?{,?}/chr*.agp)
      set fa = $agp:r.fa
      echo splitting $agp and $fa
      cp -p $agp $agp.bak
      cp -p $fa $fa.bak
      splitFaIntoContigs $agp $fa . -nSize=5000000
    end
    # No _random's in this assembly, so no need to clean up after them.
    # Make a "pseudo-contig" for processing chrM too:
    mkdir M/chrM_1
    sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
    mkdir M/lift
    echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
    echo "chrM_1" > M/lift/ordered.lst
    set msize = `faSize M/chrM.fa | awk '{print $1;}'`
    echo "0\tM/chrM_1\t$msize\tchrM\t$msize" > M/lift/ordered.lft
    foreach f ( ?{,?}/chr*.fa.bak )
      faCmp $f $f:r
    end

# MAKE LIFTALL.LFT (DONE 7/7/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1
    cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft

# CREATING DATABASE (DONE 7/6/04 angie)
    ssh hgwdev
    echo 'create database canFam1' | hgsql ''
    # Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql
    df -h /var/lib/mysql
#/dev/sdc1             1.8T  409G  1.3T  25% /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE 7/6/04 angie)
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" \
      | hgsql canFam1

# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS (DONE 7/7/04 angie)
    # Make nib/, unmasked until RepeatMasker and TRF steps are done.
    # Do this now so we can load up RepeatMasker and run featureBits;
    # can also load up other tables that don't depend on masking.
    ssh kksilo
    cd /cluster/data/canFam1
    mkdir nib
    foreach f (?{,?}/chr*.fa)
      echo "nibbing $f"
      faToNib $f nib/$f:t:r.nib
    end
    # Make symbolic links from /gbdb/canFam1/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/canFam1/nib
    foreach f (/cluster/data/canFam1/nib/chr*.nib)
      ln -s $f /gbdb/canFam1/nib
    end
    # Load /gbdb/canFam1/nib paths into database and save size info.
    cd /cluster/data/canFam1
    hgsql canFam1 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib canFam1 /gbdb/canFam1/nib */chr*.fa
    echo "select chrom,size from chromInfo" | hgsql -N canFam1 > chrom.sizes
    # take a look at chrom.sizes, should be 41 lines
    wc chrom.sizes
#     41      82     603 chrom.sizes

# GOLD AND GAP TRACKS (DONE 7/7/04 angie)
    ssh hgwdev
    cd /cluster/data/canFam1
    hgGoldGapGl -noGl -chromLst=chrom.lst canFam1 /cluster/data/canFam1 .
    # featureBits fails if there's no chrM_gap, so make one:
    # echo "create table chrM_gap like chr1_gap" | hgsql canFam1
    # oops, that won't work until v4.1, so do this for the time being:
    hgsql canFam1 -e "create table chrM_gap select * from chr1_gap where 0=1"

# ACCESSIONS FOR CONTIGS (DONE 2004-09-15 kate)
    # Used for Assembly track details, and also ENCODE AGP's
    cd /cluster/data/canFam1/bed
    mkdir -p contigAcc
    cd contigAcc
    # extract WGS contig to accession mapping from Entrez Nucleotide summaries
    # To get the summary file, access the Genbank page for the project
    # by searching:
    #   genus[ORGN] AND WGS[KYWD]
    # At the end of the page, click on the list of accessions for the contigs.
    # Select summary format, file, and click "send to".
    # output to file .acc
    contigAccession dog.acc > contigAcc.tab
    # 59927 items
    wc -l contigAcc.tab
    # 59927 contigAcc.tab
    # verify this corresponds with the Assembly table --
    # get summary stats for chrN_gold, using table browser:
    # items matching query: 59927
    # load into database
    hgsql canFam1 < $HOME/kent/src/hg/lib/contigAcc.sql
    echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
        hgsql canFam1
    hgsql canFam1 -e "SELECT COUNT(*) FROM contigAcc"

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CANFAM1 (DONE 7/7/04 angie)
    ssh hgwdev
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit that makefile to add canFam1 in all the right places and do
    make update
    cvs commit makefile
    mkdir -p dog/canFam1
    cvs add dog dog/canFam1
    cvs ci -m "trackDb dir for dog genome(s)" dog/canFam1
    make alpha
    # Add dbDb and defaultDb entries:
    echo 'insert into dbDb (name, description, nibPath, organism, \
            defaultPos, active, orderKey, genome, scientificName, \
            htmlPath, hgNearOk, hgPbOk, sourceName) \
          values("canFam1", "Jul. 2004", \
            "/gbdb/canFam1/nib", "Dog", "chr14:10666612-10673232", 1, \
            18, "Dog", "Canis familiaris", \
            "/gbdb/canFam1/html/description.html", 0, 0, \
            "Broad Institute v. 1.0");' \
      | hgsql -h genome-testdb hgcentraltest
    # If there's another dog assembly, this will be an update not insert:
    echo 'insert into defaultDb (genome, name) values("Dog", "canFam1");' \
      | hgsql -h genome-testdb hgcentraltest

# REPEAT MASKING (DONE 7/7/04 angie)
    #- Split contigs into 500kb chunks, at gaps if possible:
    ssh kksilo
    cd /cluster/data/canFam1
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
          -minGapSize=100
        cd ../..
      end
    end
    #- Make the run directory and job list:
    cd /cluster/data/canFam1
    cat << '_EOF_' > jkStuff/RMDog
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/canFam1/$2
/bin/cp $2 /tmp/canFam1/$2/
cd /tmp/canFam1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec dog $2
popd
/bin/cp /tmp/canFam1/$2/$2.out ./
if (-e /tmp/canFam1/$2/$2.align) /bin/cp /tmp/canFam1/$2/$2.align ./
if (-e /tmp/canFam1/$2/$2.tbl) /bin/cp /tmp/canFam1/$2/$2.tbl ./
if (-e /tmp/canFam1/$2/$2.cat) /bin/cp /tmp/canFam1/$2/$2.cat ./
/bin/rm -fr /tmp/canFam1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/canFam1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/canFam1
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMDog
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}_?{,?})
        set ctg = $d:t
        foreach f ( $d/${ctg}_?{,?}.fa )
          set f = $f:t
          echo /cluster/data/canFam1/jkStuff/RMDog \
               /cluster/data/canFam1/$d $f \
               '{'check out line+ /cluster/data/canFam1/$d/$f.out'}' \
            >> RMRun/RMJobs
        end
      end
    end
    #- Do the run
    ssh kk
    cd /cluster/data/canFam1/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
#Completed: 6373 of 6373 jobs
#CPU time in finished jobs: 24949716s 415828.60m 6930.48h 288.77d 0.791 y
#IO & Wait Time: 220014s 3666.90m 61.12h 2.55d 0.007 y
#Average job time: 3949s 65.82m 1.10h 0.05d
#Longest job: 6389s 106.48m 1.77h 0.07d
#Submission to last job: 31173s 519.55m 8.66h 0.36d

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/canFam1
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end
    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` > /dev/null
      cd ..
    end
    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/canFam1
    hgLoadOut canFam1 */chr*.fa.out

# VERIFY REPEATMASKER RESULTS (DONE 7/7/04 angie)
    # Eyeball some repeat annotations in the browser, compare to lib seqs.
    # Run featureBits on canFam1 and on a comparable genome build, and compare:
    ssh hgwdev
    featureBits canFam1 rmsk
#896773874 bases of 2359845093 (38.001%) in intersection
    # Good, 35-40% seems typical for mammals...

# COMPARE OUR REPEATMASKER .OUT'S TO BROAD'S (DONE 7/23/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/broad
    ftp ftp.broad.mit.edu
    # same username & password as for fasta etc.
    bin
    get dog1.0_repeatinfo.tar.gz
    bye
    mkdir /cluster/data/canFam1/bed/compareRmsk
    cd /cluster/data/canFam1/bed/compareRmsk
    tar xvzf ../../broad/dog1.0_repeatinfo.tar.gz
    rm dog1.0_repeatinfo/*.index
    # compare... files?  definitely coverage --- make a bed4.
    cp /dev/null broadRmskBed4.bed
    foreach f (dog1.0_repeatinfo/chr*.fa.out)
      tail +4 $f | awk '{print $5 "\t" $6-1 "\t" $7 "\t" $10;}' \
        >> broadRmskBed4.bed
    end
    wc -l ../../?{,?}/chr*.fa.out | tail -1
#4337285 total
    wc -l dog1.0_repeatinfo/chr*.fa.out | tail -1
#4059835 total
    ssh hgwdev
    cd /cluster/data/canFam1/bed/compareRmsk
    # Compare coverage with our RepeatMasker run:
    #896773874 bases of 2359845093 (38.001%) in intersection
    featureBits canFam1 broadRmskBed4.bed
#883152916 bases of 2359845093 (37.424%) in intersection
    featureBits canFam1 broadRmskBed4.bed \!rmsk -bed=notInRmsk.bed
#7053647 bases of 2359845093 (0.299%) in intersection
    featureBits canFam1 \!broadRmskBed4.bed rmsk -bed=notInBroad.bed
#20674605 bases of 2359845093 (0.876%) in intersection
    featureBits canFam1 broadRmskBed4.bed \!rmsk \!simpleRepeat \
      -bed=notInRmskTRF.bed
#6906446 bases of 2359845093 (0.293%) in intersection
    # Load theirs up as "rmskNew" since hgTracks has a registered handler
    # for that... easier visual comparison.
    hgLoadOut -table=rmskNew canFam1 dog1.0_repeatinfo/chr*.fa.out
    # back on kksilo
    wc -l notInRmsk.bed
#  256372 notInRmsk.bed
    awk '$3-$2 >= 40 {printf "%s\t%d\t%d\t%s\t%d\n", $1, $2, $3, $4, $3-$2;}' \
      notInRmsk.bed > notInRmskOver40.bed
    awk '$3-$2 >= 40 {printf "%s\t%d\t%d\t%s\t%d\n", $1, $2, $3, $4, $3-$2;}' \
      notInBroad.bed > notInBroadOver40.bed
    # Take a look at some of those positions in the browser.
    # Here's one we just miss: chr1:4,524,717-4,524,907
    # Here's one they just miss: chr1:3,503,297-3,503,495
    # Broad folks agreed the diffs are very small (sometimes due to the fact
    # that they ran on contigs while we ran on ~500k chunks).
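    # (Sketch, not part of the original comparison: size distribution of
    # the regions we call but Broad doesn't, to confirm the diffs really
    # are mostly tiny.)
    awk '{print $3 - $2;}' notInBroad.bed \
    | textHistogram stdin -binSize=20 -maxBinCount=20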
# SIMPLE REPEATS (TRF) (DONE 7/7/04 angie)
    ssh kksilo
    mkdir /cluster/data/canFam1/bed/simpleRepeat
    cd /cluster/data/canFam1/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach d (/cluster/data/canFam1/*/chr*_?{,?})
      set ctg = $d:t
      foreach f ($d/${ctg}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
          >> jobs.csh
      end
    end
    csh -ef jobs.csh >&! jobs.log &
    # check on this with tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    endsInLf trf/*
    # When job is done do:
    liftUp simpleRepeat.bed /cluster/data/canFam1/jkStuff/liftAll.lft warn \
      trf/*.bed
    # Load into the database:
    ssh hgwdev
    hgLoadBed canFam1 simpleRepeat \
      /cluster/data/canFam1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    featureBits canFam1 simpleRepeat
#36509895 bases of 2359845093 (1.547%) in intersection

# PROCESS SIMPLE REPEATS INTO MASK (DONE 7/7/04 angie)
    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/canFam1/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/canFam1
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (`cat chrom.lst`)
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
    end
    # Here's the coverage for the filtered TRF:
    ssh hgwdev
    cat /cluster/data/canFam1/bed/simpleRepeat/trfMaskChrom/*.bed \
      > /tmp/filtTrf.bed
    featureBits canFam1 /tmp/filtTrf.bed
#23017541 bases of 2359845093 (0.975%) in intersection
    featureBits canFam1 /tmp/filtTrf.bed \!rmsk
#1275941 bases of 2359845093 (0.054%) in intersection
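    # (Sketch, an extra check not in the original: how many trf annotations
    # survive the period <= 12 filter?)
    wc -l /cluster/data/canFam1/bed/simpleRepeat/trf/chr*.bed | tail -1
    wc -l /cluster/data/canFam1/bed/simpleRepeat/trfMask/chr*.bed | tail -1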
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 7/7/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1
    # Soft-mask (lower-case) the contig and chr .fa's,
    # then make hard-masked versions from the soft-masked.
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
    # Tons of warnings like this, mostly for L1M*:
    #WARNING: negative rEnd: -189 chrX:117586389-117586475 L1M3e
    foreach c (`cat chrom.lst`)
      echo "repeat- and trf-masking contigs of chr$c"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
    #- Rebuild the nib files, using the soft masking in the fa:
    foreach f (*/chr*.fa)
      faToNib -softMask $f nib/$f:t:r.nib
    end
    # Make one big 2bit file as well, and make a link to it in
    # /gbdb/canFam1/nib because hgBlat looks there:
    faToTwoBit */chr*.fa canFam1.2bit
    ssh hgwdev
    ln -s /cluster/data/canFam1/canFam1.2bit /gbdb/canFam1/nib/

# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR CANFAM1 (DONE 7/13/04 angie)
    ssh hgwdev
    echo 'insert into blatServers values("canFam1", "blat9", "17778", 1, 0); \
          insert into blatServers values("canFam1", "blat9", "17779", 0, 1);' \
      | hgsql -h genome-testdb hgcentraltest

# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE 7/6/04 angie)
    ssh hgwdev
    mkdir /gbdb/canFam1/html
    # Write ~/kent/src/hg/makeDb/trackDb/dog/canFam1/description.html
    # with a description of the assembly and some sample position queries.
    chmod a+r $HOME/kent/src/hg/makeDb/trackDb/dog/canFam1/description.html
    # Check it in and copy (ideally using "make alpha" in trackDb) to
    # /gbdb/canFam1/html

# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 7/7/04 angie)
    ssh kkr1u00
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /iscratch/i/canFam1/nib
    mkdir /iscratch/i/canFam1/nib
    cp -p /cluster/data/canFam1/nib/chr*.nib /iscratch/i/canFam1/nib
    # Pseudo-contig fa that have been repeat- and trf-masked:
    rm -rf /iscratch/i/canFam1/maskedContigs
    mkdir /iscratch/i/canFam1/maskedContigs
    foreach d (/cluster/data/canFam1/*/chr*_?{,?})
      cp $d/$d:t.fa /iscratch/i/canFam1/maskedContigs
    end
    cp -p /cluster/data/canFam1/canFam1.2bit /iscratch/i/canFam1/
    du -sh /iscratch/i/canFam1/*
#631M    /iscratch/i/canFam1/canFam1.2bit
#2.5G    /iscratch/i/canFam1/maskedContigs
#1.2G    /iscratch/i/canFam1/nib
    iSync
    # Put nibs and rmsk files in /scratch for big cluster blastz.
    mkdir /cluster/bluearc/scratch/hg/canFam1
    cp -pR /iscratch/i/canFam1/nib /cluster/bluearc/scratch/hg/canFam1
    ssh kksilo
    mkdir /cluster/bluearc/scratch/hg/canFam1/rmsk
    cp -p /cluster/data/canFam1/*/chr*.fa.out \
      /cluster/bluearc/scratch/hg/canFam1/rmsk
    # Ask cluster-admin for an rsync after the next step.

# MAKE LINEAGE-SPECIFIC REPEATS VS. HUMAN, MOUSE (DONE 7/7/04 angie)
    ssh kksilo
    cd /cluster/bluearc/scratch/hg/canFam1/rmsk
    # Run Arian's DateRepsinRMoutput.pl to add extra columns telling
    # whether repeats in -query are also expected in -comp species.
    # Human in extra column 1, Mouse in extra column 2
    foreach outfl ( *.out )
      echo "$outfl"
      /cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \
        ${outfl} -query dog -comp human -comp mouse
    end
    # Now extract human (extra column 1), mouse (extra column 2).
    cd /cluster/bluearc/scratch/hg/canFam1
    mkdir linSpecRep.notInHuman
    mkdir linSpecRep.notInMouse
    foreach f (rmsk/*.out_hum_mus)
      set base = $f:t:r:r
      echo $base.out.spec
      /cluster/bin/scripts/extractLinSpecReps 1 $f > \
        linSpecRep.notInHuman/$base.out.spec
      /cluster/bin/scripts/extractLinSpecReps 2 $f > \
        linSpecRep.notInMouse/$base.out.spec
    end
    wc -l rmsk/*.out
#4337285 total
    wc -l linSpecRep.notInHuman/*
#1333452 total
    wc -l linSpecRep.notInMouse/*
#1333452 total
    # The linSpecReps.notIn{Human,Mouse} files are the same.  I asked Arian
    # if that's a bad thing last week (human vs. {dog,mouse} -> same).
    # Haven't heard back yet, but hey, we're all mammals here...
    foreach f (linSpecRep.notInHuman/*)
      echo $f:t
      diff $f linSpecRep.notInMouse/$f:t | wc -l | g -vw 0
    end
    # Clean up.
    rm /cluster/bluearc/scratch/hg/canFam1/rmsk/*.out_hum_mus
    # Ask cluster-admin for an rsync.

# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE 7/7/04 angie)
    # Use -repMatch=843 (based on size -- for human we use 1024, and
    # dog size is 2359845093 / 2866216770 = ~82.3% of human judging by
    # gapless genome size from featureBits); see the sketch at the end
    # of this step.
    ssh kolossus
    mkdir /cluster/data/canFam1/bed/ooc
    cd /cluster/data/canFam1/bed/ooc
    ls -1 /cluster/data/canFam1/nib/chr*.nib > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/canFam1/11.ooc -repMatch=843
#Wrote 26268 overused 11-mers to /cluster/bluearc/canFam1/11.ooc
    blat nib.lst /dev/null /dev/null -tileSize=10 \
      -makeOoc=/cluster/bluearc/canFam1/10.ooc -repMatch=843
#Wrote 143003 overused 10-mers to /cluster/bluearc/canFam1/10.ooc
    ssh kkr1u00
    cp -p /cluster/bluearc/canFam1/*.ooc /iscratch/i/canFam1/
    iSync
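    # (Sketch, not in the original doc: the -repMatch arithmetic spelled
    # out -- scale human's 1024 by the ratio of gapless genome sizes.)
    awk 'BEGIN {printf "%.0f\n", 1024 * 2359845093 / 2866216770;}'
#843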
# AUTO UPDATE GENBANK MRNA RUN (DONE 7/8/04 angie)
    # 7/22/04: added xenoRefGene
    # 2/22/06: added refGene
    ssh hgwdev
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .
    # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
    # edits, check them in if necessary:
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf
    # Edit etc/genbank.conf and add these lines:
# canFam1 (dog)
canFam1.genome = /iscratch/i/canFam1/nib/chr*.nib
canFam1.lift = /cluster/data/canFam1/jkStuff/liftAll.lft
canFam1.refseq.mrna.native.load = no
canFam1.refseq.mrna.xeno.load = yes
canFam1.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005
canFam1.genbank.mrna.xeno.load = yes
canFam1.genbank.est.xeno.load = no
canFam1.downloadDir = canFam1
    cvs ci etc/genbank.conf
    # Since dog is a new species for us, edit src/lib/gbGenome.c.
    # Pick some other browser species, & monkey-see monkey-do.
    cvs diff src/lib/gbGenome.c
    make
    cvs ci src/lib/gbGenome.c
    # Edit src/align/gbBlat to add /iscratch/i/canFam1/11.ooc
    cvs diff src/align/gbBlat
    make
    cvs ci src/align/gbBlat
    # Install to /cluster/data/genbank:
    make install-server
    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, (xeno) refseq only:
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial canFam1 &
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam1
    featureBits canFam1 xenoRefGene
#39066569 bases of 2359845093 (1.655%) in intersection
    # Clean up:
    rm -r work/initial.canFam1
    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial canFam1 &
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam1
    featureBits canFam1 mrna
#1850373 bases of 2359845093 (0.078%) in intersection
    featureBits canFam1 xenoMrna
#62236255 bases of 2359845093 (2.637%) in intersection
    # Clean up:
    rm -r work/initial.canFam1
    ssh eieio
    # -initial for ESTs:
    nice bin/gbAlignStep -srcDb=genbank -type=est -initial canFam1 &
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 canFam1 &
    featureBits canFam1 intronEst
#4090352 bases of 2359845093 (0.173%) in intersection
    featureBits canFam1 est
#10958036 bases of 2359845093 (0.464%) in intersection
    # Clean up:
    rm -r work/initial.canFam1
    # 2/22/06: add native refseq
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    # genbank.conf:
    #   canFam1.refseq.mrna.native.load = yes
    make etc-update
    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -orgCat=native -initial \
      canFam1 &
    tail -f var/build/logs/2006.02.22-17:54:01.canFam1.initalign.log
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam1 &
    tail -f var/dbload/hgwdev/logs/2006.02.22-18:01:09.dbload.log
    featureBits canFam1 refGene
#1220839 bases of 2359845093 (0.052%) in intersection

# COMPARE COUNTS OF ALIGNED VS NON-ALIGNED MRNAS (DONE 7/8/04 angie)
    ssh hgwdev
    mkdir /cluster/data/canFam1/bed/mrnaCheck
    cd /cluster/data/canFam1/bed/mrnaCheck
    # Get accs of all mRNAs and ESTs in our genbank build.
    # Note: if/when there's a RefSeq for dog, filter NM_* accessions
    # out of the mRNA list!
    hgsql -N canFam1 -e 'select acc from mrna where type = "mRNA" and \
                         (organism = 2497 or organism = 2654)' \
      | sort > accs.mrna
    hgsql -N canFam1 -e 'select acc from mrna where type = "EST" and \
                         (organism = 2497 or organism = 2654)' \
      | sort > accs.est
    # Get accs of mRNAs and ESTs that were aligned:
    hgsql -N canFam1 -e "select distinct(qName) from all_mrna" \
      | sort > alignedAccs.mrna
    hgsql -N canFam1 -e "select distinct(qName) from all_est" \
      | sort > alignedAccs.est
    # Get accs of mRNAs and ESTs that were not aligned:
    diff accs.mrna alignedAccs.mrna | egrep -v '^[0-9]' | sed -e 's/^< //' \
      > notAlignedAccs.mrna
    diff accs.est alignedAccs.est | egrep -v '^[0-9]' | sed -e 's/^< //' \
      > notAlignedAccs.est
    # Look at counts, compute percentages.
    wc -l *
#    37387 accs.est
#     1188 accs.mrna
#    34781 alignedAccs.est
#     1133 alignedAccs.mrna
#     2606 notAlignedAccs.est
#       55 notAlignedAccs.mrna
    expr 37387 - 34781
#2606 (93.0% aligned, 7.0% not aligned)
    expr 1188 - 1133
#55 (95.4% aligned, 4.6% not aligned)
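    # (Sketch, not in the original doc: the percentages above computed
    # directly rather than by hand.)
    awk 'BEGIN {printf "ESTs:  %.1f%% aligned\n", 100 * 34781 / 37387;}'
#ESTs:  93.0% aligned
    awk 'BEGIN {printf "mRNAs: %.1f%% aligned\n", 100 * 1133 / 1188;}'
#mRNAs: 95.4% aligned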
    # Add descriptions to mRNAs not aligned.
    cp /dev/null notAlignedDesc.mrna
    foreach acc (`cat notAlignedAccs.mrna`)
      hgsql -N canFam1 \
        -e 'select mrna.acc,description.name from mrna,description \
            where mrna.acc = "'$acc'" and mrna.description = description.id' \
        >> notAlignedDesc.mrna
    end

# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 7/7/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1
    #- Build the .zip files
    cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp
zip -j zip/chromOut.zip */chr*.fa.out
zip -j zip/chromFa.zip */chr*.fa
zip -j zip/chromFaMasked.zip */chr*.fa.masked
cd bed/simpleRepeat
zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed
cd ../..
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=canFam1 -native GenBank mrna \
  /cluster/data/canFam1/zip/mrna.fa
cd /cluster/data/canFam1/zip
zip -j mrna.zip mrna.fa
'_EOF_'
    # << this line makes emacs coloring happy
    csh ./jkStuff/zipAll.csh |& tee zipAll.log
    cd zip
    #- Look at zipAll.log to make sure all file lists look reasonable.
    #- Check zip file integrity:
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test
    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd /cluster/data/canFam1/zip
    set gp = /usr/local/apache/htdocs/goldenPath/canFam1
    mkdir -p $gp/bigZips
    cp -p *.zip $gp/bigZips
    mkdir -p $gp/chromosomes
    foreach f ( ../*/chr*.fa )
      zip -j $gp/chromosomes/$f:t.zip $f
    end
    cd $gp/bigZips
    md5sum *.zip > md5sum.txt
    cd $gp/chromosomes
    md5sum *.zip > md5sum.txt
    # Take a look at bigZips/* and chromosomes/*, update their README.txt's
    # Can't make refGene upstream sequence files - no refSeq for dog.
    # Maybe ensGene when we get that.

# SWAP BLASTZ HUMAN-DOG TO DOG-HUMAN (HG17) (DONE 7/9/04 angie)
    ssh kolossus
    mkdir /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08
    ln -s blastz.hg17.swap.2004-07-08 /cluster/data/canFam1/bed/blastz.hg17
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08
    set aliDir = /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
    rm -r unsorted

# CHAIN HUMAN BLASTZ (DONE 7/10/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
  /iscratch/i/canFam1/nib \
  /iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # Wow, chr1 dwarfs the others...
#Completed: 41 of 41 jobs
#Average job time: 293s 4.88m 0.08h 0.00d
#Longest job: 4419s 73.65m 1.23h 0.05d
#Submission to last job: 4419s 73.65m 1.23h 0.05d
    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain
    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=4000 /tmp/score.$f:t:r
      echo ""
    end
    # Lots of chaff with scores in the 3000's.  Many very-high-scoring
    # chains.  So filter the chain down somewhat...
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chain
    rm chain/*
    chainSplit chain all.chain
    gzip all.chain.unfiltered
    # Load chains into database
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain/chain
    foreach i (*.chain)
      set c = $i:r
      hgLoadChain canFam1 ${c}_chainHg17 $i
    end
    # Wow, very high coverage!
    featureBits -chrom=chr1 canFam1 chainHg17Link
#Out of memory needMem - request size 75 bytes
    featureBits -chrom=chr2 canFam1 chainHg17Link
#55072794 bases of 83844569 (65.684%) in intersection

# NET HUMAN BLASTZ (DONE 7/10/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net
    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain
    netClass noClass.net canFam1 hg17 human.net
    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn human.net > humanSyn.net
    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain
    netFilter -minGap=10 human.net | hgLoadNet canFam1 netHg17 stdin
    netFilter -minGap=10 humanSyn.net | hgLoadNet canFam1 netSyntenyHg17 stdin
    # Add entries for chainHg17, netHg17, syntenyHg17 to dog/canFam1 trackDb

# GENERATE HG17 MAF FOR MULTIZ FROM NET (DONE 7/19/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/axtChain
    netSplit human.net net
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08
    mkdir axtNet
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/canFam1/nib \
        /cluster/data/hg17/nib stdout \
      | axtSort stdin axtNet/$chr.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
        /cluster/data/canFam1/chrom.sizes /cluster/data/hg17/chrom.sizes \
        $maf -tPrefix=canFam1. -qPrefix=hg17.
    end

# MAKE VSHG17 DOWNLOADABLES (DONE 7/27/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.hg17/axtChain
    gzip -c all.chain > /cluster/data/canFam1/zip/human.chain.gz
    gzip -c human.net > /cluster/data/canFam1/zip/human.net.gz
    # Added 9/15/04:
    mkdir /cluster/data/canFam1/zip/axtNet
    foreach f (../axtNet/chr*.axt)
      gzip -c $f > /cluster/data/canFam1/zip/axtNet/$f:t.gz
    end
    # Make chain-format of raw alignments -- special request by Broad
    cd /cluster/data/canFam1/bed/blastz.hg17
    mkdir blastzECF
    foreach f (axtChrom/chr*.axt.gz)
      set chr = $f:t:r:r
      gunzip -c $f \
      | axtToChain stdin S1.len S2.len stdout \
      | gzip -c - > blastzECF/$chr.ecf.gz
    end
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/canFam1/vsHg17
    cd /usr/local/apache/htdocs/goldenPath/canFam1/vsHg17
    mv /cluster/data/canFam1/zip/human*.gz .
    mv /cluster/data/canFam1/zip/axtNet .
    md5sum *.gz */*.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.
    # Not for pushing -- handle separately.
    mv /cluster/data/canFam1/bed/blastz.hg17/blastzECF .
    cd blastzECF
    md5sum *.gz > md5sum.txt

# SWAP BLASTZ MOUSE-DOG TO DOG-MOUSE (MM5) (DONE 7/19/04 angie)
    ssh kolossus
    mkdir /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15
    ln -s blastz.mm5.swap.2004-07-15 /cluster/data/canFam1/bed/blastz.mm5
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15
    set aliDir = /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
    rm -r unsorted

# CHAIN MOUSE BLASTZ (DONE 7/19/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
  /iscratch/i/canFam1/nib \
  /iscratch/i/mus/mm5/softNib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 41 of 41 jobs
#Average job time: 211s 3.52m 0.06h 0.00d
#Longest job: 488s 8.13m 0.14h 0.01d
#Submission to last job: 579s 9.65m 0.16h 0.01d
    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain
    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=5000 /tmp/score.$f:t:r
      echo ""
    end
    # Lots of chaff with scores in the 3000's.  Many very-high-scoring
    # chains.  So filter the chain down somewhat...
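    # (Sketch, not run at the time: count how many chains the
    # -minScore=5000 cutoff will discard before committing to the filter.
    # Chain header lines start with "chain" and carry the score in field 2.)
    awk '/^chain/ {if ($2 < 5000) n++; else k++;} \
         END {print n, "chains below 5000,", k, "kept";}' all.chain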
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chain
    rm chain/*
    chainSplit chain all.chain
    gzip all.chain.unfiltered
    # Load chains into database
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain/chain
    foreach i (*.chain)
      set c = $i:r
      hgLoadChain canFam1 ${c}_chainMm5 $i
    end
    # Similar to human-mouse coverage -- not nearly as high as dog-human.
    featureBits -chrom=chr1 canFam1 chainMm5Link
#40430697 bases of 120715446 (33.493%) in intersection
    featureBits -chrom=chr2 canFam1 chainMm5Link
#31744742 bases of 83844569 (37.861%) in intersection

# NET MOUSE BLASTZ (DONE 7/19/04 angie)
    ssh kolossus
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net
    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain
    netClass -noAr noClass.net canFam1 mm5 mouse.net
    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn mouse.net > mouseSyn.net
    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain
    netFilter -minGap=10 mouse.net | hgLoadNet canFam1 netMm5 stdin
    netFilter -minGap=10 mouseSyn.net | hgLoadNet canFam1 netSyntenyMm5 stdin
    # Add entries for chainMm5, netMm5, syntenyMm5 to dog/canFam1 trackDb

# GENERATE MM5 MAF FOR MULTIZ FROM NET (DONE 7/19/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/axtChain
    netSplit mouse.net net
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15
    mkdir axtNet
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/canFam1/nib \
        /cluster/data/mm5/nib stdout \
      | axtSort stdin axtNet/$chr.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
        /cluster/data/canFam1/chrom.sizes /cluster/data/mm5/chrom.sizes \
        $maf -tPrefix=canFam1. -qPrefix=mm5.
    end

# MAKE VSMM5 DOWNLOADABLES (DONE 7/28/04 angie)
    ssh kksilo
    cd /cluster/data/canFam1/bed/blastz.mm5/axtChain
    gzip -c all.chain > /cluster/data/canFam1/zip/mouse.chain.gz
    gzip -c mouse.net > /cluster/data/canFam1/zip/mouse.net.gz
    # Added 9/15/04:
    mkdir /cluster/data/canFam1/zip/axtNet
    foreach f (../axtNet/chr*.axt)
      gzip -c $f > /cluster/data/canFam1/zip/axtNet/$f:t.gz
    end
    # Make chain-format of raw alignments -- special request by Broad
    cd /cluster/data/canFam1/bed/blastz.mm5
    mkdir blastzECF
    foreach f (axtChrom/chr*.axt)
      set chr = $f:t:r
      axtToChain $f S1.len S2.len stdout \
      | gzip -c - > blastzECF/$chr.ecf.gz
    end
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/canFam1/vsMm5
    cd /usr/local/apache/htdocs/goldenPath/canFam1/vsMm5
    mv /cluster/data/canFam1/zip/mouse*.gz .
    mv /cluster/data/canFam1/zip/axtNet .
    md5sum *.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.
    # Not for pushing -- handle separately.
    mv /cluster/data/canFam1/bed/blastz.mm5/blastzECF .
    cd blastzECF
    md5sum *.gz > md5sum.txt

# MULTIZ DOG/HUMAN/MOUSE (DONE 7/19/04 angie)
    # put the MAFs on bluearc
    ssh kksilo
    mkdir -p /cluster/bluearc/multiz.dog/ch
    cp /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/mafNet/*.maf \
      /cluster/bluearc/multiz.dog/ch
    mkdir -p /cluster/bluearc/multiz.dog/cm
    cp /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/mafNet/*.maf \
      /cluster/bluearc/multiz.dog/cm
    ssh kki
    mkdir /cluster/data/canFam1/bed/multiz.canFam1hg17mm5
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5
    mkdir chm
    # Wrapper script required because of stdout redirect:
    cat << '_EOF_' > doMultiz
#!/bin/csh
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doMultiz
    rm -f jobList
    foreach file (/cluster/bluearc/multiz.dog/ch/*.maf)
      set root=$file:t:r:r
      echo "doMultiz {check in line+ /cluster/bluearc/multiz.dog/cm/${root}.maf} {check in line+ $file} {check out line+ /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/chm/${root}.maf}" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 41 of 41 jobs
#Average job time: 165s 2.75m 0.05h 0.00d
#Longest job: 343s 5.72m 0.10h 0.00d
#Submission to last job: 569s 9.48m 0.16h 0.01d
    # clean up bluearc (these are big files!)
    rm -r /cluster/bluearc/multiz.dog
    # setup external files for database reference
    ssh hgwdev
    mkdir -p /gbdb/canFam1/mzHg17Mm5_phast
    ln -s /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/chm/*.maf \
      /gbdb/canFam1/mzHg17Mm5_phast
    # load into database
    hgLoadMaf -warn canFam1 mzHg17Mm5_phast
    # load pairwise mafs for wigMaf track
    cd /gbdb/canFam1
    mkdir -p human_chm mouse_chm
    cd /tmp
    ln -s /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/mafNet/*.maf \
      /gbdb/canFam1/human_chm
    hgLoadMaf -WARN canFam1 human_chm
    ln -s /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/mafNet/*.maf \
      /gbdb/canFam1/mouse_chm
    hgLoadMaf -WARN canFam1 mouse_chm

# MAKE MM5 LIFTOVER CHAIN FILE (DONE 1/17/05 Andy)
    # This took over an hour.  Maybe it's that slow no matter what if
    # the big chain isn't split up first.
    ssh kolossus
    cd /cluster/data/canFam1/bed/blastz.mm5/axtChain
    mkdir over
    netSplit mouse.net net
    mkdir /cluster/data/canFam1/bed/liftOver
    for file in net/*.net; do
      chrom=`basename $file .net`
      netChainSubset net/$chrom.net all.chain.gz over/$chrom.over
      cat over/$chrom.over >> /cluster/data/canFam1/bed/liftOver/canFam1ToMm5.chain
    done
    rm -rf over/
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/canFam1/liftOver
    cd /usr/local/apache/htdocs/goldenPath/canFam1/liftOver
    cp /cluster/data/canFam1/bed/liftOver/canFam1ToMm5.chain .
    gzip canFam1ToMm5.chain
    mkdir -p /gbdb/canFam1/liftOver
    ln -s /cluster/data/canFam1/bed/liftOver/canFam1ToMm5.chain \
      /gbdb/canFam1/liftOver/canFam1ToMm5.over.chain
    hgAddLiftOverChain -multiple canFam1 mm5

# MAKE HG17 LIFTOVER CHAIN FILE (DONE 1/17/05 Andy)
    # Maybe I can do without the net splitting this time...
    # nope, ran out of memory.  I split up the all.chain file as another
    # step.  It took 1.5 hrs to run.
    ssh kolossus
    cd /cluster/data/canFam1/bed/blastz.hg17/axtChain
    mkdir over
    netSplit human.net net
    mkdir chain
    for file in net/*.net; do
      chrom=`basename $file .net`
      chainFilter -t=$chrom all.chain.gz > chain/$chrom.chain
      netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
      cat over/$chrom.over >> /cluster/data/canFam1/bed/liftOver/canFam1ToHg17.chain
    done
    rm -rf over/ net/
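    # (Sketch, an extra sanity check not in the original: every chrom with
    # a net should have contributed subset chains, and tName is field 3 of
    # a chain header, so expect up to 41 distinct chroms here.)
    awk '/^chain/ {print $3;}' \
      /cluster/data/canFam1/bed/liftOver/canFam1ToHg17.chain \
    | sort -u | wc -l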
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/canFam1/liftOver
    cp /cluster/data/canFam1/bed/liftOver/canFam1ToHg17.chain .
    gzip canFam1ToHg17.chain
    ln -s /cluster/data/canFam1/bed/liftOver/canFam1ToHg17.chain \
      /gbdb/canFam1/liftOver/canFam1ToHg17.over.chain
    hgAddLiftOverChain -multiple canFam1 hg17

# PHASTCONS DOG/HUMAN/MOUSE (DONE 7/21/04 angie)
    # NOTE: REPLACED, SEE "NEW PHASTCONS" BELOW
    # step 1: fit phylogenetic model, with rate variation (discrete
    # gamma model).  First we need to extract the sufficient stats for
    # the phylogenetic model from the MAFs.  In this case, we don't
    # care about the order of the tuples (allows for much more compact
    # representation)
    ssh kksilo
    mkdir /cluster/data/canFam1/bed/phastCons
    cd /cluster/data/canFam1/bed/phastCons
    mkdir unordered
    foreach f (/cluster/data/canFam1/bed/multiz.canFam1hg17mm5/chm/chr*.maf)
      /cluster/bin/phast/msa_view $f --in-format MAF --out-format SS \
        --order canFam1,hg17,mm5 \
        --unordered-ss > unordered/$f:t:r.ss
    end
    # aggregate for whole genome
    ls unordered/chr*.ss > fnames
    /cluster/bin/phast/msa_view "*fnames" -i SS \
      --aggregate canFam1,hg17,mm5 -o SS --unordered-ss \
      > all.ss
    # estimate parameters; should be very fast in this case
    /cluster/bin/phast/phyloFit --msa all.ss --tree "((1,2),3)" \
      --msa-format SS --subst-mod REV --nrates 40 --out-root rev-dg \
      --output-tree
    # make sure model looks reasonable
    cat rev-dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 40
#ALPHA: 4.506486
#TRAINING_LNL: -4356689945.768768
#BACKGROUND: 0.297650 0.202241 0.202273 0.297836
#RATE_MAT:
#  -0.856784    0.157834    0.554732    0.144219
#   0.232292   -1.211122    0.162600    0.816230
#   0.816302    0.162574   -1.211329    0.232452
#   0.144128    0.554249    0.157868   -0.856245
#TREE: ((1:0.176986,2:0.142776):0.170935,3:0.170935);
    # beware of zero branch lengths (indicates bad topology) or very
    # large branch lengths (much greater than one; indicates
    # convergence problems) -- see the sketch just before the cluster
    # run below.  Can try without --nrates or with
    # --subst-mod HKY85 to double check (parameter estimates should be
    # in the same ballpark, REV + dG likelihood should be a bit
    # better).  Also can write a log file to monitor convergence
    # (--log).  Also watch out for alpha much less than one.
    # Sometimes it's helpful to produce a rendering of the tree using
    # draw_tree (run on the *.nh file)

    # Partition the alignments into bite-sized chunks for cluster phastCons
    # run.  This time we need the ordered version of the SS format.
    set WINSIZE=1000000
    set WINOVERLAP=0
    mkdir SS
    foreach f (/cluster/data/canFam1/bed/multiz.canFam1hg17mm5/chm/*.maf)
      set chr=$f:t:r
      set c=`echo $chr | sed 's/chr//'`
      echo $f $chr $c
      mkdir SS/$c
      /cluster/bin/phast/msa_split $f -i MAF -o SS \
        -O canFam1,hg17,mm5 -M /cluster/data/canFam1/$c/$chr.fa \
        -w ${WINSIZE},${WINOVERLAP} -r SS/$c/$chr -I 1000 -d 1 -B 5000
    end
    # save some space and I/O time:
    gzip SS/*/*.ss

    # step 2: run phastCons.
    # Since we haven't run with these particular organisms before,
    # do a whole-genome run without --transitions so that phastCons does a
    # max likelihood estimate of the parameters.  Some jobs may have trouble
    # converging [not this time] ==> monitor log files of stragglers:
    # Warning signs include likelihoods not decreasing monotonically,
    # very fast convergence (e.g., only 3 or 4 iterations),
    # wildly different estimates from different windows.
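    # (Sketch, a quick model sanity check not in the original doc: pull
    # the branch lengths out of rev-dg.mod's TREE line and eyeball them
    # against the "not zero, not much greater than one" rule before
    # burning cluster time.)
    perl -lne 'print join("\n", /:(\d+\.\d+)/g) if /^TREE:/' rev-dg.mod
#0.176986
#0.142776
#0.170935
#0.170935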
    ssh kk
    cd /cluster/data/canFam1/bed/phastCons
    cat << '_EOF_' > doMLE
#!/bin/csh -fe
set PHAST=/cluster/bin/phast
set TMP=/scratch/phastCons/chm
set file=$1
set root = $file:t:r:r
set chrom = $root:r
mkdir -p $TMP
cp $file $TMP/$root.ss.gz
zcat $TMP/$root.ss.gz \
| $PHAST/phastCons - rev-dg.mod --nrates 40 \
  --quiet --log ${TMP}/$root.log --no-post-probs --quiet
mkdir -p MLE/$chrom
set lc = `wc -l < $TMP/$root.log`
set pq = `tail -1 $TMP/$root.log | awk '{print $3 " " $4;}'`
echo "$pq $lc" > MLE/$chrom/$root.pqi
rm $TMP/$root.log
rm $TMP/$root.ss.gz
'_EOF_'
    # << this line makes emacs coloring happy
    chmod 775 doMLE
    rm -f jobs.lst
    foreach f (`ls -1S SS/*/*.ss.gz`)
      echo './doMLE {check in exists+ '$f'}' \
        >> jobs.lst
    end
    para create jobs.lst
    para try ; para push ; etc....
#Completed: 2427 of 2427 jobs
#Average job time: 190s 3.17m 0.05h 0.00d
#Longest job: 1256s 20.93m 0.35h 0.01d
#Submission to last job: 1995s 33.25m 0.55h 0.02d
    # back on kksilo:
    cat MLE/*/*.pqi > all.pqi
    # Look for ones that converged too fast (one of Adam's warning signs):
    awk '{print $3;}' all.pqi | sort -n | head -1
#5
    awk '{print $3;}' all.pqi | sort -n \
    | textHistogram stdin -binSize=5 -maxBinCount=30
    # Find median p and q (2427 chunks / 2 + 1 = 1214th value):
    set nChunks = `wc -l < all.pqi`
    set n50 = `expr \( $nChunks / 2 \) + 1`
    awk '{print $1;}' all.pqi | sort -n | tail +$n50 | head -1
#0.016533
    awk '{print $2;}' all.pqi | sort -n | tail +$n50 | head -1
#0.000991
    # Eyeball the histogram of p values: should peak close to the median:
    awk '{print $1 * 1000;}' all.pqi | textHistogram stdin -maxBinCount=100
    # Now run phastCons again to calculate postprobs and conserved
    # elements using the calculated p and q in --transitions.
    # Note -- with the MLE calc'd p and q, the wiggle looked "oversmoothed"
    # so now we're rerunning with the same p & q used for human 8-way:
    ssh kk
    cd /cluster/data/canFam1/bed/phastCons
    cat << '_EOF_' > doPostProbsEls
#!/bin/csh -fe
set PHAST=/cluster/bin/phast
set TMP=/scratch/phastCons/chm
set file=$1
set root = $file:t:r:r
set chrom = $root:r
mkdir -p $TMP
mkdir -p PREDICTIONS/$chrom
cp $file $TMP/$root.ss.gz
zcat $TMP/$root.ss.gz \
| $PHAST/phastCons - rev-dg.mod --nrates 40 \
  --quiet --viterbi PREDICTIONS/$chrom/$root.bed --score --seqname $chrom \
  --transitions 0.08,0.008 > $TMP/$root.pp
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.pp > POSTPROBS/$chrom/$root.pp.gz
rm $TMP/$root.pp
rm $TMP/$root.ss.gz
'_EOF_'
    # << this line makes emacs coloring happy
    chmod 775 doPostProbsEls
    rm -f jobs.lst
    foreach f (`ls -1S SS/*/*.ss.gz`)
      echo './doPostProbsEls {check in exists+ '$f'}' \
        >> jobs.lst
    end
    para create jobs.lst
    para try ; para push ; etc....
    # for p=0.080 run... cluster was really hammering kksilo (usually faster):
#Completed: 2427 of 2427 jobs
#Average job time: 414s 6.90m 0.11h 0.00d
#Longest job: 2436s 40.60m 0.68h 0.03d
#Submission to last job: 2724s 45.40m 0.76h 0.03d
    # now create wiggle track
    ssh kksilo
    cd /cluster/data/canFam1/bed/phastCons
    mkdir wib
    foreach dir (POSTPROBS/*)
      echo $dir
      set chr=$dir:t
      zcat `ls -1 $dir/*.pp.gz | sort -t\. -k2,2n` \
      | wigAsciiToBinary -chrom=$chr \
        -wibFile=wib/${chr}_phastCons stdin
    end
    ssh hgwdev
    mkdir -p /gbdb/canFam1/wib/mzHg17Mm5_phast
    cd /cluster/data/canFam1/bed/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/canFam1/wib/mzHg17Mm5_phast/
    hgLoadWiggle canFam1 mzHg17Mm5_phast_wig \
      -pathPrefix=/gbdb/canFam1/wib/mzHg17Mm5_phast wib/*.wig
    # Create bed track: tweak scores and names, trim the unused strand column.
    cat PREDICTIONS/*/*.bed \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
    | /cluster/bin/scripts/lodToBedScore \
      > all.bed
    hgLoadBed canFam1 phastConsElements all.bed
    # make top-5000 list and launcher on Adam's home page:
    sort -k5,5nr all.bed | head -5000 > top5000.bed
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/cf-top5000 \
      "top 5000 conserved elements" canFam1

# MAKE DOWNLOADABLE FILES FOR MULTIZ (DONE 7/21/04 angie)
    # make compressed copies of mafs (don't compress originals -- need them
    # for the browser)
    ssh kksilo
    cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5
    mkdir gzMaf
    foreach f (chm/*.maf)
      gzip -c $f > gzMaf/$f:t.gz
    end
    cd /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08
    mkdir gzMaf
    foreach f (mafNet/*.maf)
      gzip -c $f > gzMaf/$f:t.gz
    end
    cd /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15
    mkdir gzMaf
    foreach f (mafNet/*.maf)
      gzip -c $f > gzMaf/$f:t.gz
    end
    # put multi and pairwise mafs out for download
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/canFam1/multizHg17Mm5
    cd /usr/local/apache/htdocs/goldenPath/canFam1/multizHg17Mm5
    mv /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/gzMaf/* .
    rmdir /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/gzMaf
    mv /cluster/data/canFam1/bed/blastz.hg17.swap.2004-07-08/gzMaf hg17
    mv /cluster/data/canFam1/bed/blastz.mm5.swap.2004-07-15/gzMaf mm5
    md5sum *.gz */*.gz > md5sum.txt
    # make a README.txt file

# PRODUCING GENSCAN PREDICTIONS (DONE 7/9/04 angie)
    ssh hgwdev
    mkdir /cluster/data/canFam1/bed/genscan
    cd /cluster/data/canFam1/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/canFam1/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that
    # *do not* consist of all-N's (which would cause genscan to blow up)
    rm -f genome.list
    touch genome.list
    foreach f ( `ls -1S /cluster/data/canFam1/*/chr*_*/chr*_?{,?}.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
    wc -l genome.list
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 genome.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
#Completed: 495 of 497 jobs
#Crashed: 2 jobs
#Average job time: 803s 13.39m 0.22h 0.01d
#Longest job: 28087s 468.12m 7.80h 0.33d
#Submission to last job: 34710s 578.50m 9.64h 0.40d
    # If there are crashes, diagnose with "para problems".
    # If a job crashes due to genscan running out of memory, re-run it
    # manually with "-window=1200000" instead of "-window=2400000".
    ssh kkr7u00
    cd /cluster/data/canFam1/bed/genscan
    /cluster/bin/x86_64/gsBig \
      /cluster/data/canFam1/5/chr5_17/chr5_17.fa.masked gtf/chr5_17.fa.gtf \
      -trans=pep/chr5_17.fa.pep -subopt=subopt/chr5_17.fa.bed \
      -exe=hg3rdParty/genscanlinux/genscan \
      -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    /cluster/bin/x86_64/gsBig \
      /cluster/data/canFam1/1/chr1_21/chr1_21.fa.masked gtf/chr1_21.fa.gtf \
      -trans=pep/chr1_21.fa.pep -subopt=subopt/chr1_21.fa.bed \
      -exe=hg3rdParty/genscanlinux/genscan \
      -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    ls -1 gtf | wc -l
    # 497
    endsInLf gtf/*
    # Convert these to chromosome level files as so:
    ssh kksilo
    cd /cluster/data/canFam1/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep
    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/genscan
    # Reloaded without -genePredExt 1/6/05:
    ldHgGene -gtf canFam1 genscan genscan.gtf
    hgPepPred canFam1 generic genscanPep genscan.pep
    hgLoadBed canFam1 genscanSubopt genscanSubopt.bed
    # QA NOTE (ASZ 6-11-2007): sudo mytouch canFam1 genscanPep 200501061300.00

# PRODUCING FUGU BLAT ALIGNMENTS (DONE 7/15/04 angie)
    ssh kk
    mkdir /cluster/data/canFam1/bed/blatFr1
    cd /cluster/data/canFam1/bed/blatFr1
    ls -1S /cluster/bluearc/fugu/fr1/trfFa/*.fa > fugu.lst
    ls -1S /iscratch/i/canFam1/maskedContigs/*.fa > dog.lst
    cat << '_EOF_' > gsub
#LOOP
blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    mkdir psl
    gensub2 dog.lst fugu.lst gsub spec
    para create spec
    para try, check, push, check, ...
#Completed: 3479 of 3479 jobs
#Average job time: 1336s 22.26m 0.37h 0.02d
#Longest job: 4142s 69.03m 1.15h 0.05d
#Submission to last job: 14450s 240.83m 4.01h 0.17d
    # Sort alignments:
    ssh kksilo
    cd /cluster/data/canFam1/bed/blatFr1
    pslCat -dir psl \
    | liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn stdin \
    | liftUp -type=.psl -pslQ stdout /cluster/data/fr1/jkStuff/liftAll.lft \
        warn stdin \
    | pslSortAcc nohead chrom /tmp stdin
    # load psl into database:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/blatFr1/chrom
    # Special case: fr1 has only chrUn, so no need to index tName!
    cat *.psl | hgLoadPsl -noTNameIx -fastLoad -table=blatFr1 canFam1 stdin
    featureBits canFam1 blatFr1
#24508362 bases of 2359845093 (1.039%) in intersection
    featureBits hg17 blatFr1
#21977081 bases of 2866216770 (0.767%) in intersection

# LOAD CPGISLANDS (DONE 7/15/04 angie)
    ssh hgwdev
    mkdir -p /cluster/data/canFam1/bed/cpgIsland
    cd /cluster/data/canFam1/bed/cpgIsland
    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    mv cpglh.exe /cluster/data/canFam1/bed/cpgIsland/
    ssh kksilo
    cd /cluster/data/canFam1/bed/cpgIsland
    foreach f (../../*/chr*.fa.masked)
      set fout=$f:t:r:r.cpg
      echo running cpglh on $f to $fout
      ./cpglh.exe $f > $fout
    end
    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94 */
{
  $2 = $2 - 1;
  width = $3 - $2;
  printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
         $1, $2, $3, $5, $6, width, $6, width*$7*0.01,
         100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk chr*.cpg > cpgIsland.bed
    # load into database:
    ssh hgwdev
    cd /cluster/data/canFam1/bed/cpgIsland
    hgLoadBed canFam1 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    wc -l cpgIsland.bed
#  48070 cpgIsland.bed
    featureBits canFam1 cpgIslandExt
#38799035 bases of 2359845093 (1.644%) in intersection

# BUILD BLAST DATABASES
    cd /cluster/data/canFam1
    mkdir blastDb
    for i in `cat chrom.lst`; do
      for j in `echo $i/chr${i}_*/chr*_*_*.fa`; do
        ln -s `pwd`/$j blastDb/`basename $j .fa`
      done
    done
    cd blastDb
    for i in *; do formatdb -p F -i $i; done
# END BLAST

# GC 5 BASE WIGGLE TRACK (DONE 7/7/04 angie)
    # put unmasked nibs in /iscratch for small cluster run.
    ssh kkr1u00
    mkdir /iscratch/i/canFam1/
    cp -pR /cluster/data/canFam1/nib /iscratch/i/canFam1/
    iSync
    ssh kki
    mkdir /cluster/data/canFam1/bed/gc5Base
    cd /cluster/data/canFam1/bed/gc5Base
    mkdir wigData dataLimits
    cat > doGc5Base.csh <<'_EOF_'
#!/bin/csh -fe
set c = $1
set chr = "chr$c"
/cluster/bin/x86_64/hgGcPercent \
  -chr=${chr} -doGaps -file=stdout -win=5 canFam1 ../../nib \
| awk '$4 == "GC" && ($3-$2) >= 5 {printf "%d\t%.1f\n", $2+1, $5/10.0;}' \
| /cluster/bin/x86_64/wigAsciiToBinary \
    -dataSpan=5 -chrom=${chr} -wibFile=wigData/gc5Base_${c} \
    -name=${c} stdin > dataLimits/${chr}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doGc5Base.csh
    cp /dev/null spec
    foreach c (`cat ../../chrom.lst`)
      echo "./doGc5Base.csh $c" >> spec
    end
    para create spec
    para try, check, push, check, ...
    # ran on kolossus (csh spec >& tee spec.log) because small cluster was
    # solidly hogged, oh well.
# data is complete, load track on hgwdev
ssh hgwdev
cd /cluster/data/canFam1/bed/gc5Base
# create symlinks for .wib files, then load track
mkdir -p /gbdb/canFam1/wib/gc5Base
ln -s `pwd`/wigData/*.wib /gbdb/canFam1/wib/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/canFam1/wib/gc5Base \
  canFam1 gc5Base wigData/*.wig

# CREATING QUALITY SCORE TRACK (DONE 7/7/04 angie)
ssh kolossus
mkdir /cluster/data/canFam1/bed/qual
cd /cluster/data/canFam1/bed/qual
mkdir wigData dataLimits
foreach agp (../../?{,?}/chr*.agp)
  set chr = $agp:t:r
  set abbrev = `echo $chr | sed -e 's/^chr//; s/_random/r/;'`
  echo $chr to $abbrev wiggle
  qacToWig ../../broad/fixedQuals.qac -name=$chr stdout \
  | wigAsciiToBinary -dataSpan=1 -chrom=$chr \
    -wibFile=wigData/qual_$abbrev -name=$abbrev stdin \
    > dataLimits/$chr
end
# Verify size of .wib file = chrom length
foreach f (wigData/*.wib)
  set abbrev = $f:t:r
  set chr = `echo $abbrev | sed -e 's/^qual_/chr/; s/r$/_random/;'`
  set wibLen = `ls -l $f | awk '{print $5;}'`
  set chromLen = `grep -w $chr ../../chrom.sizes | awk '{print $2;}'`
  if ($wibLen != $chromLen) then
    echo "ERROR: $chr size is $chromLen but wib size is $wibLen"
  else
    echo $chr OK.
  endif
end
# /gbdb & load:
ssh hgwdev
cd /cluster/data/canFam1/bed/qual
mkdir -p /gbdb/canFam1/wib/qual
ln -s `pwd`/wigData/*.wib /gbdb/canFam1/wib/qual
hgLoadWiggle -pathPrefix=/gbdb/canFam1/wib/qual \
  canFam1 quality wigData/*.wig
# To speed up display for whole chrom views, need to make zoomed
# data views. Zoom to 1K points per pixel.
ssh kolossus
cd /cluster/data/canFam1/bed/qual
mkdir -p wigData1K
mkdir -p dataLimits1K
foreach c (`cat ../../chrom.lst`)
  if (-f ../../${c}/chr${c}.agp) then
    echo "chr${c} quality 1K zoom"
    qacToWig ../../broad/fixedQuals.qac -name=chr${c} stdout \
    | wigZoom stdin \
    | wigAsciiToBinary -dataSpan=1024 \
      -chrom=chr${c} -wibFile=wigData1K/qc1K_${c} \
      -name=${c} stdin > dataLimits1K/chr${c}
  endif
end
ssh hgwdev
cd /cluster/data/canFam1/bed/qual/wigData1K
# create symlinks for .wib files
ln -s `pwd`/*.wib /gbdb/canFam1/wib/qual
# load in addition to existing data
hgLoadWiggle -oldTable -pathPrefix=/gbdb/canFam1/wib/qual \
  canFam1 quality *.wig

# ANDY LAW CPGISLANDS (DONE 7/22/04 angie)
# See notes in makeGalGal2.doc.
ssh kksilo
mkdir /cluster/data/canFam1/bed/cpgIslandGgfAndy
cd /cluster/data/canFam1/bed/cpgIslandGgfAndy
# Use masked sequence since this is a mammal...
cp /dev/null cpgIslandGgfAndyMasked.bed
foreach f (../../*/chr*.fa.masked)
  set chr = $f:t:r:r
  echo preproc and run on masked $chr
  /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f \
  | /cluster/home/angie/ggf-andy-cpg-island.pl \
  | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
      $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
      $pGc = (100.0 * $gc / $n); \
      $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" .
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyMasked.bed end # load into database: ssh hgwdev cd /cluster/data/canFam1/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql hgLoadBed canFam1 cpgIslandGgfAndyMasked -tab -noBin \ -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed featureBits canFam1 cpgIslandExt #38799035 bases of 2359845093 (1.644%) in intersection featureBits canFam1 cpgIslandGgfAndyMasked #100892068 bases of 2359845093 (4.275%) in intersection wc -l ../cpgIsland/cpgIsland.bed *bed # 48070 ../cpgIsland/cpgIsland.bed # 143011 cpgIslandGgfAndyMasked.bed # 6/28/04 -- masking simpleRepeats, and even repeats other than Alu's, # might not be the right thing to do (?). Give it a try with less-masked # sequence. ssh kksilo cd /cluster/data/canFam1/bed/cpgIslandGgfAndy cp /dev/null cpgIslandGgfAndyOnlyRM.bed foreach f (../../*/chr*.fa) set chr = $f:t:r echo preproc, ggf-andy $chr onlyRM maskOutFa $f $f.out stdout \ | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyOnlyRM.bed end wc -l cpgIslandGgfAndyOnlyRM.bed # 142767 cpgIslandGgfAndyOnlyRM.bed ssh hgwdev cd /cluster/data/canFam1/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRM/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql hgLoadBed canFam1 cpgIslandGgfAndyOnlyRM -tab -noBin -sqlTable=/tmp/c.sql \ cpgIslandGgfAndyOnlyRM.bed featureBits canFam1 cpgIslandGgfAndyOnlyRM.bed #101295282 bases of 2359845093 (4.292%) in intersection # OK, just for fun let's run ggf-andy script with TJ parameters! ssh kksilo cd /cluster/data/canFam1/bed/cpgIslandGgfAndy cp /dev/null cpgIslandGgfAndyTJ.bed foreach f (../../*/chr*.fa) set chr = $f:t:r echo running modified w/TJ on $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ -min-length 500 -percent-gc 55 -obs-over-exp 0.65 \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyTJ.bed end wc -l cpgIslandGgfAndyTJ.bed # 55055 cpgIslandGgfAndyTJ.bed # load into database: ssh hgwdev cd /cluster/data/canFam1/bed/cpgIslandGgfAndy # this one is a cpgIslandExt but with a different table name: sed -e 's/cpgIslandExt/cpgIslandGgfAndyTJ/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyTJ.sql hgLoadBed canFam1 cpgIslandGgfAndyTJ -tab -noBin \ -sqlTable=cpgIslandGgfAndyTJ.sql cpgIslandGgfAndyTJ.bed featureBits canFam1 cpgIslandGgfAndyTJ #80070786 bases of 2359845093 (3.393%) in intersection # Try featureBits -enrichment: featureBits -enrichment canFam1 xenoRefGene:upstream:5000 cpgIslandExt #xenoRefGene:upstream:5000 3.047%, cpgIslandExt 1.644%, both 0.249%, cover 8.16%, enrich 4.96x featureBits -enrichment canFam1 xenoRefGene:upstream:5000 cpgIslandGgfAndyMasked #xenoRefGene:upstream:5000 3.047%, cpgIslandGgfAndyMasked 4.275%, both 0.508%, cover 16.66%, enrich 3.90x featureBits -enrichment canFam1 xenoRefGene:upstream:5000 cpgIslandGgfAndyTJ #xenoRefGene:upstream:5000 3.047%, cpgIslandGgfAndyTJ 3.393%, both 0.570%, cover 18.72%, enrich 5.52x # 8/25/05 -- Laura Elnitski requested unmasked. cd /cluster/data/canFam1/bed/cpgIslandGgfAndy cp /dev/null cpgIslandGgfAndy.bed foreach f (../../*/chr*.fa) set chr = $f:t:r echo preproc and run on unmasked $chr preProcGgfAndy $f \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndy.bed end # load into database: ssh hgwdev cd /cluster/data/canFam1/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql hgLoadBed canFam1 cpgIslandGgfAndy -tab -noBin \ -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed # NEW PHASTCONS DOG/HUMAN/MOUSE (DONE 9/29/04 angie) ssh kksilo # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces mkdir -p /cluster/bluearc/canFam1/chrom cp -p /cluster/data/canFam1/?{,?}/chr*.fa /cluster/bluearc/canFam1/chrom/ ssh kki mkdir /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons mkdir /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.split cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.split set WINDOWS = /cluster/bluearc/canFam1/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/cluster/bluearc/canFam1/chrom set WINDOWS=/cluster/bluearc/canFam1/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O canFam1,hg17,mm5 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (../../chm/*.maf) if (-s $file) then echo "doSplit.sh {check in line+ $file}" >> jobList endif end para create jobList para try, check, push, check... #Completed: 41 of 41 jobs #Average job time: 201s 3.36m 0.06h 0.00d #Longest job: 392s 6.53m 0.11h 0.00d #Submission to last job: 720s 12.00m 0.20h 0.01d cd .. 
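# Optional sanity check (not in the original log): make sure msa_split
# produced windows for every chrom.  The run.estimate cluster job below
# picks up one .ss.gz per window, so expect this count to match its
# job count (2427).
ls /cluster/bluearc/canFam1/phastCons/WINDOWS/*.ss.gz | wc -l
# 2427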
# use the model previously estimated (see above) as a starting model
/cluster/bin/phast/tree_doctor \
  --rename '1 -> canFam1 ; 2 -> hg17 ; 3 -> mm5' \
  ../../phastCons/rev-dg.mod \
  > starting-tree.mod
#TREE: ((canFam1:0.176986,hg17:0.142776):0.170935,mm5:0.170935);
# Get genome-wide average GC content (for all species together,
# not just the reference genome). If you have a globally
# estimated tree model, as above, you can get this from the
# BACKGROUND line in the .mod file. E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.276938 0.223190 0.223142 0.276730
# add up the C and G:
awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod
#0.405
# Great, use 0.405 as the --gc parameter in phastCons below.
# Now set up cluster job to estimate model parameters.
# Parameters will be estimated separately for each alignment fragment
# then will be combined across fragments.
# Use --gc from above, --expected-lengths 12 (a typical value for
# mammals), and --target-coverage computed as follows:
# 1. Jim would like phastConsElements coverage to be 5% for mammals.
# 2. ~66% of canFam1 is covered by chainHg17Link, so use .05 / .66 = .08
#    as an initial --target-coverage.
# 3. If actual coverage is different from our target, come back to this
#    step, adjust --target-coverage and rerun up through phastConsElements.
# OK, .08 landed us a little low, try .10:
#90197498 bases of 2359845093 (3.822%) in intersection
# .10 still a little low, try .15:
#95675167 bases of 2359845093 (4.054%) in intersection
# OK, .15 seems close enough:
#110676039 bases of 2359845093 (4.690%) in intersection
mkdir run.estimate
cd run.estimate
cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.405 --nrates 1,1 \
  --no-post-probs --ignore-missing --expected-lengths 12 \
  --target-coverage 0.15 --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
chmod a+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobList
foreach f (/cluster/bluearc/canFam1/phastCons/WINDOWS/*.ss.gz)
  set root = $f:t:r:r
  echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
end
# run cluster job
ssh kk
cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.estimate
para create jobList
para try, check, push, check, ...
#Completed: 2427 of 2427 jobs
#Average job time: 79s 1.31m 0.02h 0.00d
#Longest job: 146s 2.43m 0.04h 0.00d
#Submission to last job: 1159s 19.32m 0.32h 0.01d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models.
ssh kksilo
cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.estimate
ls -1 TREES/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
  --output-average ave.cons.mod > cons_summary.txt
ls -1 TREES/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
  --output-average ave.noncons.mod > noncons_summary.txt
grep TREE ave*.mod
#ave.cons.mod:TREE: ((canFam1:0.052234,hg17:0.043243):0.052171,mm5:0.052171);
#ave.noncons.mod:TREE: ((canFam1:0.196400,hg17:0.162279):0.195831,mm5:0.195831);
cat cons_summary.txt
# look over the files cons_summary.txt and noncons_summary.txt.
# The means and medians should be roughly equal and the stdevs
# should be reasonably small compared to the means, particularly
# for rate matrix parameters (at bottom) and for branches to the
# leaves of the tree. The stdevs may be fairly high for branches
# near the root of the tree; that's okay.
Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. It's all downhill # from here. ssh kk mkdir /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.phast cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.canFam1.$pref.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 12 --target-coverage 0.15 --quiet --seqname $chr \ --idpref $pref \ --viterbi /cluster/bluearc/canFam1/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /cluster/bluearc/canFam1/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /cluster/bluearc/canFam1/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /cluster/bluearc/canFam1/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/cluster/bluearc/canFam1/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para create jobList para try, check, push, check, ... #Completed: 2427 of 2427 jobs #Average job time: 67s 1.12m 0.02h 0.00d #Longest job: 290s 4.83m 0.08h 0.00d #Submission to last job: 920s 15.33m 0.26h 0.01d # back on kksilo: # combine predictions and transform scores to be in 0-1000 interval # do in a way that avoids limits on numbers of args cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons cp /dev/null all.bed foreach chr (`awk '{print $1;}' /cluster/data/canFam1/chrom.sizes`) cat /cluster/bluearc/canFam1/phastCons/ELEMENTS/$chr.*.bed \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore >> all.bed end ssh hgwdev cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons featureBits canFam1 all.bed #110676039 bases of 2359845093 (4.690%) in intersection # OK, close enough. hgLoadBed canFam1 phastConsElements all.bed # Create wiggle on the small cluster ssh kki mkdir /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.wib cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.wib rm -rf /cluster/bluearc/canFam1/phastCons/wib mkdir -p /cluster/bluearc/canFam1/phastCons/wib cat << 'EOF' > doWigAsciiToBinary #!/bin/csh -ef set chr = $1 zcat `ls -1 /cluster/bluearc/canFam1/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigAsciiToBinary -chrom=$chr \ -wibFile=/cluster/bluearc/canFam1/phastCons/wib/${chr}_phastCons stdin 'EOF' # << for emacs chmod a+x doWigAsciiToBinary rm -f jobList foreach chr (`ls -1 /cluster/bluearc/canFam1/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigAsciiToBinary $chr >> jobList end para create jobList para try, check, push, check, ... 
#Completed: 41 of 41 jobs #Average job time: 502s 8.37m 0.14h 0.01d #Longest job: 1194s 19.90m 0.33h 0.01d #Submission to last job: 1459s 24.32m 0.41h 0.02d # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from bluearc cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons rm -rf wib POSTPROBS rsync -av /cluster/bluearc/canFam1/phastCons/wib . rsync -av /cluster/bluearc/canFam1/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev rm -rf /gbdb/canFam1/wib/mzHg17Mm5_phast2 mkdir -p /gbdb/canFam1/wib/mzHg17Mm5_phast2 cd /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/canFam1/wib/mzHg17Mm5_phast2/ hgLoadWiggle canFam1 mzHg17Mm5_phast2_wig \ -pathPrefix=/gbdb/canFam1/wib/mzHg17Mm5_phast2 wib/*.wig # make top-5000 list and launcher on Adam's home page: sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \ > top5000.bed /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \ /cse/grads/acs/public_html/cf-top5000 \ "top 5000 conserved elements" canFam1 # and clean up bluearc. rm -r /cluster/bluearc/canFam1/phastCons rm -r /cluster/bluearc/canFam1/chrom # NEW REPEATMASKER (DONE 10/5/04 angie) # Use the new version of RepeatMasker, load up as rmskNew, # compare results w/orig. #- contigs have already been split into 500kb chunks, at gaps if possible. #- Make the run directory and job list: ssh kksilo cd /cluster/data/canFam1 cat << '_EOF_' > jkStuff/NewRMDog #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/canFam1/$2 /bin/cp $2 /tmp/canFam1/$2/ cd /tmp/canFam1/$2 /cluster/bluearc/RepeatMasker.040909/RepeatMasker -s -spec dog $2 popd /bin/cp /tmp/canFam1/$2/$2.out ./$2.new.out /bin/rm -fr /tmp/canFam1/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/canFam1/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/canFam1 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/NewRMDog mkdir NewRMRun cp /dev/null NewRMRun/RMJobs foreach c (`cat chrom.lst`) foreach d ($c/chr${c}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/canFam1/jkStuff/NewRMDog \ /cluster/data/canFam1/$d $f \ '{'check out line+ /cluster/data/canFam1/$d/$f.new.out'}' \ >> NewRMRun/RMJobs end end end #- Do the run ssh kk cd /cluster/data/canFam1/NewRMRun para create RMJobs para try, para check, para check, para push, para check,... #Completed: 6373 of 6373 jobs #CPU time in finished jobs: 26081453s 434690.89m 7244.85h 301.87d 0.827 y #IO & Wait Time: 281283s 4688.05m 78.13h 3.26d 0.009 y #Average job time: 4137s 68.94m 1.15h 0.05d #Longest job: 6392s 106.53m 1.78h 0.07d #Submission to last job: 50241s 837.35m 13.96h 0.58d # Some jobs crashed, but succeeded after repushing --> longer total time. #- Lift up the 500KB chunk .new.out's to 5MB ("pseudo-contig") level ssh kksilo cd /cluster/data/canFam1 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.new.out $d/$contig.lft warn $d/${contig}_*.fa.new.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (`cat chrom.lst`) echo lifting $c cd $c liftUp chr$c.fa.new.out lift/ordered.lft warn `sed -e 's/\.out$/.new.out/' lift/oOut.lst` > /dev/null cd .. 
end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/canFam1 # Doh, hgLoadOut can't deal with too many .suffixes, abbreviate: foreach f (*/chr*.fa.new.out) mv $f $f:r:r:r.new.out end hgLoadOut -table=rmskNew canFam1 */chr*.new.out # chr22 looks like the winner here, send example to Robert Hubley: #Strange perc. field -24.8 line 5580 of 10/chr10.new.out #Strange perc. field -1.8 line 5580 of 10/chr10.new.out #Strange perc. field -0.2 line 27436 of 13/chr13.new.out #Strange perc. field -0.1 line 27436 of 13/chr13.new.out #Strange perc. field -0.9 line 7302 of 2/chr2.new.out #Strange perc. field -5.3 line 15894 of 2/chr2.new.out #Strange perc. field -4.1 line 15894 of 2/chr2.new.out #Strange perc. field -677.0 line 78121 of 22/chr22.new.out #Strange perc. field -174.9 line 78121 of 22/chr22.new.out #Strange perc. field -11.7 line 85036 of 23/chr23.new.out #Strange perc. field -0.8 line 85036 of 23/chr23.new.out #Strange perc. field -2.3 line 17185 of 27/chr27.new.out #Strange perc. field -5.0 line 34029 of 27/chr27.new.out #Strange perc. field -1.8 line 26743 of 29/chr29.new.out #Strange perc. field -22.6 line 90657 of 4/chr4.new.out #Strange perc. field -1.1 line 50705 of 5/chr5.new.out nice featureBits canFam1 rmskNew #901188175 bases of 2359845093 (38.188%) in intersection # Yep, pretty close to original rmsk coverage: #896773874 bases of 2359845093 (38.001%) in intersection # TWINSCAN (DONE 11/9/04 angie) ssh kksilo mkdir /cluster/data/canFam1/bed/twinscan cd /cluster/data/canFam1/bed/twinscan wget http://genes.cs.wustl.edu/predictions/dog/canFam1/canFam1_TS13_pseudomasked.tar.gz tar xvzf canFam1_TS13_pseudomasked.tar.gz # Add '.a' to end of protein fasta id's, to match gtf transcript_id's: cp /dev/null twinscanPep.fa perl -wpe 's/^(>\S+).*/$1.a/' chr_ptx/*.ptx > twinscanPep.fa # load. ssh hgwdev cd /cluster/data/canFam1/bed/twinscan ldHgGene -gtf -genePredExt canFam1 twinscan chr_gtf/chr*.gtf hgPepPred canFam1 generic twinscanPep twinscanPep.fa featureBits -enrichment canFam1 xenoRefGene twinscan #xenoRefGene 1.676%, twinscan 1.384%, both 1.030%, cover 61.42%, enrich 44.39x # QA NOTE (ASZ 6-11-2007): sudo mytouch canFam1 twinscanPep 200411091200.00 # Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram) ssh hgwdev cd /tmp wget \ 'http://www.genome.gov/Pages/News/Photos/Science/dog_image.jpg' -O dog_image.jpg # view that image in 'display' to determine crop edges, then: convert -crop 952x1256+72+280 -quality 80 -sharpen 0 \ -normalize dog_image.jpg ccf.jpg convert -geometry 300x200 -quality 80 ccf.jpg Canis_familiaris.jpg cp -p Canis_familiaris.jpg /usr/local/apache/htdocs/images rm -f ccf.jpg dog_image.jpg # add links to this image in the description.html page, request push # MAKE Human Proteins track (DONE 2005-1-15 braney) ssh kksilo mkdir -p /cluster/data/canFam1/blastDb cd /cluster/data/canFam1/blastDb #cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA/d" | sed "/Un/d" > chrom.list for i in `cat ../chrom.lst`; do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list for i in `cat list` do ln -s $i . formatdb -i `basename $i` -p F done rm *.log *.fa list cd .. 
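# Sanity check (not part of the original log): formatdb -p F writes
# .nhr, .nin and .nsq files for each database, so these three counts
# should all equal the number of contig fa files in the list above.
ls blastDb/*.nhr | wc -l
ls blastDb/*.nin | wc -l
ls blastDb/*.nsq | wc -l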
for i in `cat chrom.lst`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft ssh kkr1u00 mkdir -p /iscratch/i/canFam1/blastDb cd /cluster/data/canFam1/blastDb for i in nhr nin nsq; do cp *.$i /iscratch/i/canFam1/blastDb ; echo $i; done cd (iSync) > sync.out mkdir -p /cluster/data/canFam1/bed/tblastn.hg17KG cd /cluster/data/canFam1/bed/tblastn.hg17KG echo /iscratch/i/canFam1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst # back to kksilo exit # we want around 250000 jobs cd /cluster/data/canFam1/bed/tblastn.hg17KG calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(500000/`wc query.lst | awk "{print \\\$1}"`\) # 42156/(500000/6373) = 537.320376 mkdir -p /cluster/bluearc/canFam1/bed/tblastn.hg17KG/kgfa split -l 537 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/canFam1/bed/tblastn.hg17KG/kgfa/kg ln -s /cluster/bluearc/canFam1/bed/tblastn.hg17KG/kgfa kgfa cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/canFam1/bed/tblastn.hg17KG/blastOut ln -s /cluster/bluearc/canFam1/bed/tblastn.hg17KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2 liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/canFam1/bed/tblastn.hg17KG para create blastSpec para push # Completed: 503467 of 503467 jobs # CPU time in finished jobs: 101853630s 1697560.49m 28292.67h 1178.86d 3.230 y # IO & Wait Time: 21902864s 365047.74m 6084.13h 253.51d 0.695 y # Average job time: 246s 4.10m 0.07h 0.00d # Longest job: 1208s 20.13m 0.34h 0.01d # Submission to last job: 204867s 3414.45m 56.91h 2.37d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q."$2"* | simpleChain -prot -outPsl -maxGap=200000 stdin ../c.`basename $1`.$2.psl) '_EOF_' chmod +x chainOne for j in blastOut/kg??; do for i in `cat ../../chrom.lst`; do echo chainOne $j chr"$i"; done ; done > chainSpec ssh kki cd /cluster/data/canFam1/bed/tblastn.hg17KG para create chainSpec para push # Completed: 3239 of 3239 jobs # CPU time in finished jobs: 2859185s 47653.08m 794.22h 33.09d 0.091 y # IO & Wait Time: 212945s 3549.09m 59.15h 2.46d 0.007 y # Average job time: 948s 15.81m 0.26h 0.01d # Longest job: 49695s 828.25m 13.80h 0.58d # Submission to last job: 227112s 3785.20m 63.09h 2.63d exit # back to kksilo cd /cluster/data/canFam1/bed/tblastn.hg17KG/blastOut for i in kg?? 
do cat c.$i.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n | uniq > /cluster/data/canFam1/bed/tblastn.hg17KG/blastHg17KG.psl cd .. ssh hgwdev cd /cluster/data/canFam1/bed/tblastn.hg17KG hgLoadPsl canFam1 blastHg17KG.psl # back to kksilo rm -rf blastOut # End tblastn # BLASTZ BOREOEUTHERIAN (BOREUT1) (DONE 1/29/05 braney) ssh kk mkdir /cluster/data/borEut1/bed/zb.canFam1 ln -s /cluster/data/borEut1/bed/zb.canFam1 /cluster/data/canFam1/bed/blastz.borEut1 cd /cluster/data/canFam1/bed/blastz.borEut1 # Use default (Human-Mouse) settings for starters. cat << '_EOF_' > DEF # dog vs. borEut export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Default BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Dog SEQ1_DIR=/iscratch/i/canFam1/nib SEQ1_RMSK= SEQ1_FLAG= # SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog SEQ1_SMSK= SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: BorEut SEQ2_DIR=/iscratch/i/borEut1/nib SEQ2_RMSK= SEQ2_FLAG= #SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman SEQ2_SMSK= SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/canFam1/bed/blastz.borEut1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy /cluster/data/hg17/jkStuff/BlastZ_run0.sh cd run.0 para push # Completed: 2200 of 2200 jobs # CPU time in finished jobs: 720384s 12006.40m 200.11h 8.34d 0.023 y # IO & Wait Time: 21690s 361.50m 6.03h 0.25d 0.001 y # Average job time: 337s 5.62m 0.09h 0.00d # Longest job: 2099s 34.98m 0.58h 0.02d # Submission to last job: 2267s 37.78m 0.63h 0.03d ssh kki cd /cluster/data/canFam1/bed/blastz.borEut1 /cluster/data/hg17/jkStuff/BlastZ_run1.sh cd run.1 para push # Completed: 275 of 275 jobs # CPU time in finished jobs: 243s 4.06m 0.07h 0.00d 0.000 y # IO & Wait Time: 774s 12.89m 0.21h 0.01d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest job: 8s 0.13m 0.00h 0.00d # Submission to last job: 78s 1.30m 0.02h 0.00d ssh kk cd /cluster/data/canFam1/bed/blastz.borEut1 /cluster/data/hg17/jkStuff/BlastZ_run2.sh cd run.2 para push # Completed: 41 of 41 jobs # CPU time in finished jobs: 266s 4.43m 0.07h 0.00d 0.000 y # IO & Wait Time: 822s 13.70m 0.23h 0.01d 0.000 y # Average job time: 27s 0.44m 0.01h 0.00d # Longest job: 54s 0.90m 0.01h 0.00d # Submission to last job: 58s 0.97m 0.02h 0.00d # END BLASTZ BOREOEUTHERIAN # NEW NEW REPEATMASKER (DONE 1/27/05 angie) # Use the new version of RepeatMasker w/new dog lib, load up as rmskNew, # compare results w/orig. #- contigs have already been split into 500kb chunks, at gaps if possible. #- Make the run directory and job list: ssh kksilo cd /cluster/data/canFam1 cat << '_EOF_' > jkStuff/NewRMDog #!/bin/csh -fe cd $1 pushd . 
/bin/mkdir -p /tmp/canFam1/$2 /bin/cp $2 /tmp/canFam1/$2/ cd /tmp/canFam1/$2 /cluster/bluearc/RepeatMasker050112/RepeatMasker -s -spec dog $2 popd /bin/cp /tmp/canFam1/$2/$2.out ./$2.new.out /bin/rm -fr /tmp/canFam1/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/canFam1/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/canFam1 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/NewRMDog mkdir NewRMRun cp /dev/null NewRMRun/RMJobs foreach c (`cat chrom.lst`) foreach d ($c/chr${c}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/canFam1/jkStuff/NewRMDog \ /cluster/data/canFam1/$d $f \ '{'check out line+ /cluster/data/canFam1/$d/$f.new.out'}' \ >> NewRMRun/RMJobs end end end #- Do the run ssh kk cd /cluster/data/canFam1/NewRMRun para create RMJobs para try, para check, para check, para push, para check,... #Completed: 4676 of 4676 jobs #CPU time in finished jobs: 23753156s 395885.93m 6598.10h 274.92d 0.753 y #IO & Wait Time: 122587s 2043.12m 34.05h 1.42d 0.004 y #Average job time: 5106s 85.10m 1.42h 0.06d #Longest job: 11286s 188.10m 3.13h 0.13d #Submission to last job: 73093s 1218.22m 20.30h 0.85d #- Lift up the 500KB chunk .new.out's to 5MB ("pseudo-contig") level ssh kksilo cd /cluster/data/canFam1 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.new.out $d/$contig.lft warn $d/${contig}_*.fa.new.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (`cat chrom.lst`) echo lifting $c cd $c liftUp chr$c.fa.new.out lift/ordered.lft warn `sed -e 's/\.out$/.new.out/' lift/oOut.lst` > /dev/null cd .. end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/canFam1 # Doh, hgLoadOut can't deal with too many .suffixes, abbreviate: foreach f (*/chr*.fa.new.out) mv $f $f:r:r:r.new.out end hgLoadOut -table=rmskNew canFam1 */chr*.new.out # Lots of these warnings: #Strange perc. field -29.7 line 40263 of 7/chr7.new.out #Strange perc. field -29.7 line 40266 of 7/chr7.new.out # And this new one: #note: 27 records dropped due to repStart > repEnd # run with -verbose=2 for details nice featureBits canFam1 rmskNew #948099821 bases of 2359845093 (40.176%) in intersection # Good, coverage has increased since original: #896773874 bases of 2359845093 (38.001%) in intersection # MAKE BETTER-MASKED SEQUENCE FOR SELF ALIGNMENTS (DONE 1/27/05 angie) # I tried self alignments with the sequence masked by the original # RepeatMasker run, but some jobs ran for >1 day and then crashed # due to some heinous chrUn sequence, presumably undermasked. # Since we just ran the new RepeatMasker and coverage improved a # bit, try making a separate newly-masked copy of the sequence and # using that in the blastz self run. The increase in coverage for # chrUn was pretty significant (yikes, look where it was already): # featureBits -chrom=chrUn canFam1 rmsk #39357392 bases of 69040064 (57.007%) in intersection # featureBits -chrom=chrUn canFam1 rmskNew #47147529 bases of 69040064 (68.290%) in intersection # So the problematic parts of chrUn may still run forever and crash, # but if they do, at least we did our best. 
:) ssh kkr1u00 mkdir /iscratch/i/canFam1/newRMNib cd /iscratch/i/canFam1/newRMNib set trfDir = /cluster/data/canFam1/bed/simpleRepeat/trfMaskChrom foreach f (/cluster/data/canFam1/*/chr*.fa) set chr = $f:t:r set newRmOut = $f:r.new.out echo $chr maskOutFa -soft $f $newRmOut $chr.fa maskOutFa -softAdd $chr.fa $trfDir/$chr.bed $chr.fa faToNib -softMask $chr.fa $chr.nib end rm chr*.fa iSync # BLASTZ SELF (DONE 2/2/05 angie) ssh kk mkdir -p /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27 cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27 ln -s blastz.canFam1.2005-01-27 /cluster/data/canFam1/bed/blastz.canFam1 cat << '_EOF_' > DEF # dog vs. dog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET # Dog SEQ1_DIR=/iscratch/i/canFam1/newRMNib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Dog SEQ2_DIR=/iscratch/i/canFam1/newRMNib SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=10000 BASE=/cluster/data/canFam1/bed/blastz.canFam1.2005-01-27 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy cp /cluster/data/canFam1/chrom.sizes S1.len cp /cluster/data/canFam1/chrom.sizes S2.len mkdir run cd run partitionSequence.pl 10000000 10000 /iscratch/i/canFam1/newRMNib ../S1.len \ -xdir xdir.sh -rawDir ../lav \ > canFam1.lst csh -ef xdir.sh cat << '_EOF_' > gsub #LOOP /cluster/bin/scripts/blastz-run-ucsc $(path1) $(path2) ../DEF {check out line ../lav/$(file1)/$(file1)_$(file2).lav} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 canFam1.lst canFam1.lst gsub jobList para create jobList para try, check, push, check, ... # Tolerating some jobs that crashed blastz due to nasty sequence. # All were on chunks in chrUn:10000000-90010000 to themselves or # each other, so just include a note on the details page that results # are incomplete in that region. #Completed: 75569 of 75625 jobs #Crashed: 56 jobs #CPU time in finished jobs: 13811407s 230190.12m 3836.50h 159.85d 0.438 y #IO & Wait Time: 295434s 4923.90m 82.06h 3.42d 0.009 y #Average job time: 187s 3.11m 0.05h 0.00d #Longest job: 65494s 1091.57m 18.19h 0.76d #Submission to last job: 411521s 6858.68m 114.31h 4.76d # convert lav files to psl # First, combine all files per partition (too many files per chrom # to do in one swoop!). Then do another round to collect all files # per chrom. ssh kki cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27 mkdir pslChrom pslParts run.lavToPsl cd run.lavToPsl ls -1d ../lav/* | sed -e 's@/$@@' > parts.lst # For self alignments, we need lavToAxt's -dropSelf behavior, # so go lav->axt->psl... could add -dropSelf to lavToPsl, but that # might be somewhat invasive and perhaps not really much faster (?) # because the sequence must be dug up in order to rescore... cat << '_EOF_' > do.csh #!/bin/csh -ef cat $1/*.lav \ | lavToAxt -dropSelf stdin /iscratch/i/canFam1/newRMNib \ /iscratch/i/canFam1/newRMNib stdout \ | axtToPsl stdin ../S1.len ../S2.len stdout \ | sort -k 14,14 -k 16n,17n \ | gzip -c > $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh $(path1) {check out exists ../pslParts/$(file1).psl.gz } #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 parts.lst single gsub jobList para create jobList para try, check, push, check, ... 
#Completed: 275 of 275 jobs #Average job time: 35s 0.59m 0.01h 0.00d #Longest job: 417s 6.95m 0.12h 0.00d #Submission to last job: 1017s 16.95m 0.28h 0.01d awk '{print $1;}' /cluster/data/canFam1/chrom.sizes > chroms.lst cat << '_EOF_' > doChrom.csh #!/bin/csh -ef zcat ../pslParts/$1.*.psl.gz \ | sort -k 14,14 -k 16n,17n \ | gzip -c > $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChrom.csh cat << '_EOF_' > gsub.chroms #LOOP ./doChrom.csh $(root1) {check out exists ../pslChrom/$(root1).psl.gz } #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 chroms.lst single gsub.chroms jobList para create jobList para try, check, push, check, ... # This would have been done in 97s if not for monster chrUn! #Completed: 41 of 41 jobs #Average job time: 42s 0.70m 0.01h 0.00d #Longest job: 928s 15.47m 0.26h 0.01d #Submission to last job: 987s 16.45m 0.27h 0.01d # CHAIN SELF BLASTZ (DONE 2/3/05 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S ../../pslChrom/*.psl.gz > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef setenv PATH $HOME/bin/x86_64:$PATH axtChain -verbose=0 -psl $1 /iscratch/i/canFam1/newRMNib \ /iscratch/i/canFam1/newRMNib stdout \ | chainAntiRepeat /iscratch/i/canFam1/newRMNib /iscratch/i/canFam1/newRMNib \ stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # chrUn crashed it... too bad for chrUn (not viewed as interesting # for self chains anyway). 
#Completed: 40 of 41 jobs
#Crashed: 1 jobs
#Average job time: 185s 3.09m 0.05h 0.00d
#Longest job: 490s 8.17m 0.14h 0.01d
#Submission to last job: 74789s 1246.48m 20.77h 0.87d
# now on the cluster server, sort chains
ssh kolossus
cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# take a look at score distr's
foreach f (chain/*.chain)
  grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
  echo $f:t:r
  textHistogram -binSize=10000 /tmp/score.$f:t:r
  echo ""
end
# trim to minScore=20000 to cut some of the fluff
mkdir chainFilt
foreach f (chain/*.chain)
  chainFilter -minScore=20000 $f > chainFilt/$f:t
end
gzip -c all.chain > all.chain.unfiltered.gz
rm -r chain
mv chainFilt chain
chainMergeSort -saveId chain/*.chain > all.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27/axtChain/chain
foreach i (*.chain)
  set c = $i:r
  echo loading $c
  hgLoadChain canFam1 ${c}_chainSelf $i
end

# NET SELF BLASTZ (DONE 2/3/05 angie)
ssh kolossus
cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27/axtChain
netClass -noAr noClass.net canFam1 canFam1 self.net
rm noClass.net
# Load the nets into database
netFilter -minGap=10 self.net | hgLoadNet canFam1 netSelf stdin
# Add entries for chainSelf, netSelf to dog/canFam1 trackDb

# MAKE VSSELF DOWNLOADABLES (DONE 2/3/05 angie)
ssh kolossus
cd /cluster/data/canFam1/bed/blastz.canFam1.2005-01-27/axtChain
gzip all.chain self.net
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/canFam1/vsSelf
cd /usr/local/apache/htdocs/goldenPath/canFam1/vsSelf
cp -p /cluster/data/canFam1/bed/blastz.canFam1/axtChain/all.chain.gz \
  self.chain.gz
cp -p /cluster/data/canFam1/bed/blastz.canFam1/axtChain/self.net.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.

#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)

# BAC END PAIRS TRACK (DONE, 2005-11-14, hartera)
# Use latest BAC ends data from NCBI
ssh kolossus
cd /cluster/data/ncbi
mkdir -p bacends/dog/bacends.1
cd bacends/dog/bacends.1
# go to the trace archive at:
# http://www.ncbi.nlm.nih.gov/Traces/trace.cgi
# Query: species_code='canis familiaris' and center_name='abc'
# there are 393408 items and you can download 40000 items at a time, so
# download the perl script to query the trace archive.
# see http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=doc&m=obtain&s=stips#download
# for instructions.
wget --timestamping \
  ftp://ftp.ncbi.nih.gov/pub/TraceDB/misc/query_tracedb
chmod +x query_tracedb
ssh kolossus
cd
cat > queryTraceDogBacEnds.csh <<'_EOF_'
#!/bin/csh -fe
foreach n (0 1 2 3 4 5 6 7 8 9)
  echo "Fetching page $n ..."
  (echo -n "retrieve_tgz all 0b"; query_tracedb "query page_size 40000 page_number $n binary species_code='CANIS FAMILIARIS' and center_name='abc'") | query_tracedb > data${n}.tgz
end
'_EOF_'
chmod +x queryTraceDogBacEnds.csh
nohup nice queryTraceDogBacEnds.csh &
# Took about 5.5 hours.
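# Quick check (not part of the original log): the loop above fetches
# ten pages, so there should be ten archives, data0.tgz .. data9.tgz,
# in the download directory.
ls data?.tgz | wc -l
# 10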
wget --timestamping \
  ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/canis_familiaris/9615_clone_ends001.mfa.gz
wget --timestamping \
  ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/canis_familiaris/9615_clone_info001.txt.gz
wget --timestamping \
  ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/README
nice gunzip *.gz
foreach n (1 2 3 4 5 6 7 8 9)
  nice tar xvf data${n}.tar
  nice rm -r */CANIS_FAMILIARIS/ABC/traces
  nice rm -r */CANIS_FAMILIARIS/ABC/qscore
end
foreach f (*/CANIS_FAMILIARIS/ABC/fasta/*.fasta)
  nice cat $f >> canFamBacends.fa
end
# took almost 3 hours
# change FASTA record to have only the trace name in the header line.
perl -pi.bak -e 's/>gnl\|ti\|[0-9]+ name:([0-9]+)/>$1/' canFamBacends.fa
mv canFamBacends.fa.bak bacEndSeqs.fa
grep '>' canFamBacends.fa | wc -l
# 393408 - this is the correct number of sequences
cd /cluster/data/ncbi/bacends/dog/bacends.1
# cat together the xml files of BAC clone end information
cat */*.xml > CHORI82BacEnds.xml
# clean up
rm -r 2005*
mkdir -p /san/sanvol1/scratch/canFam1/bacends
faSplit sequence /cluster/bluearc/canFam1/bacends/canFamBacends.fa \
  100 /san/sanvol1/scratch/canFam1/bacends/bacends
ssh kkstore01
cd /cluster/data/ncbi/bacends/dog/bacends.1
# get mate-pair information from xml, forward is SP6, reverse is T7
# (parse the trace_name, clone_id and trace_end fields of the Info-XML)
cat > getBacInfo.pl << 'EOF'
#!/usr/bin/perl -w
use strict;
my ($name, $clone, $dir);
while (<>) {
    chomp;
    my $l = $_;
    if ($l =~ /<trace_name>([0-9]+)<\/trace_name>/) {
       $name = $1;
    } elsif ($l =~ /<clone_id>(CH82\-[0-9A-Z]+)<\/clone_id>/) {
       $clone = $1;
    } elsif ($l =~ /<trace_end>(F|R)<\/trace_end>/) {
       $dir = $1;
       # print out end information
       print "$clone\t$name\t0\tCHORI-82\t0";
       if ($dir eq "F") {
          print "\tSP6\n";
       } elsif ($dir eq "R") {
          print "\tT7\n";
       }
    }
}
EOF
# << for emacs
chmod +x getBacInfo.pl
perl getBacInfo.pl < CHORI82BacEnds.xml > bacEndInfo.txt
# check all the names are there
grep '>' canFamBacends.fa > names
perl -pi.bak -e 's/>//' names
sort names | uniq > names.sort
awk '{print $2}' bacEndInfo.txt | sort | uniq > bacEndInfo.names.sort
comm -13 bacEndInfo.names.sort names.sort
# 17714318
# 17716422
# 17755662
# 17798575
# 17858308
# 17859375
# 17928748
# 17967720
# 17967763
# 18124464
# 18124606
# 18261519
# these 12 are not in the xml file so download manually from the
# trace archive. Go to http://www.ncbi.nlm.nih.gov/Traces/trace.cgi
# type in search query e.g.
# species_code='CANIS FAMILIARIS' and center_name='ABC' and (trace_name='17967763' or trace_name='18124464' or trace_name='18124606' or trace_name='18261519')
# and save as Info-XML
foreach n (1 2 3)
  tar xvf trace${n}.tar
end
cat */*.xml >> CHORI82BacEnds.xml
# then create file of BAC end info again
rm bacEndInfo.txt
perl getBacInfo.pl < CHORI82BacEnds.xml > bacEndInfo.txt
# check all the names are there
awk '{print $2}' bacEndInfo.txt | sort | uniq > bacEndInfo.names.sort
comm -13 bacEndInfo.names.sort names.sort
# these are both the same so all the names were retrieved from the xml file
# cleanup
rm -r 2005*
rm *.bak *.sort names
# create mate-pair information
/cluster/bin/scripts/convertBacEndPairInfo bacEndInfo.txt
# 196031 pairs and 1346 singles
# Do BLAT alignments of BAC ends to the genome.
ssh pk # copy over masked contigs to the san mkdir -p /san/sanvol1/scratch/canFam1/maskedContigs rsync -a --progress kkr1u00:/iscratch/i/canFam1/maskedContigs/ \ /san/sanvol1/scratch/canFam1/maskedContigs # copy over 11.ooc file to the san rsync -a --progress kkr1u00:/iscratch/i/canFam1/11.ooc \ /san/sanvol1/scratch/canFam1 # make output directory mkdir -p /san/sanvol1/scratch/canFam1/bacendsPsl # make run directory mkdir -p /san/sanvol1/scratch/canFam1/bacendsRun mkdir -p /cluster/data/canFam1/bed/bacends cd /cluster/data/canFam1/bed/bacends ln -s /san/sanvol1/scratch/canFam1/bacendsPsl ln -s /san/sanvol1/scratch/canFam1/bacendsRun cd /cluster/data/canFam1/bed/bacends/bacendsRun echo '#LOOP\n/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/canFam1/11.ooc {check out line+ /san/sanvol1/scratch/canFam1/bacendsPsl/$(root1).$(root2).psl}\n#ENDLOOP' > gsub ls -1S /san/sanvol1/scratch/canFam1/maskedContigs/*.fa > contigs.lst ls -1S /san/sanvol1/scratch/canFam1/bacends/bacends*.fa > bacends.lst gensub2 contigs.lst bacends.lst gsub jobList para create jobList para try, check, push, check etc. # para time # Completed: 49203 of 49203 jobs # CPU time in finished jobs: 185617s 3093.62m 51.56h 2.15d 0.006 y # IO & Wait Time: 169691s 2828.18m 47.14h 1.96d 0.005 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 212s 3.53m 0.06h 0.00d # Submission to last job: 1473s 24.55m 0.41h 0.02d # lift alignments ssh kolossus cd /cluster/data/canFam1/bed/bacends pslSort dirs raw.psl tmp bacendsPsl # started 08:15, PID: 16561 pslCheck raw.psl >& check.log wc -l check.log # 2467 check.log so 2467 invalid alignments with overlapping blocks wc -l raw.psl # 27147459 raw.psl - 0.009% of alignments have overlapping blocks. pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \ raw.psl bacEnds.psl /dev/null # Processed 27147454 alignments wc -l bacEnds.psl # 771166 bacEnds.psl # with pslCheck, 1010 of these have overlapping blocks, 0.008% liftUp bacEnds.lifted.psl /cluster/data/canFam1/jkStuff/liftAll.lft \ warn bacEnds.psl awk '{print $10}' bacEnds.lifted.psl | sort | uniq | wc -l # 317098 without headers - 80.6% aligned, 393408 total # for unfiltered alignments, raw.psl, 85.7% aligned. 
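# (The 80.6% above is just 317098/393408; recomputing with the calc
# utility used elsewhere in this doc:)
calc 317098 / 393408
# 317098 / 393408 = 0.806028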
# Make BAC end pairs track:
ssh kkstore03
# create alignment pairs
mkdir -p /cluster/data/canFam1/bed/bacends/pairs
cd /cluster/data/canFam1/bed/bacends/pairs
set ncbiDir = /cluster/data/ncbi/bacends/dog/bacends.1
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose ../bacEnds.lifted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds
wc -l *
#     29 bacEnds.long
#    432 bacEnds.mismatch
#  46668 bacEnds.orphan
# 131858 bacEnds.pairs
#    499 bacEnds.short
#    805 bacEnds.slop
# create header required by "rdb" tools
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' \
  > header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
# edit header to make sure each \t becomes a real tab character
cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \
  | headchg -del > bacEndPairs.bed
# Loaded 131858 elements of size 11
cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
  bacEnds.orphan | row score ge 300 | sorttbl chr start \
  | headchg -del > bacEndPairsBad.bed
# Loaded 48406 elements of size 11
extractPslLoad -noBin ../bacEnds.lifted.psl bacEndPairs.bed \
  bacEndPairsBad.bed | sorttbl tname tstart | headchg -del \
  > bacEnds.load.psl
# load into database
ssh hgwdev
cd /cluster/data/canFam1/bed/bacends/pairs
hgLoadBed canFam1 bacEndPairs bacEndPairs.bed -notItemRgb \
  -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# note - this next track isn't pushed to RR, just used for assembly QA
hgLoadBed canFam1 bacEndPairsBad bacEndPairsBad.bed -notItemRgb \
  -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
hgLoadPsl canFam1 -table=all_bacends bacEnds.load.psl
# load of all_bacends did not go as planned: 746545 record(s), 0 row(s)
# skipped, 433 warning(s) loading psl.tab
# all alignments were loaded into the table.
# load BAC end sequences into seq table so alignments may be viewed
# symlink to FASTA sequence file in ncbi directory
mkdir -p /gbdb/canFam1/bacends
ln -s /cluster/data/ncbi/bacends/dog/bacends.1/canFamBacends.fa \
  /gbdb/canFam1/bacends/canFamBacends.fa
hgLoadSeq canFam1 /gbdb/canFam1/bacends/canFamBacends.fa
# 393408 sequences
# featureBits canFam1 all_bacends
# 211644790 bases of 2359845093 (8.969%) in intersection
# featureBits canFam1 bacEndPairs
# 2334084046 bases of 2359845093 (98.908%) in intersection
# featureBits canFam1 bacEndPairsBad
# 548130287 bases of 2359845093 (23.227%) in intersection
# add trackDb entry and html
# Changes were made to hgc.c so that the BAC clone and end name links
# go to the relevant entries in the NCBI Trace Archive.

# BROAD(/ENSEMBL/OXFORD) GENE ANNOTATIONS (DONE 2/15/06 angie)
# These do not look good enough to push -- I just loaded them because
# David and Mark were interested. They do not include CDS annotation
# and don't look very good compared to xenoRefGene.
ssh kkstore03
mkdir /cluster/data/canFam1/bed/broadGenes
cd /cluster/data/canFam1/bed/broadGenes
wget --timestamping http://www.broad.mit.edu/ftp/pub/papers/dog_genome/suppinfo/DogGenes_CanFam1.zip
# Translate the three GFF-like formats in this file into vanilla GFF
# with transcript IDs for grouping:
perl -wpe 's/ENSCAFE\d+\s+ENSCAFG\d+\s+(ENSCAFT\d+)\s+\w+\s*$/$1\n/; \
  s/\d+\.\d+\.\d+\s+\d+\s+// if /broad/; \
  s/\d+\s+\d+\s+(\d+)\s+\w+\s*$/$1\n/ if /oxford/; \
  s/^/chr/;' \
  dog_final.cdna.gff > cleaned.gff
grep ensembl cleaned.gff > ensemblExons.gff
grep broad cleaned.gff > broadExons.gff
grep oxford cleaned.gff > oxfordExons.gff
ssh hgwdev
cd /cluster/data/canFam1/bed/broadGenes
ldHgGene canFam1 ensemblExons ensemblExons.gff
ldHgGene canFam1 broadExons broadExons.gff
ldHgGene canFam1 oxfordExons oxfordExons.gff

# ENSEMBL GENES (DONE 2/22/06 angie)
mkdir /cluster/data/canFam1/bed/ensembl
cd /cluster/data/canFam1/bed/ensembl
# Get the ensembl gene data from
# http://www.ensembl.org/Canis_familiaris/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Canis_familiaris choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. choose gzip compression. hit export.
# Save as ensembl.gff.gz
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
gunzip -c ensembl.gff.gz \
| perl -wpe 's/^([0-9]+|X|Y|M|Un(_random)?)/chr$1/ \
    || die "Line $. doesnt start with dog chrom:\n$_"; \
    s/chrMT/chrM/;' \
  > ensGene.gtf
ssh hgwdev
ldHgGene -gtf -genePredExt canFam1 ensGene \
  /cluster/data/canFam1/bed/ensembl/ensGene.gtf
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
#-- Normally the versioned accs are available, to match what we get in
#-- the GTF. However, this time the versions are missing... fortunately
#-- everything is version .1 this time so we can just fudge it in:
gunzip -c ensGtp.txt.gz \
| perl -wpe 's/(ENSCAF[GTP]\d+)/$1.1/g' > ensGtp.tab
hgLoadSqlTab canFam1 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.tab
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
gunzip -c ensemblPep.fa.gz \
| perl -wpe 's/^>.*\|(ENSCAFT\d+\.\d+).*/>$1/' \
  > ensPep.fa
hgPepPred canFam1 generic ensPep ensPep.fa

##########################################################################
# NSCAN track - (2006-03-27 markd)
cd /cluster/data/canFam1/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget -nv http://genes.cse.wustl.edu/predictions/dog/canFam1/CanFam1_hg17_nscan_10_18_2005.tar.gz
chmod a-w CanFam1_hg17_nscan_10_18_2005.tar.gz
tar -zxf CanFam1_hg17_nscan_10_18_2005.tar.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt canFam1 nscanGene CanFam1_hg17_nscan_10_18_2005/chr_gtf/*.gtf
# add .a suffix to match transcript id
hgPepPred -suffix=.a canFam1 generic nscanPep CanFam1_hg17_nscan_10_18_2005/chr_ptx/*.ptx
rm -rf CanFam1_hg17_nscan_10_18_2005 *.tab
# update trackDb; need a canFam1-specific page to describe informants
#   dog/canFam1/nscanGene.html (from CanFam1_hg17_nscan_10_18_2005/description.html)
#   dog/canFam1/trackDb.ra

###########################################################################
# LIFTOVER CHAINS CANFAM1 -> CANFAM2 (DONE 7/10/06 angie)
# Started 7/5 -- This creates the directory and scripts:
~/kent/src/utils/doSameSpeciesLiftOver.pl canFam1 canFam2 -debug
# Run it for real and log it:
cd /cluster/data/canFam1/bed/blat.canFam2.2006-07-05
~/kent/src/utils/doSameSpeciesLiftOver.pl canFam1 canFam2 \
  >& do.log &
tail -f do.log
# Had to use -continue a few times because I'm debugging the script,
# and -buildDir /cluster/data/canFam1/bed/blat.canFam2.2006-07-05 when
# the date changed, but it finished!

###########################################################################
# dbSNP BUILD 125 (Heather, August 2006)
# We didn't have ctgPos (I looked for an hour+ on the NCBI site, couldn't find it).
# dbSNP build 125 uses chrom coords (phys_pos_from and phys_pos) in ContigLoc
# but not for chrUn. chrUn has contig coords only, and without ctgPos
# for supercontigs, I can't translate the chrUn contig coords into chrom coords.
# Also note: canFam1 has no chrY, and there were no SNPs for chrM.
# Not having ctgPos also meant I couldn't process the new locTypes
# (4,5,6: rangeInsertion, rangeSubstitution, rangeDeletion)
# because those are also only given in contig coords.
# These were a small number.

# Set up directory structure
ssh kkstore02
cd /cluster/data/dbSNP/125/organisms/dog_9615
mkdir data
mkdir schema
mkdir rs_fasta

# Get data from NCBI (anonymous FTP)
cd /cluster/data/dbSNP/125/organisms/dog_9615/data
ftp ftp.ncbi.nih.gov
cd snp/organisms/dog_9615/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b125_SNPContigLoc_1_1.bcp.gz
# ContigLocusId has function
get b125_SNPContigLocusId_1_1.bcp.gz
get b125_ContigInfo_1_1.bcp.gz
# MapInfo has alignment weights
get b125_SNPMapInfo_1_1.bcp.gz
# SNP has univar_id, validation status and heterozygosity
get SNP.bcp.gz

# Get schema from NCBI
cd /cluster/data/dbSNP/125/organisms/dog_9615/schema
ftp ftp.ncbi.nih.gov
cd snp/organisms/dog_9615/database/organism_schema
get dog_9615_table.sql.gz

# Get fasta files from NCBI
# using headers of fasta files for molType
cd /cluster/data/dbSNP/125/organisms/dog_9615/rs_fasta
ftp ftp.ncbi.nih.gov
cd snp/organisms/dog_9615/rs_fasta
prompt
mget *.gz

# add rs_fasta to seq/extFile
# 2 edits first: strip header to just rsId, and remove duplicates
# work on /cluster/store12 (kkstore05) which has more disk space
cp rs_ch*.fas.gz /cluster/store12/snp/125/dog/rs_fasta
ssh kkstore05
cd /cluster/store12/snp/125/dog/rs_fasta
# concat into rsAll.fas
cat << '_EOF_' > concat.csh
#!/bin/csh -ef
rm -f rsAll.fas
foreach file (rs_ch*.fas.gz)
  echo $file
  zcat $file >> rsAll.fas
end
'_EOF_'
# snpCleanSeq strips the header and skips duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa
rm rsAll.fas
# load on hgwdev
ssh hgwdev
mkdir /gbdb/canFam1/snp
ln -s /cluster/store12/snp/125/dog/rs_fasta/snp.fa /gbdb/canFam1/snp/snp.fa
cd /cluster/store12/snp/125/dog/rs_fasta
hgLoadSeq canFam1 /gbdb/canFam1/snp/snp.fa
# look up id in extFile
# move into separate table
hgsql canFam1 < snpSeq.sql
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 455951' canFam1
hgsql -e 'delete from seq where extFile = 455951' canFam1
hgsql -e 'alter table snpSeq add index acc (acc)' canFam1
# clean up after hgLoadSeq
rm seq.tab

# Simplify names of data files
cd /cluster/data/dbSNP/125/organisms/dog_9615/data
mv b125_ContigInfo_1_1.bcp.gz ContigInfo.gz
mv b125_SNPContigLoc_1_1.bcp.gz ContigLoc.gz
mv b125_SNPContigLocusId_1_1.bcp.gz ContigLocusId.gz
mv b125_SNPMapInfo_1_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
ls -1 *.gz > filelist

# edit table descriptions
cd /cluster/data/dbSNP/125/organisms/dog_9615/schema
# get CREATE statements from dog_9615_table.sql for our 5 tables
# store in table.tmp
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp

# get header lines from rs_fasta
cd /cluster/data/dbSNP/125/organisms/dog_9615/rs_fasta
/bin/csh gnl.csh

# load on kkr5u00
ssh kkr5u00
hgsql -e 'create database canFam1snp125' mysql
cd /cluster/data/dbSNP/125/organisms/dog_9615/schema
hgsql canFam1snp125 < table.sql
cd ../data
/bin/csh load.csh
# note rowcount
# ContigLoc      10806955
# SNP             3310003
# MapInfo         3310003
# ContigLocusId   1219541

# Get UniVariation from 126 (recently downloaded for mm8snp126)
cd /cluster/data/dbSNP/126/shared
hgsql canFam1snp125 < UniVariation.sql
zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' canFam1snp125

# create working /scratch dir
cd /scratch/snp/125
mkdir dog
cd dog
# no canFam1.ctgPos table to compare against
# get gnl files
cp /cluster/data/dbSNP/125/organisms/dog_9615/rs_fasta/*.gnl .
# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_assembly"

# filter ContigLoc into ContigLocFilter
# this lifts from contig coords to chrom coords
# phys_pos_from is used to check coords for non-random chroms
# errors reported to stdout
# this gets rid of alternate assemblies (using ContigInfo)
# this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo)
# assumes all contigs are positively oriented; will abort if not true
mysql> desc ContigLocFilter;
# +---------------+-------------+------+-----+---------+-------+
# | Field         | Type        | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id        | int(11)     | NO   |     |         |       |
# | ctg_id        | int(11)     | NO   |     |         |       |
# | chromName     | varchar(32) | NO   |     |         |       |
# | loc_type      | tinyint(4)  | NO   |     |         |       |
# | start         | int(11)     | NO   |     |         |       |
# | end           | int(11)     | YES  |     | NULL    |       |
# | orientation   | tinyint(4)  | NO   |     |         |       |
# | allele        | blob        | YES  |     | NULL    |       |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter125 canFam1snp125 ref_assembly Dog1.0
# note rowcount
# ContigLocFilter 3124054
# how many are positive strand?
# how many are positive strand?  hopefully 90%
# We get 100% here
mysql> select count(*) from ContigLocFilter where orientation = 0;
# 3124054

# note count by loc_type
mysql> select count(*), loc_type from ContigLocFilter group by loc_type;
# +----------+----------+
# | count(*) | loc_type |
# +----------+----------+
# |     2296 |        1 |
# |  2986459 |        2 |
# |   135299 |        3 |
# +----------+----------+

# filter ContigLocusId into ContigLocusIdFilter
# need to filter on contig_acc because ctg_id = 0 for all rows in ContigLocusId
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter125 canFam1snp125 ref_assembly
# note rowcount:
#   1219541 (same as ContigLocusId)

# condense ContigLocusIdFilter into ContigLocusIdCondense
# (one SNP can have multiple functions)
# assumes SNPs are in numerical order; will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense canFam1snp125
# note rowcount; expect about 50% (ascertainment bias for SNPs within genes)
# actually we get about 90% here
#   ContigLocusIdCondense 1168248
# could delete ContigLocusIdFilter table here

# create chrN_snpFasta tables from *.gnl files
# we are just using molType, but also storing class and observed
# need chromInfo for this
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta canFam1snp125
# (could start using pipeline.csh here)
# (pipeline.csh takes about 35 minutes to run)

# split ContigLocFilter by chrom; create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so it will not scale to
# scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom canFam1snp125 ref_assembly

# check coords using loc_type
# possible errors logged to snpLocType.error:
#   Unknown locType
#   Between with allele != '-'
#   Exact with end != start + 1
#   Range with end < start
# possible exceptions logged to snpLocType.exceptions:
#   RefAlleleWrongSize
# This run: no errors, no exceptions
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +-------------+------------+------+-----+---------+-------+
# | Field       | Type       | Null | Key | Default | Extra |
# +-------------+------------+------+-----+---------+-------+
# | snp_id      | int(11)    | NO   |     |         |       |
# | ctg_id      | int(11)    | NO   |     |         |       |
# | chromStart  | int(11)    | NO   |     |         |       |
# | chromEnd    | int(11)    | NO   |     |         |       |
# | loc_type    | tinyint(4) | NO   |     |         |       |
# | orientation | tinyint(4) | NO   |     |         |       |
# | allele      | blob       | YES  |     | NULL    |       |
# +-------------+------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype125 canFam1snp125 ref_assembly

# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# possible exceptions logged to snpExpandAllele.exceptions:
#   RefAlleleWrongSize
# This run: no errors, no exceptions; 43 alleles expanded
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele canFam1snp125 ref_assembly

# the next few steps prepare for working in UCSC space
# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort canFam1snp125 ref_assembly

# rename MT --> M (pipeline.csh takes care of this)
hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" canFam1snp125
# note that chrM_snpTmp is empty at this point

# get nib files
ssh hgwdev
cd /cluster/data/canFam1/nib
cp *.nib /cluster/home/heather/transfer/snp/canFam1
ssh kkr5u00
cd /scratch/snp/125/dog
mkdir nib
cp /cluster/home/heather/transfer/snp/canFam1/*.nib nib
rm /cluster/home/heather/transfer/snp/canFam1/*.nib
# edit path in chromInfo, load into canFam1snp125 with edited path

# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024; errAbort if detected
# check for coords that are too large, log to snpRefUCSC.error and skip
# This run: no errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC canFam1snp125
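# (Sketch, not part of the pipeline: nibFrag can be used to eyeball the
# reference allele that snpRefUCSC looks up; the chrom and coords here
# are hypothetical.)
nibFrag nib/chr1.nib 12345 12346 + oneBase.fa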
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +--------------------+------------+------+-----+---------+-------+
# | Field              | Type       | Null | Key | Default | Extra |
# +--------------------+------------+------+-----+---------+-------+
# | snp_id             | int(11)    | NO   |     |         |       |
# | ctg_id             | int(11)    | NO   |     |         |       |
# | chromStart         | int(11)    | NO   |     |         |       |
# | chromEnd           | int(11)    | NO   |     |         |       |
# | loc_type           | tinyint(4) | NO   |     |         |       |
# | orientation        | tinyint(4) | NO   |     |         |       |
# | allele             | blob       | YES  |     | NULL    |       |
# | refUCSC            | blob       | YES  |     | NULL    |       |
# | refUCSCReverseComp | blob       | YES  |     | NULL    |       |
# +--------------------+------------+------+-----+---------+-------+

# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC;
#   log RefAlleleMismatch if not
# if SNP is negative strand and allele != refUCSC, then check for
#   allele == refUCSCReverseComp:
#   if allele == refUCSCReverseComp, log RefAlleleNotRevComp
#   if allele matches neither refUCSC nor refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
#      0 RefAlleleMismatch
#   2758 RefAlleleNotRevComp
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles canFam1snp125
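# (Illustration only, not from the original run: the positive-strand
# branch of the snpCheckAlleles logic above can be spot-checked in SQL;
# chr1 is just an example, and loc_type 3 ("between") is excluded as
# described.)
hgsql canFam1snp125 -e \
  'select count(*) from chr1_snpTmp
   where orientation = 0 and loc_type != 3 and allele != refUCSC'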
# add class and observed, using univar_id from the SNP table
# to get class (subsnp_class) and observed (var_str) from UniVariation
# log errors to snpClassAndObserved.errors
# errors detected:
#   class = 0 in UniVariation
#   class > 8 in UniVariation
#   univar_id = 0 in SNP
#   no row in SNP for snp_id in chrN_snpTmp
# This run we got:
#   3 class = 0 in UniVariation
#   0 class > 8 in UniVariation
#   0 univar_id = 0 in SNP
#   0 no row in SNP for snp_id in chrN_snpTmp
# dbSNP has class = 'in-del';
# we promote this to 'deletion' for locTypes 1 & 2 and to 'insertion' for locType 3
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved canFam1snp125

# morph chrN_snpTmp
# +--------------------+--------------+------+-----+---------+-------+
# | Field              | Type         | Null | Key | Default | Extra |
# +--------------------+--------------+------+-----+---------+-------+
# | snp_id             | int(11)      | NO   |     |         |       |
# | chromStart         | int(11)      | NO   |     |         |       |
# | chromEnd           | int(11)      | NO   |     |         |       |
# | loc_type           | tinyint(4)   | NO   |     |         |       |
# | class              | varchar(255) | NO   |     |         |       |
# | orientation        | tinyint(4)   | NO   |     |         |       |
# | allele             | blob         | YES  |     | NULL    |       |
# | refUCSC            | blob         | YES  |     | NULL    |       |
# | refUCSCReverseComp | blob         | YES  |     | NULL    |       |
# | observed           | blob         | YES  |     | NULL    |       |
# +--------------------+--------------+------+-----+---------+-------+

# generate exceptions for class and observed:
#   SingleClassBetweenLocType
#   SingleClassRangeLocType
#   NamedClassWrongLocType
#   ObservedWrongFormat
#   ObservedWrongSize
#   ObservedMismatch
#   RangeSubstitutionLocTypeExactMatch
#   SingleClassTriAllelic
#   SingleClassQuadAllelic
# This will also detect IUPAC symbols in allele
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved canFam1snp125

# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction canFam1snp125

# add validation status and heterozygosity
# log error if validation status > 31 or missing
# no errors this run
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP canFam1snp125

# add molType
# errors detected: missing or duplicate molType
# this run: no errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype canFam1snp125

# generate chrN_snp125 and snp125Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable canFam1snp125 125

# concat into snp125.tab
# (cat chr*_snp125.tab >> snp125.tab)
/bin/sh concat.sh

# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple canFam1snp125 125
mysql> load data local infile 'snpMultiple.tab' into table snp125Exceptions;

# load on hgwdev
cp snp125.tab /cluster/home/heather/transfer/snp/canFam1
hgsql canFam1snp125 -e 'select * from snp125Exceptions' \
  > /cluster/home/heather/transfer/snp/canFam1/snp125Exceptions.tab
ssh hgwdev
cd transfer/snp/canFam1
mysql> load data local infile 'snp125.tab' into table snp125;
mysql> load data local infile 'snp125Exceptions.tab' into table snp125Exceptions;

# create indexes
mysql> alter table snp125 add index name (name);
mysql> alter table snp125 add index chrom (chrom, bin);
mysql> alter table snp125Exceptions add index name (name);

# create snp125ExceptionDesc table
cd /cluster/data/dbSNP
hgsql canFam1 < snp125ExceptionDesc.sql
# add counts to exception.canFam1.snp125; can start with exception.template
hgsql -e 'select count(*), exception from snp125Exceptions group by exception' canFam1
mysql> load data local infile 'exception.canFam1.snp125' into table snp125ExceptionDesc;

mysql> select count(*), exception from snp125Exceptions group by exception;
# +----------+---------------------------+
# | count(*) | exception                 |
# +----------+---------------------------+
# |    97271 | MultipleAlignments        |
# |     3687 | ObservedMismatch          |
# |     1496 | ObservedWrongSize         |
# |     2758 | RefAlleleNotRevComp       |
# |     4102 | SingleClassBetweenLocType |
# |       26 | SingleClassQuadAllelic    |
# |      799 | SingleClassRangeLocType   |
# |      359 | SingleClassTriAllelic     |
# +----------+---------------------------+


#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 10/16/06 angie)
# File emailed from Ginger Cheng
mkdir /cluster/data/canFam1/bed/genomicSuperDups
cd /cluster/data/canFam1/bed/genomicSuperDups
awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' canFam1_WGAC.tab \
| hgLoadBed canFam1 genomicSuperDups stdin \
    -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql


#####################################################################
# MICROSATELLITES (DONE 02/22/08 kuhn)
cd /cluster/data/canFam1/bed
mkdir microsat
cd microsat
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
hgLoadBed canFam1 microsat microsat.bed
# (actually needed to put the whole awk command on one line to get it to work)
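# (Quick check, not in the original log: confirm the row count after loading.)
hgsql canFam1 -e 'select count(*) from microsat'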