# for emacs: -*- mode: sh; -*- # This file describes browser build for the Medaka # genome, Oryzias latipes, October 2005, MEDAKA1 from # Institute of Genetics (NIG) and the University of Tokyo # NIG: http://dolphin.lab.nig.ac.jp/medaka/ # UTGB: http://medaka.utgenome.org/o # Data release policy: http://medaka.utgenome.org/#policy # Ensembl: http://www.ensembl.org/Oryzias_latipes/index.html # # "$Id: oryLat1.txt,v 1.42 2008/07/01 16:53:02 markd Exp $" # ########################################################################## ### Fetch sequence (DONE - 2006-11-22 - Hiram) ssh kkstore04 mkdir /cluster/store8/oryLat1 ln -s /cluster/store8/oryLat1 /cluster/data/oryLat1 cd /cluster/data/oryLat1 mkdir rawData cd rawData cat << '_EOF_' > fetch.sh #!/bin/sh for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 do wget --timestamping \ "ftp://ftp.ensembl.org/pub/current_oryzias_latipes/data/fasta/dna/Oryzias_latipes.MEDAKA1.41.dna.chromosome.${C}.fa.gz" \ -O chr${C}.fa.gz done wget --timestamping \ "ftp://ftp.ensembl.org/pub/current_oryzias_latipes/data/fasta/dna/Oryzias_latipes.MEDAKA1.41.dna.nonchromosomal.fa.gz" \ -O chrUn.fa.gz wget --timestamping \ "ftp://ftp.ensembl.org/pub/current_oryzias_latipes/data/fasta/dna/Oryzias_latipes.MEDAKA1.41.dna.seqlevel.fa.gz" \ -O seqlevel.fa.gz '_EOF_' # << happy emacs chmod +x fetch.sh ./fetch.sh # There is also repeat masked sequence there too cat << '_EOF_' > fetchRM.sh #!/bin/sh for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 do wget --timestamping \ "ftp://ftp.ensembl.org/pub/current_oryzias_latipes/data/fasta/dna/Oryzias_latipes.MEDAKA1.41.dna_rm.chromosome.${C}.fa.gz" \ -O chr${C}.rm.fa.gz done wget --timestamping \ "ftp://ftp.ensembl.org/pub/current_oryzias_latipes/data/fasta/dna/Oryzias_latipes.MEDAKA1.41.dna_rm.nonchromosomal.fa.gz" \ -O chrUn.rm.fa.gz wget --timestamping \ "ftp://ftp.ensembl.org/pub/current_oryzias_latipes/data/fasta/dna/Oryzias_latipes.MEDAKA1.41.dna_rm.seqlevel.fa.gz" \ -O seqlevel.rm.fa.gz '_EOF_' # << happy emacs chmod +x fetchRM.sh ./fetchRM.sh ########################################################################## ## Create AGP file from these sequence files ## (DONE - 2006-11-28 - Hiram) # Looks like we might be able to make an AGP file from the seqlevel.fa # file. Its fasta headers have position information, e.g.: # >scaffold605_contig102266 dna:contig chromosome:MEDAKA1:12:103214:103945:-1 # >scaffold605_contig102265 dna:contig chromosome:MEDAKA1:12:103956:105102:-1 # >scaffold605_contig102264 dna:contig chromosome:MEDAKA1:12:105113:107631:-1 # ... etc # Extract the headers out of the seqlevel.fa.gz file: zcat seqlevel.fa.gz | grep "^>" > seqlevel.fa.headers.txt # use the following perl script on those headers to generate an agp file: cat << '_EOF_' > allSeqLevel.pl #!/usr/bin/env perl use warnings; use strict; my %chrStartOrdered; open (FH,") { chomp $line; my @a = split('\s',$line); # first word on the line is a scaffold_contig identification $a[0] =~ s/^>//; # third word on the line is the chrom position indication my @b = split(':',$a[2]); # length of this scaffold_contig my $len = $b[4] - $b[3] + 1; my $strand = "+"; # a -1 indicates negative strand orientation for this scaffold_contig $strand = "-" if (-1 == $b[5]); my $key = ""; # key will be chrom.start if (($b[2] =~ m/^[0-9]+$/) && ($b[3] =~ m/^[0-9]+$/)) { $key = sprintf("%03d.%09d", $b[2], $b[3]); } else { my @scafContig = split('_',$a[0]); if (length($prevScaf) > 0) { if ($scafContig[0] =~ m/^$prevScaf$/) { $unGapSize = 10; } else { $unGapSize = 100; } } else { $unGapSize = 0; # first contig seen, no gap yet. } $prevScaf = $scafContig[0]; $unStart += $unGapSize; $key = sprintf("Un.%09d", $unStart); $unEnd = $unStart + $len - 1; $chrStartOrdered{$key} = sprintf "chrUn\t%d\t%d\t%d\tW\t%s\t1\t%d\t%s", $unStart,$unEnd, $seqCount++, $a[0], $len, $strand; $unStart = $unEnd + 1; next; } if ($b[4] =~ m/^[0-9]+$/) { $chrStartOrdered{$key} = sprintf "chr%s\t%d\t%d\t%d\tW\t%s\t1\t%d\t%s", $b[2],$b[3],$b[4], $seqCount++, $a[0], $len, $strand; } else { printf STDERR "non-numeric 4 ? '$line'\n"; next; } } close(FH); my $startChr = "chr1"; my $lastEnd = 0; $seqCount = 1; $prevScaf = ""; foreach my $key (sort keys(%chrStartOrdered)) { my ($chr, $start, $end, $seqNum, $type, $name, $one, $len, $strand) = split('\t',$chrStartOrdered{$key}); if ($chr !~ m/^$startChr$/) { $lastEnd = 0; $startChr = $chr; } my @scafContig = split('_',$name); my $gapType = "contig"; my $bridged = "yes"; if (length($prevScaf) > 0) { if ($prevScaf !~ m/^$scafContig[0]$/) { $gapType = "scaffold"; $bridged = "no"; } } $prevScaf = $scafContig[0]; if (1 == ($lastEnd+1)) { $gapType = "telomere"; $bridged = "no"; } if (($lastEnd +1) < $start) { my $gapLen = $start - $lastEnd - 1; if ($chr =~ m/chrUn/) { if (10 == $gapLen) { $gapType = "contig"; $bridged = "yes"; } else { $gapType = "scaffold"; $bridged = "no"; } } printf "%s\t%d\t%d\t%d\tN\t%d\t%s\t%s\n", $chr, $lastEnd+1, $start-1, $seqCount++, $gapLen, $gapType, $bridged; } printf "%s\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\n", $chr, $start, $end, $seqCount++, $type, $name, $one, $len, $strand; $lastEnd = $end; } '_EOF_' # << happy emacs chmod +x allSeqLevel.pl ./allSeqLevel.pl > allSeqLevel.agp 2> err ########################################################################## # Run the makeGenomeDb.pl script (DONE - 2006-11-28 - Hiram) # prepare for the makeGenomeDb.pl script: ssh hgwdev cd /cluster/data/oryLat1 # the config.ra script pretty much specifies everything cat << '_EOF_' > config.ra db oryLat1 scientificName Oryzias latipes assemblyDate Oct. 2005 assemblyLabel MEDAKA1 # Same order key as tetNig1 orderKey 40 # NC_004387 mitoAcc 25057156 fastaFiles /cluster/data/oryLat1/useSequence/seq*.fa.gz dbDbSpeciesDir medaka agpFiles /cluster/data/oryLat1/useSequence/oryLat1.agp commonName Medaka clade Vertebrate genomeCladePriority 105 '_EOF_' # << happy emacs # create a special useSequence directory to collect the essentials into # one single place mkdir useSequence cd useSequence ln -s ../rawData/seqlevel.fa.gz . ln -s ../rawData/allSeqLevel.agp ./oryLat1.agp cd /cluster/data/oryLat1 # Being brave, a complete run will do this entirely makeGenomeDb.pl config.ra > mgdb.out 2>&1 # This sequence creates and loads the following files into the # new database oryLat1 # chr*_gap, chr*_gold, chromInfo, gc5Base, grp # And, when you follow the instructions it gives at the end to check in # the trackDb files it creates, and you do a make in your trackDb # hierarchy, you will then create the trackDb and hgFindSpec tables # (with a 'make alpha') or your specific trackDb_logname # hgFindSpec_logname with a simple 'make' with no arguments. # The sequence also adds an entry to the dbDb table to turn on this # organism in the drop-down menus # run the script one step at a time if there are difficulties # (there were problems until I figured out how to synchronize the agp # file with the sequence file) makeGenomeDb.pl config.ra -stop=seq -workhorse=hgwdev makeGenomeDb.pl config.ra -continue=agp -stop=agp -workhorse=hgwdev makeGenomeDb.pl config.ra -continue=db -stop=db -workhorse=hgwdev makeGenomeDb.pl config.ra -continue=dbDb -stop=dbDb -workhorse=hgwdev makeGenomeDb.pl config.ra -continue=trackDb -stop=trackDb -workhorse=hgwdev # The trackDb step displays instructions on what to do to get the # generated source checked into the source tree. ######################################################################## ## Verify the generated source is correct and matches the chrom source ## obtained from Ensembl (DONE - 2006-11-29 - Hiram) # Extract the source from the 2bit file ssh kkstore04 cd /cluster/data/oryLat1 for C in ? ?? do echo $C twoBitToFa -seq=chr${C} oryLat1.unmasked.2bit stdout | gzip > ${C}/chr${C}.fa.gz done # Compare extracted source to the rawData/chr*.fa.gz files for C in ? ?? do echo -n "${C}: " faSize ${C}/chr${C}.fa.gz echo -n "${C}: " faSize rawData/chr${C}.fa.gz done > checkFa.out # Each faSize output line should be duplicated grep sequence checkFa.out | wc # 51 867 4876 # It is an odd number because there is a M/chrM.fa.gz # but no rawData/chrM.fa.gz grep sequence checkFa.out | sort -u | wc # 27 459 2579 # the chrUn count is different in each case because the # Ensembl chrUn.fa.gz is different that what was put together # here with the seqlevel.fa.gz source. They have the same sequence, # just a different size of gaps between scaffolds, and Ensemble # doesn't assemble it into a chrUn, they leave it as separate sequences: faSize Un/chrUn.fa.gz # 119265241 bases (1021750 N's 118243491 real 118243491 upper 0 lower) # in 1 sequences in 1 files faSize rawData/chrUn.fa.gz # 144764649 bases (26521158 N's 118243491 real 118243491 upper 0 lower) # in 7164 sequences in 1 files ######################################################################### # REPEATMASKER (DONE - 2006-11-29 - Hiram) ssh kkstore04 cd /cluster/data/oryLat1/bed # Run -debug to create the dir structure and preview the scripts: # kk and kki currently down at this run, specify pk ~/kent/src/hg/utils/automation/doRepeatMasker.pl \ oryLat1 -verbose=3 -debug -workhorse=hgwdev -bigClusterHub=pk \ -smallClusterHub=pk > rmRun.debug.log 2>&1 # Run it for real and tail the log to follow progress: ~/kent/src/hg/utils/automation/doRepeatMasker.pl \ oryLat1 -verbose=3 -workhorse=hgwdev -bigClusterHub=pk \ -smallClusterHub=pk > RepeatMasker.2006-11-29/do.log 2>&1 & tail -f RepeatMasker.2006-11-29/do.log # When this finished, it created a masked 2bit: # 214145351 Nov 29 16:33 /cluster/data/oryLat1/oryLat1.rmsk.2bit # But it did not change the /gbdb/oryLat1/oryLat1.2bit symlink # RepeatMasker and lib version from do.log: # October 6 2006 (open-3-1-6) version of RepeatMasker # CC RELEASE 20061006; # Calculate coverage to compare with other masking efforts featureBits oryLat1 rmsk # 21095035 bases of 700386597 (3.012%) in intersection # It appears the Ensemble chr*.RM.fa.gz files have been hard masked. ######################################################################### ## WINDOWMASKER (DONE - 2006-11-29 - Hiram) # Trying different repeat mask process to see if we get significant # different masking ssh hgwdev cd /cluster/data/oryLat1/bed ~/kent/src/hg/utils/automation/doWindowMasker.pl oryLat1 \ -workhorse=kolossus > wmRun.log 2>&1 & # Save the log mv wmRun.log WindowMasker.2006-11-29 # Looks like there are two results: cd WindowMasker.2006-11-29 ls -og *.2bit # -rw-rw-r-- 1 249361695 Nov 29 21:06 oryLat1.wmsk.2bit # -rw-rw-r-- 1 250245447 Nov 29 21:06 oryLat1.wmsk.sdust.2bit # Let's see what got masked twoBitToFa oryLat1.wmsk.2bit stdout | gzip > oryLat1.wmsk.fa.gz twoBitToFa oryLat1.wmsk.sdust.2bit stdout | gzip > oryLat1.wmsk.sdust.fa.gz faSize oryLat1.wmsk.fa.gz > faSize.wmsk.txt faSize oryLat1.wmsk.sdust.fa.gz > faSize.wmsk.sdust.txt cat faSize.wmsk.sdust.txt # 843500808 bases (143114211 N's 700386597 real 468909282 upper # 231477315 lower) in 26 sequences in 1 files cat faSize.wmsk.txt # 843500808 bases (143114211 N's 700386597 real 474213482 upper # 226173115 lower) in 26 sequences in 1 files # Percent masked sdust: %33.05 = 100*231477315/700386597 # Percent masked wmsk: %32.29 = 100*226173115/700386597 # Load this table hgLoadBed -strict oryLat1 windowmaskerSdust windowmasker.sdust.bed.gz ######################################################################## ## Check how much ensembl sequence is repeat masked ssh kkstore04 cd /cluster/data/oryLat1/rawData faSize chr?.fa.gz chr1?.fa.gz chr2?.fa.gz > faSize.txt 2>&1 & faSize chr?.RM.fa.gz chr1?.RM.fa.gz chr2?.RM.fa.gz > faSize.RM.txt 2>&1 & cat faSize.RM.txt # 724218853 bases (171323306 N's 552895547 real 552895547 upper 0 lower) # in 24 sequences in 24 files cat faSize.txt # 724218853 bases (142092461 N's 582126392 real 582126392 upper 0 lower) # in 24 sequences in 24 files # additional N's in the masked sequence: 29230845 = 171323306 - 142092461 # percent of real sequence knocked out %5.02 = 100*29230845/582126392 ######################################################################### # SIMPLE REPEATS (TRF) (DONE 2006-12-04 - Hiram) ssh kolossus mkdir /cluster/data/oryLat1/bed/simpleRepeat cd /cluster/data/oryLat1/bed/simpleRepeat time nice -n 19 twoBitToFa ../../oryLat1.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 & # ~1h 20m # Make a filtered version for sequence masking: awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed splitFileByColumn trfMask.bed trfMaskChrom # Load unfiltered repeats into the database: ssh hgwdev hgLoadBed oryLat1 simpleRepeat \ /cluster/data/oryLat1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits oryLat1 simpleRepeat # 11740975 bases of 700386597 (1.676%) in intersection ######################################################################### ## Add TRF mask to WindowMasker masked sequence ssh kkstore04 cd /cluster/data/oryLat1 twoBitMask bed/WindowMasker.2006-11-29/oryLat1.wmsk.sdust.2bit \ -add bed/simpleRepeat/trfMask.bed oryLat1.sdTrf.2bit ######################################################################### ## Send sequence to san for kluster runs (DONE - 2006-12-04 - Hiram) ssh kkstore04 cd /cluster/data/oryLat1 mkdir /san/sanvol1/scratch/oryLat1 cp -p oryLat1.sdTrf.2bit /san/sanvol1/scratch/oryLat1/ ######################################################################### # BLASTZ/CHAIN/NET HG18 (DONE - 2006-12-04 - Hiram) # first time with everything, done later without the randoms and chrUn ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.hg18.2006-12-04 cd /cluster/data/oryLat1/bed/blastz.hg18.2006-12-04 cat << '_EOF_' > DEF # Medaka vs. Human # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # except for chrUn of course SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes # QUERY: Human hg18 SEQ2_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ2_LEN=/cluster/data/hg18/chrom.sizes SEQ2_CHUNK=50000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/oryLat1/bed/blastz.hg18.2006-12-04 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/oryLat1Hg18 > do.log 2>&1 & # real 312m34.518s # user 0m0.151s # sys 0m0.123s # failed during the netClass operation in loadUp.csh due to missing # rmsk table in oryLat1 - added options to netClass to allow # specification of repeat table, and rerun the two commands here to # finish off the loadUp.csh step: ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.hg18.2006-12-04/axtChain netClass -verbose=2 -noAr -tRepeats=repeats -qRepeats=windowmaskerSdust \ noClass.net oryLat1 hg18 oryLat1.hg18.net netFilter -minGap=10 oryLat1.hg18.net \ | hgLoadNet -verbose=0 oryLat1 netHg18 stdin # Then, continuing: ssh kkstore04 cd /cluster/data/oryLat1/bed/blastz.hg18.2006-12-04 time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -continue=download \ -blastzOutRoot /cluster/bluearc/oryLat1Hg18 > download.log 2>&1 & cd /cluster/data/oryLat1/bed ln -s blastz.hg18.2006-12-04 blastz.hg18 ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.hg18 nice featureBits oryLat1 chainHg18Link > fb.oryLat1.hg18.txt 2>&1 # 48309237 bases of 700386597 (6.898%) in intersection # Let's try a swap and see what happens ssh kkstore04 mkdir /cluster/data/hg18/bed/blastz.oryLat1.swap cd /cluster/data/hg18/bed/blastz.oryLat1.swap time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.hg18.2006-12-04/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -swap \ -blastzOutRoot /cluster/bluearc/oryLat1Hg18 > swap.log 2>&1 & # fails at: # HgStepManager: executing step 'net'. # netChains: looks like previous stage was not successful # (can't find [hg18.oryLat1.]all.chain[.gz]). # it will continue at net after this: time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.hg18.2006-12-04/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -continue=net -swap \ -blastzOutRoot /cluster/bluearc/oryLat1Hg18 > net_swap.log 2>&1 & # failed on cleanUp because /cluster/bluearc/oryLat1Hg18 did not exist nice -n 19 featureBits hg18 chainOryLat1Link > fb.hg18.oryLat1.txt 2>&1 # 57908796 bases of 2881515245 (2.010%) in intersection ######################################################################### # BLASTZ/CHAIN/NET FR1 (DONE - 2006-12-04 - Hiram) ## first time, run later with fr1 scaffolds and oryLat1 chrUn out of the ## picture ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.fr1.2006-12-04 cd /cluster/data/oryLat1/bed/blastz.fr1.2006-12-04 cat << '_EOF_' > DEF # Medaka vs. Fugu # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # except for chrUn of course SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes # QUERY: Fugu fr1 SEQ2_DIR=/san/sanvol1/scratch/fr1/fr1.sdTrf.2bit SEQ2_LEN=/cluster/data/fr1/chrom.sizes SEQ2_CHUNK=40000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/oryLat1/bed/blastz.fr1.2006-12-04 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/oryLat1Fr1 > do.log 2>&1 & # real 832m8.618s # user 0m0.165s # sys 0m0.109s # failed during the netClass operation in loadUp.csh due to missing # rmsk table in oryLat1 - added options to netClass to allow # specification of repeat table, and rerun the two commands here to # finish off the loadUp.csh step: ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.fr1.2006-12-04/axtChain netClass -verbose=2 -noAr -tRepeats=repeats -qRepeats=windowmaskerSdust \ noClass.net oryLat1 fr1 oryLat1.fr1.net netFilter -minGap=10 oryLat1.fr1.net \ | hgLoadNet -verbose=0 oryLat1 netFr1 stdin # Then, continuing: ssh kkstore04 cd /cluster/data/oryLat1/bed/blastz.fr1.2006-12-04 time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -continue=download \ -blastzOutRoot /cluster/bluearc/oryLat1Fr1 > download.log 2>&1 & cd /cluster/data/oryLat1/bed ln -s blastz.fr1.2006-12-04 blastz.fr1 ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.fr1 nice -n 19 featureBits oryLat1 chainFr1Link > fb.oryLat1.fr1.txt 2>&1 # 170744130 bases of 700386597 (24.379%) in intersection # Let's try a swap and see what happens ssh kkstore04 mkdir /cluster/data/fr1/bed/blastz.oryLat1.swap cd /cluster/data/fr1/bed/blastz.oryLat1.swap time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.fr1.2006-12-04/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -swap \ -blastzOutRoot /cluster/bluearc/oryLat1Fr1 > swap.log 2>&1 & # fails at: # HgStepManager: executing step 'net'. # netChains: looks like previous stage was not successful # (can't find [fr1.oryLat1.]all.chain[.gz]). # it will continue at net after this: time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.fr1.2006-12-04/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -continue=net -swap > net_swap.log 2>&1 & ######################################################################### # BLASTZ/CHAIN/NET danRer4 (DONE - 2006-12-04 - Hiram) ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.danRer4.2006-12-04 cd /cluster/data/oryLat1/bed/blastz.danRer4.2006-12-04 cat << '_EOF_' > DEF # Medaka vs. Zebrafish # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # except for chrUn of course SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes # QUERY: Zebrafish danRer4 SEQ2_DIR=/san/sanvol1/scratch/danRer4/danRer4.sdTrf.2bit SEQ2_LEN=/cluster/data/danRer4/chrom.sizes SEQ2_CHUNK=40000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/oryLat1/bed/blastz.danRer4.2006-12-04 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/oryLat1DanRer4 > do.log 2>&1 & # failed during the netClass operation in loadUp.csh due to missing # rmsk table in oryLat1 - added options to netClass to allow # specification of repeat table, and rerun the two commands here to # finish off the loadUp.csh step: ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.danRer4.2006-12-04/axtChain netClass -verbose=2 -noAr -tRepeats=repeats -qRepeats=windowmaskerSdust \ noClass.net oryLat1 danRer4 oryLat1.danRer4.net netFilter -minGap=10 oryLat1.danRer4.net \ | hgLoadNet -verbose=0 oryLat1 netDanRer4 stdin # Then continuing ssh kkstore04 cd /cluster/data/oryLat1/bed/blastz.danRer4.2006-12-04 time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -continue=download \ -blastzOutRoot /cluster/bluearc/oryLat1DanRer4 > download.log 2>&1 & cd /cluster/data/oryLat1/bed ln -s blastz.danRer4.2006-12-04 blastz.danRer4 ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.danRer4 nice -n 19 featureBits oryLat1 chainDanRer4Link >fb.oryLat1.danRer4.txt 2>&1 # 161293587 bases of 700386597 (23.029%) in intersection # Let's try a swap and see what happens ssh kkstore04 mkdir /cluster/data/danRer4/bed/blastz.oryLat1.swap cd /cluster/data/danRer4/bed/blastz.oryLat1.swap time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.danRer4.2006-12-04/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -swap \ -blastzOutRoot /cluster/bluearc/oryLat1DanRer4 > swap.log 2>&1 & time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.danRer4.2006-12-04/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -continue=net -swap > swap.log 2>&1 & ######################################################################### # BLASTZ/CHAIN/NET gasAcu1 (DONE - 2006-12-04 - Hiram) ## first time - run later with chrUn in the picture ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-04 cd /cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-04 cat << '_EOF_' > DEF # Medaka vs. Stippleback # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # except for chrUn of course SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes # QUERY: Stippleback gasAcu1 SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit SEQ2_LEN=/cluster/data/gasAcu1/chrom.sizes SEQ2_CHUNK=40000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-04 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/oryLat1GasAcu1 > do.log 2>&1 & # real 419m21.385s # user 0m0.168s # sys 0m0.123s time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/oryLat1GasAcu1 > do.log 2>&1 & time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -continue=download \ -blastzOutRoot /cluster/bluearc/oryLat1GasAcu1 > download.log 2>&1 & cd /cluster/data/oryLat1/bed ln -s blastz.gasAcu1.2006-12-04 blastz.gasAcu1 ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.gasAcu1 nice -n 19 featureBits oryLat1 chainGasAcu1Link >fb.oryLat1.gasAcu1.txt 2>&1 # 200841711 bases of 700386597 (28.676%) in intersection # the 2006-12-22 run on oryLat1 was eventually used instead of a swap ######################################################################### # BLASTZ/CHAIN/NET tetNig1 (DONE - 2006-12-04 - Hiram) ## first time, run later without the chrUn in oryLat1 or randoms in tetNig1 ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-04 cd /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-04 cat << '_EOF_' > DEF # Medaka vs. Tetraodon # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # except for chrUn of course SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes # QUERY: Tetraodon tetNig1 SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit SEQ2_LEN=/cluster/data/tetNig1/chrom.sizes SEQ2_CHUNK=40000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-04 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/oryLat1TetNig1 > do.log 2>&1 & # real 362m9.644s # user 0m0.160s # sys 0m0.116s # failed during the netClass operation in loadUp.csh due to missing # rmsk table in oryLat1 - added options to netClass to allow # specification of repeat table, and rerun the two commands here to # finish off the loadUp.csh step: ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-04/axtChain netClass -verbose=2 -noAr -tRepeats=repeats -qRepeats=windowmaskerSdust \ noClass.net oryLat1 tetNig1 oryLat1.tetNig1.net netFilter -minGap=10 oryLat1.tetNig1.net \ | hgLoadNet -verbose=0 oryLat1 netTetNig1 stdin # Then, continuing: ssh kkstore04 cd /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-04 time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -continue=download \ -blastzOutRoot /cluster/bluearc/oryLat1TetNig1 > download.log 2>&1 & cd /cluster/data/oryLat1/bed ln -s blastz.tetNig1.2006-12-04 blastz.tetNig1 ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.tetNig1 nice -n 19 featureBits oryLat1 chainTetNig1Link >fb.oryLat1.tetNig1.txt 2>&1 # 161622943 bases of 700386597 (23.076%) in intersection # the run from 2006-12-13 was used for the swap # Let's try a swap and see what happens ssh kkstore04 mkdir /cluster/data/tetNig1/bed/blastz.oryLat1.swap cd /cluster/data/tetNig1/bed/blastz.oryLat1.swap time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-04/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -swap > swap.log 2>&1 & # fails at: # HgStepManager: executing step 'net'. # netChains: looks like previous stage was not successful # (can't find [tetNig1.oryLat1.]all.chain[.gz]). time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-04/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -continue=net -swap > net_swap.log 2>&1 & ######################################################################## ######################################################################## gasAcu1 # 200841711 bases of 700386597 (28.676%) in intersection fr1 # 170744130 bases of 700386597 (24.379%) in intersection tetNig1 # 161622943 bases of 700386597 (23.076%) in intersection danRer4 # 161293587 bases of 700386597 (23.029%) in intersection hg18 # 48309237 bases of 700386597 (6.898%) in intersection ######################################################################## on hg18 # 57908796 bases of 2881515245 (2.010%) in intersection ######################################################################## ## # featureBits chainLink measures # chainOryLat1Link chain linearGap # distance on oryLat1 on other minScore # 1 0.xxxx - stickleback gasAcu1 (% 28.676) (% 38.909) 2000 loose # 2 0.xxxx - fugu fr1 (% 24.379) (% 41.994) 2000 loose # 3 0.xxxx - tetraodon tetNig1 (% 23.076) (no swap) 2000 loose # 4 0.xxxx - zebrafish danRer4 (% 23.029) (% 13.084) 2000 loose # 5 0.xxxx - human hg18 (% 6.898) (% 2.010) 2000 loose ###### ## Second time around, without the randoms and chrUn sequences: # 1 0.xxxx - stickleback gasAcu1 (% 28.110) (% 37.899) 2000 loose # 2 0.xxxx - fugu fr1 (% 24.764) (% 40.018) 2000 loose # 3 0.xxxx - tetraodon tetNig1 (% 17.585) (% 39.552) 2000 loose # 4 0.xxxx - zebrafish danRer4 (% 22.755) (% 12.192) 2000 loose # 5 0.xxxx - human hg18 (% 6.896) (% 1.791) 2000 loose intersection gasAcu1 # 163640802 bases of 582143106 (28.110%) in intersection on gasAcu1 # 148335558 bases of 391398261 (37.899%) in intersection tetNig1 # 102368873 bases of 582143106 (17.585%) in intersection on tetNig1 # 81227466 bases of 205367312 (39.552%) in intersection fr1 # 173441855 bases of 700386597 (24.764%) in intersection on fr1 # 126265137 bases of 315518167 (40.018%) in intersection danRer4 # 159369670 bases of 700386597 (22.755%) in intersection on danRer4 # 188598788 bases of 1546950119 (12.192%) in intersection hg18 # 48299069 bases of 700386597 (6.896%) in intersection on hg18 # 51598883 bases of 2881515245 (1.791%) in intersection ######################################################################### # BLASTZ/CHAIN/NET FR1 (DONE - 2006-12-08 - Hiram) ## Second time, no chrUn in oryLat1, and align to fr1 scaffolds, ## results lifted to fr1 chrUn coordinates ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.fr1.2006-12-08 cd /cluster/data/oryLat1/bed/blastz.fr1.2006-12-08 cat << '_EOF_' > DEF # Medaka vs. Fugu # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # no chrUn in this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 # QUERY: Fugu fr1 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates SEQ2_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/cluster/data/oryLat1/bed/blastz.fr1.2006-12-08 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/oryLat1Fr1 > do.log 2>&1 & # real 424m7.567s # user 0m0.199s # sys 0m0.160s cd /cluster/data/oryLat1/bed rm blastz.fr1 ln -s blastz.fr1.2006-12-08 blastz.fr1 ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.fr1 nice -n 19 featureBits oryLat1 chainFr1Link > fb.oryLat1.fr1.txt 2>&1 # 173441855 bases of 700386597 (24.764%) in intersection # Let's try a swap and see what happens ssh kkstore04 mkdir /cluster/data/fr1/bed/blastz.oryLat1.swap cd /cluster/data/fr1/bed/blastz.oryLat1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.fr1.2006-12-08/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -swap > swap.log 2>&1 & # real 32m7.742s # user 0m0.148s # sys 0m0.097s ssh hgwdev cd /cluster/data/fr1/bed/blastz.oryLat1.swap time nice -n 19 featureBits fr1 chainOryLat1Link \ > fb.fr1.chainOryLat1Link.txt 2>&1 # 126265137 bases of 315518167 (40.018%) in intersection ######################################################################### # BLASTZ/CHAIN/NET gasAcu1 (DONE - 2006-12-08 - Hiram) ## Second time, without chrUn in either assembly ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-08 cd /cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-08 cat << '_EOF_' > DEF # Medaka vs. Stippleback # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # no chrUn in this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 # QUERY: Stippleback gasAcu1, no chrUn in this run SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes SEQ2_CHUNK=40000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-08 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/oryLat1GasAcu1 > do.log 2>&1 & # real 99m16.562s cd /cluster/data/oryLat1/bed rm blastz.gasAcu1 ln -s blastz.gasAcu1.2006-12-08 blastz.gasAcu1 ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.gasAcu1 nice -n 19 featureBits -noRandom oryLat1 chainGasAcu1Link \ >fb.oryLat1.gasAcu1.txt 2>&1 & # 163640802 bases of 582143106 (28.110%) in intersection # And to swap ## This sequence duplicated in gasAcu1.txt too ssh kkstore04 cd /cluster/data/gasAcu1/bed rm blastz.oryLat1.swap mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.swap cd /cluster/data/gasAcu1/bed/blastz.oryLat1.swap time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-08/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -swap > swap.log 2>&1 & # real 26m32.549s ssh hgwdev cd /cluster/data/gasAcu1/bed/blastz.oryLat1.swap nice -n 19 featureBits -noRandom gasAcu1 chainOryLat1Link \ >fb.gasAcu1.oryLat1.txt 2>&1 & # 148335558 bases of 391398261 (37.899%) in intersection ######################################################################### # BLASTZ/CHAIN/NET tetNig1 (DONE - 2006-12-08 - Hiram) ## second time, no chrUn in oryLat1 and not randoms or chrUn in tetNig1 ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-08 cd /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-08 cat << '_EOF_' > DEF # Medaka vs. Tetraodon # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # except for chrUn of course SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes # QUERY: Tetraodon tetNig1 SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit SEQ2_LEN=/cluster/data/tetNig1/chrom.sizes SEQ2_CHUNK=40000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-04 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/oryLat1TetNig1 > do.log 2>&1 & # real 42m53.388s cd /cluster/data/oryLat1/bed rm blastz.tetNig1 ln -s blastz.tetNig1.2006-12-08 blastz.tetNig1 ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.tetNig1 nice -n 19 featureBits -noRandom oryLat1 chainTetNig1Link \ >fb.oryLat1.tetNig1.txt 2>&1 # 102368873 bases of 582143106 (17.585%) in intersection # Let's try a swap and see what happens ssh kkstore04 cd /cluster/data/tetNig1/bed mv blastz.oryLat1.swap blastz.oryLat1.swap.2006-12-05 mkdir blastz.oryLat1.swap cd /cluster/data/tetNig1/bed/blastz.oryLat1.swap time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-08/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -swap > swap.log 2>&1 & # real 11m57.050s nice -n 19 featureBits -noRandom tetNig1 chainOryLat1Link \ >fb.tetNig1.oryLat1.txt 2>&1 # 81227466 bases of 205367312 (39.552%) in intersection ######################################################################### # BLASTZ/CHAIN/NET danRer4 (DONE - 2006-12-08 - Hiram) ## Second time - now without chrUn in medata, or any of ## the randoms or Un in danRer4 ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.danRer4.2006-12-08 cd /cluster/data/oryLat1/bed/blastz.danRer4.2006-12-08 cat << '_EOF_' > DEF # Medaka vs. Zebrafish # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # no chrUn in this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 # QUERY: Zebrafish danRer4, no randoms or Un in this sequence SEQ2_DIR=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/oryLat1/bed/blastz.danRer4.2006-12-08 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/oryLat1DanRer4 > do.log 2>&1 & # real 290m39.870s # user 0m0.193s # sys 0m0.164s cd /cluster/data/oryLat1/bed rm blastz.danRer4 ln -s blastz.danRer4.2006-12-08 blastz.danRer4 ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.danRer4 nice -n 19 featureBits oryLat1 chainDanRer4Link >fb.oryLat1.danRer4.txt 2>&1 # 159369670 bases of 700386597 (22.755%) in intersection # Let's try a swap and see what happens ssh kkstore04 mkdir /cluster/data/danRer4/bed/blastz.oryLat1.swap cd /cluster/data/danRer4/bed/blastz.oryLat1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.danRer4.2006-12-08/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -swap > swap.log 2>&1 & # real 84m39.497s # user 0m0.148s # sys 0m0.086s ssh hgwdev cd /cluster/data/danRer4/bed/blastz.oryLat1.swap time nice -n 19 featureBits -noRandom danRer4 chainOryLat1Link \ > fb.danRer4.chainOryLat1Link.txt 2>&1 # 188598788 bases of 1546950119 (12.192%) in intersection ######################################################################### # BLASTZ/CHAIN/NET HG18 (DONE - 2006-12-08 - Hiram) # second time without the randoms and chrUn ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.hg18.2006-12-08 cd /cluster/data/oryLat1/bed/blastz.hg18.2006-12-08 cat << '_EOF_' > DEF # Medaka vs. Human # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # no chrUn in this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 # QUERY: Human hg18, no randoms in this sequence SEQ2_DIR=/san/sanvol1/scratch/hg18/hg18.noUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/hg18/hg18.noUn.sdTrf.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/oryLat1/bed/blastz.hg18.2006-12-08 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/oryLat1Hg18 > do.log 2>&1 & # real 353m18.317s # user 0m0.199s # sys 0m0.170s cd /cluster/data/oryLat1/bed rm -f blastz.hg18 ln -s blastz.hg18.2006-12-08 blastz.hg18 ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.hg18 time nice -n 19 featureBits -noRandom oryLat1 chainHg18Link \ > fb.oryLat1.hg18.txt 2>&1 & # 48299069 bases of 700386597 (6.896%) in intersection # 40936335 bases of 582143106 (7.032%) in intersection # Let's try a swap and see what happens ssh kkstore04 mkdir /cluster/data/hg18/bed/blastz.oryLat1.swap cd /cluster/data/hg18/bed/blastz.oryLat1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.hg18.2006-12-08/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -swap > swap.log 2>&1 & # real 39m35.656s # user 0m0.147s # sys 0m0.100s time nice -n 19 featureBits -noRandom hg18 chainOryLat1Link \ > fb.hg18.oryLat1.txt 2>&1 & # 51598883 bases of 2881515245 (1.791%) in intersection # 51259072 bases of 2868834265 (1.787%) in intersection ############################################################################# ## make up chrUn sequence of its individual scaffolds ## and a 2bit for the whole thing for blastz runs (DONE - 2006-12-13 - Hiram) ssh kkstore04 cd /cluster/data/oryLat1/jkStuff cp /cluster/data/mm7/jkStuff/agpToLift.pl . cp /cluster/data/mm7/jkStuff/lft2BitToFa.pl . cd /cluster/data/oryLat1/Un ../jkStuff/agpToLift.pl chrUn.agp > chrUn.lift time ../jkStuff/lft2BitToFa.pl ../oryLat1.sdTrf.2bit \ chrUn.lift > chrUnScaffolds.fa # make a 2bit with this scaffold sequence faToTwoBit ../noUn/chr*.fa chrUnScaffolds.fa oryLat1UnScaffolds.2bit twoBitInfo *.2bit oryLat1UnScaffolds.sizes # Verify it is complete twoBitToFa oryLat1UnScaffolds.2bit stdout \ | faSize stdin > oryLat1UnScaffolds.faSize mv oryLat1UnScaffolds.* .. ######################################################################### # BLASTZ/CHAIN/NET FR1 (DONE - 2006-12-13 - 2007-01-18 - Hiram) ## Third time, chrUn in scaffolds in oryLat1, and align to fr1 scaffolds, ## both results lifted properly to chrUn coordinates ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.fr1.2006-12-13 cd /cluster/data/oryLat1/bed/blastz.fr1.2006-12-13 cat << '_EOF_' > DEF # Medaka vs. Fugu # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes SEQ1_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit SEQ1_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes SEQ1_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LIMIT=90 # QUERY: Fugu fr1 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates SEQ2_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=90 BASE=/cluster/data/oryLat1/bed/blastz.fr1.2006-12-13 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=pk -blastzOutRoot \ /cluster/bluearc/oryLat1Fr1 > do.log 2>&1 & ### Two jobs fail during the chain step. axtChain gets a segfault # by-passing those, continuing with chainMerge time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=chainMerge -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/oryLat1Fr1 > chainMerge.log 2>&1 & cd /cluster/data/oryLat1/bed rm blastz.fr1 ln -s blastz.fr1.2006-12-13 blastz.fr1 ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.fr1 nice -n 19 featureBits oryLat1 chainFr1Link > fb.oryLat1.fr1.txt 2>&1 # 165968083 bases of 700386597 (23.697%) in intersection # Let's try a swap and see what happens ssh kkstore04 mkdir /cluster/data/fr1/bed/blastz.oryLat1.swap cd /cluster/data/fr1/bed/blastz.oryLat1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.fr1.2006-12-13/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -swap > swap.log 2>&1 & time doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.fr1.2006-12-13/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=net -bigClusterHub=pk -swap > net_swap.log 2>&1 & cat fb.fr1.chainOryLat1Link.txt # 129282429 bases of 315518167 (40.975%) in intersection ######################################################################### # BLASTZ/CHAIN/NET HG18 (DONE - 2006-12-13 - Hiram) # third time with randoms and chrUn in scaffolds on both sequences ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.hg18.2006-12-13 cd /cluster/data/oryLat1/bed/blastz.hg18.2006-12-13 cat << '_EOF_' > DEF # Medaka vs. Human # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes SEQ1_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit SEQ1_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes SEQ1_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LIMIT=80 # QUERY: Human hg18, randoms in contigs, lifted to their chr*_random SEQ2_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ2_LEN=/cluster/data/hg18/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/hg18/hg18.randomContigs.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/hg18/hg18.randomContigs.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/hg18/hg18.randomContigs.lift SEQ2_CHUNK=10000000 SEQ2_LIMIT=80 SEQ2_LAP=0 BASE=/cluster/data/oryLat1/bed/blastz.hg18.2006-12-13 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -verbose=2 -blastzOutRoot /cluster/bluearc/oryLat1Hg18 > do.log 2>&1 & # abandonded this effort, the swap from hg18 was good enough ######################################################################### # BLASTZ/CHAIN/NET danRer4 (DONE - 2006-12-13 - Hiram) ## Third time - randoms and Un in scaffolds, both sequences ## The actual run that was used was done on danRer4 and swapped to here ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.danRer4.2006-12-13 cd /cluster/data/oryLat1/bed/blastz.danRer4.2006-12-13 cat << '_EOF_' > DEF # Medaka vs. Zebrafish # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes SEQ1_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit SEQ1_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes SEQ1_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Zebrafish danRer4, no randoms or Un in this sequence SEQ2_DIR=/san/sanvol1/scratch/danRer4/danRer4.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/danRer4/danRer4.randomContigs.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/danRer4/danRer4.randomContigs.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/danRer4/danRer4.randomContigs.lift SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/oryLat1/bed/blastz.danRer4.2006-12-13 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=kk \ -verbose=2 -blastzOutRoot /cluster/bluearc/oryLat1DanRer4 >do.log 2>&1 & time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk \ -continue=cat -verbose=2 \ -blastzOutRoot /cluster/bluearc/oryLat1DanRer4 >cat.log 2>&1 & time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk \ -continue=chainMerge -verbose=2 \ -blastzOutRoot /cluster/bluearc/oryLat1DanRer4 > chainMerge.log 2>&1 & # abandon this effort here, the swap from danRer4 was good enough ######################################################################### ## SWAP danRer4 to oryLat1 (DONE - 2007-01-20 - Hiram) ## This is the one that ended up being used ssh kkstore04 cd /cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19 cat fb.danRer4.chainOryLat1Link.txt # 209746583 bases of 1626093931 (12.899%) in intersection mkdir /cluster/data/oryLat1/bed/blastz.swap.danRer4 cd /cluster/data/oryLat1/bed/blastz.swap.danRer4 time doBlastzChainNet.pl -verbose=2 \ /cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk > swap.log 2>&1 & cat fb.oryLat1.chainDanRer4Link.txt # 156014546 bases of 700386597 (22.275%) in intersection cd /cluster/data/oryLat1/bed ln -s blastz.swap.danRer4 blastz.danRer4 ######################################################################### # BLASTZ/CHAIN/NET tetNig1 (DONE - 2006-12-13 - Hiram) ## third time, chrUn and randoms are in contigs in each sequence ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-13 cd /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-13 cat << '_EOF_' > DEF # Medaka vs. Tetraodon # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes SEQ1_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit SEQ1_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes SEQ1_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Tetraodon tetNig1 SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-13 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk -verbose=2 -blastzOutRoot \ /cluster/bluearc/oryLat1TetNig1 > do.log 2>&1 & time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -continue=cat \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk -verbose=2 \ -blastzOutRoot /cluster/bluearc/oryLat1TetNig1 > cat.log 2>&1 & time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -continue=chainRun \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk -verbose=2 \ -blastzOutRoot /cluster/bluearc/oryLat1TetNig1 > chain.log 2>&1 & cd /cluster/data/oryLat1/bed rm blastz.tetNig1 ln -s blastz.tetNig1.2006-12-13 blastz.tetNig1 ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.tetNig1 nice -n 19 featureBits -noRandom oryLat1 chainTetNig1Link \ >fb.oryLat1.tetNig1.txt 2>&1 # 160388236 bases of 700386597 (22.900%) in intersection # And the swap ssh kkstore04 cd /cluster/data/tetNig1/bed mv blastz.oryLat1.swap blastz.oryLat1.swap.2006-12-08 mkdir blastz.oryLat1.swap cd /cluster/data/tetNig1/bed/blastz.oryLat1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-13/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=kk > swap.log 2>&1 & time doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-13/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=net -swap -bigClusterHub=kk > net-swap.log 2>&1 & ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.oryLat1.swap nice -n 19 featureBits -noRandom tetNig1 chainOryLat1Link \ >fb.tetNig1.oryLat1.txt 2>&1 # 130327582 bases of 342403326 (38.063%) in intersection ######################################################################### # BLASTZ/CHAIN/NET gasAcu1 (DONE - 2006-12-14 - Hiram) ## Third time, randoms and chrUn in contigs for both assemblies ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-14 cd /cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-14 cat << '_EOF_' > DEF # Medaka vs. Stippleback # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes SEQ1_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit SEQ1_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes SEQ1_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Stippleback gasAcu1, no chrUn in this run SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.lift SEQ2_CHUNK=35000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-14 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub pk -blastzOutRoot \ /cluster/bluearc/oryLat1GasAcu1 > do.log 2>&1 & # this effort was abandoned and removed ####################################################################### ## BLASTZ Medaka chroms vs Stickleback random contigs ## Experiment only ## (DONE - 2006-12-20 - Hiram) ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.gasAcu1.ChrUn.2006-12-20 cd /cluster/data/oryLat1/bed/blastz.gasAcu1.ChrUn.2006-12-20 cat << '_EOF_' > DEF # Medaka chroms vs Stickleback chroms random contigs # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 no chrUn sequence, only chroms # The largest is 40 million bases, there are 25 of them SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 # QUERY: Stickleback gasAcu1 chrUn only # chrUn in scaffolds for this alignment run # The largest contig is only 420,000 bases, there are 5,000 of them # and a total size of 63 million. SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift SEQ2_CHUNK=1000000 SEQ2_LIMIT=40 SEQ2_LAP=0 BASE=/cluster/data/oryLat1/bed/blastz.gasAcu1.ChrUn.2006-12-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -blastzOutRoot /cluster/bluearc/oryLat1GasAcu1Un \ -bigClusterHub=pk > do.log 2>&1 & ######################################################################### ## Reorder Fish organisms (DONE - 2006-12-22 - Hiram) hgsql -h genome-testdb hgcentraltest \ -e "update dbDb set orderKey = 475 where name = 'oryLat1';" ####################################################################### ## Swap of Stickleback chroms vs Medaka chroms and random contigs ## (DONE - 2006-12-22 - Hiram) ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.gasAcu1.swap cd /cluster/data/oryLat1/bed/blastz.gasAcu1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk > swap.log 2>&1 & # real 28m51.553s # user 0m0.137s # sys 0m0.096s ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.gasAcu1.swap time featureBits oryLat1 chainGasAcu1Link \ > fb.oryLat1.chainGasAcu1Link.txt 2>&1 # 189411142 bases of 700386597 (27.044%) in intersection ####################################################################### ## Stickleback gasAcu1 BLASTZ (DONE - 2007-01-11 - Hiram) ## The above swap doesn't contain the the Stickleback random contigs ## and it would be good to have the Medaka alignments back on Stickleback ## randoms. So, to get them, run this blastz of the Stickleback chroms ## and random contigs, then swap it back to Stickleback, no Db loads as ## we are going to extract the random chains and nets out of this business ## (WORKIN - 2007-01-11 - Hiram) ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.gasAcu1.2007-01-11 cd /cluster/data/oryLat1/bed/blastz.gasAcu1.2007-01-11 cat << '_EOF_' > DEF # Medaka chroms vs. Stickleback chroms and randoms in scaffolds # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 no chrUn sequence, only chroms # The largest is 40 million bases, there are 25 of them SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # QUERY: Stickleback gasAcu1 chroms and chrUn in contigs # The largest chrom is 32M bases, the largest contig 418,670 bases # The smallest chrom chrM is 15,742 bases, smallest contig 60 bases # there are 5,115 chroms and contig pieces # total size 1,089,690,673 bases # 47107538 N's 1042583135 real 834274605 upper 208308530 lower SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift SEQ2_CHUNK=1000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/cluster/data/oryLat1/bed/blastz.gasAcu1.2007-01-11 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF -smallClusterHub=pk \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -blastzOutRoot /cluster/bluearc/oryLat1GasAcu1Un \ -stop=net -bigClusterHub=pk > do.log 2>&1 & # real 127m37.446s # user 0m0.148s # sys 0m0.111s ## And the swap mkdir /cluster/data/gasAcu1/bed/blastz.orylat1.swap cd /cluster/data/gasAcu1/bed/blastz.orylat1.swap time doBlastzChainNet.pl -verbose=2 -smallClusterHub=pk \ /cluster/data/oryLat1/bed/blastz.gasAcu1.2007-01-11/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -stop=net -bigClusterHub=pk > swap.log 2>&1 & ####################################################################### ## BLASTZ FR1 fr1 chrUn in contigs, Medaka chroms only, no randoms ## (DONE - 2006-12-22 - Hiram) ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.fr1.2006-12-22 cd /cluster/data/oryLat1/bed/blastz.fr1.2006-12-22 cat << '_EOF_' > DEF # Medaka vs. Fugu # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # no chrUn in this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.noUn.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 # QUERY: Fugu fr1 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates # 20,397 scaffolds, largest 1.1 million SEQ2_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft SEQ2_CHUNK=2000000 SEQ2_LIMIT=10 SEQ2_LAP=0 BASE=/cluster/data/oryLat1/bed/blastz.fr1.2006-12-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/oryLat1Fr1 > do.log 2>&1 & # this effort was abandoned, the 2006-12-13 run was used instead ######################################################################### # ENSEMBL GENES (DONE - 2007-01-05 - Hiram) # Good luck with the biomart interface. It seems to be different each time # it is used. mkdir /cluster/data/oryLat1/bed/ensembl cd /cluster/data/oryLat1/bed/ensembl # Get the ensembl gene data from # http://www.biomart.org/biomart/martview/ # # Follow this sequence through the pages: # The default dataset will be Human. In the right side # frame, if you scroll it up and down, you will come to a pull-down # dataset menu where you can select the organism. There appear to be # two pull-down menus in this frame, one for: # Database: ENSEMBL 42 GENE (SANGER) # and you can select the second: # Dataset: Oryzias latipes genes (MEDAKA1) # After selecting the Dataset, in the left frame, click on the # "Attributes" (Features) label, now the right frame changes to radio # buttons, Features, Sequences, Structures # Click the "Structures" button, the three optional buttons can be # expanded to select elements of these: # REGION: # GENE: # EXON: # REGION: has Chromosome checked # GENE: has Ensembl Gene ID and Biotype selected # EXON: has no selections # In the GENE: menu: # Unselect Biotype # and Select # Ensembl Gene ID # Ensembl Transcript ID # External Gene ID # Then, in the black menu bar above these frames, click rhe "Results" # it shows the first ten rows. For this organism, there appear to be no # External Gene ID in the HTML view. Change the "Display maximum" # "rows as" pull-down # to GFF, and use the "Export all results to" pull-down to # Compressed file (.gz), press the "Go" button to download. # Save this file as oryLat1.ensembl42.gff.gz # The regular chroms came with just their numbers, but the chrUn # scaffolds arrived as scaffolds. Will need to lift to chrUn. # Add "chr" to front of each line in the gene data gtf file to make # it compatible with our software, and liftUp to get scaffolds onto chrUn # The scaffolds and ultracontigs mentioned in this are not the scaffolds # we have in our lift file for chrUn ... can't use them. zcat oryLat1.ensembl42.gff.gz | egrep -v "scaffold|ultracontig" \ | sed -e "s/^\([0-9][0-9]*\)/chr\1/" | gzip -c > ensembl42.gff.gz # Verify names OK: zcat ensembl42.gff.gz | awk '{print $1}' | sort | uniq -c # 22938 chr1 # 15887 chr10 # 18645 chr11 # 20162 chr12 # 21474 chr13 # 21302 chr14 # 20148 chr15 # 22978 chr16 # 26164 chr17 # 13671 chr18 # 17109 chr19 # 10988 chr2 # 15705 chr20 # 21361 chr21 # 21030 chr22 # 12984 chr23 # 19009 chr24 # 21767 chr3 # 26204 chr4 # 24335 chr5 # 24300 chr6 # 24329 chr7 # 23323 chr8 # 27795 chr9 cd /cluster/data/oryLat1/bed/ensembl ldHgGene -gtf -genePredExt oryLat1 ensGene ensembl42.gff.gz # Read 23087 transcripts in 493608 lines in 1 files # 23087 groups 24 seqs 1 sources 4 feature types # 23087 gene predictions # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # for the Attributes, choose the "Features" box, and then In "GENE:" # select Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Results, choose txt output and a gzipped file, save this as # ensGtp42.txt.gz # Strip the first lines which is merely column heading labels zcat ensGtp42.txt.gz | headRest 1 stdin | sed -e "s/ /\t/g" > ensGtp.txt hgLoadSqlTab oryLat1 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt hgsql -N -e "select count(*) from ensGtp;" oryLat1 # +-------+ # | 26148 | # +-------+ wc -l ensGtp.txt # 26148 ensGtp.txt # Load Ensembl peptides: # Get them from ensembl as above in the gene section except for # for the Attributes, choose the "Sequences" box, and then # SEQUENCES: Peptide and # Header Information "Ensembl Transcript ID" # Results output as FASTA # Save file as ensPep.fa.gz hgPepPred oryLat1 generic ensPep ensPep.fa ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-18 - Hiram) # Use -repMatch=200 (based on size -- for human we use 1024, and # medaka size is ~20% of human judging by gapless oryLat1 vs. hg18 # genome sizes from featureBits. ssh kolossus blat /cluster/data/oryLat1/oryLat1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/oryLat1/11.ooc -repMatch=200 # Wrote 46600 overused 11-mers to /cluster/bluearc/oryLat1/11.ooc ######################################################################### # GENBANK AUTO UPDATE (DONE - 2007-01-18 - Hiram) # Make a liftAll.lft that specifies 5M chunks for genbank: ssh kkstore04 cd /cluster/data/oryLat1 simplePartition.pl oryLat1.2bit 5000000 /tmp/oryLat1P cat /tmp/oryLat1P/*/*.lft > jkStuff/liftAll.lft rm -r /tmp/oryLat1P # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add oryLat1 just after gasAcu1 # oryLat1 (G. Aculeatus) oryLat1.serverGenome = /cluster/data/oryLat1/oryLat1.2bit oryLat1.clusterGenome = /iscratch/i/oryLat1/oryLat1.2bit oryLat1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} oryLat1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} oryLat1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} oryLat1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} oryLat1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} oryLat1.ooc = /cluster/bluearc/oryLat1/11.ooc oryLat1.lift = /cluster/data/oryLat1/jkStuff/liftAll.lft oryLat1.genbank.mrna.xeno.load = yes oryLat1.downloadDir = oryLat1 cvs ci -m "Added oryLat1." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. cvs ci -m "Added Oryzias latipes (medaka)." src/lib/gbGenome.c make install-server ssh kkstore02 cd /cluster/data/genbank nice -n +19 bin/gbAlignStep -initial oryLat1 & # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad oryLat1 # real 17m10.400s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add oryLat1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added oryLat1." etc/align.dbs etc/hgwdev.dbs make etc-update # add Xeno refseqs (2007-04-12 markd) # add this to genbank.com: oryLat1.refseq.mrna.xeno.load = yes ssk kkstore02 cd /cluster/data/genbank (./bin/gbAlignStep -initial -srcDb=refseq oryLat1)|&mail markd& (./bin/gbDbLoadStep -srcDb=refseq oryLat1)|&mail markd& ######################################################################### ## 5-Way Multiz (DONE - 2007-01-21 - Hiram) ## re-done with fr2 in place of fr1 ssh hgwdev mkdir /cluster/data/oryLat1/bed/multiz5way cd /cluster/data/oryLat1/bed/multiz5way cp /cluster/data/fr2/bed/multiz5way/5way.nh . # this file looks like: cat << '_EOF_' > 5way.nh (((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2, (stickleback_gasAcu1:0.2,medaka_oryLat1:0.2):0.2):0.292961, zebrafish_danRer4:0.782561); '_EOF_' # << happy emacs # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/oryLat1_5way.gif /cluster/bin/phast/all_dists 5way.nh > 5way.distances.txt # Use this output to create the table below grep -y oryLat1 5way.distances.txt | sort -k3,3n # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainOryLat1Link chain linearGap # distance on oryLat1 on other minScore # 1 0.4000 - stickleback gasAcu1 (% 27.044) (% 38.308) 2000 loose # 2 0.7994 - tetraodon tetNig1 (% 22.900) (% 38.063) 2000 loose # 3 0.8399 - fugu fr2 (% 25.344) (% 36.611) 2000 loose # 4 1.4755 - zebrafish danRer4 (% 22.275) (% 12.899) 2000 loose cd /cluster/data/oryLat1/bed/multiz5way # bash shell syntax here ... export H=/cluster/data/oryLat1/bed mkdir mafLinks for G in gasAcu1 tetNig1 fr2 danRer4 do mkdir mafLinks/$G if [ ! -d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" else ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G fi done # Copy MAFs to some appropriate NFS server for kluster run ssh kkstore04 mkdir /san/sanvol1/scratch/oryLat1/multiz5way cd /san/sanvol1/scratch/oryLat1/multiz5way time nice -n +19 rsync -a --copy-links --progress \ /cluster/data/oryLat1/bed/multiz5way/mafLinks/ . # 15 seconds to copy 522 Mb mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn # the autoMultiz cluster run ssh pk cd /cluster/data/oryLat1/bed/multiz5way/ # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 5way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir run maf cd run # NOTE: you need to set the db and multiz dirname properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = oryLat1 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/$db/multiz5way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz5way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP ./autoMultiz $(root1) {check out line+ /cluster/data/oryLat1/bed/multiz5way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs awk '{print $1}' /cluster/data/oryLat1/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # 23 jobs para try ... check ... push ... etc ... # Completed: 26 of 26 jobs # CPU time in finished jobs: 7534s 125.57m 2.09h 0.09d 0.000 y # IO & Wait Time: 161s 2.68m 0.04h 0.00d 0.000 y # Average job time: 296s 4.93m 0.08h 0.00d # Longest finished job: 1148s 19.13m 0.32h 0.01d # Submission to last job: 1158s 19.30m 0.32h 0.01d # combine results into a single file for loading and gbdb reference ssh kkstore04 cd /cluster/data/oryLat1/bed/multiz5way time nice -n +19 catDir maf > multiz5way.maf # real 0m5.628s # makes a 1.4 Gb file: # -rw-rw-r-- 1 1413115862 Feb 5 15:21 multiz5way.maf # Create per-chrom individual maf files for downloads # NOT NECESSARY HERE - DONE LATER WITH THE ANNOTATED MAFS ssh kkstore04 cd /cluster/data/oryLat1/bed/multiz5way mkdir mafDownloads time for M in maf/chr*.maf do B=`basename $M` cp -p ${M} mafDownloads/${B} gzip mafDownloads/${B} echo ${B} done done # real 5m9.273 # deliver to downloads *!* NOT NECESSARY HERE - DONE LATER WITH # THE ANNOTATED MAFS ssh hgwdev ln -s /cluster/data/oryLat1/bed/multiz5way/mafDownloads \ /usr/local/apache/htdocs/goldenPath/oryLat1/multiz5way # Load into database ssh hgwdev cd /cluster/data/oryLat1/bed/multiz5way mkdir /gbdb/oryLat1/multiz5way ln -s /cluster/data/oryLat1/bed/multiz5way/multiz5way.maf \ /gbdb/oryLat1/multiz5way time nice -n +19 hgLoadMaf oryLat1 multiz5way # Loaded 1559270 mafs in 1 files from /gbdb/oryLat1/multiz5way # real 0m38.327s time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 oryLat1 multiz5waySummary multiz5way.maf # Created 724676 summary blocks from 3597261 components # and 1559270 mafs from multiz5way.maf # real 0m47.574s # Create tree image for details page # You can get a better image from the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # cp 5way.nh treeImage.nh # Edit treeImage.nh to make the names as desired # Submit this treeImage.nh to the phyloGif program, or: # /cluster/bin/phast/draw_tree -b -s treeImage.nh > treeImage.ps ############################################################################ # ANNOTATE MULTIZ5WAY MAF AND LOAD TABLES (DONE - 2007-01-21 - Hiram) ## Reloaded 2007-04-13 to get the annotation correct ssh kolossus mkdir /cluster/data/oryLat1/bed/multiz5way/anno cd /cluster/data/oryLat1/bed/multiz5way/anno mkdir maf run cd run rm -f sizes nBeds ln -s /cluster/data/oryLat1/chrom.sizes oryLat1.len twoBitInfo -nBed /cluster/data/oryLat1/oryLat1.sdTrf.{2bit,N.bed} ln -s /cluster/data/oryLat1/oryLat1.sdTrf.N.bed oryLat1.bed for DB in `sed -e "s/ oryLat1//" /cluster/data/oryLat1/bed/multiz5way/species.lst` do ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds echo ${DB}.len >> sizes echo $DB done echo '#!/bin/csh -ef' > jobs.csh echo date >> jobs.csh # do smaller jobs first so you can see some progress immediately: for F in `ls -1rS ../../maf/*.maf` do echo mafAddIRows -nBeds=nBeds $F \ /cluster/data/oryLat1/oryLat1.sdTrf.2bit ../maf/`basename $F` >> jobs.csh echo "echo $F" >> jobs.csh done echo date >> jobs.csh chmod +x jobs.csh time nice -n +19 ./jobs.csh > jobs.log 2>&1 & # to watch progress; tail -f jobs.log # real 36m19.199s # Load anno/maf ssh hgwdev cd /cluster/data/oryLat1/bed/multiz5way/anno/maf mkdir -p /gbdb/oryLat1/multiz5way/anno/maf ln -s /cluster/data/oryLat1/bed/multiz5way/anno/maf/*.maf \ /gbdb/oryLat1/multiz5way/anno/maf time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/oryLat1/multiz5way/anno/maf oryLat1 multiz5way # Loaded 1918235 mafs in 26 files from /gbdb/oryLat1/multiz5way/anno/maf # real 1m1.468s # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /cluster/data/oryLat1/bed/multiz5way/anno/maf time cat *.maf | \ nice -n +19 hgLoadMafSummary oryLat1 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz5waySummary stdin # Created 413488 summary blocks from 3682815 components and 1918228 mafs # from stdin # real 1m20.644s ssh hgwdev cd /cluster/data/oryLat1/bed/multiz5way/anno/maf time nice -n +19 hgLoadSqlTab oryLat1 multiz5waySummary \ ~/kent/src/hg/lib/mafSummary.sql multiz5waySummary.tab # real 0m6.295s ####################################################################### # MULTIZ5WAY MAF FRAMES (DONE - 2007-01-21 - Hiram) ## re-done with fr2 in place of fr1 - 2007-02-05 - Hiram ssh hgwdev mkdir /cluster/data/oryLat1/bed/multiz5way/frames cd /cluster/data/oryLat1/bed/multiz5way/frames mkdir genes # The following is adapted from the gasAcu1 8-way sequence #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using refGene for danRer4 for qDB in danRer4 do geneTbl=refGene echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz rm -f $tmpExt done # using genscan for tetNig1 # using ensGene for oryLat1, gasAcu1, and fr2 # genePreds; (must keep only the first 10 columns for knownGene) for qDB in gasAcu1 oryLat1 fr2 tetNig1 do if [ $qDB = "gasAcu1" -o $qDB = "oryLat1" -o $qDB = "fr2" ]; then geneTbl=ensGene elif [ $qDB = "tetNig1" ]; then geneTbl=genscan fi echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz rm -f $tmpExt done ### ssh kkstore04 cd /cluster/data/oryLat1/bed/multiz5way/frames time cat ../maf/*.maf | nice -n +19 genePredToMafFrames oryLat1 \ stdin stdout gasAcu1 genes/gasAcu1.gp.gz oryLat1 \ genes/oryLat1.gp.gz fr2 genes/fr2.gp.gz tetNig1 \ genes/tetNig1.gp.gz danRer4 genes/danRer4.gp.gz \ | gzip > multiz5way.mafFrames.gz # real 0m40.989s ssh hgwdev cd /cluster/data/oryLat1/bed/multiz5way/frames time nice -n +19 hgLoadMafFrames oryLat1 multiz5wayFrames \ multiz5way.mafFrames.gz # real 0m22.466s ######################################################################### # MULTIZ5WAY DOWNLOADABLES (DONE - 2007-02-06 - Hiram) ssh hgwdev mkdir /cluster/data/oryLat1/bed/multiz5way/mafDownloads cd /cluster/data/oryLat1/bed/multiz5way # upstream mafs # rebuilt 2007-12-21 to fix difficulty in mafFrags when species.lst # did not have fr2 as the first one # There isn't any refGene table, trying ensGene instead for S in 1000 2000 5000 do echo "making upstream${S}.maf" nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 oryLat1 \ ensGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags oryLat1 multiz5way \ stdin stdout -orgs=species.lst \ | gzip -c > mafDownloads/ensGene.upstream${S}.maf.gz echo "done ensGene.upstream${S}.maf.gz" done # re-done 2007-04-13 after proper annotation ssh kkstore04 cd /cluster/data/oryLat1/bed/multiz5way time for M in anno/maf/chr*.maf do B=`basename $M` nice -n +19 gzip -c ${M} > mafDownloads/${B}.gz echo ${B}.gz done done # real 6m13.232s cd mafDownloads nice -n +19 md5sum *.maf.gz > md5sum.txt # Make a README.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/oryLat1/multiz5way cd /usr/local/apache/htdocs/goldenPath/oryLat1/multiz5way ln -s /cluster/data/oryLat1/bed/multiz5way/mafDownloads/{*.gz,*.txt} . ############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS # (DONE - 2007-01-23 - Hiram) # The following is adapted from the gasAcu1 8-way sequence # Estimate phastCons parameters ssh kkstore04 mkdir /cluster/data/oryLat1/bed/multiz5way/cons cd /cluster/data/oryLat1/bed/multiz5way/cons # The automatic build process does not leave masked fa sequences for the # chroms, so, create them here. twoBitToFa -seq=chr1 ../../../oryLat1.sdTrf.2bit chr1.fa # Create a starting-tree.mod based on chr1 (the largest one) time /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr1.maf \ --refseq ./chr1.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # 20 seconds time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "(((tetNig1,fr2),(gasAcu1,oryLat1)),danRer4)" \ --out-root starting-tree # Fitting tree model to s1.1-39973033.ss using REV ... # Writing model to starting-tree.mod ... # real 0m8.798s rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.430 # This 0.430 is used in the --gc argument below # Create SS files on san filesystem # Using a size of 10,000,000 to slow down the phastCons pk jobs ssh kkstore04 mkdir -p /san/sanvol1/scratch/oryLat1/cons/ss cd /san/sanvol1/scratch/oryLat1/cons/ss time for C in `awk '{print $1}' /cluster/data/oryLat1/chrom.sizes` do if [ -s /cluster/data/oryLat1/bed/multiz5way/maf/${C}.maf ]; then mkdir -p ${C} mkdir -p /cluster/data/oryLat1/bed/multiz5way/cons/${C} echo msa_split $C if [ ! -s /cluster/data/oryLat1/bed/multiz5way/cons/${C}/${C}.fa ]; then twoBitToFa -seq=${C} /cluster/data/oryLat1/oryLat1.sdTrf.2bit \ /cluster/data/oryLat1/bed/multiz5way/cons/${C}/${C}.fa fi chrN=${C/chr/} chrN=${chrN/_random/} /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/oryLat1/bed/multiz5way/maf/${C}.maf \ --refseq /cluster/data/oryLat1/bed/multiz5way/cons/${C}/${C}.fa \ --in-format MAF --windows 10000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done & # real 7m49.718s # Create a random list of 50 1 mb regions (do not use the _randoms) cd /san/sanvol1/scratch/oryLat1/cons/ss ls -1l chr*/chr*.ss | grep -v Un | \ awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list # Set up parasol directory to calculate trees on these 50 regions ssh pk mkdir /san/sanvol1/scratch/oryLat1/cons/treeRun1 cd /san/sanvol1/scratch/oryLat1/cons/treeRun1 mkdir tree log # Tuning this loop should come back to here to recalculate # Create little script that calls phastCons with right arguments # --target-coverage of 0.20 is about right for mouse, will be # tuned exactly below cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/oryLat1/bed/multiz5way/cons/starting-tree.mod \ --gc 0.430 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length 10 --target-coverage 0.20 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP ./makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # Make cluster job and run it gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 50 of 50 jobs # CPU time in finished jobs: 8162s 136.03m 2.27h 0.09d 0.000 y # IO & Wait Time: 188s 3.14m 0.05h 0.00d 0.000 y # Average job time: 167s 2.78m 0.05h 0.00d # Longest finished job: 315s 5.25m 0.09h 0.00d # Submission to last job: 320s 5.33m 0.09h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh pk cd /san/sanvol1/scratch/oryLat1/cons/treeRun1 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ave.cons.mod > cons_summary.txt ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ave.noncons.mod > noncons_summary.txt cp -p ave.*.mod .. cd .. cp -p ave.*.mod /cluster/data/oryLat1/bed/multiz5way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument time /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.7834 \ 0.10 12 ave.{cons,noncons}.mod ## 0.20 10 ## ERROR: too many iterations, not converging; try without --NH. Transition parameters: gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259 Relative entropy: H=0.811315 bits/site Expected min. length: L_min=14.589283 sites Expected max. length: L_max=8.948974 sites Phylogenetic information threshold: PIT=L_min*H=11.836508 bits # SKIP to here passing by the tuning numbers ssh pk # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/oryLat1/cons/consRun1 cd /san/sanvol1/scratch/oryLat1/cons/consRun1 mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../ave.{cons,noncons}.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 10 --target-coverage 0.20 \ --quiet --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.{cons,noncons}.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << happy emacs chmod a+x doPhast # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << '_EOF_' #LOOP ./doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. # These jobs are very fast and very I/O intensive, even on the san # they will hang it up as they work at full tilt. # Completed: 98 of 98 jobs # CPU time in finished jobs: 2506s 41.76m 0.70h 0.03d 0.000 y # IO & Wait Time: 1062s 17.70m 0.30h 0.01d 0.000 y # Average job time: 36s 0.61m 0.01h 0.00d # Longest finished job: 50s 0.83m 0.01h 0.00d # Submission to last job: 96s 1.60m 0.03h 0.00d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/oryLat1/cons/consRun1 # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less # if that looks right, then let it run: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved5Way.bed # ~ 16 seconds cp -p mostConserved5Way.bed /cluster/data/oryLat1/bed/multiz5way # Figure out how much is actually covered by the bed file as so: # Get the non-n genome size from faSize on all chroms: ssh kkstore04 cd /cluster/data/oryLat1/bed/multiz5way/cons faSize chr*/*.fa # 843500808 bases (143114211 N's 700386597 real 468614281 # upper 231772316 lower) in 26 sequences in 26 files cd /san/sanvol1/scratch/oryLat1/cons/consRun1 # The 700386597 comes from the non-n genome as counted above. awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/700386597\n",100.0*sum/700386597,sum}' \ mostConserved5Way.bed # % 6.84 = 100.0*47871534/700386597--exp-len 10 --tar-cov 0.20 # Aiming for %70 coverage in # the following featureBits measurement on CDS: # Beware of negative scores when too high. The logToBedScore # will output an error on any negative scores. HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits oryLat1 \ -enrichment ensGene:cds mostConserved5Way.bed # --expected-length 10 --target-coverage 0.20 # ensGene:cds 4.125%, mostConserved5Way.bed 6.835%, both 2.309%, cover # 55.97%, enrich 8.19x # Load most conserved track into database ssh hgwdev cd /cluster/data/oryLat1/bed/multiz5way # ended up using the set: --expected-length 10 --target-coverage 0.20 time nice -n +19 hgLoadBed -strict oryLat1 phastConsElements5way \ mostConserved5Way.bed # Loaded 825134 elements of size 5 # real 0m18.391s # should measure the same as above time nice -n +19 \ featureBits oryLat1 -enrichment ensGene:cds phastConsElements5way # At: --expected-length 10 --target-coverage 0.20 # ensGene:cds 4.125%, phastConsElements5way 6.835%, both 2.309%, cover # 55.97%, enrich 8.19x # Create merged posterier probability file and wiggle track data files ssh pk cd /san/sanvol1/scratch/oryLat1/cons/consRun1 # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode # This was verified above to be correct time nice -n +19 find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode -noOverlap stdin \ phastCons5.wig phastCons5.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 2m55.873s # -rw-rw-r-- 1 292833977 Feb 5 16:31 phastCons5.wib # -rw-rw-r-- 1 48444448 Feb 5 16:31 phastCons5.wig time nice -n +19 cp -p phastCons5.wi? /cluster/data/oryLat1/bed/multiz5way/ # real 1m21.329s # prepare compressed copy of ascii data values for downloads ssh pk cd /san/sanvol1/scratch/oryLat1/cons/consRun1 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons5Scores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons5Scores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 5m25.145s # copy them for downloads ssh kkstore04 mkdir /cluster/data/oryLat1/bed/multiz5way/phastCons5Scores cd /cluster/data/oryLat1/bed/multiz5way/phastCons5Scores cp -p /san/sanvol1/scratch/oryLat1/cons/consRun1/phastCons5Scores/* . # make a README.txt file here, and an md5sum ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/oryLat1/phastCons5Scores cd /usr/local/apache/htdocs/goldenPath/oryLat1/phastCons5Scores ln -s /cluster/data/oryLat1/bed/multiz5way/phastCons5Scores/* . # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/oryLat1/bed/multiz5way ln -s `pwd`/phastCons5.wib /gbdb/oryLat1/wib/phastCons5.wib # ended up using the set: --expected-length 10 --target-coverage 0.20 time nice -n +19 hgLoadWiggle oryLat1 phastCons5 phastCons5.wig # real 0m9.256s # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/oryLat1/bed/multiz5way time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=oryLat1 phastCons5 > histogram.data 2>&1 # real 0m30.744 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Medaka oryLat1 Histogram phastCons5 track" set xlabel " phastCons5 score - --expected-length 10 --target-coverage 0.20" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # The mostConserved can also be characterized by a histogram awk '{print $3-$2}' mostConserved5Way.bed > mostCons.txt textHistogram -verbose=2 -autoScale=1000 -pValues mostCons.txt \ > mostCons.histo.txt cat << '_EOF_' | gnuplot > mostCons.png set terminal png small color \ xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Medaka oryLat1 histogram: lengths of mostConserved track elements" set xlabel " mostConserved element length - --expected-length 10 --target-coverage 0.20" set ylabel " # of elements at this length" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set xrange [0:200] set y2tics set boxwidth 2 set style fill solid plot "mostCons.histo.txt" using 2:3 title " # of elements" with boxes, \ "mostCons.histo.txt" using 2:6 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-01-28) ssh kkstore04 # bash if not using bash shell already mkdir /cluster/data/oryLat1/blastDb cd /cluster/data/oryLat1 cat noUn/chr*fa > temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa faSplit sequence Un/chrUnScaffolds.fa 150 blastDb/y cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /san/sanvol1/scratch/oryLat1/blastDb cd /cluster/data/oryLat1/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/oryLat1/blastDb done mkdir -p /cluster/data/oryLat1/bed/tblastn.hg18KG cd /cluster/data/oryLat1/bed/tblastn.hg18KG echo /san/sanvol1/scratch/oryLat1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 887 query.lst # we want around 150000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(150000/887) = 217.178993 mkdir -p /cluster/bluearc/oryLat1/bed/tblastn.hg18KG/kgfa split -l 217 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/oryLat1/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/oryLat1/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/oryLat1/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/oryLat1/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/oryLat1/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/oryLat1/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/oryLat1/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # times munged due to recovered batches ssh kkstore04 cd /cluster/data/oryLat1/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/oryLat1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/oryLat1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/oryLat1/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 170 of 170 jobs # CPU time in finished jobs: 3150s 52.51m 0.88h 0.04d 0.000 y # IO & Wait Time: 42932s 715.53m 11.93h 0.50d 0.001 y # Average job time: 271s 4.52m 0.08h 0.00d # Longest finished job: 453s 7.55m 0.13h 0.01d # Submission to last job: 1632s 27.20m 0.45h 0.02d ssh kkstore04 cd /cluster/data/oryLat1/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/oryLat1/bed/tblastn.hg18KG/unliftBlastHg18KG.psl cd .. pslCheck unliftBlastHg18KG.psl liftUp blastHg18KG.psl ../../Un/chrUn.lift carry unliftBlastHg18KG.psl # load table ssh hgwdev cd /cluster/data/oryLat1/bed/tblastn.hg18KG hgLoadPsl oryLat1 blastHg18KG.psl # check coverage featureBits oryLat1 blastHg18KG # 18960975 bases of 700386597 (2.707%) in intersection featureBits oryLat1 ensGene:cds blastHg18KG -enrichment # ensGene:cds 4.125%, blastHg18KG 2.707%, both 2.279%, cover 55.26%, enrich 20.41x ssh kkstore04 rm -rf /cluster/data/oryLat1/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/oryLat1/bed/tblastn.hg18KG/blastOut #end tblastn ########################################################################## ######################################################################### # BLASTZ/CHAIN/NET FR2 (DONE - 2007-01-24 - Hiram) ## no chrUn in oryLat1, and align to fr2 scaffolds, ## results lifted to fr2 chrUn coordinates ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.fr2.2007-01-24 cd /cluster/data/oryLat1/bed/blastz.fr2.2007-01-24 cat << '_EOF_' > DEF # Medaka vs. Fugu # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=30 SEQ1_LAP=10000 # QUERY: Fugu fr2 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/oryLat1/bed/blastz.fr2.2007-01-24 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/oryLat1Fr2 > do.log 2>&1 & # real 601m51.650s mkdir /cluster/data/fr2/bed/blastz.oryLat1.swap cd /cluster/data/fr2/bed/blastz.oryLat1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.fr2.2007-01-24/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk > swap.log 2>&1 & ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.fr2.2007-01-24 time nice -n +19 featureBits oryLat1 chainFr2Link \ > fb.oryLat1.chainFr2Link.txt 2>&1 # 177508958 bases of 700386597 (25.344%) in intersection cd /cluster/data/fr2/bed/blastz.oryLat1.swap] time nice -n +19 featureBits fr2 chainOryLat1Link \ > fb.fr2.chainOryLat1Link.txt 2>&1 # 143996507 bases of 393312790 (36.611%) in intersection ############################################################################ ## Set a more interesting default position, location of IGHMBP2 gene ## DONE - 2007-02-08 - Hiram ssh hgwdev hgsql -e \ 'update dbDb set defaultPos="chr6:11,027,149-11,038,707" where name = "oryLat1";' \ -h genome-testdb hgcentraltest ############################################################################ # BLATSERVERS ENTRY (DONE - 2007-02-08 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("oryLat1", "blat3", "17788", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("oryLat1", "blat3", "17789", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## Medaka Photograph (DONE - 2007-02-12 12:15 - Hiram) # Permission to use photo received in email from moris@cb.k.u-tokyo.ac.jp # Shinichi Morishita 2007-02-09 18:25 - Morishita Lab - Univ of Tokyo ssh hgwdev mkdir /cluster/data/oryLat1/photograph cd /cluster/data/oryLat1/photograph wget --timestamping "http://medaka.utgenome.org/picture/Medaka_hd_rR.jpg" \ -O oryLat1.Kiyoshi.Naruse.jpg convert -quality 100 -crop "1464x600+0+300" -sharpen 0 -normalize \ oryLat1.Kiyoshi.Naruse.jpg oryLat1.crop.jpg convert -quality 80 -geometry 300x200 oryLat1.crop.jpg \ Oryzias_latipes.jpg # add this image to the browser doc source tree in images and copy it # to /usr/local/apache/htdocs/images on hgwdev ############################################################################ ## DOWNLOADS - (DONE - 2007-02-12 - 2007-02-16 - Hiram) ssh hgwdev cd /cluster/data/oryLat1 ~/kent/src/hg/utils/automation/makeDownloads.pl oryLat1 \ > makeDownloads.out 2>&1 # This needs to be checked to see if it did the correct procedure # since there isn't any valid Repeat masker output files # Create WindowMasker separate files by chrom, for downloads ssh kkstore04 cd /cluster/data/oryLat1/bed/WindowMasker.2006-11-29 zcat windowmasker.sdust.bed.gz > chr.oryLat1.WMSdust.bed splitFileByColumn chr.oryLat1.WMSdust.bed chrWM # Creating chrWM/chr1.oryLat1.WMSdust.bed # Creating chrWM/chr2.oryLat1.WMSdust.bed # ... etc ... cd chrWM tar cvzf ../chromWMSdust.bed.tar.gz *.bed # Verify this process didn't destroy anything: cat chrWM/*.bed | awk '{sum += $3-$2}END{printf "total size: %d\n",sum}' # total size: 374591526 zcat windowmasker.sdust.bed.gz \ | awk '{sum += $3-$2}END{printf "total size: %d\n",sum}' # total size: 374591526 # deliver to bigZips cd /cluster/data/oryLat1/goldenPath/bigZips ln -s \ /cluster/data/oryLat1/bed/WindowMasker.2006-11-29/chromWMSdust.bed.tar.gz . # get GenBank native mRNAs ssh hgwdev cd /cluster/data/genbank ./bin/x86_64/gbGetSeqs -db=oryLat1 -native \ GenBank mrna /cluster/data/oryLat1/goldenPath/bigZips/mrna.fa # get GenBank xeno mRNAs ./bin/x86_64/gbGetSeqs -db=oryLat1 -xeno \ GenBank mrna /cluster/data/oryLat1/goldenPath/bigZips/xenoMrna.fa cd /cluster/data/oryLat1/goldenPath/bigZips ssh kkstore04 gzip mrna.fa xenoMrna.fa md5sum *.gz > md5sum.txt # Edit the README.txt file to be correct ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/oryLat1/bigZips ln -s /cluster/data/oryLat1/goldenPath/bigZips/* . ############################################################################ ## swap human alignments done properly with oryLat1 contigs ## (DONE - 2005-02-25 - Hiram) ## previous measurement on human ssh hgwdev cd /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24 nice -n +19 featureBits hg18 chainOryLat1Link \ > fb.hg18.chainOryLat1Link.txt 2>&1 & # 57393910 bases of 2881515245 (1.992%) in intersection ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.hg18.swap cd /cluster/data/oryLat1/bed/blastz.hg18.swap time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \ /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 -swap > swap.log 2>&1 & ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.hg18.swap nice -n +19 featureBits oryLat1 chainHg18Link \ > fb.oryLat1.chainHg18Link.txt 2>&1 & # 48002423 bases of 700386597 (6.854%) in intersection ########################################################################## ## RepeatMasker run to cover all bases (DONE - 2007-03-07 - Hiram) ssh kkstore02 mkdir /cluster/data/oryLat1/bed/RepeatMasker cd /cluster/data/oryLat1/bed/RepeatMasker time nice -n +19 doRepeatMasker.pl -verbose=2 -bigClusterHub=kk \ -buildDir=/cluster/data/oryLat1/bed/RepeatMasker oryLat1 > do.log 2>&1 & ######################################################################### ## Remake the gold tables to be whole scaffolds only (DONE - 2007-03-21 - Hiram) cd /cluster/data/oryLat1/rawData mkdir gold_tables cat << '_EOF_' > mkGoldTab.pl #!/usr/bin/env perl use warnings; use strict; open (FH,") { if ($line =~ m/scaffold[0-9]+_contig/) { chomp $line; my ($chr, $start, $end, $seqId, $type, $scafContig, $ctgStart, $ctgEnd, $strand) = split('\s+',$line); --$start; --$ctgStart; my $ctgLen = $ctgEnd - $ctgStart; my $chrLen = $end - $start; die "lengths: $ctgLen != $chrLen" if ($ctgLen != $chrLen); $scafContig =~ s/_contig[0-9]+//; if (length($scafName) > 0) { if ($scafName ne $scafContig) { $scafLen = $chrEnd - $chrStart; printf "%s\t%d\t%d\t%d\t%s\t%s\t0\t%d\t+\n", $chrName, $chrStart, $chrEnd, $ix++, $scafType, $scafName, $scafLen; $scafName = $scafContig; $chrStart = $start; } } else { $scafName = $scafContig; $chrStart = $start; } $chrEnd = $end; $chrName = $chr; $scafType = $type; } } # and the last one printf "%s\t%d\t%d\t%d\t%s\t%s\t0\t%d\t+\n", $chrName, $chrStart, $chrEnd, $ix++, $scafType, $scafName, $scafLen; close(FH); '_EOF_' # << happy emacs chmod +x ./mkGoldTab.pl ./mkGoldTab.pl | splitFileByColumn -ending=_gold.tab -tab stdin gold_tables cd gold_tables sed -e "s/agpFrag/chrZX_gold/" $HOME/kent/src/hg/lib/agpFrag.sql \ > chr_gold.sql # edit that .sql file to add the bin column # bin smallint(6) NOT NULL default '0', ls *._gold.tab | sed -e "s/._gold.tab//" | while read C do echo ${C}_gold sed -e "s/chrZX/${C}/" chr_gold.sql > t.sql exit 0 done ls *._gold.tab | sed -e "s/._gold.tab//" | while read C do echo ${C}_gold sed -e "s/chrZX/${C}/" chr_gold.sql > t.sql hgLoadBed -sqlTable=t.sql oryLat1 ${C}_gold ${C}._gold.tab rm -f t.sql done ######################################################################### ## Creating a ctgPos2 table (DONE - 2007-03-21 - Hiram) cd /cluster/data/oryLat1/rawData cat << '_EOF_' > mkCtgPos2.pl #!/usr/bin/env perl use warnings; use strict; open (FH,") { if ($line =~ m/scaffold[0-9]+_contig/) { chomp $line; my ($chr, $start, $end, $seqId, $type, $scafContig, $ctgStart, $ctgEnd, $strand) = split('\s+',$line); --$start; --$ctgStart; my $ctgLen = $ctgEnd - $ctgStart; my $chrLen = $end - $start; die "lengths: $ctgLen != $chrLen" if ($ctgLen != $chrLen); $scafContig =~ s/scaffold[0-9]+_//; printf "%s\t%d\t%s\t%d\t%d\t%s\n", $scafContig, $ctgLen, $chr, $start, $end, $type; } } close(FH); '_EOF_' # << happy emacs chmod +x mkCtgPos2.pl ./mkCtgPos2.pl > ctgPos2.tab hgLoadSqlTab oryLat1 ctgPos2 ~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab ######################################################################### ## Creating quality tracks for the chromosomes (DONE - 2007-03-23 - Hiram) ## In email conversation with Budrul Ahsan ahsan@cb.k.u-tokyo.ac.jp ## obtain quality scores from: (existed only temporarily here) ssh kkstore04 mkdir /cluster/data/oryLat1/bed/qual cd /cluster/data/oryLat1/bed/qual wget --timestamping 'http://dev.utgenome.org/~utgb/medaka/medakaqual.tar.gz' ## These quality scores have scaffolds and ultracontigs mixed ## together. Our chrUn only has scaffolds. Separate out just the ## chrom scores from the scaffolds and ultracontigs cat << '_EOF_' > onlyChroms.pl #!/usr/bin/env perl use strict; use warnings; open (FH,"chroms.fa.qual") or die "can not write to chroms.fa.qual"; my $nextOf = 1; while (my $line = ) { if ($line =~ m/^>/) { if ($line =~ m/^>scaf|^>ult/) { if ($nextOf) { close (OF); open (OF,">chrUn.fa.qual") or die "can not write to chrUn.fa.qual"; $nextOf = 0; } } else { $line =~ s/^>/>chr/; } } printf OF "%s", $line; } close (OF); close (FH); '_EOF_' # << happy emacs chmod +x ./onlyChroms.pl ./onlyChroms.pl # some of the numbers in this are incorrect, too large. Truncate them # to 90 as per advice from Budrul in email time nice -n +19 sed -e \ "s/ 1[0-9][0-9]/ 90/g; s/ 2[0-9][0-9]/ 90/g; s/^1[0-9][0-9]/90/; s/^2[0-9][0-9]/90/;" \ chroms.fa.qual | qaToQac stdin fixed.chroms.qac # real 2m19.194s time nice -n +19 qacToWig -fixed fixed.chroms.qac stdout \ | wigEncode stdin qual.wig qual.wib # Converted stdin, upper limit 99.00, lower limit 0.00 # real 4m19.515s ssh hgwdev cd /cluster/data/oryLat1/bed/qual time nice -n +19 hgLoadWiggle -tmpDir=/scratch/tmp \ -pathPrefix=/gbdb/oryLat1/wib oryLat1 quality qual.wig # real 0m22.997s ln -s /cluster/data/oryLat1/bed/qual/qual.wib /gbdb/oryLat1/wib ########################################################################## ## Obtain a tar.gz file from Budrul that was their handoff to Ensembl ## They only made this tar file available to me for a week, then they ## remove it. ssh kkstore04 mkdir /cluster/data/oryLat1/downloads cd /cluster/data/oryLat1/downloads wget --timestamping 'http://dev.utgenome.org/~utgb/medakaAGP/EMBL.tar.gz' tar xvzf EMBL.tar.gz # EMBL/200510.chr_random.agp.txt # EMBL/200510.scaffold.agp.txt # EMBL/200510.contig.fasta # EMBL/200510.ultracontig.agp.txt # EMBL/200510.chr.agp.txt # EMBL/200510.UCSC.fasta.qual ########################################################################### ## Fixup gold/gap table entries (DONE - 2007-04-11 - Hiram) ssh hgwdev mkdir /cluster/data/oryLat1/bed/gap cd /cluster/data/oryLat1/bed/gap hgsql -e "show tables;" oryLat1 | grep gap | while read T do hgsql -N -e "select * from $T;" oryLat1 > ${T}.tab echo $T done # change telomere to scaffold hgsql -e "show tables;" oryLat1 | grep _gap | while read T do hgsql -N -e 'update '$T' set type="scaffold" where type="telomere";' oryLat1 done # the gold and ctgPos2 tables on chrM are empty mkdir /cluster/data/oryLat1/bed/gold cd /cluster/data/oryLat1/bed/gold hgsql -e "show tables;" oryLat1 | grep gold | while read T do hgsql -N -e "select * from $T;" oryLat1 > ${T}.tab echo $T done hgsql -N -e "select * from chrM_gold;" hg18 > hg18.chrM_gold.tab # Edit that to make it read: # 585 chrM 0 16714 1 F NC_004387.1 0 16714 + sed -e "s/agpFrag/chrM_gold/" $HOME/kent/src/hg/lib/agpFrag.sql \ > chrM_gold.sql # edit that to add the bin column, then # bin smallint(6) NOT NULL default '0', hgLoadSqlTab oryLat1 chrM_gold chrM_gold.sql chrM_gold.tab hgsql -e 'INSERT INTO ctgPos2 (contig, size, chrom, chromStart, chromEnd, type) VALUES ("NC_004387.1", 16714, "chrM", 0, 16714, "F");' oryLat1 ########################################################################### ## Drop some entries from the ensGtp table since they are not all loaded ## to keep joiner check happy ## (DONE - 2007-04-11 - Hiram) ssh hgwdev hgsql -N -e "select name from ensGene;" oryLat1 \ | sort > name.ensGene.oryLat1.txt hgsql -N -e "select transcript from ensGtp;" oryLat1 \ | sort > transcript.ensGtp.oryLat1 wc -l name.ensGene.oryLat1.txt transcript.ensGtp.oryLat1 # 23087 name.ensGene.oryLat1.txt # 26148 transcript.ensGtp.oryLat1 comm -12 name.ensGene.oryLat1.txt transcript.ensGtp.oryLat1 | wc -l # 23087 comm -13 name.ensGene.oryLat1.txt transcript.ensGtp.oryLat1 | wc -l # 3061 comm -13 name.ensGene.oryLat1.txt transcript.ensGtp.oryLat1 \ > delete.transcript.ensGtp for T in `cat delete.transcript.ensGtp` do hgsql -e 'delete from ensGtp where transcript="'$T'";' oryLat1 echo $T done hgsql -e "select count(*) from ensGtp;" oryLat1 # 23087 hgsql -N -e "select name from ensPep;" oryLat1 | sort > name.ensPep.oryLat1 comm -13 name.ensGene.oryLat1.txt name.ensPep.oryLat1 \ > delete.name.ensPep for T in `cat delete.name.ensPep` do hgsql -e 'delete from ensPep where name="'$T'";' oryLat1 echo $T done hgsql -N -e "select name from ensPep;" oryLat1 | wc runJoiner.csh oryLat1 ensGene # found identifiers: # ensemblTranscriptId # Checking keys on database oryLat1 # oryLat1.ensGtp.transcript - hits 23087 of 23087 ok # oryLat1.ensPep.name - hits 23073 of 23073 ok # running -times flag ######################################################################### ## Reset date on request (DONE - 2007-05-01 - Hiram) hgsql hgcentraltest \ -e 'update dbDb set description = "Apr. 2006" where name = "oryLat1";' ########################################################################## # NSCAN track - (2007-06-05 markd) # uses danRer4 as informant, no pseudogene masking # cd /cluster/data/oryLat1/bed/nscan/ # obtainedf NSCAN predictions from michael brent's group # at WUSTL wget -nv http://mblab.wustl.edu/predictions/Oryzias_latipes/oryLat1.gtf wget -nv http://mblab.wustl.edu/predictions/Oryzias_latipes/oryLat1.prot.fa gzip -9 oryLat1.* chmod a-w *.gz # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. ldHgGene -bin -gtf -genePredExt oryLat1 nscanGene oryLat1.gtf.gz hgPepPred oryLat1 generic nscanPep oryLat1.prot.fa.gz rm -f *.tab # update trackDb; need a oryLat1-specific page to describe informants medaka/oryLat1/nscanGene.html medaka/oryLat1/trackDb.ra ########################################################################## # SWAP DANRER5 BLASTZ RESULT TO GET DANRER5 CHAINS, NETS, AXTNET, MAFNET # AND ALIGNMENT DOWNLOADS ON ORYLAT1 (DONE, 2007-09-11 - 2007-09-12, hartera) # CLEAN UP AND MAKE blastz.danRer5 LINK TO BLASTZ DIRECTORY # (DONE, 2007-09-17, hartera) ssh hgwdev mkdir /cluster/data/oryLat1/bed/blastz.swap.danRer5 cd /cluster/data/oryLat1/bed/blastz.swap.danRer5 # blastz parameters used to align oryLat1 as query to danRer5 as target: # BLASTZ_H=2500 # BLASTZ_M=50 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -swap \ /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10/DEF \ >& swap.log & # 0.070u 0.027s 3:30.52 0.0% 0+0k 0+0io 0pf+0w # netChains: looks like previous stage was not successful (can't find # [oryLat1.danRer5.]all.chain[.gz]). time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -swap -continue net \ /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10/DEF \ >& swap2.log & # 0.153u 0.109s 31:55.61 0.0% 0+0k 0+0io 0pf+0w cat \ /cluster/data/oryLat1/bed/blastz.danRer5.swap/fb.oryLat1.chainDanRer5Link.txt # 130699035 bases of 700386597 (18.661%) in intersection # look at coverage of ensGene CDS, there is no native RefSeqs # track for medaka. oryLat1 ensGene has 18477 genes and 23087 transcripts. featureBits oryLat1 ensGene:cds chainDanRer5Link -enrichment # ensGene:cds 4.125%, chainDanRer5Link 18.661%, both 3.582%, cover 86.85%, # enrich 4.65x featureBits oryLat1 ensGene:cds chainDanRer4Link -enrichment # ensGene:cds 4.125%, chainDanRer4Link 22.275%, both 3.552%, cover 86.12%, # enrich 3.87x featureBits oryLat1 ensGene:cds netDanRer5 -enrichment # ensGene:cds 4.125%, netDanRer5 72.605%, both 3.981%, cover 96.52%, enrich # 1.33x featureBits oryLat1 ensGene:cds netDanRer4 -enrichment # ensGene:cds 4.125%, netDanRer4 73.914%, both 3.948%, cover 95.72%, enrich # 1.30x # clean up a little (2007-09-17, hartera) ssh kkstore04 cd /cluster/data/oryLat1/bed mv ./blastz.swap.danRer5/swap.log ./blastz.danRer5.swap rm -r blastz.swap.danRer5 ln -s blastz.danRer5.swap blastz.danRer5 ######################################################################### # BLASTZ/CHAIN/NET Medaka oryLat1 (DONE - 2008-01-16 - 2008-01-30 - Hiram) # with contigs for Lamprey ssh kkstore04 screen # use screen to control this job mkdir /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15 cd /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15 cat << '_EOF_' > DEF # Medaka vs. Lamprey # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LIMIT=30 SEQ1_LAP=10000 # QUERY: Lamprey petMar1 SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit SEQ2_LEN=/cluster/data/petMar1/chrom.sizes SEQ2_CTGDIR=/cluster/bluearc/scratch/data/petMar1/petMar1.supers.2bit SEQ2_CTGLEN=/cluster/data/petMar1/super.sizes SEQ2_LIFT=/cluster/data/petMar1/jkStuff/petMar1.nonBridged.lft SEQ2_CHUNK=10000000 SEQ2_LIMIT=150 SEQ2_LAP=0 BASE=/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk > do.log 2>&1 & # real 218m14.787s # Completed: 69774 of 70854 jobs # Crashed: 1080 jobs # CPU time in finished jobs: 3950133s 65835.55m 1097.26h 45.72d 0.125 y # IO & Wait Time: 1218716s 20311.93m 338.53h 14.11d 0.039 y # Average job time: 74s 1.23m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1629s 27.15m 0.45h 0.02d # Submission to last job: 12951s 215.85m 3.60h 0.15d # had some parasol bookeeping problems, it actually finished # the kluster run but got confused. Checked manually with # para recover -> nothing in new job list, and check files in # psl result directory finding the correct number of files for the # the job count. Make the run.time file and continuing: time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=cat -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk > cat.log 2>&1 & # real 7m47.963s # something failed # had the specification to the lft file incorrect, failed during # chain run. Fix that, finish the run, now continuing: time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=chainMerge -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk > chainMerge.log 2>&1 & # had the same failure as the first time test of this alignment. # something is wrong with netChainSubset, needs to be debugged. # ignore the step to create the "over.chain" file, finish the net # step manually, then continuing: time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=load -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk > load.log 2>&1 & # real 3m53.686s cat fb.oryLat1.chainPetMar1Link.txt # 24990733 bases of 700386597 (3.568%) in intersection # And, for the swap: mkdir /cluster/data/petMar1/bed/blastz.oryLat1.swap cd /cluster/data/petMar1/bed/blastz.oryLat1.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk > swap.log 2>&1 & # real 1m3.572s # same problem in netChainSubset trying to make the "over.chain" file. # skip that, finish the chains manually: # real 14m35.340s # then continuing: time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16/DEF \ -continue=load -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk > load.log 2>&1 & cat fb.petMar1.chainOryLat1Link.txt # 23358156 bases of 831696438 (2.808%) in intersection ######################################################################### # BLASTZ/CHAIN/NET Medaka oryLat1 (DONE - 2008-01-29 - Hiram) # with contigs for Lamprey ssh kkstore04 screen # use screen to control this job mkdir /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-29 cd /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-29 cat << '_EOF_' > DEF # Medaka vs. Lamprey # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LIMIT=30 SEQ1_LAP=10000 # QUERY: Lamprey petMar1 SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit SEQ2_LEN=/cluster/data/petMar1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=150 SEQ2_LAP=0 BASE=/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-29 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=memk > do.log 2>&1 & # real 1117m45.898s cat fb.oryLat1.chainPetMar1Link.txt # 24991574 bases of 700386597 (3.568%) in intersection mkdir /cluster/data/petMar1/bed/blastz.oryLat1.swap cd /cluster/data/petMar1/bed/blastz.oryLat1.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-29/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=memk > swap.log 2>&1 & # real 38m58.711s cat fb.petMar1.chainOryLat1Link.txt # 23359179 bases of 831696438 (2.809%) in intersection ############################################################################ # oryLat1 - Medaka - Ensembl Genes (DONE - 2008-03-31 - 04-15 - hiram) ssh kkstore04 cd /cluster/data/oryLat1 cat << '_EOF_' > oryLat1.ensGene.ra # required db variable db oryLat1 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to make sure. nameTranslation "s/^\([0-9][0-9]*\)/chr\1/; s/^MT/chrM/" # ignore 2,687 genes that haven't lifted properly yet skipInvalid yes '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=49 oryLat1.ensGene.ra featureBits oryLat1 ensGene # 29680340 bases of 700386597 (4.238%) in intersection # version 49 update had a big change in the Medaka genes. # Many of the new genes are in the Ensembl Gene Scaffold # (actually here ultracontigs) coordinate space that will not # translate to our chrUn assembly. So, some fixups are # necessary to get joinerCheck to be a bit more happy. ssh hgwdev cd ~/kent/src/hg/makeDb/schema joinerCheck -keys -database=oryLat1 -identifier=ensemblTranscriptId \ all.joiner # Checking keys on database oryLat1 # oryLat1.ensGtp.transcript - hits 22447 of 25134 # Error: 2687 of 25134 elements of oryLat1.ensGtp.transcript are not in key ensGene.name line 1726 of all.joiner # Example miss: ENSORLT00000022895 # oryLat1.ensPep.name - hits 22053 of 24661 # Error: 2608 of 24661 elements of oryLat1.ensPep.name are not in key ensGene.name line 1728 of all.joiner # Example miss: ENSORLT00000022872 cd /cluster/data/oryLat1/bed/ensGene.49 grep "invalid chrom" *.errors.txt | cut -d\ -f2 | sort -u > badLifts.name hgsql -N -e "select transcript from ensGtp;" oryLat1 \ | sort > ensGtp.transcript comm -13 ensGene.name ensGtp.transcript | sort > ensGtp.not.ensGene.name comm -13 ensGene.name ensPep.name | sort > ensPep.not.ensGene.name # do the badLifts account for all the missing peptides: comm -12 badLifts.name ensPep.not.ensGene.name | wc -l # 2608 -> yes, this is the count of missing peptides. # So, remove this set of business from ensPep table for N in `cat badLifts.name` do hgsql -e "delete from ensPep where name=\"$N\";" oryLat1 echo $N done # And from the ensGtp table for N in `cat badLifts.name` do hgsql -e "delete from ensGtp where transcript=\"$N\";" oryLat1 echo $N done ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################