# for emacs: -*- mode: sh; -*- # This file describes browser build for the panTro3 chimp genome: October 2010 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AACZ03 ############################################################################# # Fetch sequence from genbank (DONE - 2011-02-15 - Hiram) mkdir -p /hive/data/genomes/panTro3/genbank cd /hive/data/genomes/panTro3/genbank time wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Pan_troglodytes/Pan_troglodytes-2.1.3/*" # real 56m24.981s # user 0m1.053s # sys 0m8.815s # measure total sequence in this assembly cd Primary_Assembly faSize assembled_chromosomes/FASTA/*.fa.gz \ unlocalized_scaffolds/FASTA/*.fa.gz unplaced_scaffolds/FASTA/*.fa.gz # 3307943878 bases (407430668 N's 2900513210 real 2900513210 upper 0 lower) # in 24131 sequences in 50 files # Total size: mean 137082.8 sd 4455051.2 min 373 (gi|284234151|gb|AACZ03151841.1|) max 247518478 (gi|305434869|gb|CM000316.2|) median 2299 # N count: mean 16884.1 sd 887107.9 # U count: mean 120198.6 sd 3921497.1 # L count: mean 0.0 sd 0.0 # %0.00 masked total, %0.00 masked real ############################################################################# # process into UCSC naming scheme (DONE - 2011-02-17 - Hiram) mkdir /hive/data/genomes/panTro3/ucsc cd /hive/data/genomes/panTro3/ucsc cat << '_EOF_' > toUcsc.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = ) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x toUcsc.pl cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">unplaced.agp") or die "can not write to unplaced.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">unplaced.fa") or die "can not write to unplaced.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1\|.*//; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl cat << '_EOF_' > unlocalized.pl #!/bin/env perl use strict; use warnings; my %accToChr; my %chrNames; open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf"; while (my $line = ) { chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; $chrNames{$chrN} += 1; } close (FH); foreach my $chrN (keys %chrNames) { my $agpFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { chomp $line; my (@a) = split('\t', $line); my $acc = $a[0]; my $accNo1 = $acc; $accNo1 =~ s/.1$//; die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${accNo1}_random"; printf UC "%s", $ucscName; for (my $i = 1; $i < scalar(@a); ++$i) { printf UC "\t%s", $a[$i]; } printf UC "\n"; } } close (FH); close (UC); printf "chr%s\n", $chrN; open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; my $acc = $line; $acc =~ s/.*gb\|//; $acc =~ s/\|.*//; my $accNo1 = $acc; $accNo1 =~ s/.1$//; die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${accNo1}_random"; printf UC ">$ucscName\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x unlocalized.pl ./toUcsc.pl ./unlocalized.pl ./unplaced.pl # verify nothing lost in the translation faSize *.fa # 3307943878 bases (407430668 N's 2900513210 real 2900513210 upper 0 lower) # in 24131 sequences in 50 files # Total size: mean 137082.8 sd 4455051.2 min 373 (chr1_AACZ03151841_random) max 247518478 (chr2B) median 2299 # N count: mean 16884.1 sd 887107.9 # U count: mean 120198.6 sd 3921497.1 # L count: mean 0.0 sd 0.0 # %0.00 masked total, %0.00 masked real ############################################################################# # Initial database build (DONE - 2011-02-17 - Hiram) cd /hive/data/genomes/panTro3 cat << '_EOF_' > panTro3.ra # Config parameters for makeGenomeDb.pl: db panTro3 scientificName Pan troglodytes commonName Chimp assemblyDate Oct. 2010 assemblyShortLabel CGSC 2.1.3 assemblyLabel CGSC 2.1.3 (GCA_000001515.3) orderKey 23 mitoAcc NC_001643 fastaFiles /hive/data/genomes/panTro3/ucsc/*.fa agpFiles /hive/data/genomes/panTro3/ucsc/*.agp # qualFiles none dbDbSpeciesDir chimp taxId 9598 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp -dbHost=hgwdev -fileServer=hgwdev \ -workhorse=hgwdev -noGoldGapSplit panTro3.ra > makeGenome.agp.log 2>&1 # real 3m33.842s time makeGenomeDb.pl -continue=db -dbHost=hgwdev -fileServer=hgwdev \ -workhorse=hgwdev -noGoldGapSplit panTro3.ra > makeGenome.db.log 2>&1 # real 22m7.001s cat fb.panTro3.gold.gap.txt # 3307943878 bases of 3307943878 (100.000%) in intersection ############################################################################# # reload gold and gap tables (DONE - 2011-02-18 - Hiram) # the placed scaffolds AGP is much more detailed, lift it to # the supercontig AGP locations mkdir /hive/data/genomes/panTro3/bed/ctgPos2 cd /hive/data/genomes/panTro3/bed/ctgPos2 cat << '_EOF_' > liftAgp.pl #!/bin/env perl use strict; use warnings; my %contigParts = (); # read the AGP that needs to be lifted to the other AGP open (FH, "zcat ../../genbank/Primary_Assembly/placed_scaffolds/AGP/chr*.agp.gz|") or die "can not read placed_scaffolds/AGP/chr*.agp.gz"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($contig, $rest) = split('\s+', $line, 2); my $parts = ""; my $index = 0; if (exists($contigParts{$contig})) { $parts = $contigParts{$contig}; $index = scalar(@{$parts}); } else { my @agpLines; $parts = \@agpLines; $contigParts{$contig} = $parts; } $parts->[$index] = $line; } close (FH); my $partCount = 0; my $lineCount = 0; foreach my $key (keys %contigParts) { ++$partCount; my $parts = $contigParts{$key}; $lineCount += scalar(@{$parts}); } printf STDERR "partCount: $partCount, lineCount: $lineCount\n"; # read the other AGP and lift that first AGP to this one open (FH, "<../../panTro3.agp") or die "can not read panTro3.agp"; while (my $line = ) { chomp $line; my @a = split('\s+', $line); if (($a[0] =~ m/chrM|_random|chrUn/) || ($a[4] eq "N")) { printf "%s\n", $line; } else { die "ERROR: not 9 fields at $line" if (scalar(@a) != 9); my $chr = $a[0]; my $lift = $a[1] - 1; my $contig = $a[5]; die "ERROR: can not find contig $contig at $line" if (!exists($contigParts{$contig})); my $parts = $contigParts{$contig}; for (my $i = 0; $i < scalar(@{$parts}); ++$i) { my $line = $parts->[$i]; my @b = split('\s+', $line); my $start = $b[1] + $lift; my $end = $b[2] + $lift; printf "%s\t%d\t%d", $chr, $start, $end; for (my $i = 3; $i < scalar(@b); ++$i) { printf "\t%s", $b[$i]; } printf "\n"; } } } close (FH); '_EOF_' # << happy emacs chmod +x liftAgp.pl ./liftAgp.pl > contig.agp hgGoldGapGl -noGl -noLoad panTro3 contig.agp # take a look at the resulting .tab files, then load into db: hgGoldGapGl -noGl panTro3 contig.agp # verify nothing lost or gained featureBits -or -countGaps panTro3 gold gap 3307960432 bases of 3307960432 (100.000%) in intersection # replace the first agp, but save it for later use mv panTro3.agp panTro3.scaffolds.agp ln -s bed/ctgPos2/contig.agp panTro3.agp ############################################################################# # Initial pushQ entry to get QA started on this assembly # (DONE - 2011-02-17 - Hiram) cd /hive/data/genomes/panTro3 ln -s `pwd`/panTro3.unmasked.2bit /gbdb/panTro3/panTro3.2bit mkdir pushQ cd pushQ makePushQSql.pl panTro3 > panTro3.pushQ.sql scp -p panTro3.pushQ.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < panTro3.pushQ.sql ############################################################################# # running repeat masker (DONE - 2011-02-22 - Hiram) mkdir /hive/data/genomes/panTro3/bed/repeatMasker cd /hive/data/genomes/panTro3/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk panTro3 > do.log 2>&1 & # real 456m48.171s egrep "version|RELEASE" do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.25 2010/09/08 21:32:26 angie Exp $ # CC RELEASE 20090604; # failed during the 'cat' step due to 3 missing sequence numbers, cp -p panTro3.fa.out panTro3.fa.out.broken # manually remove those three lines vi panTro3.fa.out # then manually complete last bit in the 'cat' step: /cluster/bin/scripts/extractNestedRepeats.pl panTro3.fa.out \ > panTro3.nestedRepeats.bed # -rw-rw-r-- 1 58826545 Feb 22 09:20 panTro3.nestedRepeats.bed # continuing with the RM run: time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -continue=mask -smallClusterHub=memk panTro3 > mask.log 2>&1 & # real 28m6.877s cat faSize.rmsk.txt # 3307960432 bases (407430668 N's 2900529764 real 1433143490 upper # 1467386274 lower) in 24132 sequences in 1 files # %44.36 masked total, %50.59 masked real ########################################################################## # running simple repeat (DONE - 2011-02-22 - Hiram) mkdir /hive/data/genomes/panTro3/bed/simpleRepeat cd /hive/data/genomes/panTro3/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ panTro3 > do.log 2>&1 & # real 30m3.767s # two failed due to no sequence in inputs 2011-02-17 17:00 # chr2B:0-50000000 and chr2B:50000000-100000000 # faSize on both indicates: (50000000 N's 0 real 0 upper 0 lower) # TrfPart/049/049.lst.bed # TrfPart/050/050.lst.bed # create empty results: touch /hive/data/genomes/panTro3/TrfPart/049/049.lst.bed touch /hive/data/genomes/panTro3/TrfPart/050/050.lst.bed # run a para time > run.time on memk to get that file to exist: # Completed: 93 of 95 jobs # Crashed: 2 jobs # CPU time in finished jobs: 35389s 589.82m 9.83h 0.41d 0.001 y # IO & Wait Time: 468s 7.79m 0.13h 0.01d 0.000 y # Average job time: 386s 6.43m 0.11h 0.00d # Longest finished job: 9452s 157.53m 2.63h 0.11d # Submission to last job: 10661s 177.68m 2.96h 0.12d # continue time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ -continue=filter panTro3 > filter.log 2>&1 & # ~ 1 minute cat fb.simpleRepeat # 95418830 bases of 2900529764 (3.290%) in intersection cd /hive/data/genomes/panTro3 twoBitMask panTro3.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed panTro3.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa panTro3.2bit stdout | faSize stdin > faSize.panTro3.2bit.txt cat faSize.panTro3.2bit.txt # 3307960432 bases (407430668 N's 2900529764 real 1431641829 upper # 1468887935 lower) in 24132 sequences in 1 files # %44.40 masked total, %50.64 masked real # reset symlink to this masked sequence rm /gbdb/panTro3/panTro3.2bit ln -s `pwd`/panTro3.2bit /gbdb/panTro3/panTro3.2bit ########################################################################## # add a ctgPos2 track with other names (DONE - 2011-02-17 - Hiram) mkdir /hive/data/genomes/panTro3/bed/ctgPos2 cd /hive/data/genomes/panTro3/bed/ctgPos2 cat << '_EOF_' > ctgPos2.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../../panTro3.scaffolds.agp") or die "can not read panTro3.scaffolds.agp"; while (my $line = ) { chomp $line; my @a = split('\s+', $line); next if ($a[0] =~ m/_random|chrUn/); if (scalar(@a) == 9) { my $start = $a[1] - 1; my $end = $a[2]; my $size = $end - $start; printf "%s\t%d\t%s\t%d\t%d\t%s\n", $a[5], $size, $a[0], $start, $end, $a[4]; } } close (FH); my %accToComponent; open (FH, "<../../genbank/Primary_Assembly/component_localID2acc") or die "can not read component_localID2acc"; while (my $line = ) { chomp $line; my ($component, $acc) = split('\s+', $line); die "duplicate acc: $acc" if (exists($accToComponent{$acc})); $accToComponent{$acc} = $component; } close (FH); open (FH, "hgsql -N -e 'select chrom,chromStart,chromEnd,type,frag from gold;' panTro3 | egrep 'chrUn\|_random'|") or die "can not select from gold"; while (my $line = ) { chomp $line; my ($chr, $start, $end, $type, $acc) = split('\s+', $line); die "ERR: can not find acc: $acc" if (!exists($accToComponent{$acc})); my $component = $accToComponent{$acc}; my $size = $end - $start; printf "%s\t%d\t%s\t%d\t%d\t%s\n", $component, $size, $chr, $start, $end, $type; } close (FH); '_EOF_' # << happy emacs ./ctgPos2.pl | sed -e "s/NC_001643/GI:5835121/" > ctgPos2.tab # check that we have them all: awk '{print $3}' ctgPos2.tab | sort -u | wc -l # 24132 wc -l ../../chrom.sizes # 24132 ../../chrom.sizes # determine length of unique ctg names: awk '{print $1}' ctgPos2.tab | sed -e "s/\.1$//;s/_random//" \ | awk '{print length($0)}' | sort -rn | head -1 # 14 # determine length of unique chrom names: awk '{print $3}' ctgPos2.tab | sed -e "s/_random//" \ | awk '{print length($0)}' | sort -rn | head -1 # 18 # customize our sql definition with proper sizes, the 20 and 16 are # unique in this template: sed -e "s/20/14/; s/16/18/" $HOME/kent/src/hg/lib/ctgPos2.sql > ctgPos2.sql hgLoadSqlTab panTro3 ctgPos2 ctgPos2.sql ctgPos2.tab ########################################################################## # Marking *all* gaps - they are not all in the AGP file # (DONE - 2011-02-18 - Hiram) mkdir /hive/data/genomes/panTro3/bed/allGaps cd /hive/data/genomes/panTro3/bed/allGaps time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../panTro3.unmasked.2bit > findMotif.txt 2>&1 # real 1m12.153s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits panTro3 -not gap -bed=notGap.bed # 2900583125 bases of 2900583125 (100.000%) in intersection # real 0m18.559s time featureBits panTro3 allGaps.bed notGap.bed -bed=new.gaps.bed # 53361 bases of 2900583125 (0.002%) in intersection # real 20m48.848s # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" panTro3 | sort -n | tail -1 # 3118 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" panTro3 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits panTro3 other.bed # 53361 bases of 2900583125 (0.002%) in intersection # verify chrom names are OK for index length hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad panTro3 otherGap other.bed # Loaded 96549 # adding this many: wc -l bed.tab # 23478 # starting with this many hgsql -e "select count(*) from gap;" panTro3 # 159676 hgsql panTro3 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" panTro3 # 183154 == 159676 + 23478 ######################################################################### # MAKE 11.OOC FILES FOR BLAT (DONE - 2011-02-22 - Hiram) # numerator is panTro3 gapless bases "real" as reported by faSize # denominator is hg17 gapless bases as reported by featureBits, # 1024 is threshold used for human -repMatch: calc \( 2900529764 / 2897310462 \) \* 1024 # ( 2900529764 / 2897310462 ) * 1024 = 1025.137802 # ==> use -repMatch=1024, ends up same as human sequence cd /hive/data/genomes/panTro3 time blat panTro3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/panTro3.11.ooc -repMatch=1024 # Wrote 31038 overused 11-mers to jkStuff/panTro3.11.ooc # make up non bridged gap lift file for use in genbank runs gapToLift -minGap=3001 -bedFile=jkStuff/nonBridgedGaps.bed panTro3 \ jkStuff/panTro3.nonBridged.lft mkdir /hive/data/staging/data/panTro3 cp -p panTro3.2bit chrom.sizes jkStuff/panTro3.11.ooc \ jkStuff/panTro3.nonBridged.lft /hive/data/staging/data/panTro3 ########################################################################## # BLATSERVERS ENTRY (DONE - 2009-12-23 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("panTro3", "blat1", "17810", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("panTro3", "blat1", "17811", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # reset position to same area as was panTro2 (gene: HIVEP1) hgsql -e \ 'update dbDb set defaultPos="chr6:12165392-12278032" where name="panTro3";' \ hgcentraltest ############################################################################ # construct liftOver from panTro2 (DONE - 2011-02-22 - Hiram) # documented in panTro2.txt cd /hive/data/genomes/panTro2/bed/blat.panTro3.2011-02-22 ############################################################################ # Orangutan Lastz run (DONE - 2011-02-22,03-07 - Hiram) mkdir /hive/data/genomes/panTro3/bed/lastzPonAbe2.2011-02-22 cd /hive/data/genomes/panTro3/bed/lastzPonAbe2.2011-02-22 cat << '_EOF_' > DEF # chimp vs orangutan BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Chimp PanTro3 SEQ1_DIR=/scratch/data/panTro3/panTro3.2bit SEQ1_LEN=/scratch/data/panTro3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Chimp PonAbe2 SEQ2_DIR=/scratch/data/ponAbe2/ponAbe2.2bit SEQ2_LEN=/scratch/data/ponAbe2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 BASE=/hive/data/genomes/panTro3/bed/lastzPonAbe2.2011-02-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen # use screen to manage this long-running job time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -syntenicNet > do.log 2>&1 & # Completed: 417312 of 417312 jobs # CPU time in finished jobs: 77534399s 1292239.98m 21537.33h 897.39d 2.459 y # IO & Wait Time: 29461264s 491021.07m 8183.68h 340.99d 0.934 y # Average job time: 256s 4.27m 0.07h 0.00d # Longest finished job: 5657s 94.28m 1.57h 0.07d # Submission to last job: 1020433s 17007.22m 283.45h 11.81d time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -continue=cat -syntenicNet > cat.log 2>&1 & # Elapsed time: 260m12s cat fb.panTro3.chainPonAbe2Link.txt # 2598464298 bases of 2900529764 (89.586%) in intersection # running the swap - DONE - 2011-03-07 ssh swarm mkdir /hive/data/genomes/ponAbe2/bed/blastz.panTro3.swap cd /hive/data/genomes/ponAbe2/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/panTro3/bed/lastzPonAbe2.2011-02-22/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -syntenicNet -workhorse=hgwdev -smallClusterHub=memk \ -bigClusterHub=swarm > swap.log 2>&1 & # real 94m54.955s cat fb.ponAbe2.chainPanTro3Link.txt # 2714550129 bases of 3093572278 (87.748%) in intersection ############################################################################ # Rhesus Lastz run (DONE - 2011-02-22,03-07 - Hiram) mkdir /hive/data/genomes/panTro3/bed/lastzRheMac2.2011-02-22 cd /hive/data/genomes/panTro3/bed/lastzRheMac2.2011-02-22 cat << '_EOF_' > DEF # chimp vs rhesus BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Chimp PanTro3 SEQ1_DIR=/scratch/data/panTro3/panTro3.2bit SEQ1_LEN=/scratch/data/panTro3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Rhesus RheMac2 SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 BASE=/hive/data/genomes/panTro3/bed/lastzRheMac2.2011-02-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen # use screen to manage this long-running job time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -syntenicNet > do.log 2>&1 & # Completed: 334530 of 334530 jobs # CPU time in finished jobs: 71819683s 1196994.72m 19949.91h 831.25d 2.277 y # IO & Wait Time: 28582136s 476368.93m 7939.48h 330.81d 0.906 y # Average job time: 300s 5.00m 0.08h 0.00d # Longest finished job: 5511s 91.85m 1.53h 0.06d # Submission to last job: 986049s 16434.15m 273.90h 11.41d # real 173m22.880s time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -continue=cat -syntenicNet > cat.log 2>&1 & # real 147m59.726s cat fb.panTro3.chainRheMac2Link.txt # 2359352558 bases of 2900529764 (81.342%) in intersection # running the swap - DONE - 2011-03-07 ssh swarm mkdir /hive/data/genomes/rheMac2/bed/blastz.panTro3.swap cd /hive/data/genomes/rheMac2/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/panTro3/bed/lastzRheMac2.2011-02-22/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \ -bigClusterHub=swarm > swap.log 2>&1 & # real 74m46.709s cat fb.rheMac2.chainPanTro3Link.txt # 2267730573 bases of 2646704109 (85.681%) in intersection ############################################################################ # Marmoset Lastz run (DONE - 2011-02-22,03-07 - Hiram) mkdir /hive/data/genomes/panTro3/bed/lastzCalJac3.2011-02-22 cd /hive/data/genomes/panTro3/bed/lastzCalJac3.2011-02-22 cat << '_EOF_' > DEF # chimp vs marmoset BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Chimp PanTro3 SEQ1_DIR=/scratch/data/panTro3/panTro3.2bit SEQ1_LEN=/scratch/data/panTro3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Marmoset CalJac3 SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit SEQ2_LEN=/scratch/data/calJac3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 BASE=/hive/data/genomes/panTro3/bed/lastzCalJac3.2011-02-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen # use screen to manage this long-running job time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -syntenicNet > do.log 2>&1 & # Completed: 489888 of 489888 jobs # CPU time in finished jobs: 68728510s 1145475.16m 19091.25h 795.47d 2.179 y # IO & Wait Time: 29884276s 498071.27m 8301.19h 345.88d 0.948 y # Average job time: 201s 3.35m 0.06h 0.00d # Longest finished job: 5588s 93.13m 1.55h 0.06d # Submission to last job: 997302s 16621.70m 277.03h 11.54d time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -continue=cat -syntenicNet > cat.log 2>&1 & # Elapsed time: 169m39s cat fb.panTro3.chainCalJac3Link.txt # 2020558070 bases of 2900529764 (69.662%) in intersection # running the swap - DONE - 2011-05-07 ssh swarm mkdir /hive/data/genomes/calJac3/bed/blastz.panTro3.swap cd /hive/data/genomes/calJac3/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/panTro3/bed/lastzCalJac3.2011-02-22/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \ -bigClusterHub=swarm > swap.log 2>&1 & # real 83m36.955s cat fb.calJac3.chainPanTro3Link.txt # 1994778494 bases of 2752505800 (72.471%) in intersection ############################################################################ # GENBANK ALIGNMENTS (DONE - 2011-02-22 - Hiram) ssh hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add panTro3 to add: panTro3.serverGenome = /hive/data/genomes/panTro3/panTro3.2bit panTro3.clusterGenome = /scratch/data/panTro3/panTro3.2bit panTro3.ooc = /scratch/data/panTro3/panTro3.11.ooc panTro3.align.unplacedChroms = chrUn*,chr*_random panTro3.lift = /scratch/data/panTro3/panTro3.nonBridged.lft panTro3.perChromTables = no panTro3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} panTro3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} panTro3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} panTro3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} panTro3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} panTro3.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} panTro3.downloadDir = panTro3 panTro3.refseq.mrna.native.load = yes panTro3.refseq.mrna.xeno.load = yes panTro3.genbank.mrna.xeno.load = yes panTro3.genbank.mrna.xeno.loadDesc = yes panTro3.genbank.est.native.load = yes panTro3.upstreamGeneTbl = refGene git commit -m "adding panTro3" etc/genbank.conf git push make etc-update ssh genbank screen # control this business with a screen since it takes a while cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial panTro3 & # logFile: var/build/logs/2011.02.23-09:44:44.panTro3.initalign.log # real 1658m46.609s (lots of kluster interference from other work) ssh hgwdev cd /cluster/data/genbank time nice -n +19 bin/gbDbLoadStep -initialLoad -drop panTro3 & # logFile: var/dbload/hgwdev/logs/2011.02.24-13:51:55.dbload.log # real 30m58.420s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add panTro3 to: etc/align.dbs etc/hgwdev.dbs git commit -m "adding panTro3 to the daily update cycle" \ etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # LASTZ Rat Rn4 (DONE - 2011-02-22,03-17 - Hiram) mkdir /hive/data/genomes/panTro3/bed/lastzRn4.2011-02-22 cd /hive/data/genomes/panTro3/bed/lastzRn4.2011-02-22 cat << '_EOF_' > DEF # chimp vs rat # TARGET: Chimp PanTro3 SEQ1_DIR=/scratch/data/panTro3/panTro3.2bit SEQ1_LEN=/scratch/data/panTro3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Rat Rn4 SEQ2_DIR=/scratch/data/rn4/rn4.2bit SEQ2_LEN=/scratch/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/panTro3/bed/lastzRn4.2011-02-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # Completed: 334530 of 334530 jobs # CPU time in finished jobs: 117963946s 1966065.77m 32767.76h 1365.32d 3.741 y # IO & Wait Time: 21241835s 354030.58m 5900.51h 245.85d 0.674 y # Average job time: 416s 6.94m 0.12h 0.00d # Longest finished job: 5954s 99.23m 1.65h 0.07d # Submission to last job: 1898683s 31644.72m 527.41h 21.98d time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 218m40.169s cat fb.panTro3.chainRn4Link.txt # 880312548 bases of 2900529764 (30.350%) in intersection # running the swap - DONE - 2011-03-17 mkdir /hive/data/genomes/rn4/bed/blastz.panTro3.swap cd /hive/data/genomes/rn4/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzRn4.2011-02-22/DEF \ -swap -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 82m45.556s cat fb.rn4.chainPanTro3Link.txt # 875298047 bases of 2571531505 (34.038%) in intersection ############################################################################## # LASTZ Mouse Mm9 (DONE - 2011-02-23,03-08 - Hiram) mkdir /hive/data/genomes/panTro3/bed/lastzMm9.2011-02-23 cd /hive/data/genomes/panTro3/bed/lastzMm9.2011-02-23 cat << '_EOF_' > DEF # chimp vs mouse # TARGET: Chimp PanTro3 SEQ1_DIR=/scratch/data/panTro3/panTro3.2bit SEQ1_LEN=/scratch/data/panTro3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Mouse Mm9 SEQ2_DIR=/scratch/data/mm9/mm9.2bit SEQ2_LEN=/scratch/data/mm9/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/panTro3/bed/lastzMm9.2011-02-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # Elapsed time: 17693m37s cat fb.panTro3.chainMm9Link.txt # 929446765 bases of 2900529764 (32.044%) in intersection # running the swap - DONE - 2011-03-08 mkdir /hive/data/genomes/mm9/bed/blastz.panTro3.swap cd /hive/data/genomes/mm9/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzMm9.2011-02-23/DEF \ -swap -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 70m30.237s cat fb.mm9.chainPanTro3Link.txt # 921877558 bases of 2620346127 (35.182%) in intersection ############################################################################## # Swap Human lastz (DONE - 2011-02-23 - Hiram) # original alignment cd /hive/data/genomes/hg19/bed/lastzPanTro3.2011-02-22 cat fb.hg19.chainPanTro3Link.txt # 2760939621 bases of 2897316137 (95.293%) in intersection # running the swap mkdir /hive/data/genomes/panTro3/bed/blastz.hg19.swap cd /hive/data/genomes/panTro3/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/hg19/bed/lastzPanTro3.2011-02-22/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -syntenicNet > swap.log 2>&1 & # real 86m49.706s cat fb.panTro3.chainHg19Link.txt # 2772816267 bases of 2900529764 (95.597%) in intersection ############################################################################## # LASTZ Dog CanFam2 (DONE - 2011-02-23,03-08 - Hiram) mkdir /hive/data/genomes/panTro3/bed/lastzCanFam2.2011-02-23 cd /hive/data/genomes/panTro3/bed/lastzCanFam2.2011-02-23 cat << '_EOF_' > DEF # chimp vs dog # TARGET: Chimp PanTro3 SEQ1_DIR=/scratch/data/panTro3/panTro3.2bit SEQ1_LEN=/scratch/data/panTro3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Dog CanFam2 SEQ2_DIR=/scratch/data/canFam2/canFam2.2bit SEQ2_LEN=/scratch/data/canFam2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/panTro3/bed/lastzCanFam2.2011-02-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # Elapsed time: 16114m13s cat fb.panTro3.chainCanFam2Link.txt # 1496334162 bases of 2900529764 (51.588%) in intersection # running the swap - DONE - 2009-06-02 mkdir /hive/data/genomes/canFam2/bed/blastz.panTro3.swap cd /hive/data/genomes/canFam2/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzCanFam2.2011-02-23/DEF \ -swap -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 96m3.779s cat fb.canFam2.chainPanTro3Link.txt # 1438499360 bases of 2384996543 (60.315%) in intersection ############################################################################ # LASTZ Horse EquCab2 (DONE - 2011-02-23,03-08 - Hiram) mkdir /hive/data/genomes/panTro3/bed/lastzEquCab2.2011-02-23 cd /hive/data/genomes/panTro3/bed/lastzEquCab2.2011-02-23 cat << '_EOF_' > DEF # chimp vs horse # TARGET: Chimp PanTro3 SEQ1_DIR=/scratch/data/panTro3/panTro3.2bit SEQ1_LEN=/scratch/data/panTro3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Horse EquCab2 SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/panTro3/bed/lastzEquCab2.2011-02-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # Elapsed time: 16686m37s cat fb.panTro3.chainEquCab2Link.txt # 1638902226 bases of 2900529764 (56.504%) in intersection # running the swap - DONE - 2009-06-02 mkdir /hive/data/genomes/equCab2/bed/blastz.panTro3.swap cd /hive/data/genomes/equCab2/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzEquCab2.2011-02-23/DEF \ -swap -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 131m12.302s cat fb.equCab2.chainPanTro3Link.txt # 1599680730 bases of 2428790173 (65.863%) in intersection ############################################################################ # LASTZ Opossum MonDom5 (DONE - 2011-02-23,03-15 - Hiram) mkdir /hive/data/genomes/panTro3/bed/lastzMonDom5.2011-02-23 cd /hive/data/genomes/panTro3/bed/lastzMonDom5.2011-02-23 cat << '_EOF_' > DEF # chimp vs opossum # settings for more distant organism alignments BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Chimp PanTro3 SEQ1_DIR=/scratch/data/panTro3/panTro3.2bit SEQ1_LEN=/scratch/data/panTro3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Opossum MonDom5 SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit SEQ2_LEN=/scratch/data/monDom5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/panTro3/bed/lastzMonDom5.2011-02-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # Elapsed time: 22679m1s (more than two weeks) cat fb.panTro3.chainMonDom5Link.txt # 398623849 bases of 2900529764 (13.743%) in intersection # running the swap - DONE - 2011-03-15 mkdir /hive/data/genomes/monDom5/bed/blastz.panTro3.swap cd /hive/data/genomes/monDom5/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzMonDom5.2011-02-23/DEF \ -swap -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 85m21.917s cat fb.monDom5.chainPanTro3Link.txt # 396979982 bases of 3501660299 (11.337%) in intersection ############################################################################ # LASTZ Chicken GalGal3 (DONE - 2011-02-23,03-03 - Hiram) mkdir /hive/data/genomes/panTro3/bed/lastzGalGal3.2011-02-23 cd /hive/data/genomes/panTro3/bed/lastzGalGal3.2011-02-23 cat << '_EOF_' > DEF # chimp vs chicken # settings for more distant organism alignments BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Chimp PanTro3 SEQ1_DIR=/scratch/data/panTro3/panTro3.2bit SEQ1_LEN=/scratch/data/panTro3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Chicken GalGal3 SEQ2_DIR=/scratch/data/galGal3/galGal3.2bit SEQ2_LEN=/scratch/data/galGal3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/panTro3/bed/lastzGalGal3.2011-02-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 10418m46.398s (lots of swarm interference due to many lastz) # == 7d 5h 38m cat fb.panTro3.chainGalGal3Link.txt # 132163568 bases of 2900529764 (4.557%) in intersection # running the swap - DONE - 2009-06-02 mkdir /hive/data/genomes/galGal3/bed/blastz.panTro3.swap cd /hive/data/genomes/galGal3/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzGalGal3.2011-02-23/DEF \ -swap -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 35m47.071s cat fb.galGal3.chainPanTro3Link.txt # 115892900 bases of 1042591351 (11.116%) in intersection ############################################################################ # LASTZ Zebrafish DanRer7 (DONE - 2011-02-23 - 03-01 - Hiram) mkdir /hive/data/genomes/panTro3/bed/lastzDanRer7.2011-02-23 cd /hive/data/genomes/panTro3/bed/lastzDanRer7.2011-02-23 cat << '_EOF_' > DEF # chimp vs zebrafish # settings for more distant organism alignments BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Chimp PanTro3 SEQ1_DIR=/scratch/data/panTro3/panTro3.2bit SEQ1_LEN=/scratch/data/panTro3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Zebrafish DanRer7 SEQ2_DIR=/scratch/data/danRer7/danRer7.2bit SEQ2_LEN=/scratch/data/danRer7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/panTro3/bed/lastzDanRer7.2011-02-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 8921m45.996s - lots of swarm cluster interference from # up to nine other lastz runs I have going as well # as everyone else there. cat fb.panTro3.chainDanRer7Link.txt # 74190141 bases of 2900529764 (2.558%) in intersection # running the swap - DONE - 2011-03-01 mkdir /hive/data/genomes/danRer7/bed/blastz.panTro3.swap cd /hive/data/genomes/danRer7/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzDanRer7.2011-02-23/DEF \ -swap -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 28m29.184s cat fb.danRer7.chainPanTro3Link.txt # 82066902 bases of 1409770109 (5.821%) in intersection ############################################################################ # construct download files (DONE - 2011-02-23 - Hiram) cd /hive/data/genomes/panTro3 time makeDownloads.pl panTro3 -dbHost=hgwdev -workhorse=hgwdev \ > downloads.log 2>&1 # real 26m13.424s # edi the README files: cd goldenPath vi */README.txt ############################################################################ # running cpgIsland business (DONE - 2011-02-24 - Hiram) mkdir /hive/data/genomes/panTro3/bed/cpgIsland cd /hive/data/genomes/panTro3/bed/cpgIsland cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands # needed to fixup this source, adding include to readseq.c: #include "string.h" # and to cpg_lh.c: #include "unistd.h" #include "stdlib.h" # and fixing a declaration in cpg_lh.c sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c mv tmp.c cpg_lh.c make cd ../../ ln -s hg3rdParty/cpgIslands/cpglh.exe mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../panTro3.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/panTro3/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | grep ^chr | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod +x runOne gensub2 chr.list single template jobList para create jobList para try para check ... etc para time # Completed: 24132 of 24132 jobs # CPU time in finished jobs: 262s 4.36m 0.07h 0.00d 0.000 y # IO & Wait Time: 61074s 1017.91m 16.97h 0.71d 0.002 y # Average job time: 3s 0.04m 0.00h 0.00d # Longest finished job: 26s 0.43m 0.01h 0.00d # Submission to last job: 3122s 52.03m 0.87h 0.04d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed # verify longest unique chrom name: cut -f1 cpgIsland.bed | sed -e 's/_random//' \ | awk '{print length($0)}' | sort -rn | head -1 # 18 # update the length 14 in the template to be 18: sed -e "s/14/18/" $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandExt.sql cd /hive/data/genomes/panTro3/bed/cpgIsland hgLoadBed panTro3 cpgIslandExt -tab -sqlTable=cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Loaded 28288 elements of size 10 # Sorted # Saving bed.tab # Loading panTro3 featureBits panTro3 cpgIslandExt # 19195554 bases of 2900529764 (0.662%) in intersection # there should be no output from checkTableCoords: checkTableCoords -verboseBlocks -table=cpgIslandExt panTro3 # cleanup rm -fr hardMaskedFa ############################################################################ ## 12-Way Multiz (DONE - 2011-03-18,03-21 - Hiram) ssh hgwdev mkdir /hive/data/genomes/panTro3/bed/multiz12way cd /hive/data/genomes/panTro3/bed/multiz12way wget -O 46way.nh \ http://hgdownload.cse.ucsc.edu/goldenPath/hg19/multiz46way/46way.corrected.nh # panTro3 # hg19 # ponAbe2 # rheMac2 # calJac3 # equCab2 # canFam2 # mm9 # rn4 # monDom5 # galGal3 # danRer7 # All distances remain as specified in the 46way.nh /cluster/bin/phast/tree_doctor --prune-all-but panTro2,hg19,ponAbe2,rheMac2,calJac1,equCab2,canFam2,mm9,rn4,monDom5,galGal3,danRer6 46way.nh | sed -e "s/danRer6/danRer7/; s/calJac1/calJac3/; s/panTro2/panTro3/" > 12way.nh # what that looks like: cat 12way.nh # (((((((((hg19:0.006591,panTro3:0.006639):0.012126,ponAbe2:0.018342):0.014256,rheMac2:0.057695):0.010000,calJac3:0.066389):0.088210,(mm9:0.083220,rn4:0.090564):0.269385):0.020666,(equCab2:0.107726,canFam2:0.150374):0.043195):0.156024,monDom5:0.425899):0.182333,galGal3:0.474279):0.466546,danRer7:0.886380); # simple switch of hg19 and panTro3 to get it on top: cat << '_EOF_' > panTro3.12way.nh (((((((((panTro3:0.006639,hg19:0.006591):0.012126,ponAbe2:0.018342):0.014256,rheMac2:0.057695):0.010000,calJac3:0.066389):0.088210,(mm9:0.083220,rn4:0.090564):0.269385):0.020666,(equCab2:0.107726,canFam2:0.150374):0.043195):0.156024,monDom5:0.425899):0.182333,galGal3:0.474279):0.466546,danRer7:0.886380); '_EOF_' # << happy emacs # convert to species names /cluster/bin/phast/tree_doctor --rename \ "panTro3->Chimp; hg19->Human; ponAbe2->Orangutan; rheMac2->Rhesus; calJac3->Marmoset; mm9->Mouse; rn4->Rat; equCab2->Horse; canFam2->Dog; monDom5->Opossum; galGal3->Chicken; danRer7->Zebrafish" \ panTro3.12way.nh > panTro3.commonNames.12way.nh # (((((((((Chimp:0.006639,Human:0.006591):0.012126,Orangutan:0.018342):0.014256,Rhesus:0.057695):0.010000,Marmoset:0.066389):0.088210,(Mouse:0.083220,Rat:0.090564):0.269385):0.020666,(Horse:0.107726,Dog:0.150374):0.043195):0.156024,Opossum:0.425899):0.182333,Chicken:0.474279):0.466546,Zebrafish:0.886380); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/panTro3_12way.png /cluster/bin/phast/all_dists panTro3.12way.nh > 12way.distances.txt # Use this output to create the table below grep -i panTro3 12way.distances.txt | sort -k3,3n # panTro3 hg19 0.013230 # panTro3 ponAbe2 0.037107 # panTro3 rheMac2 0.090716 # panTro3 calJac3 0.109410 # panTro3 equCab2 0.302818 # panTro3 canFam2 0.345466 # panTro3 mm9 0.483836 # panTro3 rn4 0.491180 # panTro3 monDom5 0.733820 # panTro3 galGal3 0.964533 # panTro3 danRer7 1.843180 # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainPanTro3Link chain linearGap # distance on panTro3 on other minScore # 1 0.013 - human hg19 (% 95.60) (% 95.29) 5000 medium # 2 0.037 - orangutan ponAbe2 (% 89.59) (% 87.75) 5000 medium # 3 0.091 - rhesus rheMac2 (% 81.34) (% 85.68) 5000 medium # 4 0.109 - marmoset calJac3 (% 69.66) (% 72.47) 5000 medium # 5 0.302 - horse equCab2 (% 56.50) (% 65.86) 3000 medium # 6 0.345 - dog canFam2 (% 51.59) (% 60.32) 3000 medium # 7 0.484 - mouse mm9 (% 32.04) (% 35.18) 3000 medium # 8 0.491 - rat rn4 (% 30.35) (% 34.04) 3000 medium # 9 0.734 - opossum monDom5 (% 13.74) (% 11.34) 5000 loose # 10 0.965 - chicken galGal3 (% 4.56) (% 11.12) 5000 loose # 11 1.843 - zebrafish danRer7 (% 2.56) (% 5.82) 5000 loose # None of this concern for distances matters in building the first step, the # maf files. # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ panTro3.12way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # split the maf files into a set of hashed named files # this hash named split keeps the same chr/contig names in the same # named hash file. mkdir /hive/data/genomes/panTro3/bed/multiz12way/mafSplit cd /hive/data/genomes/panTro3/bed/multiz12way/mafSplit for D in `sed -e "s/panTro3 //" ../species.list` do echo "${D}" mkdir $D cd $D echo "mafSplit -byTarget -useHashedName=10 /dev/null . ../../../lastz.${D}/axtChain/panTro3.${D}.synNet.maf.gz" mafSplit -byTarget -useHashedName=10 /dev/null . \ ../../../lastz.${D}/axtChain/panTro3.${D}.synNet.maf.gz cd .. done # construct a list of all possible maf file names. # they do not all exist in each of the species directories find . -type f | wc -l # 2792 find . -type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list wc -l maf.list # 484 maf.list mkdir /hive/data/genomes/panTro3/bed/multiz12way/splitRun cd /hive/data/genomes/panTro3/bed/multiz12way/splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = panTro3 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /hive/data/genomes/panTro3/bed/multiz12way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c.maf $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/panTro3/bed/multiz12way/splitRun/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs ln -s ../../mafSplit/maf.list maf.list ssh swarm cd /hive/data/genomes/panTro3/bed/multiz12way/splitRun/run gensub2 maf.list single template jobList tac jobList > smallJobsFirst para -ram=8g create smallJobsFirst # Completed: 484 of 484 jobs # CPU time in finished jobs: 370423s 6173.72m 102.90h 4.29d 0.012 y # IO & Wait Time: 27319s 455.32m 7.59h 0.32d 0.001 y # Average job time: 822s 13.70m 0.23h 0.01d # Longest finished job: 31994s 533.23m 8.89h 0.37d # Submission to last job: 32007s 533.45m 8.89h 0.37d # assemble into a single maf file cd /hive/data/genomes/panTro3/bed/multiz12way head -1 splitRun/maf/001.maf > multiz12way.maf for F in splitRun/maf/*.maf do egrep -v "^#" ${F} done >> multiz12way.maf tail -1 splitRun/maf/001.maf >> multiz12way.maf # -rw-rw-r-- 1 24598538713 Mar 21 11:02 panTro3.12way.maf # Load into database ssh hgwdev cd /hive/data/genomes/panTro3/bed/multiz12way mkdir /gbdb/panTro3/multiz12way ln -s `pwd`/multiz12way.maf /gbdb/panTro3/multiz12way cd /scratch/tmp time nice -n +19 hgLoadMaf panTro3 multiz12way # Indexing and tabulating /gbdb/panTro3/multiz12way/multiz12way.maf # Loaded 15113841 mafs in 1 files from /gbdb/panTro3/multiz12way # real 9m38.121s time nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 panTro3 multiz12waySummary \ /gbdb/panTro3/multiz12way/multiz12way.maf # Created 1828535 summary blocks from 85036569 components # and 15113841 mafs from /gbdb/panTro3/multiz12way/multiz12way.maf # real 8m51.192s wc -l multiz12way*.tab # 15113841 multiz12way.tab # 1828535 multiz12waySummary.tab # 16942376 total rm multiz12way*.tab ############################################################################ # GAP ANNOTATE MULTIZ12WAY MAF AND LOAD TABLES (DONE - 2011-05-03 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. mkdir /hive/data/genomes/panTro3/bed/multiz12way/mafSplit cd /hive/data/genomes/panTro3/bed/multiz12way/mafSplit time mafSplit -byTarget -useFullSequenceName \ /dev/null . ../multiz12way.maf # real 16m0.764s mkdir /hive/data/genomes/panTro3/bed/multiz12way/anno cd /hive/data/genomes/panTro3/bed/multiz12way/anno # only one of these does not yet have an N.bed file cd /hive/data/genomes/danRer7 twoBitInfo -nBed danRer7.2bit danRer7.N.bed for DB in `sed -e "s/panTro3 //" ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL mkdir mafSplit for F in ../mafSplit/*.maf do B=`basename ${F}` mafAddIRows -nBeds=nBeds ${F} /hive/data/genomes/panTro3/panTro3.2bit \ mafSplit/${B} echo $B done # about 1h 10m # combine into one file head -q -n 1 mafSplit/chr1.maf > panTro3.12way.maf for F in mafSplit/*.maf do grep -h -v "^#" ${F} done >> panTro3.12way.maf # these maf files do not have the end marker, this does nothing: # tail -q -n 1 mafSplit/chr1.maf >> panTro3.12way.maf # How about an official end marker: echo "##eof maf" >> panTro3.12way.maf # Load into database rm /gbdb/panTro3/multiz12way/multiz12way.maf # remove old symlink ln -s `pwd`/panTro3.12way.maf /gbdb/panTro3/multiz12way/multiz12way.maf cd /scratch/tmp time nice -n +19 hgLoadMaf panTro3 multiz12way # Loaded 15388679 mafs in 1 files from /gbdb/panTro3/multiz12way # real 12m45.528s time nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 panTro3 multiz12waySummary \ /gbdb/panTro3/multiz12way/multiz12way.maf # Created 1828535 summary blocks from 85036569 components # and 15388679 mafs from /gbdb/panTro3/multiz12way/multiz12way.maf # real 15m20.313s wc -l multiz12way*.tab # 15388679 multiz12way.tab # 1828535 multiz12waySummary.tab # 17217214 total rm multiz12way*.tab ######################################################################### # lifting ensGene from panTro2 (DONE - 2011-03-23 - Hiram) mkdir /hive/data/genomes/panTro3/bed/ensGene.61 cd /hive/data/genomes/panTro3/bed/ensGene.61 # start with panTro2 ensGene (it is v61 identical to v60) hgsql -N -e "select * from ensGene;" panTro2 | cut -f2- > panTro2.ensGene.gp # lift over to panTro3 liftOver -genePred panTro2.ensGene.gp \ /usr/local/apache/htdocs-hgdownload/goldenPath/panTro2/liftOver/panTro2ToPanTro3.over.chain.gz \ panTro3.ensGene.gp unMapped.txt # check to see which genes have gone bad genePredCheck -db=panTro3 panTro3.ensGene.gp > badOnes.txt 2>&1 # checked: 38471 failed: 62 awk '{print $3}' badOnes.txt | sort -u | grep -v failed > toRemove.txt # remove the bad ones cp -p panTro3.ensGene.gp panTro3.unfiltered.ensGene.gp for N in `cat toRemove.txt` do rm -f t.gp grep -v "${N}" panTro3.ensGene.gp > t.gp rm -f panTro3.ensGene.gp mv t.gp panTro3.ensGene.gp done genePredCheck -db=panTro3 panTro3.ensGene.gp # checked: 38446 failed: 0 # OK to load this since all are OK now hgLoadGenePred -genePredExt panTro3 ensGene panTro3.ensGene.gp genePredCheck -db=panTro3 ensGene # checked: 38446 failed: 0 # use the ensPep and ensGtp from panTro2, but remove the bad ones cat /hive/data/genomes/panTro2/bed/ensGene.60/download/Pan_troglodytes.CHIMP2.1.60.pep.all.fa.gz \ | sed -e 's/^>.* transcript:/>/; s/ CCDS.*$//;' | gzip > ensPep.txt.gz zcat ensPep.txt.gz \ | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \ | sed -e '/^$/d; s/*$//' | sort > ensPep.panTro2.fa.tab cp -p ensPep.panTro2.fa.tab ensPep.panTro3.fa.tab for N in `cat toRemove.txt` do rm -f t.fa.tab grep -v "${N}" ensPep.panTro3.fa.tab > t.fa.tab rm -f panTro3.ensGene.fa.tab mv t.fa.tab panTro3.ensGene.fa.tab echo "${N}" done wc -l ensPep.panTro2.fa.tab ensPep.panTro3.fa.tab awk '{print $1}' panTro2.ensGene.gp | sort -u > panTro2.name.list awk '{print $1}' panTro3.ensGene.gp | sort -u > panTro3.name.list cp -p /hive/data/genomes/panTro2/bed/ensGene.60/process/ensGtp.tab \ ./panTro2.ensGtp.tab cp -p panTro2.ensGtp.tab panTro3.ensGtp.tab comm -23 panTro2.name.list panTro3.name.list | while read N do rm -f t.gtp.tab grep -v "${N}" panTro3.ensGtp.tab > t.gtp.tab rm -f panTro3.ensGtp.tab mv t.gtp.tab panTro3.ensGtp.tab echo "${N}" done cp -p ensPep.panTro2.fa.tab ensPep.panTro3.fa.tab comm -23 panTro2.name.list panTro3.name.list | while read N do rm -f t.fa.tab grep -v "${N}" ensPep.panTro3.fa.tab > t.fa.tab rm -f ensPep.panTro3.fa.tab mv t.fa.tab ensPep.panTro3.fa.tab echo "${N}" done # now that those are clean, load them up hgPepPred panTro3 tab ensPep ensPep.panTro3.fa.tab hgLoadSqlTab panTro3 ensGtp ~/kent/src/hg/lib/ensGtp.sql panTro3.ensGtp.tab # and record version in trackVersion hgsql -e 'INSERT INTO trackVersion \ (db, name, who, version, updateTime, comment, source, dateReference) \ VALUES("panTro3", "ensGene", "hiram", "61", now(), \ "peptides lifted from panTro2", \ "genes lifted from panTro2 ensGene v61", "current" );' hgFixed ######################################################################### # MULTIZ12WAY MAF FRAMES (DONE - 2011-03-24 - Hiram) ssh hgwdev mkdir /hive/data/genomes/panTro3/bed/multiz12way/frames cd /hive/data/genomes/panTro3/bed/multiz12way/frames mkdir genes # looked at rn4 knownGene and only found 7,044 genes # the rn4 ensGene had 22,857 so use that instead # panTro3 ensGene # hg19 knownGene # ponAbe2 ensGene # rheMac2 ensGene # calJac3 ensGene # mm9 knownGene # rn4 ensGene # equCab2 ensGene # canFam2 ensGene # monDom5 ensGene # galGal3 ensGene # danRer7 ensGene panTro3 ponAbe2 rheMac2 calJac3 equCab2 canFam2 monDom5 galGal3 danRer7 #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using ensGene for: # panTro3 ponAbe2 rheMac2 calJac3 equCab2 canFam2 monDom5 galGal3 danRer7 for qDB in panTro3 ponAbe2 rheMac2 calJac3 rn4 equCab2 canFam2 monDom5 galGal3 danRer7 do echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB} hgsql -N -e "select * from ensGene" ${qDB} | cut -f 2-11 \ | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz done # using knownGene for mm9 hg19 # genePreds; (must keep only the first 10 columns for knownGene) for qDB in mm9 hg19 do echo hgsql -N -e \"'select * from 'knownGene\;\" ${qDB} hgsql -N -e "select * from knownGene" ${qDB} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz done # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/calJac3.gp.gz: 20442 # genes/canFam2.gp.gz: 19245 # genes/danRer7.gp.gz: 25857 # genes/equCab2.gp.gz: 20403 # genes/galGal3.gp.gz: 16491 # genes/hg19.gp.gz: 20597 # genes/mm9.gp.gz: 20905 # genes/monDom5.gp.gz: 19188 # genes/panTro3.gp.gz: 18414 # genes/ponAbe2.gp.gz: 19895 # genes/rheMac2.gp.gz: 21049 # genes/rn4.gp.gz: 22857 ssh hgwdev cd /hive/data/genomes/panTro3/bed/multiz12way/frames cat ../anno/multiz12way.maf \ | nice -n +19 genePredToMafFrames panTro3 stdin stdout \ `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.list` \ | gzip > multiz12way.mafFrames.gz # real 12m47.652s # verify there are frames on everything: zcat multiz12way.mafFrames.gz | awk '{print $4}' | sort | uniq -c # 213564 calJac3 # 237030 canFam2 # 48444 danRer7 # 219472 equCab2 # 180209 galGal3 # 196670 hg19 # 229023 mm9 # 214937 monDom5 # 170067 panTro3 # 203651 ponAbe2 # 202922 rheMac2 # 225811 rn4 ssh hgwdev cd /hive/data/genomes/panTro3/bed/multiz12way/frames time hgLoadMafFrames panTro3 multiz12wayFrames multiz12way.mafFrames.gz # real 0m23.641s # Adding automatic generation of upstream files (DONE - 2011-05-09 - Hiram) # edit src/hg/makeDb/genbank/etc/genbank.conf to add: panTro3.upstreamGeneTbl = ensGene panTro3.upstreamMaf = multiz12way /hive/data/genomes/panTro3/bed/multiz12way/species.list ########################################################################### ## create upstream ensGene maf files mkdir -p /hive/data/genomes/panTro3/bed/multiz12way/downloads/maf cd /hive/data/genomes/panTro3/bed/multiz12way/downloads/maf # bash script #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits panTro3 ensGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags panTro3 multiz12way \ stdin stdout \ -orgs=/hive/data/genomes/panTro3/bed/multiz12way/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done # take a look to see if the names are OK: zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \ | sort | uniq -c | sort -rn | head # 16006 rn4 # 16006 rheMac2 # 16006 ponAbe2 # 16006 monDom5 # 16006 mm9 # 16006 hg19 # 16006 galGal3 # 16006 equCab2 # 16006 danRer7 # 16006 canFam2 # 16006 calJac3 # 1 ENSPTRT00000068163 # 1 ENSPTRT00000068158 # ... etc ... zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \ | sort | uniq -c | sort -rn | wc # 16017 32034 432327 # check the other files too md5sum up*.gz > md5sum.txt ######################################################################### # Phylogenetic tree from 12-way (DONE - 2011-03-24,03-28 - Hiram) mkdir /hive/data/genomes/panTro3/bed/multiz12way/4d cd /hive/data/genomes/panTro3/bed/multiz12way/4d mkdir mafSplit cd mafSplit time mafSplit -byTarget -useFullSequenceName \ /dev/null . ../../anno/multiz12way.maf # real 16m57.919s # got 3488 mafs named after their chrom/scaff .maf # there are 20644 scaffolds missing that # are too small or have nothing aligning. cd /hive/data/genomes/panTro3/bed/multiz12way/4d # using ensGene for panTro3, only transcribed genes and nothing # from the randoms and other misc. hgsql panTro3 -Ne \ "select * from ensGene WHERE cdsEnd > cdsStart;" | cut -f 2-20 | egrep -E -v "chrM|random|chrUn" \ > ensGene.gp # verify chromosome selection, should just be the ordinary chroms: cut -f2 ensGene.gp | sort | uniq -c wc -l *.gp # 30483 ensGene.gp genePredSingleCover ensGene.gp stdout | sort > ensGeneNR.gp wc -l ensGeneNR.gp # 17915 ensGeneNR.gp ssh encodek mkdir /hive/data/genomes/panTro3/bed/multiz12way/4d/run cd /hive/data/genomes/panTro3/bed/multiz12way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/panTro3/bed/multiz12way/4d" set c = $1 set infile = $r/mafSplit/$2 set outfile = $3 cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/ensGeneNR.gp | sed -e "s/\t$c\t/\tpanTro3.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/run/$outfile else echo "" > $r/run/$outfile endif rm -f $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh ls -1S /hive/data/genomes/panTro3/bed/multiz12way/4d/mafSplit/*.maf | \ egrep -E -v "chrM|random|chrUn" \ | sed -e "s#.*multiz12way/4d/mafSplit/##" \ > maf.list cat << '_EOF_' > template #LOOP 4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs gensub2 maf.list single template stdout | tac > jobList para create jobList para try ... check para -maxJob=5 push para time # Completed: 25 of 25 jobs # CPU time in finished jobs: 2040s 34.00m 0.57h 0.02d 0.000 y # IO & Wait Time: 171s 2.85m 0.05h 0.00d 0.000 y # Average job time: 88s 1.47m 0.02h 0.00d # Longest finished job: 166s 2.77m 0.05h 0.00d # Submission to last job: 513s 8.55m 0.14h 0.01d # combine mfa files ssh hgwdev cd /hive/data/genomes/panTro3/bed/multiz12way/4d #want comma-less species.list /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # check they are all in there: grep "^>" 4d.all.mfa # >panTro3 # >hg19 # >ponAbe2 # >rheMac2 # >calJac3 # >mm9 # >rn4 # >equCab2 # >canFam2 # >monDom5 # >galGal3 # >danRer7 # tree-commas.nh: # (((((((((panTro3,hg19),ponAbe2),rheMac2),calJac3),(mm9,rn4)),(equCab2,canFam2)),monDom5),galGal3),danRer7) # use phyloFit to create tree model (output is phyloFit.mod) time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree ../tree-commas.nh 4d.all.mfa # real 5m31.106s # real 1m21.248s mv phyloFit.mod all.mod grep TREE all.mod # (((((((((panTro3:0.00645513,hg19:0.00635619):0.0112004,ponAbe2:0.0179412):0.013987,rheMac2:0.0351289):0.0223948,calJac3:0.0639981):0.0739019,(mm9:0.0814807,rn4:0.0880817):0.249359):0.0153477,(equCab2:0.107349,canFam2:0.14282):0.0303434):0.257345,monDom5:0.328619):0.22173,galGal3:0.402012):0.635124,danRer7:0.635124); # create subset of primates-only echo "hg19 ponAbe2 rheMac2 calJac3 panTro3" > primate.lst # in this panTro3 case, all these mfa files should have all species: grep -h "^>" mfa/*.mfa | sort | uniq -c # 25 > calJac3 # 25 > canFam2 # 25 > danRer7 # 25 > equCab2 # 25 > galGal3 # 25 > hg19 # 25 > mm9 # 25 > monDom5 # 25 > panTro3 # 25 > ponAbe2 # 25 > rheMac2 # 25 > rn4 # on organisms that do not have all species in all files, the file names # need to be filtered mkdir primateMfa for FILE in mfa/*.mfa do B=`basename ${FILE}` awk ' BEGIN { primate = 0 } { if (match($0, "^>")) { primate = 1 if (match($0, "^> danRer7")) { primate = 0 } if (match($0, "^> galGal3")) { primate = 0 } if (match($0, "^> monDom5")) { primate = 0 } if (match($0, "^> canFam2")) { primate = 0 } if (match($0, "^> equCab2")) { primate = 0 } if (match($0, "^> rn4")) { primate = 0 } if (match($0, "^> mm9")) { primate = 0 } } if (primate) {print} } ' "${FILE}" > primateMfa/${B} done # verify only fish are present grep -h "^>" primateMfa/*.mfa | sort | uniq -c # 25 > calJac3 # 25 > hg19 # 25 > panTro3 # 25 > ponAbe2 # 25 > rheMac2 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat primate.lst`" primateMfa/*.mfa | sed s/"> "/">"/ \ > 4d.primate.mfa /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/tree_doctor \ --no-branchlen --prune-all-but="`cat primate.lst`" ../tree-commas.nh \ > tree-commas.primate.nh # use phyloFit to create tree model (output is phyloFit.mod) time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree ./tree-commas.primate.nh 4d.primate.mfa mv phyloFit.mod primate.mod # real 0m0.871s grep TREE primate.mod | sed 's/TREE\:\ //' > primate.12way.nh # ((((panTro3:0.00644922,hg19:0.00636812):0.01112,ponAbe2:0.0180594):0.014155,rheMac2:0.0351191):0.0431786,calJac3:0.0431786); ######################################################################### # phastCons 12-way (DONE - 2011-03-28 - Hiram) # split 12way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh swarm mkdir -p /hive/data/genomes/panTro3/bed/multiz12way/cons/SS cd /hive/data/genomes/panTro3/bed/multiz12way/cons/SS cat << '_EOF_' > mkSS.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/panTro3/bed/multiz12way/4d/mafSplit/$c.maf set WINDOWS = /hive/data/genomes/panTro3/bed/multiz12way/cons/SS/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x mkSS.csh cat << '_EOF_' > template #LOOP mkSS.csh $(root1) {check out line+ done/$(root1)} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../../4d/mafSplit | sed -e "s/.maf//" > maf.list gensub2 maf.list single template jobList para -ram=8g create jobList para try ... check ... etc # Completed: 3488 of 3488 jobs # CPU time in finished jobs: 2416s 40.27m 0.67h 0.03d 0.000 y # IO & Wait Time: 38993s 649.88m 10.83h 0.45d 0.001 y # Average job time: 12s 0.20m 0.00h 0.00d # Longest finished job: 258s 4.30m 0.07h 0.00d # Submission to last job: 440s 7.33m 0.12h 0.01d find . -type f | grep ".ss$" | wc # 3774 3774 200118 # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh swarm mkdir -p /hive/data/genomes/panTro3/bed/multiz12way/cons/run.cons cd /hive/data/genomes/panTro3/bed/multiz12way/cons/run.cons # there are going to be only one phastCons run using # this same script. It triggers off of the current working directory # $cwd:t which is the "grp" in this script. It is: # all cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set cX = $1:r set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/panTro3/bed/multiz12way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons set useGrp = "$grp.mod" ln -s $ssSrc/SS/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp popd > /dev/null mkdir -p pp/$c bed/$c sleep 4 touch pp/$c bed/$c rm -f pp/$c/$f.pp rm -f bed/$c/$f.bed mv $tmp/$f.pp pp/$c mv $tmp/$f.bed bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs find ../SS -type f | grep ".ss$" | sed -e 's/.ss$//' > ss.list wc -l ss.list # 3774 ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/panTro3/bed/multiz12way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... para -maxJob=64 push # Completed: 3774 of 3774 jobs # CPU time in finished jobs: 7480s 124.66m 2.08h 0.09d 0.000 y # IO & Wait Time: 26052s 434.20m 7.24h 0.30d 0.001 y # Average job time: 9s 0.15m 0.00h 0.00d # Longest finished job: 43s 0.72m 0.01h 0.00d # Submission to last job: 326s 5.43m 0.09h 0.00d cd /hive/data/genomes/panTro3/bed/multiz12way/cons/all find ./bed -type f | grep ".bed$" | xargs cat | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # load into database time nice -n +19 hgLoadBed panTro3 phastConsElements12way mostConserved.bed # Loaded 1425236 elements of size 5 # real 0m8.724s # on human we often try for 5% overall cov, and 70% CDS cov featureBits panTro3 -enrichment ensGene:cds phastConsElements12way # --rho 0.3 --expected-length 45 --target-coverage 0.3 # ensGene:cds 0.988%, phastConsElements12way 4.238%, both 0.726%, # cover 73.51%, enrich 17.34x # hg19 for comparison # refGene:cds 1.196%, phastConsElements46way 5.065%, # both 0.888%, cover 74.22%, enrich 14.65x # ensGene:cds 1.278%, phastConsElements46way 5.065%, # both 0.910%, cover 71.23%, enrich 14.06x # knownGene:cds 1.252%, phastConsElements46way 5.065%, # both 0.905%, cover 72.29%, enrich 14.27x # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/panTro3/bed/multiz12way/cons/all mkdir downloads find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phastCons12way.wigFix.gz # check integrity of data with wigToBigWig time (zcat downloads/phastCons12way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/panTro3/chrom.sizes \ phastCons12way.bw) > bigWig.log 2>&1 & grep real bigWig.log # real 42m37.312s grep VmPeak bigWig.log # pid=28269: VmPeak: 31261764 kB bigWigInfo phastCons12way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 5,059,052,759 # primaryIndexSize: 91,595,324 # zoomLevels: 10 # chromCount: 3488 # basesCovered: 2,768,188,151 # mean: 0.131991 # min: 0.000000 # max: 1.000000 # std: 0.226346 # encode those files into wiggle data time (zcat downloads/phastCons12way.wigFix.gz \ | wigEncode stdin phastCons12way.wig phastCons12way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 20m35.352s du -hsc *.wi? # 2.6G phastCons12way.wib # 279M phastCons12way.wig # 2.9G total # Load gbdb and database with wiggle. ln -s `pwd`/phastCons12way.wib /gbdb/panTro3/multiz12way/phastCons12way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/panTro3/multiz12way \ panTro3 phastCons12way phastCons12way.wig # real 0m56.245s # use to set trackDb.ra entries for wiggle min and max wigTableStats.sh panTro3 phastCons12way # db.table min max mean count sumData stdDev viewLimits #panTro3.phastCons12way 0 1 0.131991 2768188151 3.65375e+08 0.226346 viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=panTro3 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons12way > histogram.data 2>&1 # real 8m2.130s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Chimp panTro3 Histogram phastCons12way track" set xlabel " phastCons12way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### # run for primates only species mkdir /hive/data/genomes/panTro3/bed/multiz12way/cons/primate cd /hive/data/genomes/panTro3/bed/multiz12way/cons/primate # Using the primate.mod tree cp -p ../../4d/primate.mod ./primate.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... para -maxJob=32 push # Completed: 3774 of 3774 jobs # CPU time in finished jobs: 7416s 123.59m 2.06h 0.09d 0.000 y # IO & Wait Time: 29847s 497.46m 8.29h 0.35d 0.001 y # Average job time: 10s 0.16m 0.00h 0.00d # Longest finished job: 45s 0.75m 0.01h 0.00d # Submission to last job: 257s 4.28m 0.07h 0.00d cd /hive/data/genomes/panTro3/bed/multiz12way/cons/primate find ./bed -type f | grep ".bed$" | xargs cat | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # load into database time nice -n +19 hgLoadBed panTro3 \ phastConsElements12wayPrimate mostConserved.bed # Loaded 316935 elements of size 5 # real 0m1.845s # on human we often try for 5% overall cov, and 70% CDS cov featureBits panTro3 -enrichment refGene:cds phastConsElements12wayPrimate # --rho 0.3 --expected-length 45 --target-coverage 0.3 # refGene:cds 1.320%, phastConsElements12wayPrimate 10.576%, both 1.093% # cover 82.82%, enrich 7.83x featureBits panTro3 -enrichment ensGene:cds phastConsElements12wayPrimate # ensGene:cds 2.783%, phastConsElements12wayPrimate 10.576%, both 2.141% # cover 76.93%, enrich 7.27x # hg19 for comparison # refGene:cds 1.196%, phastConsElements46wayPrimates 3.636%, # both 0.738%, cover 61.69%, enrich 16.97x # ensGene:cds 1.278%, phastConsElements46wayPrimates 3.636%, # both 0.749%, cover 58.62%, enrich 16.12x # knownGene:cds 1.252%, phastConsElements46wayPrimates 3.636%, # both 0.747%, cover 59.66%, enrich 16.41x # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/panTro3/bed/multiz12way/cons/primate mkdir downloads find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phastCons12wayPrimate.wigFix.gz # -rw-rw-r-- 1 3927580430 Mar 28 13:18 phastCons12wayPrimate.wigFix.gz # check integrity of data with wigToBigWig time (zcat downloads/phastCons12wayPrimate.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/panTro3/chrom.sizes \ phastCons12wayPrimate.bw) > bigWig.log 2>&1 & grep real bigWig.log # real 43m40.088s grep VmP bigWig.log # pid=27966: VmPeak: 31261764 kB bigWigInfo phastCons12wayPrimate.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 6,112,593,085 # primaryIndexSize: 91,595,324 # zoomLevels: 10 # chromCount: 3488 # basesCovered: 2,768,188,151 # mean: 0.196755 # min: 0.000000 # max: 0.986000 # std: 0.223179 # encode that ascii data into wiggle data time (zcat downloads/phastCons12wayPrimate.wigFix.gz \ | wigEncode stdin phastCons12wayPrimate.wig phastCons12wayPrimate.wib) & # Converted stdin, upper limit 0.99, lower limit 0.00 # real 21m57.151s # Converted stdin, upper limit 1.00, lower limit 0.00 du -hsc *.wi? du -hsc p*wi* # 2.6G phastCons12wayPrimate.wib # 306M phastCons12wayPrimate.wig # 2.9G total # Load gbdb and database with wiggle. ln -s `pwd`/phastCons12wayPrimate.wib \ /gbdb/panTro3/multiz12way/phastCons12wayPrimate.wib nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/panTro3/multiz12way panTro3 \ phastCons12wayPrimate phastCons12wayPrimate.wig # use to set trackDb.ra entries for wiggle min and max wigTableStats.sh panTro3 phastCons12wayPrimate # db.table min max mean count sumData stdDev viewLimits #panTro3.phastCons12wayPrimate 0 0.986 0.196755 2768188151 5.44656e+08 0.223179 viewLimits=0:0.986 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=panTro3 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons12wayPrimate > histogram.data 2>&1 # real 7m32.015s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Chimp panTro3 Histogram phastCons12wayPrimate track" set xlabel " phastCons12wayPrimate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# # phyloP for 12-way (DONE - 2011-03-29 - Hiram) # run phyloP with score=LRT ssh swarm mkdir /cluster/data/panTro3/bed/multiz12way/consPhyloP cd /cluster/data/panTro3/bed/multiz12way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.544 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/all/all.mod 0.544 > all.mod # and same for primate model: grep BACKGROUND ../../cons/primate/primate.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.550 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/primate/primate.mod 0.550 > primate.mod # following the pattern from hg19 with grp: "all" and "primate" cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set file1 = $f:t set out = $2 set cName = $f:t:r set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP set tmp = $cons/tmp/$grp/$f rm -fr $tmp mkdir -p $tmp set ssSrc = "/hive/data/genomes/panTro3/bed/multiz12way/cons/SS/$f" set useGrp = "$grp.mod" ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $file1.wigFix popd > /dev/null mkdir -p $out:h sleep 4 mv $tmp/$file1.wigFix $out rm -fr $tmp rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$cName rmdir --ignore-fail-on-non-empty $cons/tmp/$grp rmdir --ignore-fail-on-non-empty $cons/tmp '_EOF_' # << happy emacs # Create list of chunks find ../../cons/SS -type f | grep ".ss$" \ | sed -e "s/.ss$//; s#^../../cons/SS/##" > ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/all cd /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # the -ram=8g will allow only one job per node to slow this down since # it would run too fast otherwise para -ram=8g create jobList para try ... check ... etc ... para -maxJob=64 push para time > run.time # Completed: 3774 of 3774 jobs # CPU time in finished jobs: 34046s 567.43m 9.46h 0.39d 0.001 y # IO & Wait Time: 58586s 976.44m 16.27h 0.68d 0.002 y # Average job time: 25s 0.41m 0.01h 0.00d # Longest finished job: 183s 3.05m 0.05h 0.00d # Submission to last job: 17800s 296.67m 4.94h 0.21d # make downloads mkdir downloads time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phyloP12way.wigFix.gz) & # real 62m9.710s # check integrity of data with wigToBigWig time (zcat downloads/phyloP12way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/panTro3/chrom.sizes \ phyloP12way.bw) > bigWig.log 2>&1 & egrep "real|VmPeak" bigWig.log # pid=6982: VmPeak: 31261764 kB # real 37m7.950s bigWigInfo phyloP12way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 4,513,393,592 # primaryIndexSize: 91,595,324 # zoomLevels: 10 # chromCount: 3488 # basesCovered: 2,768,188,151 # mean: 0.083685 # min: -7.192000 # max: 2.000000 # std: 0.595180 # encode those files into wiggle data time (zcat downloads/phyloP12way.wigFix.gz \ | wigEncode stdin phyloP12way.wig phyloP12way.wib) & # Converted stdin, upper limit 2.00, lower limit -7.19 # real 21m41.518s # Converted stdin, upper limit 2.57, lower limit -2.14 du -hsc *.wi? # 2.6G phyloP12way.wib # 283M phyloP12way.wig # 2.9G total # Load gbdb and database with wiggle. ln -s `pwd`/phyloP12way.wib /gbdb/panTro3/multiz12way/phyloP12way.wib nice hgLoadWiggle -pathPrefix=/gbdb/panTro3/multiz12way panTro3 \ phyloP12way phyloP12way.wig # use to set trackDb.ra entries for wiggle min and max wigTableStats.sh panTro3 phyloP12way # db.table min max mean count sumData stdDev viewLimits # panTro3.phyloP12way -7.192 2 0.0836853 2768188151 2.31657e+08 0.59518 # viewLimits=-2.89221:2 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=panTro3 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phyloP12way > histogram.data 2>&1 # real 7m23.514s # find out the range for the 2:5 graph grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin # Q1 0.000189 # median 0.000548 # Q3 0.001120 # average 0.001000 # min 0.000016 # max 0.025490 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Chimp panTro3 Histogram phyloP12way track" set xlabel " phyloP12way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.005] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ################## Running Primate species subset ##################### # setup run primate species subset mkdir /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/primate cd /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/primate rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para -ram=8g create jobList para try ... check ... etc ... para -maxJob=32 push para time > run.time # Completed: 3774 of 3774 jobs # CPU time in finished jobs: 15439s 257.32m 4.29h 0.18d 0.000 y # IO & Wait Time: 60285s 1004.75m 16.75h 0.70d 0.002 y # Average job time: 20s 0.33m 0.01h 0.00d # Longest finished job: 101s 1.68m 0.03h 0.00d # Submission to last job: 685s 11.42m 0.19h 0.01d # make downloads mkdir downloads time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phyloP12wayPrimate.wigFix.gz) & # real 52m32.532s # check integrity of data with wigToBigWig time (zcat downloads/phyloP12wayPrimate.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/panTro3/chrom.sizes \ phyloP12wayPrimate.bw) > bigWig.log 2>&1 & egrep "real|VmPeak" bigWig.log # pid=6954: VmPeak: 31261760 kB # real 35m35.006s bigWigInfo phyloP12wayPrimate.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 2,630,310,902 # primaryIndexSize: 91,595,324 # zoomLevels: 10 # chromCount: 3488 # basesCovered: 2,768,188,151 # mean: 0.050104 # min: -5.712000 # max: 0.286000 # std: 0.475422 # encode those files into wiggle data time (zcat downloads/phyloP12wayPrimate.wigFix.gz \ | wigEncode stdin phyloP12wayPrimate.wig phyloP12wayPrimate.wib) & # Converted stdin, upper limit 0.29, lower limit -5.71 # real 17m15.848s du -hsc *.wi? # 2.6G phyloP12wayPrimate.wib # 302M phyloP12wayPrimate.wig # 2.9G total # Load gbdb and database with wiggle. ln -s `pwd`/phyloP12wayPrimate.wib \ /gbdb/panTro3/multiz12way/phyloP12wayPrimate.wib nice hgLoadWiggle -pathPrefix=/gbdb/panTro3/multiz12way panTro3 \ phyloP12wayPrimate phyloP12wayPrimate.wig # use to set trackDb.ra entries for wiggle min and max wigTableStats.sh panTro3 phyloP12wayPrimate # db.table min max mean count sumData stdDev viewLimits # panTro3.phyloP12wayPrimate -5.712 0.286 0.0501044 2768188151 # 1.38698e+08 0.475422 viewLimits=-2.32701:0.286 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=panTro3 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phyloP12wayPrimate > histogram.data 2>&1 # real 7m37.778s # find out the range for the 2:5 graph # grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin # Q1 0.000499 # median 0.001620 # Q3 0.003616 # average 0.003650 # min 0.000000 # max 0.184629 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Chimp panTro3 Histogram phyloP12wayPrimate track" set xlabel " phyloP12wayPrimate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.005] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# # download data for 8-way (DONE - 2011-01-19 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/panTro3/phastCons12way cd /usr/local/apache/htdocs-hgdownload/goldenPath/panTro3/phastCons12way ln -s \ /hive/data/genomes/panTro3/bed/multiz12way/cons/all/downloads/phastCons12way.wigFix.gz \ ./vertebrate.phastCons12way.wigFix.gz ln -s \ /hive/data/genomes/panTro3/bed/multiz12way/cons/primate/downloads/phastCons12wayPrimate.wigFix.gz \ ./primate.phastCons12way.wigFix.gz ln -s /hive/data/genomes/panTro3/bed/multiz12way/cons/all/phastCons12way.bw \ ./vertebrate.phastCons12way.bw ln -s \ /hive/data/genomes/panTro3/bed/multiz12way/cons/primate/phastCons12wayPrimate.bw \ ./primate.phastCons12way.bw ln -s /hive/data/genomes/panTro3/bed/multiz12way/cons/primate/primate.mod \ ./primate.mod ln -s /hive/data/genomes/panTro3/bed/multiz12way/cons/all/all.mod \ ./vertebrate.mod # use a README from a recent multiz like this, this one is from danRer7 ln -s /hive/data/genomes/panTro3/bed/multiz12way/cons/README.txt . md5sum * > /hive/data/genomes/panTro3/bed/multiz12way/cons/md5sum.txt ln -s /hive/data/genomes/panTro3/bed/multiz12way/cons/md5sum.txt . mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/panTro3/phyloP12way cd /usr/local/apache/htdocs-hgdownload/goldenPath/panTro3/phyloP12way ln -s \ /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/all/downloads/phyloP12way.wigFix.gz \ ./vertebrate.phyloP12way.wigFix.gz ln -s \ /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/primate/downloads/phyloP12wayPrimate.wigFix.gz \ ./primate.phyloP12way.wigFix.gz ln -s \ /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/primate/phyloP12wayPrimate.bw \ ./primate.phyloP12way.bw ln -s \ /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/all/phyloP12way.bw \ ./vertebrate.phyloP12way.bw ln -s \ /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/run.phyloP/primate.mod \ ./primate.mod ln -s \ /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/run.phyloP/all.mod \ ./vertebrate.mod # use a README from a recent multiz like this, this one is from danRer7 ln -s /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/README.txt . md5sum * > /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/md5sum.txt ln -s /hive/data/genomes/panTro3/bed/multiz12way/consPhyloP/md5sum.txt . cd /hive/data/genomes/panTro3/bed/multiz12way # this was already done elsewhere: /cluster/bin/phast/tree_doctor --rename \ "panTro3->Chimp; hg19->Human; ponAbe2->Orangutan; rheMac2->Rhesus; calJac3->Marmoset; equCab2->Horse; mm9->Mouse; rn4->Rat; monDom5->Opossum; galGal3->Chicken; danRer7->Zebrafish; " \ panTro3.12way.nh > panTro3.commonNames.12way.nh mkdir /hive/data/genomes/panTro3/bed/multiz12way/downloads cd /hive/data/genomes/panTro3/bed/multiz12way/downloads cp -p ../anno/panTro3.12way.maf ./multiz12way.maf time gzip multiz12way.maf # real 54m14.149s mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/panTro3/multiz12way cd /usr/local/apache/htdocs-hgdownload/goldenPath/panTro3/multiz12way ln -s /hive/data/genomes/panTro3/bed/multiz12way/12way.nh . ln -s \ /hive/data/genomes/panTro3/bed/multiz12way/panTro3.commonNames.12way.nh . ln -s \ /hive/data/genomes/panTro3/bed/multiz12way/downloads/multiz12way.maf.gz . mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/panTro3/multiz12way/maf cd /usr/local/apache/htdocs-hgdownload/goldenPath/panTro3/multiz12way/maf ln -s /hive/data/genomes/panTro3/bed/multiz12way/downloads/maf/up*.gz . ln -s /hive/data/genomes/panTro3/bed/multiz12way/downloads/maf/md5sum.txt . # use a README from a recent multiz like this, this one is from danRer7 ln -s /hive/data/genomes/panTro3/bed/multiz12way/downloads/README.txt . md5sum *.nh *.gz *.txt \ > /hive/data/genomes/panTro3/bed/multiz12way/downloads/md5sum.txt ln -s /hive/data/genomes/panTro3/bed/multiz12way/downloads/md5sum.txt . ############################################################################# # hgPal downloads (DONE 2011-05-18 braney) # FASTA from 12way for refGene, ensGene ssh hgwdev screen bash rm -rf /cluster/data/panTro3/bed/multiz12way/pal mkdir /cluster/data/panTro3/bed/multiz12way/pal cd /cluster/data/panTro3/bed/multiz12way/pal for i in `cat ../species.list`; do echo $i; done > order.lst mz=multiz12way gp=refGene db=panTro3 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # real 350m10.116s # user 38m38.731s # sys 7m22.106s mz=multiz12way gp=refGene db=panTro3 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz12way gp=refGene db=panTro3 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz12way gp=ensGene db=panTro3 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # real 56m26.162s # user 10m18.260s # sys 7m32.302s mz=multiz12way gp=ensGene db=panTro3 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz12way gp=ensGene db=panTro3 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ############################################################################ # LIFTOVER TO PanTro2 (DONE - 2011-06-02 - Chin ) mkdir /hive/data/genomes/panTro3/bed/blat.panTro2.2011-06-02 cd /hive/data/genomes/panTro3/bed/blat.panTro2.2011-06-02 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug -ooc /scratch/data/panTro3/panTro3.11.ooc \ panTro3 panTro2 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /scratch/data/panTro3/panTro3.11.ooc \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ panTro3 panTro2 > do.log 2>&1 & # real real 108m18.862s # pushQ stuff cd /usr/local/apache/htdocs-hgdownload/goldenPath/panTro3/liftOver/ md5sum panTro3ToPanTro2.over.chain.gz >> md5sum.txt # check /gbdb/panTro3/liftOver for new file there to panTro2 # verify convert functions on genome-test panTro3 to panTro2 # and file is present on htdocs-hgdownload/goldenPath/panTro3/liftOver ############################################################################ # N-SCAN gene predictions (nscanGene) - (2011-06-15 markd) # obtained NSCAN predictions from Jeltje van Baren , cd /hive/data/genomes/panTro3/bed/nscan cp -r /cluster/home/jeltje/hive/panTro3/for_browser/{panTro3.gtf,panTro3.prot.fa,readme.txt} . gzip panTro3.* chmod a-w * # load track gtfToGenePred -genePredExt panTro3.gtf.gz stdout | hgLoadGenePred -genePredExt panTro3 nscanGene stdin hgPepPred panTro3 generic nscanPep panTro3.prot.fa.gz rm *.tab # chimp/panTro3/trackDb.ra, add: track nscanGene override informant Nscan-EST on panTro3 with mm9 as informant and is pseudogene masked. Uses TransMap UCSC genes track as transcript evidence. # verify top-level search spec, should produce no results an an error for egrep: hgsql -Ne 'select name from nscanGene' panTro3 | egrep -v '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |head ############################################################################ # LASTZ Gorilla GorGor3 (DONE - 2011-10-21 - Hiram) mkdir /hive/data/genomes/panTro3/bed/lastzGorGor3.2011-10-21 cd /hive/data/genomes/panTro3/bed/lastzGorGor3.2011-10-21 cat << '_EOF_' > DEF # chimp vs gorilla # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Chimp PanTro3 SEQ1_DIR=/scratch/data/panTro3/panTro3.2bit SEQ1_LEN=/scratch/data/panTro3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Frog gorGor3 SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/panTro3/bed/lastzGorGor3.2011-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & # after a manual recover following hive filesystem problems time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > cat.log 2>&1 & # 131m34s cat fb.panTro3.chainGorGor3Link.txt # 2553716156 bases of 2900529764 (88.043%) in intersection cd /hive/data/genomes/panTro3/bed ln -s lastzGorGor3.2011-10-21 lastz.gorGor3 # running the swap mkdir /hive/data/genomes/gorGor3/bed/blastz.panTro3.swap cd /hive/data/genomes/gorGor3/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzGorGor3.2011-10-21/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # about 48 minutes # an odd error: # netClass -verbose=0 -noAr noClass.net gorGor3 panTro3 gorGor3.panTro3.net # hashMustFindVal: 'chr2' not found # I can't find any chr2 in any of these files ... # This was due to a bogus simpleRepeat table in panTro3, mistakenly # loaded 09 May 2011, backpushed from the RR 27 October 2011 cat fb.gorGor3.chainPanTro3Link.txt # 2513463838 bases of 2822760080 (89.043%) in intersection # after replacing the simpleRepeat table, finish off the # axtChain/loadUp.csh script and time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzGorGor3.2011-10-21/DEF \ -continue=download -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > download.log 2>&1 & # real 25m50.173s cd /hive/data/genomes/gorGor3/bed ln -s blastz.panTro3.swap lastz.panTro3 ############################################################################ # LASTZ Mouse Mm10 (DONE - 2012-03-08 - Hiram) # this works better if Mm10 is the primary target and this panTro3 # genome is the query. It goes much faster mkdir /hive/data/genomes/panTro3/bed/lastzMm10.2012-03-08 cd /hive/data/genomes/panTro3/bed/lastzMm10.2012-03-08 cat << '_EOF_' > DEF # chimp vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Chimp PanTro3 SEQ1_DIR=/scratch/data/panTro3/panTro3.2bit SEQ1_LEN=/scratch/data/panTro3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Mouse Mm10 SEQ2_DIR=/scratch/data/mm10/mm10.2bit SEQ2_LEN=/scratch/data/mm10/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/panTro3/bed/lastzMm10.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S panTro3Mm10 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 5043m15.440s cat fb.panTro3.chainMm10Link.txt # 929073028 bases of 2900529764 (32.031%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/panTro3/bed ln -s lastzMm10.2012-03-08 lastz.mm10 mkdir /hive/data/genomes/mm10/bed/blastz.panTro3.swap cd /hive/data/genomes/mm10/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzMm10.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 68m46.408s cat fb.mm10.chainPanTro3Link.txt # 922491113 bases of 2652783500 (34.774%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s blastz.panTro3.swap lastz.panTro3 ############################################################################## # LIFTOVER TO panTro4 (DONE - 2012-07-24 - Hiram ) mkdir /hive/data/genomes/panTro3/bed/blat.panTro4.2012-07-24 cd /hive/data/genomes/panTro3/bed/blat.panTro4.2012-07-24 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug \ -ooc /hive/data/genomes/panTro3/jkStuff/panTro3.11.ooc panTro3 panTro4 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /hive/data/genomes/panTro3/jkStuff/panTro3.11.ooc \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ panTro3 panTro4 > do.log 2>&1 # real 79m23.346s #############################################################################