# for emacs: -*- mode: sh; -*- # Gorilla gorilla gorilla # ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/ # Gorilla_gorilla/gorGor3.1/ # # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=CABD02 # WGS: AAMC01000001:AAMC01190823 # ########################################################################## # Download sequence (DONE - 2011-10-11 - Hiram) mkdir -p /hive/data/genomes/gorGor3/genbank cd /hive/data/genomes/gorGor3/genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Gorilla_gorilla/gorGor3.1/*" faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz 1511717716 bases (153400444 N's 1358317272 real 1358317272 upper 0 lower) in 19549 sequences in 1 files cat << '_EOF_' > mkUcsc.pl #!/usr/bin/env perl use strict; use warnings; my %chrFrNameToChrN; open (FH, "< Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $frName) = split('\s+', $line); $chrFrNameToChrN{$frName} = $chrN; } close (FH); my $firstHeader = 1; open (UC, "|gzip -c > ucsc.agp.gz") or die "can not write to ucsc.agp.gz"; open (FA, "|gzip -c > ucsc.fa.gz") or die "can not write to uscs.fa.gz"; foreach my $key (sort keys %chrFrNameToChrN) { my $chrN = $chrFrNameToChrN{$key}; printf "%s\tchr%s\n", $key, $chrN; my $fastaFile = "Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz"; my $agpFile = "Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = ) { if ($line =~ m/^#/) { if ($firstHeader) { printf UC "%s", $line; } next; } $firstHeader = 0; $line =~ s/$key/chr${chrN}/; printf UC "%s", $line; } close (FH); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; while (my $line = ) { if ($line =~ m/^>/) { die "can not match fasta name $key in $fastaFile\n$line" if ($line !~ m/$key/); printf FA ">chr${chrN}\n"; next; } printf FA "%s", $line; } close (FH); } close (UC); close (FA); '_EOF_' # << happy emacs chmod +x mkUcsc.pl time ./mkUcsc.pl # real 18m3.244s zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz | \ sed -e 's/\.1//' | gzip -c > ucsc.unplaced.agp.gz zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \ | sed -e 's/^>.*emb|\([A-Z0-9]*\).*/>\1/' | gzip -c \ > ucsc.unplaced.fa.gz ########################################################################### # Initial genome build (DONE - 2011-10-11 - Hiram) cd /hive/data/genomes/gorGor3 cat << '_EOF_' > gorGor3.config.ra # Config parameters for makeGenomeDb.pl: db gorGor3 clade mammal genomeCladePriority 12 scientificName Gorilla gorilla gorilla commonName Gorilla assemblyDate May 2011 assemblyLabel Wellcome Trust Sanger Institute May 2011 (NCBI project 31265, GCA_000151905.1) assemblyShortLabel gorGor3.1 orderKey 25 mitoAcc NC_011120 fastaFiles /hive/data/genomes/gorGor3/genbank/ucsc.*fa.gz agpFiles /hive/data/genomes/gorGor3/genbank/ucsc.*agp.gz # qualFiles none dbDbSpeciesDir gorilla taxId 9595 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp -workhorse=hgwdev -fileServer=hgwdev \ gorGor3.config.ra > step.agp.log 2>&1 & # real 25m45.151s # take a look at the constructed AGP file to see if it has the names # desired time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \ gorGor3.config.ra > step.db.log 2>&1 & # real 86m25.408s ########################################################################## # running repeat masker (DONE - 2011-10-12 - Hiram) mkdir /hive/data/genomes/gorGor3/bed/repeatMasker cd /hive/data/genomes/gorGor3/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk gorGor3 > do.log 2>&1 & # real 1356m28.922s # real 35m31.331s cat faSize.rmsk.txt # 3029553646 bases (206793566 N's 2822760080 real 1446718337 upper # 1376041743 lower) in 46823 sequences in 1 files # %45.42 masked total, %48.75 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker featureBits -countGaps gorGor3 rmsk # 1384011521 bases of 3029553646 (45.684%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2011-10-12 - Hiram) mkdir /hive/data/genomes/gorGor3/bed/simpleRepeat cd /hive/data/genomes/gorGor3/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ gorGor3 > do.log 2>&1 & # real 3208m54.385s # There were problems in three chunks. They would not complete # after a couple days running time. They were taken manually # and broken up into smaller parts to get completed. Note # work in directories: # /hive/data/genomes/gorGor3/bed/simpleRepeat/stragglers # /hive/data/genomes/gorGor3/bed/simpleRepeat/moreStragglers time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -continue=filter -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk \ gorGor3 > filter.log 2>&1 & # real 1m1.322s cat fb.simpleRepeat # 199433151 bases of 2822760080 (7.065%) in intersection # This is quite a large intersection, highly unusual compared to # most genomes cd /hive/data/genomes/gorGor3 twoBitMask gorGor3.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed gorGor3.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa gorGor3.2bit stdout | faSize stdin > faSize.gorGor3.2bit.txt cat faSize.gorGor3.2bit.txt # 3029553646 bases (206793566 N's 2822760080 real 1445288963 upper # 1377471117 lower) in 46823 sequences in 1 files # %45.47 masked total, %48.80 masked real # *** REMEMBER *** to reset they symLink in gbdb: rm /gbdb/gorGor3/gorGor3.2bit ln -s `pwd`/gorGor3.2bit /gbdb/gorGor3/gorGor3.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2011-10-13 - Hiram) mkdir /hive/data/genomes/gorGor3/bed/gap cd /hive/data/genomes/gorGor3/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../gorGor3.unmasked.2bit > findMotif.txt 2>&1 # real 1m8.647s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits gorGor3 -not gap -bed=notGap.bed # 3026913193 bases of 3026913193 (100.000%) in intersection featureBits gorGor3 allGaps.bed notGap.bed -bed=new.gaps.bed # 204153113 bases of 3026913193 (6.745%) in intersection # Wow, that's a lot of unmarked gap # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" gorGor3 | sort -n | tail -1 # 1034 # that number is used below in the script to mark all the new gaps # with a higher ix than that cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" gorGor3 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps gorGor3 other.bed # 204153113 bases of 3029553646 (6.739%) in intersection wc -l other.bed # 423025 hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad gorGor3 otherGap other.bed # starting with this many hgsql -e "select count(*) from gap;" gorGor3 # 10836 hgsql gorGor3 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" gorGor3 # 433861 # == 10836 + 423025 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 gorGor3 nonBridged.lift -bedFile=nonBridged.bed # see example in danRer7.txt # there are non-bridged gaps here: hgsql -N -e "select bridge from gap;" gorGor3 | sort | uniq -c # 7001 no # 426860 yes ########################################################################## # gorGor3 - Gorilla - Ensembl Genes version 64 (DONE - 2011-10-12 - hiram) ssh hgwdev cd /hive/data/genomes/gorGor3 cat << '_EOF_' > gorGor3.ensGene.ra # required db variable db gorGor3 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to make sure. # delete commands take out genes that are only in patch sequence nameTranslation '/^cutchr/d; /^unplaced/d; s/^\([0-9X][0-9ab]*\)/chr\1/; s/^MT/chrM/;' '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=64 gorGor3.ensGene.ra ssh hgwdev cd /hive/data/genomes/gorGor3/bed/ensGene.64 featureBits gorGor3 ensGene # 50017329 bases of 3026913193 (1.652%) in intersection ############################################################################ # set this as defaultDb (DONE - 2011-10-13 - Hiram) # and make this the default genome for Pig hgsql -e 'update defaultDb set name="gorGor3" where name="gorGor1";' \ hgcentraltest ############################################################################ # scaffolds ctgPos2 track (DONE - 2011-10-13 - Hiram) mkdir /hive/data/genomes/gorGor3/bed/ctgPos2 cd /hive/data/genomes/gorGor3/bed/ctgPos2 cat << '_EOF_' > scaffoldToCtgPos2.pl #!/usr/bin/env perl use strict; use warnings; my %chromSizes; open (FH, "<../../chrom.sizes") or die "can not read ../../chrom.sizes"; while (my $line = ) { chomp $line; my ($chr, $size) = split('\s+', $line); $chromSizes{$chr} = $size; } close (FH); my %fragToChrom; my %fragToPosition; my %fragType; my %fragSize; my %chrType; open (FH, "hgsql -N -e 'select chrom,chromStart,chromEnd,type,frag,fragStart,fragEnd,strand from gold;' gorGor3|") or die "can not select from gold.gorGor3"; while (my $line = ) { chomp $line; my ($chrom,$chromStart,$chromEnd,$type,$frag,$fragStart,$fragEnd,$strand) = split('\t', $line); next if (exists($fragToChrom{$frag})); my $size = $fragEnd - $fragStart; $fragToChrom{$frag} = $chrom; $fragToPosition{$frag} = $chromStart; $fragType{$frag} = $type; if (exists($chrType{$chrom})) { if ($chrType{$chrom} ne $type) { printf STDERR "WARN: different types for chrom $chrom: $type NE $chrType{$chrom}\n"; } } $chrType{$chrom} = $type; $fragSize{$frag} = $size; } close (FH); open (FH, ") { next if ($line =~ m/^#Scaffold/); chomp $line; my ($scafName, $fragName) = split('\s+', $line); my $noSuffix = $fragName; $noSuffix =~ s/\.1$//; if (exists($chromSizes{$noSuffix})) { my $type = ""; $type = $chrType{$noSuffix} if (exists($chrType{$noSuffix})); $type = $fragType{$fragName} if (exists($fragType{$fragName})); if (length($type) > 0) { my $size = $chromSizes{$noSuffix}; printf "%s\t%d\t%s\t0\t%d\t%s\n", $scafName, $size, $noSuffix, $size, $type; } else { printf STDERR "can not find fragType for $fragName, $noSuffix\n"; } } elsif (exists($fragToChrom{$fragName})) { printf "%s\t%d\t%s\t0\t%d\t%s\n", $scafName, $fragSize{$fragName}, $fragToChrom{$fragName}, $fragSize{$fragName}, $fragType{$fragName}; } else { printf STDERR "can not find: $fragName -> $scafName\n"; } } close (FH); '_EOF_' # << happy emacs chmod +x scaffoldToCtgPos2.pl ./scaffoldToCtgPos2.pl > scaffold.ctgPos2.tab cut -f1 scaffold.ctgPos2.tab | awk '{print length($1)}' | sort -rn | head -1 # 23 cut -f3 scaffold.ctgPos2.tab | awk '{print length($1)}' | sort -rn | head -1 # 12 sed -e "s/20/23/; s/16/12/" $HOME/kent/src/hg/lib/ctgPos2.sql > ctgPos2.sql hgLoadSqlTab gorGor3 ctgPos2 ctgPos2.sql scaffold.ctgPos2.tab ######################################################################## # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2011-10-14 - Hiram) # Use -repMatch=650, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2822760080 / 2897316137 \) \* 1024 # ( 2822760080 / 2897316137 ) * 1024 = 997.649613 # round up to 1000 cd /hive/data/genomes/gorGor3 blat gorGor3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/gorGor3.11.ooc -repMatch=1000 # Wrote 29991 overused 11-mers to jkStuff/gorGor3.11.ooc # copy all of this stuff to the klusters: # there are non-bridged gaps, but they aren't really that, since # they are all exactly 100 bases and they are all over the normal # chromosomes: hgsql -N -e "select bridge from gap;" gorGor3 | sort | uniq -c # 7001 no # 426860 yes # cd /hive/data/genomes/gorGor3/jkStuff # gapToLift gorGor3 nonBridged.lift -bedFile=nonBridged.bed cd /hive/data/genomes/gorGor3 mkdir /hive/data/staging/data/gorGor3 cp -p jkStuff/gorGor3.11.ooc chrom.sizes \ gorGor3.2bit /hive/data/staging/data/gorGor3 # request rsync copy from cluster admin ######################################################################### # AUTO UPDATE GENBANK (DONE - 2011-10-14 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Gorilla gorilla 546 4 0 # Gorilla gorilla gorilla 1 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf # this appears that gorGor3 has almost no native est's ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add gorGor3 before gorGor2 and commit to GIT # Gorilla gorGor3.serverGenome = /hive/data/genomes/gorGor3/gorGor3.2bit gorGor3.clusterGenome = /scratch/data/gorGor3/gorGor3.2bit gorGor3.ooc = /scratch/data/gorGor3/gorGor3.11.ooc gorGor3.lift = no gorGor3.perChromTables = no gorGor3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} gorGor3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} gorGor3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} gorGor3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} gorGor3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} gorGor3.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} gorGor3.downloadDir = gorGor3 gorGor3.refseq.mrna.native.load = yes gorGor3.refseq.mrna.xeno.load = yes gorGor3.refseq.mrna.xeno.loadDesc = yes gorGor3.genbank.est.native.load = no gorGor3.upstreamGeneTbl = xenoRefGene # end of section added to etc/genbank.conf git commit -m "adding gorGor3 Gorilla" etc/genbank.conf git push make etc-update # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains # gorGor genome information, if this is a new species, need to add stuff # there ssh hgwdev # used to do this on "genbank" machine screen # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial gorGor3 & # var/build/logs/2011.10.17-09:14:51.gorGor3.initalign.log # real 788m42.989s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad gorGor3 & # logFile: var/dbload/hgwdev/logs/2011.10.21-08:59:38.dbload.log # real 22m45.028s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add gorGor3 to: etc/align.dbs etc/hgwdev.dbs git commit -m "daily update for gorGor3 replaces daily update for gorGor2" \ etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # BLATSERVERS ENTRY (DONE - 2007-11-04 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("gorGor3", "blatx", "17798", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("gorGor3", "blatx", "17799", "0", "1");' \ hgcentraltest # test it with some sequence ########################################################################### # lastz swap Human hg19 (DONE - 2011-10-21 - Hiram) # original alignment cd /hive/data/genomes/hg19/bed/lastzGorGor3.2011-10-17 cat fb.hg19.chainGorGor3Link.txt # 2603997992 bases of 2897316137 (89.876%) in intersection # running the swap mkdir /hive/data/genomes/gorGor3/bed/blastz.hg19.swap cd /hive/data/genomes/gorGor3/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzGorGor3.2011-10-17/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 69m39.685s cat fb.gorGor3.chainHg19Link.txt # 2571797450 bases of 2822760080 (91.109%) in intersection cd /hive/data/genomes/gorGor3/bed ln -s blastz.hg19.swap lastz.hg19 ############################################################################ # LASTZ Gorilla NomLeu1 (DONE - 2011-10-21,27 - Hiram) mkdir /hive/data/genomes/gorGor3/bed/lastzNomLeu1.2011-10-21 cd /hive/data/genomes/gorGor3/bed/lastzNomLeu1.2011-10-21 cat << '_EOF_' > DEF # Gorilla vs Gibbon # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Gorilla gorGor3 SEQ1_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ1_LEN=/scratch/data/gorGor3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=30 # QUERY: Gibbon nomLeu1 SEQ2_DIR=/scratch/data/nomLeu1/nomLeu1.2bit SEQ2_LEN=/scratch/data/nomLeu1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/gorGor3/bed/lastzNomLeu1.2011-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & # after recovering manually after a hive filesystem outage time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > cat.log 2>&1 & # Elapsed time: 160m12s cat fb.gorGor3.chainNomLeu1Link.txt # 2356321697 bases of 2822760080 (83.476%) in intersection cd /hive/data/genomes/gorGor3/bed ln -s lastzNomLeu1.2011-10-21 lastz.nomLeu1 # running the swap mkdir /hive/data/genomes/nomLeu1/bed/blastz.gorGor3.swap cd /hive/data/genomes/nomLeu1/bed/blastz.gorGor3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/gorGor3/bed/lastzNomLeu1.2011-10-21/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 74m9.554s cat fb.nomLeu1.chainGorGor3Link.txt # 2318822567 bases of 2756591777 (84.119%) in intersection cd /hive/data/genomes/nomLeu1/bed ln -s blastz.gorGor3.swap lastz.gorGor3 ############################################################################ # LASTZ Baboon papHam1 (DONE - 2011-10-21 - Hiram) mkdir /hive/data/genomes/gorGor3/bed/lastzPapHam1.2011-10-21 cd /hive/data/genomes/gorGor3/bed/lastzPapHam1.2011-10-21 cat << '_EOF_' > DEF # Gorilla vs Gibbon # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Gorilla gorGor3 SEQ1_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ1_LEN=/scratch/data/gorGor3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=30 # QUERY: Gibbon papHam1 SEQ2_DIR=/scratch/data/papHam1/papHam1.2bit SEQ2_LEN=/scratch/data/papHam1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/gorGor3/bed/lastzPapHam1.2011-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & # after finishing the lastz run manually due to hive filesystem problems time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > cat.log 2>&1 & # real 205m20.060s cat fb.gorGor3.chainPapHam1Link.txt # 2242232481 bases of 2822760080 (79.434%) in intersection cd /hive/data/genomes/gorGor3/bed ln -s lastzPapHam1.2011-10-21 lastz.papHam1 # running the swap - DONE - 2011-09-21 mkdir /hive/data/genomes/papHam1/bed/blastz.gorGor3.swap cd /hive/data/genomes/papHam1/bed/blastz.gorGor3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/gorGor3/bed/lastzPapHam1.2011-10-21/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 499m31.793s cat fb.papHam1.chainGorGor3Link.txt # 2217454777 bases of 2741867288 (80.874%) in intersection cd /hive/data/genomes/papHam1/bed ln -s blastz.gorGor3.swap lastz.gorGor3 ############################################################################ # LASTZ Bushbaby otoGar1 (DONE - 2011-10-21,31 - Hiram) mkdir /hive/data/genomes/gorGor3/bed/lastzOtoGar1.2011-10-21 cd /hive/data/genomes/gorGor3/bed/lastzOtoGar1.2011-10-21 cat << '_EOF_' > DEF # Gorilla vs Bushbaby # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Gorilla gorGor3 SEQ1_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ1_LEN=/scratch/data/gorGor3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=30 # QUERY: Bushbaby otoGar1 SEQ2_DIR=/scratch/data/otoGar1/otoGar1.2bit SEQ2_LEN=/scratch/data/otoGar1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/gorGor3/bed/lastzOtoGar1.2011-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & # after manually finishing the batch after hive filesystem outage time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > cat.log 2>&1 & # real 134m17.379s cat fb.gorGor3.chainOtoGar1Link.txt # 339201275 bases of 2822760080 (12.017%) in intersection cd /hive/data/genomes/gorGor3/bed ln -s lastzOtoGar1.2011-10-21 lastz.otoGar1 # running the swap mkdir /hive/data/genomes/otoGar1/bed/blastz.gorGor3.swap cd /hive/data/genomes/otoGar1/bed/blastz.gorGor3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/gorGor3/bed/lastzOtoGar1.2011-10-21/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 60m56.773s cat fb.otoGar1.chainGorGor3Link.txt # 336379843 bases of 1969052059 (17.083%) in intersection cd /hive/data/genomes/otoGar1/bed ln -s blastz.otoGar1.swap lastz.otoGar1 ############################################################################ # lastz swap Chimp rheMac2 (DONE - 2011-10-23 - Hiram) # original alignment cd /hive/data/genomes/rheMac2/bed/lastzGorGor3.2011-10-21 cat fb.rheMac2.chainGorGor3Link.txt # 2175264883 bases of 2646704109 (82.188%) in intersection # running the swap - DONE - 2011-09-21 mkdir /hive/data/genomes/gorGor3/bed/blastz.rheMac2.swap cd /hive/data/genomes/gorGor3/bed/blastz.rheMac2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rheMac2/bed/lastzGorGor3.2011-10-21/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 85m16.618s cat fb.gorGor3.chainRheMac2Link.txt # 2236867094 bases of 2822760080 (79.244%) in intersection cd /hive/data/genomes/gorGor3/bed ln -s blastz.rheMac2.swap lastz.rheMac2 ############################################################################ # lastz swap Orangutan ponAbe2 (DONE - 2011-10-23 - Hiram) # original alignment to Orangutan cd /hive/data/genomes/ponAbe2/bed/lastzGorGor3.2011-10-21 cat fb.ponAbe2.chainGorGor3Link.txt # 2584628377 bases of 3093572278 (83.548%) in intersection # running the swap mkdir /hive/data/genomes/gorGor3/bed/blastz.ponAbe2.swap cd /hive/data/genomes/gorGor3/bed/blastz.ponAbe2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ponAbe2/bed/lastzGorGor3.2011-10-21/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 86m19.901s cat fb.gorGor3.chainPonAbe2Link.txt # 2437562544 bases of 2822760080 (86.354%) in intersection cd /hive/data/genomes/gorGor3/bed ln -s blastz.ponAbe2.swap lastz.ponAbe2 ############################################################################ # lastz swap Marmoset calJac3 (DONE - 2011-10-27 - Hiram) # the original alignment cd /hive/data/genomes/calJac3/bed/lastzGorGor3.2011-10-21 cat fb.calJac3.chainGorGor3Link.txt # 1928768328 bases of 2752505800 (70.073%) in intersection # running the swap mkdir /hive/data/genomes/gorGor3/bed/blastz.calJac3.swap cd /hive/data/genomes/gorGor3/bed/blastz.calJac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/calJac3/bed/lastzGorGor3.2011-10-21/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 86m1.559s cat fb.gorGor3.chainCalJac3Link.txt # 1933784568 bases of 2822760080 (68.507%) in intersection cd /hive/data/genomes/gorGor3/bed ln -s blastz.calJac3.swap lastz.calJac3 ############################################################################ # lastz swap Chimp panTro3 (DONE - 2011-10-27 - Hiram) # original alignment cd /hive/data/genomes/panTro3/bed/lastzGorGor3.2011-10-21 cat fb.panTro3.chainGorGor3Link.txt # 2553716156 bases of 2900529764 (88.043%) in intersection # running the swap mkdir /hive/data/genomes/gorGor3/bed/blastz.panTro3.swap cd /hive/data/genomes/gorGor3/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzGorGor3.2011-10-21/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # about 48 minutes # an odd error: # netClass -verbose=0 -noAr noClass.net gorGor3 panTro3 gorGor3.panTro3.net # hashMustFindVal: 'chr2' not found # I can't find any chr2 in any of these files ... # This was due to a bogus simpleRepeat table in panTro3, mistakenly # loaded 09 May 2011, backpushed from the RR 27 October 2011 cat fb.gorGor3.chainPanTro3Link.txt # 2513463838 bases of 2822760080 (89.043%) in intersection # after replacing the simpleRepeat table, finish off the # axtChain/loadUp.csh script and time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzGorGor3.2011-10-21/DEF \ -continue=download -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > download.log 2>&1 & # real 25m50.173s cd /hive/data/genomes/gorGor3/bed ln -s blastz.panTro3.swap lastz.panTro3 ############################################################################ # LASTZ Bushbaby micMur1 (DONE - 2011-11-01,03 - Hiram) mkdir /hive/data/genomes/gorGor3/bed/lastzMicMur1.2011-11-01 cd /hive/data/genomes/gorGor3/bed/lastzMicMur1.2011-11-01 cat << '_EOF_' > DEF # Gorilla vs Mouse lemur # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Gorilla gorGor3 SEQ1_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ1_LEN=/scratch/data/gorGor3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=30 # QUERY: Mouse lemur micMur1 SEQ2_DIR=/scratch/data/micMur1/micMur1.2bit SEQ2_LEN=/scratch/data/micMur1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/gorGor3/bed/lastzMicMur1.2011-11-01 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & # Elapsed time: 2962m15s # real 2962m15.467s cat fb.gorGor3.chainMicMur1Link.txt # 646647849 bases of 2822760080 (22.908%) in intersection cd /hive/data/genomes/gorGor3/bed ln -s lastzMicMur1.2011-11-01 lastz.micMur1 # running the swap mkdir /hive/data/genomes/micMur1/bed/blastz.gorGor3.swap cd /hive/data/genomes/micMur1/bed/blastz.gorGor3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/gorGor3/bed/lastzMicMur1.2011-11-01/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 139m3.414s cat fb.micMur1.chainGorGor3Link.txt # 640110238 bases of 1852394361 (34.556%) in intersection cd /hive/data/genomes/micMur1/bed ln -s blastz.gorGor3.swap lastz.gorGor3 ############################################################################ # LASTZ Tarsier tarSyr1 (DONE - 2011-11-01,06 - Hiram) mkdir /hive/data/genomes/gorGor3/bed/lastzTarSyr1.2011-11-01 cd /hive/data/genomes/gorGor3/bed/lastzTarSyr1.2011-11-01 cat << '_EOF_' > DEF # Gorilla vs Tarsier # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Gorilla gorGor3 SEQ1_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ1_LEN=/scratch/data/gorGor3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Tarsier tarSyr1 SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/gorGor3/bed/lastzTarSyr1.2011-11-01 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & # real 5869m15.429s cat fb.gorGor3.chainTarSyr1Link.txt # 549926460 bases of 2822760080 (19.482%) in intersection cd /hive/data/genomes/gorGor3/bed ln -s lastzTarSyr1.2011-11-01 lastz.tarSyr1 # running the swap mkdir /hive/data/genomes/tarSyr1/bed/blastz.gorGor3.swap cd /hive/data/genomes/tarSyr1/bed/blastz.gorGor3.swap # swarm kluster off-line, hence bigHub=encodek, although it isn't # used for this time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/gorGor3/bed/lastzTarSyr1.2011-11-01/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=encodek \ > swap.log 2>&1 & # real 599m8.506s cat fb.tarSyr1.chainGorGor3Link.txt # 567036277 bases of 2768536343 (20.481%) in intersection cd /hive/data/genomes/tarSyr1/bed ln -s blastz.tarSyr1.swap lastz.tarSyr1 ############################################################################ # Construct download files (DONE - 2011-11-14 - Hiram) cd /hive/data/genomes/gorGor3 time makeDownloads.pl gorGor3 -workhorse=hgwdev > downloads.log 2>&1 & # real 24m7.903s # you must edit the */README files in ./goldenPath/ # verify joinerCheck is good and gorGor3 is in all.joiner joinerCheck -keys -database=gorGor3 \ $HOME/kent/src/hg/makeDb/schema/all.joiner ############################################################################ # after downloads, construct pushQ (DONE - 2011-11-14 - Hiram) mkdir /hive/data/genomes/gorGor3/pushQ cd /hive/data/genomes/gorGor3/pushQ time makePushQSql.pl gorGor3 > gorGor3.sql 2> stderr.log # check stderr.log for anything not categorized scp -p gorGor3.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < gorGor3.sql # after loading, view the subQ in the WEB interface, go through each # item and verify it can measure the tables and files ############################################################################ ## 11-Way Multiz (WORKING - 2011-11-09 - Hiram) ssh hgwdev mkdir /hive/data/genomes/gorGor3/bed/multiz11way cd /hive/data/genomes/gorGor3/bed/multiz11way # species list: cd /hive/data/genomes/gorGor3/bed ls -d lastz.* | sed -e "s/lastz.//" | sort | xargs echo # calJac3 hg19 micMur1 nomLeu1 otoGar1 panTro3 papHam1 ponAbe2 rheMac2 tarSyr1 # plus this one # gorGor3 # All distances remain as specified in the 51way.nh /cluster/bin/phast/tree_doctor --prune-all-but \ calJac3,hg19,micMur1,nomLeu1,otoGar1,panTro3,papHam1,ponAbe2,rheMac2,tarSyr1,gorGor3 \ $HOME/kent/src/hg/utils/phyloTrees/51way.nh > 11way.nh # what that looks like: cat 11way.nh # ((((((((hg19:0.006700,panTro3:0.006667):0.002250,gorGor3:0.008825):0.009680, # ponAbe2:0.018318):0.007000,nomLeu1:0.010000):0.007000,(rheMac2:0.007853, # papHam1:0.007637):0.029618):0.021965,calJac3:0.066131):0.057590, # tarSyr1:0.137823):0.011062,(micMur1:0.092749,otoGar1:0.129725):0.035463); # rearrange to get gorGor3 on top: cat << '_EOF_' > gorGor3.11way.nh (((((((gorGor3:0.008825,(hg19:0.006700,panTro3:0.006667):0.002250):0.009680, ponAbe2:0.018318):0.007000,nomLeu1:0.010000):0.007000,(rheMac2:0.007853, papHam1:0.007637):0.029618):0.021965,calJac3:0.066131):0.057590, tarSyr1:0.137823):0.011062,(micMur1:0.092749,otoGar1:0.129725):0.035463); '_EOF_' # << happy emacs # convert to species names /cluster/bin/phast/tree_doctor --rename \ "gorGor3->Gorilla; hg19->Human; panTro3->Chimp; ponAbe2->Orangutan; nomLeu1->Gibbon; rheMac2->Rhesus; papHam1->Baboon; calJac3->Marmoset; tarSyr1->Tarsier; micMur1->Mouse_lemur; otoGar1->Bushbaby" \ gorGor3.11way.nh > gorGor3.commonNames.11way.nh # (((((((Gorilla:0.008825,(Human:0.006700,Chimp:0.006667):0.002250):0.009680, # Orangutan:0.018318):0.007000,Gibbon:0.010000):0.007000, # (Rhesus:0.007853,Baboon:0.007637):0.029618):0.021965, # Marmoset:0.066131):0.057590,Tarsier:0.137823):0.011062, # (Mouse_lemur:0.092749,Bushbaby:0.129725):0.035463); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/gorGor3_11way.png /cluster/bin/phast/all_dists gorGor3.11way.nh > 11way.distances.txt # Use this output to create the table below grep -i gorGor3 11way.distances.txt | sort -k3,3n # gorGor3 panTro3 0.017742 # gorGor3 hg19 0.017775 # gorGor3 nomLeu1 0.035505 # gorGor3 ponAbe2 0.036823 # gorGor3 papHam1 0.069760 # gorGor3 rheMac2 0.069976 # gorGor3 calJac3 0.120601 # gorGor3 tarSyr1 0.249883 # gorGor3 micMur1 0.251334 # gorGor3 otoGar1 0.288310 # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # featureBits chainLink measures # chainGorGor3Link chain linearGap # distance on gorGor3 on other minScore # 1 0.018 - chimp panTro3 (% 89.04) (% 88.04) 5000 medium # 2 0.018 - human hg19 (% 91.11) (% 89.88) 5000 medium # 3 0.036 - gibbon nomLeu1 (% 83.48) (% 84.12) 5000 medium # 4 0.037 - orangutan ponAbe2 (% 86.35) (% 83.55) 5000 medium # 5 0.070 - baboon papHam1 (% 79.43) (% 80.87) 5000 medium # 6 0.070 - rhesus rheMac2 (% 79.24) (% 82.19) 5000 medium # 7 0.121 - marmoset calJac3 (% 68.51) (% 70.07) 5000 medium # 8 0.250 - tarsier tarSyr1 (% 19.48) (% 20.48) 5000 medium # 9 0.251 - mouse lemur micMur1(% 22.91) (% 34.56) 5000 medium # 10 0.288 - bushbaby otoGar1 (% 12.02) (% 17.08) 5000 medium # None of this concern for distances matters in building the first step, the # maf files. # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ gorGor3.11way.nh > tmp.nh echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # split the maf files into a set of hashed named files # this hash named split keeps the same chr/contig names in the same # named hash file. mkdir /hive/data/genomes/gorGor3/bed/multiz11way/mafSplit cd /hive/data/genomes/gorGor3/bed/multiz11way/mafSplit for D in `sed -e "s/gorGor3 //" ../species.list` do echo "${D}" mkdir $D cd $D echo "mafSplit -byTarget -useHashedName=10 /dev/null . ../../../lastz.${D}/axtChain/gorGor3.${D}.synNet.maf.gz" mafSplit -byTarget -useHashedName=8 /dev/null . \ ../../../lastz.${D}/axtChain/gorGor3.${D}.synNet.maf.gz cd .. done # construct a list of all possible maf file names. # they do not all exist in each of the species directories find . -type f | wc -l # 1071 find . -type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list wc -l maf.list # 206 maf.list mkdir /hive/data/genomes/gorGor3/bed/multiz11way/splitRun cd /hive/data/genomes/gorGor3/bed/multiz11way/splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = gorGor3 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /hive/data/genomes/gorGor3/bed/multiz11way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c.maf $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/gorGor3/bed/multiz11way/splitRun/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs ln -s ../../mafSplit/maf.list maf.list ssh swarm cd /hive/data/genomes/gorGor3/bed/multiz11way/splitRun/run # the tac reverses the list to get the small jobs first gensub2 maf.list single template stdout | tac > jobList # the ram=8g will allow only one job on a node to slow them down para -ram=8g create jobList # Completed: 206 of 206 jobs # CPU time in finished jobs: 436154s 7269.23m 121.15h 5.05d 0.014 y # IO & Wait Time: 38798s 646.64m 10.78h 0.45d 0.001 y # Average job time: 2306s 38.43m 0.64h 0.03d # Longest finished job: 37853s 630.88m 10.51h 0.44d # Submission to last job: 37884s 631.40m 10.52h 0.44d # assemble into a single maf file cd /hive/data/genomes/gorGor3/bed/multiz11way head -1 splitRun/maf/001.maf > multiz11way.maf for F in splitRun/maf/*.maf do egrep -v "^#" ${F} done >> multiz11way.maf tail -1 splitRun/maf/001.maf >> multiz11way.maf # -rw-rw-r-- 1 25370474447 Dec 1 15:33 multiz11way.maf # Load into database ssh hgwdev cd /hive/data/genomes/gorGor3/bed/multiz11way mkdir /gbdb/gorGor3/multiz11way ln -s `pwd`/multiz11way.maf /gbdb/gorGor3/multiz11way cd /scratch/tmp time nice -n +19 hgLoadMaf gorGor3 multiz11way # Indexing and tabulating /gbdb/gorGor3/multiz11way/multiz11way.maf # Loaded 12607889 mafs in 1 files from /gbdb/gorGor3/multiz11way # real 12m16.842s time nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 gorGor3 multiz11waySummary \ /gbdb/gorGor3/multiz11way/multiz11way.maf # Created 1350677 summary blocks from 84367135 components # and 12607889 mafs from /gbdb/gorGor3/multiz11way/multiz11way.maf # real 17m3.441s wc -l multiz11way*.tab # 12607889 multiz11way.tab # 1350677 multiz11waySummary.tab # 13958566 total rm multiz11way*.tab ####################################################################### # GAP ANNOTATE MULTIZ11WAY MAF AND LOAD TABLES (DONE - 2011-12-06 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. mkdir -p /hive/data/genomes/gorGor3/bed/multiz11way/anno/mafSplit cd /hive/data/genomes/gorGor3/bed/multiz11way/anno/mafSplit time mafSplit -byTarget -useFullSequenceName \ /dev/null . ../../multiz11way.maf # real 11m32.688s ls | wc # 390 390 6073 cd /hive/data/genomes/gorGor3/bed/multiz11way/anno # a couple of these do not yet have an N.bed file # you can tell by making the symlinks below and see which are missing for DB in gorGor3 nomLeu1 panTro3 do cd /hive/data/genomes/${DB} twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed done for DB in `cat ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL ssh swarm cd /hive/data/genomes/gorGor3/bed/multiz11way/anno mkdir result cat << '_EOF_' > template #LOOP mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/gorGor3/gorGor3.2bit {check out exists+ result/$(file1)} #ENDLOOP '_EOF_' # << happy emacs ls mafSplit/*.maf > maf.list gensub2 maf.list single template jobList # limit jobs to one per node with the ram=8g requirement para -ram=8g create jobList para try ... check ... push ... # Completed: 390 of 390 jobs # CPU time in finished jobs: 2331s 38.85m 0.65h 0.03d 0.000 y # IO & Wait Time: 2261s 37.69m 0.63h 0.03d 0.000 y # Average job time: 12s 0.20m 0.00h 0.00d # Longest finished job: 156s 2.60m 0.04h 0.00d # Submission to last job: 5234s 87.23m 1.45h 0.06d # verify all result files have some content, look for 0 size files: find . -type f -size 0 # should see none # combine into one file head -q -n 1 result/chr1.maf > gorGor3.11way.maf for F in result/*.maf do grep -h -v "^#" ${F} done >> gorGor3.11way.maf # these maf files do not have the end marker, this does nothing: # tail -q -n 1 result/GL172637.maf >> gorGor3.11way.maf # How about an official end marker: echo "##eof maf" >> gorGor3.11way.maf # -rw-rw-r-- 1 30974842562 Dec 6 09:47 gorGor3.11way.maf # Load into database rm /gbdb/gorGor3/multiz11way/multiz11way.maf # remove old symlink ln -s `pwd`/gorGor3.11way.maf /gbdb/gorGor3/multiz11way/multiz11way.maf cd /scratch/tmp time nice -n +19 hgLoadMaf gorGor3 multiz11way # Loaded 12899544 mafs in 1 files from /gbdb/gorGor3/multiz11way # real 15m10.964s time nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 gorGor3 multiz11waySummary \ /gbdb/gorGor3/multiz11way/multiz11way.maf # Created 1350677 summary blocks from 84367135 components # and 12899544 mafs from /gbdb/gorGor3/multiz11way/multiz11way.maf # real 17m1.129s wc -l multiz11way*.tab # 12899544 multiz11way.tab # 1350677 multiz11waySummary.tab # 14250221 total rm multiz11way*.tab ######################################################################### # MULTIZ9WAY MAF FRAMES (DONE - 2011-12-06 - Hiram) ssh hgwdev mkdir /hive/data/genomes/gorGor3/bed/multiz11way/frames cd /hive/data/genomes/gorGor3/bed/multiz11way/frames mkdir genes # ensGene produces more annotation on hg19 than using knownGene # So, using ensGene on all species possible. papHam1 has only # xenoRefGene. (see 46-way in hg19 for other gene table options) gorGor3 ensGene hg19 ensGene panTro3 ensGene ponAbe2 ensGene nomLeu1 ensGene rheMac2 ensGene papHam1 xenoRefGene calJac3 ensGene tarSyr1 ensGene micMur1 ensGene otoGar1 ensGene #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using ensGene for: for qDB in gorGor3 hg19 panTro3 ponAbe2 nomLeu1 rheMac2 calJac3 tarSyr1 micMur1 otoGar1 do echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB} hgsql -N -e "select * from ensGene" ${qDB} | cut -f 2-11 \ | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz done # xenoRefGene for papHam1 for qDB in papHam1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${qDB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${qDB}.tmp.gz mv /scratch/tmp/${qDB}.tmp.gz genes/$qDB.gp.gz echo "${qDB} done" done # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/calJac3.gp.gz: 20843 # genes/gorGor3.gp.gz: 20759 # genes/hg19.gp.gz: 20869 # genes/micMur1.gp.gz: 16240 # genes/nomLeu1.gp.gz: 18012 # genes/otoGar1.gp.gz: 15388 # genes/panTro3.gp.gz: 18414 # genes/papHam1.gp.gz: 26648 # genes/ponAbe2.gp.gz: 19895 # genes/rheMac2.gp.gz: 21049 # genes/tarSyr1.gp.gz: 13615 ssh hgwdev cd /hive/data/genomes/gorGor3/bed/multiz11way/frames time (cat ../anno/gorGor3.11way.maf \ | nice -n +19 genePredToMafFrames gorGor3 stdin stdout \ `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.list` \ | gzip > multiz11way.mafFrames.gz) XXX - running - Tue Dec 6 10:40:25 PST 2011 # real 14m22.770s # verify there are frames on everything: zcat multiz11way.mafFrames.gz | awk '{print $4}' | sort | uniq -c # 207882 calJac3 # 181071 gorGor3 # 190188 hg19 # 121668 micMur1 # 187716 nomLeu1 # 77602 otoGar1 # 168287 panTro3 # 199624 papHam1 # 195263 ponAbe2 # 194473 rheMac2 # 46204 tarSyr1 ssh hgwdev cd /hive/data/genomes/gorGor3/bed/multiz11way/frames time hgLoadMafFrames gorGor3 multiz11wayFrames multiz11way.mafFrames.gz # real 0m21.425s ############################################################################ # creating upstream mafs (DONE - 2011-12-06 - Hiram) ssh hgwdev mkdir /hive/data/genomes/gorGor3/goldenPath/multiz11way cd /hive/data/genomes/gorGor3/goldenPath/multiz11way # run this bash script cat << '_EOF_' > mkUpstream.sh DB=gorGor3 GENE=ensGene NWAY=multiz11way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.${GENE}.maf" echo "featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout" featureBits ${DB} ${GENE}:upstreamAll:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | mafFrags ${DB} ${NWAY} stdin stdout \ -orgs=/hive/data/genomes/${DB}/bed/${NWAY}/species.list \ | gzip -c > upstream${S}.${GENE}.maf.gz echo "done upstream${S}.${GENE}.maf.gz" done '_EOF_' # << happy emacs chmod +x mkUpstream.sh time ./mkUpstream.sh # real 73m36.223s # FIXUP/VERIFY the genbank.conf file to indicate this gene choice for the # upstream automatic generation, in etc/genbank.conf: # gorGor3.upstreamGeneTbl = ensGene # gorGor3.upstreamMaf = multiz11way /hive/data/genomes/gorGor3/bed/multiz11way/species.list ######################################################################### # Phylogenetic tree from 11-way (DONE - 2011-12-06 - Hiram) mkdir /hive/data/genomes/gorGor3/bed/multiz11way/4d cd /hive/data/genomes/gorGor3/bed/multiz11way/4d # the split annotated maf's are in: ../anno/result/*.maf cd /hive/data/genomes/gorGor3/bed/multiz11way/4d # using ensGene for gorGor3, only transcribed genes and nothing # from the randoms and other misc. hgsql gorGor3 -Ne \ "select * from ensGene WHERE cdsEnd > cdsStart;" | cut -f 2-20 \ | egrep -v "chrM" > ensGene.gp cut -f2 ensGene.gp | sort | uniq -c | sort -rn | sed -e "s/^/#/" # 2685 chr1 # 2082 chr5 # 1947 chr19 # 1740 chr11 # 1441 chr12 # 1432 chr3 # 1407 chr6 # 1274 chr7 # 1255 chr16 # 1112 chr4 # 1105 chrX # 1069 chr10 # 1061 chr9 # 998 chr8 # 924 chr15 # 879 chr14 # 860 chr2b # 843 chr2a # 798 chr17 # 728 chr20 # 629 chr22 # 451 chr13 # 417 chr18 # 291 chr21 genePredSingleCover ensGene.gp stdout | sort > ensGeneNR.gp cut -f2 ensGeneNR.gp | sort | uniq -c | sort -rn | sed -e "s/^/#/" # 2052 chr1 # 1514 chr5 # 1359 chr19 # 1326 chr11 # 1133 chr3 # 1074 chr12 # 1063 chr6 # 1008 chr7 # 906 chr16 # 873 chrX # 834 chr4 # 824 chr9 # 805 chr10 # 770 chr8 # 686 chr14 # 667 chr2a # 660 chr2b # 660 chr15 # 602 chr17 # 567 chr20 # 477 chr22 # 345 chr13 # 310 chr18 # 234 chr21 ssh encodek mkdir /hive/data/genomes/gorGor3/bed/multiz11way/4d/run cd /hive/data/genomes/gorGor3/bed/multiz11way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name # An interesting discovery with this one, ensGene chrom names for # chr2A and chr2B were actually chr2a and chr2b in the table. Hence # the new 'grep -i' and sed /i options to ignore case on the chrom names. cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/gorGor3/bed/multiz11way" set c = $1 set infile = $r/anno/result/$2 set outfile = $3 cd /scratch/tmp grep -i -w $c $r/4d/ensGeneNR.gp | sed -e "s/\t$c\t/\tgorGor3.$c\t/i" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile else echo "" > $r/4d/run/$outfile endif rm -f $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh ls -1S /hive/data/genomes/gorGor3/bed/multiz11way/anno/result/*.maf \ | sed -e "s#.*multiz11way/anno/result/##" \ | grep "^chr" | grep -v "chrM" > maf.list cat << '_EOF_' > template #LOOP 4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs # the tac puts the quick jobs at the front gensub2 maf.list single template stdout | tac > jobList para create jobList para try ... check para -maxJob=5 push para time # Completed: 24 of 24 jobs # CPU time in finished jobs: 1807s 30.11m 0.50h 0.02d 0.000 y # IO & Wait Time: 143s 2.39m 0.04h 0.00d 0.000 y # Average job time: 81s 1.35m 0.02h 0.00d # Longest finished job: 159s 2.65m 0.04h 0.00d # Submission to last job: 351s 5.85m 0.10h 0.00d # combine mfa files ssh hgwdev cd /hive/data/genomes/gorGor3/bed/multiz11way/4d # check for empty files, size 0 and size 1: ls og mfa | sort -k3n | head # if necessary, remove them to avoid garbage: find ./mfa -type f -size 0 | xargs rm -f # most interesting, this did not identify files of size 1: # find ./mfa -type f -size 1 ls -og mfa | awk '$3 == 1' > empty.list sed -e "s#^#mfa/##" empty.list | xargs rm -f #want comma-less species.list /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # check they are all in there: grep "^>" 4d.all.mfa # >gorGor3 # >hg19 # >panTro3 # >ponAbe2 # >nomLeu1 # >rheMac2 # >papHam1 # >calJac3 # >tarSyr1 # >micMur1 # >otoGar1 # verify tree-commas.nh: cat ../tree-commas.nh # ((((((((hg19,panTro3),gorGor3),ponAbe2),nomLeu1),(rheMac2,papHam1)),calJac3),tarSyr1),(micMur1,otoGar1)) # use phyloFit to create tree model (output is phyloFit.mod) time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree ../tree-commas.nh 4d.all.mfa # real 0m31.111s mv phyloFit.mod all.mod grep TREE all.mod # TREE: ((((((((hg19:0.00655096,panTro3:0.00683982):0.00221957,gorGor3:0.00896404):0.00969343,ponAbe2:0.0189386):0.00347097,nomLeu1:0.0222691):0.0120442,(rheMac2:0.00799147,papHam1:0.00804243):0.0296086):0.0218314,calJac3:0.0696516):0.0520857,tarSyr1:0.111445):0.0205225,(micMur1:0.0856019,otoGar1:0.119397):0.0205225); ######################################################################### # phastCons 11-way (DONE - 2011-12-06 - Hiram) # split 11way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh encodek mkdir -p /hive/data/genomes/gorGor3/bed/multiz11way/cons/SS cd /hive/data/genomes/gorGor3/bed/multiz11way/cons/SS mkdir result done cat << '_EOF_' > mkSS.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/gorGor3/bed/multiz11way/anno/result/$c.maf set WINDOWS = /hive/data/genomes/gorGor3/bed/multiz11way/cons/SS/result/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x mkSS.csh cat << '_EOF_' > template #LOOP mkSS.csh $(root1) {check out line+ done/$(root1)} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../../anno/result | sed -e "s/.maf//" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # Completed: 390 of 390 jobs # CPU time in finished jobs: 1920s 31.99m 0.53h 0.02d 0.000 y # IO & Wait Time: 3390s 56.51m 0.94h 0.04d 0.000 y # Average job time: 14s 0.23m 0.00h 0.00d # Longest finished job: 268s 4.47m 0.07h 0.00d # Submission to last job: 457s 7.62m 0.13h 0.01d # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh encodek mkdir -p /hive/data/genomes/gorGor3/bed/multiz11way/cons/run.cons cd /hive/data/genomes/gorGor3/bed/multiz11way/cons/run.cons # there are going to be only one phastCons run using # this same script. It triggers off of the current working directory # $cwd:t which is the "grp" in this script. It is: # all cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set cX = $1:r set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/gorGor3/bed/multiz11way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons set useGrp = "$grp.mod" ln -s $ssSrc/SS/result/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp popd > /dev/null mkdir -p pp/$c bed/$c sleep 4 touch pp/$c bed/$c rm -f pp/$c/$f.pp rm -f bed/$c/$f.bed mv $tmp/$f.pp pp/$c mv $tmp/$f.bed bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs find ../SS -type f | grep ".ss$" | sed -e 's/.ss$//' > ss.list wc -l ss.list # 670 ss.list # Create parasol batch and run it # run for all species mkdir /hive/data/genomes/gorGor3/bed/multiz11way/cons/all cd /hive/data/genomes/gorGor3/bed/multiz11way/cons/all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para create jobList para try ... check ... para push # Completed: 670 of 670 jobs # CPU time in finished jobs: 6960s 115.99m 1.93h 0.08d 0.000 y # IO & Wait Time: 4481s 74.69m 1.24h 0.05d 0.000 y # Average job time: 17s 0.28m 0.00h 0.00d # Longest finished job: 34s 0.57m 0.01h 0.00d # Submission to last job: 625s 10.42m 0.17h 0.01d cd /hive/data/genomes/gorGor3/bed/multiz11way/cons/all find ./bed -type f | grep ".bed$" | xargs cat | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # load into database time nice -n +19 hgLoadBed gorGor3 phastConsElements11way mostConserved.bed # Loaded 877501 elements of size 5 # real 0m5.516s # on human we often try for 5% overall cov, and 70% CDS cov # most bets are off here for that goal, these alignments are too few # and too far between featureBits gorGor3 -enrichment ensGene:cds phastConsElements11way # --rho 0.3 --expected-length 45 --target-coverage 0.3 # ensGene:cds 1.155%, phastConsElements11way 5.466%, both 0.703%, cover # 60.85%, enrich 11.13x # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/gorGor3/bed/multiz11way/cons/all mkdir downloads find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phastCons11way.wigFix.gz # check integrity of data with wigToBigWig time (zcat downloads/phastCons11way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/gorGor3/chrom.sizes \ phastCons11way.bw) > bigWig.log 2>&1 & tail bigWig.log # pid=5590: VmPeak: 29110252 kB # real 32m38.228s bigWigInfo phastCons11way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 5,663,784,804 # primaryIndexSize: 85,840,704 # zoomLevels: 10 # chromCount: 390 # basesCovered: 2,583,634,871 # mean: 0.202661 # min: 0.000000 # max: 0.999000 # std: 0.256356 # encode those files into wiggle data time (zcat downloads/phastCons11way.wigFix.gz \ | wigEncode stdin phastCons11way.wig phastCons11way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 13m19.014s du -hsc *.wi? # 2.5G phastCons11way.wib # 263M phastCons11way.wig # 2.7G total # Load gbdb and database with wiggle. ln -s `pwd`/phastCons11way.wib /gbdb/gorGor3/multiz11way/phastCons11way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/gorGor3/multiz11way \ gorGor3 phastCons11way phastCons11way.wig # real 0m33.431s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh gorGor3 phastCons11way # db.table min max mean count sumData stdDev viewLimits #gorGor3.phastCons11way 0 0.999 0.202661 2583634871 5.23603e+08 0.256356 viewLimits=0:0.999 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=gorGor3 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons11way > histogram.data 2>&1 # real 3m20.501s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Gorilla gorGor3 Histogram phastCons11way track" set xlabel " phastCons11way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# # phyloP for 11-way (DONE - 2011-12-07 - Hiram) # run phyloP with score=LRT ssh encodek mkdir /cluster/data/gorGor3/bed/multiz11way/consPhyloP cd /cluster/data/gorGor3/bed/multiz11way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.560 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/all/all.mod 0.560 > all.mod # verify, the BACKGROUND should now be paired up: grep BACK all.mod # BACKGROUND: 0.220000 0.280000 0.280000 0.220000 # following the pattern from panTro3 with grp: "all" (no other grp) cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set file1 = $f:t set out = $2 set cName = $f:t:r set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/gorGor3/bed/multiz11way/consPhyloP set tmp = $cons/tmp/$grp/$f rm -fr $tmp mkdir -p $tmp set ssSrc = "/hive/data/genomes/gorGor3/bed/multiz11way/cons/SS/result/$f" set useGrp = "$grp.mod" ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $file1.wigFix popd > /dev/null mkdir -p $out:h sleep 4 mv $tmp/$file1.wigFix $out rm -fr $tmp rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$cName rmdir --ignore-fail-on-non-empty $cons/tmp/$grp rmdir --ignore-fail-on-non-empty $cons/tmp '_EOF_' # << happy emacs chmod +x doPhyloP.csh # Create list of chunks find ../../cons/SS/result -type f | grep ".ss$" \ | sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list # make sure the list looks good wc -l ss.list # 670 ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/gorGor3/bed/multiz11way/consPhyloP/all cd /hive/data/genomes/gorGor3/bed/multiz11way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # the -ram=8g will allow only one job per node to slow this down since # it would run too fast otherwise. Either run on one of the small # klusters or use the -ram=8g on the para create para -ram=8g create jobList para try ... check ... etc ... para -maxJob=64 push para time > run.time # Completed: 670 of 670 jobs # CPU time in finished jobs: 13286s 221.44m 3.69h 0.15d 0.000 y # IO & Wait Time: 4513s 75.21m 1.25h 0.05d 0.000 y # Average job time: 27s 0.44m 0.01h 0.00d # Longest finished job: 65s 1.08m 0.02h 0.00d # Submission to last job: 3612s 60.20m 1.00h 0.04d # make downloads mkdir downloads time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phyloP11way.wigFix.gz) & # real 25m23.098s # check integrity of data with wigToBigWig time (zcat downloads/phyloP11way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/gorGor3/chrom.sizes \ phyloP11way.bw) > bigWig.log 2>&1 & egrep "real|VmPeak" bigWig.log # pid=18030: VmPeak: 29110248 kB # real 34m13.586s bigWigInfo phyloP11way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 3,124,719,586 # primaryIndexSize: 85,840,704 # zoomLevels: 10 # chromCount: 390 # basesCovered: 2,583,634,871 # mean: 0.091549 # min: -9.604000 # max: 0.649000 # std: 0.507211 # encode those files into wiggle data time (zcat downloads/phyloP11way.wigFix.gz \ | wigEncode stdin phyloP11way.wig phyloP11way.wib) & # Converted stdin, upper limit 0.65, lower limit -9.60 # real 13m30.224s du -hsc *.wi? # 2.5G phyloP11way.wib # 264M phyloP11way.wig # 2.7G total # Load gbdb and database with wiggle. ln -s `pwd`/phyloP11way.wib /gbdb/gorGor3/multiz11way/phyloP11way.wib nice hgLoadWiggle -pathPrefix=/gbdb/gorGor3/multiz11way gorGor3 \ phyloP11way phyloP11way.wig # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh gorGor3 phyloP11way # db.table min max mean count sumData stdDev viewLimits # gorGor3.phyloP11way -9.604 0.649 0.0915485 2583634871 2.36528e+08 # 0.507211 viewLimits=-2.44451:0.649 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=gorGor3 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phyloP11way > histogram.data 2>&1 # real 4m48.739s # find out the range for the 2:5 graph grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin # Q1 0.000216 # median 0.000864 # Q3 0.001834 # average 0.001565 # min 0.000000 # max 0.065102 # count 639 # total 1.000000 # standard deviation 0.003182 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Gorilla gorGor3 Histogram phyloP11way track" set xlabel " phyloP11way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.007] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# # download data for 11-way (DONE - 2011-12-08 - Hiram) # use a README from the most recent multiz like this # this one is from xenTro3 cd /hive/data/genomes/gorGor3/bed/multiz11way/cons cp -p /hive/data/genomes/xenTro3/bed/multiz9way/cons/README.txt . # need to edit this to finish it off md5sum phastCons11way.bw phastCons11way.wigFix.gz primate.mod > md5sum.txt mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/gorGor3/phastCons11way cd /usr/local/apache/htdocs-hgdownload/goldenPath/gorGor3/phastCons11way ln -s \ /hive/data/genomes/gorGor3/bed/multiz11way/cons/all/downloads/phastCons11way.wigFix.gz \ ./phastCons11way.wigFix.gz ln -s /hive/data/genomes/gorGor3/bed/multiz11way/cons/all/phastCons11way.bw \ ./phastCons11way.bw ln -s /hive/data/genomes/gorGor3/bed/multiz11way/cons/all/all.mod \ ./primate.mod ln -s /hive/data/genomes/gorGor3/bed/multiz11way/cons/README.txt . ln -s /hive/data/genomes/gorGor3/bed/multiz11way/cons/md5sum.txt . # use a README from the most recent multiz like this # this one is from xenTro3 cd /hive/data/genomes/gorGor3/bed/multiz11way/consPhyloP cp /hive/data/genomes/xenTro3/bed/multiz9way/consPhyloP/README.txt . # need to edit this to finish it off md5sum README.txt phastCons11way.bw phastCons11way.wigFix.gz primate.mod \ > md5sum.txt mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/gorGor3/phyloP11way cd /usr/local/apache/htdocs-hgdownload/goldenPath/gorGor3/phyloP11way ln -s \ /hive/data/genomes/gorGor3/bed/multiz11way/consPhyloP/all/downloads/phyloP11way.wigFix.gz \ ./phyloP11way.wigFix.gz ln -s \ /hive/data/genomes/gorGor3/bed/multiz11way/consPhyloP/all/phyloP11way.bw \ ./phyloP11way.bw ln -s \ /hive/data/genomes/gorGor3/bed/multiz11way/consPhyloP/run.phyloP/all.mod \ ./primate.mod ln -s /hive/data/genomes/gorGor3/bed/multiz11way/consPhyloP/README.txt . ln -s /hive/data/genomes/gorGor3/bed/multiz11way/consPhyloP/md5sum.txt . mkdir /hive/data/genomes/gorGor3/bed/multiz11way/downloads cd /hive/data/genomes/gorGor3/bed/multiz11way/downloads # use a README from the most recent multiz like this # this one is from xenTro3 cp -p /hive/data/genomes/xenTro3/bed/multiz9way/downloads/README.txt . # need to edit this README.txt to finish it off # you can find this common name translation above in the section that # began the multiz11way work: grep TREE ../cons/all/all.mod | sed -e "s/TREE: //" > gorGor3.11way.nh /cluster/bin/phast/tree_doctor --rename \ "gorGor3->Gorilla; hg19->Human; panTro3->Chimp; ponAbe2->Orangutan; nomLeu1->Gibbon; rheMac2->Rhesus; papHam1->Baboon; calJac3->Marmoset; tarSyr1->Tarsier; micMur1->Mouse_lemur; otoGar1->Bushbaby" \ gorGor3.11way.nh > gorGor3.commonNames.11way.nh time cp -p ../anno/gorGor3.11way.maf ./multiz11way.maf # real 13m33.422s # -rw-rw-r-- 1 30974842562 Dec 6 09:47 multiz11way.maf time gzip multiz11way.maf # real 40m29.483s # -rw-rw-r-- 1 4187214672 Dec 6 09:47 multiz11way.maf.gz md5sum *.nh *.gz *.txt > md5sum.txt mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/gorGor3/multiz11way cd /usr/local/apache/htdocs-hgdownload/goldenPath/gorGor3/multiz11way ln -s /hive/data/genomes/gorGor3/bed/multiz11way/downloads/gorGor3.11way.nh . ln -s \ /hive/data/genomes/gorGor3/bed/multiz11way/downloads/gorGor3.commonNames.11way.nh . ln -s \ /hive/data/genomes/gorGor3/bed/multiz11way/downloads/multiz11way.maf.gz . ln -s /hive/data/genomes/gorGor3/bed/multiz11way/downloads/README.txt . ln -s /hive/data/genomes/gorGor3/bed/multiz11way/downloads/md5sum.txt . ############################################################################# # hgPal downloads (DONE - Hiram - 2011-12-08) # FASTA from 11way for ensGene ssh hgwdev rm -rf /hive/data/genomes/gorGor3/bed/multiz11way/pal mkdir /hive/data/genomes/gorGor3/bed/multiz11way/pal cd /hive/data/genomes/gorGor3/bed/multiz11way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list mz=multiz11way gp=ensGene db=gorGor3 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /hive/data/genomes/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # real 115m38.599s mz=multiz11way gp=ensGene db=gorGor3 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz # about 1h 10m rm -rf exonAA exonNuc ppredAA ppredNuc & md5sum $gp.$mz.exonAA.fa.gz $gp.$mz.exonNuc.fa.gz > md5sum.txt # we're only distributing exons at the moment mz=multiz11way gp=ensGene db=gorGor3 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ # obtain a copy of a most-recent README.txt file cp /hive/data/genomes/hg19/bed/multiz46way/pal/README.txt . # edit to bring up to date for gorGor3 ln -s `pwd`/README.txt $pd/ ########################################################################## # LASTZ Rhesus rheMac3 Swap (DONE - 2012-04-26 - Chin) cd /hive/data/genomes/rheMac3/bed/blastzGorGor3.2012-04-18 cat fb.rheMac3.chainGorGor3Link.txt # 2279633399 bases of 2639145830 (86.378%) in intersection # and, for the swap mkdir /hive/data/genomes/gorGor3/bed/blastz.rheMac3.swap cd /hive/data/genomes/gorGor3/bed/blastz.rheMac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rheMac3/bed/blastzGorGor3.2012-04-18/DEF \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 289m9.479s cat fb.gorGor3.chainRheMac3Link.txt # 2335269818 bases of 2822760080 (82.730%) in intersection cd /hive/data/genomes/gorGor3/bed ln -s blastz.rheMac3.swap lastz.rheMac3 #############################################################################