# for emacs: -*- mode: sh; -*- # This file describes browser build for the mm10 # Mus musculus (mouse) # DATE: 07-Dec-2011 # ORGANISM: Mus musculus # TAXID: 10090 # ASSEMBLY LONG NAME: Genome Reference Consortium Mouse Build 38 # ASSEMBLY SHORT NAME: GRCm38 # ASSEMBLY SUBMITTER: Genome Reference Consortium # ASSEMBLY TYPE: Haploid + alternate loci # NUMBER OF ASSEMBLY-UNITS: 16 # ASSEMBLY ACCESSION: GCA_000001635.2 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Mus_musculus/GRCm38/ # Genome ID: # http://www.ncbi.nlm.nih.gov/genome/52 # Taxonomy: # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090 # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=39442 # GRC information # http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/mouse/ # Mitochondrial sequence: # http://www.ncbi.nlm.nih.gov/bioproject/13767 # C57BL/6J sequence: # http://www.ncbi.nlm.nih.gov/bioproject/51977 # Finishing project: # http://www.ncbi.nlm.nih.gov/bioproject/20689 # Assembly ID: 327618 # http://www.ncbi.nlm.nih.gov/genome/assembly/327618/ # Celera Assembly # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAHY00 ############################################################################# # fetch sequence from genbank (DONE - 2012-01-30 - Hiram) mkdir -p /hive/data/genomes/mm10/genbank cd /hive/data/genomes/mm10/genbank rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Mus_musculus/GRCm38/ ./ # measure sequence to be used here faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz \ Primary_Assembly/unlocalized_scaffolds/FASTA/*.fa.gz \ non-nuclear/assembled_chromosomes/FASTA/chrMT.fa.gz # 2730871774 bases (78088274 N's 2652783500 real 2652783500 upper 0 lower) # in 66 sequences in 29 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (gi|371559559|gb|JH584295.1|) max 195471971 # (gi|371561115|gb|CM000994.2|) median 184189 
#############################################################################
# fixup names for UCSC standards (DONE - 2012-02-06 - Hiram)
    mkdir /hive/data/genomes/mm10/ucsc
    cd /hive/data/genomes/mm10/ucsc

    ########################  Assembled Chromosomes
    # toUcsc.pl reads chr2acc (chromosome name, accession), then for every
    # accession rewrites that chromosome's AGP and FASTA so the sequence
    # is named chrN instead of the GenBank accession.
    cat << '_EOF_' > toUcsc.pl
#!/bin/env perl

use strict;
use warnings;

my %accToChr;

open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or
        die "can not read Primary_Assembly/assembled_chromosomes/chr2acc";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
}
close (FH);

foreach my $acc (keys %accToChr) {
    my $chrN =  $accToChr{$acc};
    print "$acc $accToChr{$acc}\n";
    open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz";
    open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp";
    while (my $line = <FH>) {
        if ($line =~ m/^#/) {
            print UC $line;
        } else {
            # \Q..\E: the accession contains a '.', match it literally
            $line =~ s/^\Q$acc\E/chr${chrN}/;
            print UC $line;
        }
    }
    close (FH);
    close (UC);
    open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz";
    open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa";
    while (my $line = <FH>) {
        if ($line =~ m/^>/) {
            printf UC ">chr${chrN}\n";
        } else {
            print UC $line;
        }
    }
    close (FH);
    close (UC);
}
'_EOF_'
    # << happy emacs
    chmod +x toUcsc.pl
    time ./toUcsc.pl
    # real    0m53.256s
    faSize chr*.fa
    # 2725521370 bases (77999939 N's 2647521431 real 2647521431 upper 0
    #   lower) in 21 sequences in 21 files
    # Total size: mean 129786731.9 sd 33408399.1 min 61431566 (chr19)
    #   max 195471971 (chr1) median 124902244

    ########################  Unplaced scaffolds
    # unplaced.pl renames each unplaced scaffold accession ACC.1 to
    # chrUn_ACC in both the AGP and the FASTA.
    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, ">unplaced.agp") or die "can not write to unplaced.agp";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        # strip the ".1" version from the first column only, so a ".1"
        # in a later column can never be removed by mistake
        $line =~ s/^([^\s.]+)\.1(?=\s)/$1/;
        printf UC "chrUn_%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, ">unplaced.fa") or die "can not write to unplaced.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        # header carries the accession as ...gb|ACC.1|...; keep only ACC
        $line =~ s/.*gb\|//;
        $line =~ s/\.1\|.*//;
        printf UC ">chrUn_$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl
    time ./unplaced.pl
    # real    0m0.119s
    # make sure none of the names got to be over 31 characters long:
    grep -v "^#" unplaced.agp | cut -f1 | sort | uniq -c | sort -rn
    # not much in that sequence:
    faSize unplaced.fa
    # 803895 bases (62411 N's 741484 real 741484 upper 0 lower)
    #   in 22 sequences in 1 files
    # Total size: mean 36540.7 sd 21518.0 min 20208 (chrUn_GL456368)
    #   max 114452 (chrUn_JH584304) median 28772

    ########## chrM
    zcat ../genbank/non-nuclear/assembled_chromosomes/FASTA/chrMT.fa.gz \
        | sed -e "s/^>.*/>chrM/" > chrM.fa
    zcat ../genbank/non-nuclear/assembled_chromosomes/AGP/chrMT.comp.agp.gz \
        | sed -e "s/^AY172335\.1/chrM/" > chrM.agp

    ########################  Unlocalized scaffolds
    # unlocalized.pl reads unlocalized.chr2scaf (chromosome, accession) and,
    # per chromosome, rewrites the unlocalized AGP and FASTA so each
    # scaffold ACC.1 becomes chrN_ACC_random.  It dies if an accession is
    # not version .1 or is listed under a different chromosome.
    cat << '_EOF_' > unlocalized.pl
#!/bin/env perl

use strict;
use warnings;

my %accToChr;
my %chrNames;

open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or
        die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
    $chrNames{$chrN} += 1;
}
close (FH);

foreach my $chrN (keys %chrNames) {
    my $agpFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz";
    my $fastaFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz";
    open (FH, "zcat $agpFile|") or die "can not read $agpFile";
    open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp";
    while (my $line = <FH>) {
        if ($line =~ m/^#/) {
            print UC $line;
        } else {
            chomp $line;
            my (@a) = split('\t', $line);
            my $acc = $a[0];
            my $accNo1 = $acc;
            # literal ".1" only; any other version is left in place and
            # caught by the check on the next line
            $accNo1 =~ s/\.1$//;
            die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
            die "ERROR: chrN $chrN not correct for $acc"
                if ($accToChr{$acc} ne $chrN);
            my $ucscName = "chr${chrN}_${accNo1}_random";
            printf UC "%s", $ucscName;
            for (my $i = 1; $i < scalar(@a); ++$i) {
                printf UC "\t%s", $a[$i];
            }
            printf UC "\n";
        }
    }
    close (FH);
    close (UC);
    printf "chr%s\n", $chrN;
    open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
    open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa";
    while (my $line = <FH>) {
        if ($line =~ m/^>/) {
            chomp $line;
            my $acc = $line;
            $acc =~ s/.*gb\|//;
            $acc =~ s/\|.*//;
            my $accNo1 = $acc;
            $accNo1 =~ s/\.1$//;
            die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
            die "ERROR: chrN $chrN not correct for $acc"
                if ($accToChr{$acc} ne $chrN);
            my $ucscName = "chr${chrN}_${accNo1}_random";
            printf UC ">$ucscName\n";
        } else {
            print UC $line;
        }
    }
    close (FH);
    close (UC);
}
'_EOF_'
    # << happy emacs
    chmod +x unlocalized.pl
    time ./unlocalized.pl
    # real    0m0.430s
    faSize chr*_random.fa
    # 4530210 bases (25924 N's 4504286 real 4504286 upper 0 lower)
    #   in 22 sequences in 6 files
    # Total size: mean 205918.6 sd 184688.0 min 1976 (chr4_JH584295_random)
    #   max 953012 (chr5_JH584299_random) median 191905

    # verify none of the names are longer than 31 characters:
    grep -h -v "^#" chr*_random.agp | cut -f1 | sort | uniq -c | sort -nr

    # compress all these fasta and agp files:
    gzip *.fa *.agp

    # verify all the sequence is still here after all this rigamarole:
    time faSize *.fa.gz
    # 2730871774 bases (78088274 N's 2652783500 real 2652783500 upper 0
    #   lower) in 66 sequences in 29 files
    # Total size: mean 41376845.1 sd 63617337.3 min 1976
    #   (chr4_JH584295_random) max 195471971 (chr1)
median 184189 ############################################################################# # Initial browser build (DONE - 2012-01-06 - Hiram) cd /hive/data/genomes/mm10 cat << '_EOF_' > mm10.config.ra # Config parameters for makeGenomeDb.pl: db mm10 clade mammal genomeCladePriority 40 scientificName Mus musculus commonName Mouse assemblyDate Dec. 2011 assemblyLabel Genome Reference Consortium Mouse Build 38 (GCA_000001635.2) assemblyShortLabel GRCm38 orderKey 1209 mitoAcc none fastaFiles /hive/data/genomes/mm10/ucsc/*.fa.gz agpFiles /hive/data/genomes/mm10/ucsc/*.agp.gz dbDbSpeciesDir mouse taxId 10090 ncbiAssemblyId 327618 ncbiAssemblyName GRCm38 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp mm10.config.ra > agp.log 2>&1 # real 3m4.568s # check the end of agp.log to verify it is OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \ -continue=db mm10.config.ra > db.log 2>&1 # real 20m51.374s # verify the end of db.log indicates successful ############################################################################# # running repeat masker (DONE - 2012-02-06 - Hiram) mkdir /hive/data/genomes/mm10/bed/repeatMasker cd /hive/data/genomes/mm10/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek mm10 > do.log 2>&1 & # real 609m48.767s cat faSize.rmsk.txt # 2730871774 bases (78088274 N's 2652783500 real 1456094545 upper # 1196688955 lower) in 66 sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 (chr1) median 184189 # %43.82 masked total, %45.11 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker time featureBits -countGaps mm10 rmsk # 1196694219 bases of 2730871774 (43.821%) in intersection # real 0m30.460s # why is it different than the faSize above ? 
# because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-02-06 - Hiram) mkdir /hive/data/genomes/mm10/bed/simpleRepeat cd /hive/data/genomes/mm10/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ mm10 > do.log 2>&1 & # real 16m35.603s # batch failed, one job failed: # ./TrfRun.csh /hive/data/genomes/mm10/TrfPart/062/062.lst.bed # which is the chrM sequence - it has no simple repeats # create an empty output file result: touch /hive/data/genomes/mm10/TrfPart/062/062.lst.bed # go to encodek and create the run.time file to signal this step is done cd /hive/data/genomes/mm10/bed/simpleRepeat/run.cluster para time > run.time # Completed: 70 of 71 jobs # Crashed: 1 jobs # CPU time in finished jobs: 13103s 218.38m 3.64h 0.15d 0.000 y # IO & Wait Time: 163s 2.72m 0.05h 0.00d 0.000 y # Average job time: 190s 3.16m 0.05h 0.00d # Longest finished job: 392s 6.53m 0.11h 0.00d # Submission to last job: 894s 14.90m 0.25h 0.01d # continue procedure: time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ -continue=filter mm10 > filter.log 2>&1 & # real 1m20.021s cat fb.simpleRepeat # 92161833 bases of 2652783500 (3.474%) in intersection # when RepeatMasker is done, add this mask to the sequence: cd /hive/data/genomes/mm10 twoBitMask mm10.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed mm10.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa mm10.2bit stdout | faSize stdin > faSize.mm10.2bit.txt cat faSize.mm10.2bit.txt # 2730871774 bases (78088274 N's 2652783500 real 1454267808 upper # 1198515692 lower) in 66 sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 
#   (chr1) median 184189
    # %43.89 masked total, %45.18 masked real

    # set SymLink in gbdb to this masked sequence
    rm /gbdb/mm10/mm10.2bit
    ln -s `pwd`/mm10.2bit /gbdb/mm10/mm10.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#   (DONE - 2012-02-06 - Hiram)
    mkdir /hive/data/genomes/mm10/bed/gap
    cd /hive/data/genomes/mm10/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
        -strand=+ ../../mm10.unmasked.2bit > findMotif.txt 2>&1
    # real    1m0.372s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits -countGaps mm10 -not gap -bed=notGap.bed
    # 2658879040 bases of 2730871774 (97.364%) in intersection
    # real    0m13.067s
    time featureBits -countGaps mm10 allGaps.bed notGap.bed -bed=new.gaps.bed
    # 6095540 bases of 2730871774 (0.223%) in intersection
    # real    0m15.177s

    # what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" mm10 | sort -n | tail -1
    # 54

    # mkGap.pl turns each new gap interval into a gap-table row of type
    # 'other', bridged 'yes', numbering the rows upward from the highest
    # ix already present in the gap table.
    # NOTE(review): the open/while lines had been lost from this copy and
    # are reconstructed; new.gaps.bed is taken as the input because its
    # base count (6095540) matches other.bed below - confirm.
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" mm10 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    # columns: chrom, start, end, ix, N, size, type, bridge
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom,
        $chromStart, $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    wc -l other.bed
    # 384
    featureBits -countGaps mm10 other.bed
    # 6095540 bases of 2730871774 (0.223%) in intersection
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
        -noLoad mm10 otherGap other.bed
    # verify no overlap with gap table:
    time featureBits -countGaps mm10 gap other.bed
    # 0 bases of 2730871774 (0.000%) in intersection
    # real    0m1.281s
    # verify no errors before adding to the table:
    time gapToLift -minGap=1 mm10 nonBridged.before.lift \
        -bedFile=nonBridged.before.bed > before.gapToLift.txt 2>&1 &
    # real    0m7.205s
    # check for warnings in
before.gapToLift.txt, should be empty: # -rw-rw-r-- 1 1633 Jan 6 15:20 before.gapToLift.txt # it indicates that there are telomere's adjacent to centromere's # and heterochromatin # starting with this many: hgsql -e "select count(*) from gap;" mm10 # 302 hgsql mm10 -e 'load data/genomes local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" mm10 # 686 # == 302 + 384 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 mm10 nonBridged.lift -bedFile=nonBridged.bed #same set of warnings as before, telomere's centromere's and heterochromatin # there should be no errors or other output, checked bridged gaps: hgsql -N -e "select bridge from gap;" mm10 | sort | uniq -c # 191 no # 495 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-02-06 - Hiram) mkdir /hive/data/genomes/mm10/bed/windowMasker cd /hive/data/genomes/mm10/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev mm10 > do.log 2>&1 & # real 167m12.012s # Masking statistics twoBitToFa mm10.wmsk.2bit stdout | faSize stdin # 2730871774 bases (78088274 N's 2652783500 real 1686407708 upper # 966375792 lower) in 66 sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 (chr1) median 184189 # %35.39 masked total, %36.43 masked real twoBitToFa mm10.wmsk.sdust.2bit stdout | faSize stdin # 2730871774 bases (78088274 N's 2652783500 real 1670424648 upper # 982358852 lower) in 66 sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 (chr1) median 184189 # %35.97 masked total, %37.03 masked real hgLoadBed mm10 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 12655947 elements of size 3 featureBits -countGaps mm10 windowmaskerSdust # 1060447084 bases of 2730871774 (38.832%) in intersection # 
eliminate the gaps from the masking featureBits mm10 -not gap -bed=notGap.bed # 2652783500 bases of 2652783500 (100.000%) in intersection time nice -n +19 featureBits mm10 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 982358852 bases of 2652783500 (37.031%) in intersection # real 1m42.449s # reload track to get it clean hgLoadBed mm10 windowmaskerSdust cleanWMask.bed.gz # Loaded 12655987 elements of size 4 time featureBits -countGaps mm10 windowmaskerSdust # 982358852 bases of 2730871774 (35.972%) in intersection # real 1m13.889s # do *not* need to mask with this clean result since RepeatMasker # does a very good job here. Using RM masking instead. zcat cleanWMask.bed.gz \ | twoBitMask ../../mm10.unmasked.2bit stdin \ -type=.bed mm10.cleanWMSdust.2bit twoBitToFa mm10.cleanWMSdust.2bit stdout | faSize stdin \ > mm10.cleanWMSdust.faSize.txt cat mm10.cleanWMSdust.faSize.txt # how much does this window masker and repeat masker overlap: time featureBits -countGaps mm10 rmsk windowmaskerSdust # 753614881 bases of 2730871774 (27.596%) in intersection # real 1m42.691s # RM by itself: time featureBits -countGaps mm10 rmsk # 1196694219 bases of 2730871774 (43.821%) in intersection # real 0m30.460s ############################################################################# # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2012-02-07 - Hiram) ssh encodek mkdir /hive/data/genomes/mm10/bed/linSpecRep cd /hive/data/genomes/mm10/bed/linSpecRep # split the RM output by chromosome name into separate files mkdir rmsk dateRepeats head -3 ../repeatMasker/mm10.sorted.fa.out > rmsk.header.txt headRest 3 ../repeatMasker/mm10.sorted.fa.out \ | splitFileByColumn -ending=.out -col=5 -head=rmsk.header.txt stdin rmsk ls -1S rmsk/* > rmOut.list cat << '_EOF_' > mkLSR #!/bin/csh -fe rm -f dateRepeats/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus /scratch/data/genomes/RepeatMasker/DateRepeats \ $1 -query mouse -comp human -comp rat -comp dog -comp cow 
mv $1_homo-sapiens_rattus_canis-lupus-familiaris_bos-taurus dateRepeats '_EOF_' # << happy emacs chmod +x mkLSR cat << '_EOF_' > template #LOOP ./mkLSR $(path1) {check out line+ dateRepeats/$(file1)_homo-sapiens_rattus_canis-lupus-familiaris_bos-taurus} #ENDLOOP '_EOF_' # << happy emacs gensub2 rmOut.list single template jobList para create jobList para try ... check ... push ... etc... para time # Completed: 66 of 66 jobs # CPU time in finished jobs: 1743s 29.05m 0.48h 0.02d 0.000 y # IO & Wait Time: 190s 3.16m 0.05h 0.00d 0.000 y # Average job time: 29s 0.49m 0.01h 0.00d # Longest finished job: 65s 1.08m 0.02h 0.00d # Submission to last job: 160s 2.67m 0.04h 0.00d mkdir notInHuman notInRat notInDog notInCow for F in dateRepeats/chr*.out_homo-sapiens* do B=`basename ${F}` B=${B/.out*/} echo $B /cluster/bin/scripts/extractRepeats 1 ${F} > \ notInHuman/${B}.out.spec /cluster/bin/scripts/extractRepeats 2 ${F} > \ notInRat/${B}.out.spec /cluster/bin/scripts/extractRepeats 3 ${F} > \ notInDog/${B}.out.spec /cluster/bin/scripts/extractRepeats 4 ${F} > \ notInCow/${B}.out.spec done # notInDog, and notInCow ended up being identical. # The notInRat and notInHuman are different # To check identical find . 
-name "*.out.spec" | \ while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \ | sort -k1,1n | sort -t"/" -k3,3 > check.same # this produces a count of 2 for the sums for Cow and Dog, all the same egrep "Cow|Dog" check.same | awk '{print $1}' | sort | uniq -c | sort -rn # this does not produce a count of 2 for the sums for Cow and Human egrep "Cow|Human" check.same | awk '{print $1}' | sort | uniq -c | sort -rn # Copy to data/genomes staging for cluster replication mkdir /hive/data/genomes/staging/data/genomes/mm10 rsync -a -P ./notInRat/ /hive/data/genomes/staging/data/genomes/mm10/notInRat/ rsync -a -P ./notInHuman/ /hive/data/genomes/staging/data/genomes/mm10/notInHuman/ rsync -a -P ./notInCow/ /hive/data/genomes/staging/data/genomes/mm10/notInOthers/ # We also need the nibs for the lastz runs with lineage specific repeats mkdir /hive/data/genomes/mm10/nib cd /hive/data/genomes/mm10 cut -f1 chrom.sizes | while read C do twoBitToFa -seq=${C} mm10.2bit stdout | faToNib -softMask stdin nib/${C}.nib ls -og nib/$C.nib done # verify one is properly masked: nibFrag -masked nib/chrM.nib 0 16299 + stdout | less # compare to: twoBitToFa -seq=chrM mm10.fa stdout | less # Copy to data/genomes staging for cluster replication rsync -a -P ./nib/ /hive/data/genomes/staging/data/genomes/mm10/nib/ ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-02-08 - Hiram) # Use -repMatch=650, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2652783500 / 2897316137 \) \* 1024 # ( 2652783500 / 2897316137 ) * 1024 = 937.574699 # round up to 1000 (mm9 used 912) cd /hive/data/genomes/mm10 time blat mm10.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/mm10.11.ooc -repMatch=1000 # Wrote 27208 overused 11-mers to jkStuff/mm10.11.ooc # real 2m9.568s # at repMatch=900: # Wrote 31822 overused 11-mers to 
jkStuff/mm10.11.ooc # there are non-bridged gaps, make lift file for genbank hgsql -N -e "select bridge from gap;" mm10 | sort | uniq -c # 191 no # 495 yes cd /hive/data/genomes/mm10/jkStuff gapToLift mm10 mm10.nonBridged.lift -bedFile=mm10.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' mm10.nonBridged.bed | sort -nr | head 116378660 chr2 59120641 175499301 chr2.02 # copy all of this stuff to the klusters: cd /hive/data/genomes/mm10 mkdir /hive/data/genomes/staging/data/genomes/mm10 cp -p jkStuff/mm10.11.ooc jkStuff/mm10.nonBridged.lift chrom.sizes \ mm10.2bit /hive/data/genomes/staging/data/genomes/mm10 # request rsync copy from cluster admin ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-02-08 - Hiram) # examine the file: /cluster/data/genomes/genbank/data/genomes/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Mus musculus 334577 4853663 26288 # to decide which "native" mrna or ests you want to specify in genbank.conf # of course, mm10 has plenty of everything ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add mm10 just after mm9 and commit to GIT # mm10 mm10.serverGenome = /hive/data/genomes/mm10/mm10.2bit mm10.clusterGenome = /scratch/data/genomes/mm10/mm10.2bit mm10.ooc = /scratch/data/genomes/mm10/mm10.11.ooc mm10.align.unplacedChroms = chr* mm10.lift = /scratch/data/genomes/mm10/mm10.nonBridged.lift mm10.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} mm10.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} mm10.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} mm10.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} mm10.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} mm10.downloadDir = mm10 mm10.refseq.mrna.xeno.load = yes 
mm10.refseq.mrna.xeno.loadDesc = yes mm10.mgc = yes mm10.genbank.mrna.blatTargetDb = yes # mm10.ccds.ncbiBuild = 37.2 # mm10.upstreamGeneTbl = refGene # mm10.upstreamMaf = multiz30way # /hive/data/genomes/mm10/bed/multiz30way/species.list # end of section added to etc/genbank.conf git commit -m "adding mm10 definitions" genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen # long running job managed in screen cd /cluster/data/genomes/genbank time nice -n +19 ./bin/gbAlignStep -initial mm10 & # var/build/logs/2012.02.08-11:38:50.mm10.initalign.log # real 795m52.388s # load data/genomesbase when finished ssh hgwdev cd /cluster/data/genomes/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad mm10 & # logFile: var/dbload/hgwdev/logs/2012.02.09-10:05:25.dbload.log # real 114m56.461s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add mm10 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added mm10." etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # running cpgIsland business (DONE - 2012-02-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/cpgIsland cd /hive/data/genomes/mm10/bed/cpgIsland # use a previous binary for this program ln -s ../../../mm9/bed/cpgIsland/hg3rdParty/cpgIslands/cpglh.exe . mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../mm10.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/mm10/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck. 
# runOne <chrom> <result file>: counts the non-N bases of the hard-masked
# fasta (faCount column 2 minus column 7); runs cpglh.exe only when more
# than 200 remain, otherwise leaves an empty result file.  Output goes to
# a scratch file first and is moved into place when cpglh.exe is done.
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = `faCount hardMaskedFa/$1.fa | egrep -v "^#seq|^total" | awk '{print $2 - $7 }'`
if ( $C > 200 ) then
    ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
    mv /scratch/tmp/$1.$$ $2
else
    touch $2
endif
'_EOF_'
    # << happy emacs

    chmod +x runOne

    gensub2 chr.list single template jobList
    para create jobList
    para try
    para check ... etc
    para time
# Completed: 66 of 66 jobs
# CPU time in finished jobs:        191s       3.19m     0.05h    0.00d  0.000 y
# IO & Wait Time:                   189s       3.14m     0.05h    0.00d  0.000 y
# Average job time:                   6s       0.10m     0.00h    0.00d
# Longest finished job:              19s       0.32m     0.01h    0.00d
# Submission to last job:            51s       0.85m     0.01h    0.00d

    # Transform cpglh output to bed +
    # (start is shifted down by one; width is computed from the shifted
    #  start)
    catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed

    # verify longest unique chrom name:
    cut -f1 cpgIsland.bed | awk '{print length($0)}' | sort -rn | head -1
    # 20
    # update the length 14 in the template to be 20 (the longest name
    # measured just above):
    sed -e "s/14/20/" $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandExt.sql

    cd /hive/data/genomes/mm10/bed/cpgIsland
    hgLoadBed mm10 cpgIslandExt -tab -sqlTable=cpgIslandExt.sql cpgIsland.bed
# Loaded 16023 elements of size 10
    featureBits mm10 cpgIslandExt
    # 10495450 bases of 2652783500 (0.396%) in intersection
    # compare to previous:
    featureBits mm9 cpgIslandExt
    # 10496250 bases of 2620346127 (0.401%) in intersection

    # there should be no output from checkTableCoords:
    checkTableCoords -verboseBlocks -table=cpgIslandExt mm10

    # cleanup, unless you want to move them to the genscan procedure below
    rm -fr hardMaskedFa

#########################################################################
# GENSCAN GENE PREDICTIONS (DONE - 2012-02-09,10 - Hiram)
    mkdir /hive/data/genomes/mm10/bed/genscan
    cd /hive/data/genomes/mm10/bed/genscan

    # use a previously existing genscan binary
    ln -s ../../../mm9/bed/genscan/hg3rdParty .
# create hard masked .fa files mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../mm10.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done # Generate a list file, genome.list, of all the hard-masked contig chunks: find ./hardMaskedFa/ -type f | sed -e 's#^./##' > genome.list wc -l genome.list # 66 genome.list # Run on small cluster (more mem than big cluster). ssh encodek cd /hive/data/genomes/mm10/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Create template file, template, for gensub2. For example (3-line file): cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/gsBig {check in exists+ $(path1)} {check out exists gtf/$(root1).gtf} -trans={check out exists pep/$(root1).pep} -subopt={check out exists subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << emacs gensub2 genome.list single template jobList para create jobList para try para check ... etc... para time # Crashed: 2 jobs # CPU time in finished jobs: 171336s 2855.60m 47.59h 1.98d 0.005 y # IO & Wait Time: 261s 4.35m 0.07h 0.00d 0.000 y # Average job time: 2640s 44.00m 0.73h 0.03d # Longest finished job: 22618s 376.97m 6.28h 0.26d # Submission to last job: 28682s 478.03m 7.97h 0.33d # one of the two crashed jobs was just a stray line in the jobList, # somehow a line with the string: '_EOF_' got in there. # as with mm9, chr7 did not work. Break it up into pieces mkdir /hive/data/genomes/mm10/bed/genscan/chr7Split cd /hive/data/genomes/mm10/bed/genscan/chr7Split grep chr7 ../../../jkStuff/mm10.nonBridged.lift | grep -v random \ > chr7.nonBridged.lift faToTwoBit ../hardMaskedFa/chr7.fa chr7.2bit ~/kent/src/hg/utils/lft2BitToFa.pl chr7.2bit chr7.nonBridged.lift \ | sed -e "s/chr7./chr7_/" > chr7.nonBridged.fa faSplit sequence chr7.nonBridged.fa 100 split7/chr7_ ln -s ../../../../mm9/bed/genscan/hg3rdParty . 
# build cmdList.sh: one background gsBig run per split chr7 piece,
# followed by a wait for all of them.  Output names must end in plain
# .gtf / .pep / .bed so the chr7_* globs below can find them.
    echo '#!/bin/sh' > cmdList.sh
    ls split7 | while read F
    do
        echo "/cluster/bin/x86_64/gsBig split7/${F} gtf/${F}.gtf -trans=pep/${F}.pep -subopt=subopt/${F}.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 &"
    done >> cmdList.sh
    echo "wait" >> cmdList.sh
    chmod +x cmdList.sh
    mkdir gtf pep subopt
    time ./cmdList.sh > run.log 2>&1
    # about 20 minutes
    # fix the names in the lift file
    # (only the first chr7. on each line is rewritten; dot matched literally)
    cat chr7.nonBridged.lift | sed -e "s/chr7\./chr7_/" > chr7.lift
    # the sed mangling will provide unique names for them all, but they
    # will not be in the strict numerical order that genscan usually produces
    cat gtf/chr7_*.gtf | liftUp -type=.gtf stdout chr7.lift error stdin \
        | sed -e "s/chr7_0\([0-4]\)\./chr7.\1/g" > chr7.gtf
    cat subopt/chr7_*.bed | liftUp -type=.bed stdout chr7.lift error stdin \
        | sed -e "s/chr7_0\([0-4]\)\./chr7.\1/g" > chr7.subopt.bed
    cat pep/chr7_*.pep | sed -e "s/chr7_0\([0-4]\)\./chr7.\1/g" > chr7.pep
    cp -p chr7.pep ../pep
    cp -p chr7.gtf ../gtf
    cp -p chr7.subopt.bed ../subopt/chr7.bed

    find ./gtf -type f | xargs -n 256 endsInLf -zeroOk

    # Concatenate results:
    cd /hive/data/genomes/mm10/bed/genscan
    find ./gtf -type f | xargs cat > genscan.gtf
    find ./pep -type f | xargs cat > genscan.pep
    find ./subopt -type f | xargs cat > genscanSubopt.bed

    # Load into the database (without -genePredExt because no frame info):
    # Don't load the Pep anymore -- redundant since it's from genomic.
# load the genscan results on hgwdev
    ssh hgwdev
    cd /hive/data/genomes/mm10/bed/genscan
    # to construct a local file with the genePred business:
    gtfToGenePred genscan.gtf genscan.gp
    # this produces exactly the same thing and loads the table:
    ldHgGene -gtf mm10 genscan genscan.gtf
# Read 45012 transcripts in 323529 lines in 1 files
# 45012 groups 59 seqs 1 sources 1 feature types
# 45012 gene predictions

    hgLoadBed mm10 genscanSubopt genscanSubopt.bed
    # Read 526572 elements of size 6 from genscanSubopt.bed

    featureBits mm10 genscan
    # 55743040 bases of 2652783500 (2.101%) in intersection
    # previously:
    featureBits mm9 genscan
    # 55293837 bases of 2620346127 (2.110%) in intersection

#########################################################################
# CREATE MICROSAT TRACK (DONE - 2012-02-09 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/genomes/mm10/bed/microsat
    cd /cluster/data/genomes/mm10/bed/microsat

    # keep simpleRepeat rows where column 5 is 2 or 3, column 6 is at
    # least 15, column 8 is 100 and column 9 is 0; the name is built as
    # <column 6>x<column 16>
    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
        ../simpleRepeat/simpleRepeat.bed > microsat.bed

    hgLoadBed mm10 microsat microsat.bed
    # Read 197237 elements of size 4 from microsat.bed

#########################################################################
# BLATSERVERS ENTRY (DONE - 2012-02-09 - Hiram)
#   After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("mm10", "blat13", "17832", "1", "0"); \
        INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("mm10", "blat13", "17833", "0", "1");' \
            hgcentraltest
    #   test it with some sequence

############################################################################
# set default position the same as was mm9 via blat
#   (DONE - 2012-02-09 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="chr12:56694976-56714605" where name="mm10";' \
        hgcentraltest

############################################################################
# constructing downloads (DONE - 2012-02-09 - Hiram)
    cd /hive/data/genomes/mm10
    # some of the smaller bits are missing the simple repeat results
    time makeDownloads.pl -allowMissedTrfs -workhorse=hgwdev mm10
    # real    41m42.408s
    # edit the README files in goldenPath/*/README.txt

#########################################################################
# create pushQ entry (DONE - 2012-02-09 - Hiram)
    # first make sure all.joiner is up to date and has this new organism
    # a keys check should be clean:
    cd ~/kent/src/hg/makeDb/schema
    joinerCheck -database=mm10 -keys all.joiner

    mkdir /hive/data/genomes/mm10/pushQ
    cd /hive/data/genomes/mm10/pushQ
    makePushQSql.pl mm10 > mm10.sql 2> stderr.out
    # check stderr.out for no significant problems, it is common to see:
# WARNING: hgwdev does not have /gbdb/mm10/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/mm10/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/mm10/bbi/quality.bw
# WARNING: mm10 does not have seq
# WARNING: mm10 does not have extFile
# *** All done!
    # which are not real problems
    # if some tables are not identified:
# WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of
# supporting and genbank tables) which tracks to assign these tables to:
#   list of tables will be in the output
    # put them in manually after loading the pushQ entry
    scp -p mm10.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < mm10.sql

#########################################################################
# lifting ensGene track from mm9 (DONE - 2012-02-22 - Hiram)
    # no gene tracks yet on mm10.
liftUp mm9 ensGenes to mm10 # history of mm9 ensGene indicates it is the same as v64 release # with v65 being identical mkdir /hive/data/genomes/mm10/bed/ensGene cd /hive/data/genomes/mm10/bed/ensGene hgsql -N -e "select * from ensGene;" mm9 | cut -f2- > mm9.ensGene.gp liftOver -genePred mm9.ensGene.gp \ /gbdb/mm9/liftOver/mm9ToMm10.over.chain.gz \ mm10.lifted.ensGene.gp unmapped.ensGene.gp wc -l *.gp # 95651 mm10.lifted.ensGene.gp # 95883 mm9.ensGene.gp # 464 unmapped.ensGene.gp hgLoadGenePred -skipInvalid -genePredExt mm10 ensGene mm10.lifted.ensGene.gp # Warning: skipping 118 invalid genePreds # make a list of what did get loaded: hgsql -N -e "select name from ensGene;" mm10 \ | sort -u > mm10.name.ensGene.txt wc -l mm10.name.ensGene.txt # 95533 mm10.name.ensGene.txt hgsql -N -e "select * from ensPep;" mm9 | sort > mm9.ensPep.tab hgsql -N -e "select * from ensGtp;" mm9 | sort -k2,2 > mm9.ensGtp.tab hgsql -N -e "select * from ensemblToGeneName;" mm9 | sort -k1,1 \ > mm9.ensemblToGeneName.tab hgsql -N -e "select * from ensemblSource;" mm9 | sort -k1,1 \ > mm9.ensemblSource.tab # select out ensGtp records that match with the names in mm10 ensGene: join -1 2 -2 1 -o "1.1,1.2,1.3" mm9.ensGtp.tab mm10.name.ensGene.txt \ | tr '[ ]' '[\t]' > mm10.ensGtp.tab wc -l *.ensGtp.tab # 95533 mm10.ensGtp.tab # 95883 mm9.ensGtp.tab # select out ensPep records that match with the names in mm10 ensGene: join -1 1 -2 2 -o "1.1,1.2" mm9.ensPep.tab mm10.ensGtp.tab \ | tr '[ ]' '[\t]' > mm10.ensPep.tab wc -l mm9.ensPep.tab mm10.ensPep.tab # 55798 mm9.ensPep.tab # 55485 mm10.ensPep.tab # select out ensemblSource records that match the mm10 ensGene names: join -1 1 -2 1 -o "1.1,1.2" mm9.ensemblSource.tab mm10.name.ensGene.txt \ | tr '[ ]' '[\t]' > mm10.ensemblSource.tab wc -l mm9.ensemblSource.tab mm10.ensemblSource.tab 95883 mm9.ensemblSource.tab 95533 mm10.ensemblSource.tab # select out ensemblToGeneName records that match the mm10 ensGene names: join -1 1 -2 1 -o "1.1,1.2" 
mm9.ensemblToGeneName.tab \ mm10.name.ensGene.txt | tr '[ ]' '[\t]' > mm10.ensemblToGeneName.tab wc -l mm9.ensemblToGeneName.tab mm10.ensemblToGeneName.tab # 95883 mm9.ensemblToGeneName.tab # 95533 mm10.ensemblToGeneName.tab hgPepPred mm10 tab ensPep mm10.ensPep.tab hgLoadSqlTab mm10 ensGtp ~/kent/src/hg/lib/ensGtp.sql mm10.ensGtp.tab sed -e "s/15/18/" ~/kent/src/hg/lib/ensemblSource.sql > ensemblSource.sql hgLoadSqlTab mm10 ensemblSource ensemblSource.sql mm10.ensemblSource.tab # find sizes for indexes NL=`awk '{print length($1)}' mm10.ensemblToGeneName.tab | sort -rn | head -1` VL=`awk '{print length($2)}' mm10.ensemblToGeneName.tab | sort -rn | head -1` # construct sql definition with appropriate index sizes sed -e "s/ knownTo / ensemblToGeneName /; s/known gene/ensGen/; s/INDEX(name(12)/PRIMARY KEY(name($NL)/; s/value(12)/value($VL)/" \ ~/kent/src/hg/lib/knownTo.sql > ensemblToGeneName.sql hgLoadSqlTab mm10 ensemblToGeneName ensemblToGeneName.sql \ mm10.ensemblToGeneName.tab hgsql -e 'INSERT INTO trackVersion \ (db, name, who, version, updateTime, comment, source, dateReference) \ VALUES("mm10", "ensGene", "hiram", "65", now(), \ "lifted from mm9 ensGene 65", \ "lifted from mm9 ensGene 65", \ "dec2011" );' hgFixed ######################################################################### # Swap lastz Human hg19 (DONE - 2012-03-08 - Hiram) # original alignment to hg19 cd /hive/data/genomes/hg19/bed/lastzMm10.2012-03-07 cat fb.hg19.chainMm10Link.txt # 1021265143 bases of 2897316137 (35.249%) in intersection # and the swap mkdir /hive/data/genomes/mm10/bed/blastz.hg19.swap cd /hive/data/genomes/mm10/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzMm10.2012-03-07/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 72m32.794s cat fb.mm10.chainHg19Link.txt # 1014045890 bases of 2652783500 
(38.226%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s blastz.hg19.swap lastz.hg19 ######################################################################### # LASTZ RAT Rn4 (DONE - 2012-03-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzRn4.2012-03-08 cd /hive/data/genomes/mm10/bed/lastzRn4.2012-03-08 cat << '_EOF_' > DEF # mouse vs rat BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # Specially tuned blastz parameters from Webb Miller BLASTZ_O=600 BLASTZ_E=150 BLASTZ_Y=15000 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rat Rn4 SEQ2_DIR=/scratch/data/rn4/rn4.2bit SEQ2_LEN=/scratch/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzRn4.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S rn4Mm10 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > do.log 2>&1 & # real 129m48.444s cat fb.mm10.chainRn4Link.txt # 1449612208 bases of 2652783500 (54.645%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzRn4.2012-03-08 lastz.rn4 # and the swap mkdir /hive/data/genomes/rn4/bed/blastz.mm10.swap cd /hive/data/genomes/rn4/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRn4.2012-03-08/DEF \ -swap -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > swap.log 2>&1 & # real 71m10.645s cat fb.rn4.chainMm10Link.txt # 1449012636 bases of 2571531505 (56.348%) in intersection # set 
sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn4/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # LASTZ Gorilla gorGor3 (DONE - 2012-03-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzGorGor3.2012-03-08 cd /hive/data/genomes/mm10/bed/lastzGorGor3.2012-03-08 cat << '_EOF_' > DEF # gorilla vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Gorilla GorGor3 SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzGorGor3.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10GorGor3 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 625m17.180s cat fb.mm10.chainGorGor3Link.txt # 901610588 bases of 2652783500 (33.987%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGorGor3.2012-03-08 lastz.gorGor3 mkdir /hive/data/genomes/gorGor3/bed/blastz.mm10.swap cd /hive/data/genomes/gorGor3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGorGor3.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 91m3.616s cat fb.gorGor3.chainMm10Link.txt # 969595533 bases of 2822760080 (34.349%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/gorGor3/bed ln -s blastz.mm10.swap lastz.mm10 
############################################################################## # LASTZ Gibbon nomLeu1 (DONE - 2012-03-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzNomLeu1.2012-03-08 cd /hive/data/genomes/mm10/bed/lastzNomLeu1.2012-03-08 cat << '_EOF_' > DEF # gibbon vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Gibbon NomLeu1 SEQ2_DIR=/scratch/data/nomLeu1/nomLeu1.2bit SEQ2_LEN=/scratch/data/nomLeu1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzNomLeu1.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10NomLeu1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 556m26.589s cat fb.mm10.chainNomLeu1Link.txt # 905455766 bases of 2652783500 (34.132%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzNomLeu1.2012-03-08 lastz.nomLeu1 mkdir /hive/data/genomes/nomLeu1/bed/blastz.mm10.swap cd /hive/data/genomes/nomLeu1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzNomLeu1.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 66m50.839s cat fb.nomLeu1.chainMm10Link.txt # 892362811 bases of 2756591777 (32.372%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/nomLeu1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Rhesus rheMac3 (DONE - 2012-03-08 - Hiram) mkdir 
/hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08 cd /hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08 cat << '_EOF_' > DEF # rhesus vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rhesus RheMac3 SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10RheMac3 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 596m55.622s cat fb.mm10.chainRheMac3Link.txt # 900117108 bases of 2652783500 (33.931%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzRheMac3.2012-03-08 lastz.rheMac3 mkdir /hive/data/genomes/rheMac3/bed/blastz.mm10.swap cd /hive/data/genomes/rheMac3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 69m5.839s cat fb.rheMac3.chainMm10Link.txt # 883164992 bases of 2639145830 (33.464%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rheMac3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Baboon papHam1 (DONE - 2012-03-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09 cd /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09 cat << '_EOF_' > DEF # baboon vs mouse 
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
# TARGET: Mouse Mm10
SEQ1_DIR=/scratch/data/mm10/mm10.2bit
SEQ1_LEN=/scratch/data/mm10/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Baboon PapHam1
SEQ2_DIR=/scratch/data/papHam1/papHam1.2bit
SEQ2_LEN=/scratch/data/papHam1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=500
BASE=/hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs

# establish a screen to control this job
screen -S mm10PapHam1
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 1138m52.716s
cat fb.mm10.chainPapHam1Link.txt
# 890718423 bases of 2652783500 (33.577%) in intersection
# set sym link to indicate this is the lastz for this genome:
cd /hive/data/genomes/mm10/bed
ln -s lastzPapHam1.2012-03-09 lastz.papHam1

# better to have reciprocal best for this one since it is low coverage:
cd /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09
time doRecipBest.pl mm10 papHam1 -buildDir=`pwd` -workhorse=hgwdev \
    > best.log 2>&1 &
# real 899m48.908s

mkdir /hive/data/genomes/papHam1/bed/blastz.mm10.swap
cd /hive/data/genomes/papHam1/bed/blastz.mm10.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09/DEF \
    -swap -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 548m15.438s
# the swap result is measured against papHam1, so it is the
# fb.papHam1.chainMm10Link.txt file that is reported here
cat fb.papHam1.chainMm10Link.txt
# 878016290 bases of 2741867288 (32.023%) in intersection
# set sym link to indicate this is the lastz for this genome:
cd /hive/data/genomes/papHam1/bed
ln -s blastz.mm10.swap lastz.mm10

##############################################################################
# Swap ponAbe2 lastz (DONE - 2012-03-09 - Hiram)
# original alignment result:
cd /hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08
cat fb.ponAbe2.chainMm10Link.txt
# 946932454 bases of 3093572278 (30.610%) in intersection

# and the swap
mkdir /hive/data/genomes/mm10/bed/blastz.ponAbe2.swap
cd /hive/data/genomes/mm10/bed/blastz.ponAbe2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08/DEF \
    -swap -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 72m38.550s
cat fb.mm10.chainPonAbe2Link.txt
# 915093866 bases of 2652783500 (34.496%) in intersection
# set sym link to indicate this is the lastz for this genome:
cd /hive/data/genomes/mm10/bed
ln -s blastz.ponAbe2.swap lastz.ponAbe2

##############################################################################
# LASTZ Squirrel monkey saiBol1 (DONE - 2012-03-09 - Hiram)
mkdir /hive/data/genomes/mm10/bed/lastzSaiBol1.2012-03-09
cd /hive/data/genomes/mm10/bed/lastzSaiBol1.2012-03-09
cat << '_EOF_' > DEF
# squirrel monkey vs mouse
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
# TARGET: Mouse Mm10
SEQ1_DIR=/scratch/data/mm10/mm10.2bit
SEQ1_LEN=/scratch/data/mm10/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Squirrel monkey SaiBol1
SEQ2_DIR=/hive/data/genomes/saiBol1/saiBol1.2bit
SEQ2_LEN=/hive/data/genomes/saiBol1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/hive/data/genomes/mm10/bed/lastzSaiBol1.2012-03-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs

# establish a screen to control this job
screen -S mm10SaiBol1
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 538m42.643s
cat fb.mm10.chainSaiBol1Link.txt
# 857872391 bases of 2652783500 (32.339%) in intersection
# set sym link to indicate this is the lastz for this genome:
cd /hive/data/genomes/mm10/bed
ln -s lastzSaiBol1.2012-03-09 lastz.saiBol1

mkdir /hive/data/genomes/saiBol1/bed/blastz.mm10.swap
cd /hive/data/genomes/saiBol1/bed/blastz.mm10.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/mm10/bed/lastzSaiBol1.2012-03-09/DEF \
    -swap -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 59m36.306s
cat fb.saiBol1.chainMm10Link.txt
# 838457857 bases of 2477131095 (33.848%) in intersection
# set sym link to indicate this is the lastz for this genome:
cd /hive/data/genomes/saiBol1/bed
ln -s blastz.mm10.swap lastz.mm10

##############################################################################
# LASTZ Marmoset calJac3 (DONE - 2012-03-09 - Hiram)
mkdir /hive/data/genomes/mm10/bed/lastzCalJac3.2012-03-09
cd /hive/data/genomes/mm10/bed/lastzCalJac3.2012-03-09
cat << '_EOF_' > DEF
# marmoset vs mouse
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
# TARGET: Mouse Mm10
SEQ1_DIR=/scratch/data/mm10/mm10.2bit
SEQ1_LEN=/scratch/data/mm10/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Marmoset monkey CalJac3
SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/hive/data/genomes/mm10/bed/lastzCalJac3.2012-03-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs

# establish a screen to control this job
screen -S mm10CalJac3
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 529m39.657s
cat fb.mm10.chainCalJac3Link.txt
# 860830771 bases of 2652783500 (32.450%) in intersection
# set sym link to indicate this is the lastz for this genome:
cd /hive/data/genomes/mm10/bed
ln -s lastzCalJac3.2012-03-09 lastz.calJac3

mkdir /hive/data/genomes/calJac3/bed/blastz.mm10.swap
cd /hive/data/genomes/calJac3/bed/blastz.mm10.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/mm10/bed/lastzCalJac3.2012-03-09/DEF \
    -swap -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 67m21.635s
cat fb.calJac3.chainMm10Link.txt
# 861565545 bases of 2752505800 (31.301%) in intersection
# set sym link to indicate this is the lastz for this genome:
cd /hive/data/genomes/calJac3/bed
ln -s blastz.mm10.swap lastz.mm10

##############################################################################
# LASTZ Chimp PanTro4 (DONE - 2012-03-09 - Hiram)
# The working directory is the BASE named in the DEF file below; the symlink
# step and the swap step both refer to this same mm10/bed directory.
mkdir /hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09
cd /hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09
# NOTE(review): SEQ1_CHUNK/SEQ2_CHUNK below are reversed relative to the other
# mm10-target DEF files in this section; left exactly as recorded.
cat << '_EOF_' > DEF
# chimp vs mouse
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
# TARGET: Mouse Mm10
SEQ1_DIR=/scratch/data/mm10/mm10.2bit
SEQ1_LEN=/scratch/data/mm10/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chimp PanTro4
SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit
SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=200
BASE=/hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs

# establish a screen to control this job
screen -S mm10PanTro4
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -syntenicNet -noLoadChainSplit \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 682m53.046s
cat fb.mm10.chainPanTro4Link.txt
# 919836299 bases of 2652783500 (34.674%) in intersection
# set sym link to indicate this is the lastz for this genome:
cd /hive/data/genomes/mm10/bed
ln -s lastzPanTro4.2012-03-09 lastz.panTro4

mkdir /hive/data/genomes/panTro4/bed/blastz.mm10.swap
cd /hive/data/genomes/panTro4/bed/blastz.mm10.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 73m23.855s cat fb.panTro4.chainMm10Link.txt # 926540065 bases of 2902338967 (31.924%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/panTro4/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ tarsier tarSyr1 (DONE - 2012-03-10 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 cd /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 cat << '_EOF_' > DEF # tarsier vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tarsier TarSyr1 SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=800 BASE=/hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10TarSyr1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2457m45.759s cat fb.mm10.chainTarSyr1Link.txt # 651517559 bases of 2652783500 (24.560%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTarSyr1.2012-03-10 lastz.tarSyr1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 time doRecipBest.pl mm10 tarSyr1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1176m19.336s mkdir /hive/data/genomes/tarSyr1/bed/blastz.mm10.swap cd 
/hive/data/genomes/tarSyr1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 746m30.852s cat fb.tarSyr1.chainMm10Link.txt # 691746721 bases of 2768536343 (24.986%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/tarSyr1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # Swap chimp panTro3 to Mm10 (DONE - 2012-03-12 - Hiram) # original alignment on panTro3 cd /hive/data/genomes/panTro3/bed/lastzMm10.2012-03-08 cat fb.panTro3.chainMm10Link.txt # 929073028 bases of 2900529764 (32.031%) in intersection # and this swap: mkdir /hive/data/genomes/mm10/bed/blastz.panTro3.swap cd /hive/data/genomes/mm10/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzMm10.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 68m46.408s cat fb.mm10.chainPanTro3Link.txt # 922491113 bases of 2652783500 (34.774%) in intersection ############################################################################## # LASTZ bushbaby otoGar3 (DONE - 2012-03-13 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzOtoGar3.2012-03-13 cd /hive/data/genomes/mm10/bed/lastzOtoGar3.2012-03-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # bushbaby vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: bushbaby OtoGar3 
SEQ2_DIR=/hive/data/genomes/otoGar3/otoGar3.2bit SEQ2_LEN=/hive/data/genomes/otoGar3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzOtoGar3.2012-03-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10OtoGar3 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 757m32.438s cat fb.mm10.chainOtoGar3Link.txt # 790408953 bases of 2652783500 (29.795%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOtoGar3.2012-03-13 lastz.otoGar3 mkdir /hive/data/genomes/otoGar3/bed/blastz.mm10.swap cd /hive/data/genomes/otoGar3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOtoGar3.2012-03-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 61m18.952s cat fb.otoGar3.chainMm10Link.txt # 776907989 bases of 2359530453 (32.926%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/otoGar3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ mouse lemur micMur1 (DONE - 2012-03-13 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13 cd /hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # mouse lemur vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: mouse lemur MicMur1 
SEQ2_DIR=/scratch/data/micMur1/micMur1.2bit SEQ2_LEN=/scratch/data/micMur1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10MicMur1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 687m41.863s cat fb.mm10.chainMicMur1Link.txt # 706607444 bases of 2652783500 (26.636%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMicMur1.2012-03-13 lastz.micMur1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13 time doRecipBest.pl mm10 micMur1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 877m18.105s mkdir /hive/data/genomes/micMur1/bed/blastz.mm10.swap cd /hive/data/genomes/micMur1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 116m54.411s cat fb.micMur1.chainMm10Link.txt # 696025630 bases of 1852394361 (37.574%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/micMur1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ squirrel speTri2 (DONE - 2012-03-15 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzSpeTri2.2012-03-15 cd /hive/data/genomes/mm10/bed/lastzSpeTri2.2012-03-15 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # squirrel vs mouse 
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: squirrel SpeTri2 SEQ2_DIR=/hive/data/genomes/speTri2/speTri2.2bit SEQ2_LEN=/hive/data/genomes/speTri2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzSpeTri2.2012-03-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10SpeTri2 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 935m27.893s cat fb.mm10.chainSpeTri2Link.txt # 907715417 bases of 2652783500 (34.217%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSpeTri2.2012-03-15 lastz.speTri2 mkdir /hive/data/genomes/speTri2/bed/blastz.mm10.swap cd /hive/data/genomes/speTri2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSpeTri2.2012-03-15/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 74m41.819s # real 116m54.411s cat fb.speTri2.chainMm10Link.txt # 906956512 bases of 2311060300 (39.244%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/speTri2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ kangaroo rat dipOrd1 (DONE - 2012-03-15 - Hiram) # establish a screen to control this job screen -S mm10DipOrd1 mkdir /hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15 cd /hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to 
something under 100,000 cat << '_EOF_' > DEF # kangaroo rat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: kangaroo rat DipOrd1 SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 867m19.972s cat fb.mm10.chainDipOrd1Link.txt # 516232678 bases of 2652783500 (19.460%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzDipOrd1.2012-03-15 lastz.dipOrd1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15 time doRecipBest.pl mm10 dipOrd1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 914m20.405s mkdir /hive/data/genomes/dipOrd1/bed/blastz.mm10.swap cd /hive/data/genomes/dipOrd1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 115m1.497s cat fb.dipOrd1.chainMm10Link.txt # 507580668 bases of 1844961421 (27.512%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/dipOrd1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Naked mole-rat hetGla1 (DONE - 2012-03-15 - Hiram) # establish a screen to control this job screen -S 
mm10HetGla1 mkdir /hive/data/genomes/mm10/bed/lastzHetGla1.2012-03-15 cd /hive/data/genomes/mm10/bed/lastzHetGla1.2012-03-15 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # Naked mole-rat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Naked mole-rat HetGla1 SEQ2_DIR=/scratch/data/hetGla1/hetGla1.2bit SEQ2_LEN=/scratch/data/hetGla1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzHetGla1.2012-03-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 745m15.097s cat fb.mm10.chainHetGla1Link.txt # 853221843 bases of 2652783500 (32.163%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzHetGla1.2012-03-15 lastz.hetGla1 mkdir /hive/data/genomes/hetGla1/bed/blastz.mm10.swap cd /hive/data/genomes/hetGla1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzHetGla1.2012-03-15/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 74m26.471s cat fb.hetGla1.chainMm10Link.txt # 885195861 bases of 2430064805 (36.427%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/hetGla1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ horse equCab2 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it 
is screen -S mm10EquCab2 mkdir /hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # horse vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: horse EquCab2 SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 566m34.024s cat fb.mm10.chainEquCab2Link.txt # 912967841 bases of 2652783500 (34.415%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzEquCab2.2012-03-16 lastz.equCab2 mkdir /hive/data/genomes/equCab2/bed/blastz.mm10.swap cd /hive/data/genomes/equCab2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 87m2.261s cat fb.equCab2.chainMm10Link.txt # 901995882 bases of 2428790173 (37.138%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/equCab2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ guinea pig cavPor3 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is 
screen -S mm10CavPor3 mkdir /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # guinea pig vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: guinea pig CavPor3 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1523m35.729s cat fb.mm10.chainCavPor3Link.txt # 754642254 bases of 2652783500 (28.447%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzCavPor3.2012-03-16 lastz.cavPor3 mkdir /hive/data/genomes/cavPor3/bed/blastz.mm10.swap cd /hive/data/genomes/cavPor3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 80m23.870s cat fb.cavPor3.chainMm10Link.txt # 775452752 bases of 2663369733 (29.115%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/cavPor3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ alpaca vicPac1 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what 
it is screen -S mm10VicPac1 mkdir /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # alpaca vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: alpaca VicPac1 SEQ2_DIR=/scratch/data/vicPac1/vicPac1.2bit SEQ2_LEN=/scratch/data/vicPac1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=700 BASE=/hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2049m38.674s cat fb.mm10.chainVicPac1Link.txt # 600477253 bases of 2652783500 (22.636%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzVicPac1.2012-03-16 lastz.vicPac1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16 time doRecipBest.pl mm10 vicPac1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 824m37.107s mkdir /hive/data/genomes/vicPac1/bed/blastz.mm10.swap cd /hive/data/genomes/vicPac1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 159m21.952s cat fb.vicPac1.chainMm10Link.txt # 610885692 bases of 1922910435 (31.769%) in intersection # set sym link to indicate this is the lastz for this genome: cd
/hive/data/genomes/vicPac1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ dolphin turTru1 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TurTru1 mkdir /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # dolphin vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: dolphin TurTru1 SEQ2_DIR=/scratch/data/turTru1/turTru1.2bit SEQ2_LEN=/scratch/data/turTru1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1484m14.609s cat fb.mm10.chainTurTru1Link.txt # 762961671 bases of 2652783500 (28.761%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTurTru1.2012-03-16 lastz.turTru1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16 time doRecipBest.pl mm10 turTru1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 733m37.272s mkdir /hive/data/genomes/turTru1/bed/blastz.mm10.swap cd /hive/data/genomes/turTru1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ 
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 79m38.703s cat fb.turTru1.chainMm10Link.txt # 744359707 bases of 2298444090 (32.385%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/turTru1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ tree shrew tupBel1 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TupBel1 mkdir /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # tree shrew vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tree shrew TupBel1 SEQ2_DIR=/scratch/data/tupBel1/tupBel1.2bit SEQ2_LEN=/scratch/data/tupBel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1731m30.449s cat fb.mm10.chainTupBel1Link.txt # 524337666 bases of 2652783500 (19.766%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTupBel1.2012-03-16 lastz.tupBel1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16 time doRecipBest.pl mm10 tupBel1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1090m30.429s mkdir /hive/data/genomes/tupBel1/bed/blastz.mm10.swap cd 
/hive/data/genomes/tupBel1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 136m7.163s cat fb.tupBel1.chainMm10Link.txt # 537379661 bases of 2137225476 (25.144%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/tupBel1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ pig susScr2 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SusScr2 mkdir /hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # pig vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: pig SusScr2 SEQ2_DIR=/scratch/data/susScr2/susScr2.2bit SEQ2_LEN=/scratch/data/susScr2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1272m57.727s cat fb.mm10.chainSusScr2Link.txt # 616716602 bases of 2652783500 (23.248%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSusScr2.2012-03-16 lastz.susScr2 mkdir /hive/data/genomes/susScr2/bed/blastz.mm10.swap cd 
/hive/data/genomes/susScr2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 62m47.465s cat fb.susScr2.chainMm10Link.txt # 656498040 bases of 2231298548 (29.422%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/susScr2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ rabbit oryCun2 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OryCun2 mkdir /hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # rabbit vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: rabbit OryCun2 SEQ2_DIR=/scratch/data/oryCun2/oryCun2.2bit SEQ2_LEN=/scratch/data/oryCun2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1412m58.641s cat fb.mm10.chainOryCun2Link.txt # 669778489 bases of 2652783500 (25.248%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOryCun2.2012-03-16 lastz.oryCun2 mkdir /hive/data/genomes/oryCun2/bed/blastz.mm10.swap cd 
/hive/data/genomes/oryCun2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 64m40.959s cat fb.oryCun2.chainMm10Link.txt # 668643668 bases of 2604023284 (25.677%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oryCun2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ sloth choHof1 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10ChoHof1 mkdir /hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # sloth vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: sloth ChoHof1 SEQ2_DIR=/scratch/data/choHof1/choHof1.2bit SEQ2_LEN=/scratch/data/choHof1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=800 BASE=/hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # rebooted hgwdev during first swarm run, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # Elapsed time: 65m26s cat 
fb.mm10.chainChoHof1Link.txt # 477994856 bases of 2652783500 (18.019%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzChoHof1.2012-03-19 lastz.choHof1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19 time doRecipBest.pl mm10 choHof1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1171m56.481s mkdir /hive/data/genomes/choHof1/bed/blastz.mm10.swap cd /hive/data/genomes/choHof1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 1613m3.348s cat fb.choHof1.chainMm10Link.txt # 488047499 bases of 2060419685 (23.687%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/choHof1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ megabat pteVam1 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10PteVam1 mkdir /hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # megabat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: megabat PteVam1 SEQ2_DIR=/scratch/data/pteVam1/pteVam1.2bit SEQ2_LEN=/scratch/data/pteVam1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << 
happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1843m33.186s cat fb.mm10.chainPteVam1Link.txt # 725414059 bases of 2652783500 (27.345%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzPteVam1.2012-03-19 lastz.pteVam1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19 time doRecipBest.pl mm10 pteVam1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 743m57.901s mkdir /hive/data/genomes/pteVam1/bed/blastz.mm10.swap cd /hive/data/genomes/pteVam1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 75m35s cat fb.pteVam1.chainMm10Link.txt # 710519911 bases of 1839436660 (38.627%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/pteVam1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ elephant loxAfr3 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10LoxAfr3 mkdir /hive/data/genomes/mm10/bed/lastzLoxAfr3.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzLoxAfr3.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # elephant vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # 
QUERY: elephant LoxAfr3 SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzLoxAfr3.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1848m11.111s cat fb.mm10.chainLoxAfr3Link.txt # 685029753 bases of 2652783500 (25.823%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzLoxAfr3.2012-03-19 lastz.loxAfr3 mkdir /hive/data/genomes/loxAfr3/bed/blastz.mm10.swap cd /hive/data/genomes/loxAfr3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzLoxAfr3.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 73m14s cat fb.loxAfr3.chainMm10Link.txt # 674108752 bases of 3118565340 (21.616%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/loxAfr3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ cat felCat4 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10FelCat4 mkdir /hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # cat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # 
QUERY: cat FelCat4 SEQ2_DIR=/scratch/data/felCat4/felCat4.2bit SEQ2_LEN=/scratch/data/felCat4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2010m48.963s cat fb.mm10.chainFelCat4Link.txt # 637531191 bases of 2652783500 (24.033%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzFelCat4.2012-03-19 lastz.felCat4 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19 time doRecipBest.pl mm10 felCat4 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1135m12.207s mkdir /hive/data/genomes/felCat4/bed/blastz.mm10.swap cd /hive/data/genomes/felCat4/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 88m12s cat fb.felCat4.chainMm10Link.txt # 616167655 bases of 1990635005 (30.953%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/felCat4/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ panda ailMel1 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10AilMel1 mkdir /hive/data/genomes/mm10/bed/lastzAilMel1.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzAilMel1.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 
cat << '_EOF_' > DEF # panda vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: panda AilMel1 SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzAilMel1.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # forgot to copy to the log time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium # real 1914m15.921s cat fb.mm10.chainAilMel1Link.txt # 821806974 bases of 2652783500 (30.979%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzAilMel1.2012-03-19 lastz.ailMel1 mkdir /hive/data/genomes/ailMel1/bed/blastz.mm10.swap cd /hive/data/genomes/ailMel1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAilMel1.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 65m50s cat fb.ailMel1.chainMm10Link.txt # 798482731 bases of 2245312831 (35.562%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/ailMel1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ dog canFam3 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10CanFam3 mkdir /hive/data/genomes/mm10/bed/lastzCanFam3.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzCanFam3.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something 
under 100,000 cat << '_EOF_' > DEF # dog vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: dog CanFam3 SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/mm10/bed/lastzCanFam3.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # forgot to copy to the log time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1883m21.850s cat fb.mm10.chainCanFam3Link.txt # 773114990 bases of 2652783500 (29.144%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzCanFam3.2012-03-19 lastz.canFam3 mkdir /hive/data/genomes/canFam3/bed/blastz.mm10.swap cd /hive/data/genomes/canFam3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCanFam3.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 63m22s cat fb.canFam3.chainMm10Link.txt # 756678903 bases of 2392715236 (31.624%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/canFam3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ armadillo dasNov2 (DONE - 2012-03-21 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10DasNov2 mkdir /hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable #
number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # armadillo vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: armadillo DasNov2 SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=800 BASE=/hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2655m49.904s cat fb.mm10.chainDasNov2Link.txt # 451070039 bases of 2652783500 (17.004%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzDasNov2.2012-03-21 lastz.dasNov2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21 time doRecipBest.pl mm10 dasNov2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1163m1.023s mkdir /hive/data/genomes/dasNov2/bed/blastz.mm10.swap cd /hive/data/genomes/dasNov2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 201m9.701s cat fb.dasNov2.chainMm10Link.txt # 461142417 bases of 2371493872 (19.445%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/dasNov2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ microbat myoLuc2 (DONE - 2012-03-21 - Hiram) # establish a screen to control this 
job with a name to indicate what it is screen -S mm10MyoLuc2 mkdir /hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # microbat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: microbat MyoLuc2 SEQ2_DIR=/scratch/data/myoLuc2/myoLuc2.2bit SEQ2_LEN=/scratch/data/myoLuc2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1033m38.184s cat fb.mm10.chainMyoLuc2Link.txt # 646292112 bases of 2652783500 (24.363%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMyoLuc2.2012-03-21 lastz.myoLuc2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21 time doRecipBest.pl mm10 myoLuc2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 29m16.249s mkdir /hive/data/genomes/myoLuc2/bed/blastz.mm10.swap cd /hive/data/genomes/myoLuc2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 54m5.607s cat fb.myoLuc2.chainMm10Link.txt # 661704053 bases of 1966419868 (33.650%) in intersection # set sym link to indicate this is the lastz for this genome: cd 
/hive/data/genomes/myoLuc2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ cow bosTau7 (DONE - 2012-03-21 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10BosTau7 mkdir /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # cow vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cow BosTau7 SEQ2_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ2_LEN=/scratch/data/bosTau7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1151m20.445s cat fb.mm10.chainBosTau7Link.txt # 696498363 bases of 2652783500 (26.255%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzBosTau7.2012-03-21 lastz.bosTau7 mkdir /hive/data/genomes/bosTau7/bed/blastz.mm10.swap cd /hive/data/genomes/bosTau7/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 77m58.759s cat fb.bosTau7.chainMm10Link.txt # 711923052 bases of 2804673174 (25.383%) in intersection # set sym link to indicate this is the lastz for this genome: cd 
/hive/data/genomes/bosTau7/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ sheep oviAri1 (DONE - 2012-03-21 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OviAri1 mkdir /hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # sheep vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: sheep OviAri1 SEQ2_DIR=/scratch/data/oviAri1/oviAri1.2bit SEQ2_LEN=/scratch/data/oviAri1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 892m33.068s cat fb.mm10.chainOviAri1Link.txt # 406955832 bases of 2652783500 (15.341%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOviAri1.2012-03-21 lastz.oviAri1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21 time doRecipBest.pl mm10 oviAri1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1183m43.488s mkdir /hive/data/genomes/oviAri1/bed/blastz.mm10.swap cd /hive/data/genomes/oviAri1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ 
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 30m5.554s cat fb.oviAri1.chainMm10Link.txt # 383499897 bases of 1201271277 (31.925%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oviAri1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ rock hyrax proCap1 (DONE - 2012-03-21 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10ProCap1 mkdir /hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # rock hyrax vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: rock hyrax ProCap1 SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit SEQ2_LEN=/scratch/data/proCap1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=600 BASE=/hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2859m51.317s cat fb.mm10.chainProCap1Link.txt # 401804601 bases of 2652783500 (15.147%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzProCap1.2012-03-21 lastz.proCap1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21 time doRecipBest.pl mm10 proCap1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1083m57.139s mkdir /hive/data/genomes/proCap1/bed/blastz.mm10.swap cd 
/hive/data/genomes/proCap1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 165m10.285s cat fb.proCap1.chainMm10Link.txt # 390409777 bases of 2407847681 (16.214%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/proCap1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ pika ochPri2 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OchPri2 mkdir /hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # pika vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: pika OchPri2 SEQ2_DIR=/scratch/data/ochPri2/ochPri2.2bit SEQ2_LEN=/scratch/data/ochPri2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2578m43.648s cat fb.mm10.chainOchPri2Link.txt # 385766335 bases of 2652783500 (14.542%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOchPri2.2012-03-22 lastz.ochPri2 # better to have reciprocal best for this one since it is low 
coverage: cd /hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22 time doRecipBest.pl mm10 ochPri2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1036m29.080s mkdir /hive/data/genomes/ochPri2/bed/blastz.mm10.swap cd /hive/data/genomes/ochPri2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 103m34.369s cat fb.ochPri2.chainMm10Link.txt # 382959642 bases of 1923624051 (19.908%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/ochPri2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ hedgehog eriEur1 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10EriEur1 mkdir /hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # hedgehog vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: hedgehog EriEur1 SEQ2_DIR=/scratch/data/eriEur1/eriEur1.2bit SEQ2_LEN=/scratch/data/eriEur1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=700 BASE=/hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 3006m41.470s cat fb.mm10.chainEriEur1Link.txt # 261447061 bases 
of 2652783500 (9.856%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzEriEur1.2012-03-22 lastz.eriEur1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22 time doRecipBest.pl mm10 eriEur1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1171m41.349s mkdir /hive/data/genomes/eriEur1/bed/blastz.mm10.swap cd /hive/data/genomes/eriEur1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 241m24.183s cat fb.eriEur1.chainMm10Link.txt # 261605017 bases of 2133134836 (12.264%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/eriEur1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ tenrec echTel1 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10EchTel1 mkdir /hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # tenrec vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tenrec EchTel1 SEQ2_DIR=/scratch/data/echTel1/echTel1.2bit SEQ2_LEN=/scratch/data/echTel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=700 BASE=/hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl 
-verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 3047m28.723s cat fb.mm10.chainEchTel1Link.txt # 290413150 bases of 2652783500 (10.947%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzEchTel1.2012-03-22 lastz.echTel1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22 time doRecipBest.pl mm10 echTel1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1201m39.275s mkdir /hive/data/genomes/echTel1/bed/blastz.mm10.swap cd /hive/data/genomes/echTel1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 269m52.619s cat fb.echTel1.chainMm10Link.txt # 298082139 bases of 2111581369 (14.117%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/echTel1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ shrew sorAra1 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SorAra1 mkdir /hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # shrew vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: shrew SorAra1 
SEQ2_DIR=/scratch/data/sorAra1/sorAra1.2bit SEQ2_LEN=/scratch/data/sorAra1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2600m22.528s cat fb.mm10.chainSorAra1Link.txt # 248874412 bases of 2652783500 (9.382%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSorAra1.2012-03-22 lastz.sorAra1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22 time doRecipBest.pl mm10 sorAra1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1074m22.651s mkdir /hive/data/genomes/sorAra1/bed/blastz.mm10.swap cd /hive/data/genomes/sorAra1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 141m38.806s cat fb.sorAra1.chainMm10Link.txt # 248692550 bases of 1832864697 (13.569%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/sorAra1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ wallaby macEug2 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MacEug2 mkdir /hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF 
# wallaby vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: wallaby MacEug2 SEQ2_DIR=/scratch/data/macEug2/macEug2.2bit SEQ2_LEN=/scratch/data/macEug2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2893m50.341s cat fb.mm10.chainMacEug2Link.txt # 115481931 bases of 2652783500 (4.353%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMacEug2.2012-03-22 lastz.macEug2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22 time doRecipBest.pl mm10 macEug2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1032m58.798s mkdir /hive/data/genomes/macEug2/bed/blastz.mm10.swap cd /hive/data/genomes/macEug2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 130m7.404s cat fb.macEug2.chainMm10Link.txt # 112811810 bases of 2536076957 (4.448%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/macEug2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ RAT Rn5 (DONE - 2012-03-23 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10Rn5 mkdir 
/hive/data/genomes/mm10/bed/lastzRn5.2012-03-23 cd /hive/data/genomes/mm10/bed/lastzRn5.2012-03-23 cat << '_EOF_' > DEF # mouse vs rat BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # From tuning experiment between mouse chr12:15000000-25000000 and # rat chr6:38000000-48000000 BLASTZ_O=600 BLASTZ_E=55 BLASTZ_Y=5000 BLASTZ_T=2 BLASTZ_K=3000 BLASTZ_L=3000 BLASTZ_Q=/hive/data/genomes/mm10/bed/lastzRn5.2012-03-23/mouse_rat_2.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_SMSK=/scratch/data/mm10/notInRat SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rat Rn5 SEQ2_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ2_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ2_SMSK=/hive/data/genomes/rn5/bed/linSpecRep/notInMouse SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzRn5.2012-03-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S rn5Mm10 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > do.log 2>&1 & # broken lastz run when SMSK files did not exist for some of the # Rn5 contigs - made empty files for those and completed, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > cat.log 2>&1 & # real 285m28.458s cat fb.mm10.chainRn5Link.txt # 1786721927 bases of 2652783500 (67.353%) in intersection # FYI: rn4 was: # 1449612208 bases of 2652783500 (54.645%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzRn5.2012-03-23 lastz.rn5 # and the swap mkdir /hive/data/genomes/rn5/bed/blastz.mm10.swap cd 
/hive/data/genomes/rn5/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRn5.2012-03-23/DEF \ -swap -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > swap.log 2>&1 & # real 121m21.029s cat fb.rn5.chainMm10Link.txt # 1808154679 bases of 2572853723 (70.278%) in intersection # FYI, rn4 was: # 1449012636 bases of 2571531505 (56.348%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # LASTZ Manatee triMan1 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TriMan1 mkdir /hive/data/genomes/mm10/bed/lastzTriMan1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzTriMan1.2012-03-29 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # manatee vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: manatee TriMan1 SEQ2_DIR=/hive/data/genomes/triMan1/triMan1.2bit SEQ2_LEN=/hive/data/genomes/triMan1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzTriMan1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1455m24.772s cat fb.mm10.chainTriMan1Link.txt # 704207702 bases of 2652783500 (26.546%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s 
lastzTriMan1.2012-03-29 lastz.triMan1 mkdir /hive/data/genomes/triMan1/bed/blastz.mm10.swap cd /hive/data/genomes/triMan1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTriMan1.2012-03-29/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 62m33.530s cat fb.triMan1.chainMm10Link.txt # 682557025 bases of 2769099677 (24.649%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/triMan1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz Opossum monDom5 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MonDom5 mkdir /hive/data/genomes/mm10/bed/lastzMonDom5.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzMonDom5.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. 
opossum BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Opossum monDom5 SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit SEQ2_LEN=/scratch/data/monDom5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzMonDom5.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # (SEQ2_LIMIT tuning does not apply when the query has only a single small set of chroms) time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1792m40.071s cat fb.mm10.chainMonDom5Link.txt # 254245903 bases of 2652783500 (9.584%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMonDom5.2012-03-29 lastz.monDom5 # and for the swap mkdir /hive/data/genomes/monDom5/bed/blastz.mm10.swap cd /hive/data/genomes/monDom5/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMonDom5.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 73m49.230s cat fb.monDom5.chainMm10Link.txt # 252291401 bases of 3501660299 (7.205%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/monDom5/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz Tasmanian Devil sarHar1 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SarHar1 mkdir
/hive/data/genomes/mm10/bed/lastzSarHar1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzSarHar1.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. tasmanian devil BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tasmanian devil sarHar1 SEQ2_DIR=/scratch/data/sarHar1/sarHar1.2bit SEQ2_LEN=/scratch/data/sarHar1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzSarHar1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT defaults to 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1208m55.866s cat fb.mm10.chainSarHar1Link.txt # 224935746 bases of 2652783500 (8.479%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSarHar1.2012-03-29 lastz.sarHar1 # and for the swap mkdir /hive/data/genomes/sarHar1/bed/blastz.mm10.swap cd /hive/data/genomes/sarHar1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSarHar1.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 45m53.015s cat fb.sarHar1.chainMm10Link.txt # 231249436 bases of 2931539702 (7.888%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/sarHar1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz budgerigar melUnd1 (DONE - 2012-03-29
- Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MelUnd1 mkdir /hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. budgerigar BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: budgerigar melUnd1 SEQ2_DIR=/hive/data/genomes/melUnd1/melUnd1.2bit SEQ2_LEN=/hive/data/genomes/melUnd1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 883m58.198s cat fb.mm10.chainMelUnd1Link.txt # 95217653 bases of 2652783500 (3.589%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMelUnd1.2012-03-29 lastz.melUnd1 # and for the swap mkdir /hive/data/genomes/melUnd1/bed/blastz.mm10.swap cd /hive/data/genomes/melUnd1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 9m9.260s cat fb.melUnd1.chainMm10Link.txt # 79867911 bases of 1086614815 (7.350%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/melUnd1/bed ln -s blastz.mm10.swap 
lastz.mm10 ######################################################################### # lastz platypus ornAna1 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OrnAna1 mkdir /hive/data/genomes/mm10/bed/lastzOrnAna1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzOrnAna1.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. platypus BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: platypus ornAna1 SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/mm10/bed/lastzOrnAna1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1264m1.056s cat fb.mm10.chainOrnAna1Link.txt # 141873792 bases of 2652783500 (5.348%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOrnAna1.2012-03-29 lastz.ornAna1 # and for the swap mkdir /hive/data/genomes/ornAna1/bed/blastz.mm10.swap cd /hive/data/genomes/ornAna1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOrnAna1.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 49m45.308s cat fb.ornAna1.chainMm10Link.txt # 135101083 bases of 1842236818 (7.334%) in intersection # set 
sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/ornAna1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz turtle chrPic1 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10ChrPic1 mkdir /hive/data/genomes/mm10/bed/lastzChrPic1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzChrPic1.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. turtle BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: turtle chrPic1 SEQ2_DIR=/hive/data/genomes/chrPic1/chrPic1.2bit SEQ2_LEN=/hive/data/genomes/chrPic1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzChrPic1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1243m2.518s cat fb.mm10.chainChrPic1Link.txt # 125499965 bases of 2652783500 (4.731%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzChrPic1.2012-03-29 lastz.chrPic1 # and for the swap mkdir /hive/data/genomes/chrPic1/bed/blastz.mm10.swap cd /hive/data/genomes/chrPic1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzChrPic1.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 
& # real 19m26.835s cat fb.chrPic1.chainMm10Link.txt # 118436838 bases of 2158289746 (5.488%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/chrPic1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz chicken galGal4 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10GalGal4 mkdir /hive/data/genomes/mm10/bed/lastzGalGal4.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzGalGal4.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. chicken BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: chicken galGal4 SEQ2_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ2_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzGalGal4.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 109m21.068s # broken swarm cluster, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 57m24.155s cat fb.mm10.chainGalGal4Link.txt # 97510773 bases of 2652783500 (3.676%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGalGal4.2012-04-02 
lastz.galGal4 # and for the swap mkdir /hive/data/genomes/galGal4/bed/blastz.mm10.swap cd /hive/data/genomes/galGal4/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGalGal4.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 95m50.996s cat fb.galGal4.chainMm10Link.txt # 83660034 bases of 1032854810 (8.100%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/galGal4/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz zebra finch taeGut1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TaeGut1 mkdir /hive/data/genomes/mm10/bed/lastzTaeGut1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzTaeGut1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. zebra finch BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: zebra finch taeGut1 SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=5 BASE=/hive/data/genomes/mm10/bed/lastzTaeGut1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 106m11.612s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat 
`pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 29m11.090s cat fb.mm10.chainTaeGut1Link.txt # 95469341 bases of 2652783500 (3.599%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTaeGut1.2012-04-02 lastz.taeGut1 # and for the swap mkdir /hive/data/genomes/taeGut1/bed/blastz.mm10.swap cd /hive/data/genomes/taeGut1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTaeGut1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 37m17.483s cat fb.taeGut1.chainMm10Link.txt # 89312133 bases of 1222864691 (7.304%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/taeGut1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz lizard anoCar2 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10AnoCar2 mkdir /hive/data/genomes/mm10/bed/lastzAnoCar2.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzAnoCar2.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. 
lizard BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: lizard anoCar2 SEQ2_DIR=/scratch/data/anoCar2/anoCar2.2bit SEQ2_LEN=/scratch/data/anoCar2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=15 BASE=/hive/data/genomes/mm10/bed/lastzAnoCar2.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 103m17.133s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 43m2.183s cat fb.mm10.chainAnoCar2Link.txt # 88356459 bases of 2652783500 (3.331%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzAnoCar2.2012-04-02 lastz.anoCar2 # and for the swap mkdir /hive/data/genomes/anoCar2/bed/blastz.mm10.swap cd /hive/data/genomes/anoCar2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAnoCar2.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 97m50.599s cat fb.anoCar2.chainMm10Link.txt # 84865552 bases of 1701353770 (4.988%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/anoCar2/bed ln -s blastz.mm10.swap lastz.mm10 
######################################################################### # lastz turkey melGal1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MelGal1 mkdir /hive/data/genomes/mm10/bed/lastzMelGal1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzMelGal1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. turkey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: turkey melGal1 SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit SEQ2_LEN=/scratch/data/melGal1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=15 BASE=/hive/data/genomes/mm10/bed/lastzMelGal1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 101m17.902s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 20m47.771s cat fb.mm10.chainMelGal1Link.txt # 93132953 bases of 2652783500 (3.511%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMelGal1.2012-04-02 lastz.melGal1 # and for the swap mkdir /hive/data/genomes/melGal1/bed/blastz.mm10.swap cd /hive/data/genomes/melGal1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMelGal1.2012-04-02/DEF \ -workhorse=hgwdev 
-smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 88m39.591s cat fb.melGal1.chainMm10Link.txt # 76848161 bases of 935922386 (8.211%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/melGal1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz frog xenTro3 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10XenTro3 mkdir /hive/data/genomes/mm10/bed/lastzXenTro3.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzXenTro3.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. frog BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: frog xenTro3 SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=40 BASE=/hive/data/genomes/mm10/bed/lastzXenTro3.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 99m10.611s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 37m52.678s cat fb.mm10.chainXenTro3Link.txt # 82900338 bases of 2652783500 (3.125%) in intersection # set sym link to indicate this is the lastz 
for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzXenTro3.2012-04-02 lastz.xenTro3 # and for the swap mkdir /hive/data/genomes/xenTro3/bed/blastz.mm10.swap cd /hive/data/genomes/xenTro3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzXenTro3.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 53m19.485s cat fb.xenTro3.chainMm10Link.txt # 90345130 bases of 1358334882 (6.651%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/xenTro3/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz coelacanth latCha1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10LatCha1 mkdir /hive/data/genomes/mm10/bed/lastzLatCha1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzLatCha1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. 
coelacanth BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: coelacanth latCha1 SEQ2_DIR=/hive/data/genomes/latCha1/latCha1.2bit SEQ2_LEN=/hive/data/genomes/latCha1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzLatCha1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 95m34.477s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 214m7.324s cat fb.mm10.chainLatCha1Link.txt # 72036116 bases of 2652783500 (2.715%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzLatCha1.2012-04-02 lastz.latCha1 # and for the swap mkdir /hive/data/genomes/latCha1/bed/blastz.mm10.swap cd /hive/data/genomes/latCha1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzLatCha1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 14m44.600s cat fb.latCha1.chainMm10Link.txt # 73798131 bases of 2183592768 (3.380%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/latCha1/bed ln -s blastz.mm10.swap lastz.mm10 
######################################################################### # lastz atlantic cod gadMor1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10GadMor1 mkdir /hive/data/genomes/mm10/bed/lastzGadMor1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzGadMor1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. atlantic cod BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: atlantic cod gadMor1 SEQ2_DIR=/hive/data/genomes/gadMor1/gadMor1.2bit SEQ2_LEN=/hive/data/genomes/gadMor1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=700 BASE=/hive/data/genomes/mm10/bed/lastzGadMor1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 91m23.642s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 39m41.194s cat fb.mm10.chainGadMor1Link.txt # 45795692 bases of 2652783500 (1.726%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGadMor1.2012-04-02 lastz.gadMor1 # and for the swap mkdir /hive/data/genomes/gadMor1/bed/blastz.mm10.swap cd /hive/data/genomes/gadMor1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ 
/hive/data/genomes/mm10/bed/lastzGadMor1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 62m58.963s cat fb.gadMor1.chainMm10Link.txt # 41406507 bases of 608038597 (6.810%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/gadMor1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz nile tilapia oreNil1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OreNil1 mkdir /hive/data/genomes/mm10/bed/lastzOreNil1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzOreNil1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. nile tilapia BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: nile tilapia oreNil1 SEQ2_DIR=/scratch/data/oreNil1/oreNil1.2bit SEQ2_LEN=/scratch/data/oreNil1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzOreNil1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 89m6.727s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 24m3.960s cat fb.mm10.chainOreNil1Link.txt # 
51915568 bases of 2652783500 (1.957%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOreNil1.2012-04-02 lastz.oreNil1 # and for the swap mkdir /hive/data/genomes/oreNil1/bed/blastz.mm10.swap cd /hive/data/genomes/oreNil1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOreNil1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 90m55.298s cat fb.oreNil1.chainMm10Link.txt # 49709461 bases of 816084674 (6.091%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oreNil1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz stickleback gasAcu1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10GasAcu1 mkdir /hive/data/genomes/mm10/bed/lastzGasAcu1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzGasAcu1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. 
stickleback BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: stickleback gasAcu1 SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit SEQ2_LEN=/scratch/data/gasAcu1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/mm10/bed/lastzGasAcu1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 87m5.963s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 9m49.199s cat fb.mm10.chainGasAcu1Link.txt # 53469711 bases of 2652783500 (2.016%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGasAcu1.2012-04-02 lastz.gasAcu1 # and for the swap mkdir /hive/data/genomes/gasAcu1/bed/blastz.mm10.swap cd /hive/data/genomes/gasAcu1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGasAcu1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 12m58.072s cat fb.gasAcu1.chainMm10Link.txt # 48802831 bases of 446627861 (10.927%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/gasAcu1/bed ln -s blastz.mm10.swap lastz.mm10 
######################################################################### # lastz fugu fr3 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10Fr3 mkdir /hive/data/genomes/mm10/bed/lastzFr3.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzFr3.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. fugu BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: fugu fr3 SEQ2_DIR=/scratch/data/fr3/fr3.2bit SEQ2_LEN=/scratch/data/fr3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzFr3.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 84m37.070s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 171m16.627s cat fb.mm10.chainFr3Link.txt # 47460021 bases of 2652783500 (1.789%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzFr3.2012-04-02 lastz.fr3 # and for the swap mkdir /hive/data/genomes/fr3/bed/blastz.mm10.swap cd /hive/data/genomes/fr3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzFr3.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap 
-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 7m13.151s cat fb.fr3.chainMm10Link.txt # 42586058 bases of 350961831 (12.134%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/fr3/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz tetraodon tetNig2 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TetNig2 mkdir /hive/data/genomes/mm10/bed/lastzTetNig2.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzTetNig2.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. tetraodon BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tetraodon tetNig2 SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzTetNig2.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # NOTE: no elapsed time was recorded for the initial run above; the run # below resumes it at the cat step (presumably after the same broken # swarm noted in the other 2012-04-02 alignments - confirm in do.log): time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 13m21.638s cat fb.mm10.chainTetNig2Link.txt # 46035322 bases of 2652783500 (1.735%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTetNig2.2012-04-02 lastz.tetNig2
# and for the swap mkdir /hive/data/genomes/tetNig2/bed/blastz.mm10.swap cd /hive/data/genomes/tetNig2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTetNig2.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 7m24.115s cat fb.tetNig2.chainMm10Link.txt # 41242926 bases of 302314788 (13.642%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/tetNig2/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz zebrafish danRer7 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10DanRer7 mkdir /hive/data/genomes/mm10/bed/lastzDanRer7.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzDanRer7.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. zebrafish BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: zebrafish danRer7 SEQ2_DIR=/scratch/data/danRer7/danRer7.2bit SEQ2_LEN=/scratch/data/danRer7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzDanRer7.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 80m32.118s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ 
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 40m27.762s cat fb.mm10.chainDanRer7Link.txt # 69028912 bases of 2652783500 (2.602%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzDanRer7.2012-04-02 lastz.danRer7 # and for the swap mkdir /hive/data/genomes/danRer7/bed/blastz.mm10.swap cd /hive/data/genomes/danRer7/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDanRer7.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 109m49.939s cat fb.danRer7.chainMm10Link.txt # 72001768 bases of 1409770109 (5.107%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/danRer7/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz medaka oryLat2 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OryLat2 mkdir /hive/data/genomes/mm10/bed/lastzOryLat2.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzOryLat2.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. 
medaka BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: medaka oryLat2 SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzOryLat2.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 78m53.408s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 113m29.462s cat fb.mm10.chainOryLat2Link.txt # 51344841 bases of 2652783500 (1.936%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOryLat2.2012-04-02 lastz.oryLat2 # and for the swap mkdir /hive/data/genomes/oryLat2/bed/blastz.mm10.swap cd /hive/data/genomes/oryLat2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOryLat2.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 7m52.846s cat fb.oryLat2.chainMm10Link.txt # 45954178 bases of 700386597 (6.561%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oryLat2/bed ln -s blastz.mm10.swap lastz.mm10 
#########################################################################
# lastz lamprey petMar1 (DONE - 2012-04-02 - Hiram)
    # establish a screen to control this job with a name to indicate what it is
    screen -S mm10PetMar1
    mkdir /hive/data/genomes/mm10/bed/lastzPetMar1.2012-04-02
    cd /hive/data/genomes/mm10/bed/lastzPetMar1.2012-04-02

    cat << '_EOF_' > DEF
# Mouse vs. lamprey
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Mouse Mm10
SEQ1_DIR=/scratch/data/mm10/mm10.2bit
SEQ1_LEN=/scratch/data/mm10/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: lamprey petMar1
SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/scratch/data/petMar1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=200

BASE=/hive/data/genomes/mm10/bed/lastzPetMar1.2012-04-02
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    # number of jobs, 50,000 to something under 100,000
    # when not present, SEQ2_LIMIT is a default 100

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
    # real 77m3.923s

    # broken swarm, continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        -qRepeats=windowmaskerSdust -continue=cat `pwd`/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 &

    # missing qRepeats specification
    # (the first run above was started without -qRepeats, so the net is
    #  removed and the procedure repeated from the load step)
    rm axtChain/mm10.petMar1.net
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        -qRepeats=windowmaskerSdust -continue=load `pwd`/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > load.log 2>&1 &
    # real 6m31.527s

    cat fb.mm10.chainPetMar1Link.txt
    # 29205053 bases of 2652783500 (1.101%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/mm10/bed
    ln -s lastzPetMar1.2012-04-02 lastz.petMar1

    # and for the swap
    mkdir /hive/data/genomes/petMar1/bed/blastz.mm10.swap
    cd /hive/data/genomes/petMar1/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/mm10/bed/lastzPetMar1.2012-04-02/DEF \
        -qRepeats=windowmaskerSdust -workhorse=hgwdev \
        -smallClusterHub=encodek -bigClusterHub=swarm \
        -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
    # real 17m40.196s

    cat fb.petMar1.chainMm10Link.txt
    # 26274715 bases of 831696438 (3.159%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/petMar1/bed
    ln -s blastz.mm10.swap lastz.mm10

#########################################################################
## 60-Way Multiz (DONE - 2011-09-28 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/mm10/bed/multiz60way
    cd /hive/data/genomes/mm10/bed/multiz60way

    # from the 62-way in the source tree, do not need ailMis1 and croPor1:
    /cluster/bin/phast/tree_doctor --prune ailMis1,croPor1 \
        /cluster/home/hiram/kent/src/hg/utils/phyloTrees/62way.nh > 60way.nh

    # note, newer assemblies: susScr3, dasNov3, felCat5, hetGla2, turTru2,
    # nomLeu2, oreNil2

    # what that looks like:
    cat 60way.nh
# (((((((((((((((((((hg19:0.006550,panTro4:0.006840):0.002220,
# gorGor3:0.008964):0.009693,ponAbe2:0.018940):0.003471,
# nomLeu2:0.022270):0.012040,(rheMac3:0.007991,
# papHam1:0.008042):0.029610):0.021830,(calJac3:0.030000,
# saiBol1:0.040000):0.039650):0.052090,tarSyr1:0.111400):0.020520,
# (micMur1:0.085600,otoGar3:0.119400):0.020520):0.015494,
# tupBel1:0.186203):0.004937,(((((mm10:0.084509,rn5:0.091589):0.197773,
# dipOrd1:0.211609):0.022992,(hetGla2:0.100000,
# cavPor3:0.125629):0.100000):0.010150,speTri2:0.148468):0.025746,
# (oryCun2:0.114227,ochPri2:0.201069):0.101463):0.015313):0.020593,
# (((susScr3:0.120000,(vicPac1:0.087275,(turTru2:0.064688,
# (oviAri1:0.100000,bosTau7:0.100000):0.023592):0.025153):0.020335):0.020000,
# ((equCab2:0.109397,(felCat5:0.098612,
# (canFam3:0.052458,ailMel1:0.050000):0.050000):0.049845):0.006219,
# (myoLuc2:0.142540,pteVam1:0.113399):0.033706):0.004508):0.011671,
# (eriEur1:0.221785,sorAra1:0.269562):0.056393):0.021227):0.023664,
# ((((loxAfr3:0.082242,proCap1:0.155358):0.026990,echTel1:0.245936):0.010000,
# triMan1:0.100000):0.049697,(dasNov3:0.116664,
# choHof1:0.096357):0.053145):0.006717):0.234728,(monDom5:0.125686,
# (sarHar1:0.100000,macEug2:0.072008):0.050000):0.215100):0.071664,
# ornAna1:0.456592):0.109504,(((((melGal1:0.100000,galGal4:0.065536):0.100000,
# taeGut1:0.171542):0.199223,melUnd1:0.100000):0.155143,
# anoCar2:0.539241):0.122371,chrPic1:0.200000):0.010000):0.050000,
# xenTro3:0.855573):0.100000,latCha1:0.855573):0.311354,
# ((((((tetNig2:0.224159,fr3:0.203847):0.097590,oreNil2:0.200000):0.097590,
# gasAcu1:0.316413):0.030000,oryLat2:0.511970):0.030000,
# gadMor1:0.350000):0.225640,danRer7:0.730752):0.147949):0.526688,
# petMar1:0.526688);

    # rearrange to get mm10 on top:
    cat << '_EOF_' > mm10.60way.nh
(((((((((((((((mm10:0.084509,rn5:0.091589):0.197773,dipOrd1:0.211609):0.022992,
(hetGla2:0.1,cavPor3:0.125629):0.1):0.01015,speTri2:0.148468):0.025746,(oryCun2:0.114227,ochPri2:0.201069):0.101463):0.015313,
(((((((((hg19:0.00655,panTro4:0.00684):0.00222,gorGor3:0.008964):0.009693,ponAbe2:0.01894):0.003471,
nomLeu2:0.02227):0.01204,(rheMac3:0.007991,papHam1:0.008042):0.02961):0.02183,
(calJac3:0.03,saiBol1:0.04):0.03965):0.05209,tarSyr1:0.1114):0.02052,(micMur1:0.0856,otoGar3:0.1194):0.02052):0.015494,
tupBel1:0.186203):0.004937):0.020593,
((susScr3:0.12,(vicPac1:0.087275,(turTru2:0.064688,
(oviAri1:0.1,bosTau7:0.1):0.023592):0.025153):0.020335):0.01,
((((felCat5:0.098612,
(canFam3:0.052458,ailMel1:0.05):0.05):0.049845,equCab2:0.109397):0.006219,
(myoLuc2:0.14254,pteVam1:0.113399):0.033706):0.004508,(eriEur1:0.221785,
sorAra1:0.269562):0.056393):0.021227):0.01):0.013664,((((loxAfr3:0.082242,proCap1:0.155358):0.02699,
echTel1:0.245936):0.01,triMan1:0.1):0.049697,(dasNov3:0.116664,
choHof1:0.096357):0.053145):0.006717):0.234728,(monDom5:0.125686,(sarHar1:0.1,
macEug2:0.072008):0.05):0.2151):0.071664,ornAna1:0.456592):0.109504,
(((((melGal1:0.1,galGal4:0.065536):0.1,taeGut1:0.171542):0.199223,melUnd1:0.1):0.155143,anoCar2:0.539241):0.122371,
chrPic1:0.2):0.01):0.05,xenTro3:0.855573):0.1,latCha1:0.855573):0.311354,
((((((tetNig2:0.224159,fr3:0.203847):0.09759,oreNil2:0.2):0.09759,gasAcu1:0.316413):0.03,
oryLat2:0.51197):0.03,gadMor1:0.35):0.22564,danRer7:0.730752):0.147949):0.526688,petMar1:0.526688);
'_EOF_'
    # << happy emacs

    # extract species list from that .nh file
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
        mm10.60way.nh | xargs echo | sed 's/ //g; s/,/ /g' \
        | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt

    # construct db to name translation list:
    cat species.list.txt | while read DB
do
hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest
done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \
        > db.to.name.txt

    # construct a common name .nh file:
    /cluster/bin/phast/tree_doctor --rename \
    "`cat db.to.name.txt`" mm10.60way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \
        | sed -e 's/X__trop/X._trop/' > mm10.60way.commonNames.nh
# (((((((((((((((Mouse:0.084509,Rat:0.091589):0.197773,
# Kangaroo_rat:0.211609):0.022992,(Naked_mole:0.1,
# Guinea_pig:0.125629):0.1):0.01015,Squirrel:0.148468):0.025746,
# (Rabbit:0.114227,Pika:0.201069):0.101463):0.015313,
# (((((((((Human:0.00655,Chimp:0.00684):0.00222,Gorilla:0.008964):0.009693,
# Orangutan:0.01894):0.003471,Gibbon:0.02227):0.01204,
# (Chinese_rhesus:0.007991,Baboon:0.008042):0.02961):0.02183,
# (Marmoset:0.03,Squirrel_monkey:0.04):0.03965):0.05209,
# Tarsier:0.1114):0.02052,(Mouse_lemur:0.0856,
# Bushbaby:0.1194):0.02052):0.015494,Tree_shrew:0.186203):0.004937):0.020593,
# ((Pig:0.12,(Alpaca:0.087275,(Dolphin:0.064688,
# (Sheep:0.1,Cow:0.1):0.023592):0.025153):0.020335):0.01,
# ((((Cat:0.098612,(Dog:0.052458,Panda:0.05):0.05):0.049845,
# Horse:0.109397):0.006219,(Microbat:0.14254,
# Megabat:0.113399):0.033706):0.004508,(Hedgehog:0.221785,
# Shrew:0.269562):0.056393):0.021227):0.01):0.013664,
# ((((Elephant:0.082242,Rock_hyrax:0.155358):0.02699,
# Tenrec:0.245936):0.01,Manatee:0.1):0.049697,
# (Armadillo:0.116664,Sloth:0.096357):0.053145):0.006717):0.234728,
# (Opossum:0.125686,(Tasmanian_devil:0.1,
# Wallaby:0.072008):0.05):0.2151):0.071664,Platypus:0.456592):0.109504,
# (((((Turkey:0.1,Chicken:0.065536):0.1,Zebra_finch:0.171542):0.199223,
# Budgerigar:0.1):0.155143,Lizard:0.539241):0.122371,
# Painted_turtle:0.2):0.01):0.05,X._tropicalis:0.855573):0.1,
# Coelacanth:0.855573):0.311354,((((((Tetraodon:0.224159,
# Fugu:0.203847):0.09759,Nile_tilapia:0.2):0.09759,
# Stickleback:0.316413):0.03,Medaka:0.51197):0.03,
# Atlantic_cod:0.35):0.22564,Zebrafish:0.730752):0.147949):0.526688,
# Lamprey:0.526688);

    # Use this specification in the phyloGif tool:
    # http://genome.ucsc.edu/cgi-bin/phyloGif
    # to obtain a png image for src/hg/htdocs/images/phylo/mm10_60way.png

    /cluster/bin/phast/all_dists mm10.60way.nh | grep mm10 \
        | sed -e "s/mm10^I//" | sort -k2n > 60way.distances.txt
    # Use this output to create the table below
    head 60way.distances.txt
# rn5 0.176098
# speTri2 0.463892
# micMur1 0.483034
# dipOrd1 0.493891
# vicPac1 0.504686
# hetGla2 0.505274
# hg19 0.505328
# gorGor3 0.505522
# panTro4 0.505618
# nomLeu2 0.505664

    # sizeStats.pl: for each "db distance" row of 60way.distances.txt, print
    # one table row with the featureBits chainLink percentage measured on
    # mm10 and the percentage measured on the other genome (swap), read from
    # the fb.*.txt files left behind by doBlastzChainNet.pl
    cat << '_EOF_' > sizeStats.pl
#!/usr/bin/env perl

use strict;
use warnings;

open (FH, "<60way.distances.txt") or
        die "can not read 60way.distances.txt";

my $count = 0;
while (my $line = <FH>) {
    chomp $line;
    my ($D, $dist) = split('\s+', $line);
    my $chain = "chain" . ucfirst($D);
    my $B="/hive/data/genomes/mm10/bed/lastz.$D/fb.mm10." .
        $chain . "Link.txt";
    my $chainLinkMeasure =
        `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
    chomp $chainLinkMeasure;
    $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
    $chainLinkMeasure =~ s/\%//;
    my $swapFile="/hive/data/genomes/${D}/bed/lastz.mm10/fb.${D}.chainMm10Link.txt";
    my $swapMeasure = "N/A";
    if ( -s $swapFile ) {
        $swapMeasure =
            `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
        chomp $swapMeasure;
        $swapMeasure = 0.0 if (length($swapMeasure) < 1);
        $swapMeasure =~ s/\%//;
    }
    my $orgName=
    `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
    chomp $orgName;
    if (length($orgName) < 1) {
        $orgName="N/A";
    }
    ++$count;
    printf "# %02d %.4f (%% %06.3f) (%% %06.3f) - %s %s\n", $count, $dist,
        $chainLinkMeasure, $swapMeasure, $orgName, $D;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./sizeStats.pl
    ./sizeStats.pl
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
#
# featureBits chainLink measures
# chainAnoCar2Link
# N distance on mm10 on other other species
# 01 0.1761 (% 67.353) (% 70.278) - Rat rn5
# 02 0.4639 (% 34.217) (% 39.244) - Squirrel speTri2
# 03 0.4830 (% 26.636) (% 37.574) - Mouse lemur micMur1
# 04 0.4939 (% 19.460) (% 27.512) - Kangaroo rat dipOrd1
# 05 0.5047 (% 22.636) (% 31.769) - Alpaca vicPac1
# 06 0.5053 (% 32.753) (% 37.989) - Naked mole rat hetGla2
# 07 0.5053 (% 38.226) (% 35.249) - Human hg19
# 08 0.5055 (% 33.987) (% 34.349) - Gorilla gorGor3
# 09 0.5056 (% 34.674) (% 31.924) - Chimp panTro4
# 10 0.5057 (% 34.031) (% 32.274) - Gibbon nomLeu2
# 11 0.5058 (% 34.496) (% 30.610) - Orangutan ponAbe2
# 12 0.5073 (% 30.267) (% 33.492) - Dolphin turTru2
# 13 0.5088 (% 24.560) (% 24.986) - Tarsier tarSyr1
# 14 0.5090 (% 33.931) (% 33.464) - Chinese rhesus rheMac3
# 15 0.5090 (% 33.577) (% 32.023) - Baboon papHam1
# 16 0.5168 (% 29.795) (% 32.926) - Bushbaby otoGar3
# 17 0.5171 (% 25.685) (% 29.445) - Pig susScr3
# 18 0.5192 (% 32.450) (% 31.301) - Marmoset calJac3
# 19 0.5284 (% 34.415) (% 37.138) - Horse equCab2
# 20 0.5292 (% 32.339) (% 33.848) - Squirrel monkey saiBol1
# 21 0.5309 (% 28.447) (% 29.115) - Guinea pig cavPor3
# 22 0.5470 (% 18.019) (% 23.687) - Sloth choHof1
# 23 0.5472 (% 26.546) (% 24.649) - Manatee triMan1
# 24 0.5476 (% 19.766) (% 25.144) - Tree shrew tupBel1
# 25 0.5569 (% 25.248) (% 25.677) - Rabbit oryCun2
# 26 0.5599 (% 27.345) (% 38.627) - Megabat pteVam1
# 27 0.5662 (% 26.255) (% 25.383) - Cow bosTau7
# 28 0.5662 (% 15.341) (% 31.925) - Sheep oviAri1
# 29 0.5664 (% 25.823) (% 21.616) - Elephant loxAfr3
# 30 0.5673 (% 25.201) (% 21.066) - Armadillo dasNov3
# 31 0.5675 (% 29.725) (% 32.244) - Cat felCat5
# 32 0.5689 (% 30.979) (% 35.562) - Panda ailMel1
# 33 0.5713 (% 29.144) (% 31.624) - Dog canFam3
# 34 0.5891 (% 24.363) (% 33.650) - Microbat myoLuc2
# 35 0.6395 (% 15.147) (% 16.214) - Rock hyrax proCap1
# 36 0.6437 (% 14.542) (% 19.908) - Pika ochPri2
# 37 0.6865 (% 09.856) (% 12.264) - Hedgehog eriEur1
# 38 0.7031 (% 10.947) (% 14.117) - Tenrec echTel1
# 39 0.7343 (% 09.382) (% 13.569) - Shrew sorAra1
# 40 0.9626 (% 04.353) (% 04.448) - Wallaby macEug2
# 41 0.9663 (% 09.584) (% 07.205) - Opossum monDom5
# 42 0.9906 (% 08.479) (% 07.888) - Tasmanian devil sarHar1
# 43 1.0166 (% 04.731) (% 05.488) - Painted turtle chrPic1
# 44 1.1537 (% 05.348) (% 07.334) - Platypus ornAna1
# 45 1.1942 (% 03.589) (% 07.350) - Budgerigar melUnd1
# 46 1.4589 (% 03.676) (% 08.100) - Chicken galGal4
# 47 1.4649 (% 03.599) (% 07.304) - Zebra finch taeGut1
# 48 1.4782 (% 03.331) (% 04.988) - Lizard anoCar2
# 49 1.4934 (% 03.511) (% 08.211) - Turkey melGal1
# 50 1.7122 (% 03.125) (% 06.651) - X. tropicalis xenTro3
# 51 1.8122 (% 02.715) (% 03.380) - Coelacanth latCha1
# 52 1.9916 (% 01.726) (% 06.810) - Atlantic cod gadMor1
# 53 1.9992 (% 01.957) (% 06.091) - Nile tilapia oreNil2
# 54 2.0180 (% 02.016) (% 10.927) - Stickleback gasAcu1
# 55 2.1006 (% 01.789) (% 12.134) - Fugu fr3
# 56 2.1209 (% 01.735) (% 13.642) - Tetraodon tetNig2
# 57 2.1467 (% 02.602) (% 05.107) - Zebrafish danRer7
# 58 2.1835 (% 01.936) (% 06.561) - Medaka oryLat2
# 59 2.3214 (% 01.101) (% 03.159) - Lamprey petMar1

    # None of this concern for distances matters in building the first step, the
    # maf files.

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
        mm10.60way.nh > tmp.nh
    echo `cat tmp.nh` > tree-commas.nh
    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.list

    # bash shell syntax here ...
    # link each species' pairwise net maf files into mafLinks/<db>, taking
    # the first that exists of: mafRBestNet, mafSynNet, mafNet
    cd /hive/data/genomes/mm10/bed/multiz60way
    export H=/hive/data/genomes/mm10/bed
    mkdir mafLinks
    for G in `sed -e "s/mm10 //" species.list`
    do
      mkdir mafLinks/$G
      if [ -s ${H}/lastz.${G}/mafRBestNet/chr1.maf.gz ]; then
        echo "$G - recipBest"
        ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G
      else
        if [ -s ${H}/lastz.${G}/mafSynNet/chr1.maf.gz ]; then
          echo "$G - synNet"
          ln -s ${H}/lastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G
        else
          if [ -s ${H}/lastz.${G}/mafNet/chr1.maf.gz ]; then
            echo "$G - mafNet"
            ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
          else
            echo "missing directory lastz.${G}/*Net"
          fi
        fi
      fi
    done

    # verify the alignment type is correct:
    for D in `grep -v mm10 /hive/users/hiram/bigWays/mm10.60way/ordered.list`
do
    ls -l mafLinks/$D/chr1.maf.gz | awk '{print $NF}'
done
    # compare to the list at:
    # http://genomewiki.ucsc.edu/index.php/Mm10_Genome_size_statistics

    # need to split these things up into smaller pieces for
    # efficient kluster run.
cd /hive/data/genomes/mm10/bed/multiz60way mkdir mafSplit cd mafSplit # mafSplitPos splits on gaps or repeat areas that will not have # any chains, approx 5 Mbp intervals, gaps at least 10,000 mafSplitPos -minGap=10000 mm10 5 stdout | sort -u \ | sort -k1,1 -k2,2n > mafSplit.bed # There is a splitRegions.pl script here (copied from previous hg19 46way) # that can create a custom track from this mafSplit.bed file. # Take a look at that in the browser and see if it looks OK, # check the number of sections on each chrom to verify none are # too large. Despite the claim above, it does appear that some # areas are split where actual chains exist. ./splitRegions.pl mafSplit.bed > splitRegions.ct # to see the sizes of the regions: grep "^chr" splitRegions.ct | awk '{print $3-$2,$0}' | sort -rn | less # run a kluster job to split them all ssh swarm cd /hive/data/genomes/mm10/bed/multiz60way/mafSplit cat << '_EOF_' > runOne #!/bin/csh -ef set G = $1 set C = $2 mkdir -p $G pushd $G > /dev/null if ( -s ../../mafLinks/${G}/${C}.maf.gz ) then if ( -s mm10_${C}.00.maf ) then /bin/rm -f mm10_${C}.*.maf endif /cluster/bin/x86_64/mafSplit ../mafSplit.bed mm10_ ../../mafLinks/${G}/${C}.maf.gz /bin/gzip mm10_${C}.*.maf else /bin/touch mm10_${C}.00.maf /bin/gzip mm10_${C}.00.maf endif popd > /dev/null '_EOF_' # << happy emacs chmod +x runOne cat << '_EOF_' > template #LOOP runOne $(root1) $(root2) {check out exists+ $(root1)/mm10_$(root2).00.maf.gz} #ENDLOOP '_EOF_' # << happy emacs for G in `sed -e "s/mm10 //" ../species.list` do echo $G done > species.list cut -f 1 ../../../chrom.sizes > chr.list gensub2 species.list chr.list template jobList para -ram=8g create jobList para try ... check ... push ... etc... 
# Completed: 3894 of 3894 jobs
# CPU time in finished jobs: 18929s 315.49m 5.26h 0.22d 0.001 y
# IO & Wait Time: 62908s 1048.46m 17.47h 0.73d 0.002 y
# Average job time: 21s 0.35m 0.01h 0.00d
# Longest finished job: 346s 5.77m 0.10h 0.00d
# Submission to last job: 471s 7.85m 0.13h 0.01d

    # construct a list of all possible maf file names.
    # they do not all exist in each of the species directories
    find . -type f | grep "maf.gz" | wc -l
    # 19733
    find . -type f | grep ".maf.gz$" | xargs -L 1 basename | sort -u > maf.list
    wc -l maf.list
    # 336 maf.list

    mkdir /hive/data/genomes/mm10/bed/multiz60way/splitRun
    cd /hive/data/genomes/mm10/bed/multiz60way/splitRun
    mkdir maf run
    cd run
    mkdir penn
    cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn
    cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn
    cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn

    # set the db and pairs directories here
    # autoMultiz.csh <piece> <result>: stages every species' pairwise maf for
    # one split piece in a scratch directory (an empty maf header stands in
    # for a missing or empty input), runs autoMZ there, copies the result out
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = mm10
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/mm10/bed/multiz60way/mafSplit
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp > /dev/null
foreach s (`/bin/sed -e "s/$db //" species.list`)
    set in = $pairs/$s/$c
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
        if (! -s $out) then
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \
        > /dev/null
popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c $result
/bin/rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x autoMultiz.csh

    cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/mm10/bed/multiz60way/splitRun/maf/$(root1)}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ln -s ../../mafSplit/maf.list maf.list

    ssh swarm
    cd /hive/data/genomes/mm10/bed/multiz60way/splitRun/run
    # the tac reverses the list to get the small jobs first
    gensub2 maf.list single template stdout | tac > jobList
    para -ram=8g create jobList
# Completed: 336 of 336 jobs
# CPU time in finished jobs: 2828651s 47144.19m 785.74h 32.74d 0.090 y
# IO & Wait Time: 200533s 3342.21m 55.70h 2.32d 0.006 y
# Average job time: 9015s 150.26m 2.50h 0.10d
# Longest finished job: 47029s 783.82m 13.06h 0.54d
# Submission to last job: 48982s 816.37m 13.61h 0.57d

    # put the split maf results back together into a single maf file
    # eliminate duplicate comments
    ssh hgwdev
    cd /hive/data/genomes/mm10/bed/multiz60way/splitRun
    mkdir ../maf
    # the sed edits take out partitioning name information from the comments
    # so the multiple parts will condense to smaller number of lines
    # this takes almost 2 hours of time, resulting in a bit over 150 Gb,
    # almost all chrom files over 1 Gb, up to almost 10 Gb for chr2
    # HOWEVER, this is actually not necessary to maintain these comments,
    # they are lost during the mafAddIRows

    # NOTE: write these two files with '>' rather than '>>'.  Appending to a
    # runOne or template left over from an earlier attempt in this directory
    # would produce a script with two copies of the commands and a template
    # with two #LOOP sections.
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = $1
if ( -s ../maf/${C}.maf.gz ) then
    rm -f ../maf/${C}.maf.gz
endif
head -q -n 1 maf/mm10_${C}.*.maf | sort -u > ../maf/${C}.maf
grep -h "^#" maf/mm10_${C}.*.maf | egrep -v "maf version=1|eof maf" | \
    sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \
        | sort -u >> ../maf/${C}.maf
grep -h -v "^#" `ls maf/mm10_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf
tail -q -n 1 maf/mm10_${C}.*.maf | sort -u >> ../maf/${C}.maf
'_EOF_'
    # << happy emacs
    chmod +x runOne

    cat << '_EOF_' > template
#LOOP
runOne $(root1) {check out exists+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cut -f1 ../../../chrom.sizes > chr.list
    ssh encodek
    cd /hive/data/genomes/mm10/bed/multiz60way/splitRun
    gensub2 chr.list single template jobList
    para -ram=8g create jobList
    para try ... check ... push ... etc ...
# Completed: 62 of 66 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 461s 7.68m 0.13h 0.01d 0.000 y
# IO & Wait Time: 17863s 297.72m 4.96h 0.21d 0.001 y
# Average job time: 296s 4.93m 0.08h 0.00d
# Longest finished job: 1144s 19.07m 0.32h 0.01d
# Submission to last job: 1156s 19.27m 0.32h 0.01d

    # these four have empty results:
    # chrUn_GL456383
    # chrUn_GL456389
    # chrUn_GL456390
    # chrUn_GL456396

    # Load into database
    ssh hgwdev
    mkdir -p /gbdb/mm10/multiz60way
    cd /hive/data/genomes/mm10/bed/multiz60way/maf
    ln -s `pwd`/*.maf /gbdb/mm10/multiz60way

    # this generates an immense multiz60way.tab file in the directory
    # where it is running. Best to run this over in scratch.
    # This is going to take all day.
cd /scratch/tmp time nice -n +19 hgLoadMaf mm10 multiz60way # Loaded 56185270 mafs in 66 files from /gbdb/mm10/multiz60way # real 72m45.513s # -rw-rw-r-- 1 2857704841 Apr 18 10:49 multiz60way.tab time cat /gbdb/mm10/multiz60way/*.maf \ | nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 mm10 multiz60waySummary stdin # Created 12012784 summary blocks from 1074134156 components and # 56185270 mafs from stdin # real 104m2.107s wc -l multiz60way*.tab # 56185270 multiz60way.tab # 12012784 multiz60waySummary.tab # 68198054 total # -rw-rw-r-- 1 2857704841 Apr 18 10:49 multiz60way.tab # -rw-rw-r-- 1 567210414 Apr 18 17:28 multiz60waySummary.tab rm multiz60way*.tab ####################################################################### # GAP ANNOTATE MULTIZ9WAY MAF AND LOAD TABLES (DONE - 2012-05-31 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. mkdir -p /hive/data/genomes/mm10/bed/multiz60way/anno cd /hive/data/genomes/mm10/bed/multiz60way/anno cd /hive/data/genomes/mm10/bed/multiz60way/anno # check for N.bed files everywhere: for DB in `cat ../species.list` do if [ ! 
-s /hive/data/genomes/${DB}/${DB}.N.bed ]; then echo "MISS: ${DB}" cd /hive/data/genomes/${DB} twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed else echo " OK: ${DB}" fi done cd /hive/data/genomes/mm10/bed/multiz60way/anno for DB in `cat ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL screen -S mm10 # use a screen to control this longish job ssh swarm cd /hive/data/genomes/mm10/bed/multiz60way/anno mkdir result # NEXT TIME: this template should have a check out exists+ statement cat << '_EOF_' > template #LOOP mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/mm10/mm10.2bit {check out line+ result/$(file1)} #ENDLOOP '_EOF_' # << happy emacs ls ../maf/*.maf > maf.list # the tac puts the short jobs first gensub2 maf.list single template stdout | tac > jobList # limit jobs to one per node with the ram=8g requirement para -ram=8g create jobList para try ... check ... push ... # Completed: 46 of 66 jobs # CPU time in finished jobs: 350s 5.83m 0.10h 0.00d 0.000 y # IO & Wait Time: 603s 10.06m 0.17h 0.01d 0.000 y # Average job time: 21s 0.35m 0.01h 0.00d # Longest finished job: 54s 0.90m 0.01h 0.00d # Submission to last job: 113s 1.88m 0.03h 0.00d # a number of these jobs did not finish due to memory limitations. # The jobs would sit on the nodes appearing to occupy 8 Gb of memory, # but did not see any swapping or CPU time accumulation. 
Stop the # batch and run the rest manually on hgwdev: #!/bin/sh export maxMem=188743680 ulimit -S -m $maxMem -v $maxMem mafAddIRows -nBeds=nBeds ../maf/chrX.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chrX.maf & mafAddIRows -nBeds=nBeds ../maf/chr9.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr9.maf & mafAddIRows -nBeds=nBeds ../maf/chr8.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr8.maf & mafAddIRows -nBeds=nBeds ../maf/chr7.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr7.maf & wait mafAddIRows -nBeds=nBeds ../maf/chr6.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr6.maf & mafAddIRows -nBeds=nBeds ../maf/chr5.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr5.maf & mafAddIRows -nBeds=nBeds ../maf/chr4.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr4.maf & mafAddIRows -nBeds=nBeds ../maf/chr3.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr3.maf & wait ... etc ... # the run time for those 20 jobs: # real 159m49.217s # verify all result files have some content, look for 0 size files: find . -type f -size 0 # should see none # combine into one file (realized after this, that we do *not* need # this single file. Individual files are OK. head -q -n 1 result/chrM.maf > mm10.60way.maf time for F in hgwdev/*.maf result/*.maf do grep -h -v "^#" ${F} done >> mm10.60way.maf # real 1082m47.484s -> 18 hours ! 
# -rw-rw-r-- 1 261567878241 Jun 8 10:30 mm10.60way.maf du -hsc mm10.60way.maf # 244G mm10.60way.maf # these maf files do not have the end marker, this does nothing: # tail -q -n 1 result/chrM.maf >> mm10.60way.maf # How about an official end marker: echo "##eof maf" >> mm10.60way.maf # construct symlinks to get the individual maf files into gbdb: mkdir /gbdb/mm10/multiz60way/maf ln -s `pwd`/result/*.maf `pwd`/hgwdev/*.maf /gbdb/mm10/multiz60way/maf/ # Load into database rm /gbdb/mm10/multiz60way/*.maf # remove previous results cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/mm10/multiz60way/maf \ mm10 multiz60way # Loaded 58087742 mafs in 66 files from /gbdb/mm10/multiz60way/maf # real 868m28.108s time (cat /gbdb/mm10/multiz60way/maf/*.maf \ | hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 mm10 multiz60waySummary stdin) # -rw-rw-r-- 1 3009209972 Jun 9 03:23 multiz60way.tab # -rw-rw-r-- 1 591235982 Jun 11 18:34 multiz60waySummary.tab rm multiz60way*.tab ####################################################################### # MULTIZ60WAY MAF FRAMES (DONE - 2012-05-30 - Hiram) ssh hgwdev mkdir /hive/data/genomes/mm10/bed/multiz60way/frames cd /hive/data/genomes/mm10/bed/multiz60way/frames # survey all the genomes to find out what kinds of gene tracks they have cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "${db}: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || \ $table == "mgcGenes" || $table == "knownGene" || \ $table == "xenoRefGene" ) then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql hg19 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql hg19 -N -e "select 
count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh time ./showGenes.csh > showGenes.txt # real 9m11.678s # rearrange that output to create four sections, and place these names # in .list files here: # 1. knownGene: hg19 # 2. refGene: bosTau7 danRer7 galGal4 mm10 rheMac3 rn5 susScr3 xenTro3 # 3. ensGene: ailMel1 anoCar2 calJac3 cavPor3 choHof1 dipOrd1 echTel1 # equCab2 eriEur1 fr3 gasAcu1 gorGor3 loxAfr3 melGal1 # micMur1 monDom5 myoLuc2 ochPri2 ornAna1 oryCun2 oryLat2 # panTro4 ponAbe2 proCap1 pteVam1 sorAra1 taeGut1 tarSyr1 # tetNig2 tupBel1 vicPac1 # 4. xenoRefGene: canFam3 chrPic1 dasNov3 felCat5 hetGla2 latCha1 macEug2 # nomLeu2 otoGar3 oviAri1 papHam1 petMar1 saiBol1 sarHar1 # triMan1 # 5. genscan: gadMor1 melUnd1 oreNil2 speTri2 turTru2 mkdir genes # 1. knownGene: hg19 hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" hg19 \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/hg19.gp.gz # 2. refGene, want the full extended genePred: for DB in `cat refGene.list` do hgsql -N -e "select * from refGene" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # 3. ensGene, want the full extended genePred: for DB in `cat ensGene.list` do hgsql -N -e "select * from ensGene" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # 4. xenoRefGene, want the full extended genePred: for DB in `cat xenoRG.list` do hgsql -N -e "select * from xenoRefGene" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # 5. 
genscan: gadMor1 melUnd1 oreNil2 speTri2 turTru2 for DB in `cat genscan.list` do hgsql -N -e "select * from xenoRefGene" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/ailMel1.gp.gz: 19204 # genes/anoCar2.gp.gz: 17766 # genes/bosTau7.gp.gz: 12958 # genes/calJac3.gp.gz: 20843 # genes/canFam3.gp.gz: 20652 # genes/cavPor3.gp.gz: 18631 # genes/choHof1.gp.gz: 12403 # genes/chrPic1.gp.gz: 19433 # genes/danRer7.gp.gz: 13902 # genes/dasNov3.gp.gz: 29551 # genes/dipOrd1.gp.gz: 15784 # genes/echTel1.gp.gz: 16499 # genes/equCab2.gp.gz: 20403 # genes/eriEur1.gp.gz: 11712 # genes/felCat5.gp.gz: 19512 # genes/fr3.gp.gz: 18014 # genes/gadMor1.gp.gz: 27572 # genes/galGal4.gp.gz: 4892 # genes/gasAcu1.gp.gz: 20631 # genes/gorGor3.gp.gz: 20759 # genes/hetGla2.gp.gz: 25749 # genes/hg19.gp.gz: 20718 # genes/latCha1.gp.gz: 18786 # genes/loxAfr3.gp.gz: 19986 # genes/macEug2.gp.gz: 26006 # genes/melGal1.gp.gz: 14050 # genes/melUnd1.gp.gz: 15296 # genes/micMur1.gp.gz: 16240 # genes/mm10.gp.gz: 20985 # genes/monDom5.gp.gz: 19188 # genes/myoLuc2.gp.gz: 19685 # genes/nomLeu2.gp.gz: 22996 # genes/ochPri2.gp.gz: 15970 # genes/oreNil2.gp.gz: 18636 # genes/ornAna1.gp.gz: 17728 # genes/oryCun2.gp.gz: 18921 # genes/oryLat2.gp.gz: 19576 # genes/otoGar3.gp.gz: 24061 # genes/oviAri1.gp.gz: 17890 # genes/panTro4.gp.gz: 18647 # genes/papHam1.gp.gz: 27842 # genes/petMar1.gp.gz: 11089 # genes/ponAbe2.gp.gz: 19895 # genes/proCap1.gp.gz: 16043 # genes/pteVam1.gp.gz: 16966 # genes/rheMac3.gp.gz: 5580 # genes/rn5.gp.gz: 16393 # genes/saiBol1.gp.gz: 23419 # genes/sarHar1.gp.gz: 20694 # genes/sorAra1.gp.gz: 13156 # genes/speTri2.gp.gz: 22377 # genes/susScr3.gp.gz: 3771 # genes/taeGut1.gp.gz: 17354 # genes/tarSyr1.gp.gz: 13615 # genes/tetNig2.gp.gz: 19539 # 
genes/triMan1.gp.gz: 19514 # genes/tupBel1.gp.gz: 15407 # genes/turTru2.gp.gz: 28375 # genes/vicPac1.gp.gz: 11754 # genes/xenTro3.gp.gz: 8447 # kluster job to annotate each maf file screen -S mm10 # manage long running procedure with screen ssh swarm cd /hive/data/genomes/mm10/bed/multiz60way/frames cat << '_EOF_' > runOne #!/bin/csh -fe set C = $1 set G = $2 cat ../maf/${C}.maf | genePredToMafFrames mm10 stdin stdout \ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz '_EOF_' # << happy emacs chmod +x runOne # older instructions excluded mm10 from the gene.list # this was a mistake. mm10 can be annotated too. # Mistakenly did this the first run through, had to manually # do the mm10 genes separately on hgwdev after this was done ls ../maf | sed -e "s/.maf//" > chr.list ls genes | sed -e "s/.gp.gz//" > gene.list cat << '_EOF_' > template #LOOP runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz} #ENDLOOP '_EOF_' # << happy emacs mkdir parts gensub2 chr.list gene.list template jobList para -ram=8g create jobList para try ... check ... push # Completed: 3960 of 3960 jobs # CPU time in finished jobs: 85610s 1426.83m 23.78h 0.99d 0.003 y # IO & Wait Time: 2030956s 33849.27m 564.15h 23.51d 0.064 y # Average job time: 534s 8.91m 0.15h 0.01d # Longest finished job: 3877s 64.62m 1.08h 0.04d # Submission to last job: 12974s 216.23m 3.60h 0.15d # collect all results into one file: cd /hive/data/genomes/mm10/bed/multiz60way/frames find ./parts -type f | while read F do zcat ${F} done | sort -k1,1 -k2,2n > multiz60wayFrames.bed # -rw-rw-r-- 1 1164299719 May 30 11:28 multiz60wayFrames.bed # verify there are frames on everything: cut -f4 multiz60wayFrames.bed | sort | uniq -c | sort -n \ > annotation.survey.txt # should be 60 species: wc -l annotation.survey.txt # 60 annotation.survey.txt # and the minimum numbers: head annotation.survey.txt # 43900 susScr3 # 59839 rheMac3 # 153246 petMar1 # 162501 choHof1 # ... etc ... 
# load the resulting file ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/frames time gzip multiz60wayFrames.bed # real 0m51.826s # reloading this table 2012-10-11 with more accurate frames: time hgLoadMafFrames mm10 multiz60wayFrames multiz60wayFrames.bed.gz # real 3m2.449s time featureBits -countGaps mm10 multiz60wayFrames # 57707702 bases of 2730871774 (2.113%) in intersection # real 1m45.141s # enable the trackDb entries: # frames multiz60wayFrames # irows on # appears to work OK ######################################################################### # Phylogenetic tree from 60-way (DONE - 2012-05-31 - 2012-06-12 - Hiram) mkdir /hive/data/genomes/mm10/bed/multiz60way/4d cd /hive/data/genomes/mm10/bed/multiz60way/4d # the annotated maf's are in: ../anno/result/*.maf # using ensGene for mm10, only transcribed genes and nothing # from the randoms and other misc. hgsql mm10 -Ne \ "select * from ensGene WHERE cdsEnd > cdsStart;" | cut -f 2-20 \ | egrep -E -v "chrM|chrUn|random|_hap" > ensGene.gp wc -l *.gp # 55423 ensGene.gp genePredSingleCover ensGene.gp stdout | sort > ensGeneNR.gp wc -l ensGeneNR.gp # 22457 ensGeneNR.gp ssh encodek mkdir /hive/data/genomes/mm10/bed/multiz60way/4d/run cd /hive/data/genomes/mm10/bed/multiz60way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/mm10/bed/multiz60way" set c = $1 set infile = $r/anno/result/$2 set outfile = $3 cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/4d/ensGeneNR.gp | sed -e "s/\t$c\t/\tmm10.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > 
$r/4d/run/$outfile else echo "" > $r/4d/run/$outfile endif rm -f $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh ls -1S /hive/data/genomes/mm10/bed/multiz60way/anno/result/*.maf \ | sed -e "s#.*multiz60way/anno/result/##" \ > maf.list cat << '_EOF_' > template #LOOP 4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs # the tac puts the quick jobs at the front gensub2 maf.list single template stdout | tac > jobList para create jobList para try ... check para -maxJob=5 push para time # Completed: 66 of 66 jobs # CPU time in finished jobs: 13176s 219.60m 3.66h 0.15d 0.000 y # IO & Wait Time: 31790s 529.84m 8.83h 0.37d 0.001 y # Average job time: 681s 11.36m 0.19h 0.01d # Longest finished job: 2883s 48.05m 0.80h 0.03d # Submission to last job: 2925s 48.75m 0.81h 0.03d # combine mfa files ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/4d # remove the broken empty files, size 0 and size 1: find ./mfa -type f -size 0 | xargs rm -f # most interesting, this did not identify files of size 1: # find ./mfa -type f -size 1 ls -og mfa | awk '$3 == 1' | awk '{print $NF}' > empty.list sed -e "s#^#mfa/##" empty.list | xargs rm -f #want comma-less species.list /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # check they are all in there: grep "^>" 4d.all.mfa | wc -l # 60 # use phyloFit to create tree model (output is phyloFit.mod) time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree ../tree-commas.nh 4d.all.mfa # real 98m59.203s mv phyloFit.mod all.mod grep TREE all.mod #TREE: 
(((((((((((((((mm10:0.0855383,rn5:0.0922719):0.202381,dipOrd1:0.210819):0.0258471,(hetGla2:0.0917322,cavPor3:0.136876):0.0994271):0.00910944,speTri2:0.145483):0.0274969,(oryCun2:0.109639,ochPri2:0.200966):0.102067):0.0141654,(((((((((hg19:0.00674057,panTro4:0.00692231):0.00309904,gorGor3:0.00918625):0.00954082,ponAbe2:0.0191843):0.00356049,nomLeu2:0.0218207):0.0116848,(rheMac3:0.00814945,papHam1:0.0079848):0.0289473):0.0208338,(calJac3:0.0342405,saiBol1:0.0333221):0.0359171):0.0594469,tarSyr1:0.137467):0.011091,(micMur1:0.0918138,otoGar3:0.127231):0.0351527):0.0153171,tupBel1:0.18879):0.0042463):0.0214646,((susScr3:0.121641,(vicPac1:0.109818,(turTru2:0.0635753,(oviAri1:0.0392493,bosTau7:0.0315816):0.0939861):0.0203711):0.00368417):0.0444758,((((felCat5:0.0897448,(canFam3:0.0888602,ailMel1:0.0767935):0.021837):0.05011,equCab2:0.109367):0.00605998,(myoLuc2:0.137144,pteVam1:0.114013):0.0339604):0.00395001,(eriEur1:0.226934,sorAra1:0.270619):0.0628319):0.00292667):0.0291403):0.0231397,((((loxAfr3:0.078841,proCap1:0.160295):0.00825096,echTel1:0.266786):0.0031636,triMan1:0.0685675):0.0736043,(dasNov3:0.112086,choHof1:0.0974658):0.0535724):0.00739115):0.245967,(monDom5:0.139913,(sarHar1:0.132596,macEug2:0.111778):0.0294309):0.21273):0.0770867,ornAna1:0.50425):0.135096,(((((melGal1:0.067697,galGal4:0.05253):0.13729,taeGut1:0.202681):0.00899388,melUnd1:0.127774):0.216078,anoCar2:0.575186):0.0128221,chrPic1:0.201659):0.137011):0.113527,xenTro3:0.943162):0.0646458,latCha1:0.596956):0.463611,((((((tetNig2:0.223213,fr3:0.198755):0.263107,oreNil2:0.33649):0.0139699,gasAcu1:0.314841):0.0573697,oryLat2:0.430105):0.185668,gadMor1:0.562778):0.169352,danRer7:0.753326):0.117017):0.501088,petMar1:0.501088); # four different subset lists: paste glire.list euarchontoglires.list placental.list all.list # mm10 mm10 mm10 mm10 # rn5 rn5 rn5 rn5 # dipOrd1 dipOrd1 dipOrd1 dipOrd1 # hetGla2 hetGla2 hetGla2 hetGla2 # cavPor3 cavPor3 cavPor3 cavPor3 # speTri2 speTri2 speTri2 speTri2 # oryCun2 
oryCun2 oryCun2 oryCun2 # ochPri2 ochPri2 ochPri2 ochPri2 # tupBel1 tupBel1 tupBel1 # hg19 hg19 hg19 # gorGor3 gorGor3 gorGor3 # panTro4 panTro4 panTro4 # nomLeu2 nomLeu2 nomLeu2 # ponAbe2 ponAbe2 ponAbe2 # tarSyr1 tarSyr1 tarSyr1 # rheMac3 rheMac3 rheMac3 # papHam1 papHam1 papHam1 # otoGar3 otoGar3 otoGar3 # calJac3 calJac3 calJac3 # micMur1 micMur1 micMur1 # saiBol1 saiBol1 saiBol1 # equCab2 equCab2 # vicPac1 vicPac1 # turTru2 turTru2 # susScr3 susScr3 # bosTau7 bosTau7 # oviAri1 oviAri1 # pteVam1 pteVam1 # myoLuc2 myoLuc2 # felCat5 felCat5 # canFam3 canFam3 # ailMel1 ailMel1 # eriEur1 eriEur1 # sorAra1 sorAra1 # choHof1 choHof1 # dasNov3 dasNov3 # proCap1 proCap1 # echTel1 echTel1 # triMan1 triMan1 # loxAfr3 loxAfr3 # macEug2 # sarHar1 # monDom5 # ornAna1 # galGal4 # taeGut1 # melGal1 # melUnd1 # anoCar2 # chrPic1 # xenTro3 # latCha1 # gadMor1 # gasAcu1 # fr3 # oreNil2 # tetNig2 # danRer7 # oryLat2 # petMar1 # on organisms that do not have all species in all files, the file names # need to be filtered. 
# Using this perl script to extract from
# the full mfa files, only the subset of species from the four lists:
# filterMfa.pl takes one argument, a species list file NAME.list, and
# writes a filtered copy of every mfa/*.mfa file into the existing
# directory NAMEMfa/
    cat << '_EOF_' > filterMfa.pl
#!/usr/bin/env perl
# filterMfa.pl - copy each mfa/*.mfa file into <name>Mfa/<same file name>,
#   keeping only the sequences whose "> dbName" header line names a
#   species found in the given <name>.list file (one db name per line).

use strict;
use warnings;

my $argc = scalar(@ARGV);
if ($argc != 1) {
    printf STDERR "usage: filterMfa.pl <species.list>\n";
    exit 255;
}

# set of db names to keep
my %dbList;
my $file = shift;
open (my $listFh, '<', $file) or die "can not read $file: $!";
print STDERR "using list: $file\n";
while (my $db = <$listFh>) {
    chomp $db;
    next if ($db eq "");	# a blank line is not a species name
    $dbList{$db} = 1;
}
close ($listFh);

# glire.list -> glireMfa : remove only a literal trailing ".list"
my $dirName = $file;
$dirName =~ s/\.list$//;
$dirName .= "Mfa";

foreach my $mfaFile (glob("mfa/*.mfa")) {
    my $chr = $mfaFile;
    $chr =~ s#^mfa/##;
    open (my $inFh, '<', $mfaFile) or die "can not read $mfaFile: $!";
    open (my $outFh, '>', "$dirName/$chr")
        or die "can not write to $dirName/$chr: $!";
    # true while the lines being read belong to a wanted species
    my $inGroup = 0;
    while (my $line = <$inFh>) {
        if ($line =~ m/^> /) {
            chomp $line;
            my ($faHead, $faDbName) = split('\s+', $line);
            if (defined($faDbName) && exists($dbList{$faDbName})) {
                $inGroup = 1;
                printf $outFh "> %s\n", $faDbName;
            } else {
                $inGroup = 0;
            }
        } elsif ($inGroup) {
            print $outFh $line;
        }
    }
    close ($inFh);
    # buffered write errors only show up at close time
    close ($outFh) or die "error closing $dirName/$chr: $!";
}
'_EOF_'
    # << happy emacs
    chmod +x filterMfa.pl
    mkdir glireMfa euarchontogliresMfa placentalMfa vertebrateMfa

    # extract each set from the full mfa files, run msa_view on
    # each subset and construct .nh tree for that subset
    for N in glire euarchontoglires placental vertebrate
    do
      ./filterMfa.pl ${N}.list
      /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
        --aggregate "`cat ${N}.list|xargs echo`" ${N}Mfa/*.mfa \
          | sed s/"> "/">"/ > 4d.${N}.mfa
      /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/tree_doctor \
        --no-branchlen --prune-all-but="`cat ${N}.list|xargs echo`" \
          ../tree-commas.nh > tree-commas.${N}.nh
    done

### XXX
### MOST INTERESTING, this phyloFit operation was repeated
### to verify that the full 60 species vertebrate operation produced the
### same result as the original "all" subset.
This phyloFit appears to ### produce a different result each time ? # use phyloFit to create tree model (output is phyloFit.mod) for N in glire euarchontoglires placental vertebrate do time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree ./tree-commas.${N}.nh 4d.${N}.mfa mv phyloFit.mod ${N}.mod grep TREE ${N}.mod | sed 's/TREE\:\ //' > ${N}.Nway.nh done # real 0m15.747s # real 4m5.526s # real 20m45.982s # real 141m21.248s ####################################################################### # phastCons 60-way (DONE - 2012-06-12, 2012-08-21 - Hiram) # was unable to split the full chrom MAF files, now working on the # maf files as they were split up during multiz # split 60way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh encodek mkdir -p /hive/data/genomes/mm10/bed/multiz60way/cons/ss mkdir -p /hive/data/genomes/mm10/bed/multiz60way/cons/msa.split cd /hive/data/genomes/mm10/bed/multiz60way/cons/msa.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/mm10/bed/multiz60way/anno/result/$c.maf set WINDOWS = /hive/data/genomes/mm10/bed/multiz60way/cons/ss/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list gensub2 maf.list single template jobList para -ram=8g 
create jobList para try ... check ... etc # Completed: 64 of 66 jobs # Crashed: 2 jobs # CPU time in finished jobs: 347730s 5795.49m 96.59h 4.02d 0.011 y # IO & Wait Time: 102813s 1713.56m 28.56h 1.19d 0.003 y # Average job time: 7040s 117.33m 1.96h 0.08d # Longest finished job: 42666s 711.10m 11.85h 0.49d # Submission to last job: 150336s 2505.60m 41.76h 1.74d # finish the last two on hgwdev with more memory. # linux data memory, in 1024-byte units export M=188743680 ulimit -S -m $M -v $M ./doSplit.csh chr1 chr1.done & ./doSplit.csh chr2 chr2.done wait # real 864m53.235s # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh swarm mkdir -p /hive/data/genomes/mm10/bed/multiz60way/cons/run.cons cd /hive/data/genomes/mm10/bed/multiz60way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. 
# It is one of:
#	all glire euarchontoglires primate placental
# (the names of the run directories created under cons/ for each run)
    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
# doPhast.csh - run phastCons on one .ss chunk for the group named by
#	the current working directory
# usage: doPhast.csh chrom ssFileRoot expectedLength targetCoverage rho
# results are moved to pp/<chrom>/<ssFileRoot>.pp and
#	bed/<chrom>/<ssFileRoot>.bed under the current directory

# $tmp below is derived from argument 2 and is removed with rm -fr,
#	so refuse to run without the full argument list
if ($#argv != 5) then
    echo "usage: doPhast.csh chrom ssFileRoot expectedLength targetCoverage rho"
    exit 255
endif
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set cons = /hive/data/genomes/mm10/bed/multiz60way/cons
set tmp = $cons/tmp/$f
# links left behind by a failed earlier attempt would make ln -s fail,
#	and csh -e would then abort every retry of this job
rm -fr $tmp
mkdir -p $tmp
set ssSrc = $cons/ss
set useGrp = "$grp.mod"
ln -s $ssSrc/$c/$f.ss $tmp
ln -s $cons/$grp/$grp.mod $tmp
# the non-informative species list exists only for the subset runs
if (-s $cons/$grp/$grp.non-inf) then
  ln -s $cons/$grp/$grp.non-inf $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
  $PHASTBIN/phastCons $f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p pp/$c bed/$c
sleep 4
touch pp/$c bed/$c
rm -f pp/$c/$f.pp
rm -f bed/$c/$f.bed
mv $tmp/$f.pp pp/$c
mv $tmp/$f.bed bed/$c
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod a+x doPhast.csh

    # this template will serve for all runs
    # root1 == chrom name, file1 == ss file name without .ss suffix
    # the three constants are: expected-length target-coverage rho
    cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs

    # strip only a literal trailing ".ss" from each file name
    ls -1S ../ss/chr*/chr* | sed -e "s/\.ss$//" > ss.list

    # Create parasol batch and run it

############################ run for all species
    cd /hive/data/genomes/mm10/bed/multiz60way/cons
    mkdir all
    cd all
    cp -p ../../4d/all.mod ./all.mod
    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=8g create jobList
    para try ... check ... push ... etc.
# Completed: 314 of 314 jobs # CPU time in finished jobs: 36286s 604.77m 10.08h 0.42d 0.001 y # IO & Wait Time: 10101s 168.35m 2.81h 0.12d 0.000 y # Average job time: 148s 2.46m 0.04h 0.00d # Longest finished job: 219s 3.65m 0.06h 0.00d # Submission to last job: 4383s 73.05m 1.22h 0.05d # create Most Conserved track cd /hive/data/genomes/mm10/bed/multiz60way/cons/all cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # -rw-rw-r-- 1 230642249 Jun 15 11:48 tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # -rw-rw-r-- 1 236425914 Jun 15 11:52 mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/all time nice -n +19 hgLoadBed mm10 phastConsElements60way mostConserved.bed # Read 6748481 elements of size 5 from mostConserved.bed # real 2m20.950s # Try for 5% overall cov, and 70% CDS cov featureBits mm10 -enrichment refGene:cds phastConsElements60way # --rho 0.3 --expected-length 45 --target-coverage 0.3 # refGene:cds 1.281%, phastConsElements60way 6.517%, # both 0.913%, cover 71.29%, enrich 10.94x time featureBits mm10 -enrichment ensGene:cds phastConsElements60way # ensGene:cds 1.357%, phastConsElements60way 6.517%, both 0.942%, cover # 69.39%, enrich 10.65x # real 0m54.109s time featureBits mm10 -enrichment knownGene:cds phastConsElements60way # knownGene:cds 1.325%, phastConsElements60way 6.517%, # both 0.930%, cover 70.18%, enrich 10.77x # real 0m50.472s # Create merged posterior probability file and wiggle track data files cd /hive/data/genomes/mm10/bed/multiz60way/cons/all mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ |
gzip -c > downloads/${D}.phastCons60way.wigFix.gz done # real 102m58.496s # encode those files into wiggle data time (zcat downloads/*.wigFix.gz \ | wigEncode stdin phastCons60way.wig phastCons60way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 9m32.980s du -hsc *.wi? # 1.8G phastCons60way.wib # 298M phastCons60way.wig # 2.1G total # encode into a bigWig file: # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit, set 180 Gb here: sizeG=188743680 export sizeG ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phastCons60way.bw) # real 27m1.039s # -rw-rw-r-- 1 4671685725 Jun 18 10:24 phastCons60way.bw bigWigInfo phastCons60way.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,333,510,917 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.149660 min: 0.000000 max: 1.000000 std: 0.282516 # if you wanted to use the bigWig file, loading bigWig table: # but we don't use the bigWig file mkdir /gbdb/mm10/bbi ln -s `pwd`/phastCons60way.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60way; \ create table phastCons60way (fileName varchar(255) not null); \ insert into phastCons60way values ("/gbdb/mm10/bbi/phastCons60way.bw");' # Load gbdb and database with wiggle. 
ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/all ln -s `pwd`/phastCons60way.wib /gbdb/mm10/multiz60way/phastCons60way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60way phastCons60way.wig # real 0m54.546s wigTableStats.sh mm10 phastCons60way # db.table min max mean count sumData # mm10.phastCons60way 0 1 0.14966 1929686275 2.88797e+08 # stdDev viewLimits # 0.282516 viewLimits=0:1 # Create histogram to get an overview of all the data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/all time nice -n +19 hgWiggle -doHistogram -db=mm10 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons60way > histogram.data 2>&1 # real 7m37.212s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60way track" set xlabel " phastCons60way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################## ### Create a phastCons data set for Glires # setup glire-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/glire cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire # glire-only: get the glire only tree from the 4d directory cp -p ../../4d/glire.mod ./glire.mod # and all the others become the non-informative list for phastCons to ignore sort ../../4d/glire.list > glire.list sort ../../4d/vertebrate.list > vertebrate.list comm -13 glire.list vertebrate.list | xargs echo \ | sed -e "s/ /,/g" > glire.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g 
create jobList para try ... check ... push ... etc. # Completed: 314 of 314 jobs # CPU time in finished jobs: 12411s 206.85m 3.45h 0.14d 0.000 y # IO & Wait Time: 117850s 1964.16m 32.74h 1.36d 0.004 y # Average job time: 415s 6.91m 0.12h 0.00d # Longest finished job: 658s 10.97m 0.18h 0.01d # Submission to last job: 796s 13.27m 0.22h 0.01d cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m32.945s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # real 0m19.122s featureBits mm10 mostConserved.bed # 117058023 bases of 2652783500 (4.413%) in intersection # real 0m21.506s # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire time nice -n +19 hgLoadBed mm10 phastConsElements60wayGlire \ mostConserved.bed # Loaded 1336504 elements of size 6 # real 0m13.672s # verify coverage time featureBits mm10 phastConsElements60wayGlire # 117058023 bases of 2652783500 (4.413%) in intersection # real 0m15.041s # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits mm10 -enrichment refGene:cds phastConsElements60wayGlire # refGene:cds 1.282%, phastConsElements60wayGlire 4.413%, # both 0.944%, cover 73.60%, enrich 16.68x featureBits mm10 -enrichment knownGene:cds phastConsElements60wayGlire # knownGene:cds 1.325%, phastConsElements60wayGlire 4.413%, # both 0.957%, cover 72.22%, enrich 16.37x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; 
s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.glire.wigFix.gz done # Create merged posterior probability file and wiggle track data files time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayGlire.wig phastCons60wayGlire.wib) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 10m26.712s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig -verbose=2 stdin ../../../../chrom.sizes \ phastCons60wayGlire.bw > bigWig.log 2>&1) & # real 52m17.108s grep VmPeak bigWig.log # pid=5552: VmPeak: 20926360 kB bigWigInfo phastCons60wayGlire.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,631,413,425 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.142675 min: 0.000000 max: 1.000000 std: 0.252347 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayGlire.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayGlire; \ create table phastCons60wayGlire \ (fileName varchar(255) not null); \ insert into phastCons60wayGlire values ("/gbdb/mm10/bbi/phastCons60wayGlire.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire ln -s `pwd`/phastCons60wayGlire.wib \ /gbdb/mm10/multiz60way/phastCons60wayGlire.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayGlire phastCons60wayGlire.wig # real 0m56.786s wigTableStats.sh mm10 phastCons60wayGlire # db.table min max mean count sumData # mm10.phastCons60wayGlire 0 1 0.142675 1929686275 2.75318e+08 # stdDev viewLimits # 0.252347 viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000
-hMinVal=0.0 -verbose=2 \
	    -db=mm10 phastCons60wayGlire > histogram.data 2>&1
    #	real    4m28.743s

    #	create plot of histogram:
    #	columns 2:5 of histogram.data are plotted as relative frequency,
    #	columns 2:7 as the cumulative relative frequency on the y2 axis
    #	(title names this assembly, mm10, to match the other 60-way plots)
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm10 Histogram phastCons60wayGlire track"
set xlabel " phastCons60wayGlire score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

########################################################################
### Create a phastCons data set for Euarchontoglires

    # setup euarchontoglires-only run
    ssh swarm
    mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires
    cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires
    # euarchontoglires-only: get the euarchontoglires only tree from the 4d directory
    cp -p ../../4d/euarchontoglires.mod ./euarchontoglires.mod
    # and all the others become the non-informative list for phastCons to ignore
    # (comm -13 keeps the names found only in vertebrate.list)
    sort ../../4d/euarchontoglires.list > euarchontoglires.list
    sort ../../4d/vertebrate.list > vertebrate.list
    comm -13 euarchontoglires.list vertebrate.list | xargs echo \
        | sed -e "s/ /,/g" > euarchontoglires.non-inf

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=8g create jobList
    para try ... check ... push ... etc.
# Completed: 314 of 314 jobs # CPU time in finished jobs: 17421s 290.36m 4.84h 0.20d 0.001 y # IO & Wait Time: 37430s 623.83m 10.40h 0.43d 0.001 y # Average job time: 175s 2.91m 0.05h 0.00d # Longest finished job: 343s 5.72m 0.10h 0.00d # Submission to last job: 2403s 40.05m 0.67h 0.03d cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m32.945s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # real 0m19.122s featureBits mm10 mostConserved.bed # 127113541 bases of 2652783500 (4.792%) in intersection # real 0m21.506s # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires time nice -n +19 hgLoadBed mm10 phastConsElements60wayEuarchontoGlires \ mostConserved.bed # Loaded 2327130 elements of size 6 # real 0m24.591s # verify coverage time featureBits mm10 phastConsElements60wayEuarchontoGlires # 127113541 bases of 2652783500 (4.792%) in intersection # real 0m18.857s # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits mm10 -enrichment refGene:cds phastConsElements60wayEuarchontoGlires # refGene:cds 1.282%, phastConsElements60wayEuarchontoGlires 4.792%, # both 0.929%, cover 72.46%, enrich 15.12x featureBits mm10 -enrichment knownGene:cds phastConsElements60wayEuarchontoGlires # knownGene:cds 1.325%, phastConsElements60wayEuarchontoGlires 4.792%, # both 0.943%, cover 71.16%, enrich 14.85x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" 
find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.euarchontoglires.wigFix.gz done # Create merged posterior probability file and wiggle track data files time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayEuarchontoGlires.wig phastCons60wayEuarchontoGlires.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 9m49.080s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phastCons60wayEuarchontoGlires.bw \ > bigWig.log 2>&1 ) & # real 26m0.111s bigWigInfo phastCons60wayEuarchontoGlires.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,411,704,465 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.133253 min: 0.000000 max: 1.000000 std: 0.256320 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayEuarchontoGlires.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayEuarchontoGlires; \ create table phastCons60wayEuarchontoGlires \ (fileName varchar(255) not null); \ insert into phastCons60wayEuarchontoGlires values ("/gbdb/mm10/bbi/phastCons60wayEuarchontoGlires.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires ln -s `pwd`/phastCons60wayEuarchontoGlires.wib \ /gbdb/mm10/multiz60way/phastCons60wayEuarchontoGlires.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayEuarchontoGlires phastCons60wayEuarchontoGlires.wig # real 0m50.676s time wigTableStats.sh mm10 phastCons60wayEuarchontoGlires # db.table min max mean count sumData # mm10.phastCons60wayEuarchontoGlires 0 1 0.133253 1929686275
2.57137e+08 # stdDev viewLimits # 0.25632 viewLimits=0:1 # real 0m21.964s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm10 phastCons60wayEuarchontoGlires > histogram.data 2>&1 # real 3m31.112s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayEuarchontoGlires track" set xlabel " phastCons60wayEuarchontoGlires score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################## ### Create a phastCons data set for primate ***### This was constructed ### and examined, but not used in the release # setup primate-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/primate cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate # primate-only: get the primate only tree from the 4d directory cp -p ../../4d/primate.mod ./primate.mod # and all the others become the non-informative list for phastCons to ignore cat ../../4d/glire.list ../../4d/placental.list ../../4d/vertebrate.list \ | grep -v mm10 | sort | xargs echo | sed -e "s/ /,/g" \ > primate.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... push ... etc. 
# Completed: 314 of 314 jobs # CPU time in finished jobs: 13884s 231.39m 3.86h 0.16d 0.000 y # IO & Wait Time: 130791s 2179.86m 36.33h 1.51d 0.004 y # Average job time: 461s 7.68m 0.13h 0.01d # Longest finished job: 741s 12.35m 0.21h 0.01d # Submission to last job: 910s 15.17m 0.25h 0.01d cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # real 0m27.199s featureBits mm10 mostConserved.bed # 112908553 bases of 2652783500 (4.256%) in intersection # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate time nice -n +19 hgLoadBed mm10 phastConsElements60wayPrimate \ mostConserved.bed # Loaded 1119924 elements of size 6 # real 0m17.423s # verify coverage featureBits mm10 phastConsElements60wayPrimate # 112908553 bases of 2652783500 (4.256%) in intersection # real 0m13.684s # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits mm10 -enrichment refGene:cds phastConsElements60wayPrimate # refGene:cds 1.281%, phastConsElements60wayPrimate 4.256%, # both 0.897%, cover 69.98%, enrich 16.44x featureBits mm10 -enrichment knownGene:cds phastConsElements60wayPrimate # knownGene:cds 1.325%, phastConsElements60wayPrimate 4.256%, # both 0.909%, cover 68.64%, enrich 16.13x featureBits mm10 -enrichment ensGene:cds phastConsElements60wayPrimate # ensGene:cds 1.357%, phastConsElements60wayPrimate 4.256%, both 0.913%, # cover 67.30%, enrich 15.81x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate mkdir downloads for D in `ls -d pp/chr* | 
sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.primate.wigFix.gz done # Create merged posterior probability file and wiggle track data files zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayPrimate.wig phastCons60wayPrimate.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 12m22.465s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phastCons60wayPrimate.bw # real 31m44.517s bigWigInfo phastCons60wayPrimate.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 2,431,379,060 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.093847 min: 0.000000 max: 1.000000 std: 0.233892 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayPrimate.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayPrimate; \ create table phastCons60wayPrimate \ (fileName varchar(255) not null); \ insert into phastCons60wayPrimate values ("/gbdb/mm10/bbi/phastCons60wayPrimate.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate ln -s `pwd`/phastCons60wayPrimate.wib \ /gbdb/mm10/multiz60way/phastCons60wayPrimate.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayPrimate phastCons60wayPrimate.wig # real 1m24.188s wigTableStats.sh mm10 phastCons60wayPrimate # db.table min max mean count sumData # mm10.phastCons60wayPrimate 0 1 0.0938475 1929686275 1.81096e+08 # 0.233892 viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0
-verbose=2 \ -db=mm10 phastCons60wayPrimate > histogram.data 2>&1 # real 7m3.198s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayPrimate track" set xlabel " phastCons60wayPrimate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### ### Create a phastCons data set for Placental # setup placental-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/placental cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental # placental-only: get the placental only tree from the 4d directory cp -p ../../4d/placental.mod ./placental.mod # and all the others become the non-informative list for phastCons to ignore sort ../../4d/placental.list > placental.list sort ../../4d/vertebrate.list > vertebrate.list comm -13 placental.list vertebrate.list | xargs echo \ | sed -e "s/ /,/g" > placental.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para create jobList para try ... check ... push ... etc. 
# Completed: 314 of 314 jobs # CPU time in finished jobs: 27853s 464.21m 7.74h 0.32d 0.001 y # IO & Wait Time: 128981s 2149.69m 35.83h 1.49d 0.004 y # Average job time: 499s 8.32m 0.14h 0.01d # Longest finished job: 785s 13.08m 0.22h 0.01d # Submission to last job: 5970s 99.50m 1.66h 0.07d cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m44.506s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # real 0m44.170s featureBits mm10 mostConserved.bed # 144041584 bases of 2652783500 (5.430%) in intersection # real 0m54.927s # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental time nice -n +19 hgLoadBed mm10 phastConsElements60wayPlacental \ mostConserved.bed # Loaded 5257437 elements of size 6 # real 0m56.788s # verify coverage, should be the same as the file measured above time featureBits mm10 phastConsElements60wayPlacental # 144041584 bases of 2652783500 (5.430%) in intersection # real 0m39.537s # --rho 0.3 --expected-length 45 --target-coverage 0.3 time featureBits mm10 -enrichment refGene:cds phastConsElements60wayPlacental # refGene:cds 1.282%, phastConsElements60wayPlacental 5.430%, # both 0.920%, cover 71.73%, enrich 13.21x # real 0m39.833s time featureBits mm10 -enrichment knownGene:cds phastConsElements60wayPlacental # knownGene:cds 1.325%, phastConsElements60wayPlacental 5.430%, # both 0.934%, cover 70.47%, enrich 12.98x # real 0m44.567s time featureBits mm10 -enrichment ensGene:cds phastConsElements60wayPlacental # ensGene:cds 1.357%, phastConsElements60wayPlacental 5.430%, # both 0.941%, cover 69.32%, enrich 12.77x # real 0m43.093s # Create the downloads .pp files, from which the phastCons wiggle data # is 
calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.placental.wigFix.gz done # Create merged posterior probability file and wiggle track data files time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayPlacental.wig \ phastCons60wayPlacental.wib > wigEncode.log 2>&1) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 9m48.237s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes \ phastCons60wayPlacental.bw > bigWig.log 2>&1) & # real 25m18.556s bigWigInfo phastCons60wayPlacental.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,271,676,156 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.135703 min: 0.000000 max: 1.000000 std: 0.266432 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayPlacental.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayPlacental; \ create table phastCons60wayPlacental \ (fileName varchar(255) not null); \ insert into phastCons60wayPlacental values ("/gbdb/mm10/bbi/phastCons60wayPlacental.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental ln -s `pwd`/phastCons60wayPlacental.wib \ /gbdb/mm10/multiz60way/phastCons60wayPlacental.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayPlacental phastCons60wayPlacental.wig # real 0m41.999s time wigTableStats.sh mm10
phastCons60wayPlacental # db.table min max mean count sumData # mm10.phastCons60wayPlacental 0 1 0.135703 1929686275 2.61864e+08 # stdDev viewLimits # 0.266432 # viewLimits=0:1 # real 0m21.723s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm10 phastCons60wayPlacental > histogram.data 2>&1 # real 2m39.659s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayPlacental track" set xlabel " phastCons60wayPlacental score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### ### Create a phastCons data set for Vertebrate # setup vertebrate-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate # vertebrate-only: get the vertebrate only tree from the 4d directory cp -p ../../4d/vertebrate.mod ./vertebrate.mod # they are all in this one, no need for non-informative list gensub2 ../run.cons/ss.list single ../run.cons/template jobList para create jobList para try ... check ... push ... etc. 
# Completed: 313 of 314 jobs # Crashed: 1 jobs # CPU time in finished jobs: 36058s 600.97m 10.02h 0.42d 0.001 y # IO & Wait Time: 125496s 2091.59m 34.86h 1.45d 0.004 y # Average job time: 516s 8.60m 0.14h 0.01d # Longest finished job: 912s 15.20m 0.25h 0.01d # Submission to last job: 2681s 44.68m 0.74h 0.03d # the one failed job was completed manually on hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m44.506s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed time featureBits mm10 mostConserved.bed # 172842314 bases of 2652783500 (6.516%) in intersection # real 1m23.298s # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate time nice -n +19 hgLoadBed mm10 phastConsElements60wayVertebrate \ mostConserved.bed # Read 6747163 elements of size 5 from mostConserved.bed # real 1m15.122s # verify coverage featureBits mm10 phastConsElements60wayVertebrate # 172842314 bases of 2652783500 (6.516%) in intersection # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits mm10 -enrichment refGene:cds phastConsElements60wayVertebrate # refGene:cds 1.282%, phastConsElements60wayVertebrate 6.516%, # both 0.914%, cover 71.26%, enrich 10.94x time featureBits mm10 -enrichment ensGene:cds phastConsElements60wayVertebrate # ensGene:cds 1.357%, phastConsElements60wayVertebrate 6.516%, # both 0.942%, cover 69.39%, enrich 10.65x # real 0m51.139s time featureBits mm10 -enrichment knownGene:cds phastConsElements60wayVertebrate # knownGene:cds 1.325%, phastConsElements60wayVertebrate 6.516%, # both 0.930%, cover 70.18%, enrich 10.77x # real 0m51.545s # Create the downloads .pp files, from which the phastCons wiggle data # is 
calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.vertebrate.wigFix.gz done # Create merged posterior probability file and wiggle track data files time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayVertebrate.wig \ phastCons60wayVertebrate.wib > wigEncode.log 2>&1 ) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 9m48.554s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes \ phastCons60wayVertebrate.bw > bigWig.log 2>&1) & # real 25m8.630s bigWigInfo phastCons60wayVertebrate.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,333,348,984 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.149646 min: 0.000000 max: 1.000000 std: 0.282502 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayVertebrate.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayVertebrate; \ create table phastCons60wayVertebrate \ (fileName varchar(255) not null); \ insert into phastCons60wayVertebrate values ("/gbdb/mm10/bbi/phastCons60wayVertebrate.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate ln -s `pwd`/phastCons60wayVertebrate.wib \ /gbdb/mm10/multiz60way/phastCons60wayVertebrate.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayVertebrate phastCons60wayVertebrate.wig # real 0m45.432s time
wigTableStats.sh mm10 phastCons60wayVertebrate # db.table min max mean count sumData # mm10.phastCons60wayVertebrate 0 1 0.149646 1929686275 2.8877e+08 # stdDev viewLimits # 0.282502 viewLimits=0:1 # real 0m22.224s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm10 phastCons60wayVertebrate > histogram.data 2>&1 # real 2m52.041s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayVertebrate track" set xlabel " phastCons60wayVertebrate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### # phyloP conservation for 60-way (DONE - 2012-06-15 - 2012-08-21 - Hiram) # # Vertebrate, Glire, Primate, Placental # # split SS files into 1M chunks, this business needs smaller files # to complete # many of these jobs run too much memory to finish on a kluster node # can run all of this on hgwdev mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP mkdir ss run.split cd run.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/mm10/bed/multiz60way/anno/result/$c.maf set WINDOWS = /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/ss/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) 
then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list cat << '_EOF_' > template #LOOP ./doSplit.csh $(root1) $(root1).done #ENDLOOP '_EOF_' # << happy emacs gensub2 maf.list single template jobList # copy the jobList to runEm.sh, edit to make all the commands run in # the background, with wait statements every few commands to run # a small number of these at once, no more than four at once with # the large chroms, the small randoms can run a bunch at once, they # finish quickly. time ./runEm.sh # about 11h30m # run phyloP with score=LRT ssh swarm cd /cluster/data/mm10/bed/multiz60way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.525 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/all/all.mod 0.525 > all.mod grep BACKGROUND ../../cons/glire/glire.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.531 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/glire/glire.mod 0.531 > glire.mod grep BACKGROUND ../../cons/primate/primate.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.509 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/primate/primate.mod 0.509 > primate.mod grep BACKGROUND ../../cons/euarchontoglires/euarchontoglires.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.518 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/euarchontoglires/euarchontoglires.mod 0.518 \ > euarchontoglires.mod grep BACKGROUND ../../cons/placental/placental.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.525 
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/placental/placental.mod 0.525 > placental.mod grep BACKGROUND ../../cons/vertebrate/vertebrate.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.525 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/vertebrate/vertebrate.mod 0.525 > vertebrate.mod cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fex set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set ssFile = $1:t echo "ssFile: $ssFile" set out = $2 set cName = $f:h echo "cName: $cName" set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/mm10/bed/multiz60way/consPhyloP set tmp = $cons/tmp/$grp/$f rm -fr $tmp mkdir -p $tmp set ssSrc = "$cons/ss/$cName/$ssFile" set useGrp = "$grp.mod" ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null echo source: $ssSrc.ss $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $ssFile.wigFix popd > /dev/null mkdir -p $out:h sleep 4 mv $tmp/$ssFile.wigFix $out rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhyloP.csh # Create list of chunks find ../ss -type f | sed -e "s/.ss$//; s#../ss/##;" > ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/all cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ... 
para time # Completed: 2708 of 2708 jobs # CPU time in finished jobs: 1832980s 30549.67m 509.16h 21.22d 0.058 y # IO & Wait Time: 217434s 3623.90m 60.40h 2.52d 0.007 y # Average job time: 757s 12.62m 0.21h 0.01d # Longest finished job: 1458s 24.30m 0.41h 0.02d # Submission to last job: 3647s 60.78m 1.01h 0.04d # missed chrM in the original run: ../run.phyloP/doPhyloP.csh chrM/chrM.1-16296 wigFix/chrM/chrM.1-16296.wigFix ssh hgwdev cd /cluster/data/mm10/bed/multiz60way/consPhyloP/all mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phyloP60way.wigFix.gz done # real 38m15.538s zcat downloads/*.wigFix.gz \ | wigEncode stdin phyloP60way.wig phyloP60way.wib > wigEncode.log 2>&1 & # Converted stdin, upper limit 7.53, lower limit -20.00 # real 27m53.384s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60way.bw) # real 30m10.440s bigWigInfo phyloP60way.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 4,533,501,426 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.169761 min: -20.000000 max: 7.532000 std: 0.942744 # if you wanted to use the bigWig file, loading bigWig table: ln -s `pwd`/phyloP60way.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayAll; \ create table phyloP60wayAll \ (fileName varchar(255) not null); \ insert into phyloP60wayAll values ("/gbdb/mm10/bbi/phyloP60way.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60way.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayAll phyloP60way.wig # real 1m16.934s wigTableStats.sh mm10 phyloP60wayAll # db.table min max mean count sumData # mm10.phyloP60wayAll -20 7.532 0.169761
1929686275 3.27586e+08 # stdDev viewLimits # 0.942744 viewLimits=-4.54396:4.88348 # that range is: 4.54396+4.88348 = 9.42744 for -hBinSize=0.0942744 below # (9.42744/0.0942744 = 100 bins across the viewLimits range; 1,000 bins # across that range would need -hBinSize=0.00942744) # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.0942744 -hBinCount=1000 -hMinVal=-4.54396 -verbose=2 \ -db=mm10 phyloP60wayAll > histogram.data 2>&1 # real 5m58.309s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60way track, all 60 vertebrates" set xlabel " phyloP60way score, all 60 vertebrates" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.2] set xrange [-2:2] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running the glire ####################### mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/glire cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/glire rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ...
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 206723s 3445.39m 57.42h 2.39d 0.007 y # IO & Wait Time: 256366s 4272.76m 71.21h 2.97d 0.008 y # Average job time: 171s 2.85m 0.05h 0.00d # Longest finished job: 487s 8.12m 0.14h 0.01d # Submission to last job: 1926s 32.10m 0.54h 0.02d cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/glire mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.glire.wigFix.gz done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayGlire.wig phyloP60wayGlire.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 1.17, lower limit -4.35 # real 20m31.753s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayGlire.bw) & # real 37m9.063s bigWigInfo phyloP60wayGlire.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,158,091,915 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.073187 min: -4.346000 max: 1.165000 std: 0.602992 # if you wanted to use the bigWig file, loading bigWig table: ln -s `pwd`/phyloP60wayGlire.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayGlire; \ create table phyloP60wayGlire \ (fileName varchar(255) not null); \ insert into phyloP60wayGlire values ("/gbdb/mm10/bbi/phyloP60wayGlire.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayGlire.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayGlire phyloP60wayGlire.wig # real 0m58.536s wigTableStats.sh mm10 phyloP60wayGlire # db.table min max mean count # mm10.phyloP60wayGlire -4.346 1.165 0.0731873 1929686275 1.41229e+08 # stdDev viewLimits # 0.602992 viewLimits=-2.94177:1.165 
# that range is: 4.346+1.165 = 5.511 -> hBinSize=0.005511 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.005511 -hBinCount=1000 -hMinVal=-4.346 -verbose=2 \ -db=mm10 phyloP60wayGlire > histogram.data 2>&1 # real 8m23.088s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayGlire track" set xlabel " phyloP60wayGlire score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.15] set xrange [-2:1.2] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ################### Running the euarchontoglires ####################### mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/euarchontoglires cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/euarchontoglires rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ...
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 542547s 9042.45m 150.71h 6.28d 0.017 y # IO & Wait Time: 75914s 1265.23m 21.09h 0.88d 0.002 y # Average job time: 228s 3.80m 0.06h 0.00d # Longest finished job: 430s 7.17m 0.12h 0.00d # Submission to last job: 4149s 69.15m 1.15h 0.05d cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/euarchontoglires mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.euarchontoglires.wigFix.gz done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayEuarchontoGlires.wig phyloP60wayEuarchontoGlires.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 1.75, lower limit -12.70 # real 10m52.064s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayEuarchontoGlires.bw) & # real 26m47.912s bigWigInfo phyloP60wayEuarchontoGlires.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,970,501,521 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.078739 min: -12.704000 max: 1.753000 std: 0.689759 # if you wanted to use the bigWig file, loading bigWig table: ln -s `pwd`/phyloP60wayEuarchontoGlires.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayEuarchontoGlires; \ create table phyloP60wayEuarchontoGlires \ (fileName varchar(255) not null); \ insert into phyloP60wayEuarchontoGlires values ("/gbdb/mm10/bbi/phyloP60wayEuarchontoGlires.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayEuarchontoGlires.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayEuarchontoGlires phyloP60wayEuarchontoGlires.wig # real 0m51.777s time wigTableStats.sh mm10 
phyloP60wayEuarchontoGlires # db.table min max mean count # mm10.phyloP60wayEuarchontoGlires -12.704 1.753 0.0787387 1929686275 # sumData stdDev viewLimits # 1.51941e+08 0.689759 viewLimits=-3.37006:1.753 # real 0m26.197s # that range is: 12.704+1.753 = 14.457 -> hBinSize=0.014457 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.014457 -hBinCount=1000 -hMinVal=-12.704 -verbose=2 \ -db=mm10 phyloP60wayEuarchontoGlires > histogram.data 2>&1 # real 3m22.205s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayEuarchontoGlires track" set xlabel " phyloP60wayEuarchontoGlires score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.15] set xrange [-2:1.2] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running the primate ####################### ### ***### This was constructed ### and examined, but not used in the release mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/primate cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/primate rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para -ram=8g create jobList para try ... check ... push ... etc ... 
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 307901s 5131.68m 85.53h 3.56d 0.010 y # IO & Wait Time: 42937s 715.62m 11.93h 0.50d 0.001 y # Average job time: 130s 2.16m 0.04h 0.00d # Longest finished job: 234s 3.90m 0.07h 0.00d # Submission to last job: 5975s 99.58m 1.66h 0.07d cd /cluster/data/mm10/bed/multiz60way/consPhyloP/primate mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.primate.wigFix.gz done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayPrimate.wig phyloP60wayPrimate.wib \ > wigEncode.log 2>&1) & # real 9m37.055s # Converted stdin, upper limit 0.93, lower limit -10.63 export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayPrimate.bw) & # real 24m18.842s bigWigInfo phyloP60wayPrimate.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 2,715,332,211 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.060017 min: -10.633000 max: 0.930000 std: 0.518027 # loading bigWig table: ln -s `pwd`/phyloP60wayPrimate.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayPrimate; \ create table phyloP60wayPrimate \ (fileName varchar(255) not null); \ insert into phyloP60wayPrimate values ("/gbdb/mm10/bbi/phyloP60wayPrimate.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayPrimate.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayPrimate phyloP60wayPrimate.wig # real 0m45.837s wigTableStats.sh mm10 phyloP60wayPrimate # db.table min max mean count sumData stdDev viewLimits # mm10.phyloP60wayPrimate -10.633 0.93 0.0600168 1929686275 1.15814e+08 # stdDev viewLimits # 0.518027
viewLimits=-2.53012:0.93 # that range is: 10.633+0.93 = 11.563 for the hBinSize=0.11563 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.11563 -hBinCount=1000 -hMinVal=-10.633 -verbose=2 \ -db=mm10 phyloP60wayPrimate > histogram.data 2>&1 # real 4m36.379s # to see yrange: grep -v "^#" histogram.data | ave -col=5 stdin # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayPrimate track" set xlabel " phyloP60wayPrimate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.472] set xrange [-2.5:1.0] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running the placental ####################### mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/placental cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/placental rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ...
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 1188036s 19800.60m 330.01h 13.75d 0.038 y # IO & Wait Time: 209859s 3497.65m 58.29h 2.43d 0.007 y # Average job time: 516s 8.60m 0.14h 0.01d # Longest finished job: 1672s 27.87m 0.46h 0.02d # Submission to last job: 6336s 105.60m 1.76h 0.07d cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/placental mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.placental.wigFix.gz done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayPlacental.wig phyloP60wayPlacental.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 3.30, lower limit -20.00 # real 11m54.289s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayPlacental.bw \ > bigWig.log 2>&1) & # real 28m4.576s bigWigInfo phyloP60wayPlacental.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 4,423,832,009 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.109489 min: -20.000000 max: 3.296000 std: 0.810657 # loading bigWig table if that is what you wanted to do: ln -s `pwd`/phyloP60wayPlacental.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayPlacental; \ create table phyloP60wayPlacental \ (fileName varchar(255) not null); \ insert into phyloP60wayPlacental values ("/gbdb/mm10/bbi/phyloP60wayPlacental.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayPlacental.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayPlacental phyloP60wayPlacental.wig # real 0m50.284s wigTableStats.sh mm10 phyloP60wayPlacental # db.table min max mean count sumData # mm10.phyloP60wayPlacental -20 
3.296 0.109489 1929686275 2.11279e+08 # stdDev viewLimits # 0.810657 viewLimits=-3.9438:3.296 # that range is: 20+3.296 = 23.296 for hBinSize=0.023296 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.023296 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ -db=mm10 phyloP60wayPlacental > histogram.data 2>&1 # real 3m24.650s # to see yrange: grep -v "^#" histogram.data | ave -col=5 stdin # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayPlacental track" set xlabel " phyloP60wayPlacental score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.084] set xrange [-2.5:2.5] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running the vertebrate ####################### mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/vertebrate cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/vertebrate rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ... 
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 1825414s 30423.56m 507.06h 21.13d 0.058 y # IO & Wait Time: 211040s 3517.34m 58.62h 2.44d 0.007 y # Average job time: 752s 12.53m 0.21h 0.01d # Longest finished job: 1530s 25.50m 0.42h 0.02d # Submission to last job: 6045s 100.75m 1.68h 0.07d cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/vertebrate mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.vertebrate.wigFix.gz done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayVertebrate.wig phyloP60wayVertebrate.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 7.53, lower limit -20.00 # real 12m2.774s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayVertebrate.bw \ > bigWig.log 2>&1) & # real 27m6.791s bigWigInfo phyloP60wayVertebrate.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 4,529,467,614 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.169653 min: -20.000000 max: 7.532000 std: 0.942808 # loading bigWig table: ln -s `pwd`/phyloP60wayVertebrate.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayVertebrate; \ create table phyloP60wayVertebrate \ (fileName varchar(255) not null); \ insert into phyloP60wayVertebrate values ("/gbdb/mm10/bbi/phyloP60wayVertebrate.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayVertebrate.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayVertebrate phyloP60wayVertebrate.wig # real 0m56.535s time wigTableStats.sh mm10 phyloP60wayVertebrate # db.table min max mean count sumData stdDev viewLimits # mm10.phyloP60wayVertebrate 
-20 7.532 0.169653 1929686275 3.27377e+08 # stdDev viewLimits # 0.942808 viewLimits=-4.54439:4.88369 # real 0m25.320s # that range is: 20+7.532 = 27.532 for hBinSize=0.027532 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.027532 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ -db=mm10 phyloP60wayVertebrate > histogram.data 2>&1 # real 3m26.565s # to see yrange: grep -v "^#" histogram.data | ave -col=5 stdin # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayVertebrate track" set xlabel " phyloP60wayVertebrate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.0913] set xrange [-2.5:2.5] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### # construct download files for 60-way (DONE - 2012-06-27 - 2012-08-21 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way/maf mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way/alignments mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/glire mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/primate mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/euarchontoglire mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/placental mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/vertebrate mkdir 
/usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/mm10.60way.phastCons mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/glire mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/primate mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/euarchontoglire mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/placental mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/vertebrate mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/mm10.60way.phyloP60way mkdir /hive/data/genomes/mm10/bed/multiz60way/downloads cd /hive/data/genomes/mm10/bed/multiz60way/downloads mkdir multiz60way phastCons60way phyloP60way cd multiz60way mkdir maf alignments cd maf time cp -p ../../../anno/result/chr*.maf . # real 735m35.723s time gzip *.maf # real 700m23.340s md5sum *.maf.gz > md5sum.txt ln -s `pwd`/*.maf.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way/maf cd .. du -hsc maf # 24G maf du -hsc ../../anno/result/ # 244G ../../anno/result/ ln -s ../../mm10.60way.nh . ln -s ../../mm10.60way.commonNames.nh . ln -s `pwd`/*.nh \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way ##################################################################### cd /hive/data/genomes/mm10/bed/multiz60way/downloads/phastCons60way mkdir glire euarchontoglire primate placental vertebrate mm10.60way.phastCons cd glire ln -s ../../../cons/glire/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/glire # real 5m50.001s cd ../euarchontoglire ln -s ../../../cons/euarchontoglires/downloads/chr*.gz . 
time md5sum *.gz > md5sum.txt & # real 1m14.103s ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/euarchontoglire cd ../primate ln -s ../../../cons/primate/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/primate # real 5m39.288s cd ../placental ln -s ../../../cons/placental/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/placental # real 5m9.762s cd ../vertebrate ln -s ../../../cons/vertebrate/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/vertebrate # real 0m45.408s cd ../mm10.60way.phastCons ln -s ../../../cons/all/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/mm10.60way.phastCons # real 6m11.158s cd .. 
ln -s ../../cons/all/all.mod mm10.60way.phastCons.mod ln -s ../../cons/glire/glire.mod mm10.60way.phastCons.glire.mod ln -s ../../cons/primate/primate.mod mm10.60way.phastCons.primate.mod ln -s ../../cons/euarchontoglires/euarchontoglires.mod mm10.60way.phastCons.euarchontoglire.mod ln -s ../../cons/placental/placental.mod mm10.60way.phastCons.placental.mod ln -s ../../cons/vertebrate/vertebrate.mod mm10.60way.phastCons.vertebrate.mod ln -s ../../cons/all/phastCons60way.bw mm10.60way.phastCons.bw ln -s ../../cons/glire/phastCons60wayGlire.bw \ mm10.60way.phastCons60wayGlire.bw ln -s ../../cons/placental/phastCons60wayPlacental.bw \ mm10.60way.phastCons60wayPlacental.bw ln -s ../../cons/euarchontoglires/phastCons60wayEuarchontoGlires.bw \ mm10.60way.phastCons60wayEuarchontoGlire.bw ln -s ../../cons/primate/phastCons60wayPrimate.bw \ mm10.60way.phastCons60wayPrimate.bw ln -s ../../cons/vertebrate/phastCons60wayVertebrate.bw \ mm10.60way.phastCons60wayVertebrate.bw time md5sum *.mod *.bw > md5sum.txt # real 20m11.260s # obtain the README.txt from hg19/phastCons46way and update for this # situation ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way ##################################################################### cd /hive/data/genomes/mm10/bed/multiz60way/downloads/phyloP60way mkdir glire euarchontoglire primate placental vertebrate mm10.60way.phyloP60way cd glire ln -s ../../../consPhyloP/glire/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/glire # real 6m5.733s cd ../euarchontoglire ln -s ../../../consPhyloP/euarchontoglires/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/euarchontoglire # real 5m40.272s cd ../primate ln -s ../../../consPhyloP/primate/downloads/chr*.gz . 
time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/primate # real 7m22.623s cd ../placental ln -s ../../../consPhyloP/placental/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/placental # real 7m39.269s cd ../vertebrate ln -s ../../../consPhyloP/vertebrate/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/vertebrate cd ../mm10.60way.phyloP60way ln -s ../../../consPhyloP/all/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/mm10.60way.phyloP60way # real 8m5.777s cd .. ln -s ../../consPhyloP/run.phyloP/all.mod mm10.60way.phyloP60way.mod ln -s ../../consPhyloP/run.phyloP/glire.mod ./mm10.phyloP.glire.mod ln -s ../../consPhyloP/run.phyloP/placental.mod ./mm10.phyloP.placental.mod ln -s ../../consPhyloP/run.phyloP/euarchontoglires.mod ./mm10.phyloP.euarchontoglire.mod ln -s ../../consPhyloP/run.phyloP/primate.mod ./mm10.phyloP.primate.mod ln -s ../../consPhyloP/run.phyloP/vertebrate.mod ./mm10.60way.vertebrate.mod ln -s ../../consPhyloP/all/phyloP60way.bw mm10.60way.phyloP60way.bw ln -s ../../consPhyloP/glire/phyloP60wayGlire.bw \ mm10.60way.phyloP60wayGlire.bw ln -s ../../consPhyloP/vertebrate/phyloP60wayVertebrate.bw \ mm10.60way.phyloP60wayVertebrate.bw ln -s ../../consPhyloP/placental/phyloP60wayPlacental.bw \ mm10.60way.phyloP60wayPlacental.bw ln -s ../../consPhyloP/euarchontoglires/phyloP60wayEuarchontoGlires.bw \ mm10.60way.phyloP60wayEuarchontoglire.bw ln -s ../../consPhyloP/primate/phyloP60wayPrimate.bw \ mm10.60way.phyloP60wayPrimate.bw time md5sum *.mod *.bw > md5sum.txt & # real 20m17.082s # obtain the README.txt from hg19/phyloP46way and update for this # situation ln -s 
`pwd`/*.mod `pwd`/*.bw `pwd`/md5sum.txt `pwd`/README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way ########################################################################### ## create upstream refGene maf files cd /hive/data/genomes/mm10/bed/multiz60way/downloads/maf # bash script #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits mm10 refGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags mm10 multiz60way \ stdin stdout \ -orgs=/hive/data/genomes/mm10/bed/multiz60way/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done # real 199m45.558s md5sum *.nh *.maf.gz > md5sum.txt # real 27m59.778s # obtain the README.txt from hg19/multiz46way and update for this # situation ln -s `pwd`/*.nh `pwd`/*.maf.gz `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way ############################################################################# # hgPal downloads (DONE - 2012-07-05 - 2012-07-09 - Hiram) # FASTA from 60-way for refGene ssh hgwdev screen -S mm10HgPal mkdir /hive/data/genomes/mm10/bed/multiz60way/pal cd /hive/data/genomes/mm10/bed/multiz60way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list export mz=multiz60way export gp=refGene export db=mm10 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & # real 93m34.376s mz=multiz60way gp=refGene db=mm10 time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 1m16.821s zcat exonNuc/*.gz | gzip -c > 
$gp.$mz.exonNuc.fa.gz rm -rf exonAA exonNuc # we're only distributing exons at the moment mz=multiz60way gp=refGene db=mm10 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ ######################################################################### # lastz nile tilapia oreNil2 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OreNil2 mkdir /hive/data/genomes/mm10/bed/lastzOreNil2.2012-04-11 cd /hive/data/genomes/mm10/bed/lastzOreNil2.2012-04-11 cat << '_EOF_' > DEF # Mouse vs. nile tilapia BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: nile tilapia oreNil2 SEQ2_DIR=/hive/data/genomes/oreNil2/oreNil2.2bit SEQ2_LEN=/hive/data/genomes/oreNil2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzOreNil2.2012-04-11 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 108m51.232s cat fb.mm10.chainOreNil2Link.txt # 51909908 bases of 2652783500 (1.957%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOreNil2.2012-04-11 lastz.oreNil2 # and for the swap mkdir /hive/data/genomes/oreNil2/bed/blastz.mm10.swap cd 
/hive/data/genomes/oreNil2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOreNil2.2012-04-11/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 9m8.213s cat fb.oreNil2.chainMm10Link.txt # 49704887 bases of 816084674 (6.091%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oreNil2/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # LASTZ pig susScr3 (DONE - 2012-04-13 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SusScr3 mkdir /hive/data/genomes/mm10/bed/lastzSusScr3.2012-04-13 cd /hive/data/genomes/mm10/bed/lastzSusScr3.2012-04-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # pig vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: pig SusScr3 SEQ2_DIR=/hive/data/genomes/susScr3/susScr3.2bit SEQ2_LEN=/hive/data/genomes/susScr3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzSusScr3.2012-04-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1086m29.992s cat fb.mm10.chainSusScr3Link.txt # 681359766 bases of 2652783500 (25.685%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSusScr3.2012-04-13 lastz.susScr3 mkdir /hive/data/genomes/susScr3/bed/blastz.mm10.swap cd 
/hive/data/genomes/susScr3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSusScr3.2012-04-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 104m56.258s cat fb.susScr3.chainMm10Link.txt # 743574150 bases of 2525294057 (29.445%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/susScr3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ armadillo dasNov3 (DONE - 2012-04-13 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10DasNov3 mkdir /hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13 cd /hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # armadillo vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: armadillo DasNov3 SEQ2_DIR=/hive/data/genomes/dasNov3/dasNov3.2bit SEQ2_LEN=/hive/data/genomes/dasNov3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1125m34.124s cat fb.mm10.chainDasNov3Link.txt # 668529920 bases of 2652783500 (25.201%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzDasNov3.2012-04-13 lastz.dasNov3 # better to have reciprocal best for 
this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13 time doRecipBest.pl mm10 dasNov3 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 116m51.114s mkdir /hive/data/genomes/dasNov3/bed/blastz.mm10.swap cd /hive/data/genomes/dasNov3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 150m51.653s cat fb.dasNov3.chainMm10Link.txt # 695161920 bases of 3299882059 (21.066%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/dasNov3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ cat felCat5 (DONE - 2012-04-13 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10FelCat5 mkdir /hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13 cd /hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # cat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cat FelCat5 SEQ2_DIR=/hive/data/genomes/felCat5/felCat5.2bit SEQ2_LEN=/hive/data/genomes/felCat5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1029m54.494s cat fb.mm10.chainFelCat5Link.txt 
# 788544084 bases of 2652783500 (29.725%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzFelCat5.2012-04-13 lastz.felCat5 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13 time doRecipBest.pl mm10 felCat5 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 106m30.011s mkdir /hive/data/genomes/felCat5/bed/blastz.mm10.swap cd /hive/data/genomes/felCat5/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 124m25.850s cat fb.felCat5.chainMm10Link.txt # 762344436 bases of 2364296207 (32.244%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/felCat5/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ naked mole rat hetGla2 (DONE - 2012-04-14 - Hiram) # establish a screen to control this job screen -S mm10HetGla2 mkdir /hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14 cd /hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # naked mole rat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: naked mole rat HetGla2 SEQ2_DIR=/hive/data/genomes/hetGla2/hetGla2.2bit SEQ2_LEN=/hive/data/genomes/hetGla2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 
doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 690m7.626s cat fb.mm10.chainHetGla2Link.txt # 853221843 bases of 2652783500 (32.163%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzHetGla2.2012-04-14 lastz.hetGla2 mkdir /hive/data/genomes/hetGla2/bed/blastz.mm10.swap cd /hive/data/genomes/hetGla2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 92m24.775s cat fb.hetGla2.chainMm10Link.txt # 879356778 bases of 2314771103 (37.989%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/hetGla2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ dolphin turTru2 (DONE - 2012-04-14 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TurTru2 mkdir /hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14 cd /hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # dolphin vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: dolphin TurTru2 SEQ2_DIR=/hive/data/genomes/turTru2/turTru2.2bit SEQ2_LEN=/hive/data/genomes/turTru2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time 
nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 624m36.508s cat fb.mm10.chainTurTru2Link.txt # 802921354 bases of 2652783500 (30.267%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTurTru2.2012-04-14 lastz.turTru2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14 time doRecipBest.pl mm10 turTru2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 44m47.753s mkdir /hive/data/genomes/turTru2/bed/blastz.mm10.swap cd /hive/data/genomes/turTru2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 124m17.088s cat fb.turTru2.chainMm10Link.txt # 781169007 bases of 2332402443 (33.492%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/turTru2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Gibbon nomLeu2 (DONE - 2012-04-14 - Hiram) screen -S mm10NomLeu2 mkdir /hive/data/genomes/mm10/bed/lastzNomLeu2.2012-04-14 cd /hive/data/genomes/mm10/bed/lastzNomLeu2.2012-04-14 cat << '_EOF_' > DEF # gibbon vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Gibbon NomLeu2 SEQ2_DIR=/hive/data/genomes/nomLeu2/nomLeu2.2bit SEQ2_LEN=/hive/data/genomes/nomLeu2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/mm10/bed/lastzNomLeu2.2012-04-14 
TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10NomLeu2 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 621m38.251s cat fb.mm10.chainNomLeu2Link.txt # 902774780 bases of 2652783500 (34.031%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzNomLeu2.2012-04-14 lastz.nomLeu2 mkdir /hive/data/genomes/nomLeu2/bed/blastz.mm10.swap cd /hive/data/genomes/nomLeu2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzNomLeu2.2012-04-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 92m24.775s cat fb.nomLeu2.chainMm10Link.txt # 889660339 bases of 2756609047 (32.274%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/nomLeu2/bed ln -s blastz.mm10.swap lastz.mm10 ##################################################################### # tRNAs track (DONE 2012-04-02 Chin) # # Please refer to the generic tRNAs track build documentation # ~/kent/src/hg/makeDb/doc/tRNAsTrack.txt # for details about how the track was built. 
############################################################################## # orfeome 2012-03-16 (markd) enabled ORFeome tracks in etc/genbank.conf and reloaded genbank ############################################################################ # construct liftOver to mm9 (DONE - 2012-04-30 - Hiram) screen -S 10 # manage this longish running job in a screen mkdir /hive/data/genomes/mm10/bed/blat.mm9.2012-04-30 cd /hive/data/genomes/mm10/bed/blat.mm9.2012-04-30 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/mm10/mm10.11.ooc \ -debug -dbHost=hgwdev -workhorse=hgwdev mm10 mm9 > do.log 2>&1 # if that is OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/mm10/mm10.11.ooc \ -dbHost=hgwdev -workhorse=hgwdev mm10 mm9 > do.log 2>&1 # real 95m21.635s # verify this file exists: ls -og -L /gbdb/mm10/liftOver/mm10ToMm9.over.chain.gz # -rw-rw-r-- 1 535855 Feb 9 12:07 /gbdb/mm9/liftOver/mm9ToMm10.over.chain.gz # and try out the conversion on genome-test from mm10 to mm9 ############################################################################ # EXONIPHY MM10, lifted from hg19 (DONE - braney 2012-05-29) # needed for ucscGenes building # create a syntenic liftOver chain file cd /cluster/data/hg19/bed/lastz.mm10/axtChain time nice -n +19 netFilter -syn hg19.mm10.net.gz \ | netChainSubset -verbose=0 stdin hg19.mm10.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > hg19.mm10.syn.chain.gz #real 2m38.915s #user 3m29.458s #sys 0m16.033s # slightly smaller than the ordinary liftOver chain file: -rw-rw-r-- 1 78419424 Mar 7 18:40 hg19.mm10.over.chain.gz -rw-rw-r-- 1 74588027 May 29 12:29 hg19.mm10.syn.chain.gz # exoniphyMm10.gp is prepared as follows mkdir /cluster/data/mm10/bed/exoniphy cd /cluster/data/mm10/bed/exoniphy hgsql hg19 -e "select * from exoniphy" -N | cut -f 2-16 > exoniphyHg19.gp time nice -n +19 liftOver 
-genePred exoniphyHg19.gp \ /cluster/data/hg19/bed/lastz.mm10/axtChain/hg19.mm10.syn.chain.gz \ exoniphyMm10.gp unmapped # real 16m0.334s # user 15m46.462s # sys 0m7.115s wc -l * # 186601 exoniphyHg19.gp # 178821 exoniphyMm10.gp # 15560 unmapped cd /cluster/data/mm10/bed/exoniphy nice -n +19 hgLoadGenePred -genePredExt mm10 exoniphy exoniphyMm10.gp nice -n +19 featureBits mm10 exoniphy # 26795543 bases of 2652783500 (1.010%) in intersection nice -n +19 featureBits mm9 exoniphy # 25931742 bases of 2620346127 (0.990%) in intersection ############################################################################## # LASTZ cow bosTau6 (DONE - 2012-06-19 - Chin) # establish a screen to control this job with a name to indicate # what it is screen -S mm10BosTau6 mkdir /hive/data/genomes/mm10/bed/lastzBosTau6.2012-06-19 cd /hive/data/genomes/mm10/bed/lastzBosTau6.2012-06-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # cow vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cow BosTau6 SEQ2_DIR=/scratch/data/bosTau6/bosTau6.2bit SEQ2_LEN=/scratch/data/bosTau6/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzBosTau6.2012-06-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 212m21.604s cat fb.mm10.chainBosTau6Link.txt # 700039696 bases of 2652783500 (26.389%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzBosTau6.2012-06-19 lastz.bosTau6 # swap mkdir /hive/data/genomes/bosTau6/bed/blastz.mm10.swap cd 
/hive/data/genomes/bosTau6/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau6.2012-06-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 72m13.925s cat fb.bosTau6.chainMm10Link.txt # 688651806 bases of 2649682029 (25.990%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau6/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # lastz Medium Ground Finch geoFor1 (DONE - 2012-07-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10 mkdir /hive/data/genomes/mm10/bed/lastzGeoFor1.2012-07-29 cd /hive/data/genomes/mm10/bed/lastzGeoFor1.2012-07-29 cat << '_EOF_' > DEF # Mouse vs. medium ground finch BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Medium Ground Finch GeoFor1 SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzGeoFor1.2012-07-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 251m4.194s cat fb.mm10.chainGeoFor1Link.txt # 93984241 bases of 2652783500 (3.543%) in intersection # set sym link to indicate this is the lastz 
for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGeoFor1.2012-07-29 lastz.geoFor1 # and for the swap mkdir /hive/data/genomes/geoFor1/bed/blastz.mm10.swap cd /hive/data/genomes/geoFor1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGeoFor1.2012-07-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 10m0.875s cat fb.geoFor1.chainMm10Link.txt # 80273915 bases of 1041286029 (7.709%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/geoFor1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # construct assembly fragments table (DONE - 2012-09-11 - Hiram) mkdir /hive/data/genomes/mm10/bed/assemblyFrags cd /hive/data/genomes/mm10/bed/assemblyFrags zgrep -h -v "^#" "${F}" zgrep -h -v "^#" ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/*.comp.agp.gz \ | awk '$5 != "N"' \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $1,$2-1,$3,$6,$9}' \ | sed -e 's/CM000994.2/chr1/; s/CM000995.2/chr2/; s/CM000996.2/chr3/; s/CM000997.2/chr4/; s/CM000998.2/chr5/; s/CM000999.2/chr6/; s/CM001000.2/chr7/; s/CM001001.2/chr8/; s/CM001002.2/chr9/; s/CM001003.2/chr10/; s/CM001004.2/chr11/; s/CM001005.2/chr12/; s/CM001006.2/chr13/; s/CM001007.2/chr14/; s/CM001008.2/chr15/; s/CM001009.2/chr16/; s/CM001010.2/chr17/; s/CM001011.2/chr18/; s/CM001012.2/chr19/; s/CM001013.2/chrX/; s/CM001014.2/chrY/;' > chr.asmFrag.bed zgrep -h -v "^#" ../../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/*.agp.gz \ | awk '$5 != "N"' \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $1,$2-1,$3,$6,$9}' \ | sed -e "s#GL456233.1#chrX_GL456233_random#; s#GL456216.1#chr4_GL456216_random#; s#JH584299.1#chr5_JH584299_random#; s#JH584301.1#chrY_JH584301_random#; s#JH584300.1#chrY_JH584300_random#; s#JH584303.1#chrY_JH584303_random#; 
s#JH584302.1#chrY_JH584302_random#; s#JH584298.1#chr5_JH584298_random#; s#JH584297.1#chr5_JH584297_random#; s#JH584296.1#chr5_JH584296_random#; s#JH584295.1#chr4_JH584295_random#; s#JH584294.1#chr4_JH584294_random#; s#JH584293.1#chr4_JH584293_random#; s#JH584292.1#chr4_JH584292_random#; s#GL456354.1#chr5_GL456354_random#; s#GL456350.1#chr4_GL456350_random#; s#GL456221.1#chr1_GL456221_random#; s#GL456219.1#chr7_GL456219_random#; s#GL456213.1#chr1_GL456213_random#; s#GL456212.1#chr1_GL456212_random#; s#GL456211.1#chr1_GL456211_random#; s#GL456210.1#chr1_GL456210_random#;" > chrUL.asmFrag.bed zgrep -h -v "^#" ../../genbank/Primary_Assembly/unplaced_scaffolds/AGP/*.agp.gz \ | awk '$5 != "N"' | sed -e 's/\.1\t/\t/' \ | awk '{printf "chrUn_%s\t%d\t%d\t%s\t0\t%s\n", $1,$2-1,$3,$6,$9}' \ > chrUn.asmFrag.bed cat chr.asmFrag.bed chrUL.asmFrag.bed chrUn.asmFrag.bed > mm10.asmFrag.bed # add the chrM identity echo -e "chrM\t0\t1629\tAY172335.1\t0\t+" >> mm10.asmFrag.bed hgLoadBed mm10 assemblyFrags mm10.asmFrag.bed featureBits mm10 assemblyFrags # 2652769048 bases of 2652783500 (99.999%) in intersection # should be silent when all chr names are correct: checkTableCoords mm10 assemblyFrags ######################################################################### # construct ucscToEnsembl table (DONE - 2012-09-11 - Hiram) mkdir /hive/data/genomes/mm10/ensembl cd /hive/data/genomes/mm10/ensembl wget --timestamping \ 'ftp://ftp.ensembl.org/pub/release-68/fasta/mus_musculus/dna/Mus_musculus.GRCm38.68.dna.toplevel.fa.gz' wget --timestamping \ 'ftp://ftp.ensembl.org/pub/release-68/fasta/mus_musculus/dna/Mus_musculus.GRCm38.68.dna.nonchromosomal.fa.gz' faCount *.fa.gz > faCount.txt egrep -v "total|seq" faCount.txt | awk '{print $1,$2}' \ | sort -u | sort -k2nr | sed -e "s/ /\t/" > ensembl.chrom.sizes mkdir /hive/data/genomes/mm10/bed/ucscToEnsembl cd /hive/data/genomes/mm10/bed/ucscToEnsembl awk '{printf "%d\t%s\n", $2,$1}' ../../chrom.sizes | sort > sizes.chrom.ucsc awk '{printf 
"%d\t%s\n", $2,$1}' ../../ensembl/ensembl.chrom.sizes \ | sort > sizes.chrom.ensembl join sizes.chrom.ucsc sizes.chrom.ensembl \ | awk '{printf "%s\t%s\n", $2,$3}' > ucscToEnsembl.tab cut -f1 ucscToEnsembl.tab | awk '{print length($1)}' | sort -rn | head -1 # 20 cat << '_EOF_' > ucscToEnsembl.sql # UCSC to Ensembl chr name translation CREATE TABLE ucscToEnsembl ( ucsc varchar(255) not null, # UCSC chromosome name ensembl varchar(255) not null, # Ensembl chromosome name #Indices PRIMARY KEY(ucsc(20)) ); '_EOF_' hgLoadSqlTab mm10 ucscToEnsembl ucscToEnsembl.sql ucscToEnsembl.tab ######################################################################### # GRC Incident database (DONE - 2012-09-21 - Hiram) # updated the automatic scripts to include the build of this track # on Mm10 # this procedure is run as a cron job in Hiram's account: # 43 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo # using the two scrips there: runUpdate.sh and update.sh # which are checked into the source tree as files: # src/hg/utils/automation/grcIncidentUpdate.sh # src/hg/utils/automation/grcRunIncidentUpdate.sh # they fetch the XML files from NCBI, convert them to SQL text # files, construct a bigBed file, and pushes it to genomewiki if # it is an update from previous # the table in the dataBase is: grcIncidentDb # which is the URL to the bb file, a single row: # http://genomewiki.ucsc.edu/images/a/a4/Mm10.grcIncidentDb.bb # construct the table after running the script once manually: hgBbiDbLink mm10 grcIncidentDb \ "http://genomewiki.ucsc.edu/images/a/a4/Mm10.grcIncidentDb.bb" ######################################################################### # GRCm38.p1 patch 1 (DONE - 2012-09-21 - Hiram) mkdir /hive/data/genomes/mm10/bed/patch1 cd /hive/data/genomes/mm10/bed/patch1 rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Mus_musculus/GRCm38.p1/ ./genbank/ # slight modifications to this script from hg19 patch9 work: ./gatherNames.pl 
genbank > ucscNames.patch1.txt # examine the names for sanity: awk '{print $NF}' ucscNames.patch1.txt | sort # and they should not be longer than 31 characters: awk '{print $NF}' ucscNames.patch1.txt | sort | awk '{print length($0)}' \ | sort -n | tail # script from hg19 patch9, update the variable patchName ./mkTables.pl patches.chrom.sizes ucscNames.patch1.txt genbank/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz # output to stdout is the contents of alt.scaf.agp.gz # constructs ctgPos.txt chromInfo.txt gap.txt gold.txt # script from hg19 patch9, update the variable patchName ./mkCtgPos2.pl ucscNames.patch1.txt patches.chrom.sizes > ctgPos2.txt cp -p ../patch5/mkHapLocate.pl . ./mkHapLocate.pl ctgPos.txt \ PATCHES/alt_scaffolds/alt_scaffold_placement.txt \ > haplotypeLocations.bed cp -p haplotypeLocations.bed altSequence.bed ./mkFasta.pl ucscNames.patch1.txt > mm10.patch1.fa # the build of mm10Patch1 can be seen in mm10Patch1.txt egrep -v "32,32,190" altSequence.bed \ | awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \ > altSeqPatchesP1.tab # no haplotypes yet, this is nothing: egrep "32,32,190" altSequence.bed \ | awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \ > altSeqHaplotypesP1.tab # verify none lost wc -l altSequence.bed altSeqPatchesP1.tab altSeqHaplotypesP1.tab # 9 altSequence.bed # 9 altSeqPatchesP1.tab # 0 altSeqHaplotypesP1.tab # not necessary, there are none yet: hgLoadBed mm10 altSeqHaplotypesP1 altSeqHaplotypesP1.tab # Loaded 75 elements of size 6 hgLoadBed mm10 altSeqPatchesP1 altSeqPatchesP1.tab # Read 9 elements of size 6 from altSeqPatchesP1.tab # these tables are part of mouse/mm10/altSeqComposite1.ra ############################################################################## # Haplotype track (WORKING - 2012-10-01 - Hiram) cat << '_EOF_' > mkBedFile.pl #!/usr/bin/env perl use strict; use warnings; my $debug = 1; sub usage() { print STDERR "usage: ./mkBedFile.pl ../../mm10/genbank > mm10Haplotypes.bed\n"; print STDERR 
"expecting the Mus_musculus/GRCm38.p1/ hierarchy in ./genbank from NCBI\n"; exit 255; } my $argc = scalar(@ARGV); if ($argc != 1) { usage; } my $patchDir = shift; if ( ! -d $patchDir ) { print STDERR "ERROR: given directory $patchDir is not a directory or does not exist"; usage; } my %glSize; my %ctgToChr; my %ctgToFastaName; # my $fasta = "$patchDir/PATCHES/alt_scaffolds/FASTA/alt.scaf.fa.gz"; my @placeList = split('\n',`find $patchDir -type f | grep placement.txt | grep alt_scaffolds | grep -v UNKNOWN`); for (my $i = 0; $i < scalar(@placeList); ++$i) { printf STDERR "# %s\n", $placeList[$i]; open (FH, "grep -v '^#' $placeList[$i]|") or die "can not read $placeList[$i]"; while (my $line = ) { # printf STDERR "%s", $line; chomp $line; my @a = split('\s+', $line); next if ($a[11] eq "na"); $a[8] = "+" if ($a[8] eq "b"); my $descr = sprintf("Region name: %s", $a[7]); printf "chr%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\n", $a[5], $a[11], $a[12], $a[0], $a[8], $a[3], $descr; } close (FH); } '_EOF_' # << happy emacs chmod +x mkBedFile.pl ./mkBedFile.pl > mm10Haplotypes.bedDetail cat << '_EOF_' > mm10Haplotypes.sql CREATE TABLE mm10Haplotypes ( chrom varchar(255) not null, # Reference sequence chromosome or scaffold chromStart int unsigned not null, # Start position in chromosome chromEnd int unsigned not null, # End position in chromosome name varchar(255) not null, # Short Name of item score int unsigned, # Score from 0-1000 strand char(1), # + or - id varchar(255) not null, # ID to bed used in URL to link back description longblob not null, # Long description of item for the details page #Indices INDEX(chrom, chromStart) ); '_EOF_' hgLoadSqlTab mm10 mm10Haplotypes mm10Haplotypes.sql mm10Haplotypes.bedDetail # trackDb entry: track mm10Haplotypes shortLabel Alt. 
strains longLabel Alternate mouse strains, mapped to reference as haplotypes group varRep priority 111 visibility hide type bedDetail 8 url http://www.ncbi.nlm.nih.gov/nuccore/$$ urlLabel NCBI Nucleotide: ########################################################################## ## CYTOBAND - ideogram track (DONE - 2012-10-19 - Hiram) ssh hgwdev mkdir -p /hive/data/outside/ncbi/ideogram/2012-10 cd /hive/data/outside/ncbi/ideogram/2012-10 # fetch all the ideogram files: rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./ mkdir /hive/data/genomes/mm10/bed/cytoband cd /hive/data/genomes/mm10/bed/cytoband # Create bed file $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \ /hive/data/outside/ncbi/ideogram/2012-10/ideogram_10090_GCF_000000055.19_NA_V2 ## can now verify before load: $HOME/kent/src/utils/ncbi/cytoBandVerify.pl # everything checks out OK on 21 chroms # Load the bed file hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \ mm10 cytoBand cytoBand.bed # Read 403 elements of size 5 from cytoBand.bed # Make cytoBandIdeo track for ideogram gif on hgTracks page. # For mouse cytoBandIdeo is just a replicate of the cytoBand track. 
hgsql -e "drop table cytoBandIdeo;" mm10
hgsql mm10 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"

##########################################################################
# CYTOBANDIDEO update - (DONE - 2013-02-27 - kuhn)
# adding rows for chroms with no cytology
# this is just for navigation/orientation on those chroms
set db=mm10
set sql=~/kent/src/hg/lib/cytoBandIdeo.sql
# make backup of existing table
hgsql -e "CREATE TABLE cytoBandIdeoCopy SELECT * FROM cytoBandIdeo" $db
# dump existing table
hgsql -N -e "SELECT * FROM cytoBandIdeo" $db > $db.cytoBandIdeo
# find chroms already covered
hgsql -N -e 'SELECT chrom FROM cytoBandIdeo' $db \
    | sort -u > $db.coveredNames
# make cytoBand records for chroms not already covered:
# one whole-chromosome row each, empty band name, stain "gneg"
hgsql -N -e 'SELECT chrom, size FROM chromInfo' $db \
    | grep -wvf $db.coveredNames \
    | awk '{print $1"\t0\t"$2"\t\tgneg"}' > $db.cytoBandNew
# check
wc -l $db.*
# combine and sort
cat $db.cytoBandNew $db.cytoBandIdeo > $db.cytoBandIdeoFull
bedSort $db.cytoBandIdeoFull $db.cytoBandIdeoFull
# replace existing table
hgsql -e "DROP TABLE cytoBandIdeo" $db
hgLoadSqlTab $db cytoBandIdeo $sql $db.cytoBandIdeoFull
# check and then drop copy

##########################################################################
# lastz Lamprey petMar2 (DONE - 2012-10-17 - Hiram)
# establish a screen to control this job with a name to indicate what it is
screen -S petMar2
mkdir /hive/data/genomes/mm10/bed/lastzPetMar2.2012-10-19
cd /hive/data/genomes/mm10/bed/lastzPetMar2.2012-10-19
cat << '_EOF_' > DEF
# Mouse vs. Lamprey
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Mouse Mm10
SEQ1_DIR=/scratch/data/mm10/mm10.2bit
SEQ1_LEN=/scratch/data/mm10/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Lamprey PetMar2
SEQ2_DIR=/hive/data/genomes/petMar2/petMar2.2bit
SEQ2_LEN=/hive/data/genomes/petMar2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=60

BASE=/hive/data/genomes/mm10/bed/lastzPetMar2.2012-10-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
# number of jobs, 50,000 to something under 100,000
# when not present, SEQ2_LIMIT is a default 100
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -qRepeats=windowmaskerSdust \
    -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
# real 218m29.078s
cat fb.mm10.chainPetMar2Link.txt
# 28262565 bases of 2652783500 (1.065%) in intersection
# set sym link to indicate this is the lastz for this genome:
cd /hive/data/genomes/mm10/bed
ln -s lastzPetMar2.2012-10-19 lastz.petMar2

# and for the swap
mkdir /hive/data/genomes/petMar2/bed/blastz.mm10.swap
cd /hive/data/genomes/petMar2/bed/blastz.mm10.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/mm10/bed/lastzPetMar2.2012-10-19/DEF \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
# real 7m2.754s
# the swap of an mm10 target writes the chainMm10Link feature bits file
# (same naming as the bosTau6, geoFor1 and cerSim1 swaps in this document)
cat fb.petMar2.chainMm10Link.txt
# 20923095 bases of 647368134 (3.232%) in intersection
# set sym link to indicate this is the lastz for this genome:
cd /hive/data/genomes/petMar2/bed
ln -s blastz.mm10.swap lastz.mm10

#########################################################################
# lastz White Rhino cerSim1 (DONE - 2012-10-23 - Hiram)
# establish a screen to control this job with a name to indicate what it is
screen -S mm10CerSim1
mkdir /hive/data/genomes/mm10/bed/lastzCerSim1.2012-10-23
cd /hive/data/genomes/mm10/bed/lastzCerSim1.2012-10-23
cat << '_EOF_' > DEF
# Mouse vs. White Rhino
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz

# TARGET: Mouse Mm10
SEQ1_DIR=/scratch/data/mm10/mm10.2bit
SEQ1_LEN=/scratch/data/mm10/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=10

# QUERY: White Rhino CerSim1
SEQ2_DIR=/hive/data/genomes/cerSim1/cerSim1.2bit
SEQ2_LEN=/hive/data/genomes/cerSim1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=20

BASE=/hive/data/genomes/mm10/bed/lastzCerSim1.2012-10-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
# number of jobs, 50,000 to something under 100,000
# when not present, SEQ2_LIMIT is a default 100
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 992m45.890s
cat fb.mm10.chainCerSim1Link.txt
# 942281365 bases of 2652783500 (35.520%) in intersection
# set sym link to indicate this is the lastz for this genome:
cd /hive/data/genomes/mm10/bed
ln -s lastzCerSim1.2012-10-23 lastz.cerSim1

# and for the swap
mkdir /hive/data/genomes/cerSim1/bed/blastz.mm10.swap
cd /hive/data/genomes/cerSim1/bed/blastz.mm10.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/mm10/bed/lastzCerSim1.2012-10-23/DEF \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -swap -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 62m44s
cat fb.cerSim1.chainMm10Link.txt
# 926131511 bases of 2366858012 (39.129%) in intersection
# set sym link to indicate this is the lastz for this genome:
cd /hive/data/genomes/cerSim1/bed
ln -s blastz.mm10.swap lastz.mm10

#########################################################################
# QPCR PRIMERS (DONE - 2012-12-10 - Chin)
# The track name is changed to "qPCR Primers"
# Reload table with new track_mouse.BED (2013-01-28)
# Download
mkdir /hive/data/outside/Weizmann/qPcrPrimers
cd /hive/data/outside/Weizmann/qPcrPrimers
wget http://www.weizmann.ac.il/complex/compphys/software/Amit/primers/mouse/track_mouse.BED
mkdir -p /hive/data/genomes/mm10/bed/qPcrPrimers
cat track_mouse.BED | grep -v track \
    > /hive/data/genomes/mm10/bed/qPcrPrimers/qPcrPrimers_mm10.bed
cd /hive/data/genomes/mm10/bed/qPcrPrimers
hgLoadBed -bedDetail -tab -renameSqlTable \
    -sqlTable=$HOME/kent/src/hg/lib/bedDetail.sql \
    mm10 qPcrPrimers qPcrPrimers_mm10.bed
# Reading qPcrPrimers_mm10.bed
# Read 518230 elements of size 14 from qPcrPrimers_mm10.bed
# Sorted
# Creating table definition for qPcrPrimers
# Saving bed.tab
# Loading mm10
# NULL description column
hgsql mm10 -ne "UPDATE qPcrPrimers SET description = NULL;"

#########################################################################
# DBSNP B137 / SNP137 (DONE 12/20/12 angie)
# Redmine #7043
mkdir -p /hive/data/outside/dbSNP/137/mouse
cd /hive/data/outside/dbSNP/137/mouse
# Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
# to find the subdir name to use as orgDir below (mouse_10090 in this case).
# Then click into that directory and look for file names like
# b(1[0-9][0-9])_*_([0-9]+_[0-9])
# -- use the first num for build and the second num_num for buildAssembly.
# jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp.
#
# Some trial and error was required to get the config.ra just right --
# the b* filenames don't include buildAssembly!
# patch contigs needed to be filtered out:
# NOTE(review): the here-document body for config.ra and the command that
# wrote do.log are missing from the next line in this copy (everything
# between "<" and ">" was lost); restore from the original make doc.
cat > config.ra <& do.log & tail -f do.log
# Script ended with feedback about needing refAssemblyLabel because dbSNP
# mapped to more than one assembly; add the label that clearly corresponds to
# mm10, GRCm38, to config.ra and try again:
# NOTE(review): same loss as above -- the config.ra contents and the command
# appending to do.log are missing from the next line.
cat > config.ra <>& do.log & tail -f do.log
# Script ended with feedback about unrecognized NT_* contigs from dbSNP.
# Inspect the script-generated suggested.lft for liftUp; it's usually right. # For contigs that are labeled as part of GRCm38 but not liftable to mm10, # listed in script-generated cantLiftUpSeqNames.txt, do some entrez # nucleotide searches for contig IDs and convince yourself that they're all # for alt assembly sequences that we don't include in mm10 (e.g. patches, # other strains). Then tell the script to filter out those contigs: cut -f 2 cantLiftUpSeqNames.txt > ignoreAltAssemblyContigs.txt cat > config.ra <>& do.log & tail -f do.log # ... #MultipleAlignments 1667342 This variant aligns in more than one location. #ObservedMismatch 4561144 UCSC reference allele does not match any observed allele from dbSNP. # # *** All done! # That is an unusually high count of ObservedMismatch... follow up with dbSNP. ############################################################################# # FILTER SNP137 (DONE 12/21/12 angie) # Redmine #7043 # Make several tracks that are filtered subsets of snp137: # First, filter out the multiply-aligned and/or weight >1 SNPs -> snp137Mult # Second, siphon off the common variants -> snp137Common # Third, take the (uniquely mapped, not known to be common) variants # w/dbSNP's "clinically-assoc" flag -> snp137Flagged cd /hive/data/outside/dbSNP/137/mouse zcat snp137.bed.gz \ | perl -we \ '$minTotal2N = 10; \ ($multCount, $comCount, $flagCount, $miscCount) = (0,0,0,0); \ open($mult, "| gzip -c > snp137Mult.bed.gz") || die; \ open($common, "| gzip -c > snp137Common.bed.gz") || die; \ open($flagged, "| gzip -c > snp137Flagged.bed.gz") || die; \ open($misc, "| gzip -c > snp137Misc.bed.gz") || die; \ while (<>) { \ @w = split("\t"); \ if ($w[16] > 1 || $w[17] =~ /MultipleAlignments/) { \ print $mult $_; \ $multCount++; \ } else { \ my ($alleleFreqCount, $nStr, $freqStr) = ($w[20], $w[22], $w[23]); \ my @alNs = split(",", $nStr); die unless scalar(@alNs) == $alleleFreqCount; \ my @freqs = split(",", $freqStr); die unless scalar(@freqs) == 
$alleleFreqCount; \ my ($total2N, $maxAlleleFreq) = (0, 0); \ for (my $i = 0; $i < $alleleFreqCount; $i++) { \ $total2N += $alNs[$i]; \ $maxAlleleFreq = $freqs[$i] if ($freqs[$i] > $maxAlleleFreq); \ } \ if ($alleleFreqCount >= 2 && $total2N >= $minTotal2N && $maxAlleleFreq <= 0.99) { \ print $common $_; \ $comCount++; \ } elsif($w[24] =~ /clinically-assoc/) { \ print $flagged $_; \ $flagCount++; \ } else { \ print $misc $_; \ $miscCount++; \ } \ } \ } \ close($mult); close($common); close($flagged); close($misc); \ print "snp137Mult: $multCount\nsnp137Common: $comCount\nsnp137Flagged: $flagCount\n" . \ "leftover: $miscCount\n";' #snp137Mult: 1671771 #snp137Common: 2709532 #snp137Flagged: 0 #leftover: 66537658 # It's expected for snp137Flagged to be empty because that's for human SNPs. # Load tables foreach subset (Mult Common) hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ mm10 snp137$subset -sqlTable=snp137.sql snp137$subset.bed.gz end ############################################################################ # DBSNP CODING ANNOTATIONS (137) (DONE 12/21/12 angie) # Redmine #7043 cd /hive/data/outside/dbSNP/137/mouse # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # For anything except an insertion (0 bases between flanks), # we need to add 1 to the end coord. For an insertion, we need # to add 1 to the start coord. Make a hash of the insertion IDs, # then look up each ID in ncbiFuncAnnotations.txt to tell which # transform to apply. # Note: sort -u with the keys below is too restrictive -- we need full line uniq. zcat ncbiFuncAnnotations.txt.gz \ | perl -we 'open($IDS, "zcat ncbiFuncInsertions.ctg.bed.gz |") || die "ids: $!"; \ while (<$IDS>) { chomp; $ids{$_} = 1; } \ close($IDS); \ %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 43=>1, 44=>1, 45=>1); \ while (<>) { \ chomp; @w = split("\t"); # id, ctg, start, end, ... 
\ next unless $coding{$w[5]}; \ $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \ if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \ $w[2]++; # 2-base insertions: increment start coord \ } else { \ $w[3]++; # increment end coord to get half-open \ } \ print join("\t", @w) . "\n"; \ }' \ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \ | uniq \ > ncbiCodingAnnotations.txt wc -l ncbiCodingAnnotations.txt #1884989 ncbiCodingAnnotations.txt # How many & what kinds of function types? cut -f 6 ncbiCodingAnnotations.txt \ | sort -n | uniq -c # 371388 3 (coding-synon) #1301099 8 (cds-reference -- ignored) # 3465 41 (nonsense) # 199148 42 (missense) # 319 43 (stop-loss) # 7422 44 (frameshift) # 2148 45 (cds-indel) # In b137, the functional annotations include non-coding (frame = NULL), # which we'll exclude here because this is supposed to be just coding stuff... # probably need to update how we show dbSNP's func annos anyway, e.g. # it is a shame that we toss out codon number and transcript offset. # Gather up multiple annotation lines into one line per {snp, gene, frame}: perl -e 'while (<>) { chomp; \ my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \ next if ($fxn == 8 && ($frm eq "NULL" && $aa eq "NULL" && $codon eq "NULL")); \ if (defined $lastRs && \ ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \ $lastTx ne $txId || $lastFrm ne $frm)) { \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . 
\ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $lineOut =~ s@NULL@n/a@g; \ print $lineOut; \ $refRow = undef; @rows = (); ($count, $fxns, $nts, $codons, $aas) = (); \ } \ ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \ ($rsId, $ctg, $s, $e, $txId, $frm); \ $count++; \ if ($fxn == 8) { \ $refRow = [$fxn, $nt, $aa, $codon]; \ } else { \ $fxns .= "$fxn,"; $nts .= "$nt,"; $aas .= "$aa,"; $codons .= "$codon,"; \ } \ } \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $lineOut =~ s@NULL@n/a@g; \ print $lineOut;' \ ncbiCodingAnnotations.txt \ | liftUp snp137CodingDbSnp.bed /hive/data/outside/dbSNP/137/mouse/suggested.lft warn stdin hgLoadBed mm10 snp137CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \ snp137CodingDbSnp.bed #Read 552120 elements of size 11 from snp137CodingDbSnp.bed #########################################################################