# for emacs: -*- mode: sh; -*- # # the above keeps emacs happy while working with this text document # This file describes how we made the browser database on the # Rattus norvegicus genome, March 2012 update (Rnor5.0) from Baylor. # http://www.ncbi.nlm.nih.gov/bioproject/16219 # http://www.ncbi.nlm.nih.gov/genome/73 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AABR06 # Genome Coverage : 3x BAC; 6x WGS ABI Sanger reads # chrMt: NC_001665.2 # DATE: 08-Mar-2012 # ORGANISM: Rattus norvegicus # TAXID: 10116 # ASSEMBLY LONG NAME: Rnor_5.0 # ASSEMBLY SHORT NAME: Rnor_5.0 # ASSEMBLY SUBMITTER: Rat Genome Sequencing Consortium # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000001895.3 # FTP-RELEASE DATE: 19-Mar-2012 ######################################################################### ## Download sequence (DONE - 2012-03-19 - Hiram) mkdir /hive/data/genomes/rn5 mkdir /hive/data/genomes/rn5/genbank cd /hive/data/genomes/rn5/genbank rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Rattus_norvegicus/Rnor_5.0/ ./ faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz # 2902588968 bases (334517722 N's 2568071246 real 2568071246 upper 0 lower) # in 21 sequences in 21 files # Total size: mean 138218522.3 sd 67159802.9 # min 54450796 (gi|380690185|gb|CM000083.4|) # max 290094216 (gi|380690196|gb|CM000072.4|) median 118718031 faSize Primary_Assembly/unlocalized_scaffolds/FASTA/*.fa.gz # 4156200 bases (1012688 N's 3143512 real 3143512 upper 0 lower) # in 1278 sequences in 21 files # Total size: mean 3252.1 sd 8429.6 # min 500 (gi|380099756|gb|AABR06109458.1|) # max 227955 (gi|380099484|gb|AABR06109730.1|) median 2219 faSize Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz # 2937457 bases (1314805 N's 1622652 real 1622652 upper 0 lower) # in 1439 sequences in 1 files # Total size: mean 2041.3 sd 5072.3 min 280 (gi|380097677|gb|AABR06111537.1|) # max 73090 (gi|380452989|gb|JH620568.1|) median 723 # and all together: faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz \ Primary_Assembly/unlocalized_scaffolds/FASTA/*.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz # 2909682625 bases (336845215 N's 2572837410 real 2572837410 upper 0 lower) # in 2738 sequences in 43 files # Total size: mean 1062703.7 sd 13357023.2 # min 280 (gi|380097677|gb|AABR06111537.1|) # max 290094216 (gi|380690196|gb|CM000072.4|) median 1018 ######################################################################### # fixup names for UCSC standards (DONE - 2012-03-19 - Hiram) mkdir /hive/data/genomes/rn5/ucsc cd /hive/data/genomes/rn5/ucsc ######################## Assembled Chromosomes cat << '_EOF_' > toUcsc.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = ) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x toUcsc.pl time ./toUcsc.pl # real 0m50.678s time gzip *.fa *.agp # real 12m46.877s faSize chr*.fa.gz # 2725521370 bases (77999939 N's 2647521431 real 2647521431 upper 0 # lower) in 21 sequences in 21 files # Total size: mean 129786731.9 sd 33408399.1 min 61431566 (chr19) # max 195471971 (chr1) median 124902244 ######################## Unplaced scaffolds cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">unplaced.agp") or die "can not write to unplaced.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">unplaced.fa") or die "can not write to unplaced.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1\|.*//; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 0m0.131s # make sure none of the names got to be over 31 characers long: grep -v "^#" unplaced.agp | cut -f1 | sort | uniq -c | sort -rn gzip *.fa *.agp # not much in that sequence: faSize unplaced.fa.gz # 2937457 bases (1314805 N's 1622652 real 1622652 upper 0 lower) # in 1439 sequences in 1 files # Total size: mean 2041.3 sd 5072.3 min 280 (chrUn_AABR06111537) # max 73090 (chrUn_JH620568) median 723 ######################## Unlocalized scaffolds cat << '_EOF_' > unlocalized.pl #!/bin/env perl use strict; use warnings; my %accToChr; my %chrNames; open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; $chrNames{$chrN} += 1; } close (FH); foreach my $chrN (keys %chrNames) { my $agpFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { chomp $line; my (@a) = split('\t', $line); my $acc = $a[0]; my $accNo1 = $acc; $accNo1 =~ s/.1$//; die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${accNo1}_random"; printf UC "%s", $ucscName; for (my $i = 1; $i < scalar(@a); ++$i) { printf UC "\t%s", $a[$i]; } printf UC "\n"; } } close (FH); close (UC); printf "chr%s\n", $chrN; open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; my $acc = $line; $acc =~ s/.*gb\|//; $acc =~ s/\|.*//; my $accNo1 = $acc; $accNo1 =~ s/.1$//; die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${accNo1}_random"; printf UC ">$ucscName\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x unlocalized.pl time ./unlocalized.pl # real 0m0.592s gzip *.fa *.agp # verify none of the names are longer than 31 characters: zcat *.agp.gz | grep -v "^#" | cut -f1 | sort -u \ | awk '{print length($1),$1}' | sort -rn | head 25 chr20_AABR06110665_random 25 chr20_AABR06110664_random ... faSize chr*_random.fa.gz # 4156200 bases (1012688 N's 3143512 real 3143512 upper 0 lower) # in 1278 sequences in 21 files # Total size: mean 3252.1 sd 8429.6 min 500 (chr2_AABR06109458_random) # max 227955 (chr4_AABR06109730_random) median 2219 # verify all the sequence is still here after all this rigamarole: time faSize *.fa.gz # 2909682625 bases (336845215 N's 2572837410 real 2572837410 upper 0 lower) # in 2738 sequences in 43 files # Total size: mean 1062703.7 sd 13357023.2 min 280 (chrUn_AABR06111537) # max 290094216 (chr1) median 1018 # verify same numbers as was in the original files measured above ######################################################################### # Create .ra file and run makeGenomeDb.pl (DONE - Hiram - 2012-03-19) cd /hive/data/genomes/rn5 cat << '_EOF_' >rn5.config.ra # Config parameters for makeGenomeDb.pl: db rn5 clade mammal scientificName Rattus norvegicus commonName Rat assemblyDate Mar. 2012 assemblyLabel RGSC Rnor_5.0 (GCA_000001895.3) assemblyShortLabel RGSC 5.0 orderKey 1559 mitoAcc NC_001665 fastaFiles /hive/data/genomes/rn5/ucsc/*.fa.gz agpFiles /hive/data/genomes/rn5/ucsc/*.agp.gz # qualFiles none dbDbSpeciesDir rat ncbiAssemblyId 73 ncbiAssemblyName Rnor_5.0 taxId 10116 '_EOF_' # << happy emacs # run agp step first to verify fasta and agp files agree makeGenomeDb.pl -stop=agp rn5.config.ra > agp.log 2>&1 # verify end of agp.log indictates: # All AGP and FASTA entries agree - both files are valid # continue with the build time makeGenomeDb.pl -continue=db rn5.config.ra > db.log 2>&1 # real 22m39.834s ######################################################################### # running repeat masker (DONE - 2012-03-19 - Hiram) mkdir /hive/data/genomes/rn5/bed/repeatMasker cd /hive/data/genomes/rn5/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek rn5 > do.log 2>&1 & # hgwdev rebooted during kluster run, continuing: time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -continue=cat -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek rn5 > cat.log 2>&1 & # real 55m22.186s cat faSize.rmsk.txt # 2909698938 bases (336845215 N's 2572853723 real 1473277432 upper # 1099576291 lower) in 2739 sequences in 1 files # Total size: mean 1062321.6 sd 13354598.7 min 280 (chrUn_AABR06111537) # max 290094216 (chr1) median 1018 # %37.79 masked total, %42.74 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker featureBits -countGaps rn5 rmsk # 1100336249 bases of 2909698938 (37.816%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the faSize count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-03-19 - Hiram) mkdir /hive/data/genomes/rn5/bed/simpleRepeat cd /hive/data/genomes/rn5/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ rn5 > do.log 2>&1 & # real 14m45.644s cat fb.simpleRepeat # 97893561 bases of 2780239565 (3.521%) in intersection # add the TRF mask to the rmsk sequence: # it masks more sequence cd /hive/data/genomes/rn5 twoBitMask rn5.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed rn5.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa rn5.2bit stdout | faSize stdin > faSize.rn5.2bit.txt cat faSize.rn5.2bit.txt # 2909698938 bases (336845215 N's 2572853723 real 1471484052 upper # 1101369671 lower) in 2739 sequences in 1 files # Total size: mean 1062321.6 sd 13354598.7 min 280 (chrUn_AABR06111537) # max 290094216 (chr1) median 1018 # %37.85 masked total, %42.81 masked real # replace the previous symLink which goes to the unmasked 2bit rm /gbdb/rn5/rn5.2bit ln -s `pwd`/rn5.2bit /gbdb/rn5/rn5.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-03-19 - Hiram) mkdir /hive/data/genomes/rn5/bed/gap cd /hive/data/genomes/rn5/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../rn5.unmasked.2bit > findMotif.txt 2>&1 # real 0m35.258s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits -countGaps rn5 -not gap -bed=notGap.bed # 2573362844 bases of 2909698938 (88.441%) in intersection # real 0m14.151s time featureBits -countGaps rn5 allGaps.bed notGap.bed -bed=new.gaps.bed # 509121 bases of 2909698938 (0.017%) in intersection # real 2m40.070s # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" rn5 | sort -n | tail -1 # 22612 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" rn5 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps rn5 other.bed # 509121 bases of 2909698938 (0.017%) in intersection wc -l other.bed # 148489 # verify no mistake here: featureBits -countGaps rn5 gap other.bed # 0 bases of 2909698938 (0.000%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad rn5 otherGap other.bed # starting with this many hgsql -e "select count(*) from gap;" rn5 # 109913 hgsql rn5 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" rn5 # 258402 calc 109913 \+ 148489 # 109913 + 148489 = 258402 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 rn5 nonBridged.lift -bedFile=nonBridged.bed # see example in danRer7.txt when problems arise # there are no non-bridged gaps here: hgsql -N -e "select bridge from gap;" rn5 | sort | uniq -c # 8109 no # 101804 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-03-19 - Hiram) mkdir /hive/data/genomes/rn5/bed/windowMasker cd /hive/data/genomes/rn5/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev rn5 > do.log 2>&1 & # real 231m12.464s # Masking statistics twoBitToFa rn5.wmsk.2bit stdout | faSize stdin # 2909698938 bases (336845215 N's 2572853723 real 1725214664 upper # 847639059 lower) in 2739 sequences in 1 files # Total size: mean 1062321.6 sd 13354598.7 min 280 (chrUn_AABR06111537) # max 290094216 (chr1) median 1018 # %29.13 masked total, %32.95 masked real twoBitToFa rn5.wmsk.sdust.2bit stdout | faSize stdin # 2909698938 bases (336845215 N's 2572853723 real 1710265619 upper # 862588104 lower) in 2739 sequences in 1 files # Total size: mean 1062321.6 sd 13354598.7 min 280 (chrUn_AABR06111537) # max 290094216 (chr1) median 1018 # %29.65 masked total, %33.53 masked real hgLoadBed rn5 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 13855996 elements of size 3 featureBits -countGaps rn5 windowmaskerSdust # 1199303584 bases of 2909698938 (41.217%) in intersection # eliminate the gaps from the masking featureBits rn5 -not gap -bed=notGap.bed # 2572853723 bases of 2572853723 (100.000%) in intersection time nice -n +19 featureBits rn5 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 862588104 bases of 2572853723 (33.527%) in intersection # real 3m53.016s # reload track to get it clean hgLoadBed rn5 windowmaskerSdust cleanWMask.bed.gz # Loaded 13904385 elements of size 4 time featureBits -countGaps rn5 windowmaskerSdust # 862588104 bases of 2909698938 (29.645%) in intersection # real 1m26.398s # mask with this clean result zcat cleanWMask.bed.gz \ | twoBitMask ../../rn5.unmasked.2bit stdin \ -type=.bed rn5.cleanWMSdust.2bit twoBitToFa rn5.cleanWMSdust.2bit stdout | faSize stdin \ > rn5.cleanWMSdust.faSize.txt cat rn5.cleanWMSdust.faSize.txt # 2909698938 bases (336845215 N's 2572853723 real 1710265619 upper # 862588104 lower) in 2739 sequences in 1 files # Total size: mean 1062321.6 sd 13354598.7 min 280 (chrUn_AABR06111537) # max 290094216 (chr1) median 1018 # %29.65 masked total, %33.53 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps rn5 rmsk windowmaskerSdust # 648600018 bases of 2909698938 (22.291%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF # not running this since RM + TRF is plenty of masking # cd /hive/data/genomes/rn5 # twoBitMask -add bed/windowMasker/rn5.cleanWMSdust.2bit \ # bed/simpleRepeat/trfMask.bed rn5.2bit # safe to ignore the warnings about BED file with >=13 fields # twoBitToFa rn5.2bit stdout | faSize stdin > faSize.rn5.txt # cat faSize.rn5.txt # create symlink to gbdb # ssh hgwdev # rm /gbdb/rn5/rn5.2bit # ln -s `pwd`/rn5.2bit /gbdb/rn5/rn5.2bit ######################################################################### # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2012-03-23 - Hiram) ssh encodek mkdir /hive/data/genomes/rn5/bed/linSpecRep cd /hive/data/genomes/rn5/bed/linSpecRep # split the RM output by chromosome name into separate files mkdir rmsk dateRepeats head -3 ../repeatMasker/rn5.sorted.fa.out > rmsk.header.txt headRest 3 ../repeatMasker/rn5.sorted.fa.out \ | splitFileByColumn -ending=.out -col=5 -head=rmsk.header.txt stdin rmsk ls -1S rmsk/* > rmOut.list wc -l rmOut.list # 2382 rmOut.list cat << '_EOF_' > mkLSR #!/bin/csh -fe rm -f dateRepeats/$1_homo-sapiens_mus-musculus /scratch/data/RepeatMasker110426/DateRepeats \ $1 -query rat -comp human -comp mouse mv $1_homo-sapiens_mus-musculus dateRepeats '_EOF_' # << happy emacs chmod +x mkLSR cat << '_EOF_' > template #LOOP ./mkLSR $(path1) {check out line+ dateRepeats/$(file1)_homo-sapiens_mus-musculus} #ENDLOOP '_EOF_' # << happy emacs gensub2 rmOut.list single template jobList para create jobList para try ... check ... push ... etc... para time # Completed: 2382 of 2382 jobs # CPU time in finished jobs: 37294s 621.57m 10.36h 0.43d 0.001 y # IO & Wait Time: 6367s 106.11m 1.77h 0.07d 0.000 y # Average job time: 18s 0.31m 0.01h 0.00d # Longest finished job: 78s 1.30m 0.02h 0.00d # Submission to last job: 2245s 37.42m 0.62h 0.03d mkdir notInHuman notInMouse for F in dateRepeats/chr*.out_homo-sapiens* do B=`basename ${F}` B=${B/.out*/} echo $B /cluster/bin/scripts/extractRepeats 1 ${F} > \ notInHuman/${B}.out.spec /cluster/bin/scripts/extractRepeats 2 ${F} > \ notInMouse/${B}.out.spec done # Verify that these two things are actually different # To check identical find ./notInHuman ./notInMouse -name "*.out.spec" | \ while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \ | sort -k1,1n | sort -t"/" -k3,3 > check.same # some of them are the same, but not all: sed -e 's#./notInHuman/##; s#./notInMouse/##' check.same \ | sort | uniq -c | sort -rn | less # you will see a count of two at the beginning, but it becomes one soon # Copy to data/staging for cluster replication mkdir /hive/data/staging/data/rn5 rsync -a -P ./notInMouse/ /hive/data/staging/data/rn5/notInMouse/ rsync -a -P ./notInHuman/ /hive/data/staging/data/rn5/notInHuman/ # We also need the nibs for the lastz runs with lineage specific repeats mkdir /hive/data/staging/data/rn5/nib mkdir /hive/data/genomes/rn5/nib cd /hive/data/genomes/rn5 cut -f1 chrom.sizes | while read C do twoBitToFa -seq=${C} rn5.2bit stdout | faToNib -softMask stdin nib/${C}.nib ls -og nib/$C.nib done # verify one is properly masked: nibFrag -masked nib/chrM.nib 0 `grep -w chrM chrom.sizes | cut -f2` + \ stdout | faSize stdin # 16313 bases (0 N's 16313 real 15922 upper 391 lower) # in 1 sequences in 1 files # %2.40 masked total, %2.40 masked real # compare to: twoBitToFa -seq=chrM rn5.2bit stdout | faSize stdin # 16313 bases (0 N's 16313 real 15922 upper 391 lower) # in 1 sequences in 1 files # %2.40 masked total, %2.40 masked real # Copy to data/genomes staging for cluster replication rsync -a -P ./nib/ /hive/data/staging/data/rn5/nib/ ######################################################################### # cpgIslands - (DONE - 2011-04-20 - Hiram) mkdir /hive/data/genomes/rn5/bed/cpgIslands cd /hive/data/genomes/rn5/bed/cpgIslands time doCpgIslands.pl rn5 > do.log 2>&1 # Elapsed time: 11m10s cat fb.rn5.cpgIslandExt.txt # 10377460 bases of 2572853723 (0.403%) in intersection ######################################################################### # genscan - (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/rn5/bed/genscan cd /hive/data/genomes/rn5/bed/genscan time doGenscan.pl rn5 > do.log 2>&1 # real 104m9.716s # a number of jobs did not finish, rerunning with window size 2000000 # only chr7 completed at 15000000 # rerunning with window size of 1000000 # real 78m10.904s # rerunning with window size of 500000 # rerunning with window size of 250000 # rerunning with window size of 100000 # real 85m5.776s # rerunning with window size of 50000 # real 86m57.706s # This is not working. Need to run these split mkdir /hive/data/genomes/rn5/bed/genscan/splitRun cd /hive/data/genomes/rn5/bed/genscan/splitRun gapToLift rn5 rn5.nonBridged.lift -bedFile=rn5.nonBridged.bed for C in 2 3 4 5 6 7 11 13 15 19 do echo chr${C} cd /hive/data/genomes/rn5/bed/genscan/splitRun grep -w "chr${C}" rn5.nonBridged.lift | grep -v random \ | sed -e "s/chr${C}./chr${C}_/" > chr${C}.nonBridged.lift mkdir chr${C} faToTwoBit ../hardMaskedFa/000/chr${C}.fa chr${C}/chr${C}.2bit ~/kent/src/hg/utils/lft2BitToFa.pl chr${C}/chr${C}.2bit \ chr${C}.nonBridged.lift > chr${C}/chr${C}.nonBridged.fa cd /hive/data/genomes/rn5/bed/genscan/splitRun/chr${C} mkdir split${C} faSplit sequence chr${C}.nonBridged.fa 100 split${C}/chr${C}_ done for C in 2 3 4 5 6 7 11 13 15 19 do grep -w "chr${C}" rn5.nonBridged.lift | grep -v random \ | sed -e "s/chr${C}./chr${C}_/" > chr${C}.nonBridged.lift done for C in 2 3 4 5 6 7 11 13 15 19 do echo chr${C} cd /hive/data/genomes/rn5/bed/genscan/splitRun/chr${C} echo '#!/bin/sh' > cmdList.sh export NL=-1 ls split${C} | while read F do NL=`echo $NL | awk '{print $1+1}'` if [ "${NL}" -gt 7 ]; then NL=0 echo "echo waiting before $F" >> cmdList.sh echo wait >> cmdList.sh fi echo "/cluster/bin/x86_64/gsBig split${C}/${F} gtf/${F}.gtf -trans=pep/${F}.pep -subopt=subopt/${F}.bed -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 &" done >> cmdList.sh echo "echo waiting at end" >> cmdList.sh echo "wait" >> cmdList.sh chmod +x cmdList.sh rm -fr gtf pep subopt mkdir gtf pep subopt done # running chr15 - real 345m30.138s # running them all: time ./runAll.sh > runAll.log 2>&1 # real 458m23.357s # collecting the results: for C in chr2 chr3 chr4 chr5 chr6 chr7 chr11 chr13 chr15 chr19 do cd /hive/data/genomes/rn5/bed/genscan/splitRun/${C} cat gtf/${C}_*.gtf | liftUp -type=.gtf stdout ../${C}.nonBridged.lift error stdin \ | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.gtf cat subopt/${C}_*.bed | liftUp -type=.bed stdout ../${C}.nonBridged.lift error stdin \ | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.subopt.bed cat pep/${C}_*.pep | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.pep ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed done # after verifying the sizes of the files seem same compared to what # happened in the main run: for C in chr2 chr3 chr4 chr5 chr6 chr7 chr11 chr13 chr15 chr19 do cd /hive/data/genomes/rn5/bed/genscan/splitRun/${C} ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed cd /hive/data/genomes/rn5/bed/genscan/splitRun done # this is tricky, it is counting on the file existing, empty or otherwise for C in chr2 chr3 chr4 chr5 chr6 chr7 chr11 chr13 chr15 chr19 do cd /hive/data/genomes/rn5/bed/genscan/splitRun/${C} D=`ls ../../gtf/00?/${C}.gtf` rm -f "${D}" cp -p ${C}.gtf "${D}" D=`ls ../../pep/00?/${C}.pep` rm -f "${D}" cp -p ${C}.pep "${D}" D=`ls ../../subopt/00?/${C}.bed` rm -f "${D}" cp -p ${C}.subopt.bed "${D}" cd /hive/data/genomes/rn5/bed/genscan/splitRun done # Now, we can continue cd /hive/data/genomes/rn5/bed/genscan time doGenscan.pl -continue=makeBed -workhorse=hgwdev -dbHost=hgwdev \ rn5 > makeBed.log 2>&1 # real 1m51.711s cat fb.rn5.genscan.txt # 56153914 bases of 2572853723 (2.183%) in intersection cat fb.rn5.genscanSubopt.txt # 59799363 bases of 2572853723 (2.324%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-04 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2572837410 / 2897316137 \) \* 1024 # ( 2572837410 / 2897316137 ) * 1024 = 909.319309 # round up to 950 (rn4 was 1024) cd /hive/data/genomes/rn5 time blat rn5.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/rn5.11.ooc -repMatch=800 # Wrote 34513 overused 11-mers to jkStuff/rn5.11.ooc # rn4 had: Wrote 25608 overused 11-mers to /cluster/bluearc/rn4/11.ooc # real 1m13.627s # there are non-bridged gaps, create lift file needed for genbank hgsql -N -e "select bridge from gap;" rn5 | sort | uniq -c # 8109 no # 250293 yes cd /hive/data/genomes/rn5/jkStuff gapToLift rn5 rn5.nonBridged.lift -bedFile=rn5.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' rn5.nonBridged.bed | sort -nr | head # 13151837 chr10 48884958 62036795 chr10.150 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-04 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Rattus norvegicus 123478 1103589 16820 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add: # rn5 (rat) rn5.serverGenome = /hive/data/genomes/rn5/rn5.2bit rn5.clusterGenome = /hive/data/genomes/rn5/rn5.2bit rn5.ooc = /hive/data/genomes/rn5/jkStuff/rn5.11.ooc rn5.lift = /hive/data/genomes/rn5/jkStuff/rn5.nonBridged.lift rn5.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} rn5.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} rn5.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} rn5.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} rn5.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} rn5.downloadDir = rn5 rn5.refseq.mrna.xeno.load = yes rn5.refseq.mrna.xeno.loadDesc = yes rn5.perChromTables = no rn5.mgc = yes # rn5.upstreamGeneTbl = refGene # rn5.upstreamMaf = multiz9way /hive/data/genomes/rn5/bed/multiz9way/species.lst # end of section added to etc/genbank.conf git commit -m "adding rn5 rat" etc/genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S rn5 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial rn5 & # var/build/logs/2012.05.04-10:28:06.rn5.initalign.log # real 1166m46.194s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad rn5 & # logFile: var/dbload/hgwdev/logs/2012.05.07-13:39:54.dbload.log # real 63m53.369s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add rn5 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added rn5." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # constructing downloads # first add rn5 to all.joiner and make sure joinerCheck is clean on it cd /hive/data/genomes/rn5 time makeDownloads.pl -workhorse=hgwdev -dbHost=hgwdev rn5 \ > downloads.log 2>&1 # real 24m32.649s # need to edit the README.txt files to make sure they are correct. ########################################################################## # create pushQ entry (DONE - 2012-05-23 - Hiram) # first make sure all.joiner is up to date and has this new organism # a keys check should be clean: cd ~/kent/src/hg/makeDb/schema joinerCheck -database=rn5 -keys all.joiner mkdir /hive/data/genomes/rn5/pushQ cd /hive/data/genomes/rn5/pushQ makePushQSql.pl rn5 > rn5.sql 2> stderr.out # check stderr.out for no significant problems, it is common to see: # WARNING: hgwdev does not have /gbdb/rn5/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/rn5/wib/quality.wib # WARNING: hgwdev does not have /gbdb/rn5/bbi/quality.bw # WARNING: rn5 does not have seq # WARNING: rn5 does not have extFile # which are no real problem # if some tables are not identified: # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # # put them in manually after loading the pushQ entry scp -p rn5.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < rn5.sql ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2012-06-07 - Hiram) hgsql -N -e "select frag from gold;" rn5 | sort | head -1 AABR06000001.1 hgsql -N -e "select frag from gold;" rn5 | sort | tail -2 AABR06112651.1 NC_001665 # to test the rule: hgsql -N -e "select frag from gold;" rn5 | wc -l # 112652 # should find all: hgsql -N -e "select frag from gold;" rn5 \ | egrep -e '[AN][AC][B_][R0]0[0-9]+(\.1)?' | wc -l # 112652 # or exclude all: hgsql -N -e "select frag from gold;" rn5 \ | egrep -v -e '[AN][AC][B_][R0]0[0-9]+(\.1)?' | wc -l # 0 # hence, add to trackDb/rat/rn5/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][AC][B_][R0]0[0-9]+(\.1)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 ######################################################################### # chain/net hg19 (DONE - 2012-06-28 - Hiram) # the original alignment to human: cd /hive/data/genomes/hg19/bed/lastzRn5.2012-06-27 cat fb.hg19.chainRn5Link.txt # 917356917 bases of 2897316137 (31.662%) in intersection # running the swap - DONE - 2012-06-28 mkdir /hive/data/genomes/rn5/bed/blastz.hg19.swap cd /hive/data/genomes/rn5/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzRn5.2012-06-27/DEF \ -swap -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 66m53.095s cat fb.rn5.chainHg19Link.txt # 933922552 bases of 2572853723 (36.299%) in intersection ############################################################################## # make refSeq amino acid fasta file for use with mm10 UCSC genes # # downloaded rn5.refGenePepProtNames.faa.gz using table browser, selecting # refGene, download sequence, and clicking the "protein" radio button. # This generates a file with the protein id's, whereas as we want the NM_* # id's for the blastp process to work zcat rn5.refGenePepProtNames.faa.gz | awk '/^>/ {print buf; buf=$1 " "} ! /^>/ {buf=buf $1}' | tr -d '>' | sed 's/\.[0-9] / /' | sort > tmp1 echo "select refGene.name,refLink.protAcc from refGene,refLink where refGene.name=refLink.mrnaAcc and refLink.protAcc != ''" | hgsql rn5 | tail -n +2 | awk '{print $2,$1}' | sort > tmp2 join tmp1 tmp2 | awk '{printf ">%s\n%s\n",$3,$2}' > rn5.refGenePep.faa rm tmp1 tmp2 ######################################################################### # LASTZ Macaca rheMac3 (DONE - 2012-07-10 - Chin) screen -S rheMac3-rn5 mkdir /cluster/data/rn5/bed/lastz.rheMac3.2012-07-10 cd /cluster/data/rn5/bed/lastz.rheMac3.2012-07-10 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # rat vs. macacque BLASTZ_M=254 # TARGET: Rat Rn5 SEQ1_DIR=/cluster/data/rn5/rn5.2bit SEQ1_LEN=/cluster/data/rn5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Macaca Mulatta RheMac3 SEQ2_DIR=/cluster/data/rheMac3/rheMac3.2bit SEQ2_LEN=/cluster/data/rheMac3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=1000 BASE=/cluster/data/rn5/bed/lastz.rheMac3.2012-07-10 '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet `pwd`/DEF > do.log 2>&1 & # real 1526m4.274s cat fb.rn5.chainRheMac3Link.txt # 896256761 bases of 2572853723 (34.835%) in intersection cd /hive/data/genomes/rn5/bed ln -s lastz.rheMac3.2012-07-10 lastz.rheMac3 # then swap mkdir /cluster/data/rheMac3/bed/blastz.rn5.swap cd /cluster/data/rheMac3/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/rn5/bed/lastz.rheMac3.2012-07-10/DEF \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet > swap.log 2>&1 & # real 69m12.752s cat fb.rheMac3.chainRn5Link.txt # 858153767 bases of 2639145830 (32.516%) in intersection cd /hive/data/genomes/rheMac3/bed ln -s blastz.rn5.swap lastz.rn5 ############################################################################## # LIFTOVER TO rn4 (DONE - 2012-11-01 - Hiram) mkdir /hive/data/genomes/rn5/bed/blat.rn4.2012-11-01 cd /hive/data/genomes/rn5/bed/blat.rn4.2012-11-01 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc /hive/data/genomes/rn5/jkStuff/rn5.11.ooc rn5 rn4 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc /hive/data/genomes/rn5/jkStuff/rn5.11.ooc rn5 rn4 > do.log 2>&1 # real 114m49.176s # test the browser on hgwdev to see if it can convert from rn5 to rn4 ############################################################################# # lastz zebrafish danRer7 (DONE - 2012-11-13,14 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S rn5DanRer7 mkdir /hive/data/genomes/rn5/bed/lastzDanRer7.2012-11-13 cd /hive/data/genomes/rn5/bed/lastzDanRer7.2012-11-13 cat << '_EOF_' > DEF # Rat vs. zebrafish BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Rat Rn5 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=15 # QUERY: zebrafish danRer7 SEQ2_DIR=/scratch/data/danRer7/danRer7.2bit SEQ2_LEN=/scratch/data/danRer7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/rn5/bed/lastzDanRer7.2012-11-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1099m42.792s cat fb.rn5.chainDanRer7Link.txt # 66573209 bases of 2572853723 (2.588%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s lastzDanRer7.2012-11-13 lastz.danRer7 # and for the swap mkdir /hive/data/genomes/danRer7/bed/blastz.rn5.swap cd /hive/data/genomes/danRer7/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/lastzDanRer7.2012-11-13/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 15m5.664s cat fb.danRer7.chainRn5Link.txt # 68007020 bases of 1409770109 (4.824%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/danRer7/bed ln -s blastz.rn5.swap lastz.rn5 ######################################################################### # lastz frog xenTro3 (DONE - 2012-11-13,14 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S rn5XenTro3 mkdir /hive/data/genomes/rn5/bed/lastzXenTro3.2012-11-13 cd /hive/data/genomes/rn5/bed/lastzXenTro3.2012-11-13 cat << '_EOF_' > DEF # Rat vs. frog BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Rat Rn5 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=30 # QUERY: frog xenTro3 SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=80 BASE=/hive/data/genomes/rn5/bed/lastzXenTro3.2012-11-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1307m43.094s cat fb.rn5.chainXenTro3Link.txt # 82011038 bases of 2572853723 (3.188%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s lastzXenTro3.2012-11-13 lastz.xenTro3 # and for the swap mkdir /hive/data/genomes/xenTro3/bed/blastz.rn5.swap cd /hive/data/genomes/xenTro3/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/lastzXenTro3.2012-11-13/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 25m43.688s cat fb.xenTro3.chainRn5Link.txt # 87276772 bases of 1358334882 (6.425%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/xenTro3/bed ln -s blastz.rn5.swap lastz.rn5 ######################################################################### # lastz turkey melGal1 (DONE - 2012-11-13,14 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S rn5MelGal1 mkdir /hive/data/genomes/rn5/bed/lastzMelGal1.2012-11-13 cd /hive/data/genomes/rn5/bed/lastzMelGal1.2012-11-13 cat << '_EOF_' > DEF # Rat vs. turkey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Rat Rn5 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=20 # QUERY: turkey melGal1 SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit SEQ2_LEN=/scratch/data/melGal1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/rn5/bed/lastzMelGal1.2012-11-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1225m37.742s cat fb.rn5.chainMelGal1Link.txt # 93392251 bases of 2572853723 (3.630%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s lastzMelGal1.2012-11-13 lastz.melGal1 # and for the swap mkdir /hive/data/genomes/melGal1/bed/blastz.rn5.swap cd /hive/data/genomes/melGal1/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/lastzMelGal1.2012-11-13/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 10m0.139s cat fb.melGal1.chainRn5Link.txt # 74638274 bases of 935922386 (7.975%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/melGal1/bed ln -s blastz.rn5.swap lastz.rn5 ######################################################################### # lastz chicken galGal4 (DONE - 2012-11-29,12-04 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S rn5GalGal4 mkdir /hive/data/genomes/rn5/bed/lastzGalGal4.2012-11-29 cd /hive/data/genomes/rn5/bed/lastzGalGal4.2012-11-29 cat << '_EOF_' > DEF # Rat vs. chicken BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Rat Rn5 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=20 # QUERY: chicken galGal4 SEQ2_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ2_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=60 BASE=/hive/data/genomes/rn5/bed/lastzGalGal4.2012-11-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # Elapsed time: 1483m27s cat fb.rn5.chainGalGal4Link.txt # 97465937 bases of 2572853723 (3.788%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s lastzGalGal4.2012-11-29 lastz.galGal4 # and for the swap mkdir /hive/data/genomes/galGal4/bed/blastz.rn5.swap cd /hive/data/genomes/galGal4/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/lastzGalGal4.2012-11-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 11m49.505s cat fb.galGal4.chainRn5Link.txt # 81184849 bases of 1032854810 (7.860%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/galGal4/bed ln -s blastz.rn5.swap lastz.rn5 ######################################################################### # lastz Opossum monDom5 (WORKING - 2012-11-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S rn5MonDom5 mkdir /hive/data/genomes/rn5/bed/lastzMonDom5.2012-11-29 cd /hive/data/genomes/rn5/bed/lastzMonDom5.2012-11-29 cat << '_EOF_' > DEF # Rat vs. opossum BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_M=254 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Rat Rn5 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=20 # QUERY: Opossum monDom5 SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit SEQ2_LEN=/scratch/data/monDom5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/rn5/bed/lastzMonDom5.2012-11-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # Can't do this when there are only the single small set of chroms time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 3213m53.797s # chaining failed, running last two for chr1 and chr3 ./chain.csh rn5.2bit:chr1: chain/rn5.2bit:chr1:.chain & ./chain.csh rn5.2bit:chr3: chain/rn5.2bit:chr3:.chain # real 269m41.974s time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=chainMerge `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > chainMerge.log 2>&1 & XXX - running - Tue Dec 4 14:05:34 PST 2012 cat fb.rn5.chainMonDom5Link.txt # 254245903 bases of 2652783500 (9.584%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s lastzMonDom5.2012-11-29 lastz.monDom5 # and for the swap mkdir /hive/data/genomes/monDom5/bed/blastz.rn5.swap cd /hive/data/genomes/monDom5/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/lastzMonDom5.2012-11-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 73m49.230s cat fb.monDom5.chainRn5Link.txt # 252291401 bases of 3501660299 (7.205%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/monDom5/bed ln -s blastz.rn5.swap lastz.rn5 ######################################################################### # LASTZ cow bosTau7 (DONE - 2012-11-29,12-03 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S rn5BosTau7 mkdir /hive/data/genomes/rn5/bed/lastzBosTau7.2012-11-29 cd /hive/data/genomes/rn5/bed/lastzBosTau7.2012-11-29 cat << '_EOF_' > DEF # cow vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Rat Rn5 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=20 # QUERY: cow BosTau7 SEQ2_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ2_LEN=/scratch/data/bosTau7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/rn5/bed/lastzBosTau7.2012-11-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # Elapsed time: 2921m25s cat fb.rn5.chainBosTau7Link.txt # 690036664 bases of 2572853723 (26.820%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s lastzBosTau7.2012-11-29 lastz.bosTau7 mkdir /hive/data/genomes/bosTau7/bed/blastz.rn5.swap cd /hive/data/genomes/bosTau7/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/lastzBosTau7.2012-11-29/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 64m20.430s cat fb.bosTau7.chainRn5Link.txt # 687933878 bases of 2804673174 (24.528%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau7/bed ln -s blastz.rn5.swap lastz.rn5 ############################################################################## # LASTZ dog canFam3 (DONE - 2012-11-29,12-04 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S rn5CanFam3 mkdir /hive/data/genomes/rn5/bed/lastzCanFam3.2012-11-29 cd /hive/data/genomes/rn5/bed/lastzCanFam3.2012-11-29 cat << '_EOF_' > DEF # dog vs rat BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Rat Rn5 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=20 # QUERY: dog CanFam3 SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=40 BASE=/hive/data/genomes/rn5/bed/lastzCanFam3.2012-11-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 5963m50.439s # there was a hung-up job that prolonged this time cat fb.rn5.chainCanFam3Link.txt # 768049485 bases of 2572853723 (29.852%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s lastzCanFam3.2012-11-29 lastz.canFam3 mkdir /hive/data/genomes/canFam3/bed/blastz.rn5.swap cd /hive/data/genomes/canFam3/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/lastzCanFam3.2012-11-29/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 67m13s cat fb.canFam3.chainRn5Link.txt # 734454555 bases of 2392715236 (30.695%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/canFam3/bed ln -s blastz.rn5.swap lastz.rn5 ############################################################################## # LASTZ panda ailMel1 (DONE - 2012-11-29,12-04 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S rn5AilMel1 mkdir /hive/data/genomes/rn5/bed/lastzAilMel1.2012-11-29 cd /hive/data/genomes/rn5/bed/lastzAilMel1.2012-11-29 cat << '_EOF_' > DEF # panda vs rat BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Rat Rn5 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: panda AilMel1 SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=350 BASE=/hive/data/genomes/rn5/bed/lastzAilMel1.2012-11-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # Elapsed time: 2389m42s cat fb.rn5.chainAilMel1Link.txt # 818618644 bases of 2572853723 (31.818%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s lastzAilMel1.2012-11-29 lastz.ailMel1 mkdir /hive/data/genomes/ailMel1/bed/blastz.rn5.swap cd /hive/data/genomes/ailMel1/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/lastzAilMel1.2012-11-29/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 72m40.305s cat fb.ailMel1.chainRn5Link.txt # 776437110 bases of 2245312831 (34.580%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/ailMel1/bed ln -s blastz.rn5.swap lastz.rn5 ############################################################################## # LASTZ Chimp PanTro4 (DONE - 2012-11-29,12-04 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S rn5PanTro4 mkdir /hive/data/genomes/rn5/bed/lastzPanTro4.2012-11-29 cd /hive/data/genomes/rn5/bed/lastzPanTro4.2012-11-29 cat << '_EOF_' > DEF # chimp vs rat BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Rat Rn5 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: Chimp PanTro4 SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/rn5/bed/lastzPanTro4.2012-11-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 3194m24.511s cat fb.rn5.chainPanTro4Link.txt # 916343862 bases of 2572853723 (35.616%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s lastzPanTro4.2012-11-29 lastz.panTro4 mkdir /hive/data/genomes/panTro4/bed/blastz.rn5.swap cd /hive/data/genomes/panTro4/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/lastzPanTro4.2012-11-29/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 85m28.505s cat fb.panTro4.chainRn5Link.txt # 901495233 bases of 2902338967 (31.061%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/panTro4/bed ln -s blastz.rn5.swap lastz.rn5 ############################################################################## # LASTZ guinea pig cavPor3 (DONE - 2012-11-29,12-04 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S rn5CavPor3 mkdir /hive/data/genomes/rn5/bed/lastzCavPor3.2012-11-29 cd /hive/data/genomes/rn5/bed/lastzCavPor3.2012-11-29 cat << '_EOF_' > DEF # guinea pig vs rat BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Rat Rn5 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: guinea pig CavPor3 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/rn5/bed/lastzCavPor3.2012-11-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2935m17.649s cat fb.rn5.chainCavPor3Link.txt # 749383689 bases of 2572853723 (29.127%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s lastzCavPor3.2012-11-29 lastz.cavPor3 mkdir /hive/data/genomes/cavPor3/bed/blastz.rn5.swap cd /hive/data/genomes/cavPor3/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/lastzCavPor3.2012-11-29/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 82m22.205s cat fb.cavPor3.chainRn5Link.txt # 750114059 bases of 2663369733 (28.164%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/cavPor3/bed ln -s blastz.rn5.swap lastz.rn5 ######################################################################### # construct lift file Ensembl names to UCSC names (DONE 2013-02-19 Chin) cd /hive/data/genomes/rn5/jkStuff cat ../chrom.sizes | while read L do ucName=`echo "${L}" | awk '{print $1}'` ucSize=`echo "${L}" | awk '{print $2}'` ensName=`echo $L | sed -e 's/^chrM/MT/; s/^chr[0-9A-Za-z]*_//; s/_random//; s/^chr//; s/^\([JA][HA][B-Z0-9]*\)/\1.1/;' | awk '{print $1}'` ensSize=`echo $L | awk '{print $2}'` echo -e "0\t$ensName\t$ensSize\t$ucName\t$ucSize" done > ensToUcsc.lift ##############################################################################