# for emacs: -*- mode: sh; -*- # http://www.ncbi.nlm.nih.gov/genome/82 # http://www.ncbi.nlm.nih.gov/bioproject/12555 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAFC01 # http://www.ncbi.nlm.nih.gov/genome/assembly/313728/ # file template copied from bosTau6.txt # Assembly Info: # DATE: 25-Oct-2011 # ORGANISM: Bos taurus # TAXID: 9913 # ASSEMBLY LONG NAME: Btau_4.6.1 # ASSEMBLY SHORT NAME: Btau_4.6.1 # ASSEMBLY SUBMITTER: Bovine Genome Sequencing Consortium # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000003205.4 # # FTP-RELEASE DATE: 03-Nov-2011 # Bos taurus (NCBI Project ID: 12555, Accession: GCA_000003205.4) # by Bovine Genome Sequencing and Analysis Consortiumn # assembly] sequence: # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Bos_taurus/Btau_4.6.1/ # Project 12555 # url: # http://www.ncbi.nlm.nih.gov/nuccore/AAFC00000000 # chrM: AY526085 # http://www.ncbi.nlm.nih.gov/nuccore/AY526085.1 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAFC00 # 7X WGS coverage ########################################################################## # Download sequence (DONE - 2011-05-02 Chin) mkdir /hive/data/genomes/bosTau7 cd /hive/data/genomes/bosTau7 mkdir genbank cd genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Bos_taurus/Btau_4.6.1/*" # FINISHED --2011-12-16 15:36:00-- # Downloaded: 244 files, 2.1G in 30m 24s (1.17 MB/s) # Read ASSEMBLY_INFO # measure total sequence in this assembly faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa \ Primary_Assembly/unplaced_scaffolds/FASTA/un*.fa.gz # 2981103241 bases (176446405 N's 2804656836 real 2804656836 upper # 0 lower) in 11691 sequences in 32 files # Total size: mean 254991.3 sd 4719241.6 min 152 (gi|346076751|gb|JH125284.1|) # max 161428367 (gi|346683426|gb|CM000177.5|) median 8572 # N count: mean 15092.5 sd 280692.5 # U count: mean 239898.8 sd 4441763.7 # L count: mean 0.0 sd 0.0 # %0.00 masked total, %0.00 masked real mkdir ucscChr # stay at genbank directory # fixup the accession names to become UCSC chrom names export S=Primary_Assembly/assembled_chromosomes cat ${S}/chr2acc | grep -v "^#Chromosome" | cut -f2 | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` echo "${ACC} -> chr${C}" zcat ${S}/AGP/chr${C}.comp.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" | gzip > ucscChr/chr${C}.agp.gz done export S=Primary_Assembly/assembled_chromosomes cut -f2 ${S}/chr2acc | while read ACC cat ${S}/chr2acc | grep -v "^#Chromosome" | cut -f2 | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` echo "${ACC} -> chr${C}" echo ">chr${C}" > ucscChr/chr${C}.fa zcat ${S}/FASTA/chr${C}.fa.gz | grep -v "^>" >> ucscChr/chr${C}.fa gzip ucscChr/chr${C}.fa & done # Check them with faSize faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz # 2673141463 bases (159233025 N's 2513908438 real 2513908438 upper # 0 lower) in 31 sequences in 31 files faSize ucscChr/chr*.fa.gz # 2660906405 bases (20740230 N's 2640166175 real 2640166175 upper # 0 lower) in 30 sequences in 30 files # For unplaced scalfolds, named them as chrUn_xxxxxxxx # where xxxxxx is the original access id as: chrUn_JH121286 # The ".1" at the end (if any) need to be filter out since # MySQL does not allow "." as part of the table name and # will casue problems in genbank task step later export S=Primary_Assembly/unplaced_scaffolds zcat ${S}/AGP/unplaced.scaf.agp.gz | grep "^#" > ucscChr/chrUn.agp # append the gap records zcat ${S}/AGP/unplaced.scaf.agp.gz | grep -v "^#" \ | sed -e "s/^/chrUn_/" -e "s/\.1//" >> ucscChr/chrUn.agp gzip ucscChr/chrUn.agp & zcat ${S}/FASTA/unplaced.scaf.fa.gz \ | sed -e "s#^>.*|gb|#>chrUn_#; s#|.*##" -e "s/\.1//" \ | gzip > ucscChr/chrUn.fa.gz # about 11660 sequences in the unplaced zcat ucscChr/chrUn.fa.gz | grep "^>" | wc # 11660 11660 46640 # Check them with faSize faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 307961778 bases (17213380 N's 290748398 real 290748398 upper # 0 lower) in 11660 sequences in 1 files faSize ucscChr/chrUn.fa.gz # 307961778 bases (17213380 N's 290748398 real 290748398 upper # 0 lower) in 11660 sequences in 1 files # N50 cd /hive/data/genomes/bosTau7/ n50.pl chrom.sizes # reading: chrom.sizes # contig count: 11692, total size: 2981119579, one half size: 1490559789 # cumulative N50 count contig contig size # 1444820354 12 chrX 88654062 # 1490559789 one half size # 1529939826 13 chr12 85119472 cat << '_EOF_' > bosTau7.config.ra # Config parameters for makeGenomeDb.pl: db bosTau7 clade mammal genomeCladePriority 31 scientificName Bos taurus commonName Cow assemblyDate OCt. 2011 assemblyLabel Baylor Btau_4.6.1 assemblyShortLabel Baylor Btau_4.6.1 orderKey 236 mitoAcc AY526085 fastaFiles /hive/data/genomes/bosTau7/genbank/ucscChr/chr*.fa.gz agpFiles /hive/data/genomes/bosTau7/genbank/ucscChr/chr*.agp.gz # qualFiles none dbDbSpeciesDir cow photoCreditURL http://www.genome.gov/10005141 photoCreditName NHGRI Press Photos ncbiGenomeId 82 ncbiAssemblyId 313728 ncbiAssemblyName Baylor Btau_4.6.1 ncbiBioProject 12555 genBankAccessionID GCA_000003205.4 taxId 9913 '_EOF_' ######################################################################### # Initial database build (DONE - 2012-01-03 - Chin) cd /hive/data/genomes/bosTau7 cat << '_EOF_' > bosTau7.config.ra # Config parameters for makeGenomeDb.pl: db bosTau7 clade mammal genomeCladePriority 31 scientificName Bos taurus commonName Cow assemblyDate OCt. 2011 assemblyLabel Baylor Btau_4.6.1 assemblyShortLabel Btau_4.6.1 orderKey 236 mitoAcc AY526085 fastaFiles /hive/data/genomes/bosTau7/genbank/ucscChr/chr*.fa.gz agpFiles /hive/data/genomes/bosTau7/genbank/ucscChr/chr*.agp.gz # qualFiles none dbDbSpeciesDir cow photoCreditURL http://www.genome.gov/10005141 photoCreditName NHGRI Press Photos ncbiGenomeId 82 ncbiAssemblyId 313728 ncbiAssemblyName Baylor Btau_4.6.1 ncbiBioProject 12555 genBankAccessionID GCA_000003205.4 taxId 9913 '_EOF_' # << happy emacs time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev bosTau7.config.ra \ > makeGenomeDb.log 2>&1 & # real 23m34.828s # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/bosTau7.unmasked.2bit /gbdb/bosTau7/bosTau7.2bit # Per instructions in makeGenomeDb.log: # - replace *** use actual url for project etc.. in all html files #Search for '***' notes in each file in and make corrections (sometimes the #files used for a previous assembly might make a better template): # description.html /cluster/data/bosTau7/html/{trackDb.ra,gap.html,gold.html} # #Then copy these files to your ~/kent/src/hg/makeDb/trackDb/cow/bosTau7 # - cd ~/kent/src/hg/makeDb/trackDb # - edit makefile to add bosTau7 to DBS. # - git add cow/bosTau7/*.{ra,html} # - git commit -m "Added bosTau7 to DBS." makefile # - git commit -m "Initial descriptions for bosTau7." cow/bosTau7/*.{ra,html} # - git pull; git push # - Run make update DBS=bosTau7 and make alpha when done. # - (optional) Clean up /cluster/data/bosTau7/TemporaryTrackDbCheckout # re-generated description html using update makeGenomeDb.pl and config.ra cd /hive/data/genomes/bosTau7 makeGenomeDb.pl -forceDescription -continue=trackDb bosTau7.config.ra ######################################################################### # RepeatMasker (DONE 2012-01-04 - Chin) mkdir /hive/data/genomes/bosTau7/bed/repeatMasker cd /hive/data/genomes/bosTau7/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -buildDir=`pwd` \ -workhorse=hgwdev -bigClusterHub=swarm -noSplit bosTau7 > do.log 2>&1 & # real 472m4.476s cat faSize.rmsk.txt # 2981119579 bases (176446405 N's 2804673174 real 1415188620 upper # 1389484554 lower) in 11692 sequences in 1 files # %46.61 masked total, %49.54 masked real ######################################################################### # simpleRepeats (DONE - 2012-01-05 - Chin) mkdir /hive/data/genomes/bosTau7/bed/simpleRepeat cd /hive/data/genomes/bosTau7/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev -bigClusterHub=swarm -smallClusterHub=memk \ bosTau7 > do.log 2>&1 & # real 39m2.413s cat fb.simpleRepeat # 40432561 bases of 2804801325 (1.442%) in intersection # add to the repeatMasker cd /hive/data/genomes/bosTau7 twoBitMask bosTau7.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau7.2bit # safe to ignore warnings about >=13 fields twoBitToFa bosTau7.2bit stdout | faSize stdin > bosTau7.2bit.faSize.txt cat bosTau7.2bit.faSize.txt # 2981119579 bases (176446405 N's 2804673174 real 1414674956 upper # 1389998218 lower) in 11692 sequences in 1 files # %46.63 masked total, %49.56 masked real # double check with featureBits featureBits -countGaps bosTau7 gap # 176318254 bases of 2981119579 (5.914%) in intersection rm /gbdb/bosTau7/bosTau7.2bit ln -s `pwd`/bosTau7.2bit /gbdb/bosTau7/bosTau7.2bit ######################################################################### # Marking *all* gaps - they are not all in the AGP file # (DONE - 2012-01-05 - Chin) mkdir /hive/data/genomes/bosTau7/bed/allGaps cd /hive/data/genomes/bosTau7/bed/allGaps time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../bosTau7.unmasked.2bit > findMotif.txt 2>&1 # real 0m42.321s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits bosTau7 -not gap -bed=notGap.bed # 2804801325 bases of 2804801325 (100.000%) in intersection featureBits bosTau7 allGaps.bed notGap.bed -bed=new.gaps.bed # 128151 bases of 2804801325 (0.005%) in intersection # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" bosTau7 | sort -n | tail -1 # 10242 # use tcsh and ctrl-c to create the here doc cat << '_EOF_' > mkGap.pl #!/usr/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" bosTau7 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits bosTau7 other.bed # 128151 bases of 2804801325 (0.005%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad bosTau7 otherGap other.bed # Loaded 12612 elements of size 8 # adding this many: wc -l bed.tab # 12612 bed.tab # starting with this many hgsql -e "select count(*) from gap;" bosTau7 # 69725 hgsql bosTau7 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" bosTau7 # 82337 # == 69725 + 12612 ########################################################################## ## WINDOWMASKER (DONE 2012-01-05 Chin) mkdir /hive/data/genomes/bosTau7/bed/windowMasker cd /hive/data/genomes/bosTau7/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev bosTau7 > do.log 2>&1 & # real 191m59.940s # Masking statistics twoBitToFa bosTau7.wmsk.2bit stdout | faSize stdin # 2981119579 bases (176446405 N's 2804673174 real 1660551615 upper # 1144121559 lower) in 11692 sequences in 1 files # %38.38 masked total, %40.79 masked real twoBitToFa bosTau7.wmsk.sdust.2bit stdout | faSize stdin # 2981119579 bases (176446405 N's 2804673174 real 1645487915 upper # 1159185259 lower) in 11692 sequences in 1 files # %38.88 masked total, %41.33 masked real hgLoadBed bosTau7 windowmaskerSdust windowmasker.sdust.bed.gz # Reading windowmasker.sdust.bed.gz # Read 13173670 elements of size 3 from windowmasker.sdust.bed.gz # Sorted # Creating table definition for windowmaskerSdust # Saving bed.tab # Loading bosTau7 featureBits -countGaps bosTau7 windowmaskerSdust # 1335621689 bases of 2981119579 (44.803%) in intersection # eliminate the gaps from the masking featureBits bosTau7 -not gap -bed=notGap.bed # 2804673174 bases of 2804673174 (100.000%) in intersection time nice -n +19 featureBits bosTau7 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # real 4m52.720s # 1159185259 bases of 2804673174 (41.330%) in intersection # reload track to get it clean hgLoadBed bosTau7 windowmaskerSdust cleanWMask.bed.gz # Read 13181547 elements of size 4 from cleanWMask.bed.gz # Loading bosTau7 time featureBits -countGaps bosTau7 windowmaskerSdust # real 1m11.488s # 1159185259 bases of 2981119579 (38.884%) in intersection zcat cleanWMask.bed.gz \ | twoBitMask ../../bosTau7.unmasked.2bit stdin \ -type=.bed bosTau7.cleanWMSdust.2bit twoBitToFa bosTau7.cleanWMSdust.2bit stdout | faSize stdin \ > bosTau7.cleanWMSdust.faSize.txt cat bosTau7.cleanWMSdust.faSize.txt # 2981119579 bases (176446405 N's 2804673174 real 1645487915 upper # 1159185259 lower) in 11692 sequences in 1 files # %38.88 masked total, %41.33 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps bosTau7 rmsk windowmaskerSdust # 934428494 bases of 2981119579 (31.345%) in intersection ######################################################################## # Create kluster run files (DONE - 2012-01-05 - Chin) # numerator is bosTau7 gapless bases "real" as reported by: featureBits -noRandom -noHap bosTau7 gap # 159233025 bases of 2513924776 (6.334%) in intersection # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2513924776 / 2861349177 \) \* 1024 ( 2513924776 / 2861349177 ) * 1024 = 899.666140 # ==> use -repMatch=900 according to size scaled down from 1024 for human. # and rounded down to nearest 50 (runup 899.666140 to 900 in this case) cd /hive/data/genomes/bosTau7 blat bosTau7.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/bosTau7.11.ooc \ -repMatch=900 & # Wrote 37068 overused 11-mers to jkStuff/bosTau7.11.ooc mkdir /hive/data/staging/data/bosTau7 cp -p bosTau7.2bit jkStuff/bosTau7.11.ooc /hive/data/staging/data/bosTau7 cp -p chrom.sizes /hive/data/staging/data/bosTau7 # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' bosTau7 \ | sort -k7,7nr # most non-bridges gaps have size = 5000, follow pig's example (most # 100, but use 5000) # decide on a minimum gap for this break, use either 100 or 5000 will # generate 13387 liftOver rows, but if use 6000, only got 11703 rows. # so use 100 here to get more liftOver row. gapToLift -verbose=2 -minGap=100 bosTau7 jkStuff/nonBridged.lft \ -bedFile=jkStuff/nonBridged.bed cp -p jkStuff/nonBridged.lft \ /hive/data/staging/data/bosTau7/bosTau7.nonBridged.lft # ask cluster-admin to copy (evry time if any file changed) # /hive/data/staging/data/bosTau7 directory to # /scratch/data/bosTau7 on cluster nodes # before porceed to genbank step ############################################################################ # set this as defaultDb (DONE - 2012-01-06 - Chin) # and make this the default genome for Cow hgsql -e 'update defaultDb set name="bosTau7" where name="bosTau6";' \ hgcentraltest ######################################################################## # GENBANK AUTO UPDATE (DONE - 2012-01-09 - Chin) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # #organism mrnaCnt estCnt refSeqCnt # Bos taurus 18945 1559562 13153 # edit etc/genbank.conf to add bosTau7 just before susScr2 # bosTau7 (Cow) bosTau7.serverGenome = /hive/data/genomes/bosTau7/bosTau7.2bit bosTau7.clusterGenome = /scratch/data/bosTau7/bosTau7.2bit bosTau7.ooc = /scratch/data/bosTau7/bosTau7.11.ooc bosTau7.lift = no bosTau7.perChromTables = no bosTau7.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} bosTau7.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} bosTau7.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} bosTau7.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} bosTau7.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} bosTau7.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} bosTau7.downloadDir = bosTau7 bosTau7.refseq.mrna.native.load = yes bosTau7.refseq.mrna.xeno.load = yes bosTau7.refseq.mrna.xeno.loadDesc = yes bosTau7.upstreamGeneTbl = refGene git add etc/genbank.conf git commit -m "Added bosTau7" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. Skipped screen # control this business with a screen since it takes a while cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial bosTau7 & # logFile: var/build/logs/2012.01.11-09:14:36.bosTau7.initalign.log # real 1258m43.818s # To re-do, rm the dir first: # /cluster/data/genbank/data/aligned/genbank.187.0/bosTau7 # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad bosTau7 & # logFile: var/dbload/hgwdev/logs/2012.01.12-10:33:25.dbload.log # real 58m49.817s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add bosTau7 to: # etc/align.dbs # etc/hgwdev.dbs git add etc/align.dbs git add etc/hgwdev.dbs git commit -m "Added bosTau7 - Cow" etc/align.dbs etc/hgwdev.dbs git push make etc-update ########################################################################## # BLATSERVERS ENTRY (DONE 2012-01-13 - Chin) # After getting a blat server assigned by the Blat Server Gods, # # bosTau7 (trans) on blatx port 17842 # bosTau7 (untrans) on blatx port 17843 hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("bosTau7", "blatx", "17842", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("bosTau7", "blatx", "17843", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### # reset default position (DONE - 2012-01-13 - Chin) # Reset default position to an area contains RHO and a number of # other genes hgsql -e \ 'update dbDb set defaultPos="chr22:56,647,062-58,290,728" where name="bosTau7";' \ hgcentraltest ############################################################################ # ctgPos2 track - showing clone sequence locations on chromosomes # (DONE 2012-01-17 - Chin) # NOTE - create bosTau7 entry in all.joiner since this is a new species mkdir /hive/data/genomes/bosTau7/bed/ctgPos2 cd /hive/data/genomes/bosTau7/bed/ctgPos2 cat << '_EOF_' > agpToCtgPos2.pl #!/usr/bin/env perl use warnings; use strict; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: zcat your.files.agp.gz | agpToCtgPos2.pl /dev/stdin > ctgPos2.tab\n"; exit 255; } my $agpFile = shift; open (FH, "<$agpFile") or die "can not read $agpFile"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my @a = split('\s+', $line); next if ($a[4] =~ m/^U$/); next if ($a[4] =~ m/^N$/); my $chrSize = $a[2]-$a[1]+1; my $ctgSize = $a[7]-$a[6]+1; die "sizes differ $chrSize != $ctgSize\n$line\n" if ($chrSize != $ctgSize); printf "%s\t%d\t%s\t%d\t%d\t%s\n", $a[5], $chrSize, $a[0], $a[1]-1, $a[2], $a[4]; } close (FH); '_EOF_' # << happy emacs chmod 775 agpToCtgPos2.pl export S=../../genbank/Primary_Assembly/assembled_chromosomes cat ${S}/chr2acc | grep -v "^#Chromosome" | cut -f2 | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` zcat ${S}/AGP/chr${C}.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" done | ./agpToCtgPos2.pl /dev/stdin > ctgPos2.tab hgLoadSqlTab bosTau7 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab # add the track ctgPos2 to src/hg/makeDb/trackDb/cow/bosTau7/trackDb.ra # at src/makeDb/trackdb do "make update DBS=bosTau7" or/and "make alpha" # based on result of # hgsql -N -e "select type from ctgPos2;" bosTau7 | sort | uniq # W # prepare the src/hg/makeDb/trackDb/cow/bosTau7/ctgPos2.html ############################################################################ # running cpgIsland business (DONE -2012-01-17 - Chin) mkdir /hive/data/genomes/bosTau7/bed/cpgIsland cd /hive/data/genomes/bosTau7/bed/cpgIsland cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands # needed to fixup this source, adding include to readseq.c: #include "string.h" # and to cpg_lh.c: #include "unistd.h" #include "stdlib.h" # and fixing a declaration in cpg_lh.c sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c mv tmp.c cpg_lh.c make cd ../../ ln -s hg3rdParty/cpgIslands/cpglh.exe mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../bosTau7.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/bosTau7/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | grep ^chr | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod 775 runOne gensub2 chr.list single template jobList para create jobList para try para check ... etc para time para problems para status # then, kick it with para push # check it with plb # when all are done, para time shows: # Checking finished jobs # Completed: 11692 of 11692 jobs # CPU time in finished jobs: 201s 3.35m 0.06h 0.00d 0.000 y # IO & Wait Time: 31839s 530.65m 8.84h 0.37d 0.001 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest finished job: 17s 0.28m 0.00h 0.00d # Submission to last job: 111s 1.85m 0.03h 0.00d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed ssh hgwdev cd /hive/data/genomes/bosTau7/bed/cpgIsland hgLoadBed bosTau7 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Read 38217 elements of size 10 from cpgIsland.bed # Sorted # Creating table definition for cpgIslandExt # Saving bed.tab # Loading bosTau7 # cleanup rm -fr hardMaskedFa ########################################################################### # ornAna1 Platypus LASTZ/CHAIN/NET (DONE - 2012-01-18 - Chin) screen # use screen to control this job mkdir /cluster/data/bosTau7/bed/blastzOrnAna1.2012-01-18 cd /cluster/data/bosTau7/bed/blastzOrnAna1.2012-01-18 cat << '_EOF_' > DEF # Cow vs. Platypus BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_M=50 # TARGET: Cow bosTau7 SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ1_LEN=/cluster/data/bosTau7/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=300 SEQ1_CHUNK=20000000 SEQ1_LAP=0 # QUERY: Platypus ornAna1 SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/cluster/data/bosTau7/bed/blastzOrnAna1.2012-01-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -verbose=2 -chainLinearGap=loose > do.log 2>&1 & # real 1780m16.857s *** All done ! Elapsed time: 1780m17s *** Make sure that goldenPath/bosTau7/vsOrnAna1/README.txt is accurate. *** Add {chain,net}OrnAna1 tracks to trackDb.ra if necessary. # filter with doRecipBest.pl time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ bosTau7 ornAna1 > rbest.log 2>&1 # real 12m55.429s cat fb.bosTau7.chainOrnAna1Link.txt # 204121295 bases of 2804673174 (7.278%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastzOrnAna1.2012-01-18 lastz.ornAna1 mkdir /cluster/data/ornAna1/bed/blastz.bosTau7.swap cd /cluster/data/ornAna1/bed/blastz.bosTau7.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/bosTau7/bed/blastzOrnAna1.2012-01-18/DEF \ -chainMinScore=5000 -verbose=2 -smallClusterHub=memk \ -swap -syntenicNet \ -chainLinearGap=loose -bigClusterHub=swarm > swap.log 2>&1 & # real 58m6.038s cat fb.ornAna1.chainBosTau7Link.txt # 190094136 bases of 1842236818 (10.319%) in intersection *** All done ! Elapsed time: 58m6s *** Make sure that goldenPath/ornAna1/vsBosTau7/README.txt is accurate. *** Add {chain,net}BosTau7 tracks to trackDb.ra if necessary. cd /cluster/data/ornAna1/bed ln -s blastz.bosTau7.swap lastz.bosTau7 ##################################################################### # susScr2 Pig BLASTZ/CHAIN/NET (DONE - 2012-01-20 - Chin) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/bosTau7/bed/lastzSusScr2.2012-01-20 cd /hive/data/genomes/bosTau7/bed/lastzSusScr2.2012-01-20 cat << '_EOF_' > DEF # Pig vs. Cow BLASTZ_M=50 # TARGET: Cow BosTau7 SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ1_LEN=/scratch/data/bosTau7/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Pig SusScr2 SEQ2_DIR=/scratch/data/susScr2/susScr2.2bit SEQ2_LEN=/scratch/data/susScr2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/bosTau7/bed/lastzSusScr2.2012-01-20 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1624m48.439s cat fb.bosTau7.chainSusScr2Link.txt # 1342940418 bases of 2804673174 (47.882%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s lastzSusScr2.2012-01-20 lastz.susScr2 # then swap mkdir /hive/data/genomes/susScr2/bed/blastz.bosTau7.swap cd /hive/data/genomes/susScr2/bed/blastz.bosTau7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau7/bed/lastzSusScr2.2012-01-20/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 209m45.776s cat fb.susScr2.chainBosTau7Link.txt # 1443560932 bases of 2231298548 (64.696%) in intersection ######################################################################### # SWAP mm9 LASTZ (DONE - 2012-01-22 - Chin) # original alignment done at mm9.txt cd /hive/data/genomes/mm9/bed/lastzBosTau7.2012-01-22 cat fb.mm9.chainBosTau7Link.txt # 695010371 bases of 2620346127 (26.524%) in intersection # Create link cd /hive/data/genomes/mm9/bed ln -s lastzBosTau7.2012-01-22 lastz.bosTau7 # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.mm9.swap cd /hive/data/genomes/bosTau7/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzBosTau7.2012-01-22/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 51m44.505s cat fb.bosTau7.chainMm9Link.txt # 711305079 bases of 2804673174 (25.361%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.mm9.swap lastz.mm9 ############################################################################ # SWAP hg19 LASTZ (DONE 2012-01-23 - Chin) # original alignment cd /hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23 cat fb.hg19.chainBosTau7Link.txt # 1360887008 bases of 2897316137 (46.971%) in intersection # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.hg19.swap cd /hive/data/genomes/bosTau7/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 95m9.611s cat fb.bosTau7.chainHg19Link.txt # 1388551419 bases of 2804673174 (49.508%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.hg19.swap lastz.hg19 ############################################################################ # SWAP canFam2 LASTZ (DONE 2012-01-24 - Chin) # original alignment cd /hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-2 cat fb.canFam2.chainBosTau7Link.txt # 1381869475 bases of 2384996543 (57.940%) in intersection # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.canFam2.swap cd /hive/data/genomes/bosTau7/bed/blastz.canFam2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-24/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 86m11.258s cat fb.bosTau7.chainCanFam2Link.txt # 1458512906 bases of 2804673174 (52.003%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.canFam2.swap lastz.canFam2 ############################################################################ # SWAP rn4 LASTZ (DONE 2012-01-24 - Chin) cd /hive/data/genomes/rn4/bed/blastzBosTau7.2012-01-24 cat fb.rn4.chainBosTau7Link.txt # 656136973 bases of 2571531505 (25.515%) in intersection # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.rn4.swap cd /hive/data/genomes/bosTau7/bed/blastz.rn4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn4/bed/blastzBosTau7.2012-01-24/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 48m51.145 cat fb.bosTau7.chainRn4Link.txt # 668440773 bases of 2804673174 (23.833%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.rn4.swap lastz.rn4 ############################################################################ # SWAP monDom5 LASTZ (DONE 2012-01-26 - Chin) cd /hive/data/genomes/monDom5/bed/lastzBosTau7.2012-01-26 cat fb.monDom5.chainBosTau7Link.txt # 206928725 bases of 3501660299 (5.909%) in intersection # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.monDom5.swap cd /hive/data/genomes/bosTau7/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzBosTau7.2012-01-26/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 59m33.272s cat fb.bosTau7.chainMonDom5Link.txt # 207518037 bases of 2804673174 (7.399%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.monDom5.swap lastz.monDom5 ######################################################################### # GENSCAN GENE PREDICTIONS (DONE 2012-01-31 - Chin) mkdir /hive/data/genomes/bosTau7/bed/genscan cd /hive/data/genomes/bosTau7/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/genscanlinux # create hard masked .fa files mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../bosTau7.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done # Generate a list file, genome.list, of all the hard-masked contig # chunks: find ./hardMaskedFa/ -type f | sed -e 's#^./##' > genome.list wc -l genome.list # 11692 genome.list # Run on small cluster (more mem than big cluster). ssh swarm cd /hive/data/genomes/bosTau7/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Create template file, template, for gensub2. For example (3-line # file): cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/gsBig {check in exists+ $(path1)} {check out exists gtf/$(root1).gtf} -trans={check out exists pep/$(root1).pep} -subopt={check out exists subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << emacs gensub2 genome.list single template jobList para create jobList para try para check ... etc... para time # Completed: 11690 of 11692 jobs # Crashed: 2 jobs # CPU time in finished jobs: 66494s 1108.23m 18.47h 0.77d 0.002 y # IO & Wait Time: 33892s 564.87m 9.41h 0.39d 0.001 y # Average job time: 9s 0.14m 0.00h 0.00d # Longest finished job: 3664s 61.07m 1.02h 0.04d # Submission to last job: 4007s 66.78m 1.11h 0.05d # Two jobs (chr19 and chr26) failed, re-run them: /cluster/bin/x86_64/gsBig hardMaskedFa/chr26.fa gtf/chr26.gtf -trans=pep/chr26.pep -subopt=subopt/chr26.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=200000 /cluster/bin/x86_64/gsBig hardMaskedFa/chr19.fa gtf/chr19.gtf -trans=pep/chr19.pep -subopt=subopt/chr19.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2200000 # this did not work, runs out of file handles ? find ./gtf -type f | xargs -n 256 endsInLf -zeroOk # Concatenate results: cd /hive/data/genomes/bosTau7/bed/genscan find ./gtf -type f | xargs cat > genscan.gtf find ./pep -type f | xargs cat > genscan.pep find ./subopt -type f | xargs cat > genscanSubopt.bed # Load into the database (without -genePredExt because no frame # info): # Don't load the Pep anymore -- redundant since it's from genomic. ssh hgwdev cd /hive/data/genomes/bosTau7/bed/genscan # to construct a local file with the genePred business: gtfToGenePred genscan.gtf genscan.gp # this produces exactly the same thing and loads the table: ldHgGene -gtf bosTau7 genscan genscan.gtf # Reading genscan.gtf # Read 47871 transcripts in 344386 lines in 1 files # 47871 groups 4673 seqs 1 sources 1 feature types # 47871 gene predictions hgLoadBed bosTau7 genscanSubopt genscanSubopt.bed # Read 518687 elements of size 6 from genscanSubopt.bed featureBits bosTau7 genscan # 58056267 bases of 2804673174 (2.070%) in intersection # previously: featureBits bosTau4 genscan # 58224594 bases of 2731830700 (2.131%) in intersection featureBits bosTau3 genscan # 59251085 bases of 2731807384 (2.169%) in intersection ############################################################################# # LIFTOVER TO bosTau4 (DONE - 2012-05-11 - Chin) sh hgwdev cd /hive/data/genomes/bosTau7 ln -s jkStuff/bosTau7.11.ooc 11.ooc time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ bosTau7 bosTau4 > doLiftOverToBosTau4.log 2>&1 & # real 1567m22.050s pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau7/liftOver/ md5sum bosTau7ToBosTau4.over.chain.gz >> md5sum.txt popd ############################################################################# # LIFTOVER TO bosTau5 (DONE - 2012-05-11 - Chin) sh hgwdev cd /hive/data/genomes/bosTau7 ln -s jkStuff/bosTau7.11.ooc 11.ooc time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ bosTau7 bosTau5 > doLiftOverToBosTau5.log 2>&1 & # real 1570m4.968s pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau7/liftOver/ md5sum bosTau7ToBosTau5.over.chain.gz >> md5sum.txt popd ############################################################################# # LIFTOVER TO bosTau6 (DONE - 2012-05-11 - Chin) sh hgwdev cd /hive/data/genomes/bosTau7 ln -s jkStuff/bosTau7.11.ooc 11.ooc time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ bosTau7 bosTau6 > doLiftOverToBosTau6.log 2>&1 & # real 1570m6.579s pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau7/liftOver/ md5sum bosTau7ToBosTau6.over.chain.gz >> md5sum.txt popd ######################################################################### # all.joiner update, downloads and in pushQ - (DONE 2012-05-16 - Chin) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=bosTau7 -all all.joiner mkdir /hive/data/genomes/bosTau7/goldenPath cd /hive/data/genomes/bosTau7/goldenPath makeDownloads.pl bosTau7 > do.log 2>&1 # now ready for pushQ entry mkdir /hive/data/genomes/bosTau7/pushQ cd /hive/data/genomes/bosTau7/pushQ makePushQSql.pl bosTau7 > bosTau7.pushQ.sql 2> stderr.out # check for errors in stderr.out, some are OK, e.g.: # WARNING: hgwdev does not have /gbdb/bosTau7/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/bosTau7/wib/quality.wib # WARNING: hgwdev does not have /gbdb/bosTau7/bbi/quality.bw # WARNING: bosTau7 does not have seq # WARNING: bosTau7 does not have extFile # # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # tableList # copy it to hgwbeta scp -p bosTau7.pushQ.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < bosTau7.pushQ.sql # in that pushQ entry walk through each entry and see if the # sizes will set properly ######################################################################### # SWAP mm10 LASTZ (DONE - 2012-06-19 - Chin) # original alignment done at mm10.txt cd /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21 cat fb.mm10.chainBosTau7Link.txt # 696498363 bases of 2652783500 (26.255%) in intersection # Create link cd /hive/data/genomes/mm10/bed ln -s lastzBosTau7.2012-03-21 lastz.bosTau7 # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.mm10.swap cd /hive/data/genomes/bosTau7/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 77m58.759s cat fb.bosTau7.chainMm10Link.txt # 711923052 bases of 2804673174 (25.383%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau7/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ alpaca vicPac1 (DONE - 2012-06-19 - Chin) # establish a screen to control this job with a name to indicate # what it is screen -S bosTau7VicPac1 mkdir /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19 cd /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # alpaca vs cow BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: cow BosTau7 SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ1_LEN=/scratch/data/bosTau7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: alpaca VicPac1 SEQ2_DIR=/scratch/data/vicPac1/vicPac1.2bit SEQ2_LEN=/scratch/data/vicPac1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=4000 BASE=/hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 3089m45.519s cat fb.bosTau7.chainVicPac1Link.txt # 1259182233 bases of 2804673174 (44.896%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau7/bed ln -s lastzVicPac1.2012-06-19 lastz.vicPac1 # better to have reciprocal best for this one since it is low # coverage: cd /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19 time doRecipBest.pl bosTau7 vicPac1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 110m1.192s # swap mkdir /hive/data/genomes/vicPac1/bed/blastz.bosTau7.swap cd /hive/data/genomes/vicPac1/bed/blastz.bosTau7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 717m22.295s cat fb.vicPac1.chainBosTau7Link.txt # 1309266730 bases of 1922910435 (68.088%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/vicPac1/bed ln -s blastz.bosTau7.swap lastz.bosTau7 ######################################################################### # SWAP canFam3 LASTZ (DONE - 2012-06-23 - Chin) # original alignment done at canFam3.txt cd /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23 cat fb.canFam3.chainBosTau7Link.txt # 1381966556 bases of 2392715236 (57.757%) in intersection # Create link cd /hive/data/genomes/canFam3/bed ln -s lastzBosTau7.2012-06-23 lastz.bosTau7 # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.canFam3.swap cd /hive/data/genomes/bosTau7/bed/blastz.canFam3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 121m44.022s cat fb.bosTau7.chainCanFam3Link.txt # 1456104306 bases of 2804673174 (51.917%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.canFam3.swap lastz.canFam3 ############################################################################## # LASTZ dolphin turTru2 (DONE 2012-06-19 - Chin) # establish a screen to control this job with a name to indicate # what it is screen -S bosTau7TurTru2 mkdir /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19 cd /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # dolphin vs cow BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Cow BosTau7 SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ1_LEN=/scratch/data/bosTau7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: dolphin TurTru2 SEQ2_DIR=/hive/data/genomes/turTru2/turTru2.2bit SEQ2_LEN=/hive/data/genomes/turTru2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=5000 BASE=/hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 7776m13.787s cat fb.bosTau7.chainTurTru2Link.txt # 1696154184 bases of 2804673174 (60.476%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau7/bed ln -s lastzTurTru2.2012-06-19 lastz.turTru2 # better to have reciprocal best for this one since it is low # coverage: cd /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19 time doRecipBest.pl bosTau7 turTru2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # 65m6.795s # then swap mkdir /hive/data/genomes/turTru2/bed/blastz.bosTau7.swap cd /hive/data/genomes/turTru2/bed/blastz.bosTau7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 532m53.448s cat fb.turTru2.chainBosTau7Link.txt # 1625706583 bases of 2332402443 (69.701%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/turTru2/bed ln -s blastz.bosTau7.swap lastz.bosTau7 ############################################################################ # lastz White Rhino cerSim1 (DONE - 2012-10-24 - Hiram) screen -S bosTau7CerSim1 mkdir /hive/data/genomes/bosTau7/bed/lastzCerSim1.2012-10-24 cd /hive/data/genomes/bosTau7/bed/lastzCerSim1.2012-10-24 cat << '_EOF_' > DEF # Cow vs White Rhino BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_M=50 # TARGET: Cow BosTau7 SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ1_LEN=/scratch/data/bosTau7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LIMIT=50 SEQ1_LAP=10000 # QUERY: White Rhino CerSim1 SEQ2_DIR=/hive/data/genomes/cerSim1/cerSim1.2bit SEQ2_LEN=/hive/data/genomes/cerSim1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/bosTau7/bed/lastzCerSim1.2012-10-24 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl \ `pwd`/DEF \ -workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \ -qRepeats=windowmaskerSdust \ -bigClusterHub=swarm -verbose=2 > do.log 2>&1 & # real 7139m16.881s cat fb.bosTau7.chainCerSim1Link.txt # 1628320914 bases of 2804673174 (58.057%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau7/bed ln -s lastzCerSim1.2012-10-24 lastz.cerSim1 # and the swap mkdir /hive/data/genomes/cerSim1/bed/blastz.bosTau7.swap cd /hive/data/genomes/cerSim1/bed/blastz.bosTau7.swap time nice -n +19 doBlastzChainNet.pl \ /hive/data/genomes/bosTau7/bed/lastzCerSim1.2012-10-24/DEF \ -workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \ -swap -qRepeats=windowmaskerSdust \ -bigClusterHub=swarm -verbose=2 > swap.log 2>&1 & # real 120m35.871s cat fb.cerSim1.chainBosTau7Link.txt # 1592093865 bases of 2366858012 (67.266%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/cerSim1/bed ln -s blastz.bosTau7.swap lastz.bosTau7 ##############################################################################