# for emacs: -*- mode: sh; -*- # This file describes browser build for the canFam3 # Canis lupus familiaris genome: Nov. 2011 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAEX00 # 7X coverage via a variety of methods # http://www.ncbi.nlm.nih.gov/genome/85 # http://www.ncbi.nlm.nih.gov/bioproject/13179 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAEX00 # http://www.ncbi.nlm.nih.gov/genome/assembly/317138/ # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAEX03 ############################################################################# # Fetch sequence from genbank (DONE - 2012-01-04 - Hiram) mkdir -p /hive/data/genomes/galGal4/genbank cd /hive/data/genomes/galGal4/genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Canis_lupus/CanFam3.1/*" # Downloaded: 293 files, 1.8G in 24m 51s (1.25 MB/s) # measure sequence to be used here faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz # 2410960148 bases (18261639 N's 2392698509 real 2392698509 upper 0 # lower) in 3267 sequences in 40 files # %0.00 masked total, %0.00 masked real ############################################################################# # process into UCSC naming scheme (DONE - 2012-01-05 - Hiram) mkdir /hive/data/genomes/canFam3/ucsc cd /hive/data/genomes/canFam3/ucsc cat << '_EOF_' > toUcsc.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = ) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x toUcsc.pl cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">unplaced.agp") or die "can not write to unplaced.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">unplaced.fa") or die "can not write to unplaced.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1\|.*//; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl ./toUcsc.pl ./unplaced.pl gzip *.fa *.agp # verify nothing lost in the translation, should be the same as above # except for the name translations faSize *.fa # 2410960148 bases (18261639 N's 2392698509 real 2392698509 upper 0 lower) # in 3267 sequences in 40 files # %0.00 masked total, %0.00 masked real # 2410960148 bases (18261639 N's 2392698509 real 2392698509 upper 0 ############################################################################# # Initial browser build (DONE - 2012-01-05 - Hiram) cd /hive/data/genomes/canFam3 cat << '_EOF_' > canFam3.config.ra # Config parameters for makeGenomeDb.pl: db canFam3 clade vertebrate genomeCladePriority 20 scientificName Canis lupus familiaris commonName Dog assemblyDate Sep. 2011 assemblyLabel Broad CanFam3.1 (GCA_000002285.2) assemblyShortLabel Broad CanFam3.1 orderKey 226 mitoAcc NC_002008 fastaFiles /hive/data/genomes/canFam3/ucsc/*.fa.gz agpFiles /hive/data/genomes/canFam3/ucsc/*.agp.gz dbDbSpeciesDir dog taxId 9615 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp canFam3.config.ra > agp.log 2>&1 # less than two minutes # check the end of agp.log to verify it is OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \ -continue=db canFam3.config.ra > db.log 2>&1 # real 18m17.938s # verify the end of db.log indicates success ############################################################################# # running repeat masker (DONE - 2012-01-05 - Hiram) mkdir /hive/data/genomes/canFam3/bed/repeatMasker cd /hive/data/genomes/canFam3/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk canFam3 > do.log 2>&1 & # real 366m34.586s cat faSize.rmsk.txt # 2410976875 bases (18261639 N's 2392715236 real 1364929603 upper # 1027785633 lower) in 3268 sequences in 1 files # %42.63 masked total, %42.95 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker featureBits -countGaps canFam3 rmsk # 1027963174 bases of 2410976875 (42.637%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-01-05 - Hiram) mkdir /hive/data/genomes/canFam3/bed/simpleRepeat cd /hive/data/genomes/canFam3/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ canFam3 > do.log 2>&1 & # real 98m15.932s cat fb.simpleRepeat # 65887025 bases of 2402709449 (2.742%) in intersection # adding RepeatMasker to get completed masked sequence cd /hive/data/genomes/canFam3 twoBitMask canFam3.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed canFam3.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa canFam3.2bit stdout | faSize stdin > faSize.canFam3.2bit.txt cat faSize.canFam3.2bit.txt # 2410976875 bases (18261639 N's 2392715236 real 1363595724 upper # 1029119512 lower) in 3268 sequences in 1 files # %42.68 masked total, %43.01 masked real # reset the symlink: rm /gbdb/canFam3/canFam3.2bit ln -s `pwd`/canFam3.2bit /gbdb/canFam3/canFam3.2bit # what would it be like to include WindowMasker: zcat bed/windowMasker/cleanWMask.bed.gz \ | twoBitMask -type=.bed -add canFam3.2bit stdin canFam3.wm.trf.rmsk.2bit twoBitToFa canFam3.wm.trf.rmsk.2bit stdout | faSize stdin # 2410976875 bases (18261639 N's 2392715236 real 1119889172 upper # 1272826064 lower) in 3268 sequences in 1 files # %52.79 masked total, %53.20 masked real # WM would add 243 million bases masked: 1272826064-1029119512 = 243706552 ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-01-05 - Hiram) mkdir /hive/data/genomes/canFam3/bed/gap cd /hive/data/genomes/canFam3/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../canFam3.unmasked.2bit > findMotif.txt 2>&1 # real 0m35.232s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits canFam3 -not gap -bed=notGap.bed # 2402709449 bases of 2402709449 (100.000%) in intersection # real 0m18.120s time featureBits canFam3 allGaps.bed notGap.bed -bed=new.gaps.bed # 9994213 bases of 2402709449 (0.416%) in intersection # real 0m32.175s # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" canFam3 | sort -n | tail -1 # 94 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" canFam3 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed wc -l other.bed # 19473 featureBits -countGaps canFam3 other.bed # 9994213 bases of 2410976875 (0.415%) in intersection # verify no overlap: time featureBits -countGaps canFam3 gap other.bed # 0 bases of 2410976875 (0.000%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad canFam3 otherGap other.bed # Loaded 19473 elements of size 8 # real 0m29.669s # verify no errors before adding to the table: gapToLift -minGap=1 canFam3 nonBridged.before.lift \ -bedFile=nonBridged.before.bed > before.gapToLift.txt 2>&1 # check for warnings in before.gapToLift.txt, should be empty: # -rw-rw-r-- 1 0 Jan 6 08:17 before.gapToLift.txt # starting with this many: hgsql -e "select count(*) from gap;" canFam3 # 4403 hgsql canFam3 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" canFam3 # 23876 # == 4403 + 19473 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 canFam3 nonBridged.lift -bedFile=nonBridged.bed # there should be no errors or other output, checked bridged gaps: hgsql -N -e "select bridge from gap;" canFam3 | sort | uniq -c # 80 no # 23796 yes ########################################################################## ## WINDOWMASKER (DONE - 2011-09-08 - Hiram) mkdir /hive/data/genomes/canFam3/bed/windowMasker cd /hive/data/genomes/canFam3/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev canFam3 > do.log 2>&1 & # real 135m51.510s # Masking statistics twoBitToFa canFam3.wmsk.2bit stdout | faSize stdin # 2410976875 bases (18261639 N's 2392715236 real 1597393388 upper # 795321848 lower) in 3268 sequences in 1 files # %32.99 masked total, %33.24 masked real twoBitToFa canFam3.wmsk.sdust.2bit stdout | faSize stdin # 2410976875 bases (18261639 N's 2392715236 real 1582594665 upper # 810120571 lower) in 3268 sequences in 1 files # %33.60 masked total, %33.86 masked real hgLoadBed canFam3 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 13647111 elements of size 3 featureBits -countGaps canFam3 windowmaskerSdust # 828382210 bases of 2410976875 (34.359%) in intersection # eliminate the gaps from the masking featureBits canFam3 -not gap -bed=notGap.bed # 2392715236 bases of 2392715236 (100.000%) in intersection time nice -n +19 featureBits canFam3 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 810120571 bases of 2392715236 (33.858%) in intersection # real 2m1.844s # reload track to get it clean hgLoadBed canFam3 windowmaskerSdust cleanWMask.bed.gz # Loaded 13644959 elements of size 4 time featureBits -countGaps canFam3 windowmaskerSdust # 810120571 bases of 2410976875 (33.601%) in intersection # real 1m19.909s # how much overlap with repeat masker: featureBits -countGaps canFam3 rmsk windowmaskerSdust # 565424408 bases of 2410976875 (23.452%) in intersection # mask sequence with this clean result zcat cleanWMask.bed.gz \ | twoBitMask ../../canFam3.unmasked.2bit stdin \ -type=.bed canFam3.cleanWMSdust.2bit twoBitToFa canFam3.cleanWMSdust.2bit stdout | faSize stdin \ > canFam3.cleanWMSdust.faSize.txt cat canFam3.cleanWMSdust.faSize.txt # 2410976875 bases (18261639 N's 2392715236 real 1582594665 upper # 810120571 lower) in 3268 sequences in 1 files # %33.60 masked total, %33.86 masked real ######################################################################### # NOT - MASK SEQUENCE WITH WM+TRF (NOT - 2012-01-06 - Hiram) # not masking this genome with WM+TRF since RepeatMasker has # masked enough cd /hive/data/genomes/canFam3 twoBitMask -add bed/windowMasker/canFam3.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed canFam3.2bit # safe to ignore the warnings about BED file with >=13 fields twoBitToFa canFam3.2bit stdout | faSize stdin > faSize.canFam3.txt cat faSize.canFam3.txt # create symlink to gbdb ssh hgwdev rm /gbdb/canFam3/canFam3.2bit ln -s `pwd`/canFam3.2bit /gbdb/canFam3/canFam3.2bit # what happens with all masks: twoBitMask -add canFam3.2bit bed/repeatMasker/canFam3.sorted.fa.out \ canFam3.wm.trf.rmsk.2bit twoBitToFa canFam3.wm.trf.rmsk.2bit stdout | faSize stdin ######################################################################## # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/canFam3/bed/cpgIslands cd /hive/data/genomes/canFam3/bed/cpgIslands time doCpgIslands.pl canFam3 > do.log 2>&1 # real 5m8.626s cat fb.canFam3.cpgIslandExt.txt # 39980778 bases of 2392715236 (1.671%) in intersection ######################################################################### # genscan - (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/canFam3/bed/genscan cd /hive/data/genomes/canFam3/bed/genscan time doGenscan.pl canFam3 > do.log 2>&1 # real 56m58.015s cat fb.canFam3.genscan.txt # 56089579 bases of 2392715236 (2.344%) in intersection cat fb.canFam3.genscanSubopt.txt # 49232090 bases of 2392715236 (2.058%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-01 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2392715236 / 2897316137 \) \* 1024 # ( 2392715236 / 2897316137 ) * 1024 = 845.658632 # round up to 900 (canFam2 used 852) cd /hive/data/genomes/canFam3 time blat canFam3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/canFam3.11.ooc -repMatch=900 # Wrote 24788 overused 11-mers to jkStuff/canFam3.11.ooc # real 1m11.629s # there are a few non-bridged gaps, make lift file for genbank hgsql -N -e "select bridge from gap;" canFam3 | sort | uniq -c # 80 no # 23796 yes cd /hive/data/genomes/canFam3/jkStuff gapToLift canFam3 canFam3.nonBridged.lift -bedFile=canFam3.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' canFam3.nonBridged.bed | sort -nr | head # 123773608 chrX 95534 123869142 chrX.01 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-02-08 - Hiram) # examine the file: /cluster/data/genomes/genbank/data/genomes/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Mus musculus 334577 4853663 26288 # to decide which "native" mrna or ests you want to specify in genbank.conf # of course, canFam3 has plenty of everything ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add canFam3 just after canFam2 and commit to GIT # canFam3 (dog) canFam3.serverGenome = /hive/data/genomes/canFam3/canFam3.2bit canFam3.clusterGenome = /hive/data/genomes/canFam3/canFam3.2bit canFam3.ooc = /hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc canFam3.lift = /hive/data/genomes/canFam3/jkStuff/canFam3.nonBridged.lift canFam3.align.unplacedChroms = chrUn canFam3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} canFam3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} canFam3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} canFam3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} canFam3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} canFam3.refseq.mrna.native.load = yes canFam3.refseq.mrna.xeno.load = yes canFam3.genbank.mrna.xeno.load = yes canFam3.downloadDir = canFam3 canFam3.upstreamGeneTbl = ensGene # end of section added to etc/genbank.conf git commit -m "adding definition for canFam3" genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S mm10 # long running job managed in screen cd /cluster/data/genbank # rerun this with canFam3.perChromTables = no in the genbank.conf # 2012-05-24,25 time nice -n +19 ./bin/gbAlignStep -initial canFam3 & # logFile: var/build/logs/2012.05.24-15:31:48.canFam3.initalign.log # second time: about 32 minutes # first time: real 381m29.244s # load data/genomesbase when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad canFam3 & # logFile: var/dbload/hgwdev/logs/2012.05.25-12:52:27.dbload.log # real 125m30.185s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add canFam3 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added canFam3." etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # create pushQ entry (DONE - 2012-05-23 - Hiram) # first make sure all.joiner is up to date and has this new organism # a keys check should be clean: cd ~/kent/src/hg/makeDb/schema joinerCheck -database=canFam3 -keys all.joiner mkdir /hive/data/genomes/canFam3/pushQ cd /hive/data/genomes/canFam3/pushQ makePushQSql.pl canFam3 > canFam3.sql 2> stderr.out # check stderr.out for no significant problems, it is common to see: # WARNING: hgwdev does not have /gbdb/canFam3/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/canFam3/wib/quality.wib # WARNING: hgwdev does not have /gbdb/canFam3/bbi/quality.bw # WARNING: canFam3 does not have seq # WARNING: canFam3 does not have extFile # which are no real problem # if some tables are not identified: # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # # put them in manually after loading the pushQ entry scp -p canFam3.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < canFam3.sql ######################################################################### # LASTZ Cow BosTau7 (DONE - 2012-06-23 - Chin) screen -S bosTau7CanFam3 mkdir /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23 cd /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # dog vs cow # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Dog canFam3 SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Cow bosTau7 SEQ2_DIR=/hive/data/genomes/bosTau7/bosTau7.2bit SEQ2_LEN=/hive/data/genomes/bosTau7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=2000 BASE=/hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1528m6.142s cat fb.canFam3.chainBosTau7Link.txt # 1381966556 bases of 2392715236 (57.757%) in intersection # Create link cd /hive/data/genomes/canFam3/bed ln -s lastzBosTau7.2012-06-23 lastz.bosTau7 # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.canFam3.swap cd /hive/data/genomes/bosTau7/bed/blastz.canFam3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 121m44.022s cat fb.bosTau7.chainCanFam3Link.txt # 1456104306 bases of 2804673174 (51.917%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.canFam3.swap lastz.canFam3 ######################################################################### # LASTZ Cow BosTau6 (DONE - 2012-06-24 - Chin) screen -S bosTau6CanFam3 mkdir /hive/data/genomes/canFam3/bed/lastzBosTau6.2012-06-24 cd /hive/data/genomes/canFam3/bed/lastzBosTau6.2012-06-24 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # dog vs cow # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Dog canFam3 SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Cow bosTau6 SEQ2_DIR=/hive/data/genomes/bosTau6/bosTau6.2bit SEQ2_LEN=/hive/data/genomes/bosTau6/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=2000 BASE=/hive/data/genomes/canFam3/bed/lastzBosTau6.2012-06-24 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1392m1.959s cat fb.canFam3.chainBosTau6Link.txt # 1387159926 bases of 2392715236 (57.974%) in intersection # Create link cd /hive/data/genomes/canFam3/bed ln -s lastzBosTau6.2012-06-24 lastz.bosTau6 # and the swap mkdir /hive/data/genomes/bosTau6/bed/blastz.canFam3.swap cd /hive/data/genomes/bosTau6/bed/blastz.canFam3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam3/bed/lastzBosTau6.2012-06-24/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 119m54.003s cat fb.bosTau6.chainCanFam3Link.txt # 1399687351 bases of 2649682029 (52.825%) in intersection cd /hive/data/genomes/bosTau6/bed ln -s blastz.canFam3.swap lastz.canFam3 ######################################################################### # swap LASTZ from Human hg19 (DONE - 2012-07-04 - Hiram) # the original alignment cd /hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03 cat fb.hg19.chainCanFam3Link.txt # 1502192631 bases of 2897316137 (51.848%) in intersection # and for the swap mkdir /hive/data/genomes/canFam3/bed/blastz.hg19.swap cd /hive/data/genomes/canFam3/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 103m14.464s cat fb.canFam3.chainHg19Link.txt # 1455183825 bases of 2392715236 (60.817%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/canFam3/bed ln -s blastz.hg19.swap lastz.hg19 ############################################################################## # construct liftOver to canFam2 (DONE - 2012-11-27,29 - Hiram) screen -S canFam2 # manage this longish running job in a screen mkdir /hive/data/genomes/canFam3/bed/blat.canFam2.2012-11-27 cd /hive/data/genomes/canFam3/bed/blat.canFam2.2012-11-27 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \ -debug -dbHost=hgwdev -workhorse=hgwdev canFam3 canFam2 # if that is OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \ -dbHost=hgwdev -workhorse=hgwdev canFam3 canFam2 > do.log 2>&1 # real 1752m51.425s # two jobs appear to be a problem, running manually on hgwdev in run.chain: ./job.csh part025.lst/canFam2.2bit:chrUn: chainRaw/part025.lst/canFam2.2bit:chrUn:.chain & ./job.csh part026.lst/canFam2.2bit:chrUn: chainRaw/part026.lst/canFam2.2bit:chrUn:.chain wait # real 54m24.882s # the problem is that those axtChain operations use a lot of memory, # for some reason on the kluster nodes, they just slow down and get # stuck at 2 Gb # continuing: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \ -continue=net -dbHost=hgwdev -workhorse=hgwdev canFam3 canFam2 \ > net.log 2>&1 # missed the output in the net.log # real 117m45.220s # verify this file exists: og -L /gbdb/canFam3/liftOver/canFam3ToCanFam2.over.chain.gz # -rw-rw-r-- 1 583607 May 5 02:16 /gbdb/canFam3/liftOver/canFam3ToCanFam2.over.chain.gz # and try out the conversion on genome-test from canFam3 to canFam2 ############################################################################