# for emacs: -*- mode: sh; -*- # $Id: oviAri1.txt,v 1.6 2010/05/06 16:27:44 chinhli Exp $ # Ovis aries (domestic sheep) -- ISGC Ovis_aries_1.0 (2010-02-01) # http://www.ncbi.nlm.nih.gov/genome/83 # http://www.ncbi.nlm.nih.gov/bioproject/33937 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ACIV01 # file template copied from susScr2.txt # Ovis aries (NCBI Project ID: 10709, Accession: GCA_000005525.1) # by International Sheep Genomics Consortium (ISGC) # assembly] sequence: # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/ # Ovis_aries/Ovis_aries_1.0 ########################################################################## # Download sequence (DONE - 2010-03-22 2010-0414 Chin) mkdir /hive/data/genomes/oviAri1 cd /hive/data/genomes/oviAri1 mkdir genbank cd genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Ovis_aries/Ovis_aries_1.0/*" # FINISHED --09:05:15-- # Downloaded: 151 files, 1.3G in 7m 42s (2.98 MB/s) # Read ASSEMBLY_INFO mkdir ucscChr # stay at genbank directory # fixup the accession names to become UCSC chrom names export S=Primary_Assembly/assembled_chromosomes cut -f2 ${S}/chr2acc | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` echo "${ACC} -> chr${C}" zcat ${S}/AGP/chr${C}.comp.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" | gzip > ucscChr/chr${C}.agp.gz done export S=Primary_Assembly/assembled_chromosomes cut -f2 ${S}/chr2acc | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` echo "${ACC} -> chr${C}" echo ">chr${C}" > ucscChr/chr${C}.fa zcat ${S}/FASTA/chr${C}.fa.gz | grep -v "^>" >> ucscChr/chr${C}.fa gzip ucscChr/chr${C}.fa & done # Check them with faSize faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz # 2784748484 bases (1600136831 N's 1184611653 real 1184611653 upper # 0 lower) in 27 sequences in 27 files faSize ucscChr/chr*.fa.gz # 2784748484 bases (1600136831 N's 1184611653 real 1184611653 upper # 0 lower) in 27 sequences in 27 files # For unplaced scalfolds, named them as chrUn_xxxxxxxx # where xxxxxx is the original access id as: chrUn_GL340781.1 # The ".1" at the end need to be filter out since # MySQL does not allow "." as part of the table name and # will casue problems in genbank task step later export S=Primary_Assembly/unplaced_scaffolds zcat ${S}/AGP/unplaced.scaf.agp.gz | grep "^#" > ucscChr/chrUn.agp # append the gap records zcat ${S}/AGP/unplaced.scaf.agp.gz | grep -v "^#" \ | sed -e "s/^/chrUn_/" -e "s/\.1//" >> ucscChr/chrUn.agp gzip ucscChr/chrUn.agp & zcat ${S}/FASTA/unplaced.scaf.fa.gz \ | sed -e "s#^>.*|gb|#>chrUn_#; s#|.*##" -e "s/\.1//" \ | gzip > ucscChr/chrUn.fa.gz # about 1190 sequences in the unplaced zcat ucscChr/chrUn.fa.gz | grep "^>" | wc # 1190 1190 19040 # Check them with faSize faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 75747883 bases (59104875 N's 16643008 real 16643008 upper 0 lower) # in 1190 sequences in 1 files faSize ucscChr/chrUn.fa.gz # 75747883 bases (59104875 N's 16643008 real 16643008 upper 0 lower) # in 1190 sequences in 1 files ######################################################################### # Initial makeGenomeDb.pl (DONE - 2010-04-14 - Chin) cd /hive/data/genomes/oviAri1 cat << '_EOF_' > oviAri1.config.ra # Config parameters for makeGenomeDb.pl: db oviAri1 clade mammal genomeCladePriority 31 scientificName Ovis aries commonName Sheep assemblyDate Feb. 2010 assemblyLabel ISGC (NCBI project 10709, accession GCA_000005525.1) assemblyShortLabel ISGC Ovis_aries_1.0 orderKey 236 mitoAcc NC_001941 fastaFiles /hive/data/genomes/oviAri1/genbank/ucscChr/chr*.fa.gz agpFiles /hive/data/genomes/oviAri1/genbank/ucscChr/chr*.agp.gz # qualFiles none dbDbSpeciesDir sheep taxId 9940 '_EOF_' # << happy emacs time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev oviAri1.config.ra \ > makeGenomeDb.log 2>&1 & # real real 12m42.419s # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/oviAri1.unmasked.2bit /gbdb/oviAri1/oviAri1.2bit # make sure /gbdb/oviAri1/oviAri1.2bit is pointing to the masked 2bit # file: rm /gbdb/oviAri1/oviAri1.2bit ln -s `pwd`/oviAri1.rmsk.2bit /gbdb/oviAri1/oviAri1.2bit # Per instructions in makeGenomeDb.log: # - cvs add sheep/oviAri1 # - cvs add sheep/oviAri1/*.{ra,html} # - cvs ci -m "Added oviAri1 to DBS." makefile # - cvs ci -m "Initial descriptions for oviAri1." sheep/oviAri1 # - (if necessary) cvs ci sheep # - Run make update DBS=oviAri1 and make alpha when done. # - (optional) Clean up /cluster/data/oviAri1/TemporaryTrackDbCheckout # - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there. # browser should function now # and checkin the *.ra and *.html files. in # /cluster/home/chinhli/kent/src/hg/makeDb/trackDb/sheep/oviAri1 ######################################################################### # RepeatMasker (DONE - 2010-04-14 - Chin) mkdir /hive/data/genomes/oviAri1/bed/repeatMasker cd /hive/data/genomes/oviAri1/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -buildDir=`pwd` \ -workhorse=hgwdev -bigClusterHub=swarm -noSplit oviAri1 > do.log 2>&1 & # real 178m52.467s cat faSize.rmsk.txt # 2860512983 bases (1659241706 N's 1201271277 real 954826276 upper # 246445001 lower) in 1218 sequences in 1 files # %8.62 masked total, %20.52 masked real ######################################################################### # simpleRepeats ( DONE - 2010-04-14 - Chin) mkdir /hive/data/genomes/oviAri1/bed/simpleRepeat cd /hive/data/genomes/oviAri1/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \ -bigClusterHub=pk -smallClusterHub=pk oviAri1 > do.log 2>&1 & # real 3m23.411s cat fb.simpleRepeat # 4278474 bases of 1201962925 (0.356%) in intersection # add to the repeatMasker cd /hive/data/genomes/oviAri1 twoBitMask oviAri1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed oviAri1.2bit # safe to ignore warnings about >=13 fields twoBitToFa oviAri1.2bit stdout | faSize stdin > oviAri1.2bit.faSize.txt cat oviAri1.2bit.faSize.txt # 2860512983 bases (1659241706 N's 1201271277 real 954641922 upper # 246629355 lower) in 1218 sequences in 1 files # %8.62 masked total, %20.53 masked real ######################################################################### # Marking *all* gaps - they are not all in the AGP file # (DONE - 2010-04-14 - Chin) mkdir /hive/data/genomes/oviAri1/bed/allGaps cd /hive/data/genomes/oviAri1/bed/allGaps time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../oviAri1.unmasked.2bit > findMotif.txt 2>&1 # real 1m40.366s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits oviAri1 -not gap -bed=notGap.bed # 1201962925 bases of 1201962925 (100.000%) in intersection featureBits oviAri1 allGaps.bed notGap.bed -bed=new.gaps.bed # 691648 bases of 1201962925 (0.058%) in intersection # 0 bases of 1201271277 (0.000%) in intersection zero????? # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" oviAri1 | sort -n | tail -1 # 484408 # use tcsh and ctrl-c to create the here doc cat << '_EOF_' > mkGap.pl #!/usr/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" oviAri1 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits oviAri1 other.bed # 691648 bases of 1201962925 (0.058%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad oviAri1 otherGap other.bed # Loaded 475536 elements of size 8 # adding this many: wc -l bed.tab # 475536 # starting with this many hgsql -e "select count(*) from gap;" oviAri1 # 2350123 hgsql oviAri1 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" oviAri1 # 2825659 # == 2350123 + 475536 ######################################################################## # Create kluster run files (DONE - 2010-04-15 - Chin) # numerator is oviAri1 gapless bases "real" as reported by: featureBits -noRandom -noHap oviAri1 gap # 1600136831 bases of 1184628269 (135.075%) in intersection # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 1184628269 / 2861349177 \) \* 1024 # ( 1184628269 / 2861349177 ) * 1024 = 423.946632 # ==> use -repMatch=400 according to size scaled down from 1024 for human. # and rounded down to nearest 50 cd /hive/data/genomes/oviAri1 blat oviAri1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/oviAri1.11.ooc \ -repMatch=400 & # Wrote 19704 overused 11-mers to jkStuff/oviAri1.11.ooc mkdir /hive/data/staging/data/oviAri1 cp -p oviAri1.2bit jkStuff/oviAri1.11.ooc /hive/data/staging/data/oviAri1 cp -p chrom.sizes /hive/data/staging/data/oviAri1 # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' oviAri1 \ | sort -k7,7nr # most gaps have size > 100,000 # decide on a minimum gap for this break gapToLift -verbose=2 -minGap=20000 oviAri1 jkStuff/nonBridged.lft \ -bedFile=jkStuff/nonBridged.bed cp -p jkStuff/nonBridged.lft \ /hive/data/staging/data/oviAri1/oviAri1.nonBridged.lft # ask cluster-admin to copy (evry time if any file chsnged) # /hive/data/staging/data/oviAri1 directory to cluster nodes # /scratch/data/oviAri1 ######################################################################## # GENBANK AUTO UPDATE (DONE - 2010-04-15 - Chin) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add oviAri1 just before susScr2 # oviAri1 (Sheep) oviAri1.serverGenome = /hive/data/genomes/oviAri1/oviAri1.2bit oviAri1.clusterGenome = /scratch/data/oviAri1/oviAri1.2bit oviAri1.ooc = /scratch/data/oviAri1/oviAri1.11.ooc oviAri1.lift = no oviAri1.perChromTables = no oviAri1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} oviAri1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} oviAri1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} oviAri1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} oviAri1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} oviAri1.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} oviAri1.downloadDir = oviAri1 oviAri1.refseq.mrna.native.load = yes oviAri1.refseq.mrna.xeno.load = yes oviAri1.refseq.mrna.xeno.loadDesc = yes cvs ci -m "Added oviAri1" etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. With these two lines: # static char *oviAriNames[] = {"Ovis aries", NULL}; # ... later ... # {"oviAri", oviAriNames}, # gbGenome.c is in # /cluster/home/chinhli/kent/src/hg/makeDb/genbank/src/lib # make and checkin cvs ci -m "adding oviAri1 Sheep" src/lib/gbGenome.c make install-server ssh genbank screen # control this business with a screen since it takes a while cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial oviAri1 & # logFile: var/build/logs/2010.04.16-10:04:13.oviAri1.initalign.log # real 365m55.393s # To re-do, rm the dir first: # /cluster/data/genbank/data/aligned/genbank.176.0/oviAri1 # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad oviAri1 & # logFile: var/dbload/hgwdev/logs/2011.02.03-13:09:50.dbload.log # real 31m57.999s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add oviAri1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added oviAri1 - Sheep" etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### # reset position to RHO location as found from blat of hg19 RHO gene # (DONE - 2010-04-15 - Chin) hgsql -e \ 'update dbDb set defaultPos="chr19:53,315,047-53,326,013" where name="oviAri1";' \ hgcentraltest ############################################################################ # ctgPos2 track - showing clone sequence locations on chromosomes # (DONE 2011-02-03 - Chin) # NOTE - create oviAri1 entry in all.joiner since this is a new species mkdir /hive/data/genomes/oviAri1/bed/ctgPos2 cd /hive/data/genomes/oviAri1/bed/ctgPos2 cat << '_EOF_' > agpToCtgPos2.pl #!/usr/bin/env perl use warnings; use strict; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: zcat your.files.agp.gz | agpToCtgPos2.pl /dev/stdin > ctgPos2.tab\n"; exit 255; } my $agpFile = shift; open (FH, "<$agpFile") or die "can not read $agpFile"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my @a = split('\s+', $line); next if ($a[4] =~ m/^N$/); my $chrSize = $a[2]-$a[1]+1; my $ctgSize = $a[7]-$a[6]+1; die "sizes differ $chrSize != $ctgSize\n$line\n" if ($chrSize != $ctgSize); printf "%s\t%d\t%s\t%d\t%d\t%s\n", $a[5], $chrSize, $a[0], $a[1]-1, $a[2], $a[4]; } close (FH); '_EOF_' # << happy emacs chmod 775 agpToCtgPos2.pl export S=../../genbank/Primary_Assembly/assembled_chromosomes cut -f2 ${S}/chr2acc | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` zcat ${S}/AGP/chr${C}.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" done | ./agpToCtgPos2.pl /dev/stdin > ctgPos2.tab hgLoadSqlTab oviAri1 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab # add the track ctgPos2 to src/hg/makeDb/trackDb/sheep/trackDb.ra # at src/makeDb/trackdb do "make update DBS=oviAri1" or/and "make alpha" # based on result of # hgsql -N -e "select type from ctgPos2;" oviAri1 | sort | uniq # W # prepare the src/hg/makeDb/trackDb/sheep/ctgPos2.html ############################################################################ # oviAri1 Sheep BLASTZ/CHAIN/NET (DONE 2011-02-04 - Chin) # request to copy /hive/data/staging.oviAri1 over to /scratch/data/oviAri1 screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/bosTau4/bed/lastzOviAri1.2010-04-16 cd /hive/data/genomes/bosTau4/bed/lastzOviAri1.2010-04-16 cat << '_EOF_' > DEF # Cow vs. Sheep BLASTZ_M=50 # TARGET: Cow BosTau4 SEQ1_DIR=/scratch/data/bosTau4/bosTau4.2bit SEQ1_LEN=/scratch/data/bosTau4/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: Sheep OviAri1 SEQ2_DIR=/scratch/data/oviAri1/oviAri1.2bit SEQ2_LEN=/scratch/data/oviAri1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/bosTau4/bed/lastzOviAri1.2010-04-16 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1651m36.829s # failed during the netChainSubset | chainStitchId out of memory # finish that manually with ulimits to allow more memory on hgwdev: cd /hive/data/genomes/bosTau4/bed/lastzOviAri1.2010-04-16/axtChain export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG netChainSubset -verbose=0 noClass.net bosTau4.oviAri1.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > bosTau4.oviAri1.over.chain.gz # and, finish the rest of netChains.csh manually, the netToAxt step # and the axtToMaf step, log is in axtChain/finiChains.log # about 4 hours # after done with netChains - continuing with load: cd /hive/data/genomes/bosTau4/bed/lastzOviAri1.2010-04-16 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=load -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > load.log 2>&1 & # this didn't work either due to memory limits with hgLoadChain # add the following to loadUp.csh # limit at 160 Gb limit datasize 163840m limit vmemoryuse 163840m # and finish it manually ./loadUp.csh > loadUp.log 2>&1 & ulimit -d 163840000 ulimit -v 163840000 cat fb.bosTau4.chainOviAri1Link.txt # 1319167970 bases of 2731830700 (48.289%) in intersection # then continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=download -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > download.part3.log 2>&1 & # failed again with # needLargeMem: Out of memory - request size 1073741824 bytes, errno: 12 # add the following to netSynteny.csh # limit at 160 Gb limit datasize 163840m limit vmemoryuse 163840m # and finish it manually tcsh cd /hive/data/genomes/bosTau4/bed/lastzOviAri1.2010-04-16/axtChain ./netSynteny.csh > & netSynteny_tcsh.log # and the swap (working 2011-02-04) mkdir /hive/data/genomes/oviAri1/bed/blastz.bosTau4.swap cd /hive/data/genomes/oviAri1/bed/blastz.bosTau4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau4/bed/lastzOviAri1.2010-04-16/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 309m19.204s cat fb.oviAri1.chainBosTau4Link.txt # 1164717810 bases of 1201271277 (96.957%) in intersection ######################################################################### # SWAP mm9 lastz (DONE - 2010-04-12 - Chin) # original alignment cd /hive/data/genomes/mm9/bed/lastzOviAri1.2010-04-16 cat fb.mm9.chainOviAri1Link.txt # 406407377 bases of 2620346127 (15.510%) in intersection # and the swap mkdir /hive/data/genomes/oviAri1/bed/blastz.mm9.swap cd /hive/data/genomes/oviAri1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzOviAri1.2010-04-16/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 35m25.217s cat fb.oviAri1.chainMm9Link.txt # 383753361 bases of 1201271277 (31.946%) in intersection ############################################################################ # SWAP hg19 lastz (DONE 2010-04-12 - Chin) # original alignment cd /hive/data/genomes/hg19/bed/lastzOviAri1.2010-04-16 cat fb.hg19.chainOviAri1Link.txt # 878545517 bases of 2897316137 (30.323%) in intersection # and the swap mkdir /hive/data/genomes/oviAri1/bed/blastz.hg19.swap cd /hive/data/genomes/oviAri1/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzOviAri1.2010-04-16/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 72m47.780s cat fb.oviAri1.chainHg19Link.txt # 824310420 bases of 1201271277 (68.620%) in intersection ######################################################################### # SWAP monDom5 lastz (working 2010-04-12 Chin) # original alignment cd /hive/data/genomes/monDom5/bed/lastzOviAri1.2010-04-16 cat fb.monDom5.chainOviAri1Link.txt # 133534458 bases of 3501660299 (3.813%) in intersection cd /hive/data/genomes/monDom5/bed/ ln -s lastzFelCatV17e.2010-03-22 lastz.oviAri1 # and the swap mkdir /hive/data/genomes/oviAri1/bed/blastz.monDom5.swap cd /hive/data/genomes/oviAri1/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzOviAri1.2010-04-16/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memn -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 39m13.430s cat fb.oviAri1.chainMonDom5Link.txt # 117493519 bases of 1201271277 (9.781%) in intersection ######################################################################### # SWAP equCap2 lastz (DONE 2010-04-12 Chin) # original alignment cd /hive/data/genomes/equCab2/bed/lastzOviAri1.2010-04-16 cat fb.equCab2.chainOviAri1Link.txt # 1012763540 bases of 2428790173 (41.698%) in intersection # and the swap mkdir /hive/data/genomes/oviAri1/bed/blastz.equCab2.swap cd /hive/data/genomes/oviAri1/bed/blastz.equCab2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/equCab2/bed/lastzOviAri1.2010-04-16/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 103m0.724s cat fb.oviAri1.chainEquCab2Link.txt # 940763026 bases of 1201271277 (78.314%) in intersection ############################################################################ # running cpgIsland business (DONE -2010-04-29 - Chin) mkdir /hive/data/genomes/oviAri1/bed/cpgIsland cd /hive/data/genomes/oviAri1/bed/cpgIsland cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands # needed to fixup this source, adding include to readseq.c: #include "string.h" # and to cpg_lh.c: #include "unistd.h" #include "stdlib.h" # and fixing a declaration in cpg_lh.c sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c mv tmp.c cpg_lh.c make cd ../../ ln -s hg3rdParty/cpgIslands/cpglh.exe mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../oviAri1.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/oviAri1/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | grep ^chr | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod 775 runOne gensub2 chr.list single template jobList para create jobList para try para check ... etc para time para problems para status # then, kick it with para push # check it with plb # when all are done, para time shows: # para time # 1218 jobs in batch # 319 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # Completed: 1218 of 1218 jobs # CPU time in finished jobs: 201s 3.36m 0.06h 0.00d 0.000 y # IO & Wait Time: 3099s 51.64m 0.86h 0.04d 0.000 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest finished job: 25s 0.42m 0.01h 0.00d # Submission to last job: 90s 1.50m 0.03h 0.00d # Estimated complete: 0s 0.00m 0.00h 0.00d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed ssh hgwdev cd /hive/data/genomes/oviAri1/bed/cpgIsland hgLoadBed oviAri1 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Loaded 25384 elements of size 10 # Sorted # Creating table definition for cpgIslandExt # Saving bed.tab # Loading oviAri1 # cleanup rm -fr hardMaskedFa ######################################################################### # all.joiner update, downloads and in pushQ - (DONE 2011-02-07 - Chin) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=oviAri1 -all all.joiner mkdir /hive/data/genomes/oviAri1/goldenPath cd /hive/data/genomes/oviAri1/goldenPath makeDownloads.pl oviAri1 > do.log 2>&1 # now ready for pushQ entry mkdir /hive/data/genomes/oviAri1/pushQ cd /hive/data/genomes/oviAri1/pushQ makePushQSql.pl oviAri1 > oviAri1.pushQ.sql 2> stderr.out # check for errors in stderr.out, some are OK, e.g.: # WARNING: oviAri1 does not have seq # WARNING: oviAri1 does not have extFile # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # bosTau4ChainPileUp # copy it to hgwbeta scp -p oviAri1.pushQ.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < oviAri1.pushQ.sql # in that pushQ entry walk through each entry and see if the # sizes will set properly ############################################################################