# for emacs: -*- mode: sh; -*- # $Id: susScr2.txt,v 1.9 2010/04/26 15:28:21 markd Exp $ # Sus scrofa - SGSC Sscrofa9.2 NCBI project 10718, CM000812 # ftp://ftp.ncbi.nlm.nih.gov:genbank/genomes/Eukaryotes/vertebrates_mammals/Sus_scrofa/Sscrofa9.2/ # http://www.ncbi.nlm.nih.gov/bioproject/13421 # http://www.ncbi.nlm.nih.gov/genome/84 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AEMK01 ########################################################################## # Download sequence (DONE - 2010-03-25 - Hiram) mkdir /hive/data/genomes/susScr2 cd /hive/data/genomes/susScr2 mkdir genbank cd genbank mkdir Sscrofa9.2 cd Sscrofa9.2 wget --timestamping -r --cut-dirs=6 --level=0 -nH -x --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Sus_scrofa/Sscrofa9.2/*" cd .. mkdir ucscChr # stay at genbank directory # fixup the accession names to become UCSC chrom names export S=Sscrofa9.2/Primary_Assembly/assembled_chromosomes cut -f2 ${S}/chr2acc | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` echo "${ACC} -> chr${C}" zcat ${S}/AGP/chr${C}.comp.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" | gzip > ucscChr/chr${C}.agp.gz done export S=Sscrofa9.2/Primary_Assembly/assembled_chromosomes cut -f2 ${S}/chr2acc | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` echo "${ACC} -> chr${C}" echo ">chr${C}" > ucscChr/chr${C}.fa zcat ${S}/FASTA/chr${C}.fa.gz | grep -v "^>" >> ucscChr/chr${C}.fa gzip ucscChr/chr${C}.fa & done # Check them with faSize faSize Sscrofa9.2/Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz # 2262484801 bases (31203023 N's 2231281778 real 2231281778 upper # 0 lower) in 19 sequences in 19 files faSize ucscChr/chr*.fa.gz # 2262484801 bases (31203023 N's 2231281778 real 2231281778 upper # 0 lower) in 19 sequences in 19 files ######################################################################### # Initial makeGenomeDb.pl (DONE - 2010-03-25 - Hiram) cd /hive/data/genomes/susScr2 cat << '_EOF_' > susScr2.config.ra # Config parameters for makeGenomeDb.pl: db susScr2 clade mammal genomeCladePriority 35 scientificName Sus scrofa commonName Pig assemblyDate Nov. 2009 assemblyLabel SGSC Sscrofa9.2 (NCBI project 10718, GCA_000003025.2) assemblyShortLabel SGSC Sscrofa9.2 orderKey 234 mitoAcc NC_012095 fastaFiles /hive/data/genomes/susScr2/genbank/ucscChr/chr*.fa.gz agpFiles /hive/data/genomes/susScr2/genbank/ucscChr/chr*.agp.gz # qualFiles none dbDbSpeciesDir pig taxId 9823 '_EOF_' # << happy emacs time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev susScr2.config.ra \ > makeGenomeDb.log 2>&1 # real 9m0.673s # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/susScr2.unmasked.2bit /gbdb/susScr2/susScr2.2bit # browser should function now ######################################################################### # RepeatMasker (DONE - 2010-03-25 - Hiram) mkdir /hive/data/genomes/susScr2/bed/repeatMasker cd /hive/data/genomes/susScr2/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -buildDir=`pwd` \ -workhorse=hgwdev -bigClusterHub=swarm -noSplit susScr2 > do.log 2>&1 # about 7 hours cat faSize.rmsk.txt # 2262501571 bases (31203023 N's 2231298548 real 1286207981 upper # 945090567 lower) in 20 sequences in 1 files # %41.77 masked total, %42.36 masked real ######################################################################### # simpleRepeats (DONE - 2010-03-25 - Hiram) mkdir /hive/data/genomes/susScr2/bed/simpleRepeat cd /hive/data/genomes/susScr2/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \ -bigClusterHub=pk -smallClusterHub=pk susScr2 > do.log 2>&1 # about 14 minutes cat fb.simpleRepeat # 26577190 bases of 2231298548 (1.191%) in intersection # add to the repeatMasker cd /hive/data/genomes/susScr2 twoBitMask susScr2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed susScr2.2bit # safe to ignore warnings about >=13 fields twoBitToFa susScr2.2bit stdout | faSize stdin > susScr2.2bit.faSize.txt cat susScr2.2bit.faSize.txt # 2262501571 bases (31203023 N's 2231298548 real 1285046974 upper # 946251574 lower) in 20 sequences in 1 files # %41.82 masked total, %42.41 masked real ######################################################################### # Marking *all* gaps - they are not all in the AGP file # (DONE - 2010-03-25 - Hiram) mkdir /hive/data/genomes/susScr2/bed/allGaps cd /hive/data/genomes/susScr2/bed/allGaps time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../susScr2.unmasked.2bit > findMotif.txt 2>&1 # real 1m12.153s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits susScr2 -not gap -bed=notGap.bed # 2231463081 bases of 2231463081 (100.000%) in intersection featureBits susScr2 allGaps.bed notGap.bed -bed=new.gaps.bed # 164533 bases of 2231463081 (0.007%) in intersection # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" susScr2 | sort -n | tail -1 # 27295 cat << '_EOF_' > mkGap.pl #!/usr/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" susScr2 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits susScr2 other.bed # 164533 bases of 2231463081 (0.007%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad susScr2 otherGap other.bed # Loaded 96549 # adding this many: wc -l bed.tab # 96549 # starting with this many hgsql -e "select count(*) from gap;" susScr2 # 100179 hgsql susScr2 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" susScr2 # 196728 # == 100179 + 96549 ######################################################################## # Create kluster run files (DONE - 2010-03-26 - Hiram) # numerator is susScr2 gapless bases "real" as reported by: # featureBits -noRandom -noHap susScr2 gap # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 1024 is threshold used for human -repMatch: calc \( 2231298548 / 2861349177 \) \* 1024 # ( 2231298548 / 2861349177 ) * 1024 = 798.521806 # ==> use -repMatch=750 according to size scaled down from 1024 for human. # and rounded down to nearest 50 cd /hive/data/genomes/susScr2 blat susScr2.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/susScr2.11.ooc \ -repMatch=750 # Wrote 31605 overused 11-mers to jkStuff/susScr2.11.ooc mkdir /hive/data/staging/data/susScr2 cp -p susScr2.2bit jkStuff/susScr2.11.ooc /hive/data/staging/data/susScr2 cp -p chrom.sizes /hive/data/staging/data/susScr2 # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' susScr2 \ | sort -k7,7nr # decide on a minimum gap for this break gapToLift -verbose=2 -minGap=5000 susScr2 jkStuff/nonBridged.lft \ -bedFile=jkStuff/nonBridged.bed cp -p jkStuff/nonBridged.lft \ /hive/data/staging/data/susScr2/susScr2.nonBridged.lft ######################################################################## # GENBANK AUTO UPDATE (DONE - 2010-03-26 - Hiram) ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add susScr2 just before susScr1 # susScr2 (Pig) susScr2.serverGenome = /hive/data/genomes/susScr2/susScr2.2bit susScr2.clusterGenome = /scratch/data/susScr2/susScr2.2bit susScr2.ooc = /scratch/data/susScr2/susScr2.11.ooc susScr2.lift = /scratch/data/susScr2/susScr2.nonBridged.lft susScr2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} susScr2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} susScr2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} susScr2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} susScr2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} susScr2.downloadDir = susScr2 susScr2.genbank.mrna.xeno.loadDesc = yes susScr2.refseq.mrna.native.load = no cvs ci -m "Added susScr2" etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial susScr2 & # logFile: var/build/logs/2010.03.26-10:25:04.susScr2.initalign.log # real 304m38.588s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad susScr2 # logFile: var/dbload/hgwdev/logs/2010.03.26-15:38:17.dbload.log # real 68m # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add susScr2 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added susScr2 - Pig" etc/align.dbs etc/hgwdev.dbs make etc-update # DONE 2010-03-31 ######################################################################### # reset position to RHO location as found from blat of hg19 RHO gene # (DONE - 2010-03-31 - Hiram) hgsql -e \ 'update dbDb set defaultPos="chr13:57394166-57402412" where name="susScr2";' \ hgcentraltest # and make this the default genome for Pig hgsql -e 'update defaultDb set name="susScr2" where name="susScr1";' \ hgcentraltest # Zoomed position in on RR a little, to match what is currently on hgwdev # and hgwbeta. # (DONE - 2010-12-02 - Brooke) hgsql -h genome-centdb -e \ 'update dbDb set defaultPos="chr13:57394416-57400490" where name="susScr2"' \ hgcentral ############################################################################ # ctgPos2 track - showing clone sequence locations on chromosomes # (DONE - 2010-03-26 - Hiram) mkdir /hive/data/genomes/susScr2/bed/ctgPos2 cd /hive/data/genomes/susScr2/bed/ctgPos2 cat << '_EOF_' > agpToCtgPos2.pl #!/usr/bin/env perl use warnings; use strict; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: zcat your.files.agp.gz | agpToCtgPos2.pl /dev/stdin > ctgPos2.tab\n"; exit 255; } my $agpFile = shift; open (FH, "<$agpFile") or die "can not read $agpFile"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my @a = split('\s+', $line); next if ($a[4] =~ m/^N$/); my $chrSize = $a[2]-$a[1]+1; my $ctgSize = $a[7]-$a[6]+1; die "sizes differ $chrSize != $ctgSize\n$line\n" if ($chrSize != $ctgSize); printf "%s\t%d\t%s\t%d\t%d\t%s\n", $a[5], $chrSize, $a[0], $a[1]-1, $a[2], $a[4]; } close (FH); '_EOF_' # << happy emacs export S=../../genbank/Sscrofa9.2/Primary_Assembly/assembled_chromosomes cut -f2 ${S}/chr2acc | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` zcat ${S}/AGP/chr${C}.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" done | ./agpToCtgPos2.pl /dev/stdin > ctgPos2.tab hgLoadSqlTab susScr2 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab ############################################################################ # susScr2 Pig BLASTZ/CHAIN/NET (DONE - 2010-03-27 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27 cd /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27 cat << '_EOF_' > DEF # Cow vs. Pig BLASTZ_M=50 # TARGET: Pig SusScr2 SEQ1_DIR=/scratch/data/susScr2/susScr2.2bit SEQ1_LEN=/scratch/data/susScr2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Cow BosTau4 SEQ2_DIR=/scratch/data/bosTau4/bosTau4.2bit SEQ2_LEN=/scratch/data/bosTau4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2422m32.203s # failed during the netChainSubset | chainStitchId out of memory # finish that manually with ulimits to allow more memory on hgwdev: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG netChainSubset -verbose=0 noClass.net susScr2.bosTau4.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > susScr2.bosTau4.over.chain.gz # and, finish the rest of netChains.csh manually, the netToAxt step # and the axtToMaf step, log is in axtChain/finiChains.log # after done with netChains - continuing with load: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=load -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > load.log 2>&1 & # this didn't work either due to memory limits with hgLoadChain # add the following to loadUp.csh # limit at 160 Gb limit datasize 163840m limit vmemoryuse 163840m # and finish it manually (7h39m) cat fb.susScr2.chainBosTau4Link.txt # 1478903080 bases of 2231298548 (66.280%) in intersection # then continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=download -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > download.log 2>&1 & XXX - running Tue Mar 30 13:18:09 PDT 2010 # creating a bigWig graph to see the chain pileups: cd /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27/axtChain zcat susScr2.bosTau4.all.chain.gz | grep "^chain " \ | awk '{printf "%s\t%d\t%d\t%s\t%s\t%s\n", $3, $6, $7, $8, $2, $5}' \ > all.bed # find the largest score: ave -col=5 all.bed Q1 5300.000000 median 8265.000000 Q3 14079.000000 average 12729.401671 min 3000.000000 max 1506828099.000000 count 144157201 total 1835034915327.000000 standard deviation 419128.061072 # normalize the scores to 0-1000: awk ' {printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,(1000*$5)/1506828099, $6}' \ all.bed | sort -k1,1 -k2,2n > all.chain.bed bedGraphToBigWig all.chain.overlap.wigVar ../../../chrom.sizes all.chain.bw bigWigInfo all.chain.bw version: 3 isCompressed: yes isSwapped: 0 primaryDataSize: 188,237,695 primaryIndexSize: 1,267,372 zoomLevels: 10 chromCount: 20 basesCovered: 2,255,615,700 mean: 28.797958 min: 1.000000 max: 14674.000000 std: 202.526527 ln -s `pwd`/all.chain.bw /gbdb/susScr2/bbi/bosTau4ChainPileUp.bw hgsql susScr2 -e 'drop table if exists bosTau4ChainPileUp; \ create table bosTau4ChainPileUp (fileName varchar(255) not null); \ insert into bosTau4ChainPileUp values ("/gbdb/susScr2/bbi/bosTau4ChainPileUp.bw");' # and the swap mkdir /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap cd /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # most interesting, this failed on the first step chainSwap # but the failure didn't make it stop, it continued and produced # zero results to the end. Running manually: cd /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap/axtChain export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG chainSwap /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27/axtChain/susScr2.bosTau4.all.chain.gz stdout \ | nice chainSort stdin stdout | nice gzip -c > bosTau4.susScr2.all.chain.gz # it also runs out of memory in the lift over file creation: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG netChainSubset -verbose=0 noClass.net bosTau4.susScr2.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > bosTau4.susScr2.over.chain.gz # and netChains.csh is finished manually with this added: # memory limit 160 Gb limit datasize 163840m limit vmemoryuse 163840m # manually run the loadUp.csh with this added: # memory limit 160 Gb limit datasize 163840m limit vmemoryuse 163840m # real 498m5.861s cat fb.bosTau4.chainSusScr2Link.txt # 1383557633 bases of 2731830700 (50.646%) in intersection ######################################################################### # SWAP mm9 lastz (DONE - 2010-03-27 - Hiram) # original alignment cd /hive/data/genomes/mm9/bed/lastzSusScr2.2010-03-26 cat fb.mm9.chainSusScr2Link.txt # 616615408 bases of 2620346127 (23.532%) in intersection # and the swap mkdir /hive/data/genomes/susScr2/bed/blastz.mm9.swap cd /hive/data/genomes/susScr2/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzSusScr2.2010-03-26/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 63m4s cat fb.susScr2.chainMm9Link.txt # 656444411 bases of 2231298548 (29.420%) in intersection ############################################################################ # SWAP hg19 lastz (DONE - 2010-03-27 - Hiram) # original alignment cat fb.hg19.chainSusScr2Link.txt # 1198794058 bases of 2897316137 (41.376%) in intersection # and the swap mkdir /hive/data/genomes/susScr2/bed/blastz.hg19.swap cd /hive/data/genomes/susScr2/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzSusScr2.2010-03-26/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 112m40s cat fb.susScr2.chainHg19Link.txt # 1272785114 bases of 2231298548 (57.042%) in intersection ######################################################################### # SWAP monDom5 lastz (DONE - 2010-03-27 - Hiram) # original alignment cat fb.monDom5.chainSusScr2Link.txt # 179898307 bases of 3501660299 (5.138%) in intersection # and the swap mkdir /hive/data/genomes/susScr2/bed/blastz.monDom5.swap cd /hive/data/genomes/susScr2/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzSusScr2.2010-03-26/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # Elapsed time: 82m55s cat fb.susScr2.chainMonDom5Link.txt # 182834643 bases of 2231298548 (8.194%) in intersection ############################################################################ # running cpgIsland business (DONE - 2010-03-31 - Hiram) mkdir /hive/data/genomes/susScr2/bed/cpgIsland cd /hive/data/genomes/susScr2/bed/cpgIsland cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands # needed to fixup this source, adding include to readseq.c: #include "string.h" # and to cpg_lh.c: #include "unistd.h" #include "stdlib.h" # and fixing a declaration in cpg_lh.c sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c mv tmp.c cpg_lh.c make cd ../../ ln -s hg3rdParty/cpgIslands/cpglh.exe mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../susScr2.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/susScr2/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | grep ^chr | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs gensub2 chr.list single template jobList para create jobList para try para check ... etc para time # Completed: 20 of 20 jobs # CPU time in finished jobs: 168s 2.79m 0.05h 0.00d 0.000 y # IO & Wait Time: 86s 1.44m 0.02h 0.00d 0.000 y # Average job time: 13s 0.21m 0.00h 0.00d # Longest finished job: 29s 0.48m 0.01h 0.00d # Submission to last job: 31s 0.52m 0.01h 0.00d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed ssh hgwdev cd /hive/data/genomes/susScr2/bed/cpgIsland hgLoadBed susScr2 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Loaded 38778 elements of size 10 # cleanup rm -fr hardMaskedFa ######################################################################### # all.joiner update, downloads and in pushQ - (DONE - 2010-03-31 - Hiram) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=susScr2 -all all.joiner mkdir /hive/data/genomes/susScr2/goldenPath cd /hive/data/genomes/susScr2/goldenPath makeDownloads.pl susScr2 > do.log 2>&1 # now ready for pushQ entry mkdir /hive/data/genomes/susScr2/pushQ cd /hive/data/genomes/susScr2/pushQ makePushQSql.pl susScr2 > susScr2.pushQ.sql 2> stderr.out # check for errors in stderr.out, some are OK, e.g.: # WARNING: susScr2 does not have seq # WARNING: susScr2 does not have extFile # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # bosTau4ChainPileUp # copy it to hgwbeta scp -p susScr2.pushQ.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < susScr2.pushQ.sql # in that pushQ entry walk through each entry and see if the # sizes will set properly ############################################################################ # LIFTOVER TO SusScr2 (DONE - 2010-01-15 - Hiram ) mkdir /hive/data/genomes/susScr1/bed/blat.susScr2.2010-04-05 cd /hive/data/genomes/susScr1/bed/blat.susScr2.2010-04-05 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -bigClusterHub=swarm -dbHost=hgwdev \ -workhorse=hgwdev \ -ooc=/hive/data/genomes/susScr1/jkStuff/susScr1.11.ooc \ -debug susScr1 susScr2 # Real run: doSameSpeciesLiftOver.pl -bigClusterHub=swarm -dbHost=hgwdev \ -workhorse=hgwdev \ -ooc=/hive/data/genomes/susScr1/jkStuff/susScr1.11.ooc \ susScr1 susScr2 > do.log 2>&1 ############################################################################# # HUMAN (hg18) PROTEINS TRACK (DONE braney 2010-04-14) # bash if not using bash shell already cd /cluster/data/susScr2 mkdir /cluster/data/susScr2/blastDb awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst susScr2.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst susScr2.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y rm temp.fa less1meg.lst cd blastDb for i in *.fa do /hive/data/outside/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 2915 mkdir -p /cluster/data/susScr2/bed/tblastn.hg18KG cd /cluster/data/susScr2/bed/tblastn.hg18KG echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 2915 query.lst # we want around 350000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\) # 36727/(350000/2915) = 305.883443 mkdir -p kgfa split -l 306 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst wc kg.lst # 121 121 1573 kg.lst mkdir -p blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/susScr2/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/hive/data/outside/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/susScr2/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome exit ssh swarm cd /cluster/data/susScr2/bed/tblastn.hg18KG gensub2 query.lst kg.lst blastGsub blastSpec para create blastSpec # para try, check, push, check etc. para time # Completed: 352715 of 352715 jobs # CPU time in finished jobs: 13184770s 219746.16m 3662.44h 152.60d 0.418 y # IO & Wait Time: 1844130s 30735.51m 512.26h 21.34d 0.058 y # Average job time: 43s 0.71m 0.01h 0.00d # Longest finished job: 149s 2.48m 0.04h 0.00d # Submission to last job: 16902s 281.70m 4.70h 0.20d ssh swarm cd /cluster/data/susScr2/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS ../blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining para create chainSpec para try, check, push, check etc. # Completed: 121 of 121 jobs # CPU time in finished jobs: 342462s 5707.70m 95.13h 3.96d 0.011 y # IO & Wait Time: 67882s 1131.37m 18.86h 0.79d 0.002 y # Average job time: 3391s 56.52m 0.94h 0.04d # Longest finished job: 13532s 225.53m 3.76h 0.16d # Submission to last job: 13544s 225.73m 3.76h 0.16d cd /cluster/data/susScr2/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # checked: 83553 failed: 0 errors: 0 # load table ssh hgwdev cd /cluster/data/susScr2/bed/tblastn.hg18KG hgLoadPsl susScr2 blastHg18KG.psl # check coverage featureBits susScr2 blastHg18KG # 36312685 bases of 2231298548 (1.627%) in intersection featureBits susScr2 blastHg18KG ensGene -enrichment # blastHg18KG 1.627%, ensGene 1.284%, both 0.845%, cover 51.91%, enrich 40.42x rm -rf blastOut #end tblastn ############################################################################# # N-SCAN gene predictions (nscanGene) - (2010-04-18 markd) # obtained NSCAN predictions from michael brent's group # at WUSTL cd /cluster/data/susScr2/bed/nscan wget -nv http://mblab.wustl.edu/predictions/pig/susScr2/readme.txt wget -nv http://mblab.wustl.edu/predictions/pig/susScr2/susScr2.fa wget -nv http://mblab.wustl.edu/predictions/pig/susScr2/susScr2.gtf gzip susScr2.* chmod a-w * # load track gtfToGenePred -genePredExt susScr2.gtf.gz stdout | hgLoadGenePred -genePredExt susScr2 nscanGene stdin hgPepPred susScr2 generic nscanPep susScr2.fa.gz rm *.tab # pig/susScr2/trackDb.ra, add: track nscanGene override informant Pig N-SCAN uses human (hg19) as the informant, updated with PASA clusters of pig cDNAs. # veryify top-level search spec, should produce no results: hgsql -Ne 'select name from nscanGene' susScr2 | egrep -v '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |head ######################################################################### # SWAP bosTau6 lastz (DONE - 2011-05-26 - Chin) cd /hive/data/genomes/bosTau6/bed/lastzSusScr2.2011-05-24 cat fb.bosTau6.chainSusScr2Link.txt # 1306738899 bases of 2649682029 (49.317%) in intersection # and the swap mkdir /hive/data/genomes/susScr2/bed/blastz.bosTau6.swap cd /hive/data/genomes/susScr2/bed/blastz.bosTau6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau6/bed/lastzSusScr2.2011-05-24/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 380m53.612s cat fb.susScr2.chainBosTau6Link.txt # 1475323050 bases of 2231298548 (66.119%) in intersection ############################################################################ # SWAP mm10 lastz (DONE - 2012-03-19 - Hiram) # original alignment to mm10 cat /hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16/fb.mm10.chainSusScr2Link.txt # 616716602 bases of 2652783500 (23.248%) in intersection # and this swap: mkdir /hive/data/genomes/susScr2/bed/blastz.mm10.swap cd /hive/data/genomes/susScr2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 62m47.465s cat fb.susScr2.chainMm10Link.txt # 656498040 bases of 2231298548 (29.422%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/susScr2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################ # LIFTOVER TO susScr3 (DONE - 2012-07-26 - Hiram ) mkdir /hive/data/genomes/susScr2/bed/blat.susScr3.2012-07-26 cd /hive/data/genomes/susScr2/bed/blat.susScr3.2012-07-26 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug \ -ooc /hive/data/genomes/susScr2/jkStuff/susScr2.11.ooc susScr2 susScr3 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /hive/data/genomes/susScr2/jkStuff/susScr2.11.ooc \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ susScr2 susScr3 > do.log 2>&1 # real 28m35.606s ############################################################################ # NUMTS TRACK (DONE 2012-11-01 - Chin) # copy and unzip # http://193.204.182.50/files/tracks.zip and # http://193.204.182.50/files/other%20NumtS%20tracks.zip # to /hive/data/outside/NumtS/2012/tracks mkdir /hive/data/genomes/susScr2/bed/NumtS cd /hive/data/genomes/susScr2/bed/NumtS cp /hive/data/outside/NumtS/2012/tracks/Sus_scrofa_2/* . cat tracks_NumtS_nuclear_SusScrofa2 | grep -v "^track" > susScr2NumtS.bed cat tracks_NumtS_assembled_SusScrofa2 | grep -v "^track" > susScr2NumtSAssembled.bed cat tracks_mit_NumtS_SusScrofa2 | grep -v "^track" > susScr2NumtSMitochondrion.bed # load the 3 bed files to susScr2 hgLoadBed susScr2 numtSAssembled susScr2NumtSAssembled.bed hgLoadBed susScr2 numtS susScr2NumtS.bed hgLoadBed susScr2 numtSMitochondrion susScr2NumtSMitochondrion.bed # get new bam and html file from data provider (dated 2012-12-05) # put them in /hive/data/outside/NumtS/2012/updatePig first # then, copied over to /hive/data/genomes/susScr2/bed/NumtS cd /hive/data/outside/NumtS/2012/updatePig cp pig_NumtS_seqs*.* /hive/data/genomes/susScr2/bed/NumtS # Make /gbdb/ links and load bam mkdir /gbdb/susScr2/NumtS ln -s `pwd`/pig_NumtS_seqs.sorted.bam{,.bai} /gbdb/susScr2/NumtS/ hgBbiDbLink susScr2 bamAllNumtSSorted /gbdb/susScr2/NumtS/pig_NumtS_seqs.sorted.bam # setup trackDb for susScr2 and search rules cp /hive/data/outside/NumtS/2012/updatePig/pig_details.htm \ $HOME/kent/src/hg/makeDb/trackDb/pig/susScr2/numtSeqSusScr2.html # edit numtSeqSusScr2.html to fit UCSC standard # check html in to git #############################################################################