# for emacs: -*- mode: sh; -*- # $Id: felCat4.txt,v 1.2 2010/06/11 17:11:00 chinhli Exp $ # Felis Catus (domestic cat) -- NHGRI/GTB 4/felCat4 (2009-05-25) # file template copied from calJac3.txt # Felis catus (Project ID: 32759) by NHGRI/Genome Technology Branch [Draft # assembly] sequence: # ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/ # Felis_catus/catChr4 # Felis catus # http://www.ncbi.nlm.nih.gov/genome/78 # http://www.ncbi.nlm.nih.gov/bioproject/16726 - UWash - Cinnamon # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AANG00 # http://www.ncbi.nlm.nih.gov/bioproject/32759 - NHGRI - multiple cats # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ACBE00 ########################################################################## # Download sequence (DONE - 2010-05-23 - Chin) mkdir /hive/data/genomes/felCat4 cd /hive/data/genomes/felCat4 mkdir genbank cd genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Felis_catus/catChr4/*" # FINISHED --09:05:15-- # Downloaded: 151 files, 1.3G in 7m 42s (2.98 MB/s) # Read ASSEMBLY_INFO mkdir ucscChr # stay at genbank directory # fixup the accession names to become UCSC chrom names # use component agp file as source S=Primary_Assembly/assembled_chromosomes cut -f1 ${S}/chr2acc | while read C do ACC=`grep "${C}" ${S}/chr2acc | cut -f2` echo "${ACC} -> chr${C}" zcat ${S}/AGP/chr${C}.comp.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" | gzip > ucscChr/chr${C}.agp.gz done S=Primary_Assembly/assembled_chromosomes cut -f1 ${S}/chr2acc | while read C do ACC=`grep "${C}" ${S}/chr2acc | cut -f2` echo "${ACC} -> chr${C}" echo ">chr${C}" > ucscChr/chr${C}.fa zcat ${S}/FASTA/chr${C}.fa.gz | grep -v "^>" >> ucscChr/chr${C}.fa gzip ucscChr/chr${C}.fa & done # Check them with faSize faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz # 2872644707 bases (1165972091 N's 1706672616 real 1706672616 upper 0 # lower) in 19 sequences in 19 files faSize ucscChr/chr*.fa.gz # 2872644707 bases (1165972091 N's 1706672616 real 1706672616 upper 0 # lower) in 19 sequences in 19 files # For unplaced scalfolds, named them as chrUn_xxxxxxxx # where xxxxxx is the original access id as: chrUn_ACBE01000381.1 # and put it into chrUn.* files # The ".1" at the end of field 1 and 6 need to be filter out since # MySQL does not allow "." as part of the table name and # will casue problems in genbank task step later # copy all the comment lines (start with #) S=Primary_Assembly/unplaced_scaffolds zcat ${S}/AGP/unplaced.scaf.agp.gz | grep "^#" > ucscChr/chrUn.agp # append the gap records zcat ${S}/AGP/unplaced.scaf.agp.gz | grep -v "^#" \ | sed -e "s/^/chrUn_/" -e "s/\.1//" >> ucscChr/chrUn.agp gzip ucscChr/chrUn.agp & zcat ${S}/FASTA/unplaced.scaf.fa.gz \ | sed -e "s#^>.*|gb|#>chrUn_#; s#|.*##" \ | sed -e "s/\.1$//" \ | gzip > ucscChr/chrUn.fa.gz # about 104034 sequences in the unplaced zcat ucscChr/chrUn.fa.gz | grep "^>" | wc # 104034 104034 2275648 # Check them with faSize faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 287642232 bases (3696852 N's 283945380 real 283945380 upper 0 # lower) in 104034 sequences in 1 files faSize ucscChr/chrUn.fa.gz # 287642232 bases (3696852 N's 283945380 real 283945380 upper 0 # lower) in 104034 sequences in 1 files ########################################################################## # Initial genome build (DONE - 2010-05-24 -Chin) # decide the right order by hgsql -e "select orderKey,name from dbDb order by orderKey;" \ hgcentraltest | less # 168 cavPor3 # 169 cavPor2 # 189 oryCun2 # 190 oryCun1 # 191 ochPri2 # 200 eriEur1 # 217 felCat3 cd /hive/data/genomes/felCat4 cat << '_EOF_' > felCat4.config.ra # Config parameters for makeGenomeDb.pl: db felCat4 clade mammal genomeCladePriority 16 scientificName Felis catus commonName Cat assemblyDate Dec. 2008 assemblyLabel NHGRI/Genome Technology Branch (NCBI project 10703, accession ACBE0100000) assemblyShortLabel NHGRI/GTB 4 orderKey 216 mitoAcc NC_001700 fastaFiles /hive/data/genomes/felCat4/genbank/ucscChr/chr*.fa.gz agpFiles /hive/data/genomes/felCat4/genbank/ucscChr/chr*.agp.gz # qualFiles none dbDbSpeciesDir cat taxId 9685 '_EOF_' time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev felCat4.config.ra \ > makeGenomeDb.log 2>&1 # real 10m1.157s # Manually fix up the description in dbDb: # hgsql -e "UPDATE dbDb set description='Dec. 2008 (NHGRI/GTB V17e/felCat4)' where name='felCat4';" hgcentraltest # set up track mkdir /cluster/home/chinhli/kent/src/hg/makeDb/trackDb/cat/felCat4 cp /cluster/data/felCat4/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/cat/felCat4/* /cluster/home/chinhli/kent/src/hg/makeDb/trackDb/cat/felCat4/ # Search for '***' notes in each file in and make corrections # cd ../.. (to trackDb/) and # - edit makefile to add felCat4 to DBS. # if necessary) cvs add cat # - cvs add cat/felCat4 # - cvs add cat/felCat4/*.{ra,html} # - cvs ci -m "Added felCat4 to DBS." makefile # - cvs ci -m "Initial descriptions for felCat4." cat/felCat4 # - (if necessary) cvs ci cat # - Run make update DBS=felCat4 and make alpha when done. # - (optional) Clean up /cluster/data/felCat4/TemporaryTrackDbCheckout # - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there. ########################################################################## # running repeat masker (DONE 2010-05-26 - Chin) mkdir /hive/data/genomes/felCat4/bed/repeatMasker cd /hive/data/genomes/felCat4/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek felCat4 > do.log 2>&1 & # real 262m59.644s cat faSize.rmsk.txt # 3160303948 bases (1169668943 N's 1990635005 real 1183631401 # upper 807003604 lower) in 104054 sequences in 1 files # %25.54 masked total, %40.54 masked real ########################################################################## # running simple repeat (DONE - 2010-05-26 - Chin) mkdir /hive/data/genomes/felCat4/bed/simpleRepeat cd /hive/data/genomes/felCat4/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ felCat4 > do.log 2>&1 & # real 40m36.157s cat fb.simpleRepeat # 49107705 bases of 1990645222 (2.467%) in intersection cd /hive/data/genomes/felCat4 twoBitMask felCat4.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed felCat4.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa felCat4.2bit stdout | faSize stdin > faSize.felCat4.2bit.txt cat faSize.felCat4.2bit.txt # 3160303948 bases (1169668943 N's 1990635005 real 1182766163 upper # 807868842 lower) in 104054 sequences in 1 files # %25.56 masked total, %40.58 masked real # double check with featureBits featureBits -countGaps felCat4 gap # 1169658726 bases of 3160303948 (37.011%) in intersection rm /gbdb/felCat4/felCat4.2bit ln -s `pwd`/felCat4.2bit /gbdb/felCat4/felCat4.2bit ######################################################################## # Marking *all* gaps - they are not all in the AGP file # (DONE - 2010-05-27 - Chin) mkdir /hive/data/genomes/felCat4/bed/allGaps cd /hive/data/genomes/felCat4/bed/allGaps time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../felCat4.rmsk.2bit > findMotif.txt 2>&1 # real 1m11.067s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits felCat4 -not gap -bed=notGap.bed # 1990645222 bases of 1990645222 (100.000%) in intersection featureBits felCat4 allGaps.bed notGap.bed -bed=new.gaps.bed # 10217 bases of 1990645222 (0.001%) in intersection # (real time 7+ hours) # what is the last index in the existing gap table: hgsql -N -e "select ix from gap;" felCat4 | sort -n | tail -1 # 101050 cat << '_EOF_' > mkGap.pl #!/usr/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" felCat4 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.gap hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad felCat4 otherGap other.gap # Reading other.gap # Loaded 1294 elements of size 8 # Sorted # Saving bed.tab # No load option selected, see file: bed.tab wc -l bed.tab # 1294 bed.tab # starting with this many hgsql -e "select count(*) from gap;" felCat4 # 500507 hgsql felCat4 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" felCat4 # 501801 # == 500507 + 1294 ######################################################################### # MAKE 11.OOC FILES FOR BLAT (DONE 2010-05-27 - Chin) cd /hive/data/genomes/felCat4 cat faSize.felCat4.2bit.txt # 3160303948 bases (1169668943 N's 1990635005 real 1182766163 upper # 807868842 lower) in 104054 sequences in 1 files # %25.56 masked total, %40.58 masked real # numerator is felCat4 gapless bases "real" as reported by faSize # denominator is hg17 gapless bases as reported by featureBits, # 1024 is threshold used for human -repMatch: calc \( 1990635005 / 2897310462 \) \* 1024 # ( 1990635005 / 2897310462 ) * 1024 = 703.552578 # ==> use -repMatch=700 according to size scaled down from 1024 for human. # and rounded down to nearest 50 blat felCat4.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/felCat4.11.ooc -repMatch=700 # Wrote 23033 overused 11-mers to jkStuff/felCat4.11.ooc mkdir /hive/data/staging/data/felCat4 cp -p felCat4.2bit chrom.sizes jkStuff/felCat4.11.ooc \ /hive/data/staging/data/felCat4 gapToLift -bedFile=jkStuff/nonBridgedGaps.bed felCat4 \ jkStuff/felCat4.nonBridged.lft # Ask the admin to copy the # /hive/data/staging/data/felCat4 directory # to cluster nodes: /scratch/data/felCat4 ########################################################################## # BLATSERVERS ENTRY (DONE 2010-05-27 - Chin) # After getting a blat server assigned by the Blat Server Gods, # hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("felCat4", "blat4", "17792", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("felCat4", "blat4", "17793", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # chrA2:64650765-64656835 # reset position to RHO location as found from blat of hg19 RHO gene hgsql -e \ 'update dbDb set defaultPos="chrA2:64650765-64656835" where name="felCat4";' \ hgcentraltest ############################################################################ # genbank run DONE - 2010-05-27 - Chin ) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank # edit etc/genbank.conf to add this section just before calJac1: # felCat4 felCat4.serverGenome = /hive/data/genomes/felCat4/felCat4.2bit felCat4.clusterGenome = /scratch/data/felCat4/felCat4.2bit felCat4.ooc = /scratch/data/felCat4/felCat4.11.ooc felCat4.lift = no felCat4.perChromTables = no felCat4.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} felCat4.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} felCat4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} felCat4.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} felCat4.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} felCat4.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} felCat4.downloadDir = felCat4 felCat4.refseq.mrna.native.load = yes felCat4.refseq.mrna.xeno.load = yes felCat4.refseq.mrna.xeno.loadDesc = yes cvs ci -m "adding felCat4" etc/genbank.conf make etc-update ssh genbank screen # control this business with a screen since it takes a while cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial felCat4 & # logFile: var/build/logs/2010.05.29-00:59:34.felCat4.initalign.log # tail var/build/logs/2010.05.29-00:59:34.felCat4.initalign.log # real 1161m8.623s # user 288m16.175s # sys 117m18.519s ssh hgwdev cd /cluster/data/genbank time ./bin/gbDbLoadStep -drop -initialLoad felCat4 & # logFile: var/dbload/hgwdev/logs/2010.06.01-08:22:58.dbload.log # tail var/dbload/hgwdev/logs/2010.06.01-08:22:58.dbload.log # real 37m51.980s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add felCat4 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Adding felCat4 - Cat - Felis catus" \ etc/align.dbs etc/hgwdev.dbs make etc-update # DONE - 2010-06-01 - Chin ############################################################################ # running cpgIsland business (DONE - 2010-06-01 - Chin) mkdir /hive/data/genomes/felCat4/bed/cpgIsland cd /hive/data/genomes/felCat4/bed/cpgIsland cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands # needed to fixup this source, adding include to readseq.c: #include "string.h" # and to cpg_lh.c: #include "unistd.h" #include "stdlib.h" # and fixing a declaration in cpg_lh.c sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c mv tmp.c cpg_lh.c make cd ../../ ln -s hg3rdParty/cpgIslands/cpglh.exe mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../felCat4.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/felCat4/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(path1) {check out exists results/$(path1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | grep ^chr | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod +x ./runOne gensub2 chr.list single template jobList para create jobList para try para check ... etc para push para time # in jobList: # ..... # ./runOne chrD3 {check out exists results/chrD3.cpg} # wc -l jobList # ..... # 104054 jobList # Completed: 104054 of 104054 jobs # CPU time in finished jobs: 209s 3.48m 0.06h 0.00d 0.000 y # IO & Wait Time: 343233s 5720.56m 95.34h 3.97d 0.011 y # Average job time: 3s 0.06m 0.00h 0.00d # Longest finished job: 52s 0.87m 0.01h 0.00d # Submission to last job: 3927s 65.45m 1.09h 0.05d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed wc -l cpgIsland.bed # 57950 cpgIsland.bed cd /hive/data/genomes/felCat4/bed/cpgIsland hgLoadBed felCat4 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Loaded 57950 elements of size 10 # Sorted # Creating table definition for cpgIslandExt # Saving bed.tab # Loading felCat4 # check it with featureBits felCat4 cpgIslandExt # 36646772 bases of 1990635005 (1.841%) in intersection # cleanup rm -fr hardMaskedFa ####################################################################### # felCat4 Cat BLASTZ/CHAIN/NET (DONE - 2010-06-01 - Chin) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/canFam2/bed/lastzFelCat4.2010-06-01 cd /hive/data/genomes/canFam2/bed/lastzFelCat4.2010-06-01 cat << '_EOF_' > DEF # dog vs. cat # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Dog canFan2 SEQ1_DIR=/scratch/data/canFam2/canFam2.2bit SEQ1_LEN=/scratch/data/canFam2/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Cat (felCat4) SEQ2_DIR=/scratch/data/felCat4/felCat4.2bit SEQ2_LEN=/scratch/data/felCat4/chrom.sizes SEQ2_LIMIT=50 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/canFam2/bed/lastzFelCat4.2010-06-01 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noDbNameCheck \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & # real 389m24.326s first try got failuer in do.log # -- during chainRun after cat # rerun chain.csf with failed item works. so rerun the # doBlastzChainNet from step chainRun after para stop, para freeBatch # rm chain and liftedChain in # /hive/data/genomes/canFam2/bed/lastzFelCat4.2010-06-01/axtChain/run # and use memk and swarm this time time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue chainRun \ -syntenicNet -noDbNameCheck \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do_chainRun.log 2>&1 & # real 323m33.250s # *** All done ! Elapsed time: 323m33s # *** Make sure that goldenPath/canFam2/vsFelCat4/README.txt is accurate. # *** Add {chain,net}FelCat4 tracks to trackDb.ra if necessary. cat fb.canFam2.chainFelCat4Link.txt # 1481040604 bases of 2384996543 (62.098%) in intersection cd /hive/data/genomes/canFam2/bed/lastzFelCat4.2010-06-01 ln -s lastzFelCat4.2010-06-01 lastz.felCat4 mkdir /hive/data/genomes/felCat4/bed/blastz.canFam2.swap cd /hive/data/genomes/felCat4/bed/blastz.canFam2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam2/bed/lastzFelCat4.2010-06-01/DEF \ -swap -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 327m42.053s # *** All done ! Elapsed time: 327m42s # *** Make sure that goldenPath/felCat4/vsCanFam2/README.txt is accurate. # *** Add {chain,net}CanFam2 tracks to trackDb.ra if necessary. cat fb.felCat4.chainCanFam2Link.txt # 1467506008 bases of 1990635005 (73.720%) in intersection ##################################################################### # LASTZ Mouse Swap (DONE - 2010-06-07 - Chin) cd /hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07 cat fb.mm9.chainFelCat4Link.txt # 637007193 bases of 2620346127 (24.310%) in intersection # make sure the link exists: cd /hive/data/genomes/mm9/bed ln -s /hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07 lastz.felCat4 # swap mkdir /hive/data/genomes/felCat4/bed/blastz.mm9.swap cd /hive/data/genomes/felCat4/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07/DEF \ -swap -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 176m42.490s # *** All done ! Elapsed time: 176m42s # *** Make sure that goldenPath/felCat4/vsMm9/README.txt is accurate. # *** Add {chain,net}Mm9 tracks to trackDb.ra if necessary. cat fb.felCat4.chainMm9Link.txt # 616529959 bases of 1990635005 (30.972%) in intersection ##################################################################### # LASTZ Human Swap (DONE - 2010-06-07 - Chin) cd /hive/data/genomes/hg19/bed/lastzFelCat4.2010-06-07 cat fb.hg19.chainFelCat4Link.txt # 1266003011 bases of 2897316137 (43.696%) in intersection # Swap mkdir /hive/data/genomes/felCat4/bed/blastz.hg19.swap cd /hive/data/genomes/felCat4/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzFelCat4.2010-06-07/DEF \ -swap -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 432m36.917s # *** All done ! Elapsed time: 432m37s # *** Make sure that goldenPath/felCat4/vsHg19/README.txt is accurate. # *** Add {chain,net}Hg19 tracks to trackDb.ra if necessary. cat fb.felCat4.chainHg19Link.txt # 1211702270 bases of 1990635005 (60.870%) in intersection ##################################################################### # LASTZ Panda Swap (DONE 2010-06-16 re-do - Chin) cd /hive/data/genomes/ailMel1/bed/lastzFelCat4.2010-06-07 cat fb.felCat4.chainAilMel1Link.txt # 1503647735 bases of 1990635005 (75.536%) in intersection # link it # cd /hive/data/genomes/ailMel1/bed # ln -s /hive/data/genomes/ailMel1/bed/lastzFelCat4.2010-06-07 # lastz.felCat4 # run rbest 2010-04-30 time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ felCat4 ailMel1 > rbest.log 2>&1 & # real 234m49.556s #swap 2010-06-28 mkdir /hive/data/genomes/felCat4/bed/blastz.ailMel1.swap cd /hive/data/genomes/felCat4/bed/blastz.ailMel1.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ailMel1/bed/lastzFelCat4.2010-06-07/DEF \ -swap -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 213m28.468s # the swap run failed due to # mod of # /usr/local/apache/htdocs-hgdownload/goldenPath/ailMel1/liftOver/md5sum.txt # was set to # -r--r--r-- 1 382 Apr 26 17:17 md5sum.txt # correct this to 664 by cluster-admin # and continue from download step time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ailMel1/bed/lastzFelCat4.2010-06-07/DEF \ -continue=download \ -swap -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > download.log 2>&1 & # real 32m9.204s # *** All done ! Elapsed time: 32m9s # *** Make sure that goldenPath/ailMel1/vsFelCat4/README.txt is accurate. # *** Add {chain,net}FelCat4 tracks to trackDb.ra if necessary. cd /hive/data/genomes/ailMel1/bed/blastz.felCat4.swap cat fb.ailMel1.chainFelCat4Link.txt # 1507273252 bases of 2245312831 (67.130%) in intersection # Due to the SEQ1 and SEQ2 spec'ed in # /hive/data/genomes/ailMel1/bed/lastzFelCat4.2010-06-07/DEF # need to copy the dDirectory from ailMel1 to felCatv17e # without this, 6way step will have problem # cd /hive/data/genomes/ailMel1/bed/blastz.felCat4.swap] # cp -pr * /hive/data/genomes/felCat4/bed/blastz.ailMel1.swap/ # cd /hive/data/genomes/felCat4/bed # ln -s lastzAilMel1.2010-06-07 lastz.ailMel1 # mkdir /hive/data/genomes/felCat4/bed/lastzAilMel1.2010-06-07 # cd /hive/data/genomes/ailMel1/bed/lastzFelCat4.2010-06-07 # cp -rp * /hive/data/genomes/felCat4/bed/lastzAilMel1.2010-06-07/ # cd /hive/data/genomes/felCat4/bed # ln -s /hive/data/genomes/felCat4/bed/lastzAilMel1.2010-06-07 lastz.ailMel1 # cd /hive/data/genomes/ailMel1/bed # ln -s blastz.felCat4.swap lastz.felCat4 ##################################################################### # LASTZ Oppsom Swap (DONE - 2010-06-11 - Chin) cd /hive/data/genomes/monDom5/bed/lastzFelCat4.2010-06-07 cat fb.monDom5.chainFelCat4Link.txt # 178616721 bases of 3501660299 (5.101%) in intersection # make sure the link exist # cd /hive/data/genomes/monDom5/bed # ln -s lastzFelCat4.2010-06-07 lastz.felCat4 # Swap mkdir /hive/data/genomes/felCat4/bed/blastz.monDom5.swap cd /hive/data/genomes/felCat4/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzFelCat4.2010-06-07/DEF \ -swap -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=menk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 181m0.411s # step 'syntenicNet' failed, continue time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzFelCat4.2010-06-07/DEF \ -swap -syntenicNet -noDbNameCheck \ -continue syntenicNet \ -workhorse=hgwdev -smallClusterHub=menk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > syntenicNet.log 2>&1 & # real 9m46.383s # *** All done ! Elapsed time: 9m46s # *** Make sure that goldenPath/felCat4/vsMonDom5/README.txt is # accurate. # *** Add {chain,net}MonDom5 tracks to trackDb.ra if necessary. cat fb.felCat4.chainMonDom5Link.txt # 166499264 bases of 1990635005 (8.364%) in intersection ############################################################################ # ctgPos2 track - showing clone sequence locations on chromosomes # (DONE - 2010-06-28 - Chin) mkdir /hive/data/genomes/felCat4/bed/ctgPos2 cd /hive/data/genomes/felCat4/bed/ctgPos2 cat << '_EOF_' > agpToCtgPos2.pl #!/usr/bin/env perl use warnings; use strict; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: zcat your.files.agp.gz | agpToCtgPos2.pl /dev/stdin > ctgPos2.tab\n"; exit 255; } my $agpFile = shift; open (FH, "<$agpFile") or die "can not read $agpFile"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my @a = split('\s+', $line); next if ($a[4] =~ m/^N$/); next if ($a[4] =~ m/^U$/); my $chrSize = $a[2]-$a[1]+1; my $ctgSize = $a[7]-$a[6]+1; die "sizes differ $chrSize != $ctgSize\n$line\n" if ($chrSize != $ctgSize); printf "%s\t%d\t%s\t%d\t%d\t%s\n", $a[5], $chrSize, $a[0], $a[1]-1, $a[2], $a[4]; } close (FH); '_EOF_' # << happy emacs chmod 775 agpToCtgPos2.pl # use the other gap coordinates - chrXX.agp.gz this time export S=../../genbank/Primary_Assembly/assembled_chromosomes cut -f2 ${S}/chr2acc | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` zcat ${S}/AGP/chr${C}.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" done | ./agpToCtgPos2.pl /dev/stdin > ctgPos2.tab # edit ctgPos2.html with: # hgsql -e "select type from ctgPos2;" felCat4 | sort | uniq # W hgLoadSqlTab felCat4 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab # add the track ctgPos2 to src/hg/makeDb/trackDb/cat/felCat4/trackDb.ra # at src/makeDb/trackdb do "make update DBS=felCat4" or/and "make alpha" ######################################################################### # all.joiner update, downloads and in pushQ - (DONE 2010-06-28 Chin) cd $HOME/kent/src/hg/makeDb/schema # add felCat4 after felCat3 # fixup all.joiner until this is a clean output joinerCheck -database=felCat4 -all all.joiner mkdir /hive/data/genomes/felCat4/goldenPath cd /hive/data/genomes/felCat4/goldenPath makeDownloads.pl felCat4 > do.log 2>&1 # *** All done! # *** Please take a look at the downloads for felCat4 using a web browser. # *** The downloads url is: http://hgdownload-test.cse.ucsc.edu/goldenPath/felCat4. # *** Edit each README.txt to resolve any notes marked with "***": # /hive/data/genomes/felCat4/goldenPath/database/README.txt # /hive/data/genomes/felCat4/goldenPath/bigZips/README.txt # (The htdocs/goldenPath/felCat4/*/README.txt "files" are just links to those.) # *** If you have to make any edits that would always apply to future # assemblies from the same sequencing center, please edit them into # ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for help). # vi /hive/data/genomes/felCat4/goldenPath/database/README.txt # vi /hive/data/genomes/felCat4/goldenPath/bigZips/README.txt # now ready for pushQ entry mkdir /hive/data/genomes/felCat4/pushQ cd /hive/data/genomes/felCat4/pushQ makePushQSql.pl felCat4 > felCat4.pushQ.sql 2> stderr.out # check for errors in stderr.out, some are OK, e.g.: # WARNING: felCat4 does not have seq # WARNING: felCat4 does not have extFile # WARNING: felCat4 does not have tableDescriptions # copy it to hgwbeta # use hgcat in .hg.conf.beta scp -p felCat4.pushQ.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < felCat4.pushQ.sql # in that pushQ entry walk through each entry and see if the # sizes will set properly # reset to hguser in .hg.conf.beta ##################################################################### ## 6-Way Multiz (DONE 2010-07-01 - Chin) ## # use /cluster/home/chinhli/kent/src/hg/utils/phyloTrees/49way.nh mkdir /hive/data/genomes/felCat4/bed/multiz6way cd /hive/data/genomes/felCat4/bed/multiz6way /cluster/bin/phast/tree_doctor \ --prune-all-but=felCat3,hg19,mm9,canFam2,monDom5,ailMel1 \ --rename="felCat3 -> felCat4 " \ /cluster/home/chinhli/kent/src/hg/utils/phyloTrees/49way.nh > 6way.nh # rearrange felCat4 to the top, get some help from tree_doctor: /cluster/bin/phast/tree_doctor --name-ancestors --reroot felCat4 \ --with-branch 6way.nh # edit out the ancestors, and move felCat4 from the bottom to # the top, resulting in this tree: (((felCat4:0.098612,(canFam2:0.052458,ailMel1:0.050000):0.050000):0.093470,(hg19:0.144018,mm9:0.356483):0.020593):0.258392,monDom5:0.340786); # more rearranging after seeing what the distance table looks like # below to get them appearing as much as possible in their # distance order top to bottom: (TO_DO) # use the phyloGif to get the tree image /cluster/bin/x86_64/phyloGif -phyloGif_tree=6way.nh \ -phyloGif_width=260 -phyloGif_height=260 > 6way.gif # more rearranging after seeing what the distance table looks like # below to get them appearing as much as possible in their # distance order top to bottom: (TO_DO) # usee this specification in the phyloGif tool after changing the names: /cluster/bin/phast/tree_doctor \ --rename="felCat4 -> Cat ; hg19 -> Human ; ailMel1 -> Panda ; canFam2 -> Dog ; mm9 -> Mouse ; monDom5 -> Opossum " 6way.nh # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/felCat4_6way.gif # use the common names for the gif image /cluster/bin/x86_64/phyloGif -phyloGif_tree=commonNames.6way.nh \ -phyloGif_width=260 -phyloGif_height=260 > felCat4_6way.gif /cluster/bin/phast/all_dists 6way.nh > 6way.distances.txt # make sure all symlinks lastz.DB -> lastzDb-date # exist here and at the swap locations, the perl script expects this # in order to find featureBits numbers. cd /hive/data/genomes/felCat4/bed ln -s blastz.hg19.swap lastz.hg19 ln -s blastz.canFam2.swap lastz.canFam2 ln -s blastz.mm9.swap lastz.mm9 ln -s blastz.monDom5.swap lastz.monDom5 ln -s /hive/data/genomes/felCat4/bed/lastzAilMel1.2010-06-07 \ lastz.ailMel1 cd /hive/data/genomes/felCat4/bed/multiz6way # Use 6way.distances.txt to create the table below # with this perl script: cat << '_EOF_' > sizeStats.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "grep -y felCat4 6way.distances.txt | sort -k3,3n|") or die "can not read 6way.distances.txt"; my $count = 0; while (my $line = ) { chomp $line; my ($felCat4, $D, $dist) = split('\s+', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/felCat4/bed/lastz.$D/fb.felCat4." . $chain . "Link.txt"; my $chainLinkMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.felCat4/fb.${D}.chainFelCat4Link.txt"; my $swapMeasure = "N/A"; if ( -s $swapFile ) { $swapMeasure = `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; ++$count; if ($swapMeasure eq "N/A") { printf "# %02d %.4f - %s %s\t(%% %.3f) (%s)\n", $count, $dist, $orgName, $D, $chainLinkMeasure, $swapMeasure } else { printf "# %02d %.4f - %s %s\t(%% %.3f) (%% %.3f)\n", $count, $dist, $orgName, $D, $chainLinkMeasure, $swapMeasure } } close (FH); '_EOF_' # << happy emacs chmod +x ./sizeStats.pl ./sizeStats.pl # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainFlCat4Link # distance on felCat4 on other # 02 0.1986 - Panda ailMel1 (% 75.536) (% 67.130) # 04 0.2011 - Dog canFam2 (% 73.720) (% 62.098) # 06 0.3567 - Human hg19 (% 60.870) (% 43.696) # 08 0.5692 - Mouse mm9 (% 30.972) (% 24.310) # 10 0.7913 - Opossum monDom5 (% 8.364) (% 5.101) # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 6way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # collect the single whole mafs into one place for splitting: mkdir singleMafs cd singleMafs ln -s ../../lastz.hg19/axtChain/felCat4.hg19.synNet.maf.gz . ln -s ../../lastz.mm9/axtChain/felCat4.mm9.synNet.maf.gz . ln -s ../../lastz.canFam2/axtChain/felCat4.canFam2.synNet.maf.gz . ln -s ../../lastz.monDom5/axtChain/felCat4.monDom5.synNet.maf.gz . # N50 for ailMel1 is 1281781 (less than 10 ~ 20 MB) use rbest ln -s ../../lastz.ailMel1/mafRBestNet/felCat4.ailMel1.rbest.maf.gz . # to use rbest or net, use n50.pl chrom.size to tell mkdir /hive/data/genomes/felCat4/bed/multiz6way/splitMaf cd /hive/data/genomes/felCat4/bed/multiz6way/splitMaf for D in ailMel1 do mkdir ${D} mafSplit -useHashedName=8 -byTarget /dev/null ${D}/ \ ../singleMafs/felCat4.${D}.rbest.maf.gz done for D in hg19 mm9 canFam2 monDom5 do mkdir ${D} mafSplit -useHashedName=8 -byTarget /dev/null ${D}/ \ ../singleMafs/felCat4.${D}.synNet.maf.gz done cd /hive/data/genomes/felCat4/bed/multiz6way mkdir penn cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn ssh swarm mkdir /hive/data/genomes/felCat4/bed/multiz6way/run mkdir /hive/data/genomes/felCat4/bed/multiz6way/run/maf cd /hive/data/genomes/felCat4/bed/multiz6way/run # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = felCat4 set topDir = /hive/data/genomes/$db/bed/multiz6way set c = $1 set result = $2 set pennBin = $topDir/penn set run = `/bin/pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = $topDir/splitMaf /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p $topDir/tree.nh $topDir/species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/^$db //" species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($pennBin $path); rehash $pennBin/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c.maf $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/felCat4/bed/multiz6way/run/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs find ../splitMaf -type f | grep "/[0-9][0-9][0-9].maf" \ | xargs -L 1 basename | sort -u > chr.part.list gensub2 chr.part.list single template jobList para -ram=8g create jobList # para push, para cehck, para time etc.. ssh hgwdev # put the split mafs back together into a single result head -q -n 1 maf/000.maf > felCat4.6way.maf for F in maf/*.maf do grep -h -v "^#" ${F} >> felCat4.6way.maf done tail -q -n 1 maf/000.maf >> felCat4.6way.maf # load tables for a look mkdir -p /gbdb/felCat4/multiz6way/maf cd /hive/data/genomes/felCat4/bed/multiz6way/run ln -s `pwd`/felCat4.6way.maf \ /gbdb/felCat4/multiz6way/maf/multiz6way.maf # this generates an immense multiz6way.tab file in the directory # where it is running. Best to run this over in scratch. cd /data/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/felCat4/multiz6way/maf felCat4 multiz6way # Indexing and tabulating /gbdb/felCat4/multiz6way/maf/multiz6way.maf # Loading multiz6way into database # Loaded 5501001 mafs in 1 files from /gbdb/felCat4/multiz6way/maf # # real 4m34.514s # load summary table time nice -n +19 cat /gbdb/felCat4/multiz6way/maf/*.maf \ | hgLoadMafSummary felCat4 -minSize=30000 -verbose=2 \ -mergeGap=1500 -maxSize=200000 multiz6waySummary stdin # Indexing and tabulating stdin # Created 1029636 summary blocks from 15177243 components and 5501001 mafs # from stdin # Loading into felCat4 table multiz6waySummary... # Loading complete # real 3m8.890s ############################################################################ # GAP ANNOTATE MULTIZ6WAY MAF AND LOAD TABLES (DONE 2011-05-09 - Chin) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. # use the # /hive/data/genomes/felCat4/bed/multiz6way/run/felCat4.6way.maf # to split cd /hive/data/genomes/felCat4/bed/multiz6way/ mkdir mafSplit cd mafSplit mafSplit -byTarget -useFullSequenceName /dev/null . ../run/felCat4.6way.maf # Splitting 1 files by target sequence -- ignoring first argument # /dev/null ls -lt | wc -l # 64892 # got 64892 mafs named after their chrom/scaff .maf # although there are over 104054 chroms and scaffolds (wc -l # chrom.sizes), # some are too small or have nothing aligning. mkdir /hive/data/genomes/felCat4/bed/multiz6way/anno cd /hive/data/genomes/felCat4/bed/multiz6way/anno # most of these will already exist from previous multiple # alignments # remove the echo from in front of the twoBitInfo command to get # them # to run if this loop appears to be correct for DB in `cat ../species.list` do CDIR="/hive/data/genomes/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done for DB in `sed -e "s/felCat4 //" ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL mkdir mafSplit for F in ../mafSplit/*.maf do B=`basename ${F}` mafAddIRows -nBeds=nBeds ${F} /hive/data/genomes/felCat4/felCat4.2bit \ mafSplit/${B} echo $B done # about 3h 20m # combine into one file head -q -n 1 mafSplit/chrA1.maf > felCat4.6way.maf for F in mafSplit/*.maf do grep -h -v "^#" ${F} done >> felCat4.6way.maf # these maf files do not have the end marker, this does nothing: # tail -q -n 1 mafSplit/chrA1.maf >> felCat4.6way.maf # How about an official end marker: echo "##eof maf" >> felCat4.6way.maf # Load into database # by loading this into the table multiz6waySummary, it will # replace # the previously loaded table with the unannotated mafs rm /gbdb/felCat4/multiz6way/multiz6way.maf # remove old symlink ln -s `pwd`/felCat4.6way.maf /gbdb/felCat4/multiz6way/multiz6way.maf cd /scratch/tmp time nice -n +19 hgLoadMaf felCat4 multiz6way # Indexing and tabulating /gbdb/felCat4/multiz6way/multiz6way.maf # Loading multiz6way into database # Loaded 6501671 mafs in 1 files from /gbdb/felCat4/multiz6way # # real 3m37.682s time nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 felCat4 multiz6waySummary \ /gbdb/felCat4/multiz6way/multiz6way.maf # Indexing and tabulating /gbdb/felCat4/multiz6way/multiz6way.maf # Created 1029636 summary blocks from 15177243 components and # 6501671 mafs from /gbdb/felCat4/multiz6way/multiz6way.maf # Loading into felCat4 table multiz6waySummary... # Loading complete # real 3m34.271s # by loading this into the table multiz6waySummary, it will # replace # the previously loaded table with the unannotated mafs # remove the multiz6way*.tab files in this /scratch/tmp directory wc -l multiz6way*.tab # 6501671 multiz6way.tab # 1029636 multiz6waySummary.tab rm multiz6way*.tab #################################################################### # HUMAN (hg18) PROTEINS TRACK (???? DONE braney 2010-05-05) # bash if not using bash shell already cd /cluster/data/felCat4 mkdir /cluster/data/felCat4/blastDb awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst felCat4.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 2478 pieces of 2478 written rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst felCat4.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y rm temp.fa less1meg.lst cd blastDb for i in *.fa do /hive/data/outside/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 2765 mkdir -p /cluster/data/felCat4/bed/tblastn.hg18KG cd /cluster/data/felCat4/bed/tblastn.hg18KG echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 2765 query.lst # we want around 350000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\) # 36727/(350000/2765) = 290.143300 mkdir -p kgfa split -l 290 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst wc kg.lst # 127 127 1651 kg.lst mkdir -p blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/felCat4/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/hive/data/outside/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/felCat4/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome exit ssh swarm cd /cluster/data/felCat4/bed/tblastn.hg18KG gensub2 query.lst kg.lst blastGsub blastSpec para create blastSpec # para try, check, push, check etc. para time #Completed: 351155 of 351155 jobs # CPU time in finished jobs: 15263537s 254392.28m 4239.87h 176.66d 0.484 y # IO & Wait Time: 1999082s 33318.04m 555.30h 23.14d 0.063 y # Average job time: 49s 0.82m 0.01h 0.00d # Longest finished job: 128s 2.13m 0.04h 0.00d # Submission to last job: 21587s 359.78m 6.00h 0.25d ssh swarm cd /cluster/data/felCat4/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS ../blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining para create chainSpec para try, check, push, check etc. # Completed: 127 of 127 jobs # CPU time in finished jobs: 548201s 9136.68m 152.28h 6.34d 0.017 y # IO & Wait Time: 87455s 1457.59m 24.29h 1.01d 0.003 y # Average job time: 5005s 83.42m 1.39h 0.06d # Longest finished job: 24309s 405.15m 6.75h 0.28d # Submission to last job: 24318s 405.30m 6.75h 0.28d cd /cluster/data/felCat4/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # checked: 50782 failed: 0 errors: 0 # load table ssh hgwdev cd /cluster/data/felCat4/bed/tblastn.hg18KG hgLoadPsl felCat4 blastHg18KG.psl # check coverage featureBits felCat4 blastHg18KG # 23826621 bases of 1990635005 (1.197%) in intersection featureBits felCat4 blastHg18KG refGene -enrichment # blastHg18KG 1.197%, refGene 0.021%, both 0.013%, cover 1.12%, enrich 52.66x featureBits felCat4 blastHg18KG xenoRefGene -enrichment # blastHg18KG 1.197%, xenoRefGene 2.088%, both 1.011%, cover 84.43%, enrich 40.44x rm -rf blastOut #end tblastn ############################################################################ ## Annotate 6-way multiple alignment with gene annotations ## (DONE 2010-12-15 - Chin) ## rework due to mafAddiRow bug (DONE 2011-05-10 - Chin) # Gene frames ## survey all genomes to see what type of gene track to use ## Rules to decide which genes to use in order: ## 1. ucscGene ## 2. ensGene ## 3. refSeq ## 4. Other include mRNA, nscanGene ## Based on this, # hg19, mm9 ucscGene # monDom5, canFam2: ensGene # ailMel1: mRNA ssh hgwdev mkdir /hive/data/genomes/felCat4/bed/multiz6way/frames cd /hive/data/genomes/felCat4/bed/multiz6way/frames # # survey all the genomes to find out what kinds of gene tracks # they have cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "${db}: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \ $table == "knownGene" || $table == "nscanGene" || $table == "xenoRefGene" ) then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql felCat4 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql felCat4 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh ./showGenes.csh # felCat4: nscanGene: 18804, refGene: 362, xenoRefGene: 229184, Mrnas: 2310 # canFam2: ensGene: 30914, nscanGene: 24615, refGene: 1206, xenoRefGene: 227488, Mrnas: 3486 # ailMel1: ensGene: 25018, xenoRefGene: 217130, Mrnas: 130 # hg19: ensGene: 168482, knownGene: 77614, mgcGenes: 31354, nscanGene: 67448, refGene: 38999, xenoRefGene: 138604, Mrnas: 295814 # mm9: ensGene: 93805, knownGene: 55419, mgcGenes: 26707, nscanGene: 21895, refGene: 28511, xenoRefGene: 134290, Mrnas: 261833 # monDom5: ensGene: 35113, nscanGene: 45946, refGene: 467, xenoRefGene: 230571, Mrnas: 1719 # Based on this, use # 1. knownGenes for hg19, mm9 # 2. ensGene for monDom5, canFam2 # 3. xenoRefGene for ailMel1, # 4. nscanGene for felCat4 mkdir genes # knownGene echo "hg19 mm9" \ | sed -e "s/ */ /g" > knownGene.list for DB in hg19 mm9 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done echo "monDom5 canFam2" \ | sed -e "s/ */ /g" > ensGene.list # ensGene for DB in monDom5 canFam2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # xenoRefGene echo "ailMel1" \ | sed -e "s/ */ /g" > xenoRef.list for DB in ailMel1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # nscanGene echo "felCat4" \ | sed -e "s/ */ /g" > nscan.list for DB in felCat4 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from nscanGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/ailMel1.gp.gz: 20302 # genes/canFam2.gp.gz: 19245 # genes/felCat4.gp.gz: 18804 # genes/hg19.gp.gz: 20597 # genes/mm9.gp.gz: 20905 # genes/monDom5.gp.gz: 19188 ssh hgwdev cd /hive/data/genomes/felCat4/bed/multiz6way/frames cat ../anno/felCat4.6way.maf \ | nice -n +19 genePredToMafFrames felCat4 stdin stdout \ `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.list` \ | gzip > multiz6way.mafFrames.gz # real 12m47.652s # verify there are frames on everything: zcat multiz6way.mafFrames.gz | awk '{print $4}' | sort | uniq -c # 201953 ailMel1 # 189814 canFam2 # 159815 felCat4 # 188339 hg19 # 181433 mm9 # 157198 monDom5 ssh hgwdev cd /hive/data/genomes/felCat4/bed/multiz6way/frames time hgLoadMafFrames felCat4 multiz6wayFrames multiz6way.mafFrames.gz # real 0m9.567s ############################################################################ ## create upstream nscanGene maf files mkdir /hive/data/genomes/felCat4/bed/multiz6way/downloads/maf cd /hive/data/genomes/felCat4/bed/multiz6way/downloads/maf # bash script #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits felCat4 nscanGene:upstreamAll:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags felCat4 multiz6way \ stdin stdout \ -orgs=/hive/data/genomes/felCat4/bed/multiz6way/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done # making upstream1000.maf # done upstream1000.maf.gz # making upstream2000.maf # done upstream2000.maf.gz # making upstream5000.maf # done upstream5000.maf.gz # mv up one level to downloads mv *.* ../ cd .. rm -r maf # download stuff cd /hive/data/genomes/felCat4/bed/multiz6way cp 6way.nh 6way.nh.original cp tree-commas.nh 6way.nh cp 6way.nh downloads/. cp commonNames.6way.nh downloads/. cd downloads # create # /hive/data/genomes/felCat4/bed/multiz6way/downloads/README.txt md5sum * > md5sum.txt mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/felCat4/multiz6way cd /usr/local/apache/htdocs-hgdownload/goldenPath/felCat4/multiz6way ln -s /hive/data/genomes/felCat4/bed/multiz6way/downloads/*.gz . ln -s /hive/data/genomes/felCat4/bed/multiz6way/downloads/*.nh . ln -s /hive/data/genomes/felCat4/bed/multiz6way/downloads/README.txt . ln -s /hive/data/genomes/felCat4/bed/multiz6way/downloads/md5sum.txt . ######################################################################### # Phylogenetic tree from 6-way (DONE 2010-12-16 - Chin) # re-run with phast.2010-12-30 (DONE 2011-01-19 - Chin) # We need one tree for all chroms # use the # /hive/data/genomes/felCat4/bed/multiz6way/run/felCat4.6way.maf # to split cd /hive/data/genomes/felCat4/bed/multiz6way/ mkdir mafSplit cd mafSplit mafSplit -byTarget -useFullSequenceName /dev/null . ../run/felCat4.6way.maf # got 64892 mafs named after their chrom/scaff .maf # although there are over 104054 chroms and scaffolds (wc -l # chrom.sizes), # some are too small or have nothing aligning. mkdir /hive/data/genomes/felCat4/bed/multiz6way/4d cd /hive/data/genomes/felCat4/bed/multiz6way/4d # felCat4 does not have refGene; but has # 237116 xenoRefGene and 28723 nscanGene. # use nscanGene hgsql felCat4 -Ne \ "select * from nscanGene;" \ | cut -f 2-20 > nscanGene.gp wc -l nscanGene.gp # 18804 nscanGene.gp # make sure no redundent gene name cat nscanGene.gp | awk '{print $1}' | sort | uniq | wc -l # 18804 genePredSingleCover nscanGene.gp stdout | sort > nscanGeneNR.gp wc -l nscanGeneNR.gp # 18804 nscanGeneNR.gp ssh memk mkdir /hive/data/genomes/felCat4/bed/multiz6way/4d/run cd /hive/data/genomes/felCat4/bed/multiz6way/4d/run mkdir ../mfa # whole chrom mafs version, using new version of # uses memory-efficient version of phast, from Melissa Hubisz at Cornell # mjhubisz at gmail.com # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/felCat4/bed/multiz6way" set c = $1 set infile = $r/mafSplit/$2 set outfile = $3 cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/4d/nscanGene.gp > $c.gp.tmp sed "s/\t$c\t/\tfelCat4.$c\t/g" $c.gp.tmp > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile else echo "" > $r/4d/run/$outfile endif rm -f $c.gp.tmp $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh cd /hive/data/genomes/felCat4/bed/multiz6way/mafSplit ls -1S *.maf | \ grep -E -v "chrUn" \ | sed -e "s#.*multiz6way/mafSplit/##" \ > /hive/data/genomes/felCat4/bed/multiz6way/4d/run/maf.list cd /hive/data/genomes/felCat4/bed/multiz6way/4d/run cat << '_EOF_' > template #LOOP 4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs # 8g on swarm is still not enough, run it on memk. # chrA1 need 10+ cd ../GB ssh memk cd /hive/data/genomes/felCat4/bed/multiz6way/4d/run gensub2 maf.list single template stdout | tac > jobList para -ram=8g create jobList para try ... check ... push ... etc para time # Completed: 20 of 20 jobs # CPU time in finished jobs: 852s 14.20m 0.24h 0.01d 0.000 y # IO & Wait Time: 114s 1.90m 0.03h 0.00d 0.000 y # Average job time: 48s 0.80m 0.01h 0.00d # Longest finished job: 133s 2.22m 0.04h 0.00d # Submission to last job: 515s 8.58m 0.14h 0.01d # combine mfa files ssh hgwdev cd /hive/data/genomes/felCat4/bed/multiz6way/4d # but first clean out junk 1-byte leftovers from above process. # Only 20 (real chrom) out of i104054 files have real data. cd mfa find -type f -size 1c | xargs -iX rm X find -type f -size 0c | xargs -iX rm X cd /hive/data/genomes/felCat4/bed/multiz6way/4d #want comma-less species.list # again, use 2010-04-06 version /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # check they are all in there: grep "^>" 4d.all.mfa # >felCat4 # >canFam2 # >ailMel1 # >hg19 # >mm9 # >monDom5 # tree-commas.nh # (((felCat4,(canFam2,ailMel1)),(hg19,mm9)),monDom5) # use phyloFit to create tree model (output is phyloFit.mod) time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree ../tree-commas.nh 4d.all.mfa mv phyloFit.mod all.mod # Reading alignment from 4d.all.mfa ... # Extracting sufficient statistics ... # Compacting sufficient statistics ... # Fitting tree model to 4d.all.mfa using REV ... # Writing model to phyloFit.mod ... # Done. # # real 0m2.879s ############################################################################ # LIFTOVER TO FelCat3 (DONE - 2010-08-23 - Chin ) mkdir /hive/data/genomes/felCat4/bed/blat.felCat3.2010-08-25 cd /hive/data/genomes/felCat4/bed/blat.felCat3.2010-08-25 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug -ooc /scratch/data/felCat4/felCat4.11.ooc \ felCat4 felCat3 > debug.log 2>&1 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /scratch/data/felCat4/felCat4.11.ooc \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ felCat4 felCat3 > do.log 2>&1 & # real 274m17.792s # run.blat: # para time # 36381 jobs in batch # 210 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # Completed: 36381 of 36381 jobs # CPU time in finished jobs:7661953s 127699.22m 2128.32h 88.68d 0.243 y # IO & Wait Time: 1536782s 25613.03m 426.88h 17.79d 0.049 y # Average job time: 253s 4.21m 0.07h 0.00d # Longest finished job: 691s 11.52m 0.19h 0.01d # Submission to last job: 13072s 217.87m 3.63h 0.15d # run.chain: # para time # 36381 jobs in batch # 2 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # Completed: 36381 of 36381 jobs # CPU time in finished jobs: 157730s 2628.84m 43.81h 1.83d 0.005 y # IO & Wait Time: 127914s 2131.89m 35.53h 1.48d 0.004 y # Average job time: 8s 0.13m 0.00h 0.00d # Longest finished job: 520s 8.67m 0.14h 0.01d # Submission to last job: 610s 10.17m 0.17h 0.01d # pushQ stuff cd /usr/local/apache/htdocs-hgdownload/goldenPath/felCat4/liftOver/ md5sum felCat4ToFelCat3.over.chain.gz >> md5sum.txt ############################################################################ ## Fix gold/gap tables (DONE - 2010-09-22 -Chin) # the hgGoldGapGl program has been fixed for chrom names > 16 characters mkdir /hive/data/genomes/felCat4/bed/goldGap cd /hive/data/genomes/felCat4/bed/goldGap # record table status before loading anything: hgsql -e "show table status;" felCat4 > before.status # verify SQL create statements with -noLoad option: hgGoldGapGl -noLoad -verbose=2 -noGl \ felCat4 /hive/data/genomes/felCat4/felCat4.agp # if that looks OK, load the two tables: hgGoldGapGl -verbose=2 -noGl \ felCat4 /hive/data/genomes/felCat4/felCat4.agp # verify only gold and gap tables have changed hgsql -e "show table status;" felCat4 > after.status # it appears only the gold table actually changed diff before.status after.status < gap MyISAM 10 Dynamic 501801 44 2210626 > gap MyISAM 10 Dynamic 500507 44 22052384 < gold MyISAM 10 Dynamic 506133 47 24249424 > gold MyISAM 10 Dynamic 604561 49 29761392 # gold and gap tables together now cover the entire genome: featureBits -countGaps -or felCat4 gold gap # 3160303948 bases of 3160303948 (100.000%) in intersection # Load the gaps that are not in the felCat4.agp file cd /hive/data/genomes/felCat4/bed/allGaps hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad felCat4 otherGap other.gap # Reading other.gap # Loaded 1294 elements of size 8 # Sorted # Saving bed.tab # No load option selected, see file: bed.tab wc -l bed.tab # 1294 bed.tab # starting with this many hgsql -e "select count(*) from gap;" felCat4 # 500507 hgsql felCat4 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" felCat4 # 501801 # == 500507 + 1294 ############################################################################# # N-SCAN gene predictions (nscanGene) - (2010-11-02 markd) # obtained NSCAN predictions from Jeltje van Baren , cd /hive/data/genomes/felCat4/bed/nscan cp /hive/users/jeltje/felis_Catus/for_browser/{felCat4.gtf,felCat4.prot.fa,readme.txt} . wget -nv http://mblab.wustl.edu/predictions/marmoset/felCat4/readme.txt wget -nv http://mblab.wustl.edu/predictions/marmoset/felCat4/felCat4.prot.fa wget -nv http://mblab.wustl.edu/predictions/marmoset/felCat4/felCat4.gtf gzip felCat4.* chmod a-w * # load track gtfToGenePred -genePredExt felCat4.gtf.gz stdout | hgLoadGenePred -genePredExt felCat4 nscanGene stdin hgPepPred felCat4 generic nscanPep felCat4.prot.fa.gz rm *.tab # cat/felCat4/trackDb.ra, add: track nscanGene override informant Cat N-SCAN used human (hg19) as the informant for conservation. # veryify top-level search spec, should produce no results: hgsql -Ne 'select name from nscanGene' felCat4 | egrep -v '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |head ######################################################################### # phastCons 6-way (DONE - 2011-01-20 - Chin) ssh swarm mkdir -p /hive/data/genomes/felCat4/bed/multiz6way/cons/SS cd /hive/data/genomes/felCat4/bed/multiz6way/cons/SS cat << '_EOF_' > mkSS.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/felCat4/bed/multiz6way/mafSplit/$c.maf set WINDOWS = /hive/data/genomes/felCat4/bed/multiz6way/cons/SS/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x mkSS.csh cat << '_EOF_' > template #LOOP mkSS.csh $(root1) {check out line+ $(root1).done} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../../mafSplit | sed -e "s/.maf//" > maf.list gensub2 maf.list single template jobList para -ram=8g create jobList para try ... check ... etc # Completed: 1072 of 1072 jobs # CPU time in finished jobs: 1029s 17.14m 0.29h 0.01d # 0.000 y # IO & Wait Time: 3083s 51.39m 0.86h 0.04d # 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest finished job: 63s 1.05m 0.02h 0.00d # Submission to last job: 227s 3.78m 0.06h 0.00d find . -type f | grep ".ss$" | wc # 1082 1082 36982 # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh swarm mkdir -p /hive/data/genomes/felCat4/bed/multiz6way/cons/run.cons cd /hive/data/genomes/felCat4/bed/multiz6way/cons/run.cons # there are going to be only one phastCons run using # this same script. It triggers off of the current working # directory # $cwd:t which is the "grp" in this script. It is: # all cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set cX = $1:r set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/felCat4/bed/multiz6way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons set useGrp = "$grp.mod" ln -s $ssSrc/SS/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp popd > /dev/null mkdir -p pp/$c bed/$c sleep 4 touch pp/$c bed/$c rm -f pp/$c/$f.pp rm -f bed/$c/$f.bed mv $tmp/$f.pp pp/$c mv $tmp/$f.bed bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs find ../SS -type f | grep ".ss$" | sed -e 's/.ss$//' > ss.list wc -l ss.list # 51064 ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/felCat4/bed/multiz6way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... para -maxJob=64 push # Completed: 51064 of 51064 jobs # CPU time in finished jobs: 6246s 104.11m 1.74h 0.07d 0.000 y # IO & Wait Time: 594205s 9903.41m 165.06h 6.88d 0.019 y # Average job time: 12s 0.20m 0.00h 0.00d # Longest finished job: 40s 0.67m 0.01h 0.00d # Submission to last job: 2454s 40.90m 0.68h 0.03d ssh hgwdev cd /hive/data/genomes/felCat4/bed/multiz6way/cons/all find ./bed -type f | grep ".bed$" | xargs cat | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # load into database time nice -n +19 hgLoadBed felCat4 phastConsElements6way mostConserved.bed # Reading mostConserved.bed # Loaded 1060573 elements of size 5 # Sorted # Creating table definition for phastConsElements6way # Saving bed.tab # Loading felCat4 # # real 0m8.657s # not sure what the targets should be for Cat, on human we # often # try for 5% overall cov, and 70% CDS cov featureBits felCat4 -enrichment refGene:cds phastConsElements6way # --rho 0.3 --expected-length 45 --target-coverage 0.3 # refGene:cds 0.017%, phastConsElements6way 6.402%, both 0.011%, # cover 63.01%, enrich 9.84x featureBits felCat4 -enrichment nscanGene:cds phastConsElements6way # nscanGene:cds 1.391%, phastConsElements6way 6.402%, both 0.908%, # cover 65.30%, enrich 10.20x # hg19 for comparison featureBits hg19 -enrichment refGene:cds phastConsElements46way # refGene:cds 1.196%, phastConsElements46way 5.065%, # both 0.888%, cover 74.22%, enrich 14.65x # ensGene:cds 1.278%, phastConsElements46way 5.065%, # both 0.910%, cover 71.23%, enrich 14.06x # knownGene:cds 1.252%, phastConsElements46way 5.065%, # both 0.905%, cover 72.29%, enrich 14.27x # Create merged posterier probability file and wiggle track data # files cd /hive/data/genomes/felCat4/bed/multiz6way/cons/all mkdir downloads find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phastCons6way.wigFix.gz # check integrity of data with wigToBigWig zcat downloads/phastCons6way.wigFix.gz \ | wigToBigWig stdin /hive/data/genomes/felCat4/chrom.sizes \ phastCons6way.bw bigWigInfo phastCons6way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 3,167,968,203 # primaryIndexSize: 69,120,232 # zoomLevels: 10 # chromCount: 50801 # basesCovered: 1,636,796,497 # mean: 0.158595 # min: 0.000000 # max: 1.000000 # std: 0.269059 # encode those files into wiggle data zcat downloads/phastCons6way.wigFix.gz \ | wigEncode stdin phastCons6way.wig phastCons6way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 du -hsc *.wi? # 1.6G phastCons6way.wib # 211M phastCons6way.wig # 1.8G total # Load gbdb and database with wiggle. ln -s `pwd`/phastCons6way.wib /gbdb/felCat4/multiz6way/phastCons6way.wib nice hgLoadWiggle -pathPrefix=/gbdb/felCat4/multiz6way felCat4 \ phastCons6way phastCons6way.wig # Connected to database felCat4 for track phastCons6way # Creating wiggle table definition in felCat4.phastCons6way # Saving wiggle.tab # Loading felCat4 # use to set trackDb.ra entries for wiggle min and max wigTableStats.sh felCat4 phastCons6way # db.table min max mean count sumData stdDev viewLimits # felCat4.phastCons6way 0 1 0.158595 1636796497 2.59588e+08 0.269059 # viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=felCat4 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons6way > histogram.data 2>&1 # real 4m16.548s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Cat felCat4 Histogram phastCons6way track" set xlabel " phastCons6way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# # phyloP for 6-way (DONE - 2011-01-21 - Chin) # run phyloP with score=LRT ssh swarm mkdir /cluster/data/felCat4/bed/multiz6way/consPhyloP cd /cluster/data/felCat4/bed/multiz6way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to # be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.573 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/all/all.mod 0.573 > all.mod # following the pattern from hg19 with only one grp: "all" cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set file1 = $f:t set out = $2 set cName = $f:t:r set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/felCat4/bed/multiz6way/consPhyloP set tmp = $cons/tmp/$grp/$f rm -fr $tmp mkdir -p $tmp set ssSrc = "/hive/data/genomes/felCat4/bed/multiz6way/cons/SS/$f" set useGrp = "$grp.mod" ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $file1.wigFix popd > /dev/null mkdir -p $out:h sleep 4 mv $tmp/$file1.wigFix $out rm -fr $tmp rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$cName rmdir --ignore-fail-on-non-empty $cons/tmp/$grp rmdir --ignore-fail-on-non-empty $cons/tmp '_EOF_' # << happy emacs chmod +x doPhyloP.csh # Create list of chunks find ../../cons/SS -type f | grep ".ss$" \ | sed -e "s/.ss$//; s#^../../cons/SS/##" > ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/felCat4/bed/multiz6way/consPhyloP/all cd /hive/data/genomes/felCat4/bed/multiz6way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para -ram=8g create jobList para try ... check ... etc ... para -maxJob=64 push para time > run.time # Completed: 51064 of 51064 jobs # CPU time in finished jobs: 2956s 49.26m 0.82h 0.03d 0.000 y # IO & Wait Time: 682189s 11369.82m 189.50h 7.90d 0.022 y # Average job time: 13s 0.22m 0.00h 0.00d # Longest finished job: 35s 0.58m 0.01h 0.00d # Submission to last job: 14468s 241.13m 4.02h 0.17d # make downloads mkdir downloads find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phyloP6way.wigFix.gz # check integrity of data with wigToBigWig export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG zcat downloads/phyloP6way.wigFix.gz \ | wigToBigWig stdin /hive/data/genomes/felCat4/chrom.sizes phyloP6way.bw bigWigInfo phyloP6way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 2,502,607,434 # primaryIndexSize: 69,120,232 # zoomLevels: 10 # chromCount: 50801 # basesCovered: 1,636,796,497 # mean: 0.092835 # min: -3.244000 # max: 1.099000 # std: 0.593643 # encode those files into wiggle data zcat downloads/phyloP6way.wigFix.gz \ | wigEncode stdin phyloP6way.wig phyloP6way.wib # Converted stdin, upper limit 1.10, lower limit -3.24 du -hsc *.wi? # 1.6G phyloP6way.wib # 214M phyloP6way.wig # 1.8G total # Load gbdb and database with wiggle. ln -s `pwd`/phyloP6way.wib /gbdb/felCat4/multiz6way/phyloP6way.wib nice hgLoadWiggle -pathPrefix=/gbdb/felCat4/multiz6way felCat4 \ phyloP6way phyloP6way.wig # Connected to database felCat4 for track phyloP6way # Creating wiggle table definition in felCat4.phyloP6way # Saving wiggle.tab # Loading felCat4 # use to set trackDb.ra entries for wiggle min and max wigTableStats.sh felCat4 phyloP6way # db.table min max mean count sumData stdDev viewLimits # felCat4.phyloP6way -3.244 1.099 0.0928346 1636796497 1.51951e+08 # 0.593643 viewLimits=-2.87538:1.099 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=felCat4 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phyloP6way > histogram.data 2>&1 # real 2m40.590s # create plot of histogram: # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Cat felCat4 Histogram phyloP6way track" set xlabel " phyloP6way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.002] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# # download data for 6-way (DONE - 2011-01-26 - Chin) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/felCat4/phastCons6way cd /usr/local/apache/htdocs-hgdownload/goldenPath/felCat4/phastCons6way ln -s \ /hive/data/genomes/felCat4/bed/multiz6way/cons/all/downloads/phastCons6way.wigFix.gz \ ./phastCons6way.wigFix.gz ln -s /hive/data/genomes/felCat4/bed/multiz6way/cons/all/phastCons6way.bw \ ./phastCons6way.bw ln -s /hive/data/genomes/felCat4/bed/multiz6way/cons/all/all.mod ./all.mod ln -s /hive/data/genomes/felCat4/bed/multiz6way/cons/README.txt . md5sum * > /hive/data/genomes/felCat4/bed/multiz6way/cons/md5sum.txt ln -s /hive/data/genomes/felCat4/bed/multiz6way/cons/md5sum.txt . mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/felCat4/phyloP6way/ cd /usr/local/apache/htdocs-hgdownload/goldenPath/felCat4/phyloP6way ln -s /hive/data/genomes/felCat4/bed/multiz6way/consPhyloP/all/downloads/phyloP6way.wigFix.gz \ ./phyloP6way.wigFix.gz ln -s /hive/data/genomes/felCat4/bed/multiz6way/consPhyloP/all/phyloP6way.bw ./phyloP6way.bw ln -s /hive/data/genomes/felCat4/bed/multiz6way/consPhyloP/run.phyloP/all.mod ./all.mod ln -s /hive/data/genomes/felCat4/bed/multiz6way/consPhyloP/README.txt . md5sum * > /hive/data/genomes/felCat4/bed/multiz6way/consPhyloP/md5sum.txt ln -s /hive/data/genomes/felCat4/bed/multiz6way/consPhyloP/md5sum.txt . # multiz stuff cd /hive/data/genomes/felCat4/bed/multiz6way /cluster/bin/phast/tree_doctor --rename \ "felCat4->Cat; canFam2->Dog; ailMel1->Panda; hg19->Human; mm9->Mouse; monDom5->Opossum" \ 6way.nh > commonNames.6way.nh mkdir /hive/data/genomes/felCat4/bed/multiz6way/downloads cd /hive/data/genomes/felCat4/bed/multiz6way/anno gzip felCat4.6way.maf cp felCat4.6way.maf.gz ../downloads/ gzip -d felCat4.6way.maf.gz cd /usr/local/apache/htdocs-hgdownload/goldenPath/felCat4/multiz6way ln -s /hive/data/genomes/felCat4/bed/multiz6way/6way.nh . ln -s /hive/data/genomes/felCat4/bed/multiz6way/commonNames.6way.nh . ln -s /hive/data/genomes/felCat4/bed/multiz6way/downloads/felCat4.6way.maf.gz . ln -s /hive/data/genomes/felCat4/bed/multiz6way/README.txt . md5sum * > /hive/data/genomes/felCat4/bed/multiz6way/downloads/md5sum.txt ln -s /hive/data/genomes/felCat4/bed/multiz6way/downloads/md5sum.txt . ############################################################################# # LIFTOVER TO FelCat5 (DONE - 2012-07-23 - Hiram ) mkdir /hive/data/genomes/felCat4/bed/blat.felCat5.2012-07-23 cd /hive/data/genomes/felCat4/bed/blat.felCat5.2012-07-23 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug \ -ooc /hive/data/genomes/felCat4/jkStuff/felCat4.11.ooc felCat4 felCat5 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /hive/data/genomes/felCat4/jkStuff/felCat4.11.ooc \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ felCat4 felCat5 > do.log 2>&1 # real 58m58.107s #############################################################################