# $Id: strPur2.txt,v 1.5 2008/07/10 15:25:17 kord Exp $ # $Source: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/strPur2.txt,v $ # Strongylocentrotus purpuratus -- Spur 2.1 assembly September, 2006 # # ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur_v2.1/ ############################################################################### # DOWNLOAD SEQUENCE - DONE 2/12/2007 Kord # - select store # - basic directory setup ssh kkstore06 mkdir -p /cluster/store4/strPur2/downloads ln -s /cluster/store4/strPur2 /cluster/data/strPur2 cd /cluster/data/strPur2/downloads wget -r -np ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur_v2.1/ # decompress contig files cd /cluster/data/strPur2/downloads/Spur_v2.1/contigs gunzip *gz # Move to working directory mkdir -p /cluster/data/strPur2/fixup cd /cluster/data/strPur2/fixup cp /cluster/data/strPur2/downloads/Spur_v2.1/contigs/* . ############################################################################### # PREP AGP/FASTA/QUAL files - DONE 2/19/2007 Kord # Remove the "BCM_Spur_v2.1_" from the scaffold name in the AGP file sed 's/BCM_Spur_v2.1_//g' BCM_Spur_v2.1.agp > strPur2.agp # trimHeader in fasta and qual files ~/kent/src/hg/snp/snpLoad/trimHeader Spur_v2.1.contigs.fa sed 's/>[a-z]*|[0-9]*|[a-z]*|/>/g' trimHeader.out > strPur2.contigs.fa gzip strPur2.contigs.fa ~/kent/src/hg/snp/snpLoad/trimHeader Spur_v2.1.contigs.fa.qual mv trimHeader.out strPur2.contigs.fa.qual gzip strPur2.contigs.fa.qual ############################################################################### # MAKE GENOME DB - PREP 2/19/2007 Kord # Obtained the commonName value $ hgsql hgcentraltest Welcome to the MySQL monitor. Commands end with ; or \g. Your MySQL connection id is 38380828 to server version: 4.0.27-standard-log Type 'help;' or '\h' for help. Type '\c' to clear the buffer. 
mysql> show tables; +-------------------------+ | Tables_in_hgcentraltest | +-------------------------+ | blatServers | | clade | | dbDb | | dbDbBak | | dbDbNew | | defaultDb | | gdbPdb | | genomeClade | | genomeCladeTest | | liftOverChain | | liftOverChainBackup | | namedSessionDb | | sessionDb | | userDb | | userDbApr12 | +-------------------------+ 15 rows in set (0.00 sec) mysql> show columns from genomeClade; +----------+--------------+------+-----+---------+-------+ | Field | Type | Null | Key | Default | Extra | +----------+--------------+------+-----+---------+-------+ | genome | varchar(255) | | | | | | clade | varchar(255) | | | | | | priority | float | | | 0 | | +----------+--------------+------+-----+---------+-------+ 3 rows in set (0.00 sec) mysql> select * from genomeClade; +-----------------------------------------+--------------+----------+ | genome | clade | priority | +-----------------------------------------+--------------+----------+ ... | S. purpuratus | deuterostome | 20 | ... +-----------------------------------------+--------------+----------+ 127 rows in set (0.00 sec) mysql> quit; Bye $ cat /cluster/home/kord/urchin/fixup/strPur2.config.ra # Config parameters for makeGenomeDb.pl: db strPur2 scientificName Strongylocentrotus purpuratus assemblyDate Sep. 2006 clade deuterostome assemblyLabel Baylor release 3 Spur 2.1 orderKey 880 # GenBank:X12631 gi:296545 mitoAcc 296545 fastaFiles /cluster/data/strPur2/fixup/strPur2.contigs.fa.gz dbDbSpeciesDir urchin # Optional settings commonName S. 
purpuratus agpFiles /cluster/data/strPur2/fixup/strPur2.agp qualFiles /cluster/data/strPur2/fixup/strPur2.contigs.fa.qual.gz # verify config.ra file works $ /cluster/home/kord/kent/src/hg/utils/automation/makeGenomeDb.pl -workhorse kkstore06 -debug ./strPur2.config.ra # run it (I did this in a screen) $ ssh kkstore06 $ ~/kent/src/hg/utils/automation/makeGenomeDb.pl -workhorse kkstore06 ./strPur2.config.ra \ | tee -a makeGenomeDb.pl.log # This generated an error with output files in /tmp # There are 2654 sequences in the fasta file not included in the AGP file. # Use faSomeRecords to create a fasta file of only the sequences contained in # the AGP file $ cp makeGenomeDb.20070220/makeGenomeDb.agpIds.Ay8953 strPur2.AGPlist $ faSomeRecords strPur2.contigs.fa strPur2.AGPlist strPur2.contigs-AGPlist.fa $ faSomeRecords strPur2.contigs.fa.qual strPur2.AGPlist strPur2.contigs-AGPlist.fa.qual # I verified that the diff generated from the original FASTA file and the newly generated # FASTA file (AGP list) generated the correct sequence number (2654) and id # compressed the AGP and fasta file $ gzip strPur2.contigs-AGPlist.fa strPur2.contigs.fa.qual # updated the config.ra file $ vi strPur2.config.ra # Config parameters for makeGenomeDb.pl: db strPur2 scientificName Strongylocentrotus purpuratus assemblyDate Sep. 2006 clade deuterostome assemblyLabel Baylor release 3 Spur 2.1 orderKey 880 # GenBank:X12631 gi:296545 mitoAcc 296545 fastaFiles /cluster/data/strPur2/fixup/strPur2.contigs-AGPlist.fa.gz dbDbSpeciesDir urchin # Optional settings commonName S. 
purpuratus agpFiles /cluster/data/strPur2/fixup/strPur2.agp qualFiles /cluster/data/strPur2/fixup/strPur2.contigs.fa.qual.gz # run it (I did this in a screen) $ ssh kkstore06 $ nice ~/kent/src/hg/utils/automation/makeGenomeDb.pl -workhorse kkstore06 ./strPur2.config.ra \ | tee -a makeGenomeDb.pl.log2 # this turned out two errors: # (1) unexpected coordinates of fragments: length 1 (one) # (2) unable to find chromosome size $ mv strPur2.agp strPur2.agp.scaffoldname $ agpCondense strPur2.agp.scaffoldname strPur2.agp.condense $ mv strPur2.agp.condense strPur2.agp # strPur2.agp: # Scaffold18963 20393 24883 10 W AAGJ02000001 944 5434 - # # strPur2.contigs.fa # >AAGJ02000001 # GTTGACATGACCCTAGCTACTGTCCCTACGGACTATAGCTCCATAGCCCAAATGATTCCATTTGCTATCT # AGTGGATTCAATGGCCATATTAAATGGTACAAGGGCCCACAATCTGGTTCTGTCTTCTTCTTTTTTTAGG # ... # # >AAGJ02000001 # 20 60 60 57 53 60 59 63 63 63 58 58 58 54 54 59 63 54 63 58 # 59 63 53 63 63 63 58 58 57 57 63 63 58 58 63 57 57 59 63 63 # 53 53 53 52 63 59 63 63 58 58 60 58 60 63 63 57 60 57 52 53 # ... 
$ gzip strPur2.contigs.fa.qual $ gzip strPur2.contigs.fa $ ssh kkstore06 $ nice ~/kent/src/hg/utils/automation/makeGenomeDb.pl -continue db -workhorse kkstore06 \ ./strPur2.config.ra | tee -a makeGenomeDb.pl.log3 2>&1 # loading Gold/Gap manually $ ssh hgwdev $ time nice hgGoldGapGl -noGl strPur2 strPur2.agp >hgGoldGapGl.log1 2>&1 real 0m6.283s user 0m0.565s sys 0m0.070s # as with strPur1, the indices are not built correctly, so they need to be # rebuilt $ time nice hgsql strPur2 -e 'analyze table gold; analyze table gap;' +--------------+---------+----------+----------+ | Table | Op | Msg_type | Msg_text | +--------------+---------+----------+----------+ | strPur2.gold | analyze | status | OK | +--------------+---------+----------+----------+ +-------------+---------+----------+----------+ | Table | Op | Msg_type | Msg_text | +-------------+---------+----------+----------+ | strPur2.gap | analyze | status | OK | +-------------+---------+----------+----------+ real 0m0.298s user 0m0.000s sys 0m0.004s # # Starting over, I have renamed the existing table so I can re-run # makeGenomeDb # mysql> show tables; +--------------------+ | Tables_in_strPur2 | +--------------------+ | chromInfo_20070312 | | gap_20070312 | | gc5Base_20070312 | | gold_20070312 | | grp_20070312 | | history_20070312 | | quality_20070312 | +--------------------+ 7 rows in set (0.00 sec) # moved previous runs to the side $ cd /cluster/data/strPur2 $ mv M M_20070223 $ mv chrom.sizes chrom.sizes_20070223 # $ ssh kkstore06 $ cd /cluster/data/strPur2/fixup $ nice ~/kent/src/hg/utils/automation/makeGenomeDb.pl -workhorse kkstore06 ./strPur2.config.ra > makeGenomeDb.log4 2>&1 $ nice ~/kent/src/hg/utils/automation/makeGenomeDb.pl -continue db -workhorse kkstore06 ./strPur2.config.ra >> makeGenomeDb.log4 # 2007/Mar/14 - Kord # Heather and I did a sanity check on the strPur2 database and feel the tables # have been updated correctly, only the qual table isn't correct # From here I have removed the qual line 
from the .ra file and continued the # rest of the initial setup and will deal with the quality files later on. $ ssh kkstore06 $ nice ~/kent/src/hg/utils/automation/makeGenomeDb.pl -continue dbDb -workhorse kkstore06 ./strPur2.config.ra >> makeGenomeDb.log5 2>&1 ######################################################################################## # REPEATMASKER (COMPLETED 2007/Mar/22 Kord) # verify the species name $ /cluster/bluearc/RepeatMasker/util/queryRepeatDatabase.pl -species Strongylocentrotus -stat # queryRepeatDatabase # =================== # RepeatMasker Database: RepeatMaskerLib.embl # Version: 20061006 # Species: Strongylocentrotus ( strongylocentrotus ) # >IS1#ARTEFACT/ Length = 768 bp # >IS2#ARTEFACT/ Length = 1331 bp # >IS3#ARTEFACT/ Length = 1258 bp # ... # >Polinton-4_SP#DNA/Maverick Length = 15575 bp # >Polinton-5_SP#DNA/Maverick Length = 16525 bp # >CR1-21_SP#LINE/L2 Length = 4603 bp # 176 ancestral and ubiquitous sequence(s) with a total length of 52708 bp # 99 lineage specific sequence(s) with a total length of 224080 bp # -------------------------------------------------------------------------------- # 275 sequence(s) with a total length of 276788 bp # Run -debug to create the dir structure and preview the scripts: $ ssh kkstore06 $ ~/kent/src/hg/utils/automation/doRepeatMasker.pl strPur2 -verbose 3 -debug -fileserver kkstore06 # run it for real $ nice ~/kent/src/hg/utils/automation/doRepeatMasker.pl strPur2 -verbose 3 -fileserver kkstore06 -workhorse kolossus > RepeatMasker.log1 2>&1 # fixed umask issue and did a chmod g+w on /cluster/data/strPur2 and home # sub-directories, added ~hiram/.bashrc.hiram content to my .bashrc to make # sure I had the appropriate environment variables (e.g. 
$HOST) # run it for real again $ nice ~/kent/src/hg/utils/automation/doRepeatMasker.pl strPur2 -verbose 3 > RepeatMasker.log2 2>&1 # it appears one node in kk is responsible for all the failures: $ ssh kk $ cd /cluster/data/strPur2/bed/RepeatMasker.2007-03-16/run.cluster $ para problems | grep ^host | sort | uniq -c # 2257 jobs in batch # 14081 jobs (including everybody's) in Parasol queue. # Checking finished jobs # 6555 host: kkr4u02.kilokluster.ucsc.edu # we removed the node and re-ran the jobs. $ para push -retries=5 # make some links and files to continue doRepeatMasker $ ssh kk $ cd /cluster/data/strPur2/bed/ $ ln -s RepeatMasker.2007-03-16 RepeatMasker.2007-03-22 $ date > /cluster/data/strPur2/bed/RepeatMasker.2007-03-22/run.cluster/run.time $ cd /cluster/data/strPur2/fixup $ nice ~/kent/src/hg/utils/automation/doRepeatMasker.pl -continue cat strPur2 -verbose 3 > RepeatMasker.log3 2>&1 & # Coverage $ ssh hgwdev $ featureBits strPur2 rmsk #115258247 bases of 810038660 (14.229%) in intersection featureBits strPur2 rmsk simpleRepeat #21752322 bases of 810038660 (2.685%) in intersection ######################################################################################## # SIMPLE REPEATS (TRF) (DONE 2007/Mar/14 Kord) $ ssh kkr1u00 $ mkdir /cluster/data/strPur2/bed/simpleRepeat $ time twoBitToFa ../../strPur2.unmasked.2bit stdout | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ > -bedAt=simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 # Complete in approx. 
6 hours # Make a filtered version for sequence masking: $ awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed $ splitFileByColumn trfMask.bed trfMaskChrom # Load unfiltered repeats into the database: $ hgLoadBed strPur2 simpleRepeat \ > /cluster/data/strPur2/bed/simpleRepeat/simpleRepeat.bed \ > -sqlTable=/cluster/home/kord/kent/src/hg/lib/simpleRepeat.sql # Coverage $ featureBits strPur2 simpleRepeat 50692153 bases of 810038660 (6.258%) in intersection ######################################################################################## # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 2007/MAR/22 Kord) $ ssh kolossus $ cd /cluster/data/strPur2 $ time twoBitMask strPur2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed strPur2.2bit # Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it # might contain block coordinates, but this program uses # only the first three fields (the entire span -- no support for blocks). # real 0m10.698s # user 0m1.871s # sys 0m1.678s # Link to it from /gbdb: $ ssh hgwdev $ ln -s /cluster/data/strPur2/strPur2.2bit /gbdb/strPur2/strPur2.2bit ######################################################################################## # BLAT SERVER (STARTED,DONE 2007/MAY/17 Kord) # Sent request w/2bit paths to cluster-admin. # Per Victoria: # strPur2 has been started on blat13 # translated port on: 17780 # untranslated on: 17781 # Added entries into hgcentraltest db # [kord@hgwdev /cluster/data/strPur2] hgsql hgcentraltest Welcome to the MySQL monitor. Commands end with ; or \g. Your MySQL connection id is 40702725 to server version: 4.0.27-standard-log Type 'help;' or '\h' for help. Type '\c' to clear the buffer. 
mysql> insert into blatServers (db, host, port, isTrans, canPcr) values ("strPur 2", "blat13", 17780, 1, 0); Query OK, 1 row affected (0.00 sec) mysql> insert into blatServers (db, host, port, isTrans, canPcr) values ("strPur 2", "blat13", 17781, 0, 1); Query OK, 1 row affected (0.00 sec) mysql> select * from blatServers where ( db='strpur2' or db='strpur 2'); +----------+--------+-------+---------+--------+ | db | host | port | isTrans | canPcr | +----------+--------+-------+---------+--------+ | strPur 2 | blat13 | 17781 | 0 | 1 | | strPur 2 | blat13 | 17780 | 1 | 0 | | strPur2 | blat13 | 17781 | 0 | 1 | | strPur2 | blat13 | 17780 | 1 | 0 | +----------+--------+-------+---------+--------+ 4 rows in set (0.00 sec) mysql> select * from blatServers where ( db='strpur 2'); +----------+--------+-------+---------+--------+ | db | host | port | isTrans | canPcr | +----------+--------+-------+---------+--------+ | strPur 2 | blat13 | 17781 | 0 | 1 | | strPur 2 | blat13 | 17780 | 1 | 0 | +----------+--------+-------+---------+--------+ 2 rows in set (0.00 sec) mysql> delete from blatServers where ( db='strpur 2'); Query OK, 2 rows affected (0.00 sec) mysql> select * from blatServers where ( db='strpur2' or db='strpur 2'); +---------+--------+-------+---------+--------+ | db | host | port | isTrans | canPcr | +---------+--------+-------+---------+--------+ | strPur2 | blat13 | 17781 | 0 | 1 | | strPur2 | blat13 | 17780 | 1 | 0 | +---------+--------+-------+---------+--------+ 2 rows in set (0.00 sec) ######################################################################################## # MAKE DOWNLOADABLE / GOLDENPATH FILES (STARTED 2007/MAY/17 Kord) # Completed 2007/Aug/20 $ cd /cluster/data/strPur2 $ ln -s /cluster/data/strPur2/bed/RepeatMasker.2007-03-22/strPur2.fa.out $ ~/kent/src/hg/utils/automation/makeDownloads.pl strPur2 -verbose 2 > jkStuff/downloads.log & # Edit these files # /cluster/data/strPur2/goldenPath/database/README.txt # 
/cluster/data/strPur2/goldenPath/bigZips/README.txt ######################################################################################## # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 2007/MAY/30 Kord) cp /cluster/data/strPur2/strPur2.2bit /cluster/bluearc/strPur2/ cp /cluster/data/strPur2/chrom.sizes /cluster/bluearc/strPur2/ # pitakluster: ssh pk cp /cluster/data/strPur2/strPur2.2bit /san/sanvol1/scratch/strPur2/ cp /cluster/data/strPur2/chrom.sizes /san/sanvol1/scratch/strPur2/ mkdir -p /san/sanvol1/scratch/strPur2/rmsk cp -p /cluster/data/strPur2/strPur2.fa.out /san/sanvol1/scratch/strPur2/rmsk # kki: ssh kkr1u00 mkdir -p /iscratch/i/strPur2 cp -p /cluster/data/strPur2/strPur2.2bit /iscratch/i/strPur2 cp -p /cluster/data/strPur2/chrom.sizes /iscratch/i/strPur2 # sync small cluster ssh kkr1u00 cd /iscratch/i/strPur2 for R in 2 3 4 5 6 7 8 do rsync -av ./ kkr${R}u00:/iscratch/i/strPur2/ \ --progress \ --stats done ######################################################################################## # MAKE 11.OOC FILE FOR BLAT (DONE 2007/MAY/30 Kord) # Using -repMatch=300 (per strPur1) ssh kolossus blat /cluster/data/strPur2/strPur2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/strPur2/11.ooc -repMatch=300 # Wrote 36124 overused 11-mers to /cluster/bluearc/strPur2/11.ooc ssh kkr1u00 cd /iscratch/i/strPur2/ cp -p /cluster/bluearc/strPur2/11.ooc . 
# sync cluster ssh kkr1u00 cd /iscratch/i/strPur2 for R in 2 3 4 5 6 7 8 do rsync -av ./ kkr${R}u00:/iscratch/i/strPur2/ \ --progress \ --stats done ######################################################################################## # GENBANK AUTO UPDATE # (STARTED 2007/MAY/30 Kord) # (COMPLETED 2007/JUN/01 Kord) ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # check data/organism.lst for counts of native mRNA, EST, RefSeq cd /cluster/data/genbank/data/processed/genbank.159.0/full/ egrep purpuratus mrna.gbidx | egrep Strongylocentrotus | wc -l #1097 egrep purpuratus est.*gbidx | egrep Strongylocentrotus | wc -l #141833 cd /cluster/data/genbank/data/processed/refseq.23/full egrep purpuratus mrna.gbidx | egrep Strongylocentrotus | wc -l #260 # edit etc/genbank.conf to add strPur2 cd ~/kent/src/hg/makeDb/genbank/etc/ # strPur2 (S. purpuratus) strPur2.serverGenome = /cluster/data/strPur2/strPur2.2bit strPur2.clusterGenome = /cluster/bluearc/strPur2/strPur2.2bit strPur2.ooc = /cluster/bluearc/strPur2/11.ooc strPur2.lift = no strPur2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} strPur2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} strPur2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} strPur2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} strPur2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} strPur2.refseq.mrna.native.load = yes strPur2.refseq.mrna.xeno.load = yes strPur2.genbank.mrna.xeno.load = yes strPur2.genbank.est.native.load = yes strPur2.downloadDir = strPur2 strPur2.perChromTables = no cvs commit -m "added strPur2" genbank.conf # This was already done with strPur1: # edit src/lib/gbGenome.c # static char *strPurNames[] = {"Strongylocentrotus purpuratus", NULL}; # static struct dbToSpecies dbToSpeciesMap[] = { # ... {"strPur", strPurNames, NULL}, ... 
make install-server ssh kkstore02 cd /cluster/data/genbank/ nice time bin/gbAlignStep -initial strPur2 tail -f var/build/logs/2007.05.30-18:40:41.strPur2.initalign.log # The job failed on kk: # "Out of memory needLargeMem - request size 12 bytes" # Mark D.: "The batch file end up being particularly large and the para # command aborted checking the jobs due to our new memory limits. # I pushed the jobs by hand." # The job was continued with: ssh kkstore02 cd /cluster/data/genbank/ nice bin/gbAlignStep -initial -continue=finish strPur2 tail -f var/build/logs/2007.05.31-13:29:15.strPur2.initalign.log # load database ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initalLoad strPur2 featureBits strPur2 all_mrna # 917364 bases of 810038660 (0.113%) in intersection featureBits strPur1 all_mrna # 941460 bases of 835421305 (0.113%) in intersection featureBits strPur2 xenoMrna # 9299478 bases of 810038660 (1.148%) in intersection # Done by Heather: enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add strPur2 to: etc/align.dbs etc/hgwdev.dbs cvs commit make etc-update ## reload database to correct some weird refseq issues (2007-09-28 markd) ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad strPur2 ######################################################################################## # QUALITY SCORES # (STARTED 2007/June/15) # (COMPLETED 2007/June/15) # ssh kkstore06 cd /cluster/home/kord/strPur2/fixup qaToQac ./strPur2.contigs.fa.qual strPur2.contigs.fa.qac qacAgpLift strPur2.agp strPur2.contigs.fa.qac strPur2.contigs.fa.lifted.qac > qacAgpLift.log 2>&1 & head qacAgpLift.log # Read 220581 qacs from strPur2.contigs.fa.qac # Got 114222 chroms in strPur2.agp # Scaffold3648 size=1000 # Scaffold9299 size=1184 mkdir /cluster/data/strPur2/bed/quality qacToWig -fixed strPur2.contigs.fa.lifted.qac stdout | wigEncode stdin /cluster/data/strPur2/bed/quality/strPur2.{wig,wib} 2>&1 > qual.wig.log & # 
Made 1 .wig files in stdout # Converted stdin, upper limit 90.00, lower limit 0.00 ssh hgwdev cd /cluster/data/strPur2/bed/quality ln -s `pwd`/strPur2.wib /gbdb/strPur2/wib hgLoadWiggle strPur2 quality strPur2.wig # This error is generated on the test-genome.ucsc.edu browser when the quality # track is active: # wigSetItemData: can't open file '/gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib' (No such file or directory) # # Updating the file column in strPur2 with the correct path fixed it: ssh hgwdev hgsql strPur2 mysql> SELECT file FROM quality WHERE file="/gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib" LIMIT 10; +-----------------------------------------------------------------+ | file | +-----------------------------------------------------------------+ | /gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib | | /gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib | | /gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib | | /gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib | | /gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib | | /gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib | | /gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib | | /gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib | | /gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib | | /gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib | +-----------------------------------------------------------------+ 10 rows in set (0.01 sec) mysql> UPDATE quality -> SET file="/gbdb/strPur2/wib/strPur2.wib" WHERE file="/gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib"; Query OK, 937749 rows affected (9.19 sec) Rows matched: 937749 Changed: 937749 Warnings: 0 mysql> SELECT file FROM quality WHERE file="/gbdb/strPur2/wib//cluster/data/strPur2/bed/quality/strPur2.wib" LIMIT 10; Empty set (1.05 sec) mysql> SELECT file FROM quality WHERE 
file="/gbdb/strPur2/wib/strPur2.wib" LIMIT 10; +-------------------------------+ | file | +-------------------------------+ | /gbdb/strPur2/wib/strPur2.wib | | /gbdb/strPur2/wib/strPur2.wib | | /gbdb/strPur2/wib/strPur2.wib | | /gbdb/strPur2/wib/strPur2.wib | | /gbdb/strPur2/wib/strPur2.wib | | /gbdb/strPur2/wib/strPur2.wib | | /gbdb/strPur2/wib/strPur2.wib | | /gbdb/strPur2/wib/strPur2.wib | | /gbdb/strPur2/wib/strPur2.wib | | /gbdb/strPur2/wib/strPur2.wib | +-------------------------------+ 10 rows in set (0.00 sec) ########################################################################### # GENSCAN # (STARTED 2007/Jun/15 Kord) # (COMPLETED 2007/Jun/20 Kord) ssh hgwdev mkdir /cluster/data/strPur2/bed/genscan cd /cluster/data/strPur2/bed/genscan # need to be a member of the genecats group to access this source mkdir gtf pep subopt cvs co hg3rdParty/genscanlinux # generate hard-masked sequence ssh kkstore06 cd /cluster/data/strPur2/bed/genscan zcat /cluster/data/strPur2/goldenPath/bigZips/strPur2.fa.gz | maskOutFa stdin hard strPur2.hardmask.fa # split into 2Mb files mkdir split cd split faSplit about ../strPur2.hardmask.fa 2000000 split & # generate file list and check that no files are completely masked # bash syntax for f in `find ./split -name "*fa"`; do egrep '[ACGT]' $f > /dev/null; if [ $? == 0 ]; then echo $f >> genome.list fi done wc -l genome.list # 431 genome.list # run on the small cluster (kkr1u00-kkr8u00) ssh kki cd /cluster/data/strPur2/bed/genscan cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' /parasol/bin/gensub2 genome.list single gsub jobList para create jobList #Checking input files #..................... 
#431 jobs written to batch para try #431 jobs in batch #0 jobs (including everybody's) in Parasol queue. #Checking finished jobs #updated job database on disk #Pushed Jobs: 10 para check #431 jobs in batch #10 jobs (including everybody's) in Parasol queue. #Checking finished jobs #unsubmitted jobs: 421 #running: 10 #total jobs in batch: 431 para push #431 jobs in batch #10 jobs (including everybody's) in Parasol queue. #Checking finished jobs #..................... #updated job database on disk #Pushed Jobs: 421 parasol list batches #user run wait done crash pri max batch #kord 12 419 0 0 10 -1 /cluster/store4/strPur2/bed/genscan/ para time #431 jobs in batch #0 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 431 of 431 jobs #CPU time in finished jobs: 20032s 333.87m 5.56h 0.23d 0.001 y #IO & Wait Time: 1963s 32.72m 0.55h 0.02d 0.000 y #Average job time: 51s 0.85m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 77s 1.28m 0.02h 0.00d #Submission to last job: 1870s 31.17m 0.52h 0.02d # Concatenate ssh kkstore06 cd /cluster/data/strPur2/bed/genscan cat gtf/*.gtf > genscan.gtf cat pep/*.pep > genscan.pep cat subopt/*.bed > genscanSubopt.bed # Load into the database ssh hgwdev cd /cluster/data/strPur2/bed/genscan ldHgGene -gtf strPur2 genscan genscan.gtf # Reading genscan.gtf # Read 69740 transcripts in 295620 lines in 1 files # 69740 groups 37568 seqs 1 sources 1 feature types # 69740 gene predictions hgPepPred strPur2 generic genscanPep genscan.pep hgLoadBed strPur2 genscanSubopt genscanSubopt.bed featureBits strPur2 genscan # 67907435 bases of 810038660 (8.383%) in intersection featureBits strPur2 genscanSubopt # 42880032 bases of 810038660 (5.294%) in intersection # Should be zero intersection with rmsk featureBits strPur2 genscan rmsk # 3050 bases of 810038660 (0.000%) in intersection ########################################################################### # BLASTZ/CHAIN/NET HG18 # Started 
2007/Aug/14 kord # Completed 2007/Aug/15 kord ssh hgwdev cd /cluster/data/strPur2/bed mkdir blastz.hg18.2007-08-15 ln -s blastz.hg18.2007-08-15 blastz.hg18 cd blastz.hg18 cp /cluster/data/strPur2/{strPur2.2bit,chrom.sizes} . cp /san/sanVol1/scratch/strPur1/blastz.hg18/HoxD55.q . cat << "_EOF_" > DEF # Sea urchin vs. Human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BASE=/cluster/data/strPur2/bed/blastz.hg18.2007-08-15 BLASTZ=blastz.v7.x86_64 # settings from strPur1 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=$BASE/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - Human hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - Sea urchin SEQ2_DIR=/san/sanVol1/scratch/strPur2/strPur2.2bit SEQ2_LEN=/san/sanVol1/scratch/strPur2/chrom.sizes SEQ2_LIMIT=5000 SEQ2_CHUNK=5000000 SEQ2_LAP=0 TMPDIR=/scratch/tmp _EOF_ ssh hgwdev cd /cluster/data/strPur2/bed/blastz.hg18 screen -L doBlastzChainNet.pl -bigCluster pk -smallCluster pk DEF >run.log 2>&1 & tail -f run.log cd run.blastz parasol list batches #user run wait done crash pri max batch #kord 394 64297 514 0 10 -1 #/cluster/store4/strPur2/bed/blastz.hg18.2007-08-15/run.blastz/ para check #65205 jobs in batch #64397 jobs (including everybody's) in Parasol queue. #Checking finished jobs #................ #queued and waiting: 64002 #running: 394 #ranOk: 809 #total jobs in batch: 65205 para time # 65205 jobs in batch # 56 jobs (including everybody's) in Parasol queue. 
# Checking finished jobs # Completed: 65205 of 65205 jobs # CPU time in finished jobs: 6291713s 104861.88m 1747.70h 72.82d 0.200 y # IO & Wait Time: 620160s 10336.00m 172.27h 7.18d 0.020 y # Average job time: 106s 1.77m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1458s 24.30m 0.41h 0.02d # Submission to last job: 18746s 312.43m 5.21h 0.22d cat fb.hg18.chainStrPur2Link.txt #110298296 bases of 2881515245 (3.828%) in intersection featureBits -chrom=chr1 hg18 chainStrPur2Link #7948016 bases of 224999719 (3.532%) in intersection ssh hgwdev cd /cluster/data/strPur2/bed/blastz.hg18.2007-08-15 cp -r axtChain/* /cluster/data/hg18/bed/blastz.strPur2/axtChain/ doRecipBest.pl hg18 strPur2 >rbest.log 2>&1 & ssh hgwdev cd /cluster/store4/strPur2/bed mkdir /cluster/store4/strPur2/bed/blastz.hg18.2007-08-15.swap cd /cluster/store4/strPur2/bed/blastz.hg18.2007-08-15.swap doBlastzChainNet.pl /cluster/store4/strPur2/bed/blastz.hg18.2007-08-15/DEF -swap >swap.log 2>&1 & # fix symbolic links cd /usr/local/apache/htdocs/goldenPath/currentGenomes/Homo_sapiens/vsStrPur2/reciprocalBest/ ln -s /cluster/data/hg18/bed/blastz.strPur2/axtRBestNet/*.axt.gz axtRBestNet/ cd /cluster/home/kord/kent/src/hg/makeDb/trackDb make update DB=strPur2 ########################################################################### # BLASTZ/CHAIN/NET Ciona intestinalis (ci2) # Started 2007/Aug/20 kord ssh hgwdev cd /cluster/data/strPur2/bed mkdir blastz.ci2.2007-08-27 ln -s blastz.ci2.2007-08-27 blastz.ci2 cd blastz.ci2 cat << "_EOF_" > DEF # S. purpuratus vs. C. 
intestinalis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 BASE=/cluster/data/strPur2/bed/blastz.ci2 # TARGET/REFERENCE - Sea urchin SEQ1_DIR=/san/sanvol1/scratch/strPur2/strPur2.2bit SEQ1_LEN=/san/sanvol1/scratch/strPur2/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY - Sea squirt SEQ2_DIR=/san/sanvol1/scratch/ci2/ci2.2bit SEQ2_LEN=/cluster/data/ci2/chrom.sizes SEQ2_CHUNK=5000000 SEQ2_LAP=10000 TMPDIR=/scratch/tmp _EOF_ ssh hgwdev cd /cluster/data/strPur2/bed/blastz.ci2 screen # start screen log to screen.log nohup doBlastzChainNet.pl -bigCluster pk -smallCluster pk DEF >run.log 2>&1 tail -f run.log ssh pk 'parasol list batches' #user run wait done crash pri max batch ... kord 379 133901 155205 16 10 -1 /cluster/store4/strPur2/bed/blastz.ci2.2007-08-25/run.blastz/ nohup doBlastzChainNet.pl -continue load -bigCluster pk -smallCluster pk DEF >run2.log 2>&1 ssh pk 'cd /cluster/store4/strPur2/bed/blastz.ci2/run.blastz;para time' # 289484 jobs in batch # 6 jobs (including everybody's) in Parasol queue. # Checking finished jobs # Completed: 289484 of 289484 jobs # CPU time in finished jobs: 26465295s 441088.24m 7351.47h 306.31d 0.839 y # IO & Wait Time: 2333019s 38883.66m 648.06h 27.00d 0.074 y # Average job time: 99s 1.66m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 6847s 114.12m 1.90h 0.08d # Submission to last job: 75148s 1252.47m 20.87h 0.87d # Doesn't look too good... cat fb.strPur2.chainCi2Link.txt # 0 bases of 810038660 (0.000%) in intersection cd /cluster/home/kord/kent/src/hg/makeDb/trackDb vi urchin/strPur2/trackDb.ra make update DB=strPur2 # trying hg18 v. strPur2 settings cd /cluster/data/strPur2/bed mkdir blastz.ci2.2007-08-28 rm blastz.ci2 ln -s blastz.ci2.2007-08-28 blastz.ci2 cd blastz.ci2 cat << "_EOF_" > DEF # using hg18 v. strPur2 settings # S. purpuratus vs. C. 
intestinalis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 BASE=/cluster/data/strPur2/bed/blastz.ci2 # TARGET/REFERENCE - Sea urchin SEQ1_DIR=/san/sanvol1/scratch/strPur2/strPur2.2bit SEQ1_LEN=/san/sanvol1/scratch/strPur2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - Sea squirt SEQ2_DIR=/san/sanvol1/scratch/ci2/ci2.2bit SEQ2_LEN=/cluster/data/ci2/chrom.sizes SEQ2_LIMIT=5000 SEQ2_CHUNK=5000000 SEQ2_LAP=0 TMPDIR=/scratch/tmp _EOF_ ssh hgwdev cd /cluster/data/strPur2/bed/blastz.ci2 screen # start screen log to screen.log nohup doBlastzChainNet.pl -bigCluster pk -smallCluster pk DEF >run.log 2>&1 & tail -f run.log # detach screen date; ssh pk 'parasol list batches' # Tue Aug 28 19:16:14 PDT 2007 # user run wait done crash pri max batch # ... # kord 386 159876 52 0 10 -1 /cluster/store4/strPur2/bed/blastz.ci2.2007-08-28/run.blastz/ #CPU time in finished jobs: 25269725s 421162.09m 7019.37h 292.47d 0.801 y #IO & Wait Time: 986070s 16434.50m 273.91h 11.41d 0.031 y #Average job time: 164s 2.73m 0.05h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 376s 6.27m 0.10h 0.00d #Submission to last job: 67884s 1131.40m 18.86h 0.79d # an error occurred: #HgStepManager: executing step 'chainMerge' Wed Aug 29 14:14:24 2007. 
# ssh -x kolossus nice 'chainMergeSort # /cluster/data/strPur2/bed/blastz.ci2/axtChain/run/chain/*.chain | nice gzip # -c > /cluster/data/strPur2/bed/blastz.ci2/axtChain/strPur2.ci2.all.chain.gz' #bash: /bin/nice: Argument list too long # so, trying to run the command by hand: ssh kolossus cd /cluster/data/strPur2/bed/blastz.ci2/axtChain/ screen # start screen log H nice chainMergeSort /cluster/data/strPur2/bed/blastz.ci2/axtChain/run/chain/*.chain | \ nice gzip -c > /cluster/data/strPur2/bed/blastz.ci2/axtChain/strPur2.ci2.all.chain.gz #bash: /bin/nice: Argument list too long # trying it without nice chainMergeSort /cluster/data/strPur2/bed/blastz.ci2/axtChain/run/chain/*.chain | \ nice gzip -c > /cluster/data/strPur2/bed/blastz.ci2/axtChain/strPur2.ci2.all.chain.gz # bash: /cluster/bin/x86_64/chainMergeSort: Argument list too long # trying it with tcsh rather than bash nice chainMergeSort /cluster/data/strPur2/bed/blastz.ci2/axtChain/run/chain/*.chain | \ nice gzip -c > /cluster/data/strPur2/bed/blastz.ci2/axtChain/strPur2.ci2.all.chain.gz # /cluster/bin/x86_64/chainMergeSort: Argument list too long. 
# Running from a list of files instead find /cluster/data/strPur2/bed/blastz.ci2/axtChain/run/chain -name "*.chain" | wc -l # 3817 find /cluster/data/strPur2/bed/blastz.ci2/axtChain/run/chain -name "*.chain" | sort > chain.lst nice chainMergeSort -inputList=/cluster/data/strPur2/bed/blastz.ci2/axtChain/chain.lst | \ nice gzip -c > /cluster/data/strPur2/bed/blastz.ci2/axtChain/strPur2.ci2.all.chain.gz # clearing out previous files mv /cluster/data/strPur2/bed/blastz.ci2/axtNet /cluster/data/strPur2/bed/blastz.ci2/axtNet.old mv /cluster/data/strPur2/bed/blastz.ci2/axtChain/strPur2.ci2.net.gz /cluster/data/strPur2/bed/blastz.ci2/axtChain/strPur2.ci2.net.gz.old mv /cluster/data/strPur2/bed/blastz.ci2/mafNet /cluster/data/strPur2/bed/blastz.ci2/mafNet.old mv /cluster/data/strPur2/bed/blastz.ci2/axtChain/noClass.net /cluster/data/strPur2/bed/blastz.ci2/axtChain/noClass.net.old # continuing with the net step nohup doBlastzChainNet.pl -continue net -bigCluster pk -smallCluster pk DEF >run4.log 2>&1 tail -f run4.log # download failed #mkdir /usr/local/apache/htdocs/goldenPath/strPur2/vsCi2 #mkdir: cannot create directory # `/usr/local/apache/htdocs/goldenPath/strPur2/vsCi2': File exists #Command failed: #ssh -x hgwdev nice #/cluster/data/strPur2/bed/blastz.ci2/axtChain/installDownloads.csh nohup doBlastzChainNet.pl -continue download -bigCluster pk -smallCluster pk DEF >run6.log 2>&1 cd /cluster/home/kord/kent/src/hg/makeDb/trackDb make update DB=strPur2 # check for the tables hgsql strPur2 > show tables; #| chainCi2 | #| chainCi2Link | #... 
#| netCi2 | # cat fb.strPur2.chainCi2Link.txt # 40755914 bases of 810038660 (5.031%) in intersection # adding strPur2 chains to ci2 (kober 2007/09/03) mkdir /cluster/data/ci2/bed/blastz.strPur2.swap cd /cluster/data/ci2/bed/blastz.strPur2.swap nohup doBlastzChainNet.pl -bigCluster pk -smallCluster pk -swap \ /cluster/data/strPur2/bed/blastz.ci2/DEF > swap.log 2>&1 & # This output turned out to be a bug: #165638509 bases of 141233565 (117.280%) in intersection # but Angie was able to fix it: nice featureBits ci2 chainStrPur2Link #26885895 bases of 141233565 (19.036%) in intersection ########################################################################### # HUMAN (hg18) PROTEINS TRACK # Started 2007/Aug/24 kord # Completed 2007/Sep/07 ssh kkstore06 bash # if not already in bash # split up the genome mkdir /cluster/data/strPur2/blastDb cd /cluster/data/strPur2 # From makeGenomeDb settings # fastaFiles /cluster/data/strPur2/fixup/strPur2.contigs-AGPlist.fa.gz zcat fixup/strPur2.contigs-AGPlist.fa.gz > temp.fa faSplit sequence temp.fa 500 blastDb/ rm temp.fa cd blastDb # create blast databases for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa # copy these to the scratch mkdir -p /san/sanvol1/scratch/strPur2/blastDb cd /cluster/data/strPur2/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/strPur2/blastDb done mkdir -p /cluster/data/strPur2/bed/tblastn.hg18KG cd /cluster/data/strPur2/bed/tblastn.hg18KG echo /san/sanvol1/scratch/strPur2/blastDb/*.nsq | xargs ls -1S | \ sed "s/\.nsq//" > query.lst wc -l query.lst # 497 query.lst # for 50,000 jobs (per braney) and 497 queries we need to split at N lines calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | \ awk "{print \\\$1}"`/\(50000/`wc query.lst | \ awk "{print \\\$1}"`\) # 36727/(50000/497) = 365.066380 # split hg18KG.psl into files of 365 lines mkdir -p /cluster/bluearc/strPur2/bed/tblastn.hg18KG/kgfa split -l 365 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl \ 
/cluster/bluearc/strPur2/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/strPur2/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. # create a directory for each hg18KG file ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/strPur2/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/strPur2/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`;do mkdir blastOut/`basename $i .fa` done tcsh cd /cluster/data/strPur2/bed/tblastn.hg18KG/ cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP _EOF_ # blastall alignment matrix set based on max. intron size. # From Science 2006 Vol 314 p941-952 : # "The average gene length was 7.7kb with and average primary transcript # length of 8.9kb. A broad distribution of all exon lengths peaked at around # 100 to 115 nucleotides, whereas that for introns at around 750 nucleotides." # BLOSUM80 was used for strPur1 # BLOSUM62 is recommended on the NCBI blast man page for queries >80: # http://www.ncbi.nlm.nih.gov/blast/html/sub_matrix.html # use the same blastSome as felCat3 # replace BLOSUM80 with BLOSUM62 cat /cluster/data/felCat3/bed/tblastn.hg18KG/blastSome | sed 's/BLOSUM80/BLOSUM62/' > blastSome chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit # back to bash ssh pk cd /cluster/data/strPur2/bed/tblastn.hg18KG/ para create blastSpec # 50197 jobs written to batch para try para check parasol list batches # user run wait done crash pri max batch # adk 4 0 8 18 10 -1 /san/sanvol1/scratch/adk/hmm/release_0.5/4state/ # adk 8 0 10 11 10 -1 /san/sanvol1/scratch/adk/hmm/release_0.5/5state/ # kord 370 42310 246804 22 10 -1 /cluster/store4/strPur2/bed/blastz.ci2.2007-08-25/run.blastz/ # kord 10 0 0 0 10 -1 /cluster/store4/strPur2/bed/tblastn.hg18KG/ para time #50197 jobs in batch #41285 jobs (including everybody's) in Parasol queue. 
#Checking finished jobs #Completed: 10 of 50197 jobs #CPU time in finished jobs: 2691s 44.84m 0.75h 0.03d 0.000 y #IO & Wait Time: 103s 1.72m 0.03h 0.00d 0.000 y #Average job time: 279s 4.66m 0.08h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 311s 5.18m 0.09h 0.00d #Submission to last job: 311s 5.18m 0.09h 0.00d para push parasol list batches #user run wait done crash pri max batch ... kord 101 50086 10 0 10 -1 /cluster/store4/strPur2/bed/tblastn.hg18KG/ ... para time #50197 jobs in batch #0 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 50197 of 50197 jobs #CPU time in finished jobs: 7606393s 126773.21m 2112.89h 88.04d 0.241 y #IO & Wait Time: 352189s 5869.82m 97.83h 4.08d 0.011 y #Average job time: 159s 2.64m 0.04h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 356s 5.93m 0.10h 0.00d #Submission to last job: 741350s 12355.83m 205.93h 8.58d ls -l error.log #-rw-rw-r-- 1 kord protein 0 Sep 4 12:33 error.log ssh kkstore06 cd /cluster/data/strPur2/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/strPur2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' exit chmod +x chainOne ls -1dS /cluster/bluearc/strPur2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/strPur2/bed/tblastn.hg18KG/chainRun/ para create chainSpec #Checking input files #101 jobs written to batch para maxNode 30 #Told hub to set maxNode 30 para try #101 jobs in batch #0 jobs (including everybody's) in Parasol queue. 
#Checking finished jobs #updated job database on disk #Pushed Jobs: 10 parasol list batches #user run wait done crash pri max batch #kord 10 0 0 10 10 30 /cluster/store4/strPur2/bed/tblastn.hg18KG/chainRun/ ssh pk para push parasol list batches #user run wait done crash pri max batch #kord 24 58 0 0 10 -1 /cluster/store4/strPur2/bed/tblastn.hg18KG/chainRun/ ssh kkstore06 cd /cluster/data/strPur2/bed/tblastn.hg18KG/blastOut bash # if using another shell for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | \ uniq > /cluster/data/strPur2/bed/tblastn.hg18KG/blastHg18KG.psl # the annotation is with the genbank accession rather then Scaffold# head -2 blastHg18KG.psl #58 50 0 0 0 0 1 3 ++ BC031427 271 33 212 AAGJ02000008 10343 4311 8474 5 16,14,28,26,24, 33,49,67,151,188 , 4311,4362,6950,8273,8402, #58 50 0 0 0 0 1 3 ++ NM_012399 270 33 212 AAGJ02000008 10343 4311 8474 5 16,14,28,26,24, 33,49,67,151,188 , 4311,4362,6950,8273,8402, # build the hash cd /cluster/data/strPur2/fixup egrep W strPur2.agp | awk '{print $6 "\t" $1}' > strPur2.agp.hash # replace the genbank accession with the Scaffold# ssh kolossus cd /cluster/data/strPur2/bed/tblastn.hg18KG/ mv blastHg18KG.psl blastHg18KG.psl.gb nice /cluster/home/kord/bin/replaceStringWithHash.pl \ /cluster/data/strPur2/fixup/strPur2.agp.hash blastHg18KG.psl.gb > blastHg18KG.psl # verify the replace didn't mess up any other fields and was complete cat blastHg18KG.psl.gb | sed 's/AAGJ[0-9]*//g' > blastHg18KG.psl.outg cat blastHg18KG.psl | sed 's/Scaffold[0-9]*//g' > blastHg18KG.psl.outs diff -s -q blastHg18KG.psl.outs blastHg18KG.psl.outg # Files blastHg18KG.psl.outs and blastHg18KG.psl.outg are identical pslCheck blastHg18KG.psl #checked: 20636 failed: 0 errors: 0 ssh hgwdev cd /cluster/data/strPur2/bed/tblastn.hg18KG/ 
hgLoadPsl strPur2 blastHg18KG.psl Processing blastHg18KG.psl nice featureBits strPur2 refGene:cds blastHg18KG -enrichment #refGene:cds 0.070%, blastHg18KG 0.706%, both 0.001%, cover 1.71%, enrich 2.43x nice featureBits strPur2 genscan:cds blastHg18KG -enrichment #genscan:cds 8.383%, blastHg18KG 0.706%, both 0.117%, cover 1.39%, enrich 1.97x cd kent/src/hg/makeDb/trackDb make update DBS=strPur2 ssh kkstore06 rm -rf /cluster/data/strPur2/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluarc/strPur2/bed/tblastn.hg18KG/blastOut ##################################################################### ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-10-11) ssh kkstore06 bash # if not using bash shell already mkdir /cluster/data/strPur2/blastDb cd /cluster/data/strPur2 twoBitToFa strPur2.2bit stdout | toUpper stdin temp.fa cat M/chrM.fa >> temp.fa faSplit sequence temp.fa 100 blastDb/x rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa clusterTemp=/cluster/bluearc/braney/strPur2 mkdir -p $clusterTemp cd /cluster/data/strPur2/blastDb for i in nhr nin nsq; do echo $i cp *.$i $clusterTemp done mkdir -p /cluster/data/strPur2/bed/tblastn.hg18KG cd /cluster/data/strPur2/bed/tblastn.hg18KG echo $clusterTemp/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 99 query.lst # we want around 50000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(50000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(50000/99) = 72.719460 kgTmp=$clusterTemp/tblastn.hg18KG/kgfa mkdir -p $kgTmp split -l 73 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl $kgTmp/kg ln -s $kgTmp kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd /cluster/data/strPur2/bed/tblastn.hg18KG ls -1S kgfa/*.fa > kg.lst blastTmp=$clusterTemp/tblastn.hg18KG/blastOut mkdir -p $blastTmp ln -s $blastTmp for i in `cat kg.lst`; do mkdir blastOut/`basename $i 
.fa`; done tcsh cd /cluster/data/strPur2/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM62 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit # back to bash ssh kk cd /cluster/data/strPur2/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 49896 of 49896 jobs # CPU time in finished jobs: 16169309s 269488.48m 4491.47h 187.14d 0.513 y # IO & Wait Time: 1865159s 31085.99m 518.10h 21.59d 0.059 y # Average job time: 361s 6.02m 0.10h 0.00d # Longest finished job: 1447s 24.12m 0.40h 0.02d # Submission to last job: 33989s 566.48m 9.44h 0.39d ssh kkstore06 cd /cluster/data/strPur2/bed/tblastn.hg18KG tcsh mkdir chainRun cd chainRun cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/braney/strPur2/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS $blastTmp/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kki cd /cluster/data/strPur2/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. 
# Completed: 504 of 504 jobs # CPU time in finished jobs: 1788s 29.80m 0.50h 0.02d 0.000 y # IO & Wait Time: 14558s 242.63m 4.04h 0.17d 0.000 y # Average job time: 32s 0.54m 0.01h 0.00d # Longest finished job: 88s 1.47m 0.02h 0.00d # Submission to last job: 556s 9.27m 0.15h 0.01d ssh kkstore06 cd /cluster/data/strPur2/bed/tblastn.hg18KG/blastOut bash # if using another shell for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq | sed 's/SCAFFOLD/Scaffold/' > /cluster/data/strPur2/bed/tblastn.hg18KG/blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/strPur2/bed/tblastn.hg18KG hgLoadPsl strPur2 blastHg18KG.psl # check coverage featureBits strPur2 blastHg18KG # 7625148 bases of 810038660 (0.941%) in intersection ssh kkstore06 rm -rf /cluster/data/strPur2/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/strPur2/bed/tblastn.hg18KG/blastOut #end tblastn ##################################################################### ##################################################################### # CREATE LIFTOVER FROM strPur2 TO strPur1 # DONE 2008-Apr-1 kord # strPur2 -> /cluster/store4/strPur2 # kkstore06-10:/export/cluster/store4 2.3T 2.0T 191G 92% /cluster/store4 ssh kkstore06 mkdir /cluster/data/strPur2/bed/blat.strPur1 cd /cluster/data/strPur2/bed/blat.strPur1 nice time doSameSpeciesLiftOver.pl strPur2 strPur1 \ -bigClusterHub pk \ -ooc /cluster/bluearc/strPur2/11.ooc \ -buildDir /cluster/data/strPur2/bed/blat.strPur1 >do.log 2>&1 & ssh pk cd /cluster/data/strPur2/bed/blat.strPur1/run.blat date;parasol list batches #Tue Apr 1 09:22:40 PDT 2008 ##user run wait done crash pri max batch #kord 394 32638 768 0 10 -1 /cluster/store4/strPur2/bed/blat.strPur1/run.blat/ #*** All done! 
#*** Steps were performed in /cluster/data/strPur2/bed/blat.strPur1 #*** Test installation (/gbdb, goldenPath, hgLiftover operation) on hgwdev. # #1.19user 0.70system 11:45:39elapsed 0%CPU (0avgtext+0avgdata 0maxresident)k #0inputs+0outputs (8major+30321minor)pagefaults 0swaps # remove the symbolic link to liftOver chains and copy over the file rm ../liftOver/strPur2ToStrPur1.over.chain.gz cp -p strPur2ToStrPur1.over.chain.gz ../liftOver/ # a link in /usr/local/apache/htdocs/goldenPath/strPur2/liftOver # has already been made to this file and md5sum.txt needs to be updated ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/strPur2/liftOver md5sum *.gz > md5sum.txt