# for emacs: -*- mode: sh; -*- # This file describes browser build for the rheMac3 # Macaca mulatta genome: October 2010 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AEHK00 # 50X WGS coverage ############################################################################# # Fetch sequence from genbank (DONE - 2011-12-22 - Hiram) mkdir -p /hive/data/genomes/rheMac3/genbank cd /hive/data/genomes/rheMac3/genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Macaca_mulatta/CR_1.0/*" # measure total sequence in this assembly faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA//un*.fa.gz # 2969971616 bases (330842350 N's 2639129266 real 2639129266 upper 0 lower) in # 34102 sequences in 22 files # Total size: mean 87090.8 sd 3586193.4 min 200 (gi|353154774|gb|JH298929.1|) # max 229590362 (gi|353351477|gb|CM001253.1|) median 442 # N count: mean 9701.6 sd 410667.9 # U count: mean 77389.3 sd 3195078.2 # L count: mean 0.0 sd 0.0 # %0.00 masked total, %0.00 masked real ############################################################################# # process into UCSC naming scheme (DONE - 2011-12-22 - Hiram) mkdir /hive/data/genomes/rheMac3/ucsc cd /hive/data/genomes/rheMac3/ucsc cat << '_EOF_' > toUcsc.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = ) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x toUcsc.pl time ./toUcsc.pl cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">unplaced.agp") or die "can not write to unplaced.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">unplaced.fa") or die "can not write to unplaced.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1\|.*//; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 0m7.730s # compress these files gzip *.fa *.agp # verify nothing has changed in the sequence, should be the same as above: faSize *.fa.gz # 2969971616 bases (330842350 N's 2639129266 real 2639129266 upper 0 lower) # in 34102 sequences in 22 files # Total size: mean 87090.8 sd 3586193.4 min 200 (chrUn_JH298929) # max 229590362 (chr1) median 442 # N count: mean 9701.6 sd 410667.9 # U count: mean 77389.3 sd 3195078.2 # L count: mean 0.0 sd 0.0 # %0.00 masked total, %0.00 masked real ############################################################################# # Initial database build (DONE - 2011-12-22 - Hiram) cd /hive/data/genomes/rheMac3 cat << '_EOF_' > rheMac3.config.ra # Config parameters for makeGenomeDb.pl: db rheMac3 clade vertebrate genomeCladePriority 15 scientificName Macaca mulatta commonName Rhesus assemblyDate Oct. 2010 assemblyLabel Beijing Genomics Institute, Shenzhen (GCA_000230795.1) assemblyShortLabel BGI CR_1.0 orderKey 35 mitoAcc NC_005943 fastaFiles /hive/data/genomes/rheMac3/ucsc/*.fa.gz agpFiles /hive/data/genomes/rheMac3/ucsc/*.agp.gz dbDbSpeciesDir rhesus taxId 9544 '_EOF_' # << happy emacs # first verify the sequence and AGP files are OK time makeGenomeDb.pl -stop=agp -workhorse=hgwdev rheMac3.config.ra \ > agp.log 2>&1 # real 3m18.816s # verify that was OK, look at the agp.log file time makeGenomeDb.pl -continue=db -workhorse=hgwdev rheMac3.config.ra \ > db.log 2>&1 # real 20m51.021s # verify the end of do.log indicates a successful completion # copy the trackDb business to the source tree, check it in and add # to the trackDb/makefile ############################################################################# # running repeat masker (DONE - 2011-12-29,30 - Hiram) mkdir /hive/data/genomes/rheMac3/bed/repeatMasker cd /hive/data/genomes/rheMac3/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk rheMac3 > do.log 2>&1 & # real 682m32.394s cat faSize.rmsk.txt # 2969988180 bases (330842350 N's 2639145830 real 1359180091 upper # 1279965739 lower) in 34103 sequences in 1 files # %43.10 masked total, %48.50 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker featureBits -countGaps rheMac3 rmsk # 1290741445 bases of 2969988180 (43.459%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2011-12-29,2012-01-04 - Hiram) mkdir /hive/data/genomes/rheMac3/bed/simpleRepeat cd /hive/data/genomes/rheMac3/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ rheMac3 > do.log 2>&1 & # real 113m25.254s cat fb.simpleRepeat # 61072049 bases of 2792623296 (2.187%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/rheMac3 twoBitMask rheMac3.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed rheMac3.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa rheMac3.2bit stdout | faSize stdin > faSize.rheMac3.2bit.txt cat faSize.rheMac3.2bit.txt # 2969988180 bases (330842350 N's 2639145830 real 1358178889 upper # 1280966941 lower) in 34103 sequences in 1 files # %43.13 masked total, %48.54 masked real rm /gbdb/rheMac3/rheMac3.2bit ln -s `pwd`/rheMac3.2bit /gbdb/rheMac3/rheMac3.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-01-03 - Hiram) mkdir /hive/data/genomes/rheMac3/bed/gap cd /hive/data/genomes/rheMac3/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../rheMac3.unmasked.2bit > findMotif.txt 2>&1 # real 0m31.596s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits rheMac3 -not gap -bed=notGap.bed # real 0m31.496s # 2792623296 bases of 2792623296 (100.000%) in intersection featureBits rheMac3 allGaps.bed notGap.bed -bed=new.gaps.bed # about 30 minutes # 153477466 bases of 2969988180 (5.168%) in intersection # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" rheMac3 | sort -n | tail -1 # 658 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" rheMac3 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed wc -l other.bed # 395439 featureBits -countGaps rheMac3 other.bed # 153477466 bases of 2969988180 (5.168%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad rheMac3 otherGap other.bed # verify no overlap: time featureBits -countGaps rheMac3 gap other.bed # real 36m1.614s 0 bases of 2969988180 (0.000%) in intersection # verify no errors: time gapToLift -minGap=1 rheMac3 nonBridged.lift \ -bedFile=nonBridged.bed > before.gapToLift.txt 2>&1 & # real 0m29.947s # starting with this many hgsql -e "select count(*) from gap;" rheMac3 # 19148 hgsql rheMac3 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" rheMac3 # 414587 # == 395439 + 19148 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 rheMac3 nonBridged.lift -bedFile=nonBridged.bed # see example in danRer7.txt if there are errors hgsql -N -e "select bridge from gap;" rheMac3 | sort | uniq -c # 4122 no # 410465 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-01-03 - Hiram) mkdir /hive/data/genomes/rheMac3/bed/windowMasker cd /hive/data/genomes/rheMac3/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev rheMac3 > do.log 2>&1 & # real 221m45.041s # Masking statistics twoBitToFa rheMac3.wmsk.2bit stdout | faSize stdin # 2969988180 bases (330842350 N's 2639145830 real 1746831363 upper # 892314467 lower) in 34103 sequences in 1 files # %30.04 masked total, %33.81 masked real twoBitToFa rheMac3.wmsk.sdust.2bit stdout | faSize stdin # 2969988180 bases (330842350 N's 2639145830 real 1732647656 upper # 906498174 lower) in 34103 sequences in 1 files # %30.52 masked total, %34.35 masked real hgLoadBed rheMac3 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 15866314 elements of size 3 featureBits -countGaps rheMac3 windowmaskerSdust # 1237340524 bases of 2969988180 (41.661%) in intersection # eliminate the gaps from the masking featureBits rheMac3 -not gap -bed=notGap.bed # 2969988180 bases of 2969988180 (100.000%) in intersection time nice -n +19 featureBits rheMac3 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # real 7m34.303s # 1059975640 bases of 2792623296 (37.956%) in intersection # reload track to get it clean hgLoadBed rheMac3 windowmaskerSdust cleanWMask.bed.gz # Loaded 15874168 elements of size 4 time featureBits -countGaps rheMac3 windowmaskerSdust # real 2m1.415s # 1059975640 bases of 2969988180 (35.690%) in intersection # do *not* need to mask the sequence with this clean mask # since RepeatMasker matched well enough zcat cleanWMask.bed.gz \ | twoBitMask ../../rheMac3.unmasked.2bit stdin \ -type=.bed rheMac3.cleanWMSdust.2bit twoBitToFa rheMac3.cleanWMSdust.2bit stdout | faSize stdin \ > rheMac3.cleanWMSdust.faSize.txt cat rheMac3.cleanWMSdust.faSize.txt # 1799143587 bases (97789820 N's 1701353767 real 987278282 upper # 714075485 lower) in 6457 sequences in 1 files # %39.69 masked total, %41.97 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps rheMac3 rmsk windowmaskerSdust # 280229203 bases of 1511735326 (18.537%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2012-01-04 - Hiram) cd /hive/data/genomes/rheMac3 twoBitMask -add bed/windowMasker/rheMac3.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed rheMac3.2bit # safe to ignore the warnings about BED file with >=13 fields twoBitToFa rheMac3.2bit stdout | faSize stdin > faSize.rheMac3.txt cat faSize.rheMac3.txt # 1799143587 bases (97789820 N's 1701353767 real 987044085 upper # 714309682 lower) in 6457 sequences in 1 files # %39.70 masked total, %41.98 masked real # create symlink to gbdb ssh hgwdev rm /gbdb/rheMac3/rheMac3.2bit ln -s `pwd`/rheMac3.2bit /gbdb/rheMac3/rheMac3.2bit ############################################################################ # set this as defaultDb (DONE - 2012-01-06 - Chin) # and make this the default genome for Rhesus hgsql -e 'update defaultDb set name="rheMac3" where name="rheMac2";' \ hgcentraltest ######################################################################## # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE 2012-01-09 - Chin) # Create kluster run files ( # numerator is rheMac3 gapless bases "real" as reported by: featureBits -noRandom -noHap rheMac3 gap # 323705474 bases of 2562964352 (12.630%) in intersection # Use -repMatch=650, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2562964352 / 2897316137 \) \* 1024 # ( 2562964352 / 2897316137 ) * 1024 = 905.829869 # ==> use -repMatch=900 according to size scaled down from 1024 for human. # and rounded down to nearest 50 (900 in this case) cd /hive/data/genomes/rheMac3 blat rheMac3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/rheMac3.11.ooc -repMatch=900 # Wrote 26800 overused 11-mers to jkStuff/rheMac3.11.ooc # there are non-bridged gaps. hgsql -N -e "select bridge from gap;" rheMac3 | sort | uniq -c # 4122 no # 410465 yes # copy the stuff to kluster mkdir /hive/data/staging/data/rheMac3 cp -p rheMac3.2bit chrom.sizes jkStuff/rheMac3.11.ooc \ /hive/data/staging/data/rheMac3 gapToLift -bedFile=jkStuff/nonBridgedGaps.bed rheMac3 \ jkStuff/rheMac3.nonBridged.lft cp -p jkStuff/rheMac3.nonBridged.lft /hive/data/staging/data/rheMac3 # ask cluster-admin to copy (evry time if any file changed) # /hive/data/staging/data/rheMac3 directory to # /scratch/data/rheMac3 on cluster nodes # before porceed to genbank step ######################################################################### # AUTO UPDATE GENBANK (DONE 2012-01-19 - Chin) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Macaca mulatta 6156 60360 2302 # Macaca mulatta cytomegalovirus 1 0 0 # Gorilla gorilla 546 4 0 # Gorilla gorilla gorilla 1 0 0 # to decide which "native" mrna or ests you want to specify in # genbank.conf # this appears that rheMac3 has almost no native est's ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add rheMac3 after rheMac2 and commit to # GIT # Rhesus macaque (macaca mulatta) rheMac3.serverGenome = /hive/data/genomes/rheMac3/rheMac3.2bit rheMac3.clusterGenome = /scratch/data/rheMac3/rheMac3.2bit rheMac3.ooc = /scratch/data/rheMac3/rheMac3.11.ooc rheMac3.lift = no rheMac3.perChromTables = no rheMac3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} rheMac3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} rheMac3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} rheMac3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} rheMac3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} rheMac3.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} rheMac3.downloadDir = rheMac3 rheMac3.genbank.mrna.xeno.load = yes rheMac3.refseq.mrna.native.load = yes rheMac3.refseq.mrna.xeno.load = yes rheMac3.upstreamGeneTbl = refGene # end of section added to etc/genbank.conf git commit -m "adding rheMac3 Rhesus macaque" etc/genbank.conf git push make etc-update # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains # rheMac genome information, if this is a new species, need to add # stuff there ssh hgwdev # used to do this on "genbank" machine screen # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial rheMac3 & # logFile: var/build/logs/2012.01.11-08:40:51.rheMac3.initalign.log # real 1496m10.582s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad rheMac3 & # logFile: var/dbload/hgwdev/logs/2012.01.12-11:34:20.dbload.log # real 49m3.743s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add rheMac3 to: etc/align.dbs etc/hgwdev.dbs git commit -m "daily update for rheMac3 replaces daily update for gorGor2" \ etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # BLATSERVERS ENTRY (DONE 2012-01-13 - Chin) # After getting a blat server assigned by the Blat Server Gods, # rheMac3 (trans) on blat1 port 17818 # rheMac3 (untrans) on blat1 port 17819 ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("rheMac3", "blat1", "17818", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("rheMac3", "blat1", "17819", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### # reset default position (DONE - 2012-01-13 - Chin) # Reset default position to an area contains RHOG and a number of # other genes hgsql -e \ 'update dbDb set defaultPos="chr14:70,034,487-70,048,046" where name="rheMac3";' \ hgcentraltest ############################################################################ # ctgPos2 track - showing clone sequence locations on chromosomes # (DONE 2012-01-18 - Chin) # NOTE - create rheMac3 entry in all.joiner since this is a new species mkdir /hive/data/genomes/rheMac3/bed/ctgPos2 cd /hive/data/genomes/rheMac3/bed/ctgPos2 cat << '_EOF_' > agpToCtgPos2.pl #!/usr/bin/env perl use warnings; use strict; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: zcat your.files.agp.gz | agpToCtgPos2.pl /dev/stdin > ctgPos2.tab\n"; exit 255; } my $agpFile = shift; open (FH, "<$agpFile") or die "can not read $agpFile"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my @a = split('\s+', $line); next if ($a[4] =~ m/^U$/); next if ($a[4] =~ m/^N$/); my $chrSize = $a[2]-$a[1]+1; my $ctgSize = $a[7]-$a[6]+1; die "sizes differ $chrSize != $ctgSize\n$line\n" if ($chrSize != $ctgSize); printf "%s\t%d\t%s\t%d\t%d\t%s\n", $a[5], $chrSize, $a[0], $a[1]-1, $a[2], $a[4]; } close (FH); '_EOF_' # << happy emacs chmod 775 agpToCtgPos2.pl export S=../../genbank/Primary_Assembly/assembled_chromosomes cat ${S}/chr2acc | grep -v "^#Chromosome" | cut -f2 | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` zcat ${S}/AGP/chr${C}.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" done | ./agpToCtgPos2.pl /dev/stdin > ctgPos2.tab hgLoadSqlTab rheMac3 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab # add the track ctgPos2 to src/hg/makeDb/trackDb/cow/rheMac3/trackDb.ra # at src/makeDb/trackdb do "make update DBS=rheMac3" or/and "make alpha" # based on result of # hgsql -N -e "select type from ctgPos2;" rheMac3 | sort | uniq # W # prepare the src/hg/makeDb/trackDb/cow/rheMac3/ctgPos2.html ############################################################################ # running cpgIsland business (DONE -2012-01-18 - Chin) mkdir /hive/data/genomes/rheMac3/bed/cpgIsland cd /hive/data/genomes/rheMac3/bed/cpgIsland cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands # needed to fixup this source, adding include to readseq.c: #include "string.h" # and to cpg_lh.c: #include "unistd.h" #include "stdlib.h" # and fixing a declaration in cpg_lh.c sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c mv tmp.c cpg_lh.c make cd ../../ ln -s hg3rdParty/cpgIslands/cpglh.exe mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../rheMac3.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/rheMac3/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | grep ^chr | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod 775 runOne gensub2 chr.list single template jobList para create jobList para try para check ... etc para push para time para problems para status # then, kick it with para push # check it with plb # when all are done, para time shows: # Checking finished jobs # Completed: 34103 of 34103 jobs # CPU time in finished jobs: 209s 3.49m 0.06h 0.00d 0.000 y # IO & Wait Time: 88578s 1476.30m 24.60h 1.03d 0.003 y # Average job time: 3s 0.04m 0.00h 0.00d # Longest finished job: 23s 0.38m 0.01h 0.00d # Submission to last job: 146s 2.43m 0.04h 0.00d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed ssh hgwdev cd /hive/data/genomes/rheMac3/bed/cpgIsland hgLoadBed rheMac3 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Read 25189 elements of size 10 from cpgIsland.bed # Sorted # Creating table definition for cpgIslandExt # Saving bed.tab # Loading rheMac3 # cleanup rm -fr hardMaskedFa ############################################################################## # LASTZ Mouse mm10 Swap (DONE - 2012-03-08 - Hiram) cd /hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08 cat fb.mm10.chainRheMac3Link.txt # 900117108 bases of 2652783500 (33.931%) in intersection mkdir /hive/data/genomes/rheMac3/bed/blastz.mm10.swap cd /hive/data/genomes/rheMac3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 69m5.839s cat fb.rheMac3.chainMm10Link.txt # 883164992 bases of 2639145830 (33.464%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rheMac3/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # LASTZ Human Swap (DONE - 2012-03-15 - Chin) cd /hive/data/genomes/hg19/bed/lastzRheMac3.2012-03-15 cat fb.hg19.chainRheMac3Link.txt # 2400694407 bases of 2897316137 (82.859%) in intersection # running the swap - DONE - 20i12-03-16 mkdir /hive/data/genomes/rheMac3/bed/blastz.hg19.swap cd /hive/data/genomes/rheMac3/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzRheMac3.2012-03-15/DEF \ -swap \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > swap.log 2>&1 & # 58m38.594s cat fb.rheMac3.chainHg19Link.txt # 2313806886 bases of 2646704109 (87.422%) in intersection cd /hive/data/genomes/rheMac3/bed ln -s blastz.hg19.swap lastz.hg19 ########################################################################## # LASTZ Orangutan ponAbe2 (DONE - 2012-04-18 - Chin) screen # use screen to control this job mkdir /hive/data/genomes/rheMac3/bed/blastzPonAbe2.2012-04-18 cd /hive/data/genomes/rheMac3/bed/blastzPonAbe2.2012-04-18 cat << '_EOF_' > DEF # Rhesus vs Orangutan BLASTZ_M=50 # TARGET: Rhesus RheMac3 SEQ1_DIR=/scratch/data/rheMac3/rheMac3.2bit SEQ1_LEN=/scratch/data/rheMac3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Orangutan ponAbe2 SEQ2_DIR=/scratch/data/ponAbe2/ponAbe2.2bit SEQ2_LEN=/scratch/data/ponAbe2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/rheMac3/bed/blastzPonAbe2.2012-04-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet `pwd`/DEF > do.log 2>&1 & # real 9679m37.253 cat fb.rheMac3.chainPonAbe2Link.txt # 2354152230 bases of 2639145830 (89.201%) in intersection # and, for the swap mkdir /hive/data/genomes/ponAbe2/bed/blastz.rheMac3.swap cd /hive/data/genomes/ponAbe2/bed/blastz.rheMac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rheMac3/bed/blastzPonAbe2.2012-04-18/DEF \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 289m9.479s cat fb.ponAbe2.chainRheMac3Link.txt # 2565157739 bases of 3093572278 (82.919%) in intersection cd /hive/data/genomes/ponAbe2/bed ln -s blastz.rheMac3.swap lastz.rheMac3 ########################################################################## # LASTZ Gorilla gorGor3 (DONE - 2012-04-18 - Chin) screen # use screen to control this job mkdir /hive/data/genomes/rheMac3/bed/blastzGorGor3.2012-04-18 cd /hive/data/genomes/rheMac3/bed/blastzGorGor3.2012-04-18 cat << '_EOF_' > DEF # Rhesus vs Gorilla BLASTZ_M=50 # TARGET: Rhesus RheMac3 SEQ1_DIR=/scratch/data/rheMac3/rheMac3.2bit SEQ1_LEN=/scratch/data/rheMac3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Gorilla gorGor3 SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/rheMac3/bed/blastzGorGor3.2012-04-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet `pwd`/DEF > do.log 2>&1 & # real 9670m18.338s cat fb.rheMac3.chainGorGor3Link.txt # 2279633399 bases of 2639145830 (86.378%) in intersection # and, for the swap mkdir /hive/data/genomes/gorGor3/bed/blastz.rheMac3.swap cd /hive/data/genomes/gorGor3/bed/blastz.rheMac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rheMac3/bed/blastzGorGor3.2012-04-18/DEF \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 289m9.479s cat fb.gorGor3.chainRheMac3Link.txt # 2335269818 bases of 2822760080 (82.730%) in intersection cd /hive/data/genomes/gorGor3/bed ln -s blastz.rheMac3.swap lastz.rheMac3 ######################################################################### # LASTZ Rat rn5 Swap (DONE - 2012-07-10 - Chin) cd /cluster/data/rn5/bed/lastz.rheMac3.2012-07-10 cat fb.rn5.chainRheMac3Link.txt # 896256761 bases of 2572853723 (34.835%) in intersection # then swap mkdir /cluster/data/rheMac3/bed/blastz.rn5.swap cd /cluster/data/rheMac3/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/rn5/bed/lastz.rheMac3.2012-07-10/DEF \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet > swap.log 2>&1 & # real 69m12.752s cat fb.rheMac3.chainRn5Link.txt # 858153767 bases of 2639145830 (32.516%) in intersection cd /hive/data/genomes/rheMac3/bed ln -s blastz.rn5.swap lastz.rn5 ############################################################################ # LASTZ Opossum monDom5 (DONE - 2012-07-19 - Chin) mkdir /hive/data/genomes/monDom5/bed/lastzRheMac3.2012-07-19 cd /hive/data/genomes/monDom5/bed/lastzRheMac3.2012-07-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # opossum vs rhesus BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Opossum MonDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rhesus RheMac3 SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=1000 BASE=/hive/data/genomes/monDom5/bed/lastzRheMac3.2012-07-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S monDom5RheMac3 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2473m43.189s cat fb.monDom5.chainRheMac3Link.txt # 221659579 bases of 3501660299 (6.330%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/monDom5/bed ln -s lastzRheMac3.2012-07-19 lastz.rheMac3 mkdir /hive/data/genomes/rheMac3/bed/blastz.monDom5.swap cd /hive/data/genomes/rheMac3/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzRheMac3.2012-07-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 106m24.321s cat fb.rheMac3.chainMonDom5Link.txt # 214801189 bases of 2639145830 (8.139%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rheMac3/bed ln -s blastz.monDom5.swap lastz.monDom5 ############################################################################ # LASTZ Chimp panTro4 Swap (DONE 2012-07-12 - Chin) cd /hive/data/genomes/panTro4/bed/lastzRheMac3.2012-07-19 cat fb.panTro4.chainRheMac3Link.txt # 2362219217 bases of 2902338967 (81.390%) in intersection cd /hive/data/genomes/panTro4/bed ln -s lastzRheMac3.2012-07-19 lastz.rheMac3 # running the swap mkdir /hive/data/genomes/rheMac3/bed/blastz.panTro4.swap cd /hive/data/genomes/rheMac3/bed/blastz.panTro4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/panTro4/bed/lastzRheMac3.2012-07-19/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \ -bigClusterHub=swarm > swap.log 2>&1 & # real 82m21.660s cat fb.rheMac3.chainPanTro4Link.txt # 2278587464 bases of 2639145830 (86.338%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rheMac3/bed ln -s blastz.panTro4.swap lastz.panTro4 ############################################################################ # LASTZ Marmoset calJac3 (DONE - 2012-07-19 - Chin) screen -S rhesus_vs_marmoset # use a screen to manage this multi-day job mkdir /hive/data/genomes/rheMac3/bed/lastzCalJac3.2012-07-19 cd /hive/data/genomes/rheMac3/bed/lastzCalJac3.2012-07-19 # same kind of parameters as used in human<->marmoset cat << '_EOF_' > DEF # Rhesus vs. marmoset BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q # and place those items here BLASTZ_O=600 BLASTZ_E=150 BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Rhesus RheMac3 SEQ1_DIR=/scratch/data/rheMac3/rheMac3.2bit SEQ1_LEN=/scratch/data/rheMac3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Marmoset (calJac3) SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit SEQ2_LEN=/scratch/data/calJac3/chrom.sizes SEQ2_LIMIT=50 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/rheMac3/bed/lastzCalJac3.2012-07-19 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # real 5720m34.726s cat fb.rheMac3.chainCalJac3Link.txt # 1893656467 bases of 2639145830 (71.753%) in intersection cd /hive/data/genomes/rheMac3/bed ln -s lastzCalJac3.2012-07-19 lastz.calJac3 # running the swap mkdir /hive/data/genomes/calJac3/bed/blastz.rheMac3.swap cd /hive/data/genomes/calJac3/bed/blastz.rheMac3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rheMac3/bed/lastzCalJac3.2012-07-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 & # real 84m31.344s cat fb.calJac3.chainRheMac3Link.txt # 1931533122 bases of 2752505800 (70.174%) in intersection cd /hive/data/genomes/calJac3/bed ln -s blastz.rheMac3.swap lastz.rheMac3 *********************************************************************** * TODOain/net *** Make sure that goldenPath/ponAbe2/vsRheMac3/README.txt is accurate. *** Add {chain,net}RheMac3 tracks to trackDb.ra if necessary. *********************************************************************** ############################################################################ # set default position to RHO gene displays (DONE - 2012-07-24 - Hiram) hgsql -e \ 'update dbDb set defaultPos="chr3:132959918-132967918" where name="rheMac3";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-24 - Hiram) mkdir /hive/data/genomes/rheMac3/pushQ cd /hive/data/genomes/rheMac3/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl rheMac3 2> stderr.txt | grep -v transMap > rheMac3.sql # real 4m8.890s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/rheMac3/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/rheMac3/wib/quality.wib # WARNING: hgwdev does not have /gbdb/rheMac3/bbi/quality.bw # WARNING: rheMac3 does not have seq # WARNING: rheMac3 does not have extFile # WARNING: rheMac3 does not have estOrientInfo # eliminated the chainMm9 and chainRn4 chainNet tracks from the .sql # and the rheMac3 chainNet tracks on mm9 and rn4 # the mm10 and rn5 tracks are the new ones scp -p rheMac3.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/rheMac3.sql" ############################################################################ # LIFTOVER TO rheMac2 (DONE - 2012-08-28 - Hiram ) mkdir /hive/data/genomes/rheMac3/bed/blat.rheMac2.2012-08-28 cd /hive/data/genomes/rheMac3/bed/blat.rheMac2.2012-08-28 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug \ -ooc /hive/data/genomes/rheMac3/jkStuff/rheMac3.11.ooc rheMac3 rheMac2 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /hive/data/genomes/rheMac3/jkStuff/rheMac3.11.ooc \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ rheMac3 rheMac2 > do.log 2>&1 # real 99m53.205s ######################################################################### # GENSCAN GENE PREDICTIONS (DONE 2012-10-15 - Chin) mkdir /hive/data/genomes/rheMac3/bed/genscan cd /hive/data/genomes/rheMac3/bed/genscan time doGenscan.pl rheMac3 > do.log 2>&1 # real 284m5.600s # This is not working. Need to run these split mkdir /hive/data/genomes/rheMac3/bed/genscan/splitRun cd /hive/data/genomes/rheMac3/bed/genscan/splitRun cp ../../../jkStuff/rheMac3.nonBridged.lift . gapToLift rheMac3 rheMac3.nonBridged.lift -bedFile=rheMac3.nonBridged.bed for C in 3 8 13 14 19 20 X do echo chr${C} cd /hive/data/genomes/rheMac3/bed/genscan/splitRun grep -w "chr${C}" rheMac3.nonBridged.lift | grep -v random \ | sed -e "s/chr${C}./chr${C}_/" > chr${C}.nonBridged.lift mkdir chr${C} faToTwoBit ../hardMaskedFa/00?/chr${C}.fa chr${C}/chr${C}.2bit ~/kent/src/hg/utils/lft2BitToFa.pl chr${C}/chr${C}.2bit \ chr${C}.nonBridged.lift > chr${C}/chr${C}.nonBridged.fa cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/chr${C} mkdir split${C} faSplit sequence chr${C}.nonBridged.fa 100 split${C}/chr${C}_ done # verify lift files are sane: cd /hive/data/genomes/rheMac3/bed/genscan/splitRun awk '{print $4}' chr*.nonBridged.lift | sort | uniq -c # should see the list of chroms: # 246 chr13 # 127 chr14 # 139 chr19 # 98 chr20 # 317 chr3 # 169 chr8 # 330 chrX for C in 3 8 13 14 19 20 X do echo chr${C} cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/chr${C} echo '#!/bin/sh' > cmdList.sh export NL=-1 ls split${C} | while read F do NL=`echo $NL | awk '{print $1+1}'` if [ "${NL}" -gt 7 ]; then NL=0 echo "echo waiting before $F" >> cmdList.sh echo wait >> cmdList.sh fi echo "/cluster/bin/x86_64/gsBig split${C}/${F} gtf/${F}.gtf -trans=pep/${F}.pep -subopt=subopt/${F}.bed -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 &" done >> cmdList.sh echo "echo waiting at end" >> cmdList.sh echo "wait" >> cmdList.sh chmod +x cmdList.sh rm -fr gtf pep subopt mkdir gtf pep subopt done # run chr20 first to test script cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/chr20 time ./cmdList.sh > ../chr20.log 2>&1 # real 6m59.465s # running the rest: 3 8 13 14 19 X for C in chr3 chr8 chr13 chr14 chr19 chrX do cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/${C} time ./cmdList.sh > ../${C}.log 2>&1 done # chr3 real 9m41.925s # chr8 real 18m50.934s # chr13 real 11m1.742s # chr14 real 16m32.590s # chr19 real 5m8.588s # chrX real 8m48.507s # collecting the results: # run chr20 first: cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/chr20 cat gtf/chr20_*.gtf | liftUp -type=.gtf stdout ../chr20.nonBridged.lift error stdin \ | sed -e "s/chr20_0\([0-4]\)\./chr20.\1/g" > chr20.gtf cat subopt/chr20_*.bed | liftUp -type=.bed stdout ../chr20.nonBridged.lift error stdin \ | sed -e "s/chr20_0\([0-4]\)\./chr20.\1/g" > chr20.subopt.bed cat pep/chr20_*.pep | sed -e "s/chr20_0\([0-4]\)\./chr20.\1/g" > chr20.pep ls -l ../../gtf/00?/chr20.gtf ../../pep/00?/chr20.pep ../../subopt/00?/chr20.bed ls -l chr20.gtf chr20.pep chr20.subopt.bed # run the rest for C in chr3 chr8 chr13 chr14 chr19 chrX do cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/${C} cat gtf/${C}_*.gtf | liftUp -type=.gtf stdout ../${C}.nonBridged.lift error stdin \ | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.gtf cat subopt/${C}_*.bed | liftUp -type=.bed stdout ../${C}.nonBridged.lift error stdin \ | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.subopt.bed cat pep/${C}_*.pep | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.pep ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed done # after verifying the sizes of the files seem same compared to what # happened in the main run: for C in chr3 chr8 chr13 chr14 chr19 chr20 chrX do cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/${C} ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed cd /hive/data/genomes/rheMac3/bed/genscan/splitRun done -rw-rw-r-- 1 chinhli genecats 0 Oct 15 14:05 ../../gtf/000/chr3.gtf -rw-rw-r-- 1 chinhli genecats 0 Oct 15 14:05 ../../pep/000/chr3.pep -rw-rw-r-- 1 chinhli genecats 0 Oct 15 14:05 ../../subopt/000/chr3.bed -rw-rw-r-- 1 chinhli genecats 2622960 Oct 16 15:51 chr3.gtf -rw-rw-r-- 1 chinhli genecats 1108696 Oct 16 15:51 chr3.pep -rw-rw-r-- 1 chinhli genecats 1419193 Oct 16 15:51 chr3.subopt.bed -rw-rw-r-- 1 chinhli genecats 0 Oct 15 16:46 ../../gtf/000/chr8.gtf .... .... -rw-rw-r-- 1 chinhli genecats 619333 Oct 16 15:49 chr20.subopt.bed -rw-rw-r-- 1 chinhli genecats 0 Oct 15 16:46 ../../gtf/000/chrX.gtf -rw-rw-r-- 1 chinhli genecats 0 Oct 15 16:46 ../../pep/000/chrX.pep -rw-rw-r-- 1 chinhli genecats 0 Oct 15 16:46 ../../subopt/000/chrX.bed -rw-rw-r-- 1 chinhli genecats 1347712 Oct 16 15:52 chrX.gtf -rw-rw-r-- 1 chinhli genecats 641978 Oct 16 15:52 chrX.pep -rw-rw-r-- 1 chinhli genecats 784120 Oct 16 15:52 chrX.subopt.bed # this is tricky, it is counting on the file existing, empty or # otherwise for C in chr3 chr8 chr13 chr14 chr19 chr20 chrX do cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/${C} D=`ls ../../gtf/00?/${C}.gtf` rm -f "${D}" cp -p ${C}.gtf "${D}" D=`ls ../../pep/00?/${C}.pep` rm -f "${D}" cp -p ${C}.pep "${D}" D=`ls ../../subopt/00?/${C}.bed` rm -f "${D}" cp -p ${C}.subopt.bed "${D}" cd /hive/data/genomes/rheMac3/bed/genscan/splitRun done # verify size again: for C in chr3 chr8 chr13 chr14 chr19 chr20 chrX do cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/${C} ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed cd /hive/data/genomes/rheMac3/bed/genscan/splitRun done -rw-rw-r-- 1 chinhli genecats 2622960 Oct 16 15:51 ../../gtf/000/chr3.gtf -rw-rw-r-- 1 chinhli genecats 1108696 Oct 16 15:51 ../../pep/000/chr3.pep -rw-rw-r-- 1 chinhli genecats 1419193 Oct 16 15:51 ../../subopt/000/chr3.bed -rw-rw-r-- 1 chinhli genecats 2622960 Oct 16 15:51 chr3.gtf -rw-rw-r-- 1 chinhli genecats 1108696 Oct 16 15:51 chr3.pep -rw-rw-r-- 1 chinhli genecats 1419193 Oct 16 15:51 chr3.subopt.bed -rw-rw-r-- 1 chinhli genecats 973693 Oct 16 15:51 chr8.subopt.bed .... ... ... -rw-rw-r-- 1 chinhli genecats 619333 Oct 16 15:49 chr20.subopt.bed -rw-rw-r-- 1 chinhli genecats 1347712 Oct 16 15:52 ../../gtf/000/chrX.gtf -rw-rw-r-- 1 chinhli genecats 641978 Oct 16 15:52 ../../pep/000/chrX.pep -rw-rw-r-- 1 chinhli genecats 784120 Oct 16 15:52 ../../subopt/000/chrX.bed -rw-rw-r-- 1 chinhli genecats 1347712 Oct 16 15:52 chrX.gtf -rw-rw-r-- 1 chinhli genecats 641978 Oct 16 15:52 chrX.pep -rw-rw-r-- 1 chinhli genecats 784120 Oct 16 15:52 chrX.subopt.bed # genes found have been copied over to ../../gtf. pep and s # Now, we can continue cd /hive/data/genomes/rheMac3/bed/genscan time doGenscan.pl -continue=makeBed -workhorse=hgwdev -dbHost=hgwdev \ rheMac3 > makeBed.log 2>&1 # real 7m58.322s cat fb.rheMac3.genscan.txt # 50606442 bases of 2639145830 (1.918%) in intersection cat fb.rheMac3.genscanSubopt.txt # 51718033 bases of 2639145830 (1.960%) in intersection #############################################################################