# for emacs: -*- mode: sh; -*- # This file describes browser build for the galGal4 # Chicken Gallus Gallus genome: Nov. 2011 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AADN00 # 12X coverage via a variety of methods # http://www.ncbi.nlm.nih.gov/genome/111 # http://www.ncbi.nlm.nih.gov/genome/assembly/317958/ # http://www.ncbi.nlm.nih.gov/bioproject/13342 - WashU # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AADN00 # http://www.ncbi.nlm.nih.gov/bioproject/10808 - Montreal - chrMt ############################################################################# # Fetch sequence from genbank (DONE - 2011-12-22 - Hiram) mkdir -p /hive/data/genomes/galGal4/genbank cd /hive/data/genomes/galGal4/genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Gallus_gallus/Gallus_gallus-4.0/*" # real 11m57.658s # measure sequence to be used here faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz \ Primary_Assembly/unlocalized_scaffolds/FASTA//*.fa.gz # 1046915324 bases (14077289 N's 1032838035 real 1032838035 upper 0 lower) in 15931 sequences in 65 files # Total size: mean 65715.6 sd 2473072.9 min 253 (gi|354532582|gb|AADN03009880.1|) max 195276750 (gi|357973830|gb|CM000093.3|) median 1206 ############################################################################# # process into UCSC naming scheme (DONE - 2011-02-17 - Hiram) mkdir /hive/data/genomes/galGal4/ucsc cd /hive/data/genomes/galGal4/ucsc cat << '_EOF_' > toUcsc.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = ) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x toUcsc.pl cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">unplaced.agp") or die "can not write to unplaced.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">unplaced.fa") or die "can not write to unplaced.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1\|.*//; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl cat << '_EOF_' > unlocalized.pl #!/bin/env perl use strict; use warnings; my %accToChr; my %chrNames; open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; $chrNames{$chrN} += 1; } close (FH); foreach my $chrN (keys %chrNames) { my $agpFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { chomp $line; my (@a) = split('\t', $line); my $acc = $a[0]; my $accNo1 = $acc; $accNo1 =~ s/.1$//; die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${accNo1}_random"; # these names became too long, limit is 31 in browser: if ($chrN =~ m/LGE22C19W28_E50C23/) { my $before = $ucscName; $ucscName =~ s/_E50C23//; $ucscName =~ s/AAD//; printf STDERR "shorter: $before -> $ucscName\n"; } printf UC "%s", $ucscName; for (my $i = 1; $i < scalar(@a); ++$i) { printf UC "\t%s", $a[$i]; } printf UC "\n"; } } close (FH); close (UC); printf "chr%s\n", $chrN; open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; my $acc = $line; $acc =~ s/.*gb\|//; $acc =~ s/\|.*//; my $accNo1 = $acc; $accNo1 =~ s/.1$//; die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${accNo1}_random"; # these names became too long, limit is 31 in browser: if ($chrN =~ m/LGE22C19W28_E50C23/) { $ucscName =~ s/_E50C23//; $ucscName =~ s/AAD//; } printf UC ">$ucscName\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x unlocalized.pl time ./toUcsc.pl # real 0m30.708s ./unlocalized.pl # real 0m1.397s ./unplaced.pl # real 0m1.611s gzip *.fa *.agp # verify nothing lost in the translation, should be the same as above # except for the name translations faSize *.fa # 1046915324 bases (14077289 N's 1032838035 real 1032838035 upper 0 lower) in 15931 sequences in 65 files # Total size: mean 65715.6 sd 2473072.9 min 253 (chr2_AADN03009880_random) max 195276750 (chr1) median 1206 ############################################################################# # Initial browser build (DONE - 2012-01-03 - Hiram) cd /hive/data/genomes/galGal4 cat << '_EOF_' > galGal4.config.ra # Config parameters for makeGenomeDb.pl: db galGal4 clade vertebrate genomeCladePriority 50 scientificName Gallus gallus commonName Chicken assemblyDate Nov. 2011 assemblyLabel ICGSC Gallus_gallus-4.0 (GCA_000002315.2) assemblyShortLabel ICGSC Gallus_gallus-4.0 orderKey 434 mitoAcc NC_001323 fastaFiles /hive/data/genomes/galGal4/ucsc/*.fa.gz agpFiles /hive/data/genomes/galGal4/ucsc/*.agp.gz dbDbSpeciesDir chicken taxId 9031 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp galGal4.config.ra > agp.log 2>&1 # real 1m37.730s # check the end of agp.log to verify it is OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \ -continue=db galGal4.config.ra > db.log 2>&1 # real 8m20.996s # the first attempted failed in the: # featureBits -or -countGaps galGal4 gold gap # due to the long chrom names, e.g.: # chrLGE22C19W28_E50C23_AADN03011267_random # buffer overflow, size 32, format: %s, buffer: 'chrLGE22C19W28_E50C23_JH375242_' # the unlocalized.pl script was reworked a bit to shorten that one # chr random name. ############################################################################# # running repeat masker (DONE - 2012-01-04 - Hiram) mkdir /hive/data/genomes/galGal4/bed/repeatMasker cd /hive/data/genomes/galGal4/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk galGal4 > do.log 2>&1 & # real 115m21.264s cat faSize.rmsk.txt # 1046932099 bases (14077289 N's 1032854810 real 921370201 upper # 111484609 lower) in 15932 sequences in 1 files # %10.65 masked total, %10.79 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker featureBits -countGaps galGal4 rmsk # 111540299 bases of 1046932099 (10.654%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-01-04 - Hiram) mkdir /hive/data/genomes/galGal4/bed/simpleRepeat cd /hive/data/genomes/galGal4/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ galGal4 > do.log 2>&1 & # about 1 hour cat fb.simpleRepeat # 10508414 bases of 1032854810 (1.017%) in intersection XXX - ready for this when rmsk is done # not going to add to rmsk here, using the window masker instead since # it masks more sequence cd /hive/data/genomes/galGal4 twoBitMask galGal4.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed galGal4.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa galGal4.2bit stdout | faSize stdin > faSize.galGal4.2bit.txt cat faSize.galGal4.2bit.txt # 1511735326 bases (153400444 N's 1358334882 real 1024824487 upper # 333510395 lower) in 19550 sequences in 1 files # %22.06 masked total, %24.55 masked real rm /gbdb/galGal4/galGal4.2bit ln -s `pwd`/galGal4.2bit /gbdb/galGal4/galGal4.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-01-04 - Hiram) mkdir /hive/data/genomes/galGal4/bed/gap cd /hive/data/genomes/galGal4/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../galGal4.unmasked.2bit > findMotif.txt 2>&1 # real 0m17.839s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits galGal4 -not gap -bed=notGap.bed # 1043113087 bases of 1043113087 (100.000%) in intersection # real 0m8.581s time featureBits galGal4 allGaps.bed notGap.bed -bed=new.gaps.bed # 10258277 bases of 1043113087 (0.983%) in intersection # real 1m29.109s # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" galGal4 | sort -n | tail -1 # 446 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" galGal4 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed wc -l other.bed # 11035 featureBits -countGaps galGal4 other.bed # 10258277 bases of 1046932099 (0.980%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad galGal4 otherGap other.bed # verify no overlap: time featureBits -countGaps galGal4 gap other.bed # 0 bases of 1046932099 (0.000%) in intersection # real 0m29.669s # verify no errors before adding to the table: time gapToLift -minGap=1 galGal4 nonBridged.before.lift \ -bedFile=nonBridged.before.bed > before.gapToLift.txt 2>&1 & # real 0m7.205s # check for warnings in before.gapToLift.txt, should be empty: # -rw-rw-r-- 1 0 Jan 4 15:13 before.gapToLift.txt # starting with this many: hgsql -e "select count(*) from gap;" galGal4 # 2863 hgsql galGal4 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" galGal4 # 13898 # == 2863 + 11035 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 galGal4 nonBridged.lift -bedFile=nonBridged.bed # there should be no errors or other output, checked bridged gaps: hgsql -N -e "select bridge from gap;" galGal4 | sort | uniq -c # 915 no # 12983 yes ########################################################################## ## WINDOWMASKER (DONE - 2011-09-08 - Hiram) mkdir /hive/data/genomes/galGal4/bed/windowMasker cd /hive/data/genomes/galGal4/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev galGal4 > do.log 2>&1 & # about 45 minutes # Masking statistics twoBitToFa galGal4.wmsk.2bit stdout | faSize stdin # 1046932099 bases (14077289 N's 1032854810 real 828377808 upper # 204477002 lower) in 15932 sequences in 1 files # %19.53 masked total, %19.80 masked real twoBitToFa galGal4.wmsk.sdust.2bit stdout | faSize stdin # 1046932099 bases (14077289 N's 1032854810 real 822407863 upper # 210446947 lower) in 15932 sequences in 1 files # %20.10 masked total, %20.38 masked real hgLoadBed galGal4 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 5996190 elements of size 3 featureBits -countGaps galGal4 windowmaskerSdust # 224522674 bases of 1046932099 (21.446%) in intersection # eliminate the gaps from the masking featureBits galGal4 -not gap -bed=notGap.bed # 1032854810 bases of 1032854810 (100.000%) in intersection time nice -n +19 featureBits galGal4 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 210446947 bases of 1032854810 (20.375%) in intersection # real 2m3.610s # reload track to get it clean hgLoadBed galGal4 windowmaskerSdust cleanWMask.bed.gz # Loaded 5994965 elements of size 4 time featureBits -countGaps galGal4 windowmaskerSdust # real 0m40.505s # 210446947 bases of 1046932099 (20.101%) in intersection # mask with this clean result zcat cleanWMask.bed.gz \ | twoBitMask ../../galGal4.unmasked.2bit stdin \ -type=.bed galGal4.cleanWMSdust.2bit twoBitToFa galGal4.cleanWMSdust.2bit stdout | faSize stdin \ > galGal4.cleanWMSdust.faSize.txt cat galGal4.cleanWMSdust.faSize.txt # 1046932099 bases (14077289 N's 1032854810 real 822407863 upper # 210446947 lower) in 15932 sequences in 1 files # %20.10 masked total, %20.38 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps galGal4 rmsk windowmaskerSdust # 71066487 bases of 1046932099 (6.788%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2012-01-05 - Hiram) cd /hive/data/genomes/galGal4 twoBitMask -add bed/windowMasker/galGal4.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed galGal4.2bit # safe to ignore the warnings about BED file with >=13 fields twoBitToFa galGal4.2bit stdout | faSize stdin > faSize.galGal4.txt cat faSize.galGal4.txt # 1046932099 bases (14077289 N's 1032854810 real 822061190 upper # 210793620 lower) in 15932 sequences in 1 files # %20.13 masked total, %20.41 masked real # create symlink to gbdb ssh hgwdev rm /gbdb/galGal4/galGal4.2bit ln -s `pwd`/galGal4.2bit /gbdb/galGal4/galGal4.2bit # what happens with all masks: twoBitMask -add galGal4.2bit bed/repeatMasker/galGal4.sorted.fa.out \ galGal4.wm.trf.rmsk.2bit twoBitToFa galGal4.wm.trf.rmsk.2bit stdout | faSize stdin # 1046932099 bases (14077289 N's 1032854810 real 781762152 upper # 251092658 lower) in 15932 sequences in 1 files # %23.98 masked total, %24.31 masked real # rmsk covers another 40 million bases 251092658-210793620 = 40299038 ######################################################################## # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/galGal4/bed/cpgIslands cd /hive/data/genomes/galGal4/bed/cpgIslands time doCpgIslands.pl galGal4 > do.log 2>&1 # Elapsed time: 58m41s cat fb.galGal4.cpgIslandExt.txt # 14619450 bases of 1032854810 (1.415%) in intersection ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/galGal4/bed/genscan cd /hive/data/genomes/galGal4/bed/genscan time doGenscan.pl galGal4 > do.log 2>&1 # real 29m37.528s # one broken job: ./runGsBig.csh chr8 000 gtf/000/chr8.gtf pep/000/chr8.pep subopt/000/chr8.bed # rerunning with window size of 2000000 # about 10 minutes # continuing: time doGenscan.pl -continue=makeBed galGal4 > makeBed.log 2>&1 # real 12m15.506s cat fb.galGal4.genscan.txt # 22904400 bases of 1032854810 (2.218%) in intersection cat fb.galGal4.genscanSubopt.txt # 24186038 bases of 1032854810 (2.342%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-04 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 1032838035 / 2897316137 \) \* 1024 # ( 1032838035 / 2897316137 ) * 1024 = 365.036502 # round up to 380 (galGal3 was 380) cd /hive/data/genomes/galGal4 time blat galGal4.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/galGal4.11.ooc -repMatch=380 # Wrote 12593 overused 11-mers to jkStuff/galGal4.11.ooc # galGal3 had: Wrote 13061 overused 11-mers to /cluster/bluearc/galGal3/11.ooc # real 0m25.395s # there are non-bridged gaps, create lift file needed for genbank hgsql -N -e "select bridge from gap;" galGal4 | sort | uniq -c # 915 no # 12983 yes cd /hive/data/genomes/galGal4/jkStuff gapToLift galGal4 galGal4.nonBridged.lift -bedFile=galGal4.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' galGal4.nonBridged.bed | sort -nr | head # 44546952 chr3 11581655 56128607 chr3.03 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-04 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Felis catus 1081 919 354 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add: # galGal4 (chicken) galGal4.serverGenome = /hive/data/genomes/galGal4/galGal4.2bit galGal4.clusterGenome = /hive/data/genomes/galGal4/galGal4.2bit galGal4.ooc = /hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc galGal4.lift = /hive/data/genomes/galGal4/jkStuff/galGal4.nonBridged.lift galGal4.perChromTables = no galGal4.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} galGal4.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} galGal4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} galGal4.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} galGal4.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} galGal4.refseq.mrna.native.load = yes galGal4.refseq.mrna.xeno.load = yes galGal4.genbank.mrna.xeno.load = yes galGal4.downloadDir = galGal4 # galGal4.upstreamGeneTbl = refGene # galGal4.upstreamMaf = multiz7way # /hive/data/genomes/galGal3/bed/multiz7way/species.lst # end of section added to etc/genbank.conf git commit -m "adding galGal4 chicken" etc/genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S galGal4 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial galGal4 & # var/build/logs/2012.05.04-15:58:41.galGal4.initalign.log # real 1604m15.676s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad galGal4 & # logFile: var/dbload/hgwdev/logs/2012.05.08-08:48:35.dbload.log # real 48m18.467s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add galGal4 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added galGal4." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # create downloads and pushQ entry (DONE - 2012-05-24 - Hiram) # first make sure all.joiner is up to date and has this new organism # a keys check should be clean: cd ~/kent/src/hg/makeDb/schema time joinerCheck -database=galGal4 -keys all.joiner # real 2m30.542s cd /hive/data/genomes/galGal4 time makeDownloads.pl -workhorse hgwdev galGal4 > download.log 2>&1 # real 17m56.594s mkdir /hive/data/genomes/galGal4/pushQ cd /hive/data/genomes/galGal4/pushQ time makePushQSql.pl galGal4 > galGal4.sql 2> stderr.out # real 3m28.689s # check stderr.out for no significant problems, it is common to see: WARNING: hgwdev does not have /gbdb/galGal4/wib/gc5Base.wib WARNING: hgwdev does not have /gbdb/galGal4/wib/quality.wib WARNING: hgwdev does not have /gbdb/galGal4/bbi/quality.bw WARNING: galGal4 does not have seq WARNING: galGal4 does not have extFile scp -p galGal4.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < galGal4.sql ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2012-06-07 - Hiram) hgsql -N -e "select frag from gold;" galGal4 | sort | head -1 AADN03009256.1 hgsql -N -e "select frag from gold;" galGal4 | sort | tail -2 JH375140.1 JH393278.1 NC_001323 # to test the rule: hgsql -N -e "select frag from gold;" galGal4 | wc -l # 18795 # should find all: hgsql -N -e "select frag from gold;" galGal4 \ | egrep -e '[AJN][AHC][D3_][N079][0-9]+(\.1)?' | wc -l # 18795 # or exclude all: hgsql -N -e "select frag from gold;" galGal4 \ | egrep -v -e '[AJN][AHC][D3_][N079][0-9]+(\.1)?' | wc -l # 0 # hence, add to trackDb/chicken/galGal4/trackDb.ra searchTable gold shortCircuit 1 termRegex [AJN][AHC][D3_][N079][0-9]+(\.1)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 ######################################################################### # LASTZ Medium Ground Finch geoFor1 (DONE - 2012-07-27 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S geoFor1 mkdir /hive/data/genomes/galGal4/bed/lastzGeoFor1.2012-07-27 cd /hive/data/genomes/galGal4/bed/lastzGeoFor1.2012-07-27 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # Chicken vs. Medium Ground Finch BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: chicken galGal4 SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Medium Ground Finch GeoFor1 SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/galGal4/bed/lastzGeoFor1.2012-07-27 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 # the chaining step ran for several days, the longest chain job: # Longest finished job: 213567s 3559.45m 59.32h 2.47d # Elapsed time: 4572m44s cat fb.galGal4.chainGeoFor1Link.txt # 744451026 bases of 1032854810 (72.077%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/galGal4/bed ln -s lastzGeoFor1.2012-07-27 lastz.geoFor1 mkdir /hive/data/genomes/geoFor1/bed/blastz.galGal4.swap cd /hive/data/genomes/geoFor1/bed/blastz.galGal4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/galGal4/bed/lastzGeoFor1.2012-07-27/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 81m0.123s cat fb.geoFor1.chainHg19Link.txt # 736380222 bases of 1041286029 (70.718%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/geoFor1/bed ln -s blastz.galGal4.swap lastz.galGal4 ############################################################################## # LASTZ petMar2 Lamprey (DONE - 2012-10-23 - Hiram) screen -S galGal4PetMar2 # use a screen to control this multi-day job mkdir /cluster/data/galGal4/bed/lastzPetMar2.2012-10-23 cd /cluster/data/galGal4/bed/lastzPetMar2.2012-10-23 cat << '_EOF_' > DEF # Chicken vs Lamprey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Chicken galGal4 SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Lamprey petMar2 SEQ2_DIR=/hive/data/genomes/petMar2/petMar2.2bit SEQ2_LEN=/hive/data/genomes/petMar2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/galGal4/bed/lastzPetMar2.2012-10-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # why is memk used here for big hub ? Should have been swarm time nice -n +19 doBlastzChainNet.pl \ `pwd`/DEF \ -workhorse=hgwdev -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=memk -verbose=2 > do.log 2>&1 & # shut this down on memk after awhile and finished it on swarm # real 4136m52.553s # continuing after finishing on swarm: time nice -n +19 doBlastzChainNet.pl \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=swarm -verbose=2 > cat.log 2>&1 & # Elapsed time: 49m2s cat fb.galGal4.chainPetMar2Link.txt # 18440677 bases of 1032854810 (1.785%) in intersection # and the swap mkdir /cluster/data/petMar2/bed/blastz.galGal4.swap cd /cluster/data/petMar2/bed/blastz.galGal4.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/galGal4/bed/lastzPetMar2.2012-10-23/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=swarm -verbose=2 > swap.log 2>&1 & # real 6m41.091s cat fb.petMar2.chainGalGal4Link.txt # 18576173 bases of 647368134 (2.869%) in intersection ############################################################################