# for emacs: -*- mode: sh; -*- # Papio hamadryas - Baboon # ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Phamadryas/fasta/Pham_1.0/bin0/* # ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Phamadryas/fasta/Pham_1.0/linearScaffolds/* # ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Phamadryas/fasta/Pham_1.0/contigs/* # http://www.hgsc.bcm.tmc.edu/project-species-p-Papio%20hamadryas.hgsc?pageLocation=Papio%20hamadryas ########################################################################## # download sequence (DONE - 2009-05-06 - Hiram) mkdir -p /hive/data/genomes/papHam1/download cd /hive/data/genomes/papHam1/download wget --timestamping \ 'ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Phamadryas/fasta/Pham_1.0/contigs/*' wget --timestamping \ 'ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Phamadryas/fasta/Pham_1.0/linearScaffolds/*' wget --timestamping \ 'ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Phamadryas/fasta/Pham_1.0/bin0/*' cat << '_EOF_' > renameScaffold.sh #!/bin/sh scaf=1 export scaf # zcat Pham.20081120.linear.fa.gz | faCount stdin > faCount.linear.txt egrep -v "^#seq|^total" faCount.linear.txt \ | awk '{printf "%s\t%d\n", $1,$2}' \ | sed -e "s/^Contig//" \ | sort -k2,2nr -k1,1n | sed -e "s/^/Contig/" | awk '{print $1}' \ | while read C do echo "${C} scaffold${scaf}" scaf=`echo $scaf | awk '{print $1+1}'` done '_EOF_' # << happy emacs chmod +x renameScaffold.sh ./renameScaffold.sh > ctg2scaf.txt cat << '_EOF_' > ucscAgp.pl #!/usr/bin/env perl use warnings; use strict; my %scafName; my $scafCount = 0; open (FH, ") { chomp $line; my ($ctg, $scaf) = split('\s+',$line); $scafName{$ctg} = $scaf; ++$scafCount; } close (FH); printf STDERR "read in $scafCount ctg to scaffold name translations\n"; open (FH, ") { my ($ctg, $rest) = split('\s+',$line,2); if ($ctg =~ m/_/) { printf "%s\t%s", $scafName{$ctg}, $rest; } else { printf "%s\t%s", $ctg, $rest; } } close (FH); '_EOF_' # << happy emacs chmod +x ucscAgp.pl ./ucscAgp.pl > ucsc.agp cat << '_EOF_' > rename.pl #!/usr/bin/env perl use warnings; use strict; my %scafName; my $scafCount = 0; open (FH, ") { chomp $line; my ($ctg, $scaf) = split('\s+',$line); $scafName{$ctg} = $scaf; ++$scafCount; } close (FH); printf STDERR "read in $scafCount ctg to scaffold name translations\n"; open (QL,">linear.qual.fa") or die "can not write to linear.qual.fa"; open (FH, "zcat Pham.20081120.linear.fa.qual.gz|") or die "can not zcat Pham.20081120.linear.fa.qual.gz|"; while (my $line = ) { if ($line =~m/^>/) { chomp $line; my $ctg = $line; $ctg =~ s/^>//; if ($ctg =~ m/_/) { printf QL ">%s\n", $scafName{$ctg}; } else { printf QL ">%s\n", $ctg; } } else { printf QL "%s", $line; } } close (FH); close (QL); '_EOF_' # << happy emacs chmod +x rename.pl ./rename.pl ########################################################################## # initial browser construction (DONE - 2009-05-12 - Hiram) cd /hive/data/genomes/papHam1 cat << '_EOF_' > papHam1.config.ra # Config parameters for makeGenomeDb.pl: db papHam1 scientificName Papio hamadryas commonName Baboon assemblyDate Nov. 2008 assemblyLabel Baylor BCM HGSC Pham_1.0 orderKey 39 mitoAcc NC_001992 fastaFiles /hive/data/genomes/papHam1/download/Pham.20081120.contigs.fa.gz agpFiles /hive/data/genomes/papHam1/download/ucsc.agp qualFiles /hive/data/genomes/papHam1/download/linear.qual.qac dbDbSpeciesDir baboon clade mammal genomeCladePriority 16 taxId 9562 '_EOF_' # << happy emacs makeGenomeDb.pl -fileServer=hgwdev \ -workhorse=hgwdev papHam1.config.ra > makeGenomeDb.out 2>&1 ########################################################################## # repeatMasking (DONE - 2009-05-13 - Hiram) mkdir /hive/data/genomes/papHam1/bed/repeatMasker cd /hive/data/genomes/papHam1/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk papHam1 > do.log 2>&1 & # about 15 hours cat faSize.rmsk.txt # 2867564654 bases (125715603 N's 2741849051 real 1407767294 upper # 1334081757 lower) in 387374 sequences in 1 files # %46.52 masked total, %48.66 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.23 2009/02/02 21:10:05 angie Exp $ # Jan 29 2009 (open-3-2-7) version of RepeatMasker ########################################################################## # running simple repeat (DONE - 2009-05-14 - Hiram) mkdir /hive/data/genomes/papHam1/bed/simpleRepeat cd /hive/data/genomes/papHam1/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ papHam1 > do.log 2>&1 & # about 38 hours featureBits papHam1 simpleRepeat # 64502452 bases of 2741867288 (2.353%) in intersection twoBitMask papHam1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed papHam1.2bit twoBitToFa papHam1.2bit stdout | faSize stdin > papHam1.2bit.faSize.txt # 2867564654 bases (125715603 N's 2741849051 real 1406524634 upper # 1335324417 lower) in 387374 sequences in 1 files # %46.57 masked total, %48.70 masked real twoBitToFa papHam1.rmsk.2bit stdout | faSize stdin > papHam1.rmsk.faSize.txt # 2867564654 bases (125715603 N's 2741849051 real 1407767294 upper # 1334081757 lower) in 387374 sequences in 1 files # %46.52 masked total, %48.66 masked real rm /gbdb/papHam1/papHam1.2bit ln -s /hive/data/genomes/papHam1/papHam1.2bit /gbdb/papHam1/papHam1.2bit ########################################################################## # SWAP lastz from Mm10 (DONE - 2012-03-10 - Hiram) # original alignment on Mm10 cd /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09 cat fb.mm10.chainPapHam1Link.txt # 890718423 bases of 2652783500 (33.577%) in intersection # and for this swap mkdir /hive/data/genomes/papHam1/bed/blastz.mm10.swap cd /hive/data/genomes/papHam1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 548m15.438s cat fb.mm10.chainPapHam1Link.txt # 878016290 bases of 2741867288 (32.023%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/papHam1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # cpgIsland - (DONE - 2012-04-23 - Hiram) mkdir /hive/data/genomes/papHam1/bed/cpgIslands cd /hive/data/genomes/papHam1/bed/cpgIslands time doCpgIslands.pl papHam1 > do.log 2>&1 # real 940m44.062s # fixing broken command in script: time ./doLoadCpg.csh # real 3m56.264s time doCpgIslands.pl -continue=cleanup papHam1 > cleanup.log 2>&1 # real 121m47.995s cat fb.papHam1.cpgIslandExt.txt # 19960131 bases of 2741867288 (0.728%) in intersection ############################################################################## # genscan - (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/papHam1/bed/genscan cd /hive/data/genomes/papHam1/bed/genscan time doGenscan.pl papHam1 > do.log 2>&1 # real 810m7.177s cat fb.papHam1.genscan.txt # 51391265 bases of 2741867288 (1.874%) in intersection cat fb.papHam1.genscanSubopt.txt # 52066592 bases of 2741867288 (1.899%) in intersection ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-05-02 - Hiram) mkdir /hive/data/genomes/papHam1/bed/gap cd /hive/data/genomes/papHam1/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../papHam1.unmasked.2bit > findMotif.txt 2>&1 # real 0m15.269s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits papHam1 -not gap -bed=notGap.bed # 1358346944 bases of 1358346944 (100.000%) in intersection featureBits papHam1 allGaps.bed notGap.bed -bed=new.gaps.bed # 18237 bases of 2741867288 (0.001%) in intersection # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" papHam1 | sort -n | tail -1 # 270 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" papHam1 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps papHam1 other.bed # 18237 bases of 2741867288 (0.001%) in intersection wc -l other.bed # 5743 hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad papHam1 otherGap other.bed # Read 5743 elements of size 8 from other.bed # starting with this many hgsql -e "select count(*) from gap;" papHam1 # 366334 hgsql papHam1 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" papHam1 # 372077 # == 366334 + 5743 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 papHam1 nonBridged.lift -bedFile=nonBridged.bed # see example in danRer7.txt # there are no non-bridged gaps here: hgsql -N -e "select bridge from gap;" papHam1 | sort | uniq -c # 372077 yes ######################################################################### # windowMasker - (DONE - 2012-05-02 - Hiram) screen -S papHam1 mkdir /hive/data/genomes/papHam1/bed/windowMasker cd /hive/data/genomes/papHam1/bed/windowMasker # trying out new version of the script that does all the usual steps # that used to be performed manually after it was done time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev papHam1 > do.log 2>&1 # real 1037m3.817s # fixing broken commands in doLoad step: time ./lastLoad.csh # real 17m15.223s time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -continue=cleanup -workhorse=hgwdev -buildDir=`pwd` \ -dbHost=hgwdev papHam1 > cleanup.log 2>&1 # real 2m19.164s sed -e 's/^/ #\t/' fb.papHam1.windowmaskerSdust.beforeClean.txt # 1107561332 bases of 2867564654 (38.624%) in intersection sed -e 's/^/ #\t/' fb.papHam1.windowmaskerSdust.clean.txt # 981847549 bases of 2867564654 (34.240%) in intersection sed -e 's/^/ #\t/' fb.papHam1.rmsk.windowmaskerSdust.txt # 743599327 bases of 2867564654 (25.931%) in intersection ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-24 - Hiram) hgsql -e \ 'update dbDb set defaultPos="scaffold4004:96950-104215" where name="papHam1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-24 - Hiram) mkdir /hive/data/genomes/papHam1/pushQ cd /hive/data/genomes/papHam1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl papHam1 2> stderr.txt | grep -v transMap > papHam1.sql # real 3m56.635s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/papHam1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/papHam1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/papHam1/bbi/quality.bw # WARNING: papHam1 does not have seq # WARNING: papHam1 does not have extFile # WARNING: papHam1 does not have estOrientInfo scp -p papHam1.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/papHam1.sql" ############################################################################# # LIFTOVER TO papAnu2 (DONE - 2013-01-16 - Chin) sh hgwdev cd /hive/data/genomes/papHam1 ln -s jkStuff/papHam1.11.ooc 11.ooc screen time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ papHam1 papAnu2 > doLiftOverToPapHam1.log 2>&1 & # real 1062m56.689s pushd /usr/local/apache/htdocs-hgdownload/goldenPath/papHam1/liftOver/ md5sum papAHam1ToPapAnu2.over.chain.gz >> md5sum.txt popd ############################################################################