# for emacs: -*- mode: sh; -*- # $Id: macEug2.txt,v 1.6 2010/05/06 16:27:44 chinhli Exp $ # Macropus eugenii (wallaby) - TWGSC Meug_1.1 (2009-09-29) # http://www.ncbi.nlm.nih.gov/genome/233 # http://www.ncbi.nlm.nih.gov/bioproject/12587 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABQO01 # file template copied from susScr2.txt DATE: 29-SEP-2009 ORGANISM: Macropus eugenii TAXID: 9315 ASSEMBLY LONG NAME: Meug_1.1 ASSEMBLY SHORT NAME: Meug_1.1 ASSEMBLY SUBMITTER: Tammar Wallaby Genome Sequencing Consortium ASSEMBLY TYPE: Haploid NUMBER OF ASSEMBLY-UNITS: 1 Assembly Accession: GCA_000004035.1 # Macropus eugenii # (NCBI Project ID: 12586, Accession: GCA_000004035.1) # by Tammar Wallaby Genome Sequencing Consortium # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Macropus_eugenii/Meug_1.1/ ########################################################################## # Download sequence (DONE- 2010-10-22 - Chin) mkdir /hive/data/genomes/macEug2 cd /hive/data/genomes/macEug2 mkdir genbank cd genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Macropus_eugenii/Meug_1.1/*" # FINISHED --2010-10-22 08:54:16-- # Downloaded: 12 files, 1.0G in 13m 49s (1.28 MB/s) # Read ASSEMBLY_INFO # stay at genbank directory # Process the unplaced scaffolds, filter out the # The ".1" at the end (i.e. ABQO010000034.1) of contig name, since # MySQL does not allow "." 
as part of the table name and # will cause problems in genbank task step later export S=Primary_Assembly/unplaced_scaffolds zcat ${S}/AGP/unplaced.scaf.agp.gz | grep "^#" > macEug2.agp # append the gap records zcat ${S}/AGP/unplaced.scaf.agp.gz | grep -v "^#" \ | sed -e "s/\.1//" >> macEug2.agp gzip macEug2.agp & zcat ${S}/FASTA/unplaced.scaf.fa.gz \ | sed -e "s#^>.*|gb|#>#; s#|.*##" -e "s/\.1//" \ | gzip > macEug2.fa.gz zcat macEug2.fa.gz | grep "^>" | wc # 277711 277711 5189116 faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 3075184024 bases (539107070 N's 2536076954 real 2536076954 upper # 0 lower) in 277711 sequences in 1 files # N50 mkdir N50 faCount macEug2.fa | awk ' /^(GL|ABQO)/ {print $1, $2}' > N50/chrom.sizes n50.pl N50/chrom.sizes # reading: N50/chrom.sizes # contig count: 277711, total size: 3075184024, one half size: # 1537592012 # cumulative N50 count contig contig size 1537563674 24532 GL116276 36603 1537592012 one half size 1537600276 24533 GL121428 36602 ######################################################################### # Initial makeGenomeDb.pl (DONE - 2010-11-04 - Chin) cd /hive/data/genomes/macEug2 cat << '_EOF_' > macEug2.config.ra # Config parameters for makeGenomeDb.pl: db macEug2 clade mammal genomeCladePriority 67 scientificName Macropus eugenii commonName Wallaby assemblyDate Sep. 2009 assemblyLabel TWGS (NCBI Project ID: 12586, Accession: GCA_000004035.1) assemblyShortLabel TWGS Meug_1.1 orderKey 278 mitoAcc none fastaFiles /hive/data/genomes/macEug2/genbank/macEug2.fa.gz agpFiles /hive/data/genomes/macEug2/genbank/macEug2.agp.gz # qualFiles none dbDbSpeciesDir wallaby taxId 9315 '_EOF_' # << happy emacs time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev macEug2.config.ra \ > makeGenomeDb.log 2>&1 & # real 26m19.911s # add the trackDb entries to the source tree, and the 2bit link: # we will re-link it to the masked one once the masking is done later!
*** ln -s `pwd`/macEug2.unmasked.2bit /gbdb/macEug2/macEug2.2bit # Per instructions in makeGenomeDb.log: mkdir -p ~/kent/src/hg/makeDb/trackDb/wallaby/macEug2 cd /cluster/data/macEug2/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/wallaby/macEug2 cp *.* ~/kent/src/hg/makeDb/trackDb/wallaby/macEug2 cd ~/kent/src/hg/makeDb/trackDb # edit makefile to add macEug2 to DBS. # git add wallaby/macEug2/*.{ra,html} # git commit -m "Added macEug2 to DBS." makefile # git commit -m "Initial descriptions for macEug2." wallaby/macEug2/*.{ra,html} # git pull; git push # Run make update DBS=macEug2 and make alpha when done. # (optional) Clean up /cluster/data/macEug2/TemporaryTrackDbCheckout ######################################################################### # RepeatMasker (DONE - 2010-11-05 - Chin) mkdir /hive/data/genomes/macEug2/bed/repeatMasker cd /hive/data/genomes/macEug2/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -buildDir=`pwd` \ -workhorse=hgwdev -bigClusterHub=swarm -noSplit macEug2 > do.log 2>&1 & # real 294m1.203s cat faSize.rmsk.txt # 3075184024 bases (539107070 N's 2536076954 real 1334921991 upper # 1201154963 lower) in 277711 sequences in 1 files # %39.06 masked total, %47.36 masked real ######################################################################### # simpleRepeats (DONE - 2010-11-06 - Chin) mkdir /hive/data/genomes/macEug2/bed/simpleRepeat cd /hive/data/genomes/macEug2/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \ -bigClusterHub=pk -smallClusterHub=pk macEug2 > do.log 2>&1 & # real 175m13.951s cat fb.simpleRepeat # 38604856 bases of 2536076957 (1.522%) in intersection # add to the repeatMasker cd /hive/data/genomes/macEug2 twoBitMask macEug2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed macEug2.2bit # safe to ignore warnings about >=13 fields twoBitToFa macEug2.2bit stdout | faSize stdin > macEug2.2bit.faSize.txt cat macEug2.2bit.faSize.txt # 3075184024 bases (539107070 N's 2536076954 real 
1334271294 upper # 1201805660 lower) in 277711 sequences in 1 files # %39.08 masked total, %47.39 masked real # double check with featureBits featureBits -countGaps macEug2 gap # 539107067 bases of 3075184024 (17.531%) in intersection rm /gbdb/macEug2/macEug2.2bit ln -s `pwd`/macEug2.2bit /gbdb/macEug2/macEug2.2bit ######################################################################### # Marking *all* gaps - they are all in the AGP file # (DONE - 2010-11-08 - Chin) mkdir /hive/data/genomes/macEug2/bed/allGaps cd /hive/data/genomes/macEug2/bed/allGaps time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../macEug2.unmasked.2bit > findMotif.txt 2>&1 # real 1m46.677s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits macEug2 -not gap -bed=notGap.bed # 2536076957 bases of 2536076957 (100.000%) in intersection featureBits macEug2 allGaps.bed notGap.bed -bed=new.gaps.bed # 3 bases of 2536076957 (0.000%) in intersection hgsql -N -e "select ix from gap;" macEug2 | sort -n | tail -1 # 273 ######################################################################## # Create kluster run files (DONE 2010-11-04 - Chin) # numerator is macEug2 gapless bases "real" as reported by: featureBits -noRandom -noHap macEug2 gap # 1600136831 bases of 1184628269 (135.075%) in intersection # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 1184628269 / 2861349177 \) \* 1024 # ( 1184628269 / 2861349177 ) * 1024 = 423.946632 # NOTE(review): 1184628269 does not match the 2536076957 non-gap bases that featureBits reports for macEug2 in the allGaps step above (which would scale to about 907), so these numerator figures look carried over from another assembly -- confirm before reusing -repMatch=400 # ==> use -repMatch=400 according to size scaled down from 1024 for human.
# and rounded down to nearest 50 cd /hive/data/genomes/macEug2 blat macEug2.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/macEug2.11.ooc \ -repMatch=400 & # Wrote 19704 overused 11-mers to jkStuff/macEug2.11.ooc mkdir /hive/data/staging/data/macEug2 cp -p macEug2.2bit jkStuff/macEug2.11.ooc /hive/data/staging/data/macEug2 cp -p chrom.sizes /hive/data/staging/data/macEug2 # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' macEug2 \ | sort -k7,7nr # most gaps have size > 100,000 # decide on a minimum gap for this break gapToLift -verbose=2 -minGap=20000 macEug2 jkStuff/nonBridged.lft \ -bedFile=jkStuff/nonBridged.bed cp -p jkStuff/nonBridged.lft \ /hive/data/staging/data/macEug2/macEug2.nonBridged.lft # ask cluster-admin to copy (every time any file has changed) the # /hive/data/staging/data/macEug2 directory to cluster nodes # /scratch/data/macEug2 ######################################################################## # GENBANK AUTO UPDATE (DONE - 2011-10-20 - Chin) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add macEug2 just before susScr2 # macEug2 (wallaby) macEug2.serverGenome = /hive/data/genomes/macEug2/macEug2.2bit macEug2.clusterGenome = /scratch/data/macEug2/macEug2.2bit macEug2.ooc = /scratch/data/macEug2/macEug2.11.ooc macEug2.lift = no macEug2.perChromTables = no macEug2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} macEug2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} macEug2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} macEug2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} macEug2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} macEug2.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} macEug2.downloadDir = macEug2 macEug2.refseq.mrna.native.load = no
macEug2.refseq.mrna.xeno.load = yes macEug2.refseq.mrna.xeno.loadDesc = yes macEug2.genbank.est.native.load = yes macEug2.genbank.est.native.loadDesc = yes macEug2.genbank.mrna.native.load = yes macEug2.genbank.mrna.native.loadDesc = yes macEug2.genbank.mrna.xeno.load = yes macEug2.genbank.mrna.xeno.loadDesc = yes macEug2.genbank.est.native.load = yes macEug2.genbank.est.native.loadDesc = yes git add etc/genbank.conf git commit -m "Added macEug2" etc/genbank.conf git pull git push # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. With these two lines: # static char *macEug2Names[] = {"Macropus eugenii", NULL}; # ... later ... # {"macEug2", macEug2Names}, # gbGenome.c is in # /cluster/home/chinhli/kent/src/hg/makeDb/genbank/src/lib # make and check in make install-server git add src/lib/gbGenome.c git commit -m "adding macEug2 Wallby" src/lib/gbGenome.c git pull git push ssh hgwdev screen # control this business with a screen since it takes a while cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial macEug2 & # logFile: var/build/logs/2011.10.23-21:57:16.macEug2.initalign.log # real 4810m43.818s # /cluster/data/genbank/data/aligned/genbank.176.0/macEug2 # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad macEug2 & # logFile: var/dbload/hgwdev/logs/2011.10.27-08:35:00.dbload.log # real 66m57.028s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add macEug2 to: etc/align.dbs etc/hgwdev.dbs git add etc/align.dbs git add etc/hgwdev.dbs git commit -m "Added macEug2 - Wallaby" etc/align.dbs etc/hgwdev.dbs git push make etc-update doGenbankTests macEug2 genbankTest.out # no error ########################################################################## ## WINDOWMASKER (DONE - 2011-11-30 - Chin) mkdir /hive/data/genomes/macEug2/bed/windowMasker cd /hive/data/genomes/macEug2/bed/windowMasker time nice -n +19
doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev macEug2 > do.log 2>&1 & # real 262m19.608s # Masking statistics twoBitToFa macEug2.wmsk.2bit stdout | faSize stdin # 3075184024 bases (539107070 N's 2536076954 real 1634442088 upper # 901634866 lower) in 277711 sequences in 1 files # %29.32 masked total, %35.55 masked real hgLoadBed macEug2 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 18037550 elements of size 3 featureBits -countGaps macEug2 windowmaskerSdust # 1457250153 bases of 3075184024 (47.387%) in intersection # eliminate the gaps from the masking featureBits macEug2 -not gap -bed=notGap.bed # 2536076957 bases of 2536076957 (100.000%) in intersection time nice -n +19 featureBits macEug2 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 918147928 bases of 2536076957 (36.203%) in intersection # real 793m0.546s # reload track to get it clean hgLoadBed macEug2 windowmaskerSdust cleanWMask.bed.gz # Loaded 17901097 elements of size 4 featureBits -countGaps macEug2 windowmaskerSdust # 918147928 bases of 3075184024 (29.857%) in intersection # mask the sequence with this clean mask zcat cleanWMask.bed.gz \ | twoBitMask ../../macEug2.unmasked.2bit stdin \ -type=.bed macEug2.cleanWMSdust.2bit twoBitToFa macEug2.cleanWMSdust.2bit stdout | faSize stdin \ > macEug2.cleanWMSdust.faSize.txt cat macEug2.cleanWMSdust.faSize.txt # 3075184024 bases (539107070 N's 2536076954 real 1617929026 upper # 918147928 lower) in 277711 sequences in 1 files # %29.86 masked total, %36.20 masked real ############################################################################ # reset position to RHO location as found from blat of hg19 RHO gene hgsql -e \ 'update dbDb set defaultPos="GL095587:25,998-40,635" where name="macEug2";' \ hgcentraltest # set default Db to macEug2 hgsql -e "update defaultDb set name='macEug2' where genome = 'Wallaby';" \ hgcentraltest ########################################################################## # 
BLATSERVERS ENTRY (DONE 2011-12-07 - Chin) # After getting a blat server assigned by the Blat Server Gods, # Assembly macEug2 has been started on host blat2 on port 17784 (trans) and # port 17785 (untrans). hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("macEug2", "blat2", "17784", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("macEug2", "blat2", "17785", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### # all.joiner update, downloads and in pushQ - (DONE 2011-12-07 - Chin) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=macEug2 -all all.joiner # Error: macEug2.tableList tables not in .joiner file ignored mkdir /hive/data/genomes/macEug2/goldenPath cd /hive/data/genomes/macEug2/goldenPath makeDownloads.pl macEug2 > do.log 2>&1 # edit all htmls and READMEs per do.log instruction # now ready for pushQ entry mkdir /hive/data/genomes/macEug2/pushQ cd /hive/data/genomes/macEug2/pushQ makePushQSql.pl macEug2 > macEug2.pushQ.sql 2> stderr.out # check for errors in stderr.out, some are OK, e.g.: # WARNING: macEug2 does not have seq # WARNING: macEug2 does not have extFile # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # bosTau4ChainPileUp # copy it to hgwbeta scp -p macEug2.pushQ.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < macEug2.pushQ.sql # in that pushQ entry walk through each entry and see if the # sizes will set properly #########################################################################