# for emacs: -*- mode: sh; -*- # $Id: nomLeu1.txt,v 1.6 2010/05/06 16:27:44 chinhli Exp $ # Nomascus leucogenys (gibbon) -- GGSC Nleu1.0 (2010-01-29) # file template copied from susScr2.txt # Nomascus leucogenys (NCBI project 13974, accession GCA_000146795.1) # by Gibbon Genome Sequencing Consortium # assembly] sequence: # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Nomascus_leucogenys/Nleu1.0/ # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ADFV00 ########################################################################## # Download sequence (DONE 2010-10-22 - Chin) mkdir /hive/data/genomes/nomLeu1 cd /hive/data/genomes/nomLeu1 mkdir genbank cd genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Nomascus_leucogenys/Nleu1.0/" # FINISHED --2010-10-27 16:14:02-- # Downloaded: 12 files, 1.1G in 2m 51s (6.49 MB/s) # Read ASSEMBLY_INFO # stay at genbank directory # Process the unplaced scaffolds, filter out the # The ".1" at the end (i.e. ABQO010000034.1) of contig name, since # MySQL does not allow "." as part of the table name and # will casue problems in genbank task step later export S=Primary_Assembly/unplaced_scaffolds zcat ${S}/AGP/unplaced.scaf.agp.gz | grep "^#" > nomLeu1.agp # append the gap records zcat ${S}/AGP/unplaced.scaf.agp.gz | grep -v "^#" \ | sed -e "s/\.1//" >> nomLeu1.agp gzip nomLeu1.agp & zcat ${S}/FASTA/unplaced.scaf.fa.gz \ | sed -e "s#^>.*|gb|#>#; s#|.*##" -e "s/\.1//" \ | gzip > nomLeu1.fa.gz zcat nomLeu1.fa.gz | grep "^>" | wc -l # 17968 faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 2936035333 bases (179443556 N's 2756591777 real 2756591777 upper 0 lower) # in 17968 sequences in 1 files # N50 mkdir N50 zcat nomLeu1.fa.gz | faCount stdin | awk '/^(GL|ADFV)/ {print $1, $2}' > N50/chrom.sizes n50.pl N50/chrom.sizes # reading: N50/chrom.sizes # contig count: 17968, total size: 2936035333, one half size: 1468017666 # cumulative N50 count contig contig size # 1461968777 40 GL397300 22842388 # 1468017666 one half size # 1484660812 41 GL397301 22692035 ######################################################################### # Initial makeGenomeDb.pl (DONE - 2010-10-29 - Chin) cd /hive/data/genomes/nomLeu1 cat << '_EOF_' > nomLeu1.config.ra # Config parameters for makeGenomeDb.pl: db nomLeu1 clade mammal genomeCladePriority 13 scientificName Nomascus leucogenys commonName Gibbon assemblyDate Jan. 2010 assemblyLabel GGSC (NCBI project 13974, accession GCA_000146795.1) assemblyShortLabel GGSC Nleu1.0 orderKey 33 mitoAcc none fastaFiles /hive/data/genomes/nomLeu1/genbank/nomLeu1.fa.gz agpFiles /hive/data/genomes/nomLeu1/genbank/nomLeu1.agp.gz # qualFiles none dbDbSpeciesDir gibbon taxId 61853 '_EOF_' # << happy emacs time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev nomLeu1.config.ra \ > makeGenomeDb.log 2>&1 & # real 23m53.122s # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/nomLeu1.unmasked.2bit /gbdb/nomLeu1/nomLeu1.2bit # Per instructions in makeGenomeDb.log: # cd ~/kent/src/hg/makeDb/trackDb # edit makefile to add nomLeu1 to DBS. # git add gibbon/nomLeu1/*.{ra,html} # git commit -m "Added nomLeu1 to DBS." makefile # git commit -m "Initial descriptions for nomLeu1." gibbon/nomLeu1/*.{ra,html} # git pull; git push # Run make update DBS=nomLeu1 and make alpha when done. # (optional) Clean up /cluster/data/nomLeu1/TemporaryTrackDbCheckout ######################################################################### # RepeatMasker (DONE - 2010-10-30 - Chin) mkdir /hive/data/genomes/nomLeu1/bed/repeatMasker cd /hive/data/genomes/nomLeu1/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -buildDir=`pwd` \ -workhorse=hgwdev -bigClusterHub=swarm -noSplit nomLeu1 > do.log 2>&1 & # real 429m4.089s cat faSize.rmsk.txt # 2936035333 bases (179443556 N's 2756591777 real 1356684256 upper # 1399907521 lower) in 17968 sequences in 1 files # %47.68 masked total, %50.78 masked real #bug 5557 fixed up (2011-11-16 Chin) cd /hive/data/genomes/nomLeu1/bed/repeatMasker head -3 nomLeu1.fa.out > nomLeu1.sorted.fa.out tail -n +4 nomLeu1.fa.out | sort -k5,5 -k6,6n >> nomLeu1.sorted.fa.out hgLoadOut -table=rmsk -nosplit nomLeu1 nomLeu1.sorted.fa.out # Loading up table rmsk # note: 2 records dropped due to repStart > repEnd # run with -verbose=2 for details hgLoadOut -table=rmsk -nosplit -verbose=2 nomLeu1 nomLeu1.sorted.fa.out # hgLoadOut: connected to database: nomLeu1 # bad rep range [5563, 5562] line 2793949 of nomLeu1.sorted.fa.out # bad rep range [3599, 3598] line 3970033 of nomLeu1.sorted.fa.out # Loading up table rmsk # note: 2 records dropped due to repStart > repEnd ######################################################################### # simpleRepeats ( DONE 2010-10-31 - Chin) mkdir /hive/data/genomes/nomLeu1/bed/simpleRepeat cd /hive/data/genomes/nomLeu1/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \ -bigClusterHub=swarm -smallClusterHub=swarm nomLeu1 > do.log 2>&1 & # real 39m4.732s cat fb.simpleRepeat # 122433828 bases of 2756591777 (4.441%) in intersection # add to the repeatMasker cd /hive/data/genomes/nomLeu1 twoBitMask nomLeu1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed nomLeu1.2bit # safe to ignore warnings about >=13 fields twoBitToFa nomLeu1.2bit stdout | faSize stdin > nomLeu1.2bit.faSize.txt cat nomLeu1.2bit.faSize.txt # 2936035333 bases (179443556 N's 2756591777 real 1355629718 upper # 1400962059 lower) in 17968 sequences in 1 files # %47.72 masked total, %50.82 masked real rm /gbdb/nomLeu1/nomLeu1.2bit ln -s `pwd`/nomLeu1.2bit /gbdb/nomLeu1/nomLeu1.2bit ######################################################################### # Marking *all* gaps - they are all in the AGP file # (DONE - 2010-10-31 - Chin) mkdir /hive/data/genomes/nomLeu1/bed/allGaps cd /hive/data/genomes/nomLeu1/bed/allGaps time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../nomLeu1.unmasked.2bit > findMotif.txt 2>&1 # real 0m47.822s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits nomLeu1 -not gap -bed=notGap.bed # 0 bases of 2756591777 (0.000%) in intersection featureBits nomLeu1 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 2756591777 (0.000%) in intersection # they are all in the AGP file # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" nomLeu1 | sort -n | tail -1 # 8318 ######################################################################## # Create kluster run files (working - 2010-11-01 - Chin) # numerator is nomLeu1 gapless bases "real" as reported by: featureBits -noRandom -noHap nomLeu1 gap # 179443556 bases of 2756591777 (6.510%) in intersection # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2756591777 / 2861349177 \) \* 1024 # ( 2756591777 / 2861349177 ) * 1024 = 986.510141 # ==> use -repMatch=400 according to size scaled down from 1024 for human. # and rounded down to nearest 50 (in this case, 900) cd /hive/data/genomes/nomLeu1 blat nomLeu1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/nomLeu1.11.ooc \ -repMatch=900 & # Loading nomLeu1.2bit # Counting nomLeu1.2bit # Writing jkStuff/nomLeu1.11.ooc # Wrote 36558 overused 11-mers to jkStuff/nomLeu1.11.ooc # Done making jkStuff/nomLeu1.11.ooc mkdir /hive/data/staging/data/nomLeu1 cp -p nomLeu1.2bit jkStuff/nomLeu1.11.ooc /hive/data/staging/data/nomLeu1 cp -p chrom.sizes /hive/data/staging/data/nomLeu1 # check for non-bridged gaps to see what the typical size is: hgsql -e "select bridge from gap;" nomLeu1 | sort | uniq # bridge # yes # all gap are bridged, done # ask cluster-admin to copy (evry time if any file chsnged) # /hive/data/staging/data/nomLeu1 directory to cluster nodes # /scratch/data/nomLeu1 ######################################################################## # GENBANK AUTO UPDATE (DONE 2010-11-01 - Chin) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add nomLeu1 just after ponAbe2 # nomLeu1 (Gibbon) nomLeu1.serverGenome = /hive/data/genomes/nomLeu1/nomLeu1.2bit nomLeu1.clusterGenome = /scratch/data/nomLeu1/nomLeu1.2bit nomLeu1.ooc = /scratch/data/nomLeu1/nomLeu1.11.ooc nomLeu1.lift = no nomLeu1.perChromTables = no nomLeu1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} nomLeu1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} nomLeu1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} nomLeu1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} nomLeu1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} nomLeu1.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} nomLeu1.downloadDir = nomLeu1 nomLeu1.refseq.mrna.native.load = no nomLeu1.refseq.mrna.xeno.load = yes nomLeu1.refseq.mrna.xeno.loadDesc = yes nomLeu1.genbank.mrna.native.load = yes nomLeu1.genbank.mrna.native.loadDesc = yes nomLeu1.genbank.mrna.xeno.load = yes nomLeu1.genbank.mrna.xeno.loadDesc = yes nomLeu1.genbank.est.native.load = no nomLeu1.genbank.est.native.loadDesc = no git add etc/genbank.conf git commit -m "Added nomLeu1" etc/genbank.conf git pull git push # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. With these two lines: # static char *nomLeuNames[] = {"Nomascus leucogenys", NULL}; # ... later ... # {"nomLeu", nomLeuNames}, # gbGenome.c is in # /cluster/home/chinhli/kent/src/hg/makeDb/genbank/src/lib # make and checkin git add src/lib/gbGenome.c git commit -m "adding nomLeu1 Gibbon" src/lib/gbGenome.c git pull git push make install-server ssh genbank screen # control this business with a screen since it takes a while cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial nomLeu1 & # logFile: var/build/logs/2010.11.02-08:51:08.nomLeu1.initalign.log # real 181m41.973s # To re-do, rm the dir first: # /cluster/data/genbank/data/aligned/genbank.180.0/nomLeu1 # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad nomLeu1 & # logFile: var/dbload/hgwdev/logs/2011.02.07-18:36:56.dbload.log # real 19m51.159s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add nomLeu1 to: etc/align.dbs etc/hgwdev.dbs git add etc/align.dbs etc/hgwdev.dbs git commit -m "Added nomLeu1 - Gibbon" etc/align.dbs etc/hgwdev.dbs git pull git push make etc-update ############################################################################ # nomLeu1 - Gibbon - Ensembl Genes version 62 (DONE - 2011-04-19 - Hiram) ssh hgwdev cd /hive/data/genomes/nomLeu1 cat << '_EOF_' > nomLeu1.ensGene.ra # required db variable db nomLeu1 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With single quotes to protect # everything in perl nameTranslation 's/^GL\([0-9][0-9]*\).1/GL\1/; s/^ADFV\([0-9][0-9]*\).1/ADFV\1/;' '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=62 nomLeu1.ensGene.ra ssh hgwdev cd /hive/data/genomes/nomLeu1/bed/ensGene.62 featureBits nomLeu1 ensGene # 45293349 bases of 2756591777 (1.643%) in intersection hgsql -e \ 'update trackVersion set dateReference="current" where db="nomLeu1";' hgFixed ############################################################################ # lastz swap from Gorilla gorGor3 (DONE - 2011-10-27 - Hiram) # original alignment on gorilla cd /hive/data/genomes/gorGor3/bed/lastzNomLeu1.2011-10-21 cat fb.gorGor3.chainNomLeu1Link.txt # 2356321697 bases of 2822760080 (83.476%) in intersection # running the swap mkdir /hive/data/genomes/nomLeu1/bed/blastz.gorGor3.swap cd /hive/data/genomes/nomLeu1/bed/blastz.gorGor3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/gorGor3/bed/lastzNomLeu1.2011-10-21/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 74m9.554s cat fb.nomLeu1.chainGorGor3Link.txt # 2318822567 bases of 2756591777 (84.119%) in intersection cd /hive/data/genomes/nomLeu1/bed ln -s blastz.gorGor3.swap lastz.gorGor3 ########################################################################### # lastz swap Human hg19 (DONE - 2011-11-08 - Chin) # original alignment cd /hive/data/genomes/hg19/bed/lastzNomLeu1.2011-11-04 cat fb.hg19.chainNomLeu1Link.txt # 2543943556 bases of 2897316137 (87.803%) in intersection # running the swap mkdir /hive/data/genomes/nomLeu1/bed/blastz.hg19.swap cd /hive/data/genomes/nomLeu1/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzNomLeu1.2011-11-04/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 69m39.685s cat fb.nomLeu1.chainHg19Link.txt # 2480558770 bases of 2756591777 (89.986%) in intersection cd /hive/data/genomes/nomLeu1/bed ln -s blastz.hg19.swap lastz.hg19 ######################################################################### # set gibbon/trackDb.chainNet.ra after last lastz run (DONE 2011-11-17 - Chin) # put the output from chainNet.pl and findScore.pl result in the ra file chainNet.pl nomLeu1 # track chainNetGorGor3 override # priority 210.3 # track chainNetHg19 override # priority 230.3 findScores.pl hg19 nomLeu1 # looking in file: # /hive/data/genomes/hg19/bed/lastz.nomLeu1/axtChain/run/chain.csh # -scoreScheme=/scratch/data/blastz/human_chimp.v2.q # matrix 16 90,-330,-236,-356,-330,100,-318,-236,-236,-318,100,-330,-356,-236,-330,90 # -minScore=5000 # -linearGap=medium ######################################################################### # all.joiner update, downloads and in pushQ (DONE 2011-11-10 - Chin) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=nomLeu1 -all all.joiner mkdir /hive/data/genomes/nomLeu1/goldenPath cd /hive/data/genomes/nomLeu1/goldenPath makeDownloads.pl nomLeu1 > do.log 2>&1 # now ready for pushQ entry mkdir /hive/data/genomes/nomLeu1/pushQ cd /hive/data/genomes/nomLeu1/pushQ makePushQSql.pl nomLeu1 > nomLeu1.pushQ.sql 2> stderr.out # check for errors in stderr.out, some are OK, e.g.: # WARNING: hgwdev does not have /gbdb/nomLeu1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/nomLeu1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/nomLeu1/bbi/quality.bw # WARNING: nomLeu1 does not have seq # WARNING: nomLeu1 does not have extFile # # WARNING: Could not tell (from trackDb, all.joiner and hardcoded # lists of # supporting and genbank tables) which tracks to assign these tables # to: # tableList # copy it to hgwbeta scp -p nomLeu1.pushQ.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < nomLeu1.pushQ.sql # in that pushQ entry walk through each entry and see if the # sizes will set properly ######################################################################### # BLATSERVERS ENTRY (DONE - 2011-11-17 - Chin) # this has been requested through Steve H. A blat has been started for # nomLeu1 on the host blat1 on ports 17812 (trans) and 17813 (untrans). # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("nomLeu1", "blat1", "17812", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("nomLeu1", "blat1", "17813", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # construct liftOver to nomLeu2 (DONE - 2012-11-14 - Hiram) screen -S nomLeu2 # manage this longish running job in a screen mkdir /hive/data/genomes/nomLeu1/bed/blat.nomLeu2.2012-11-14 cd /hive/data/genomes/nomLeu1/bed/blat.nomLeu2.2012-11-14 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/hive/data/genomes/nomLeu1/jkStuff/nomLeu1.11.ooc \ -debug -dbHost=hgwdev -workhorse=hgwdev nomLeu1 nomLeu2 # real 0m1.838s # if that is OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/hive/data/genomes/nomLeu1/jkStuff/nomLeu1.11.ooc \ -dbHost=hgwdev -workhorse=hgwdev nomLeu1 nomLeu2 > do.log 2>&1 # real 277m26.138s # verify this file exists: # /gbdb/nomLeu1/liftOver/nomLeu1ToNomLeu2.over.chain.gz # and try out the conversion on genome-test from nomLeu1 to nomLeu2 ############################################################################