# Tupaia belangeri -- 2X Tree Shrew 2X assembly from the Broad
# Assembly named:  tupBel1
# Sequence downloaded and masked by Robert Baertsch, 12/06

#	http://www.ncbi.nlm.nih.gov/bioproject/13971
#	http://www.ncbi.nlm.nih.gov/genome/473
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAPY00

################################################

# DOWNLOADS (2007-06-05 kate)
#  Posted masked sequence to download site since
#  it is used in the 28way multiz on hg18

    ssh kkstore05
    cd /cluster/data/tupBel1
    mkdir bigZips
    cd bigZips
    nice twoBitToFa ../tupBel1.2bit tupBel1.fa
    nice gzip tupBel1.fa
    md5sum *.gz > md5sum.txt

    ssh hgwdev
    set d = /usr/local/apache/htdocs/goldenPath
    set bd = /cluster/data/tupBel1
    cp $d/sorAra1/bigZips/README.txt $bd/bigZips
    # EDIT
    mkdir -p $d/tupBel1/bigZips
    ln -s $bd/bigZips/{*.gz,md5sum.txt,README.txt} $d/tupBel1/bigZips

############################################################################
##  BLASTZ swap from mm9 alignments (2007-11-11 - markd)
# ERROR:  chainMm9Link failed, build not complete
    ssh hgwdev
    mkdir /cluster/data/tupBel1/bed/blastz.mm9.swap
    cd /cluster/data/tupBel1/bed/blastz.mm9.swap
    ln -s blastz.mm9.swap ../blastz.mm9
    /cluster/bin/scripts/doBlastzChainNet.pl \
        -swap /cluster/data/mm9/bed/blastz.tupBel1/DEF >& swap.out&

    # GOT ERROR:
    load data local infile 'link.tab' into table chainMm9Link
    mySQL error 1114: The table 'chainMm9Link' is full

    # fb.tupBel1.chainMm9Link.txt:
    #
    # fixing the broken tables:
    cd /hive/data/genomes/tupBel1/bed/blastz.mm9.swap/axtChain

    # fixups done to finish: Thu May 10 19:18:48 PDT 2012 - Hiram
    #	this failed during the load because the chainLink table became
    #	too large.  These were loaded manually with an sql statement to
    #	start the table definition, then a load data local infile using
    #	the .tab files left over from the failed load.  Note the extra
    #	definitions on the chainMm9Link table
    # need max length of tName for tName KEY size:
    cut -f3 chain.tab | awk '{print length($0)}' | sort -rn | head

CREATE TABLE chainMm9 (
  bin smallint(5) unsigned NOT NULL default '0',
    score double not null,	# score of chain
    tName varchar(255) not null,	# Target sequence name
    tSize int unsigned not null,	# Target sequence size
    tStart int unsigned not null,	# Alignment start position in target
    tEnd int unsigned not null,	# Alignment end position in target
    qName varchar(255) not null,	# Query sequence name
    qSize int unsigned not null,	# Query sequence size
    qStrand char(1) not null,	# Query strand
    qStart int unsigned not null,	# Alignment start position in query
    qEnd int unsigned not null,	# Alignment end position in query
    id int unsigned not null,	# chain id
              #Indices
  KEY tName (tName(25),bin),
  KEY id (id)
) TYPE=MyISAM;

    hgsql -e "drop table chainMm9;" tupBel1
    hgsql tupBel1 < chainMm9.sql

    time nice -n +19 hgsql -e \
	"load data local infile \"chain.tab\" into table chainMm9;" tupBel1
    #	real    1m2.412s

CREATE TABLE chainMm9Link (
  bin smallint(5) unsigned NOT NULL default 0,
  tName varchar(255) NOT NULL default '',
  tStart int(10) unsigned NOT NULL default 0,
  tEnd int(10) unsigned NOT NULL default 0,
  qStart int(10) unsigned NOT NULL default 0,
  chainId int(10) unsigned NOT NULL default 0,
  KEY tName (tName(25),bin),
  KEY chainId (chainId)
) ENGINE=MyISAM max_rows=160000000 avg_row_length=55 pack_keys=1 CHARSET=latin1;

    time nice -n +19 hgsql -e \
      "load data local infile \"link.tab\" into table chainMm9Link;" tupBel1
    #   real    14m51.892s

    # finish off the nets:
    time netClass -verbose=0   -noAr noClass.net tupBel1 mm9 tupBel1.mm9.net
    #   real    5m1.569s

    time netFilter -minGap=10 tupBel1.mm9.net \
        | hgLoadNet -verbose=0 tupBel1 netMm9 stdin
    #   real    1m49.538s

    cd /hive/data/genomes/tupBel1/bed/blastz.mm9.swap
    time featureBits tupBel1 chainMm9Link > fb.tupBel1.chainMm9Link.txt 2>&1
    #   real    12m53.574s
    cat fb.tupBel1.chainMm9Link.txt
    #   639266386 bases of 2137225476 (29.911%) in intersection

    # the DEF file was so old, it had invalid path names
    cp /hive/data/genomes/mm9/bed/blastz.tupBel1/DEF .
    # fix path name:
    #   SEQ2_DIR=/hive/data/genomes/tupBel1/tupBel1.2bit

    /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
        -continue=download -swap > download.out 2>&1
    #   Elapsed time: 1m24s

############################################################################
##  BLASTZ swap from hg18 alignments (2007-11-11 - markd)
    ssh hgwdev
    mkdir /cluster/data/tupBel1/bed/blastz.hg18.swap
    cd /cluster/data/tupBel1/bed/blastz.hg18.swap
    ln -s blastz.hg18.swap ../blastz.hg18
    /cluster/bin/scripts/doBlastzChainNet.pl \
        -swap /cluster/data/hg18/bed/blastz.tupBel1/DEF >& swap.out&
# crashed: could be out of memory

    # fb.tupBel1.chainHg18Link.txt:
    #

#########################################################################
## Repeat Masker (WORKING - 2008-10-14 - Hiram)
    screen	# to manage this several day job
    mkdir /hive/data/genomes/tupBel1/bed/repeatMasker
    cd /hive/data/genomes/tupBel1/bed/repeatMasker
    time doRepeatMasker.pl -workhorse=hgwdev -bigClusterHub=swarm \
	-buildDir=`pwd` tupBel1 > do.log 2>&1 &
    #	real    660m58.642s
    #	failed in doMask.csh due to no tupBel1.unmasked.2bit file ?
    #	which was there at the beginning, now it is gone ?
    #	the broken doRepeatMasker.pl script removed it due to hive confusion
    #	went back to the jkStuff/makeUnmasked script and re-ran part of that.
    #	then ran the doMask.csh script to finish that, and continuing:
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-continue=install -workhorse=hgwdev -bigClusterHub=swarm \
        -buildDir=`pwd` tupBel1 > install.log 2>&1 &
    #	real    23m37.870s
    twoBitToFa tupBel1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt
# 3660774957 bases (1523549481 N's 2137225476 real 1701215604 upper 436009872
# lower) in 150851 sequences in 1 files
# %11.91 masked total, %20.40 masked real

###########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-10-15 - Hiram)
    screen # use a screen to manage this job
    mkdir /cluster/data/tupBel1/bed/simpleRepeat
    cd /cluster/data/tupBel1/bed/simpleRepeat
    #
    time  $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-buildDir=/cluster/data/tupBel1/bed/simpleRepeat \
	tupBel1 > do.log 2>&1 &
    #	real    81m6.278s
    cat fb.simpleRepeat
    #	77116433 bases of 2137225476 (3.608%) in intersection

    #	after RM run is done, add this mask:
    cd /cluster/data/tupBel1
    rm tupBel1.2bit
    twoBitMask tupBel1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed tupBel1.2bit
    #	can safely ignore warning about >=13 fields in bed file

    twoBitToFa tupBel1.2bit stdout | faSize stdin > tupBel.2bit.faSize.txt
# 3660774957 bases (1523549481 N's 2137225476 real 1700569122 upper 436656354
# lower) in 150851 sequences in 1 files
#	%11.93 masked total, %20.43 masked real

    #	link to gbdb
    ln -s `pwd`/tupBel1.2bit /gbdb/tupBel1

###########################################################################
# prepare for kluster runs (DONE _ 2008-10-15 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	turTru1: 2137225476
    # thus: 1024 * 2137225476/2881421696 = 759
    #	rounding up to 800 to be conservative
    cd /hive/data/genomes/tupBel1
    time blat tupBel1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=tupBel1.11.ooc -repMatch=800
    #	real    2m18.283s
    #	Wrote 27284 overused 11-mers to tupBel1.11.ooc
    #	and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/tupBel1
    cp -p tupBel1.2bit chrom.sizes tupBel1.11.ooc \
	/hive/data/staging/data/tupBel1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

############################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
    hgsql -e 'update dbDb set
sourceName="Broad Institute tupBel1 (NCBI project 13971, AAPY01000000)" where name="tupBel1";' hgcentraltest

############################################################################
#  tupBel1 - TreeShrew - Ensembl Genes version 51  (DONE - 2008-12-03 - hiram)
    ssh kkr14u07
    cd /hive/data/genomes/tupBel1
    cat << '_EOF_' > tupBel1.ensGene.ra
# required db variable
db tupBel1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# after geneScaffold conversions, change Ensembl chrom names to UCSC names
liftUp /cluster/data/tupBel1/jkStuff/ensGene.lft
# ignore genes that do not properly convert to a gene pred, and contig
#	names that are not in the UCSC assembly
skipInvalid yes
# ignore the two genes that have invalid structures from Ensembl:
# 2993: ENSTBET00000015831 no exonFrame on CDS exon 11
# 3556: ENSTBET00000013522 no exonFrame on CDS exon 1
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 tupBel1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/tupBel1/bed/ensGene.51
    featureBits tupBel1 ensGene
    # 22746299 bases of 2137225476 (1.064%) in intersection

 *** All done!  (through the 'makeDoc' step)
 *** Steps were performed in /hive/data/genomes/tupBel1/bed/ensGene.51

############################################################################
# SWAP lastz mm10 (DONE - 2012-03-19 - Hiram)
    # original alignment to mm10
    cat /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16/fb.mm10.chainTupBel1Link.txt
    #	524337666 bases of 2652783500 (19.766%) in intersection

    # and this swap
    mkdir /hive/data/genomes/tupBel1/bed/blastz.mm10.swap
    cd /hive/data/genomes/tupBel1/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    136m7.163s
    cat fb.tupBel1.chainMm10Link.txt
    #	537379661 bases of 2137225476 (25.144%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/tupBel1/bed
    ln -s blastz.mm10.swap lastz.mm10

##############################################################################
# cpgIslands - (DONE - 2011-04-24 - Hiram)
    mkdir /hive/data/genomes/tupBel1/bed/cpgIslands
    cd /hive/data/genomes/tupBel1/bed/cpgIslands
    time doCpgIslands.pl tupBel1 > do.log 2>&1
    #   real    296m4.437s

    cat fb.tupBel1.cpgIslandExt.txt
    #   23173299 bases of 2137225476 (1.084%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-27 - Hiram)
    mkdir /hive/data/genomes/tupBel1/bed/genscan
    cd /hive/data/genomes/tupBel1/bed/genscan
    time doGenscan.pl tupBel1 > do.log 2>&1
    #   real    60m14.339s
    # failed kluster run due to out of disk space
    # finished manually:
# Completed: 150851 of 150851 jobs
# CPU time in finished jobs:      51736s     862.26m    14.37h    0.60d  0.002 y
# IO & Wait Time:               1645536s   27425.61m   457.09h   19.05d  0.052 y
# Average job time:                  11s       0.19m     0.00h    0.00d
# Longest finished job:              92s       1.53m     0.03h    0.00d
# Submission to last job:        241720s    4028.67m    67.14h    2.80d

    # continuing:
    time doGenscan.pl -workhorse=hgwdev -continue=makeBed tupBel1 \
        > makeBed.log 2>&1
    #   real    83m4.947s

    cat fb.tupBel1.genscan.txt
    #   57907224 bases of 2137225476 (2.709%) in intersection
    cat fb.tupBel1.genscanSubopt.txt
    #   76071378 bases of 2137225476 (3.559%) in intersection

#########################################################################
# windowMasker - (DONE - 2012-05-02 - Hiram)
    screen -S tupBel1
    mkdir /hive/data/genomes/tupBel1/bed/windowMasker
    cd /hive/data/genomes/tupBel1/bed/windowMasker
    # trying out new version of the script that does all the usual steps
    #   that used to be performed manually after it was done
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev tupBel1 > do.log 2>&1
    #   real    989m54.597s
    # fixing a broken command in the doLoad step:
    time ./lastLoad.csh
    #   real    7m1.961s
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -continue=cleanup -workhorse=hgwdev -buildDir=`pwd` \
        -dbHost=hgwdev tupBel1 > cleanup.log 2>&1
    #   real    2m10.270s

    sed -e 's/^/    #\t/' fb.tupBel1.windowmaskerSdust.beforeClean.txt
    #   2375760953 bases of 3660774957 (64.898%) in intersection
    sed -e 's/^/    #\t/' fb.tupBel1.windowmaskerSdust.clean.txt
    #   852211472 bases of 3660774957 (23.280%) in intersection
    sed -e 's/^/    #\t/' fb.tupBel1.rmsk.windowmaskerSdust.txt
    #   191436221 bases of 3660774957 (5.229%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-08 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2137225476 / 2897316137 \) \* 1024
    #	( 2137225476 / 2897316137 ) * 1024 = 755.360749

    # round up to 800

    cd /hive/data/genomes/tupBel1
    time blat tupBel1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/tupBel1.11.ooc -repMatch=800
    #   Wrote 27284 overused 11-mers to jkStuff/tupBel1.11.ooc
    #   real    0m55.398s

    # there are non-bridged gaps, construct lift file needed for genbank
    hgsql -N -e "select bridge from gap;" tupBel1 | sort | uniq -c
    #   95346 no
    #   594809 yes
    cd /hive/data/genomes/tupBel1/jkStuff
    gapToLift tupBel1 tupBel1.nonBridged.lift -bedFile=tupBel1.nonBridged.bed
    # largest non-bridged contig:
    awk '{print $3-$2,$0}' tupBel1.nonBridged.bed | sort -nr | head
    #   301412 scaffold_113710.1-440055 109313  410725 scaffold_113710.1-440055.02

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Tupaia belangeri	63	2365	0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add tupBel1

# tupBel1 (tree shrew)
tupBel1.serverGenome = /hive/data/genomes/tupBel1/tupBel1.2bit
tupBel1.clusterGenome = /scratch/data/tupBel1/tupBel1.2bit
tupBel1.ooc = /scratch/data/tupBel1/tupBel1.11.ooc
tupBel1.lift = /hive/data/genomes/tupBel1/jkStuff/tupBel1.nonBridged.lift
tupBel1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
tupBel1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
tupBel1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
tupBel1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
tupBel1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
tupBel1.refseq.mrna.native.load = no
tupBel1.refseq.mrna.xeno.load = yes
tupBel1.genbank.mrna.xeno.load = yes
tupBel1.genbank.est.native.load = yes
tupBel1.downloadDir = tupBel1
tupBel1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding tupBel1 tree shrew" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for tupBelNames" src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S tupBel1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial tupBel1 &
    #   var/build/logs/2012.06.08-10:03:29.tupBel1.initalign.log
    #   real    2861m50.257s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad tupBel1 &
    #	logFile:  var/dbload/hgwdev/logs/2012.06.11-17:15:26.dbload.log
    #   real    27m16.356s

    # enable daily alignment and update of hgwdev (DONE - 2012-05-12 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add tupBel1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added tupBel1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-26 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="scaffold_149294.1-299378:275786-282564" where name="tupBel1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-26 - Hiram)
    mkdir /hive/data/genomes/tupBel1/pushQ
    cd /hive/data/genomes/tupBel1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl tupBel1 2> stderr.txt | grep -v transMap > tupBel1.sql
    #   real    3m48.507s
    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/tupBel1/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/tupBel1/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/tupBel1/bbi/quality.bw
# WARNING: tupBel1 does not have seq
# WARNING: tupBel1 does not have extFile
# WARNING: tupBel1 does not have estOrientInfo

    scp -p tupBel1.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/tupBel1.sql"

############################################################################