# for emacs: -*- mode: sh; -*-

#       $Id: nomLeu1.txt,v 1.6 2010/05/06 16:27:44 chinhli Exp $

# Nomascus leucogenys (gibbon) --  GGSC Nleu1.0 (2010-01-29)


# file template copied from susScr2.txt

# Nomascus leucogenys (NCBI project 13974, accession GCA_000146795.1)
#   by  Gibbon Genome Sequencing Consortium
#    assembly] sequence: 
# ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Nomascus_leucogenys/Nleu1.0/

#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ADFV00


##########################################################################
# Download sequence (DONE 2010-10-22 - Chin)
    mkdir /hive/data/genomes/nomLeu1
    cd /hive/data/genomes/nomLeu1
    mkdir genbank
    cd genbank
    wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
        --no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Nomascus_leucogenys/Nleu1.0/"
    # FINISHED --2010-10-27 16:14:02--
    # Downloaded: 12 files, 1.1G in 2m 51s (6.49 MB/s)
    # Read ASSEMBLY_INFO 

    # stay at genbank directory
    # Process the unplaced scaffolds, filter out the  
    #   The ".1" at the end (i.e. ABQO010000034.1) of contig name, since
    #   MySQL does not allow "." as part of the table name and 
    #   will casue problems in genbank task step later

    export S=Primary_Assembly/unplaced_scaffolds
    zcat ${S}/AGP/unplaced.scaf.agp.gz | grep "^#" > nomLeu1.agp
    # append the gap records
    zcat ${S}/AGP/unplaced.scaf.agp.gz | grep -v "^#" \
            | sed  -e "s/\.1//"  >> nomLeu1.agp
    gzip nomLeu1.agp &

    zcat ${S}/FASTA/unplaced.scaf.fa.gz \
            | sed -e "s#^>.*|gb|#>#; s#|.*##"  -e "s/\.1//"  \
            | gzip > nomLeu1.fa.gz
    zcat nomLeu1.fa.gz | grep "^>" | wc -l
    #   17968

   faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
   # 2936035333 bases (179443556 N's 2756591777 real 2756591777 upper 0 lower)
   #  in 17968 sequences in 1 files

   # N50
   mkdir N50
   zcat  nomLeu1.fa.gz | faCount stdin | awk '/^(GL|ADFV)/ {print $1, $2}' > N50/chrom.sizes
   n50.pl N50/chrom.sizes
    # reading: N50/chrom.sizes
    # contig count: 17968, total size: 2936035333, one half size: 1468017666
    # cumulative    N50 count       contig  contig size
    # 1461968777      40      GL397300        22842388
    # 1468017666 one half size
    # 1484660812      41      GL397301        22692035

#########################################################################
# Initial makeGenomeDb.pl (DONE - 2010-10-29 - Chin)
    cd /hive/data/genomes/nomLeu1
    cat << '_EOF_' > nomLeu1.config.ra
# Config parameters for makeGenomeDb.pl:
db nomLeu1
clade mammal
genomeCladePriority 13
scientificName Nomascus leucogenys
commonName Gibbon
assemblyDate Jan. 2010
assemblyLabel GGSC (NCBI project 13974, accession GCA_000146795.1)
assemblyShortLabel GGSC Nleu1.0
orderKey 33
mitoAcc none
fastaFiles /hive/data/genomes/nomLeu1/genbank/nomLeu1.fa.gz
agpFiles /hive/data/genomes/nomLeu1/genbank/nomLeu1.agp.gz
# qualFiles none
dbDbSpeciesDir gibbon
taxId 61853
'_EOF_'
    # << happy emacs
    time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev nomLeu1.config.ra \
	> makeGenomeDb.log 2>&1 &
    # real    23m53.122s
    
    # add the trackDb entries to the source tree, and the 2bit link:
    ln -s `pwd`/nomLeu1.unmasked.2bit /gbdb/nomLeu1/nomLeu1.2bit
    #  Per instructions in makeGenomeDb.log:
    # cd ~/kent/src/hg/makeDb/trackDb
    # edit makefile to add nomLeu1 to DBS.
    # git add gibbon/nomLeu1/*.{ra,html}
    # git commit -m "Added nomLeu1 to DBS." makefile
    # git commit -m "Initial descriptions for nomLeu1." gibbon/nomLeu1/*.{ra,html}
    #  git pull; git push
    # Run make update DBS=nomLeu1 and make alpha when done.
    # (optional) Clean up /cluster/data/nomLeu1/TemporaryTrackDbCheckout


#########################################################################
# RepeatMasker (DONE - 2010-10-30 - Chin)
    mkdir /hive/data/genomes/nomLeu1/bed/repeatMasker
    cd /hive/data/genomes/nomLeu1/bed/repeatMasker

    time nice -n +19 doRepeatMasker.pl -buildDir=`pwd` \
	-workhorse=hgwdev -bigClusterHub=swarm -noSplit nomLeu1 > do.log 2>&1 &
    #   real    429m4.089s
    cat faSize.rmsk.txt
    # 2936035333 bases (179443556 N's 2756591777 real 1356684256 upper 
    # 1399907521 lower) in 17968 sequences in 1 files
    # %47.68 masked total, %50.78 masked real

    #bug 5557 fixed up (2011-11-16 Chin)
    cd /hive/data/genomes/nomLeu1/bed/repeatMasker
    head -3 nomLeu1.fa.out > nomLeu1.sorted.fa.out
    tail -n +4 nomLeu1.fa.out | sort -k5,5 -k6,6n >> nomLeu1.sorted.fa.out
    hgLoadOut -table=rmsk -nosplit nomLeu1 nomLeu1.sorted.fa.out
    # Loading up table rmsk
    # note: 2 records dropped due to repStart > repEnd
    #   run with -verbose=2 for details

    hgLoadOut -table=rmsk -nosplit -verbose=2 nomLeu1 nomLeu1.sorted.fa.out
    #	hgLoadOut: connected to database: nomLeu1
    # bad rep range [5563, 5562] line 2793949 of nomLeu1.sorted.fa.out 
    # bad rep range [3599, 3598] line 3970033 of nomLeu1.sorted.fa.out 
    # Loading up table rmsk
    # note: 2 records dropped due to repStart > repEnd


#########################################################################
# simpleRepeats ( DONE 2010-10-31 - Chin)
    mkdir /hive/data/genomes/nomLeu1/bed/simpleRepeat
    cd /hive/data/genomes/nomLeu1/bed/simpleRepeat

    time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \
	-bigClusterHub=swarm -smallClusterHub=swarm nomLeu1 > do.log 2>&1 &
    # real    39m4.732s    
    cat fb.simpleRepeat
    # 122433828 bases of 2756591777 (4.441%) in intersection  
    #	add to the repeatMasker
    cd /hive/data/genomes/nomLeu1
    twoBitMask nomLeu1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed nomLeu1.2bit
    #	safe to ignore warnings about >=13 fields
    twoBitToFa nomLeu1.2bit stdout | faSize stdin > nomLeu1.2bit.faSize.txt
    cat nomLeu1.2bit.faSize.txt
    # 2936035333 bases (179443556 N's 2756591777 real 1355629718 upper 
    # 1400962059 lower) in 17968 sequences in 1 files
    # %47.72 masked total, %50.82 masked real

    rm /gbdb/nomLeu1/nomLeu1.2bit
    ln -s `pwd`/nomLeu1.2bit /gbdb/nomLeu1/nomLeu1.2bit

#########################################################################
# Marking *all* gaps - they are all in the AGP file
#	(DONE - 2010-10-31 - Chin)
    mkdir /hive/data/genomes/nomLeu1/bed/allGaps
    cd /hive/data/genomes/nomLeu1/bed/allGaps

    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../nomLeu1.unmasked.2bit > findMotif.txt 2>&1
    # real    0m47.822s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits nomLeu1 -not gap -bed=notGap.bed
    # 0 bases of 2756591777 (0.000%) in intersection
    
    featureBits nomLeu1 allGaps.bed notGap.bed -bed=new.gaps.bed
    #   0 bases of 2756591777 (0.000%) in intersection
    #   they are all in the AGP file 
    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" nomLeu1 | sort -n | tail -1
    #	8318


########################################################################
# Create kluster run files (working - 2010-11-01 - Chin)
    # numerator is nomLeu1 gapless bases "real" as reported by: 
    featureBits -noRandom -noHap nomLeu1 gap
    # 179443556 bases of 2756591777 (6.510%) in intersection

    # denominator is hg19 gapless bases as reported by:
    #	featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \(  2756591777  / 2861349177 \) \* 1024
    # ( 2756591777 / 2861349177 ) * 1024 = 986.510141
    # ==> use -repMatch=400 according to size scaled down from 1024 for human.
    #	and rounded down to nearest 50 (in this case, 900)
    cd /hive/data/genomes/nomLeu1
    blat nomLeu1.2bit \
	 /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/nomLeu1.11.ooc \
	-repMatch=900 &
    # Loading nomLeu1.2bit
    # Counting nomLeu1.2bit
    # Writing jkStuff/nomLeu1.11.ooc
    # Wrote 36558 overused 11-mers to jkStuff/nomLeu1.11.ooc
    # Done making jkStuff/nomLeu1.11.ooc

    mkdir /hive/data/staging/data/nomLeu1
    cp -p nomLeu1.2bit jkStuff/nomLeu1.11.ooc /hive/data/staging/data/nomLeu1
    cp -p chrom.sizes /hive/data/staging/data/nomLeu1
    #	check  for non-bridged gaps to see what the typical size is:
    hgsql -e "select bridge from gap;" nomLeu1 | sort | uniq
    #  bridge
    #  yes
    # all gap are bridged, done
    
    # ask cluster-admin to copy (evry time if any file chsnged)
    #    /hive/data/staging/data/nomLeu1 directory to cluster nodes
    #    /scratch/data/nomLeu1

########################################################################
# GENBANK AUTO UPDATE (DONE 2010-11-01 - Chin)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull

    # edit etc/genbank.conf to add nomLeu1 just after ponAbe2

# nomLeu1 (Gibbon)
nomLeu1.serverGenome = /hive/data/genomes/nomLeu1/nomLeu1.2bit
nomLeu1.clusterGenome = /scratch/data/nomLeu1/nomLeu1.2bit
nomLeu1.ooc = /scratch/data/nomLeu1/nomLeu1.11.ooc
nomLeu1.lift = no
nomLeu1.perChromTables = no
nomLeu1.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
nomLeu1.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
nomLeu1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
nomLeu1.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
nomLeu1.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
nomLeu1.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
nomLeu1.downloadDir = nomLeu1
nomLeu1.refseq.mrna.native.load  = no
nomLeu1.refseq.mrna.xeno.load = yes
nomLeu1.refseq.mrna.xeno.loadDesc  = yes
nomLeu1.genbank.mrna.native.load = yes
nomLeu1.genbank.mrna.native.loadDesc = yes
nomLeu1.genbank.mrna.xeno.load = yes
nomLeu1.genbank.mrna.xeno.loadDesc = yes
nomLeu1.genbank.est.native.load = no
nomLeu1.genbank.est.native.loadDesc = no

    git add etc/genbank.conf
    git commit -m "Added nomLeu1" etc/genbank.conf
    git pull
    git push
    # update /cluster/data/genbank/:
    make etc-update

# Edit src/lib/gbGenome.c to add new species.  With these two lines:
# static char *nomLeuNames[] = {"Nomascus leucogenys", NULL};
#   ... later ...
#    {"nomLeu", nomLeuNames},
#  gbGenome.c is  in
#  /cluster/home/chinhli/kent/src/hg/makeDb/genbank/src/lib
# make and checkin

    git add src/lib/gbGenome.c
    git commit -m "adding nomLeu1 Gibbon" src/lib/gbGenome.c
    git pull
    git push
    make install-server

    ssh genbank
    screen	#  control this business with a screen since it takes a while


    cd /cluster/data/genbank
     time nice -n +19 ./bin/gbAlignStep -initial nomLeu1 &
    #   logFile: var/build/logs/2010.11.02-08:51:08.nomLeu1.initalign.log
    #   real    181m41.973s
    #   To re-do, rm the dir first:
    #     /cluster/data/genbank/data/aligned/genbank.180.0/nomLeu1

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad nomLeu1 &
    # logFile: var/dbload/hgwdev/logs/2011.02.07-18:36:56.dbload.log
    #  real    19m51.159s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add nomLeu1 to:
        etc/align.dbs
        etc/hgwdev.dbs
    git add etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added nomLeu1 - Gibbon" etc/align.dbs etc/hgwdev.dbs
    git pull
    git push
    make etc-update


############################################################################
#  nomLeu1 - Gibbon - Ensembl Genes version 62  (DONE - 2011-04-19 - Hiram)
    ssh hgwdev
    cd /hive/data/genomes/nomLeu1
    cat << '_EOF_' > nomLeu1.ensGene.ra
# required db variable
db nomLeu1
# optional nameTranslation, the sed command that will transform
#       Ensemble names to UCSC names.  With single quotes to protect
#       everything in perl
nameTranslation 's/^GL\([0-9][0-9]*\).1/GL\1/; s/^ADFV\([0-9][0-9]*\).1/ADFV\1/;'
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl  -ensVersion=62 nomLeu1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/nomLeu1/bed/ensGene.62
    featureBits nomLeu1 ensGene
    # 45293349 bases of 2756591777 (1.643%) in intersection

    hgsql -e \
'update trackVersion set dateReference="current" where db="nomLeu1";' hgFixed

############################################################################
# lastz swap from Gorilla gorGor3 (DONE - 2011-10-27 - Hiram)
    # original alignment on gorilla
    cd /hive/data/genomes/gorGor3/bed/lastzNomLeu1.2011-10-21
    cat fb.gorGor3.chainNomLeu1Link.txt
    #	2356321697 bases of 2822760080 (83.476%) in intersection

    #	running the swap
    mkdir /hive/data/genomes/nomLeu1/bed/blastz.gorGor3.swap
    cd /hive/data/genomes/nomLeu1/bed/blastz.gorGor3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/gorGor3/bed/lastzNomLeu1.2011-10-21/DEF \
	-swap -syntenicNet \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    74m9.554s
    cat fb.nomLeu1.chainGorGor3Link.txt 
    #	2318822567 bases of 2756591777 (84.119%) in intersection

    cd /hive/data/genomes/nomLeu1/bed
    ln -s blastz.gorGor3.swap lastz.gorGor3

###########################################################################
# lastz swap Human hg19 (DONE - 2011-11-08 - Chin)

    # original alignment
    cd /hive/data/genomes/hg19/bed/lastzNomLeu1.2011-11-04
    cat fb.hg19.chainNomLeu1Link.txt
    #   2543943556 bases of 2897316137 (87.803%) in intersection


    #   running the swap
    mkdir /hive/data/genomes/nomLeu1/bed/blastz.hg19.swap
    cd /hive/data/genomes/nomLeu1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzNomLeu1.2011-11-04/DEF \
        -swap -syntenicNet \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        > swap.log 2>&1 &
    #   real    69m39.685s
    cat fb.nomLeu1.chainHg19Link.txt
    #   2480558770 bases of 2756591777 (89.986%) in intersection

    cd /hive/data/genomes/nomLeu1/bed
    ln -s blastz.hg19.swap lastz.hg19

#########################################################################
# set gibbon/trackDb.chainNet.ra after last lastz run (DONE 2011-11-17 - Chin)
# put the output from chainNet.pl and findScore.pl result in the ra file
    chainNet.pl nomLeu1
    # track chainNetGorGor3 override
    # priority 210.3

    # track chainNetHg19 override
    # priority 230.3

    findScores.pl hg19 nomLeu1
    # looking in file:
    #   /hive/data/genomes/hg19/bed/lastz.nomLeu1/axtChain/run/chain.csh
    # -scoreScheme=/scratch/data/blastz/human_chimp.v2.q
    # matrix 16 90,-330,-236,-356,-330,100,-318,-236,-236,-318,100,-330,-356,-236,-330,90
    # -minScore=5000
    # -linearGap=medium


#########################################################################
# all.joiner update, downloads and in pushQ (DONE 2011-11-10 - Chin)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=nomLeu1 -all all.joiner

    mkdir /hive/data/genomes/nomLeu1/goldenPath
    cd /hive/data/genomes/nomLeu1/goldenPath
    makeDownloads.pl nomLeu1 > do.log 2>&1

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/nomLeu1/pushQ
    cd /hive/data/genomes/nomLeu1/pushQ
    makePushQSql.pl nomLeu1 > nomLeu1.pushQ.sql 2> stderr.out
    #   check for errors in stderr.out, some are OK, e.g.:
    # WARNING: hgwdev does not have /gbdb/nomLeu1/wib/gc5Base.wib
    # WARNING: hgwdev does not have /gbdb/nomLeu1/wib/quality.wib
    # WARNING: hgwdev does not have /gbdb/nomLeu1/bbi/quality.bw
    # WARNING: nomLeu1 does not have seq
    # WARNING: nomLeu1 does not have extFile
    # 
    # WARNING: Could not tell (from trackDb, all.joiner and hardcoded
    # lists of
    # supporting and genbank tables) which tracks to assign these tables
    # to:
    #   tableList

    #   copy it to hgwbeta
    scp -p nomLeu1.pushQ.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < nomLeu1.pushQ.sql
    #   in that pushQ entry walk through each entry and see if the
    #   sizes will set properly


#########################################################################
#  BLATSERVERS ENTRY (DONE - 2011-11-17 - Chin)
#  this has been requested through Steve H. A blat has been started for
#  nomLeu1 on the host blat1 on ports 17812 (trans) and 17813 (untrans).
#       After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("nomLeu1", "blat1", "17812", "1", "0"); \
        INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("nomLeu1", "blat1", "17813", "0", "1");' \
            hgcentraltest
    #   test it with some sequence

############################################################################
# construct liftOver to nomLeu2 (DONE - 2012-11-14 - Hiram)
    screen -S nomLeu2	# manage this longish running job in a screen
    mkdir /hive/data/genomes/nomLeu1/bed/blat.nomLeu2.2012-11-14
    cd /hive/data/genomes/nomLeu1/bed/blat.nomLeu2.2012-11-14
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/nomLeu1/jkStuff/nomLeu1.11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev nomLeu1 nomLeu2
    #   real    0m1.838s
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/nomLeu1/jkStuff/nomLeu1.11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev nomLeu1 nomLeu2 > do.log 2>&1
    #	real    277m26.138s

    # verify this file exists:
    #	/gbdb/nomLeu1/liftOver/nomLeu1ToNomLeu2.over.chain.gz
    # and try out the conversion on genome-test from nomLeu1 to nomLeu2

############################################################################