# for emacs: -*- mode: sh; -*-

# This file describes browser build for the gadMor1
#	Atlantic Cod Gadus morhua genome: May 2010

# http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=CAEA01
#	WGS, est. size: 830 Mbp
# A new page of information at NCBI:
#	http://www.ncbi.nlm.nih.gov/genome/2661

#############################################################################
# Fetch sequence from genbank (DONE - 2011-12-22 - Hiram)

    mkdir -p /hive/data/genomes/gadMor1/genbank
    cd /hive/data/genomes/gadMor1/genbank

    wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
        --no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Gadus_morhua/GadMor_May2010/*"

    # measure total sequence in this assembly
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
# 824311139 bases (216289280 N's 608021859 real 608021859 upper 0 lower) in 427427 sequences in 1 files
# Total size: mean 1928.5 sd 38925.4 min 100 (gi|344901537|emb|CAEA01000187.1|) max 4999318 (gi|344906074|emb|HE569754.1|) median 376
# N count: mean 506.0 sd 12840.6
# U count: mean 1422.5 sd 26292.8
# L count: mean 0.0 sd 0.0
# %0.00 masked total, %0.00 masked real

#############################################################################
# process into UCSC naming scheme (DONE - 2011-12-22 - Hiram)
    cd /hive/data/genomes/gadMor1/genbank
    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, ">unplaced.agp") or die "can not write to unplaced.agp";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        $line =~ s/\.1//;    
        printf UC "%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, ">unplaced.fa") or die "can not write to unplaced.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*emb\|//;
        $line =~ s/\.1\|.*//;
        printf UC ">$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl
    time ./unplaced.pl
    #	real    0m28.106s

    # compress these files
    gzip *.fa *.agp

    # verify nothing has changed in the sequence, should be the same as above:
    faSize unplaced.fa.gz
# 824311139 bases (216289280 N's 608021859 real 608021859 upper 0 lower) in
# 427427 sequences in 1 files
# Total size: mean 1928.5 sd 38925.4 min 100 (CAEA01000187) max 4999318 (HE569754) median 376
# N count: mean 506.0 sd 12840.6
# U count: mean 1422.5 sd 26292.8
# L count: mean 0.0 sd 0.0
# %0.00 masked total, %0.00 masked real

#############################################################################
#  Initial database build (DONE - 2011-12-22 - Hiram)
    cd /hive/data/genomes/gadMor1
    cat << '_EOF_' > gadMor1.config.ra
# Config parameters for makeGenomeDb.pl:
db gadMor1
clade vertebrate
genomeCladePriority 125
scientificName Gadus morhua
commonName Atlantic cod
assemblyDate May. 2010
assemblyLabel Genofisk GadMor_May2010 (GCA_000231765.1)
assemblyShortLabel Genofisk GadMor_May2010
orderKey 478
mitoAcc NC_002081
fastaFiles /hive/data/genomes/gadMor1/genbank/unplaced.fa.gz
agpFiles /hive/data/genomes/gadMor1/genbank/unplaced.agp.gz
dbDbSpeciesDir gadMor
taxId   8049
'_EOF_'
    # << happy emacs

    # first verify the sequence and AGP files are OK
    time makeGenomeDb.pl -stop=agp -workhorse=hgwdev gadMor1.config.ra \
	> agp.log 2>&1
    #	real    0m53.946s
    # verify that was OK, look at the agp.log file
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev gadMor1.config.ra \
	> db.log 2>&1
    #	real    10m45.982s
    # verify that was OK, look at the do.log file
    # copy the trackDb business to the source tree, check it in and add
    #	to the trackDb/makefile

#############################################################################
# running repeat masker (DONE - 2011-12-29 - Hiram)
    mkdir /hive/data/genomes/gadMor1/bed/repeatMasker
    cd /hive/data/genomes/gadMor1/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=memk gadMor1 > do.log 2>&1 &
    #	real    265m32.852s
    cat faSize.rmsk.txt
    #	824327835 bases (216289280 N's 608038555 real 541851562 upper
    #	66186993 lower) in 427428 sequences in 1 files
    #	%8.03 masked total, %10.89 masked real


    grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
#    April 26 2011 (open-3-3-0) version of RepeatMasker

    time featureBits -countGaps gadMor1 rmsk
    #	real    3m0.062s
    #	66224843 bases of 824327835 (8.034%) in intersection
    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2011-12-29-2012-01-04 - Hiram)
    mkdir /hive/data/genomes/gadMor1/bed/simpleRepeat
    cd /hive/data/genomes/gadMor1/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \
	gadMor1 > do.log 2>&1 &
    #	real    2177m47.881s
    # two failed jobs, running them on hgwdev to see what fails:
./TrfRun.csh /hive/data/genomes/gadMor1/TrfPart/013/013.lst.bed > 013.log 2>&1 &
./TrfRun.csh /hive/data/genomes/gadMor1/TrfPart/015/015.lst.bed > 015.log 2>&1
    #	real    347m1.394s
    # These two jobs are failing because the lift file splits in so many
    #	pieces that there is a name SplitLft.gz which is interpreted as
    #	a gzipped file to the liftUp command.  These needed to be done
    #	manually with numeric extensions to three places:
    cd /scratch/tmp/doSimpleRepeat.cluster.M32046
    split -a 3 -d -l 500 \
	/hive/data/genomes/gadMor1/TrfPart/013/013.lft SplitLft.
foreach splitLft (SplitLft.*)
  set bedFiles = `awk '{print $2 ".bed"};' $splitLft`
  endsInLf -zeroOk $bedFiles
  cat $bedFiles \
  | liftUp -type=.bed tmpOut.$splitLft $splitLft error stdin
end
    cat tmpOut.* > tmpOut__bed

    endsInLf -zeroOk tmpOut*
    cp -p tmpOut__bed /hive/data/genomes/gadMor1/TrfPart/013/013.lst.bed
    cd /scratch/tmp/doSimpleRepeat.cluster.L32048
    split -a 3 -d -l 500 \
	/hive/data/genomes/gadMor1/TrfPart/015/015.lft SplitLft.
foreach splitLft (SplitLft.*)
  set bedFiles = `awk '{print $2 ".bed"};' $splitLft`
  endsInLf -zeroOk $bedFiles
  cat $bedFiles \
  | liftUp -type=.bed tmpOut.$splitLft $splitLft error stdin
end
    cat tmpOut.* > tmpOut__bed

    endsInLf -zeroOk tmpOut*
    cp -p tmpOut__bed /hive/data/genomes/gadMor1/TrfPart/015/015.lst.bed

    time doSimpleRepeat.pl -continue=filter -buildDir=`pwd` \
	-bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \
	gadMor1 > filter.log 2>&1 &
    #	real    3m29.203s
    cat fb.simpleRepeat 
    #	62230728 bases of 608038597 (10.235%) in intersection

    #	do *not* add this to rmsk here since we are going to use
    #	windowMasker instead of rmsk
    # add to rmsk after it is done:
    cd /hive/data/genomes/gadMor1
    twoBitMask gadMor1.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed gadMor1.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa gadMor1.2bit stdout | faSize stdin > faSize.gadMor1.2bit.txt
    cat faSize.gadMor1.2bit.txt

    rm /gbdb/gadMor1/gadMor1.2bit
    ln -s `pwd`/gadMor1.2bit /gbdb/gadMor1/gadMor1.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2011-12-29,2012-01-04 - Hiram)
    mkdir /hive/data/genomes/gadMor1/bed/gap
    cd /hive/data/genomes/gadMor1/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../gadMor1.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m17.372s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits gadMor1 -not gap -bed=notGap.bed
    #	real    2m42.073s
    #	608046566 bases of 608046566 (100.000%) in intersection
    time featureBits gadMor1 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	real    681m9.272s
    #	7969 bases of 608046566 (0.001%) in intersection

    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" gadMor1 | sort -n | tail -1
    #	1942
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" gadMor1 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    featureBits -countGaps gadMor1 other.bed
    #	real    8m36.778s
    #	7969 bases of 824327835 (0.001%) in intersection
    wc -l other.bed
    #	7853
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad gadMor1 otherGap other.bed
    # profile of sizes:
    awk '{print $3-$2}' other.bed | sort | uniq -c
    #	7763 1
    #	78 2
    #	5 3
    #	3 4
    #	2 5
    #	1 6
    #	1 7

    #	starting with this many
    hgsql -e "select count(*) from gap;" gadMor1
    #	127442
    hgsql gadMor1 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" gadMor1
    #	135295
    # == 127442 + 7853
    # verify we aren't adding gaps where gaps already exist
    # this would output errors if that were true:
    gapToLift -minGap=1 gadMor1 nonBridged.lift -bedFile=nonBridged.bed
    # these warnings are misleading, they aren't overlapping gaps, they
    # are merely adjacent to existing gaps:
# WARNING: overlapping gap at HE572041:1726761-1727075(other) and HE572041:1727075-1727076(other)
# WARNING: overlapping gap at HE572319:1050744-1050745(other) and HE572319:1050745-1050904(fragment)
    # there are 27 of these locations

    # there are no non-bridged gaps here:
    hgsql -N -e "select bridge from gap;" gadMor1 | sort | uniq -c
    #	135295 yes

##########################################################################
## WINDOWMASKER (DONE - 2011-12-29,2012-01-04 - Hiram)
    mkdir /hive/data/genomes/gadMor1/bed/windowMasker
    cd /hive/data/genomes/gadMor1/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev gadMor1 > do.log 2>&1 &
    #	real    73m45.656s

    # Masking statistics
    twoBitToFa gadMor1.wmsk.2bit stdout | faSize stdin
    #	824327835 bases (216289280 N's 608038555 real 427305862 upper
    #	180732693 lower) in 427428 sequences in 1 files
    #	%21.92 masked total, %29.72 masked real

    twoBitToFa gadMor1.wmsk.sdust.2bit stdout | faSize stdin
    #	824327835 bases (216289280 N's 608038555 real 418637635 upper
    #	189400920 lower) in 427428 sequences in 1 files
    #	%22.98 masked total, %31.15 masked real

    hgLoadBed gadMor1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 3422833 elements of size 3

    featureBits -countGaps gadMor1 windowmaskerSdust
    #	real    5m27.388s
    #	405686830 bases of 824327835 (49.214%) in intersection

    #	eliminate the gaps from the masking
    featureBits gadMor1 -not gap -bed=notGap.bed
    #	real    3m11.708s
    #	608038597 bases of 608038597 (100.000%) in intersection
    time nice -n +19 featureBits gadMor1 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	real    577m19.317s
    #	189400922 bases of 608038597 (31.149%) in intersection

    hgLoadBed gadMor1 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 3485901 elements of size 4

    featureBits -countGaps gadMor1 windowmaskerSdust
    #	714075485 bases of 1799143587 (39.690%) in intersection
    #	mask the sequence with this clean mask
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../gadMor1.unmasked.2bit stdin \
	    -type=.bed gadMor1.cleanWMSdust.2bit
    twoBitToFa gadMor1.cleanWMSdust.2bit stdout | faSize stdin \
        > gadMor1.cleanWMSdust.faSize.txt
    cat gadMor1.cleanWMSdust.faSize.txt
    #	824327835 bases (216289280 N's 608038555 real 418637635 upper
    #	189400920 lower) in 427428 sequences in 1 files
    #	%22.98 masked total, %31.15 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps gadMor1 rmsk windowmaskerSdust
    #	280229203 bases of 1511735326 (18.537%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2011-12-04 - Hiram)
    cd /hive/data/genomes/gadMor1
    twoBitMask -add bed/windowMasker/gadMor1.cleanWMSdust.2bit \
	bed/simpleRepeat/trfMask.bed gadMor1.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
    twoBitToFa gadMor1.2bit stdout | faSize stdin > faSize.gadMor1.txt
    cat faSize.gadMor1.txt
    #	824327835 bases (216289280 N's 608038555 real 418051473 upper
    #	189987082 lower) in 427428 sequences in 1 files
    #	%23.05 masked total, %31.25 masked real

    #	create symlink to gbdb
    ssh hgwdev
    rm /gbdb/gadMor1/gadMor1.2bit
    ln -s `pwd`/gadMor1.2bit /gbdb/gadMor1/gadMor1.2bit

########################################################################
# cpgIslandExt (DONE - 2012-04-19 - Hiram)
    mkdir /hive/data/genomes/gadMor1/bed/cpgIslands
    cd /hive/data/genomes/gadMor1/bed/cpgIslands

    # this procedure was done during development of the script to do all
    #   this automatically.

    time ./doHardMask.csh
    real    876m43.648s

    time ./doMakeBed.csh
    real    24m19.176s

    time doCpgIslands.pl -continue=load

    time featureBits gadMor1 cpgIslandExt > fb.gadMor1.cpgIslandExt.txt 2>&1
    #   real    2m11.771s
    cat fb.gadMor1.cpgIslandExt.txt
    #   19965268 bases of 608038597 (3.284%) in intersection

########################################################################
# genscan - (DONE - 2011-04-25 - Hiram)
    mkdir /hive/data/genomes/gadMor1/bed/genscan
    cd /hive/data/genomes/gadMor1/bed/genscan
    time doGenscan.pl gadMor1 > do.log 2>&1
    #   real    909m27.951s

    cat fb.gadMor1.genscan.txt
    #   31133526 bases of 608038597 (5.120%) in intersection
    cat fb.gadMor1.genscanSubopt.txt
    #   23477881 bases of 608038597 (3.861%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 608038555 / 2897316137 \) \* 1024
    #	( 608038555 / 2897316137 ) * 1024 = 214.899393

    # round up to 250

    cd /hive/data/genomes/gadMor1
    time blat gadMor1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/gadMor1.11.ooc -repMatch=250
    #   Wrote 9115 overused 11-mers to jkStuff/gadMor1.11.ooc
    #   real    0m22.624s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" gadMor1 | sort | uniq -c
    #   135295 yes
#    cd /hive/data/genomes/gadMor1/jkStuff
#    gapToLift gadMor1 gadMor1.nonBridged.lift -bedFile=gadMor1.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' gadMor1.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism    mrnaCnt estCnt  refSeqCnt
# Gadus morhua	636	255256	0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev  
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add gadMor1 just after ce2
# gadMor1 (Atlantic cod)
#	Atlantic Cod Gadus morhua genome: May 2010
gadMor1.serverGenome = /hive/data/genomes/gadMor1/gadMor1.2bit
gadMor1.clusterGenome = /hive/data/genomes/gadMor1/gadMor1.2bit
gadMor1.ooc = /hive/data/genomes/gadMor1/jkStuff/gadMor1.11.ooc
gadMor1.lift = no
gadMor1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
gadMor1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
gadMor1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
gadMor1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
gadMor1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
gadMor1.refseq.mrna.native.load = no
gadMor1.refseq.mrna.xeno.load = yes
gadMor1.genbank.mrna.xeno.load = no
gadMor1.genbank.est.native.load = yes
gadMor1.downloadDir = gadMor1
gadMor1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding gadMor1 Atlantic cod" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for gadMorNames" \
        src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S gadMor1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial gadMor1 &
    #   var/build/logs/2012.06.07-16:07:06.gadMor1.initalign.log
    #   real    3488m35.403s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad gadMor1 &
    #	logFile:  var/dbload/hgwdev/logs/2012.06.11-09:25:12.dbload.log
    #   real    43m4.176s

    # enable daily alignment and update of hgwdev (DONE - 2012-05-11 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add gadMor1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added gadMor1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-23 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="HE567617:3931-7549" where name="gadMor1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-23 - Hiram)
    mkdir /hive/data/genomes/gadMor1/pushQ
    cd /hive/data/genomes/gadMor1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl gadMor1 2> stderr.txt | grep -v transMap > gadMor1.sql
    #   real    3m56.143s

    scp -p gadMor1.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < gadMor1.sql

############################################################################